User:YiFeiBot/wikiindex size.py

From WikiIndex
< User:YiFeiBot
Revision as of 08:21, 22 May 2013 by Zhuyifei1999 (talk | contribs) (Update -- finally working -- at least I think so)
Jump to navigation Jump to search

#!/usr/bin/python
# -*- coding: utf-8  -*-
#
__version__ = '$Id: wikiindex-size.py 11155 2013-05-13 10:39:02Z xqt $'
#

import re

import wikipedia as pywikibot
import pagegenerators
from pywikibot import i18n
import catlib
import sys
import urllib
from datetime import datetime

# This is required for the text that is shown when you run this script
# with the parameter -help.
# '&params;' is expanded to the standard pagegenerators option help text.
docuReplacements = {
    '&params;': pagegenerators.parameterHelp
}


class WiSizeBot:
    """Bot that refreshes {{Size}} template data on WikiIndex pages."""
    # Edit summary messages live in the /i18n subdirectory; the message file
    # must carry the same name as this script (wikiindex-size.py).

    def __init__(self, generator, summary):
        """Constructor.

        @param generator: The page generator that determines on which pages
                          to work.
        @type generator: generator.
        @param summary: Edit summary text; when empty, the translated
                        'basic-changing' message is used instead.
        @type summary: (unicode) string.
        """
        self.generator = generator
        # init constants
        self.site = pywikibot.getSite(code=pywikibot.default_code)
        # Fall back to the translated default summary when none was given.
        self.summary = summary or i18n.twtranslate(self.site, 'basic-changing')

    def run(self):
        """Process every page supplied by the generator."""
        for current_page in self.generator:
            self.treat(current_page)

    def treat(self, page):
        """
        Loads the given page, does some changes, and saves it.
        """
        text = self.load(page)
        if not text:
            return

        newtext = text
        
        size_r = re.compile(ur"""(?im)(?P<all>\{\{\s*Size\s*((\s*\|\s*(?P<pages>pages|wiki[ _]pages)\s*=\s*(?P<pages_value>\d*)\s*[^\|\}]*\s*)|(\s*\|\s*(?P<pagesurl>statistics[ _]URL|wiki[ _]statistics[ _]URL)\s*=\s*(?P<pagesurl_value>https?://[^ \|\}\<]*)\s*[^\|\}]*\s*)|(\s*\|\s*(?P<wikifactor>wikiFactor)\s*=\s*(?P<wikifactor_value>\d*)\s*[^\|\}]*\s*)|(\s*\|\s*(?P<wikifactorurl>wikiFactor[ _]URL)\s*=\s*(?P<wikifactorurl_value>http://[^ \|\}\<]*)\s*[^\|\}]*\s*))+\s*\|?\s*\}\}(\(As of (?P<day>[1-31]) (?P<month>(?=January|February|March|April|May|June|July|August|September|October|November|December)) (?P<year>[0-9][0-9][0-9][0-9])\)+\s*\|?\s*)*)""")

        wtext = page.get()
        m = size_r.finditer(wtext)
 
        all = ""
        newvalues = ""
        for i in m:
            all = i.group('all') and i.group('all').strip() or 
            pages = i.group('pages') and i.group('pages').strip() or 
            pagesurl = i.group('pagesurl') and i.group('pagesurl').strip() or 
            wikifactor = i.group('wikifactor') and i.group('wikifactor').strip() or 
            wikifactorurl = i.group('wikifactorurl') and i.group('wikifactorurl').strip() or 
            
            pages_value = i.group('pages_value') and i.group('pages_value').strip() or '0'
            pagesurl_value = i.group('pagesurl_value') and i.group('pagesurl_value').strip() or 
            wikifactor_value = i.group('wikifactor_value') and i.group('wikifactor_value').strip() or 
            wikifactorurl_value = i.group('wikifactorurl_value') and i.group('wikifactorurl_value').strip() or 
            time_all = i.group('all') and i.group('all').strip() or 
            day = i.group('day') and i.group('day').strip() or 
            month = i.group('month') and i.group('month').strip() or 
            year = i.group('year') and i.group('year').strip() or 
            
            #get new values
            n = re.findall(ur"(https?://[^\|\}\]]+\?action=raw|https?://[^\|\}\]]+:Statistics)", pagesurl_value)
            if n:
                raw = 
                try:
                    url = n[0]
                    if url.endswith(":Statistics"):
                        url += '?action=raw'
                    f = urllib.urlopen(url)
                    raw = unicode(f.read(), 'utf-8')
                    f.close()
                except:
                    break
                o = re.findall(ur"total=\d+;good=(\d+);", raw)
                if o:
                    if o[0] and int(pages_value) != int(o[0]):
                        comment = u"Robot: Updating size: %s -> %s" %     (pages_value, o[0])
                        pages_value = o[0]
                        newtime = True
                    else:
                        break
                else:
                    break
            else:
                break
            #end get
            
            #recalculate wikifactor
            pass #TODO, leave AS IS meanwhile
            #end recalculate
            
            """print pages, pages_value
            print pagesurl, pagesurl_value
            print wikifactor, wikifactor_value
            print wikifactorurl, wikifactorurl_value"""
            
            if newtime:
                dt = datetime.date(datetime.utcnow())
                
                day = dt.strftime('%d')
                month = dt.strftime('%B')
                year = dt.strftime('%Y')
                
            newvalues = u"""
Wiki size: unknown size
(As of: %s %s %s)""" % (pages and pages or 'pages', pages_value and pages_value or , pagesurl and pagesurl or 'statistics URL', pagesurl_value and pagesurl_value or , wikifactor and wikifactor or 'wikiFactor', wikifactor_value and wikifactor_value or , wikifactorurl and wikifactorurl or 'wikiFactor URL', wikifactorurl_value and wikifactorurl_value or , day and day or , month and month or , year and year or ) newtext = text.replace(all, newvalues) if not self.save(newtext, page, self.summary): pywikibot.output(u'Page %s not saved.' % page.title(asLink=True)) def load(self, page): """ Loads the given page, does some changes, and saves it. """ try: # Load the page text = page.get() except pywikibot.NoPage: pywikibot.output(u"Page %s does not exist; skipping."  % page.title(asLink=True)) except pywikibot.IsRedirectPage: pywikibot.output(u"Page %s is a redirect; skipping."  % page.title(asLink=True)) else: return text return None def save(self, text, page, comment=None, **kwargs): # only save if something was changed if text != page.get(): # Show the title of the page we're working on. # Highlight the title in purple. pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"  % page.title()) # show what was changed pywikibot.showDiff(page.get(), text) pywikibot.output(u'Comment: %s' % comment) choice = pywikibot.inputChoice( u'Do you want to accept these changes?', ['Yes', 'No'], ['y', 'N'], 'N') if choice == 'y': try: # Save the page page.put(text, comment=comment or self.comment, **kwargs) except pywikibot.LockedPage: pywikibot.output(u"Page %s is locked; skipping."  % page.title(asLink=True)) except pywikibot.EditConflict: pywikibot.output( u'Skipping %s because of edit conflict'  % (page.title())) except pywikibot.SpamfilterError, error: pywikibot.output( u'Cannot change %s because of spam blacklist entry %s'  % (page.title(), error.url)) else: return True return False class AutoWiSizeBot(WiSizeBot): # Intended for usage e.g. 
as cronjob without prompting the user. _REGEX_eol = re.compile(u'\n') def __init__(self): WiSizeBot.__init__(self, None, None) ## @since 10326 # @remarks needed by various bots def save(self, page, text, comment=None, **kwargs): pywikibot.output(u'\03{lightblue}Writing to wiki on %s...\03{default}'  % page.title(asLink=True)) comment_output = comment or pywikibot.action pywikibot.output(u'\03{lightblue}Comment: %s\03{default}'  % comment_output) #pywikibot.showDiff(page.get(), text) for i in range(3): try: # Save the page page.put(text, comment=comment, **kwargs) except pywikibot.LockedPage: pywikibot.output( u"\03{lightblue}Page %s is locked; skipping.\03{default}"  % page.title(asLink=True)) except pywikibot.EditConflict: pywikibot.output( u'\03{lightblue}Skipping %s because of edit ' u'conflict\03{default}' % (page.title())) except pywikibot.SpamfilterError, error: pywikibot.output( u'\03{lightblue}Cannot change %s because of spam blacklist ' u'entry %s\03{default}' % (page.title(), error.url)) else: return True return False ## @since 10326 # @remarks needed by various bots def append(self, page, text, comment=None, section=None, **kwargs): if section: pywikibot.output( u'\03{lightblue}Appending to wiki on %s in section ' u'%s...\03{default}' % (page.title(asLink=True), section)) for i in range(3): try: # Append to page section page.append(text, comment=comment, section=section, **kwargs) except pywikibot.PageNotSaved, error: pywikibot.output( u'\03{lightblue}Cannot change %s because of ' u'%s\03{default}' % (page.title(), error)) else: return True else: content = self.load(page) # 'None' if not existing page if not content: # (create new page) content = u content += u'\n\n' content += text return self.save(page, content, comment=comment, **kwargs) ## @since 10326 # @remarks needed by various bots def loadTemplates(self, page, template, default={}): """Get operating mode from page with template by searching the template. 
@param page: The user (page) for which the data should be retrieved. Returns a list of dict with the templates parameters found. """ self._content = self.load(page) # 'None' if not existing page templates = [] if not self._content: return templates # catch empty or not existing page for tmpl in pywikibot.extract_templates_and_params(self._content): if tmpl[0] == template: param_default = {} param_default.update(default) param_default.update(tmpl[1]) templates.append(param_default) return templates ## @since 10326 # @remarks common interface to bot job queue on wiki def loadJobQueue(self, page, queue_security, reset=True): """Check if the data queue security is ok to execute the jobs, if so read the jobs and reset the queue. @param page: Wiki page containing job queue. @type page: page @param queue_security: This string must match the last edit comment, or else nothing is done. @type queue_security: string Returns a list of jobs. This list may be empty. """ try: actual = page.getVersionHistory(revCount=1)[0] except: pass secure = False for item in queue_security[0]: secure = secure or (actual[2] == item) secure = secure and (actual[3] == queue_security[1]) if not secure: return [] data = self._REGEX_eol.split(page.get()) if reset: pywikibot.output(u'\03{lightblue}Job queue reset...\03{default}') pywikibot.setAction(u'reset job queue') page.put(u, minorEdit=True) queue = [] for line in data: queue.append(line[1:].strip()) return queue def main(): # This factory is responsible for processing command line arguments # that are also used by other scripts and that determine on which pages # to work on. genFactory = pagegenerators.GeneratorFactory() # The generator gives the pages that should be worked upon. gen = None # This temporary array is used to read the page title if one single # page to work on is specified by the arguments. pageTitleParts = [] # summary message editSummary = start = "!" 
# Parse command line arguments for arg in pywikibot.handleArgs(): if arg.startswith('-summary:'): editSummary = arg[9:] elif arg.startswith('-start:'): start = arg[7:] else: pywikibot.output(u'Unknown argument: %s' % arg) cat = catlib.Category(pywikibot.getSite(), 'Category:MediaWiki') gen = pagegenerators.CategorizedPageGenerator(cat, start=start) #if not gen: # gen = genFactory.getCombinedGenerator() if gen: # The preloading generator is responsible for downloading multiple # pages from the wiki simultaneously. gen = pagegenerators.PreloadingGenerator(gen) bot = WiSizeBot(gen, editSummary) bot.run() else: pywikibot.showHelp() if __name__ == "__main__": try: main() finally: pywikibot.stopme()