User:YiFeiBot/wikiindex size.py

From WikiIndex
< User:YiFeiBot
Revision as of 10:49, 13 May 2013 by Zhuyifei1999 (talk | contribs) (highlight)
Jump to navigation Jump to search

#!/usr/bin/python
# -*- coding: utf-8 -*-

# Copyright (C) 2011 emijrp
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import catlib
import re
import pagegenerators
import sys
import urllib
import wikipedia

# Wiki to operate on and the category whose member pages are scanned.
s = wikipedia.Site('en', 'wikiindex')
cat = catlib.Category(s, 'Category:MediaWiki')

# When exactly one command-line argument is given it is used as the
# generator's ``start`` value; otherwise '!' is used.
# NOTE(review): '!' presumably sorts before any page title, i.e. "start
# from the beginning" -- confirm against the pagegenerators documentation.
start = sys.argv[1] if len(sys.argv) == 2 else '!'

# Category members, wrapped in a preloading generator (pageNumber=50).
gen = pagegenerators.CategorizedPageGenerator(cat, start=start)
pre = pagegenerators.PreloadingGenerator(gen, pageNumber=50)

"""
{{Size
|pages = <!--Necessary. Type the plain number of pages here - no thousands separators.-->
wiki pages, wiki_pages

|statistics URL = <!--Preferred, source of page count (mostly a statistics page). If unknown leave void.-->
wiki statistics URL, wiki_statistics_URL

|wikiFactor = <!--Optional. If unknown leave void. (See Proposal:wikiFactor)-->
|wikiFactor URL = <!--Optional, source of wiki factor. If unknown leave void.-->
}}
"""

"""
{{Size <!--see Template:Size for full detail-->
|pages= <!--type the plain number of pages - NO thousands separators-->
|statistics URL= No <!--page count source (often a 'Statistics' page); if unknown type 'No'-->
|wikiFactor= <!--preferred; if unknown leave void; see: Category:wikiFactor for help-->
|wikiFactor URL= No <!--wF source (often 'PopularPages', 'Mostvisitedpages' or 'PageHits'); if unknown leave void-->
}}(As of: DD Month 2013)<!--manually add/amend date when stats are verified and/or updated-->
"""

# Matches a whole {{Size ...}} template call (case-insensitive, multiline).
#
# The template parameters may appear in any order and any number of times;
# each recognised parameter is one alternative of the repeated group below.
# Named groups:
#   all                  - the complete template text, from "{{" to "}}"
#   pages                - parameter name used for the page count
#   pages_value          - its digits (may be empty)
#   pagesurl             - parameter name used for the statistics URL
#   pagesurl_value       - its URL
#   wikifactor           - the "wikiFactor" parameter name
#   wikifactor_value     - its digits (may be empty)
#   wikifactorurl        - the "wikiFactor URL" parameter name
#   wikifactorurl_value  - its URL
# Text following a value up to the next "|" or "}" (for instance an HTML
# comment) is consumed but not captured.  URL parameters are only recognised
# when their value starts with http:// or https://; both URL parameters
# accept either scheme.
#
# The pattern is pure ASCII, so a plain raw string is sufficient; it is split
# into adjacent literals (one per alternative) purely for readability.
size_r = re.compile(
    r"(?im)(?P<all>\{\{\s*Size\s*("
    r"(\s*\|\s*(?P<pages>pages|wiki[ _]pages)\s*=\s*(?P<pages_value>\d*)\s*[^\|\}]*\s*)"
    r"|(\s*\|\s*(?P<pagesurl>statistics[ _]URL|wiki[ _]statistics[ _]URL)\s*=\s*(?P<pagesurl_value>https?://[^ \|\}\<]*)\s*[^\|\}]*\s*)"
    r"|(\s*\|\s*(?P<wikifactor>wikiFactor)\s*=\s*(?P<wikifactor_value>\d*)\s*[^\|\}]*\s*)"
    r"|(\s*\|\s*(?P<wikifactorurl>wikiFactor[ _]URL)\s*=\s*(?P<wikifactorurl_value>https?://[^ \|\}\<]*)\s*[^\|\}]*\s*)"
    r")+\s*\|?\s*\}\})"
)

# Extracts, from a template's "statistics URL" value, either a URL that
# already ends in "?action=raw" or one that ends in ":Statistics".
stats_url_r = re.compile(r"(https?://[^\|\}\]]+\?action=raw|https?://[^\|\}\]]+:Statistics)")
# Pulls the "good" counter out of the fetched raw statistics text.
good_pages_r = re.compile(r"total=\d+;good=(\d+);")


def _group_text(match, name, default=''):
    """Return the stripped text of a named group, or *default* when the
    group did not participate in the match or is blank."""
    return (match.group(name) or '').strip() or default


# For every existing, non-redirect page: read the first {{Size}} template,
# fetch the current page count from its statistics URL and, when the count
# differs from the stored one, rewrite the template and save the page.
for page in pre:
    if not page.exists() or page.isRedirectPage():
        continue

    wikipedia.output('--> %s <--' % (page.title()))
    wtext = page.get()

    # Only the first {{Size}} template on a page is considered.
    match = size_r.search(wtext)
    if match is None:
        continue

    template_text = _group_text(match, 'all')

    # Parameter names as written on the page (kept so the page's own
    # spelling survives the rewrite); empty when the parameter is absent.
    pages = _group_text(match, 'pages')
    pagesurl = _group_text(match, 'pagesurl')
    wikifactor = _group_text(match, 'wikifactor')
    wikifactorurl = _group_text(match, 'wikifactorurl')

    # Parameter values; a missing or blank page count is treated as 0.
    pages_value = _group_text(match, 'pages_value', '0')
    pagesurl_value = _group_text(match, 'pagesurl_value')
    wikifactor_value = _group_text(match, 'wikifactor_value')
    wikifactorurl_value = _group_text(match, 'wikifactorurl_value')

    # get new values
    stats_urls = stats_url_r.findall(pagesurl_value)
    if not stats_urls:
        continue
    url = stats_urls[0]
    if url.endswith(":Statistics"):
        url += '?action=raw'

    # Best effort: a statistics page that cannot be fetched or decoded only
    # skips this wiki.  Exception (not a bare except) is caught so that
    # Ctrl-C / SystemExit still stop the bot.
    try:
        f = urllib.urlopen(url)
        try:
            raw = unicode(f.read(), 'utf-8')
        finally:
            # Close the handle even when reading or decoding fails.
            f.close()
    except Exception:
        wikipedia.output(u'Could not read statistics from %s' % url)
        continue

    counts = good_pages_r.findall(raw)
    if not counts or int(counts[0]) == int(pages_value):
        # No counter found, or the stored page count is already current.
        continue

    summary = u"Robot: Updating size: %s -> %s" % (pages_value, counts[0])
    pages_value = counts[0]
    # end get

    # recalculate wikifactor
    # TODO, leave AS IS meanwhile
    # end recalculate

    newvalues = u"""{{Size <!--see Template:Size for full detail-->
| %s = %s <!--type the plain number of pages - NO thousands separators-->
| %s = %s <!--page count source (often a 'Statistics' page); if unknown type 'No'-->
| %s = %s <!--preferred; if unknown leave void; see: Category:wikiFactor for help-->
| %s = %s <!--wF source (often 'PopularPages', 'Mostvisitedpages' or 'PageHits'); if unknown leave void-->
}}""" % (
        pages or 'pages', pages_value,
        pagesurl or 'statistics URL', pagesurl_value,
        wikifactor or 'wikiFactor', wikifactor_value,
        wikifactorurl or 'wikiFactor URL', wikifactorurl_value,
    )

    newtext = wtext.replace(template_text, newvalues)
    if wtext != newtext:
        wikipedia.showDiff(wtext, newtext)
        page.put(newtext, summary)