Jump to content

User:CobraBot/Code

From Wikipedia, the free encyclopedia
# -*- coding: utf-8  -*-

import wikipedia
import pagegenerators
import re
import warnings
from time import sleep
from sys import stdout
from oclc import isbn2oclc

# This is required for the text that is shown when you run this script
# with the parameter -help.
# Maps the '&params;' placeholder to the page-generator help text.
docuReplacements = {
    '&params;': pagegenerators.parameterHelp
}

# Literal "Template:" title prefix.
# NOTE(review): not referenced by any code visible in this file;
# pagesUsingTemplate() asks SITE for the namespace name instead.
TEMPLATE_PREFIX = u"Template:"
# Site object shared by the whole module; fetched once, at import time.
SITE = wikipedia.getSite()
def pagesUsingTemplate(templateName):
    """Return a generator over the pages that transclude a template.

    templateName is the bare template title (no namespace prefix).  The
    localized name of namespace 10 (MediaWiki's Template namespace) is
    looked up on SITE and prepended to build the full page title.
    """
    namespaceName = unicode(SITE.namespace(10))
    templatePage = wikipedia.Page(SITE, namespaceName + u":" + templateName)
    return pagegenerators.ReferringPageGenerator(
        templatePage, onlyTemplateInclusion=True)

class BailOut(StandardError):
    """Immediately stop processing the current page.

    Raised with a single argument: the human-readable reason the page is
    being skipped.  OCLCBot.treat() catches it, writes the reason to the
    skip log and returns, so the run continues with the next page.
    """

class OCLCBot:
    """Adds an ``oclc`` parameter to book infoboxes, derived from the ISBN.

    For every page transcluding BOOK_INFOBOX the bot locates the infobox,
    reads its ``isbn`` parameter, looks the OCLC number up with isbn2oclc()
    and inserts ``| oclc= <number>`` right after the ISBN value.  Pages that
    cannot be handled are recorded in ``skipped.log`` together with the
    reason.
    """
    # Edit summary message that should be used.
    EDIT_SUMMARY = u'Adding [[OCLC]]# to book infobox based on [[ISBN]] ([[User:CobraBot|CobraBot]]; PLEASE [[User talk:CobraBot|report any problems]])'
    BOOK_INFOBOX = u"Infobox Book"
    # Hyphen-minus plus the Unicode dashes that show up inside ISBNs.
    DASHES = [u'-', u'‒', u'–', u'—', u'―']
    # A parameter value ends at the next "|" or at the closing "}}".
    TERMINATOR = re.compile(u"(}})|\\|")
    INFOBOX_START = re.compile(u"\\{\\{[ \t\n]*infobox[ _]((book(s)?)|(novel))", re.IGNORECASE)
    OCLC_PARAM = u"\\|[ \t\n]*oclc[ \t\n]*=[ \t\n]*"
    ISBN_MIN_LEN = 10
    ISBN_PARAM = re.compile(u"\\|([ \t\n])*isbn([ \t\n])*=([ \t\n])*")
    # Opening / closing template braces, used to find the infobox's real end.
    BRACE_PAIR = re.compile(u"\\{\\{|\\}\\}")
    # Characters that can never be part of the ISBN "word" itself.
    FIRST_WORD = re.compile("[^ \t\n<,;\\[\\]]+")
    # Default number of generator pages run() discards before it starts
    # editing.  NOTE(review): presumably the page counts of earlier
    # sessions, summed so a new run resumes where the last stopped - confirm.
    RESUME_OFFSET = 371+145+36+29+38+26+48+56+48+188+85+45+171+130+105

    def __init__(self, debug):
        """
        Constructor. Parameters:
            * debug     - If True, doesn't do any real changes, but only shows
                          what would have been changed.

        The page generator is not a parameter: it is always the list of
        pages transcluding BOOK_INFOBOX.  Opens ``skipped.log`` for
        appending; run() closes it.
        """
        self.generator = pagesUsingTemplate(self.BOOK_INFOBOX)
        self.debug = debug
        self.editCount = 0
        self.log = open("skipped.log", 'a')

    def run(self, skip=None):
        """
        Process every page of the generator after discarding the first
        `skip` pages (default: RESUME_OFFSET).

        The skip log is closed when the run ends, whether normally or
        because of an exception.
        """
        if skip is None:
            skip = self.RESUME_OFFSET
        # Set the edit summary message
        wikipedia.setAction(self.EDIT_SUMMARY)
        try:
            print("Advancing by %s..." % skip)
            stdout.flush()
            try:
                for _ in xrange(skip):
                    next(self.generator)
            except StopIteration:
                # Fewer pages than the offset: nothing is left to treat,
                # the loop below simply will not execute.
                pass
            print("Done advancing!")
            stdout.flush()
            for pageIndex, page in enumerate(self.generator):
                self.treat(page, pageIndex)
        finally:
            self.log.close()

    #########
    def _skip(self, reason):
        """Report `reason` on the console, then abort the page via BailOut."""
        wikipedia.output(reason)
        raise BailOut(reason)

    def _reason(self, exc):
        """Return the first argument of `exc`, or u'' when it has none.

        Replaces the deprecated ``exc.message`` attribute.
        """
        return exc.args[0] if exc.args else u""

    def partition(self, text):
        """
        Split `text` into (prebox, box, postbox) around the book infobox.

        `box` runs from the opening "{{" of the infobox to the "}}" that
        balances it, so templates nested inside parameter values stay part
        of the box.  Raises BailOut when no infobox is found or when it is
        never closed.
        """
        boxmatch = self.INFOBOX_START.search(text)
        if not boxmatch:
            self._skip(u"SKIPPING: Page either uses 'Book infobox' alias or is false positive")

        boxStart = boxmatch.start()
        boxEnd = None
        depth = 0
        # The first token found is the infobox's own "{{" (depth 1); the
        # box ends at the "}}" that brings the depth back to zero.
        for brace in self.BRACE_PAIR.finditer(text, boxStart):
            if brace.group() == u"{{":
                depth += 1
            else:
                depth -= 1
                if depth <= 0:
                    boxEnd = brace.end()
                    break
        if boxEnd is None:
            self._skip(u"SKIPPING: Infobox is never closed")

        return text[:boxStart], text[boxStart:boxEnd], text[boxEnd:]

    def checkForOclc(self, box):
        """
        Return `box` ready for a new oclc parameter.

        An empty ``|oclc=`` is cut out so the parameter will not end up
        duplicated; a box without the parameter is returned unchanged.
        Raises BailOut when the parameter already has a value.
        """
        paramMatch = re.search(self.OCLC_PARAM, box)
        if not paramMatch:
            return box
        rest = box[paramMatch.end():]
        termMatch = self.TERMINATOR.search(rest)
        # | oclc = VALUE |
        value = (rest[:termMatch.start()] if termMatch else rest).strip()
        if value: #already has |oclc= filled in
            self._skip(u"SKIPPING: oclc param already filled")
        #remove the |oclc=
        return box[:paramMatch.start()] + rest

    def findIsbnVal(self, box):
        """
        Return (isbnVal, isbnTerm): the raw text of the isbn parameter and
        the index in `box` of the "|" or "}}" that terminates it.

        Raises BailOut when there is no isbn parameter, or when the value
        contains an opening "[[" without its "]]" (a piped wikilink whose
        "|" was mistaken for the terminator).
        """
        paramMatch = self.ISBN_PARAM.search(box)
        if not paramMatch: #no ISBN present
            self._skip(u"SKIPPING: No isbn param present")
        rest = box[paramMatch.end():]
        termMatch = self.TERMINATOR.search(rest)
        relIsbnTerm = termMatch.start() if termMatch else len(rest)
        isbnVal = rest[:relIsbnTerm]
        if '[[' in isbnVal and ']]' not in isbnVal:
            self._skip(u"SKIPPING: Piped wikilink in |isbn= field; bot too stupid to handle")
        return isbnVal, paramMatch.end() + relIsbnTerm

    def removeDashes(self, isbn):
        """Return `isbn` with every character listed in DASHES removed."""
        for dash in self.DASHES:
            isbn = isbn.replace(dash, '')
        return isbn

    def checkForNA(self, isbn):
        """Raise BailOut when `isbn` starts with "NA" or "N/A" (any case)."""
        if re.match(u"N/?A", isbn, re.IGNORECASE):
            self._skip(u"SKIPPING: ISBN Not/Applicable")

    def removeExtraISBN(self, isbnVal):
        """Strip a leading literal "ISBN" (and surrounding whitespace)."""
        match = re.match(u"([ \t\n])*ISBN([ \t\n])*", isbnVal)
        if match:
            return isbnVal[match.end():]
        return isbnVal

    def firstWord(self, isbnVal):
        """
        Return the first run of characters in `isbnVal` that contains no
        whitespace and none of ``< , ; [ ]``.

        Raises BailOut when the value consists only of such delimiters.
        """
        wordMatch = self.FIRST_WORD.search(isbnVal)
        if not wordMatch:
            self._skip(u"SKIPPING: Malformed ISBN, nothing usable (%s)" % isbnVal)
        return wordMatch.group()

    def normalize(self, string):
        """
        Reduce a title to a rough canonical form for comparison.

        Lower-cases first, so that "And" and "The" are folded exactly like
        "and" and "the"; then drops spaces and hyphens, turns "and" into
        "&", drops common punctuation and finally removes "the".
        """
        canon = string.lower().replace(u' ', u'').replace(u"-", u'')
        canon = canon.replace(u"and", u"&")
        for mark in (u',', u'.', u"'", u'"', u"’"):
            canon = canon.replace(mark, u'')
        return canon.replace(u"the", u'')

    def _logSkip(self, page, exc):
        """Append "<title>; <reason>" to the skip log, encoded as UTF-8."""
        # Join as unicode and encode once: adding UTF-8 bytes to a unicode
        # reason would make Python 2 decode the title as ASCII and fail.
        line = u"%s; %s\n" % (page.title(), self._reason(exc))
        self.log.write(line.encode('utf8'))

    def treat(self, page, pageIndex):
        """
        Loads the given page, adds the oclc parameter, and saves it.

        Non-article pages, missing pages and redirects are skipped
        silently; pages rejected with BailOut are written to the skip log.
        Outside debug mode the diff is shown and the operator must confirm
        before the page is saved.
        """
        print("==================================================================")
        print("PAGE TITLE: %s" % page.title())
        print("PAGE#: %s" % (pageIndex + 1))
        print("EDIT COUNT: %s" % self.editCount)
        if page.namespace() != 0:
            wikipedia.output(u"SKIPPING: Non-article namespace!")
            return

        try:
            # Load the page
            original = page.get()
        except wikipedia.NoPage:
            wikipedia.output(u"Page %s does not exist; skipping." % page.aslink())
            return
        except wikipedia.IsRedirectPage:
            wikipedia.output(u"Page %s is a redirect; skipping." % page.aslink())
            return

        try:
            prebox, box, postbox = self.partition(original)
            box = self.checkForOclc(box)
            # isbnTerm indexes into the box *after* any empty |oclc= was
            # removed, which is the box the new parameter is spliced into.
            isbnVal, isbnTerm = self.findIsbnVal(box)
            isbnVal = self.removeDashes(isbnVal).strip()
            isbnVal = self.removeExtraISBN(isbnVal)
            self.checkForNA(isbnVal)
            if not isbnVal: #empty |isbn=
                self._skip(u"SKIPPING: Empty isbn param")
            isbn = self.firstWord(isbnVal)
            print("ISBN#: %s" % isbn)
            if len(isbn) < self.ISBN_MIN_LEN:
                self._skip(u"SKIPPING: Malformed ISBN, too short (%s)" % isbn)
            if not re.search("[0-9]", isbn):
                self._skip(u"SKIPPING: Malformed ISBN, no numbers (%s)" % isbn)
        except BailOut as e:
            self._logSkip(page, e)
            return

        #do lookup
        try:
            oclc, oclcTitle = isbn2oclc(isbn)
        except RuntimeError as e:
            wikipedia.output(u"ABORTED: Problem looking up OCLC# (%s)" % self._reason(e))
            return
        print("PAGE TITLE: %s" % page.title())

        # Purely informational: lets the operator see whether the article
        # title and the title of the OCLC record look like the same book.
        wikiCanon = self.normalize(page.title().split(u"(")[0])
        oclcCanon = self.normalize(oclcTitle.split(u":")[0])
        if oclcCanon.startswith(wikiCanon):
            print("")
            print("--Canonical titles DO MATCH.--")
        else:
            print(wikiCanon)
            print(oclcCanon)

        # In debug mode the parameter stays on one line to keep the diff compact.
        separator = " " if self.debug else "\n"
        box = box[:isbnTerm] + "| oclc= " + oclc + separator + box[isbnTerm:]
        text = prebox + box + postbox

        # only save if something was changed
        if text == original:
            return
        # Show the title of the page we're working on.
        # Highlight the title in purple.
        wikipedia.output(u"\n>>> \03{lightpurple}%s\03{default} <<<" % page.title())
        # show what was changed
        wikipedia.showDiff(original, text)
        if self.debug:
            return
        choice = wikipedia.inputChoice(u'Do you want to accept these changes?', ['Yes', 'No'], ['y', 'N'], 'N')
        if choice != 'y':
            return
        try:
            # Save the page
            page.put(text)
        except wikipedia.LockedPage:
            wikipedia.output(u"Page %s is locked; skipping." % page.aslink())
        except wikipedia.EditConflict:
            wikipedia.output(u'Skipping %s because of edit conflict' % (page.title()))
        except wikipedia.SpamfilterError as error:
            wikipedia.output(u'Cannot change %s because of spam blacklist entry %s' % (page.title(), error.url))
        else:
            self.editCount += 1


def main():
    """Create the bot and run it with Python warnings silenced.

    The flag handed to OCLCBot is its `debug` argument; it is hard-coded
    to False here, change it to True for a dry run.
    """
    dryRun = False # True
    worker = OCLCBot(dryRun)
    # Warnings are muted for the duration of the run only; the previous
    # filters are restored when the block exits.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        worker.run()

# Script entry point: only runs when the file is executed directly.
if __name__ == "__main__":
    try:
        main()
    finally:
        # Called whether main() returns normally or raises.
        # NOTE(review): presumably releases this process's slot in
        # pywikipedia's throttle bookkeeping - confirm against the framework.
        wikipedia.stopme()