User:Gdr/taxoconvert.py

From Wikipedia, the free encyclopedia
#!/usr/bin/python
# -*- encoding:utf-8 -*-
#
# taxoconvert.py -- convert multi-template taxoboxes to single template

import codecs
import getopt
import os
import pickle
import re
import sys
import tempfile
import wikipedia

# Site object handed to every wikipedia.Page() / wikipedia.getall() call below.
site = wikipedia.Site('en')

# Run-time flags; main() overrides them from the command line.
#   checks -- show a diff and ask for confirmation before saving (see check())
#   edit   -- open the page text in an editor before parsing (see convert())
#   debug  -- trace parameters as they are recorded (see record(), parse_nomial())
checks, edit, debug = True, False, False

class Error(Exception):
    """Problem found while converting a page.

    The human-readable message is kept in the 'text' attribute, which is
    also what str() returns.
    """

    def __init__(self, text):
        Exception.__init__(self, text)
        self.text = text

    def __str__(self):
        return self.text

class NoError(Error):
    """Raised by parse() when the page contains no multi-template taxobox.

    convertmany() catches it separately from Error and prints nothing.
    """

def edittext(s):
    """Let the user edit the text 's' in an external editor.

    The text is written UTF-8 encoded to a temporary file, the program named
    by the EDITOR environment variable (default 'vi') is run on that file via
    the shell, and the file's contents afterwards are returned.  The temporary
    file is removed again whether or not the edit succeeds.
    """
    # mkstemp() creates the file itself, so nobody else can claim the name
    # between choosing it and opening it (which mktemp() allowed).
    fd, fn = tempfile.mkstemp()
    os.close(fd)
    try:
        f = codecs.open(fn, 'w', 'utf-8')
        try:
            f.write(s)
        finally:
            f.close()
        # EDITOR may carry arguments of its own, hence the shell command line.
        os.system('%s "%s"' % (os.getenv('EDITOR', 'vi'), fn))
        f = codecs.open(fn, 'r', 'utf-8')
        try:
            return f.read()
        finally:
            f.close()
    finally:
        try:
            os.remove(fn)
        except OSError:
            # The file is already gone (e.g. the editor removed it); there is
            # nothing left to clean up.
            pass

def canonize(s):
    """Return 's' reduced to its alphanumeric characters, in lower case.

    Used to compare strings while ignoring case, spacing, punctuation and
    wiki markup, e.g. canonize("See text.") == "seetext".
    """
    # join() yields a string on every Python version; filter() on a string
    # only does so on Python 2.
    return ''.join([c for c in s if c.isalnum()]).lower()

def check(text, newtext):
    """Ask the user to approve replacing 'text' by 'newtext'.

    When the global 'checks' flag is off, 'newtext' is returned unasked.
    Otherwise the diff is shown and the answer decides:
      y -- return the (possibly edited) new text
      e -- edit the new text in an external editor, then ask again
      q -- abort by raising IOError
      anything else -- return None, meaning "do not save"
    """
    if checks:
        while True:
            wikipedia.showDiff(text, newtext)
            answer = wikipedia.input(u'OK? [yNeq]')
            if answer == 'e':
                newtext = edittext(newtext)
                continue
            if answer == 'q':
                raise IOError
            if answer == 'y':
                break
            return None
    return newtext

def record(params, key, value):
    """Store 'value' under 'key' in the taxobox parameter dictionary.

    The key is also appended to params['sequence'] so that the output keeps
    the order in which parameters were found.  Empty values (None or '') are
    not stored.  Raises Error if the key is already present, even when the
    new value is empty.
    """
    if debug:
        wikipedia.output(u"%s = %s" % (key, value))
    if key in params:
        raise Error(u"Duplicate key %s" % key)
    if not value:
        return
    params['sequence'].append(key)
    params[key] = value

def parse_nomial(suffix, n, lines, params):
    """Parse a binomial/trinomial section and an optional range map at lines[n].

    'suffix' ('', '2', '3', ...) is appended to the parameter names so that
    several sections can be recorded side by side.  The known spellings of
    the section template are tried in a fixed order and the first one that
    matches is recorded in 'params'; after that an image template whose line
    mentions "range" or "distribution" is recorded as the range map.

    Returns a pair (n, found): the index of the first line not consumed and
    whether any line was consumed at all.
    """
    if debug:
        wikipedia.output(u"parse_nomial: suffix = '%s', lines[n] = %s" % (suffix, lines[n]))
    start = n

    # The botany form is tried on a single line and, failing everything else,
    # on two joined lines.
    botany_re = (r'(?i){{taxobox[ _]+section[ _]+((?:b|tr)inomial)'
                 r'[ _]+botany *\| *'
                 r'color *= *[a-z]+ *\| *'
                 r'\1_name *= *([^\|\}]*[^\|\} ]) *\| *'
                 r'author *= *([^\}]*[^\} ]|) *}}$')

    def name_only(m):
        return None

    def author_and_date(m):
        return '%s, %s' % (m.group(3), m.group(4))

    def author_and_date_in_parens(m):
        return '(%s, %s)' % (m.group(3), m.group(4))

    def author_only(m):
        return m.group(3)

    # (pattern, function giving the authority text); order matters because
    # several patterns can match the same line.
    single_line_forms = (
        # Name only.
        (r'(?i){{taxobox[ _]+section[ _]+((?:b|tr)inomial)'
         r'(?:[ _]+(?:simple|botany|parens))? *\| *'
         r'color *= *[a-z]+ *\| *'
         r'\1_name *= *([^\|\}]*[^\|\} ]) *}}$',
         name_only),
        # Name, author and date.
        (r'(?i){{taxobox[ _]+section[ _]+((?:b|tr)inomial)'
         r'(?:[ _]+part)? *\| *'
         r'(?:color *= *[a-z]+ *\| *)?'
         r'\1_name *= *([^\|\}]*[^\|\} ]) *\| *'
         r'author *= *([^\}]*[^\} ]) *\| *'
         r'date *= *([^\|\}]*[^\|\} ]) *}}$',
         author_and_date),
        # Name and author, empty date.
        (r'(?i){{taxobox[ _]+section[ _]+((?:b|tr)inomial) *\| *'
         r'color *= *[a-z]+ *\| *'
         r'\1_name *= *([^\|\}]*[^\|\} ]) *\| *'
         r'author *= *([^\}]*[^\} ]) *\| *'
         r'date *= *}}$',
         author_only),
        # Name, empty author and empty date.
        (r'(?i){{taxobox[ _]+section[ _]+((?:b|tr)inomial)'
         r'(?:[ _]+(?:parens|botany|simple))? *\| *'
         r'color *= *[a-z]+ *\| *'
         r'\1_name *= *([^\|\}]*[^\|\} ]) *\| *'
         r'author *= *\| *'
         r'date *= *}}$',
         name_only),
        # "parens" form: the authority is written in parentheses.
        (r'(?i){{taxobox[ _]+section[ _]+((?:b|tr)inomial)'
         r'[ _]+parens(?:[ _]+part)? *\| *'
         r'(?:color *= *[a-z]+ *\| *)?'
         r'\1_name *= *([^\|\}]*[^\|\} ]) *\| *'
         r'author *= *([^\}]*[^\} ]) *\| *'
         r'date *= *([^\|\}]*[^\|\} ]) *}}$',
         author_and_date_in_parens),
        # "botany" form: author only, possibly empty.
        (botany_re, author_only),
    )

    hit = None
    authority_of = name_only
    consumed = 0
    for pattern, authority_of in single_line_forms:
        hit = re.match(pattern, lines[n])
        if hit:
            consumed = 1
            break
    else:
        # Nothing matched on one line: retry the botany form across two lines.
        if n + 1 < len(lines):
            hit = re.match(botany_re, lines[n] + lines[n+1])
            if hit:
                authority_of = author_only
                consumed = 2

    if hit:
        rank = hit.group(1)
        record(params, rank + suffix, "''%s''" % hit.group(2))
        if authority_of is not name_only:
            record(params, '%s%s_authority' % (rank, suffix), authority_of(hit))
        n += consumed

    # An image whose line mentions "range" or "distribution" is the range map.
    line = lines[n]
    map_hit = re.match(r'(?i){{taxobox[ _]+image *\| *image *= *'
                       r'\[\[ *Image: *([^\|\]]*[^\|\] ]) *'
                       r'(?:\| *([0-9]+px))?(?:\|[^\]]*)?\]\] *\| *'
                       r'caption *= *([^\}]*[^\} ]|) *}}$', line)
    if map_hit and re.search(r'(?i)(?:range|distribution)', line):
        prefix = 'range_map%s' % suffix
        record(params, prefix, map_hit.group(1))
        record(params, prefix + '_width', map_hit.group(2))
        record(params, prefix + '_caption', map_hit.group(3))
        n += 1

    return (n, n != start)

def parse(text, linkname):
    """parse(text, linkname) -- parse multi-template taxobox from 'text' and
    return it as a dictionary suitable for constructing a taxobox
    template.

    The dictionary maps {{Taxobox}} parameter names to their values; the
    extra key 'sequence' lists the parameter names in the order in which
    they were recorded.

    The taxobox is split into lines and consumed strictly in order: begin,
    status, fossil range, images, placement entries, binomial/trinomial
    sections, type species, synonyms, diversity, subdivision, end.  After
    parsing, a few common formatting mistakes in the values are repaired.

    Raises NoError (and marks 'linkname' as done in the global 'done'
    dictionary) if the text contains no multi-template taxobox, and Error if
    the taxobox cannot be understood.  Input that ends earlier than the
    parser expects can still surface as IndexError from a lines[n] lookup.
    """
    global done

    params = {'sequence': []}
    text = re.sub(r'(?m)[ \t\r]+$', '', text)

    if 1 < len(re.findall(r'(?i){{taxobox[ _]+begin *\|', text)):
        raise Error(u"Two occurrences of {{taxobox begin}}.")
    if 1 < len(re.findall(r'(?i){{taxobox[ _]+end *}}', text)):
        raise Error(u"Two occurrences of {{taxobox end}}.")
    m = re.search(r'(?is){{taxobox[ _]+begin.*{{taxobox[ _]+end *}}', text)
    if not m:
        done[linkname] = True
        raise NoError(u"Can't find taxobox.")
    # Split on newlines and on <br> tags that are followed by a template or
    # another tag.
    lines = re.split(r'(?: *(?:</?br */?>(?= *(?:{{|<))|\n) *)+', m.group(0))
    n = 0

    # {{taxobox begin}}, with color and name in either order.
    m1 = re.match(r'(?i){{taxobox[ _]+begin *\| *color *= *([a-z]+) *\| *'
                  'name *= *(.*[^ ]) *}}[ \t]*(?:<br */?> *)?$', lines[n])
    m2 = re.match(r'(?i){{taxobox[ _]+begin *\| *name *= *(.*[^ ]) *\| *'
                  'color *= *([a-z]+) *}}[ \t]*(?:<br */?> *)?$', lines[n])
    if m1:
        record(params, 'color', m1.group(1))
        record(params, 'name', m1.group(2))
        n += 1
    elif m2:
        record(params, 'color', m2.group(2))
        record(params, 'name', m2.group(1))
        n += 1
    else:
        raise Error(u"Can't find {{taxobox begin}}: %s" % lines[n])

    m = re.match(r'(?i){{(?:template:)?(status[^\}]+)}}', lines[n])
    if m:
        record(params, 'status', '{{%s}}' % m.group(1))
        n += 1

    m = re.match(r'(?i)(?:<small> *)?fossil +(?:range|record): +([^<\n]*[^<\n ]) *'
                 r'(?:</small>)?', lines[n])
    if m:
        record(params, 'fossil_range', m.group(1))
        n += 1

    if re.match(r'(?i)<!--.*-->', lines[n]):
        n += 1

    image_re = (r'(?i){{taxobox[ _]+image *\| *image *= *'
                r'\[\[ *Image: *([^\|\]]*[^\|\] ]) *'
                r'(?:\| *([0-9]+px))?(?:\|.*)?\]\] *\| *'
                r'caption *= *([^\}]*[^\} ]|) *}}$')

    # First image: on one line, spread over two lines, or without caption.
    m1 = re.match(image_re, lines[n])
    m2 = None
    if n + 1 < len(lines):
        # Only look ahead when there is a following line to join.
        m2 = re.match(image_re, lines[n] + lines[n+1])
    m3 = re.match(r'(?i){{taxobox[ _]+image *\| *image *= *'
                  r'\[\[ *Image: *([^\|\]]*[^\|\] ]) *'
                  r'(?:\| *([0-9]+px))?(?:\|.*)?\]\] *}}$', lines[n])
    if m1:
        record(params, 'image', m1.group(1))
        record(params, 'image_width', m1.group(2))
        record(params, 'image_caption', m1.group(3))
        n += 1
    elif m2:
        record(params, 'image', m2.group(1))
        record(params, 'image_width', m2.group(2))
        record(params, 'image_caption', m2.group(3))
        n += 2
    elif m3:
        record(params, 'image', m3.group(1))
        record(params, 'image_width', m3.group(2))
        n += 1

    m = re.match(image_re, lines[n])
    if m:
        record(params, 'image2', m.group(1))
        record(params, 'image2_width', m.group(2))
        record(params, 'image2_caption', m.group(3))
        n += 1

    # Skip empty image templates, removed-image placeholders and comments.
    if re.match(r'(?i){{taxobox[ _]+image *\| *image *= *\| *caption *= *}}$',
                lines[n]):
        n += 1
    if re.match(r'(?i){{taxobox[ _]+image *\| *image *= *(?:|\|.*)}}$',
                lines[n]):
        n += 1
    if re.match(r'(?i){{taxobox[ _]+image.*(?:Image with unknown copyright status removed|Unsourced image removed)', lines[n]):
        n += 1
    if re.match(r'(?i)<!--.*-->', lines[n]):
        n += 1
    if n + 1 < len(lines) and re.match(r'(?is)<!--.*-->', lines[n] + lines[n+1]):
        n += 2

    m = re.match(r'(?i){{taxobox[ _]+begin[ _]+placement *\| *'
                 r'color *= *[a-z]+ *}}$', lines[n])
    if not m:
        raise Error(u"Can't find {{taxobox begin placement}}: %s"
                          % lines[n])
    n += 1

    # Placement entries, each optionally followed by an authority.
    while n < len(lines):
        # Entry with the authority in <small> on the following line.
        m0 = None
        if n + 1 < len(lines):
            m0 = re.match(r'(?i){{taxobox[ _]+([a-z_]+)[ _]+entry[ _]*\| *'
                          r'taxon *= *([^\}]*[^\} ]) *'
                          r'<small>(.*)</?small>}}$', lines[n] + lines[n+1])
        if m0:
            record(params, m0.group(1), m0.group(2))
            record(params, m0.group(1) + '_authority', m0.group(3))
            n += 2
            continue

        m1 = re.match(r'(?i){{taxobox[ _]+([a-z_]+)[ _]+entry[ _]*\| *'
                      r'taxon *= *([^\}]*[^\} ]) *}}(?:<br */?>)?$', lines[n])
        if not m1:
            break
        record(params, m1.group(1), m1.group(2))
        n += 1
        m2 = re.match(r'(?i){{taxobox[ _]+authority *\| *'
                      r'author *= *([^\}]*[^\} ]) *\| *'
                      r'date *= *([^\|\}]*[^\|\} ]) *}}$', lines[n])
        if m2:
            record(params, m1.group(1) + '_authority',
                   '%s, %s' % (m2.group(1), m2.group(2)))
            n += 1
            continue
        m3 = re.match(r'(?i){{taxobox[ _]+authority[ _]+parens *\| *'
                      r'author *= *([^\}]*[^\} ]) *\| *'
                      r'date *= *([^\|\}]*[^\|\} ]) *}}$', lines[n])
        if m3:
            record(params, m1.group(1) + '_authority',
                   '(%s, %s)' % (m3.group(1), m3.group(2)))
            n += 1
            continue
        m4 = re.match(r'(?i){{taxobox[ _]+authority[ _]+(?:new|botany)? *\| *'
                      r'author(?:ity)? *= *([^\}]*[^\} ]) *}}$', lines[n])
        if m4:
            record(params, m1.group(1) + '_authority', m4.group(1))
            n += 1
            continue
        m5 = re.match(r'(?i)<small> *(.*[^ ]) *(?:</?small>)?', lines[n])
        if m5:
            record(params, m1.group(1) + '_authority', m5.group(1))
            n += 1
            continue

    m = re.match(r'(?i){{taxobox[ _]+end[ _]+placement(?: *\| *color *= *[a-z]+ *)?}}$', lines[n])
    if not m:
        raise Error(u"Expected {{taxobox end placement}}: %s"
                          % lines[n])
    n += 1

    # Up to four consecutive binomial/trinomial sections.
    n, found = parse_nomial('', n, lines, params)
    if found:
        n, found = parse_nomial('2', n, lines, params)
        if found:
            n, found = parse_nomial('3', n, lines, params)
            if found:
                n, found = parse_nomial('4', n, lines, params)

    m = re.match(r'(?i){{taxobox[ _]+section[ _]+type[ _]+species *\| *'
                 r'color *= *[a-z]+ *\| *'
                 r'species *= *([^\}]*[^\} ]) *\| *'
                 r'comment *= *([^\}]*[^\} ]|) *}}$', lines[n])
    if m:
        record(params, 'type_species', "''%s''" % m.group(1))
        record(params, 'type_species_authority', m.group(2))
        n += 1

    if re.match(r'(?i)<!--.*-->', lines[n]):
        n += 1

    m = re.match(r'(?i){{taxobox[ _]+begin[ _]+synonyms *\| *'
                 r'color *= *[a-z]+ *}}$', lines[n])
    if m:
        n += 1
        syn = []
        while 1:
            m1 = re.match(r'(?i){{taxobox[ _]+synonym[ _]+entry[ _]+simple'
                          r' *\| *binomial_name *= *([^\}]*[^\} ]) *}}$',
                          lines[n])
            m2 = re.match(r'(?i){{taxobox[ _]+synonym[ _]+entry[ _]+botany'
                          r' *\| *binomial_name *= *([^\}]*[^\} ]) *\| *'
                          r'author *= *([^\}]*[^\} ]) *}}$',
                          lines[n])
            m3 = re.match(r'(?i){{taxobox[ _]+synonym[ _]+entry *\| *'
                         r'binomial_name *= *([^\|\}]*[^\|\} ]) *\| *'
                         r'author *= *([^\}]*[^\} ]) *\| *'
                         r'date *= *([^\|\}]*[^\|\} ]) *}}$', lines[n])
            if m1:
                syn.append("''%s''" % m1.group(1))
            elif m2:
                syn.append("''%s'' <small>%s</small>"
                           % (m2.group(1), m2.group(2)))
            elif m3:
                syn.append("''%s'' <small>%s, %s</small>"
                           % (m3.group(1), m3.group(2), m3.group(3)))
            else:
                break
            n += 1
        record(params, 'synonyms', '<br/>'.join(syn))
        m = re.match(r'(?i){{taxobox[ _]+end[ _]+synonyms}}$', lines[n])
        if not m:
            raise Error(u"Expected {{taxobox synonyms end}} but found: %s"
                        % lines[n])
        n += 1

    # The binomial section may also come after the synonyms.
    if 'binomial' not in params and 'trinomial' not in params:
        n, found = parse_nomial('', n, lines, params)

    m = re.match(r'(?i){{taxobox[ _]+section[ _]+diversity *\| *'
                 r'color *= *[a-z]+ *\| *'
                 r'link *= *([^\}]*[^\} ]) *\| *'
                 r'diversity *= *([^\}]*[^\} ]) *}}$', lines[n])
    if m:
        record(params, 'diversity', m.group(2))
        record(params, 'diversity_link', m.group(1))
        n += 1

    m = re.match(r'(?i){{taxobox[ _]+section[ _]+(?:subdivision|list) *\| *'
                 r'color *= *[a-z]+ *\| *'
                 r'plural_taxon *= *([^\}]*[^\} ]) *}}$', lines[n])
    if not m:
        m = re.match(r'(?i){{taxobox[ _]+section[ _]+(?:subdivision|list) *\| *'
                     r'plural_taxon *= *([^\}]*[^\} ]) *\| *'
                     r'color *= *[a-z]+ *}}$', lines[n])
    if m:
        record(params, 'subdivision_ranks', m.group(1))
        n += 1
        # Everything up to the next taxobox template is the subdivision list.
        first = n
        while not re.match(r'(?i){{taxobox', lines[n]):
            n += 1
        record(params, 'subdivision', '\n' + '\n'.join(lines[first:n]))

    if re.match(r'(?i)<!--.*-->', lines[n]):
        n += 1
    if n + 1 < len(lines) and re.match(r'(?i)<!--.*-->', lines[n] + lines[n+1]):
        n += 2

    m = re.match(r'(?i){{taxobox[ _]+end *}}$', lines[n])
    if not m:
        raise Error(u"Unrecognized line: %s" % lines[n])

    # Some other checks
    if 'norank' in params:
        raise Error(u"Can't handle {{taxobox norank entry}}, sorry.")
    if 'unranked' in params:
        raise Error(u"Can't handle {{taxobox unranked entry}}, sorry.")

    # Fix some simple mistakes.

    # Italicize the name when it is the genus, binomial or trinomial.
    if ('genus' in params and 'name' in params
        and params['genus'] == "'''''%s'''''" % params['name']):
        params['name'] = "''%s''" % params['name']
    for nomial in ('binomial', 'trinomial'):
        if (nomial in params and 'name' in params
            and params[nomial] == "''%s''" % params['name']):
            params['name'] = "''%s''" % params['name']

    # Drop an image caption that only repeats a name.
    if ('image_caption' in params
        and canonize(params['image_caption'])
        in (canonize(params.get('name', '')),
            canonize(params.get('binomial', '')),
            canonize(params.get('trinomial', '')),
            canonize(params.get('genus', '')) + 'sp',
            canonize(params.get('name', '') + params.get('binomial', '')),
            )):
        del params['image_caption']

    for key in ('binomial_authority', 'trinomial_authority'):
        if key in params:
            params[key] = re.sub(r',,', ',', params[key])

    # A linked genus in bold italics loses the bold.  The brackets must be
    # escaped: unescaped, "[[.*]]" is a character class, not a wiki link.
    if 'genus' in params and re.match(r"'''''\[\[.*\]\]'''''$", params['genus']):
        params['genus'] = params['genus'][3:-3]

    for key in ('name', 'subdivision_ranks'):
        if key in params:
            m = re.match(r"<center> *(.*[^ ]) *</center>$", params[key])
            if m:
                params[key] = m.group(1)

    # Italicize ranks written plain or in bold only.  Written as an
    # alternation because a backreference to an optional group that did not
    # take part in the match never matches, which excluded the plain case.
    unitalicized = r"(?:'''[^']+'''|[^']+)$"
    for rank in ('genus', 'species', 'subspecies'):
        if rank in params and re.match(unitalicized, params[rank]):
            params[rank] = "''%s''" % params[rank]

    # The rank the article is about is shown in bold.
    if 'species' in params and 'binomial' in params and re.match(r"''[^']+''$", params['species']):
        params['species'] = "'''%s'''" % params['species']
    if 'subspecies' in params and 'trinomial' in params and re.match(r"''[^']+''$", params['subspecies']):
        params['subspecies'] = "'''%s'''" % params['subspecies']

    if 'subdivision' in params and canonize(params['subdivision']) == 'seetext':
        params['subdivision'] = '\nSee text.'

    # Rewrite a bold-italic species/subspecies in abbreviated form, taken
    # from the binomial/trinomial ("G. species", "G. s. subspecies").
    if ('binomial' in params and 'species' in params
        and re.match("'''''[^']*'''''$", params['species'])):
        m = re.match(r"'*([A-Z])[a-z-]* ([a-z-]*)'*", params['binomial'])
        if m:
            params['species'] = "'''''%s. %s'''''" % (m.group(1), m.group(2))
    if ('trinomial' in params and 'subspecies' in params
        and re.match("'''''.*'''''$", params['subspecies'])):
        m = re.match(r"'*([A-Z])[a-z-]* ([a-z])[a-z-]* ([a-z][a-z-]*)'*", params['trinomial'])
        if m:
            params['subspecies'] = "'''''%s. %s. %s'''''" % (m.group(1), m.group(2), m.group(3))

    return params

def convert(pl):
    """Convert the multi-template taxobox on page 'pl' to a single {{Taxobox}}.

    The page text (first passed through an editor when the global 'edit'
    flag is set) is parsed, the old taxobox is replaced by the new template
    and, if check() approves the result, the page is saved.  The page is
    marked in the global 'done' dictionary when the save returns empty data.

    Exceptions raised by pl.get(), parse() and check() propagate to the
    caller.
    """
    global done

    text = pl.get()
    if edit:
        text = edittext(text)
    params = parse(text, pl.title())

    # parse() may delete a key again after recording it, so only emit the
    # names from 'sequence' that are still present.
    fields = ['| %s = %s\n' % (k, params[k])
              for k in params['sequence'] if k in params]
    taxobox = '{{Taxobox\n' + ''.join(fields) + '}}'

    # Pass the replacement as a function: a plain string would be treated as
    # a template in which backslashes and \g<...> from the page text are
    # expanded.
    newtext = re.sub(r'(?is){{taxobox[ _]+begin *\|.*{{taxobox[ _]+end *}}',
                     lambda match: taxobox, text)
    newtext = check(pl.get(), newtext)
    if newtext:
        status, reason, data = pl.put(newtext, u'nomialbot — converted multi-template taxobox to {{Taxobox}}')
        if data == '':
            done[pl.title()] = True

def convertmany():
    """Convert every page in the global 'linknames' list, starting at index 'n'.

    Pages already marked in 'done' are skipped.  Page texts are fetched in
    batches of up to 50 pages that are not done yet.  Pages outside the main
    namespace and redirects are marked done without conversion; locked pages
    and pages that fail with Error are reported and left for a later run.
    The global 'n' is advanced after each page, so progress is kept even if
    the loop is left through an exception.
    """
    global site, n, linknames, done
    pages = [wikipedia.Page(site, name) for name in linknames]
    fetched = set()
    while n < len(linknames):
        name = linknames[n]
        page = pages[n]
        try:
            if not done.get(name):
                if name not in fetched:
                    tofetch = [p for p in pages[n:]
                               if not done.get(p.title())][:50]
                    wikipedia.getall(site, tofetch)
                    fetched.update([p.title() for p in tofetch])
                wikipedia.output("Trying %s" % name)
                # The original referred to an undefined name 'pl' here.
                if page.namespace() != 0:
                    done[page.title()] = True
                    wikipedia.output(u"%s not in main namespace" % page.title())
                else:
                    convert(page)
        except wikipedia.LockedPage:
            wikipedia.output("%s is locked" % name)
        except wikipedia.IsRedirectPage:
            wikipedia.output("%s is redirect" % name)
            done[name] = True
        except NoError:
            # No taxobox on the page; parse() has already marked it done.
            pass
        except Error:
            # sys.exc_info() is spelled the same on every Python version,
            # unlike the "except Error, e" / "except Error as e" forms.
            wikipedia.output(u'***' + sys.exc_info()[1].text)
        n += 1

def main():
    """Command-line entry point.

    Options:
      -n, --no-checks     save without asking for confirmation
      -e, --edit          edit each page's text before parsing it
      -d, --debug         trace the parameters found while parsing
      -o N, --offset=N    start at index N of the page list
      -r P, --reload=P    rebuild the page list from the pages referring to P

    Any further arguments are titles of pages to convert; without them the
    whole page list is processed.  Progress (index, page list and 'done'
    dictionary) is loaded from the pickle file 'taxoconvert.db' and written
    back on exit, even when the run is interrupted by an exception.
    """
    global checks, edit, debug
    global n, linknames, done
    offset = None
    reload_from = None
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'r:dneo:',
                                   ['reload=', 'debug', 'no-checks', 'edit', 'offset='])
    except getopt.GetoptError:
        print("Bad command line")
        return
    for o, a in opts:
        if o in ('-n', '--no-checks'):
            checks = False
        elif o in ('-o', '--offset'):
            offset = int(a)
        elif o in ('-e', '--edit'):
            edit = True
        elif o in ('-d', '--debug'):
            debug = True
        elif o in ('-r', '--reload'):
            reload_from = a
        else:
            print("Bad option: %s" % o)
            return

    # Load the saved state.  The file is the script's own output; pickle must
    # not be pointed at data from an untrusted source.
    done = {}
    have_state = False
    try:
        f = open('taxoconvert.db', 'rb')
        try:
            n, linknames, done = pickle.load(f)
        finally:
            f.close()
        have_state = True
    except IOError:
        pass

    if reload_from or not have_state:
        if reload_from:
            tb = wikipedia.Page(site, reload_from)
            linknames = [p.title() for p in tb.getReferences()]
            print("%d pages found" % len(linknames))
        else:
            # Neither saved state nor a page to build the list from.  The
            # original passed None as the page title in this case.
            print("No saved state in taxoconvert.db and no --reload page "
                  "given; starting with an empty page list")
            linknames = []
        n = 0

    try:
        if offset is not None:
            n = offset
        if args:
            for aa in args:
                convert(wikipedia.Page(site, aa))
        else:
            convertmany()
    finally:
        # Write to a new file first and rename it, so that an interrupted
        # write does not leave a truncated state file behind.
        f = open('taxoconvert.db.new', 'wb')
        try:
            pickle.dump((n, linknames, done), f)
        finally:
            f.close()
        os.rename('taxoconvert.db.new', 'taxoconvert.db')

# Script entry point.  The finally clause makes sure wikipedia.stopme() is
# called however main() exits, including through an exception.
if __name__ == '__main__':
    try:
        main()
    finally:
        wikipedia.stopme()