User:AdultSwim/reflist

From Wikipedia, the free encyclopedia

This script returns all references used on a page. I have not tested it on pages with nested or broken references, strange ref names, or pages with other issues. --AdultSwim (talk) 22:10, 25 July 2008 (UTC)

from BeautifulSoup import BeautifulStoneSoup
import wikipedia
import pagegenerators
import re

# Factory that the argument loop asks to recognise page-generator options.
genFactory = pagegenerators.GeneratorFactory()
# Holds the last generator returned by genFactory.handleArg(), if any.
# NOTE(review): assigned during argument parsing but not read anywhere else
# in the visible script.
gen = None
# Command-line words that were neither -history nor a generator option;
# they are joined with single spaces to form the page title.
pageTitleParts = []
pageTitle = "Kim Deal" #default page, used when no title is given on the command line
# Set to True by the -history argument: older revisions are then scanned too.
history = False

#Call the script with: C:\path\ref.py Page Title
#Don't worry about spaces in the title; they are OK
#Special characters like '&' are a known problem, though
#For those, try quoting the title: C:\path\ref.py "Page&Title"
#For titles with Unicode characters, just replace the pageTitle at the top of this script

# Matches self-closing ref stubs such as <ref name="Stub"/>. Unlike a pattern
# anchored on "ref name=", this also catches stubs whose first attribute is
# something else (e.g. <ref group="n" name="Stub" />) and upper-case tags.
# The \b keeps <references/> from being matched.
_REF_STUB_RE = re.compile(r'<\s*ref\b[^>]*?/\s*>', re.IGNORECASE)


def getrefs(text):
    """Return every <ref> element found in a page's wikitext.

    text -- the raw wikitext of a page revision.

    Returns the result of BeautifulStoneSoup's findAll('ref') on the text
    after all self-closing ref stubs have been removed.
    """
    # Beautiful Soup does not cope with self-closing ref stubs in wikimarkup:
    # it tries to complete the tags and ends up nesting the references.
    # Strip every stub before parsing to prevent that.
    text = _REF_STUB_RE.sub("", text)

    soup = BeautifulStoneSoup(text)
    return soup.findAll('ref')

def print_array(refarray):
    """Send each reference to the bot's output, each followed by an empty line."""
    for item in refarray:
        rendered = str(item)
        wikipedia.output(rendered)
        wikipedia.output("")

# Sort the command-line arguments into three buckets: the -history switch,
# options understood by the generator factory, and everything else, which is
# taken to be a piece of the page title.
for argument in wikipedia.handleArgs():
    if argument.startswith("-history"):
        history = True
        continue
    produced = genFactory.handleArg(argument)
    if produced:
        gen = produced
    else:
        pageTitleParts.append(argument)

# Only override the default title when at least one title word was supplied.
if pageTitleParts:
    pageTitle = ' '.join(pageTitleParts)


# Main script body: load the requested page and print its references, either
# from the current text only or (with -history) from past revisions as well.
page = wikipedia.Page(wikipedia.getSite(), pageTitle)

if history:
    # Convert the answer to an int *before* testing it against 0. The
    # previous code compared the raw answer with 0 and only called int() in
    # the else branch, so the "0 for all" path could not be reached when the
    # prompt hands back text.
    editcount = int(wikipedia.input(u'Please enter the number of edits to retrieve (0 for all): '))
    if editcount == 0:
        wikipedia.output("Searching all previous versions")
        vh = page.getVersionHistory(getAll=True)
    else:
        wikipedia.output("Searching %s previous versions" % editcount)
        vh = page.getVersionHistory(revCount=editcount)

    # Collect each distinct reference once, in order of first appearance.
    refarray = []
    for entry in vh:
        # The first field of a history entry is used as the revision id.
        oldid = entry[0]
        wikipedia.output(oldid)
        text = page.getEditPage(oldid=oldid)[0]
        # A separate loop name avoids shadowing the history entry above.
        for ref in getrefs(text):
            if ref not in refarray:
                refarray.append(ref)
    wikipedia.output("")
    print_array(refarray)

else:
    text = page.get()
    refarray = getrefs(text)
    print_array(refarray)

This page is GFDL. Feel free to use it for personal/educational purposes, but give a shout-out if you repost it or build on it further.