# User:Salix alba/subsup.py

# From Wikipedia, the free encyclopedia
import sys
import re

# True when the script is started with ``-d`` as its first argument; selects
# the verbose output described in find_unbalanced().
dump = len(sys.argv) > 1 and sys.argv[1] == '-d'

# Patterns are matched one input line at a time.  The sub/sup tags are looked
# for in their entity-escaped form (``&lt;sub&gt;`` and so on).
titleRE = re.compile('<title>(.*)</title>')
nsRE = re.compile('<ns>(.*)</ns>')
subsupRE = re.compile('&lt;/?su[pb]&gt;')
pageEndRE = re.compile('</page>')

# Text of the <ns> element a page must carry to be reported.  The regex
# capture is a string, so the comparison has to be against a string as well:
# comparing it with the integer 0 is never true and suppresses every report.
REPORT_NS = '0'


def find_unbalanced(lines, dump_mode=False):
    """Yield the report text for pages whose sub/sup tags do not pair up.

    ``lines`` is any iterable of text lines (presumably a MediaWiki XML
    export -- the code only relies on ``<title>``, ``<ns>`` and ``</page>``
    appearing in the lines).  Opening and closing escaped ``sub``/``sup``
    tags are counted per page; the counters restart at every ``<title>``.

    Without ``dump_mode``, the title of each page in namespace ``0`` is
    yielded at its ``</page>`` line when the number of opening and closing
    tags differs for ``sub`` or for ``sup``.

    With ``dump_mode``, three kinds of items are yielded, in input order:

    * every line containing a ``<title>`` element, unchanged;
    * for every line whose own open/close counts differ,
      ``"<sub open> <sub close> <sup open> <sup close> <line>"`` with
      ``&lt;``/``&gt;`` replaced by ``<``/``>``;
    * at ``</page>`` of a namespace-``0`` page containing at least one tag,
      ``"<title> <sup open> <sup close> <sub open> <sub close>"``.

    Yielded strings carry no added newline; the caller is expected to print
    each one followed by a newline.
    """
    title = ""
    ns = None
    supOc = supCc = subOc = subCc = 0

    for line in lines:
        m = titleRE.search(line)
        if m:
            # A title starts a new page: restart all per-page state,
            # including the namespace, so that a page without an <ns> line
            # cannot inherit the previous page's namespace.
            title = m.group(1)
            ns = None
            supOc = supCc = subOc = subCc = 0
            if dump_mode:
                yield line

        m = nsRE.search(line)
        if m:
            ns = m.group(1).strip()

        tags = subsupRE.findall(line)
        c1 = tags.count('&lt;sub&gt;')
        c2 = tags.count('&lt;/sub&gt;')
        c3 = tags.count('&lt;sup&gt;')
        c4 = tags.count('&lt;/sup&gt;')
        subOc += c1
        subCc += c2
        supOc += c3
        supCc += c4

        if dump_mode and (c1 != c2 or c3 != c4):
            readable = line.replace('&lt;', '<').replace('&gt;', '>')
            yield '%d %d %d %d %s' % (c1, c2, c3, c4, readable)

        has_tags = supOc > 0 or supCc > 0 or subOc > 0 or subCc > 0
        if pageEndRE.search(line) and ns == REPORT_NS and has_tags:
            if dump_mode:
                yield '%s %d %d %d %d' % (title, supOc, supCc, subOc, subCc)
            elif supOc != supCc or subOc != subCc:
                yield title


if __name__ == '__main__':
    # print() with a single argument behaves the same on Python 2 and 3.
    for text in find_unbalanced(sys.stdin, dump):
        print(text)