Jump to content

User:BogBot/Source code/Task 03

From Wikipedia, the free encyclopedia
#!/usr/bin/python
# -*- coding: UTF-8 -*-

# Bot Script to populate new clinical fields in Drugbox templates in Wikipedia drug articles.
# The new fields are:
# | tradename   =  <!-- comma separated list of tradenames --> 
# | Drugs.com   =  <!--  link to Drugs.com monograph, e.g., "lisinopril" that links to "http://www.drugs.com/monograph/lisinopril.html" -->
# | MedlinePlus =  <!-- MedlinePlus drug accession number, e.g.,  "a692051" that links to "http://www.nlm.nih.gov/medlineplus/druginfo/meds/a692051.html" --> 
# In addition, populate KEGG_Drug_ID, ChemSpider_ID, and PubChem_Compound_ID fields if missing with data from http://www.drugbank.ca/system/downloads/current/drugbank.txt.zip
# The tradenames were obtained from http://www.merckmanuals.com/home/drugnames-index/trade/a.html
# Finally the script sorts the fields in the order they are currently rendered by the drugbox template
# (in the order of clinical, pharmacokinetic, identifiers, and chemical data)

"""{{Drugbox
| Watchedfields
| verifiedrevid = 408577806
| IUPAC_name        = 
| OtherNames        = 
| image             = 
| width             = 
| alt               = 
| image2            = 
| width2            = 
| alt2              = 
| imagename         = <!-- else may use drug_name -->
| drug_name         = <!-- else may use imagename -->
| caption           = 

<!--Clinical data-->
| tradename         =  
| Drugs.com         =
| MedlinePlus       =
| licence_EU        = <!-- EMA requires brand name -->
| licence_US        = <!-- FDA may use generic name -->
| DailyMedID        = <!-- preference to licence_US -->
| pregnancy_AU      = <!-- A / B1 / B2 / B3 / C / D / X -->
| pregnancy_US      = <!-- A / B            / C / D / X -->
| pregnancy_category= 
| legal_AU = <!-- S2, S3, S4, S5, S6, S7, S8, S9 or Unscheduled-->
| legal_CA = <!-- OTC, Rx-only, Schedule I, II, III, IV, V, VI, VII, VIII -->
| legal_UK = <!-- GSL, P, POM, CD, CD Lic, CD POM, CD No Reg POM, CD (Benz) POM, CD (Anab) POM or CD Inv POM -->
| legal_US = <!-- OTC / Rx-only / Schedule I, II, III, IV, V -->
| legal_status      = 
| dependency_liability = 
| routes_of_administration = 

<!--Pharmacokinetic data-->
| bioavailability   = 
| protein_bound     = 
| metabolism        = 
| elimination_half-life = 
| excretion         = 

<!--Identifiers-->
| CAS_number        = 
| CAS_supplemental  = 
| ATCvet            = 
| ATC_prefix        = <!-- 'none' if uncategorised -->
| ATC_suffix        = 
| ATC_supplemental  = 
| PubChem           = 
| PubChemSubstance  = 
| IUPHAR_ligand     = 
| DrugBank          = 
| ChemSpiderID      = 
| UNII              =
| KEGG              =
| ChEBI             =
| ChEMBL            =

<!--Chemical data-->
| chemical_formula  = 
| C= | H= | Ag= | As= | Au= | B= | Bi= | Br= | Cl= | Co= | F= | Fe= | Gd= | I=
| K= | Mn= | N= | Na= | O= | P= | Pt= | S= | Sb= | Se= | Sr= | Tc= | Zn= | charge=
| molecular_weight  = 
| smiles            = 
| StdInChI          =
| StdInChI_comment  =
| StdInChIKey       =
| synonyms          = 
| density           = 
| melting_point     = 
| melting_high      = 
| melting_notes     = 
| boiling_point     = 
| boiling_notes     = 
| solubility        = 
| specific_rotation = 
| sec_combustion    = 

<!--Combo data-->
| type              = combo
| drug_name         = 
| component1        = <!-- Drugname, automatically linked -->
| class1            = <!-- Group, manual link using [[..|..]] -->
| component2        = <!-- Drugname, automatically linked -->
| class2            = <!-- Group, manual link using [[..|..]] -->
| component3        = <!-- Drugname, automatically linked -->
| class3            = <!-- Group, manual link using [[..|..]] -->
| component4        = <!-- Drugname, automatically linked -->
| class4            = <!-- Group, manual link using [[..|..]] -->

<!--Monoclonal antibody data-->
| type              = mab
| image             = 
| width             = 
| alt               = 
| image2            = 
| width2            = 
| alt2              = 
| imagename         = <!-- else may use drug_name -->
| drug_name         = <!-- else may use imagename -->
| mab_type          = <!-- mab, Fab, F(ab')2, Fab', scFv, di-scFv, 3funct, clFab, BiTE -->
| source            = <!-- a, e, i, o, u, xi/a, zu/a, xizu/a, axo, ... -->
| target            = <!-- antigen -->

<!--Vacine data-->
| type              = vaccine
| image             = 
| alt               = 
| width             = 
| image2            = 
| alt2              = 
| width2            = 
| imagename         = <!-- else may use drug_name -->
| drug_name         = <!-- else may use imagename -->
| target            = <!-- the antigen/bacteria/toxin/virus to protect against -->
| vaccine_type      = <!-- killed/attenuated/live/toxoid/protein subunit/subunit/conjugate/recombinant/DNA -->

}}"""

from collections import defaultdict
import codecs
import csv
import re
import string
import sys
import urllib
import urlparse
import wikipedia

# compiled regular expression

# Account name that the bot-exclusion pattern below looks for in "deny=" lists.
user = "BogBot"

# Matches {{nobots}} as well as {{bots|...}} when its argument is one of:
# allow=none, a deny= value that contains the user name, optout=all, deny=all.
regexp_ab = re.compile(
    r'\{\{(nobots|bots\|('
    r'allow=none'
    r'|deny=.*?' + user + r'.*?'
    r'|optout=all'
    r'|deny=all'
    r'))\}\}'
)

# adapted from http://mcstrother.wordpress.com/2011/02/22/scraping-and-parsing-wikipedia-infoboxes-in-python/
# Pattern source used to locate the drugbox template inside article text.
exp = (
    r'\{\{'        # opening braces of the infobox
    r'\s*'         # optional whitespace
    r'[Dd]rugbox'  # template name, first letter in either case
    r'.*\}\}'      # greedy run of anything up to a closing pair of braces
)

# DOTALL lets the greedy ".*" run across newlines, so the match can extend well
# past the drugbox itself; find_drugbox_from_text trims it by counting braces.
regexp_drug_infobox    = re.compile(exp, re.DOTALL)
# One "| name = value" parameter; groups PARAM and VALUE.
regexp_param           = re.compile(r"^\s*?\|\s*?(?P<PARAM>\S+)\s*?=\s*?(?P<VALUE>.+)\s*?($|\|)")
# Text enclosed by a "{{" ... "}}" pair; group PARAMS.
regexp_nested_template = re.compile(r"\{\{(?P<PARAMS>.+)\}\}")

# Single-character bracket patterns.  Each name now matches the character it
# describes: previously the two curly-bracket names were bound to the opposite
# characters.
regexp_open_square_bracket          = re.compile(r"\[",   re.DOTALL)
regexp_close_square_bracket         = re.compile(r"\]",   re.DOTALL)
regexp_open_curly_bracket           = re.compile(r"\{",   re.DOTALL)
regexp_close_curly_bracket          = re.compile(r"\}",   re.DOTALL)

# Doubled bracket patterns (wiki links "[[...]]" and templates "{{...}}").
# Previously the double-close square pattern matched "[[" and the two curly
# names were swapped.
regexp_double_open_square_bracket   = re.compile(r"\[\[", re.DOTALL)
regexp_double_close_square_bracket  = re.compile(r"\]\]", re.DOTALL)
regexp_double_open_curly_bracket    = re.compile(r"\{\{", re.DOTALL)
regexp_double_close_curly_bracket   = re.compile(r"\}\}", re.DOTALL)

# Patterns for drugbox parameters whose value is a nested template.  Each one
# captures the template text in the named group TEMPLATE.

# ATC_supplemental =  {{ATC|B01|AC06}}, {{ATC|N02|BA01}}
# ("\{\{\s*" replaces the former "\{\{s*", which allowed stray letters "s"
# instead of whitespace between the braces and the template name.)
regexp_ATC_supplemental = re.compile(r"\|\s*?ATC_supplemental\s*?=\s*?(?P<TEMPLATE>.*\{\{\s*(ATC).+?\}\})\s*?($|\|)")
# CASNo_Ref = {{cascite|correct|CAS}}
regexp_CASNo_Ref = re.compile(r"\|\s*?CASNo_Ref\s*?=\s*?(?P<TEMPLATE>\{\{(Cascite|cascite).*?\}\})")
# CAS_supplemental = {{CAS|405-41-4}}
regexp_CAS_supplemental = re.compile(r"\|\s*?CAS_supplemental\s*?=\s*?(?P<TEMPLATE>\{\{CAS.*?\}\})")
# ChEMBL_Ref = {{ebicite|correct|EBI}}
regexp_ChEMBL_Ref = re.compile(r"\|\s*?ChEMBL_Ref\s*?=\s*?(?P<TEMPLATE>\{\{(Ebicite|ebicite).*?\}\})")
# ChemSpiderID_Ref = {{chemspidercite|correct|chemspider}}
regexp_ChemSpiderID_Ref = re.compile(r"\|\s*?ChemSpiderID_Ref\s*?=\s*?(?P<TEMPLATE>\{\{(Chemspidercite|chemspidercite).*?\}\})")
# Drugs.com = {{drugs.com|monograph|lisinopril}}
regexp_Drugs_com = re.compile(r"\|\s*?Drugs\.com\s*?=\s*?(?P<TEMPLATE>\{\{(Drugs\.com|drugs\.com).*?\}\})")
# KEGG_Ref = {{keggcite|correct|kegg}}
regexp_KEGG_Ref = re.compile(r"\|\s*?KEGG_Ref\s*?=\s*?(?P<TEMPLATE>\{\{(Keggcite|keggcite).*?\}\})")
# StdInChI_Ref = {{stdinchicite|correct|chemspider}}
regexp_StdInChI_Ref = re.compile(r"\|\s*?StdInChI_Ref\s*?=\s*?(?P<TEMPLATE>\{\{(Stdinchicite|stdinchicite).*?\}\})")
# StdInChIKey_Ref = {{stdinchicite|correct|chemspider}}
# (The former "=\*s?" demanded a literal "*" after the equals sign, so the
# pattern could not match the example above; the template body is now matched
# lazily like the sibling patterns so it stops at the first "}}".)
regexp_StdInChIKey_Ref = re.compile(r"\|\s*?StdInChIKey_Ref\s*?=\s*?(?P<TEMPLATE>\{\{(Stdinchicite|stdinchicite).*?\}\})")
# UNII_Ref = {{fdacite|changed|FDA}}
regexp_UNII_Ref = re.compile(r"\|\s*?UNII_Ref\s*?=\s*?(?P<TEMPLATE>\{\{(Fdacite|fdacite).*?\}\})")

# named ref tag = <ref name="fattinger2000"> but not <ref name="fattinger2000" />
regexp_ref_tag_begin     = re.compile(r"<ref>|<ref name.*?[^/]>")
regexp_ref_tag_end       = re.compile(r"</ref>")
# Citation templates ({{cite ...}} / {{Cite ...}}); group TEMPLATE holds the
# text after "cite".  The class is "[Cc]" - the former "[C|c]" also accepted a
# literal pipe as the first character.
regexp_citation_template = re.compile(r"\{\{[Cc]ite\s*?(?P<TEMPLATE>.*?)\}\}")

# href='/monograph/maprotiline-hydrochloride.html'
# (raw string so that "\." reaches the regex engine without relying on an
# unrecognised string escape)
regexp_monograph_url     = re.compile(r"href='/monograph/(?P<STEM>.*?)\.html'",   re.DOTALL)

# http://www.nlm.nih.gov/medlineplus/druginfo/meds/a604021.html
# (host dots are escaped so they only match literal dots)
regexp_medlineplus_url     = re.compile(r"www\.nlm\.nih\.gov/medlineplus/druginfo/meds/(?P<ACNO>.*?)\.html",   re.DOTALL)

def Allowbots(text):
    """Tell whether the bot may edit a page with the given wikitext.

    Returns False when regexp_ab finds a bot-exclusion template in text,
    True otherwise.
    """
    return regexp_ab.search(text) is None

def urlEncodeNonAscii(b):
    """Percent-encode every character of b in the range 0x80-0xFF.

    Each such character is replaced by "%" followed by its two-digit
    lower-case hex code; all other characters are returned unchanged.
    """
    def _percent_escape(found):
        return '%%%02x' % ord(found.group(0))

    return re.sub('[\x80-\xFF]', _percent_escape, b)

def iriToUri(iri):
    """Convert an IRI into a URI.

    The IRI is split with urlparse.urlparse; the component at index 1 (the
    network location) is IDNA-encoded, every other component is UTF-8 encoded
    and passed through urlEncodeNonAscii, and the pieces are reassembled with
    urlparse.urlunparse.
    """
    converted = []
    for position, component in enumerate(urlparse.urlparse(iri)):
        if position == 1:
            converted.append(component.encode('idna'))
        else:
            converted.append(urlEncodeNonAscii(component.encode('utf-8')))
    return urlparse.urlunparse(converted)

def find_drugbox_from_text(article_text):
    """Locate the drugbox template inside article_text.

    adapted from http://mcstrother.wordpress.com/2011/02/22/scraping-and-parsing-wikipedia-infoboxes-in-python/

    Returns None when regexp_drug_infobox does not match, or when the braces
    of the matched text never balance.  Otherwise returns a 4-tuple of
    indexes into article_text:

        (first_pipe, closing_brace, template_start, after_template)

    first_pipe     -- position of the first "|" in the matched text
                      (template_start - 1 if the matched text has no pipe)
    closing_brace  -- position of the brace just before the final balancing "}"
    template_start -- position where the regex match begins
    after_template -- position one past the final balancing "}"
    """
    search_result = regexp_drug_infobox.search(article_text)
    if not search_result:
        return None
    result_text = search_result.group(0)  # the entire matching sequence
    begin = search_result.start()

    # The regex is greedy and may run past the drugbox, so walk the match and
    # stop at the brace that brings the nesting depth back to zero.
    depth = 0
    last_ind = None
    for ind, c in enumerate(result_text):
        if c == '}':
            depth -= 1
        elif c == '{':
            depth += 1
        if depth == 0 and ind != 0:
            last_ind = ind
            break

    # Unbalanced braces: there is no closing position to report.  Previously
    # this fell through to "begin + None" and raised TypeError.
    if last_ind is None:
        return None

    offset = result_text.find('|')
    return (begin + offset, begin + last_ind - 1, begin, begin + last_ind + 1)

def drugbank():
    """Load the augmented DrugBank link table from its CSV file.

    drugbank data obtained from http://www.drugbank.ca/system/downloads/current/drugbank.txt.zip

    Returns a dict keyed by the first column of each row; the value is the
    list of the remaining columns.  According to the column legend kept with
    this loader, the value positions are:

                0           1           2           3               4               5                   6               7                       8
        Name    Trade_Names Drug_Type   MedlinePlus Drugs.com_link  KEGG_Drug_ID    KEGG_Compound_ID    ChemSpider_ID   PubChem_Compound_ID     DrugBank_ID
    """
    drugbank_data = {}

    # "with" closes the file once reading is done; the handle used to be left open.
    with open('/Users/BogBot/progs/pywikipedia/drugbox/drug_links_agumented.csv', 'rU') as csv_file:
        for row in csv.reader(csv_file, dialect='excel'):
            if not row:
                # csv.reader yields [] for a blank line; there is no key to store
                continue
            drugbank_data[row[0]] = row[1:]

    return drugbank_data

def assign_nested_templates(parameters, current_parameters):
    """Extract nested templates commonly used in drugbox templates.

    For each known parameter whose pattern is found in parameters, the
    captured TEMPLATE text is stored in current_parameters under the
    parameter name and every occurrence of the pattern is removed from
    parameters (ATC_supplemental is replaced by a single "|", all others by
    the empty string).  The patterns are applied one after another, each to
    the text left over by the previous one.

    Returns the remaining parameter text.
    """
    # (key in current_parameters, pattern, replacement text) in processing order
    extraction_plan = (
        ('ATC_supplemental', regexp_ATC_supplemental, "|"),
        ('ChEMBL_Ref',       regexp_ChEMBL_Ref,       ""),
        ('CASNo_Ref',        regexp_CASNo_Ref,        ""),
        ('CAS_supplemental', regexp_CAS_supplemental, ""),
        ('ChemSpiderID_Ref', regexp_ChemSpiderID_Ref, ""),
        ('Drugs.com',        regexp_Drugs_com,        ""),
        ('KEGG_Ref',         regexp_KEGG_Ref,         ""),
        ('StdInChI_Ref',     regexp_StdInChI_Ref,     ""),
        ('UNII_Ref',         regexp_UNII_Ref,         ""),
    )

    for key, pattern, replacement in extraction_plan:
        found = pattern.search(parameters)
        if found:
            current_parameters[key] = found.group('TEMPLATE')
            parameters = pattern.sub(replacement, parameters)

    return parameters

 
def rejoin(begin, end, sub_strings, type):
    """Glue back pieces that were split inside a begin/end delimited span.

    make sure we split only on the pipes that represent ends of the infobox
    entry, not the pipes used in links, nested templates, citations, etc.
    adapted from http://mcstrother.wordpress.com/2011/02/22/scraping-and-parsing-wikipedia-infoboxes-in-python/

    A piece (other than the first) is merged into the piece before it when it
    contains end and either lacks begin or has end ahead of begin.  With
    type "line" the two are concatenated directly; with type "parameter" a
    "|" is put back between them.  For any other type such a piece is
    dropped.  All remaining pieces are kept as separate entries.

    Returns the new list; sub_strings itself is not modified.
    """
    merged = [sub_strings[0]]
    for piece in sub_strings[1:]:
        end_at = piece.find(end)
        begin_at = piece.find(begin)
        closes_earlier_span = end_at != -1 and (begin_at == -1 or end_at < begin_at)
        if not closes_earlier_span:
            merged.append(piece)
        elif type == "line":
            merged[-1] += piece
        elif type == "parameter":
            merged[-1] += '|' + piece
    return merged
  
def test_disjoint(begin,end,sub_strings):
    disjoint = False
    for sub_string in sub_strings:
       if (end in sub_string) and ((not begin in sub_string) or sub_string.find(end) < sub_string.find(begin)):
           disjoint = True
           break
    return disjoint

def regex_rejoin(regexp_begin, regexp_end, sub_strings, type):
    """Regex-driven counterpart of rejoin.

    make sure we split only on the pipes that represent ends of the infobox
    entry, not the pipes used in links, nested templates, citations, etc.
    adapted from http://mcstrother.wordpress.com/2011/02/22/scraping-and-parsing-wikipedia-infoboxes-in-python/

    For every piece after the first, the matches of regexp_begin and
    regexp_end are counted.  The piece is merged into the one before it when
    it has end matches but no begin match, or when it has fewer end matches
    than begin matches.  With type "line" the pieces are concatenated
    directly; with type "parameter" a "|" is put back between them; for any
    other type such a piece is dropped.

    Returns the new list; sub_strings itself is not modified.
    """
    merged = [sub_strings[0]]
    for piece in sub_strings[1:]:
        begin_count = len(regexp_begin.findall(piece))
        end_count = len(regexp_end.findall(piece))
        end_without_begin = end_count > 0 and begin_count == 0
        if not (end_without_begin or end_count < begin_count):
            merged.append(piece)
        elif type == "line":
            merged[-1] += piece
        elif type == "parameter":
            merged[-1] += '|' + piece
    return merged

def regex_test_disjoint(regexp_begin, regexp_end, sub_strings):
    """Regex-driven counterpart of test_disjoint.

    Walks sub_strings and returns True at the first piece for which an end
    match has been seen without a begin match, or the recorded end count is
    lower than the recorded begin count; returns False if no piece triggers.

    NOTE(review): the seen flags and the counts are NOT reset between pieces -
    a piece without any match inherits the values recorded for earlier pieces.
    regex_rejoin evaluates every piece from scratch, so the two can disagree;
    confirm whether the carry-over is intended before changing it.
    """
    seen_begin = seen_end = False
    begin_count = end_count = 0
    for piece in sub_strings:
        begin_hits = regexp_begin.findall(piece)
        if begin_hits:
            seen_begin, begin_count = True, len(begin_hits)
        end_hits = regexp_end.findall(piece)
        if end_hits:
            seen_end, end_count = True, len(end_hits)
        if (seen_end and not seen_begin) or end_count < begin_count:
            return True
    return False

def pad_parameters(text):
    """Normalise the spacing inside citation templates found in text.

    For every argument string captured by regexp_citation_template, the
    pieces around each "|" and then around each "=" are stripped and
    re-joined with " | " and " = " (with one leading space), and every
    occurrence of the captured string in text is replaced by the padded form.

    Returns the updated text.
    """
    for match in regexp_citation_template.findall(text):
        if not match:
            # A template without arguments captures ''.  Calling
            # text.replace('', ...) would insert the padding between every
            # character of text, so there is nothing sensible to pad here.
            continue

        pipe_fields = match.split("|")
        padded = " " + pipe_fields[0].strip()
        padded += "".join(" | " + field.strip() for field in pipe_fields[1:])

        equals_fields = padded.split("=")
        padded = " " + equals_fields[0].strip()
        padded += "".join(" = " + field.strip() for field in equals_fields[1:])

        text = text.replace(match, padded)

    return text

def parse_line(line, current_parameters):
    """Parse one drugbox line and record its parameters.

    A line with more than one "|" is split on the pipes; pieces that were cut
    inside a wiki link ([[...]]), a nested template ({{...}}) or a <ref> tag
    are glued back together, and every resulting piece containing "=" is
    stored as name/value.  Any other line is matched against regexp_param.

    Names and values are UTF-8 encoded and stripped, values are passed
    through pad_parameters, and an empty value is stored as a single space.
    Results are written into current_parameters; nothing is returned.
    """
    if line.count('|') > 1:
        sub_strings = line.split("|")

        # make sure we split only on the pipes that represent ends of the
        # infobox entry, not the pipes used in links
        sub_strings = rejoin('[[', ']]', sub_strings, 'parameter')

        # do the same for nested templates; stop as soon as a pass merges
        # nothing - the first piece can never be merged, so a stray "}}" there
        # used to keep this loop spinning forever
        while test_disjoint('{{', '}}', sub_strings):
            merged = rejoin('{{', '}}', sub_strings, 'parameter')
            if len(merged) == len(sub_strings):
                break
            sub_strings = merged

        # do the same for citations, with the same no-progress guard
        while regex_test_disjoint(regexp_ref_tag_begin, regexp_ref_tag_end, sub_strings):
            merged = regex_rejoin(regexp_ref_tag_begin, regexp_ref_tag_end, sub_strings, 'parameter')
            if len(merged) == len(sub_strings):
                break
            sub_strings = merged

        # now assign the parameters
        for sub_string in sub_strings:
            if "=" in sub_string:
                parts     = sub_string.split("=", 1)
                parameter = str(parts[0].encode("utf-8")).strip()
                value     = str(parts[1].encode("utf-8")).strip()
                value     = pad_parameters(value)
                if not value:
                    value = " "
                current_parameters[parameter] = value
    else:
        result_drug_param = regexp_param.search(line)
        if result_drug_param:
            parameter = (result_drug_param.group('PARAM').encode("utf-8")).strip()
            value     = (result_drug_param.group('VALUE').encode("utf-8")).strip()
            value     = pad_parameters(value)
            if not value:
                value = " "
            current_parameters[parameter] = value

    return
        
def remove_embedded_carriage_returns(parameters):
    """Remove embedded carriage returns from templates.

    parameters is split into lines; lines that were cut inside a nested
    template ({{...}}) or a <ref> tag are concatenated back onto the line
    before them, and the result is joined with "\\n" again.

    Returns the re-joined text.
    """
    lines = parameters.splitlines()

    # Stop as soon as a pass merges nothing: the first line can never be
    # merged, so without this guard a stray "}}" there looped forever.
    while test_disjoint('{{', '}}', lines):
        merged = rejoin('{{', '}}', lines, 'line')
        if len(merged) == len(lines):
            break
        lines = merged

    while regex_test_disjoint(regexp_ref_tag_begin, regexp_ref_tag_end, lines):
        # NOTE(review): the end/begin patterns are passed to regex_rejoin in
        # the opposite order to the test above (and to parse_line) - confirm
        # whether that is intended.  Kept as-is; the guard below covers the
        # cases where the two disagree and no merge happens.
        merged = regex_rejoin(regexp_ref_tag_end, regexp_ref_tag_begin, lines, 'line')
        if len(merged) == len(lines):
            break
        lines = merged

    return "\n".join(lines)

def build_new_drugbox(current_parameters):
# build new drugbox template

#   make sure that all values in the current_parameters dictionary are properly encoded

    encoding = 'utf-8'
    for k, v in current_parameters.iteritems():
        if isinstance(v, basestring):
            if not isinstance(v, unicode):
                v = unicode(v, encoding)
                current_parameters[k] = v

#   if type parameter is missing, check subordinate parameters that infer type, and if found, assign type

    if not current_parameters.has_key("type"):
        if ("component1" in current_parameters or 
            "class1" in current_parameters or 
            "component2" in current_parameters or 
            "class2" in current_parameters or 
            "component3" in current_parameters or 
            "class3" in current_parameters or 
            "component4" in current_parameters 
            or "class4" in current_parameters):
            current_parameters['type'] = "combo"
        elif ("mab_type") in current_parameters:
            current_parameters['type'] = "mab"
        elif ("vaccine_type") in current_parameters:
            current_parameters['type'] = "vaccine"

# if not previously assigned, add the following "empty" parameters
    if not current_parameters.has_key("tradename"):
        current_parameters["tradename"] = " "

    new_drugbox = unicode( "{{Drugbox\n", "utf-8" )
    if current_parameters.has_key("Verifiedfields"):           new_drugbox += "| Verifiedfields = "           + current_parameters['Verifiedfields']           + "\n"
    if current_parameters.has_key("Watchedfields"):            new_drugbox += "| Watchedfields = "            + current_parameters['Watchedfields']            + "\n"
    if current_parameters.has_key("verifiedrevid"):            new_drugbox += "| verifiedrevid = "            + current_parameters['verifiedrevid']            + "\n"
    if current_parameters.has_key("IUPAC_name"):               new_drugbox += "| IUPAC_name = "               + current_parameters['IUPAC_name']               + "\n"
    if current_parameters.has_key("OtherNames"):               new_drugbox += "| OtherNames = "               + current_parameters['OtherNames']               + "\n"
    if current_parameters.has_key("image"):                    new_drugbox += "| image = "                    + current_parameters['image']                    + "\n"
    if current_parameters.has_key("width"):                    new_drugbox += "| width = "                    + current_parameters['width']                    + "\n"
    if current_parameters.has_key("alt"):                      new_drugbox += "| alt = "                      + current_parameters['alt']                      + "\n"
    if current_parameters.has_key("image2"):                   new_drugbox += "| image2 = "                   + current_parameters['image2']                   + "\n"
    if current_parameters.has_key("width2"):                   new_drugbox += "| width2 = "                   + current_parameters['width2']                   + "\n"
    if current_parameters.has_key("imagename"):                new_drugbox += "| imagename = "                + current_parameters['imagename']                + "\n"
    if current_parameters.has_key("drug_name"):                new_drugbox += "| drug_name = "                + current_parameters['drug_name']                + "\n"
    if current_parameters.has_key("caption"):                  new_drugbox += "| caption = "                  + current_parameters['caption']                  + "\n"
    
    if current_parameters.has_key("type"):

      if current_parameters['type'] == "combo":
        new_drugbox += "\n<!--Combo data-->\n"
        if current_parameters.has_key("type"):                 new_drugbox += "| type = "                     + current_parameters['type']                     + "\n"
        if current_parameters.has_key("component1"):           new_drugbox += "| component1 = "               + current_parameters['component1']               + "\n"
        if current_parameters.has_key("class1"):               new_drugbox += "| class1 = "                   + current_parameters['class1']                   + "\n"
        if current_parameters.has_key("component2"):           new_drugbox += "| component2 = "               + current_parameters['component2']               + "\n"
        if current_parameters.has_key("class2"):               new_drugbox += "| class2 = "                   + current_parameters['class2']                   + "\n"
        if current_parameters.has_key("component3"):           new_drugbox += "| component3 = "               + current_parameters['component3']               + "\n"
        if current_parameters.has_key("class3"):               new_drugbox += "| class3 = "                   + current_parameters['class3']                   + "\n"
        if current_parameters.has_key("component4"):           new_drugbox += "| component4 = "               + current_parameters['component4']               + "\n"
        if current_parameters.has_key("class4"):               new_drugbox += "| class4 = "                   + current_parameters['class4']                   + "\n"

      if current_parameters['type'] == "mab":
        new_drugbox += "\n<!--Monoclonal antibody data-->\n"
        if current_parameters.has_key("type"):                 new_drugbox += "| type = "                     + current_parameters['type']                     + "\n"
        if current_parameters.has_key("mab_type"):             new_drugbox += "| mab_type = "                 + current_parameters['mab_type']                 + "\n"
        if current_parameters.has_key("source"):               new_drugbox += "| source = "                   + current_parameters['source']                   + "\n"
        if current_parameters.has_key("target"):               new_drugbox += "| target = "                   + current_parameters['target']                   + "\n"

      if current_parameters['type'] == "vaccine":
        new_drugbox += "\n<!--Vacine data-->\n"
        if current_parameters.has_key("type"):                 new_drugbox += "| type = "                     + current_parameters['type']                     + "\n"
        if current_parameters.has_key("target"):               new_drugbox += "| target = "                   + current_parameters['target']                   + "\n"
        if current_parameters.has_key("vaccine_type"):         new_drugbox += "| vaccine_type = "             + current_parameters['vaccine_type']             + "\n"

    if ("tradename" in current_parameters or "Drugs.com" in current_parameters or "MedlinePlus" in current_parameters or "licence_EU" in current_parameters or 
        "licence_US" in current_parameters or "DailyMedID" in current_parameters or "pregnancy_AU" in current_parameters or "pregnancy_US" in current_parameters or 
        "pregnancy_category" in current_parameters or "legal_AU" in current_parameters or "legal_CA" in current_parameters or "legal_UK" in current_parameters or 
        "legal_US" in current_parameters or "legal_status" in current_parameters or "dependency_liability" or "routes_of_administration" in current_parameters):

        new_drugbox += "\n<!--Clinical data-->\n"
        if current_parameters.has_key("tradename"):                new_drugbox += "| tradename = "                + current_parameters['tradename']                + "\n"
        if current_parameters.has_key("Drugs.com"):                new_drugbox += "| Drugs.com = "                + current_parameters['Drugs.com']                + "\n"
        if current_parameters.has_key("MedlinePlus"):              new_drugbox += "| MedlinePlus = "              + current_parameters['MedlinePlus']              + "\n"
        if current_parameters.has_key("licence_EU"):               new_drugbox += "| licence_EU = "               + current_parameters['licence_EU']               + "\n"
        if current_parameters.has_key("licence_US"):               new_drugbox += "| licence_US = "               + current_parameters['licence_US']               + "\n"
        if current_parameters.has_key("DailyMedID"):               new_drugbox += "| DailyMedID = "               + current_parameters['DailyMedID']               + "\n"
        if current_parameters.has_key("pregnancy_AU"):             new_drugbox += "| pregnancy_AU = "             + current_parameters['pregnancy_AU']             + "\n"
        if current_parameters.has_key("pregnancy_US"):             new_drugbox += "| pregnancy_US = "             + current_parameters['pregnancy_US']             + "\n"
        if current_parameters.has_key("pregnancy_category"):       new_drugbox += "| pregnancy_category = "       + current_parameters['pregnancy_category']       + "\n"
        if current_parameters.has_key("legal_AU"):                 new_drugbox += "| legal_AU = "                 + current_parameters['legal_AU']                 + "\n"
        if current_parameters.has_key("legal_CA"):                 new_drugbox += "| legal_CA = "                 + current_parameters['legal_CA']                 + "\n"
        if current_parameters.has_key("legal_UK"):                 new_drugbox += "| legal_UK = "                 + current_parameters['legal_UK']                 + "\n"
        if current_parameters.has_key("legal_US"):                 new_drugbox += "| legal_US = "                 + current_parameters['legal_US']                 + "\n"
        if current_parameters.has_key("legal_status"):             new_drugbox += "| legal_status = "             + current_parameters['legal_status']             + "\n"
        if current_parameters.has_key("dependency_liability"):     new_drugbox += "| dependency_liability = "     + current_parameters['dependency_liability']     + "\n"
        if current_parameters.has_key("routes_of_administration"): new_drugbox += "| routes_of_administration = " + current_parameters['routes_of_administration'] + "\n"

    if ("bioavailability" in current_parameters or "protein_bound metabolism" in current_parameters or "elimination_half-life" in current_parameters or "excretion" in current_parameters):

        new_drugbox += "\n<!--Pharmacokinetic data-->\n"
        if current_parameters.has_key("bioavailability"):          new_drugbox += "| bioavailability = "           + current_parameters['bioavailability']         + "\n"
        if current_parameters.has_key("protein_bound"):            new_drugbox += "| protein_bound = "             + current_parameters['protein_bound']           + "\n"
        if current_parameters.has_key("metabolism"):               new_drugbox += "| metabolism = "                + current_parameters['metabolism']              + "\n"
        if current_parameters.has_key("elimination_half-life"):    new_drugbox += "| elimination_half-life = "     + current_parameters['elimination_half-life']   + "\n"
        if current_parameters.has_key("excretion"):                new_drugbox += "| excretion = "                 + current_parameters['excretion']               + "\n"
    
    if ("CAS_number" in current_parameters or "CAS_supplemental" in current_parameters or "ATCvet" in current_parameters or "ATC_prefix" in current_parameters or 
        "ATC_suffix" in current_parameters or "ATC_supplemental" in current_parameters or "PubChem" in current_parameters or "PubChemSubstance" in current_parameters or 
        "IUPHAR_ligand" in current_parameters or "DrugBank" in current_parameters or "ChemSpiderID" in current_parameters or "UNII" in current_parameters or 
        "KEGG" in current_parameters or "ChEBI" in current_parameters or "ChEMBL" in current_parameters):
        new_drugbox += "\n<!--Identifiers-->\n"
        if current_parameters.has_key("CAS_number_Ref"):           new_drugbox += "| CAS_number_Ref = "            + current_parameters['CAS_number_Ref']          + "\n"
        if current_parameters.has_key("CASNo_Ref"):                new_drugbox += "| CASNo_Ref = "                 + current_parameters['CASNo_Ref']               + "\n"
        if current_parameters.has_key("CAS_number"):               new_drugbox += "| CAS_number = "                + current_parameters['CAS_number']              + "\n"
        if current_parameters.has_key("CAS_supplemental"):         new_drugbox += "| CAS_supplemental = "          + current_parameters['CAS_supplemental']        + "\n"
        if current_parameters.has_key("ATCvet"):                   new_drugbox += "| ATCvet = "                    + current_parameters['ATCvet']                  + "\n"
        if current_parameters.has_key("ATC_prefix"):               new_drugbox += "| ATC_prefix = "                + current_parameters['ATC_prefix']              + "\n"
        if current_parameters.has_key("ATC_suffix"):               new_drugbox += "| ATC_suffix = "                + current_parameters['ATC_suffix']              + "\n"
        if current_parameters.has_key("ATC_supplemental"):         new_drugbox += "| ATC_supplemental = "          + current_parameters['ATC_supplemental']        + "\n"
        if current_parameters.has_key("PubChem"):                  new_drugbox += "| PubChem = "                   + current_parameters['PubChem']                 + "\n"
        if current_parameters.has_key("PubChemSubstance"):         new_drugbox += "| PubChemSubstance = "          + current_parameters['PubChemSubstance']        + "\n"
        if current_parameters.has_key("IUPHAR_ligand"):            new_drugbox += "| IUPHAR_ligand = "             + current_parameters['IUPHAR_ligand']           + "\n"
        if current_parameters.has_key("DrugBank_Ref"):             new_drugbox += "| DrugBank_Ref = "              + current_parameters['DrugBank_Ref']            + "\n"
        if current_parameters.has_key("DrugBank"):                 new_drugbox += "| DrugBank = "                  + current_parameters['DrugBank']                + "\n"
        if current_parameters.has_key("ChemSpiderID_Ref"):         new_drugbox += "| ChemSpiderID_Ref = "          + current_parameters['ChemSpiderID_Ref']        + "\n"
        if current_parameters.has_key("ChemSpiderID"):             new_drugbox += "| ChemSpiderID = "              + current_parameters['ChemSpiderID']            + "\n"
        if current_parameters.has_key("UNII_Ref"):                 new_drugbox += "| UNII_Ref = "                  + current_parameters['UNII_Ref']                + "\n"
        if current_parameters.has_key("UNII"):                     new_drugbox += "| UNII = "                      + current_parameters['UNII']                    + "\n"
        if current_parameters.has_key("KEGG_Ref"):                 new_drugbox += "| KEGG_Ref = "                  + current_parameters['KEGG_Ref']                + "\n"
        if current_parameters.has_key("KEGG"):                     new_drugbox += "| KEGG = "                      + current_parameters['KEGG']                    + "\n"
        if current_parameters.has_key("ChEBI_Ref"):                new_drugbox += "| ChEBI_Ref = "                 + current_parameters['ChEBI_Ref']               + "\n"
        if current_parameters.has_key("ChEBI"):                    new_drugbox += "| ChEBI = "                     + current_parameters['ChEBI']                   + "\n"
        if current_parameters.has_key("ChEMBL_Ref"):               new_drugbox += "| ChEMBL_Ref = "                + current_parameters['ChEMBL_Ref']              + "\n"
        if current_parameters.has_key("ChEMBL"):                   new_drugbox += "| ChEMBL = "                    + current_parameters['ChEMBL']                  + "\n"

    new_drugbox += "\n<!--Chemical data-->\n"
    if ("chemical_formula" in current_parameters or "C" in current_parameters or "H" in current_parameters or "Ag" in current_parameters or "As" in current_parameters or 
        "Au" in current_parameters or "B" in current_parameters or "Bi" in current_parameters or "Br" in current_parameters or "Cl" in current_parameters or "Co" in current_parameters or 
        "F" in current_parameters or "Fe" in current_parameters or "Gd" in current_parameters or "I" in current_parameters or "K" in current_parameters or "Mn" in current_parameters or 
        "N" in current_parameters or "Na" in current_parameters or "O" in current_parameters or "P" in current_parameters or "Pt" in current_parameters or "S" in current_parameters or 
        "Sb" in current_parameters or "Se" in current_parameters or "Sr" in current_parameters or "Tc" in current_parameters or "charge" in current_parameters):
        if current_parameters.has_key("chemical_formula"):         new_drugbox += "| chemical_formula = "          + current_parameters['chemical_formula']        + "\n"
#        new_drugbox += " "
        if current_parameters.has_key("C"):                        new_drugbox += "| C="                           + current_parameters['C']                       + " "
        if current_parameters.has_key("H"):                        new_drugbox += "| H="                           + current_parameters['H']                       + " "
        if current_parameters.has_key("Ag"):                       new_drugbox += "| Ag="                          + current_parameters['Ag']                      + " "
        if current_parameters.has_key("As"):                       new_drugbox += "| As="                          + current_parameters['As']                      + " "
        if current_parameters.has_key("Au"):                       new_drugbox += "| Au="                          + current_parameters['Au']                      + " "
        if current_parameters.has_key("B"):                        new_drugbox += "| B="                           + current_parameters['B']                       + " "
        if current_parameters.has_key("Bi"):                       new_drugbox += "| Bi="                          + current_parameters['Bi']                      + " "
        if current_parameters.has_key("Br"):                       new_drugbox += "| Br="                          + current_parameters['Br']                      + " "
        if current_parameters.has_key("Cl"):                       new_drugbox += "| Cl="                          + current_parameters['Cl']                      + " "
        if current_parameters.has_key("Co"):                       new_drugbox += "| Co="                          + current_parameters['Co']                      + " "
        if current_parameters.has_key("F"):                        new_drugbox += "| F="                           + current_parameters['F']                       + " "
        if current_parameters.has_key("Fe"):                       new_drugbox += "| Fe="                          + current_parameters['Fe']                      + " "
        if current_parameters.has_key("Gd"):                       new_drugbox += "| Gd="                          + current_parameters['Gd']                      + " "
        if current_parameters.has_key("I"):                        new_drugbox += "| I="                           + current_parameters['I']                       + " "
        if current_parameters.has_key("K"):                        new_drugbox += "| K="                           + current_parameters['K']                       + " "
        if current_parameters.has_key("Mn"):                       new_drugbox += "| Mn="                          + current_parameters['Mn']                      + " "
        if current_parameters.has_key("N"):                        new_drugbox += "| N="                           + current_parameters['N']                       + " "
        if current_parameters.has_key("Na"):                       new_drugbox += "| Na="                          + current_parameters['Na']                      + " "
        if current_parameters.has_key("O"):                        new_drugbox += "| O="                           + current_parameters['O']                       + " "
        if current_parameters.has_key("P"):                        new_drugbox += "| P="                           + current_parameters['P']                       + " "
        if current_parameters.has_key("Pt"):                       new_drugbox += "| Pt="                          + current_parameters['Pt']                      + " "
        if current_parameters.has_key("S"):                        new_drugbox += "| S="                           + current_parameters['S']                       + " "
        if current_parameters.has_key("Sb"):                       new_drugbox += "| C="                           + current_parameters['Sb']                      + " "
        if current_parameters.has_key("Se"):                       new_drugbox += "| Se="                          + current_parameters['Se']                      + " "
        if current_parameters.has_key("Sr"):                       new_drugbox += "| Sr="                          + current_parameters['Sr']                      + " "
        if current_parameters.has_key("Tc"):                       new_drugbox += "| Tc="                          + current_parameters['Tc']                      + " "
        if current_parameters.has_key("charge"):                   new_drugbox += "| charge = "                    + current_parameters['charge']                  + " "
        new_drugbox += "\n"
        
    if current_parameters.has_key("molecular_weight"):         new_drugbox += "| molecular_weight = "          + current_parameters['molecular_weight']        + "\n"
    if current_parameters.has_key("smiles"):                   new_drugbox += "| smiles = "                    + current_parameters['smiles']                  + "\n"
    if current_parameters.has_key("InChI_Ref"):                new_drugbox += "| InChI_Ref = "                 + current_parameters['InChI_Ref']               + "\n"
    if current_parameters.has_key("InChI"):                    new_drugbox += "| InChI = "                     + current_parameters['InChI']                   + "\n"
    if current_parameters.has_key("InChIKey"):                 new_drugbox += "| InChIKey = "                  + current_parameters['InChIKey']                + "\n"
    if current_parameters.has_key("StdInChI_Ref"):             new_drugbox += "| StdInChI_Ref = "              + current_parameters['StdInChI_Ref']            + "\n"
    if current_parameters.has_key("StdInChI"):                 new_drugbox += "| StdInChI = "                  + current_parameters['StdInChI']                + "\n"
    if current_parameters.has_key("StdInChI_comment"):         new_drugbox += "| StdInChI_comment = "          + current_parameters['StdInChI_comment']        + "\n"
    if current_parameters.has_key("StdInChIKey_Ref"):          new_drugbox += "| StdInChIKey_Ref = "           + current_parameters['StdInChIKey_Ref']         + "\n"
    if current_parameters.has_key("StdInChIKey"):              new_drugbox += "| StdInChIKey = "               + current_parameters['StdInChIKey']             + "\n"
    if current_parameters.has_key("synonyms"):                 new_drugbox += "| synonyms = "                  + current_parameters['synonyms']                + "\n"
    if current_parameters.has_key("density"):                  new_drugbox += "| density = "                   + current_parameters['density']                 + "\n"
    if current_parameters.has_key("melting_point"):            new_drugbox += "| melting_point = "             + current_parameters['melting_point']           + "\n"
    if current_parameters.has_key("melting_high"):             new_drugbox += "| melting_high = "              + current_parameters['melting_high']            + "\n"
    if current_parameters.has_key("melting_notes"):            new_drugbox += "| melting_notes = "             + current_parameters['melting_notes']           + "\n"
    if current_parameters.has_key("boiling_point"):            new_drugbox += "| boiling_point = "             + current_parameters['boiling_point']           + "\n"
    if current_parameters.has_key("boiling_notes"):            new_drugbox += "| boiling_notes = "             + current_parameters['boiling_notes']           + "\n"
    if current_parameters.has_key("solubility"):               new_drugbox += "| solubility = "                + current_parameters['solubility']              + "\n"
    if current_parameters.has_key("specific_rotation"):        new_drugbox += "| specific_rotation = "         + current_parameters['specific_rotation']       + "\n"
    if current_parameters.has_key("sec_combustion"):           new_drugbox += "| sec_combustion = "            + current_parameters['sec_combustion']          + "\n"

    new_drugbox += "}}"

#    print new_drugbox

    return new_drugbox
    
def merged_tradenames(merck_tradename, current_tradename):
    """Merge Merck Manual trade names with those already in the drugbox.

    Arguments:
    merck_tradename   -- trade names separated by ";" (may be empty or None)
    current_tradename -- trade names separated by ", ", as stored in the
                         drugbox "tradename" field (may be empty or None)

    Every name is UTF-8 encoded, stripped of surrounding whitespace and
    capitalised (first letter upper case, the rest lower case); blank
    entries are discarded. The two lists are then combined, de-duplicated
    and sorted.

    Returns the merged names as one ", "-separated UTF-8 byte string, which
    is empty when no names remain.
    """

    def normalised(raw_names):
        # Identical clean-up for both sources, so that duplicates differing
        # only in case or padding collapse into a single entry.
        cleaned = [name.encode("utf-8").strip().capitalize() for name in raw_names]
        return [name for name in cleaned if name]

    names = []
    if merck_tradename:
        # Blank entries (e.g. produced by a trailing ";") are removed by
        # normalised(). The previous code dropped the first sorted element
        # unconditionally instead, which lost a real trade name whenever the
        # list contained no blank entry.
        names.extend(normalised(merck_tradename.split(";")))
    if current_tradename:
        names.extend(normalised(current_tradename.split(", ")))

    # The names are byte strings after encoding, so they are joined with a
    # byte string separator.
    return b", ".join(sorted(set(names)))
        
#       print "merck tradenames: ", merck_tradenames
#       print "current tradenames: ", current_tradenames


def test_MedlinePlus(accession_number):
    """Check that a MedlinePlus accession number points at a live page.

    accession_number -- MedlinePlus drug accession number, e.g. "a692051",
                        which links to
                        "http://www.nlm.nih.gov/medlineplus/druginfo/meds/a692051.html"

    Returns True when the page answers with HTTP status 200 ("OK") and
    False otherwise (e.g. 404, page not found), including when no accession
    number is given.
    """
    if not accession_number:
        return False

    link = "http://www.nlm.nih.gov/medlineplus/druginfo/meds/" + accession_number + ".html"
    response = urllib.urlopen(link)
    try:
        # test link status to make sure it is good before the caller
        # assigns the parameter
        return response.getcode() == 200
    finally:
        # always release the connection, whatever the status was
        response.close()

def test_Drugs_com(INN, tradename, drugbank_drugs_com):
    """Search Drugs.com for a page about a drug and return a template link.

    Arguments:
    INN                -- drug name, used as the first candidate
    tradename          -- ", "-separated trade names (may be "")
    drugbank_drugs_com -- Drugs.com URL taken from DrugBank (may be "")

    Candidate page names ("stems") are built from the INN, the trade names
    and the name extracted from the DrugBank link. Each Drugs.com section
    (monograph, cdi, cons, mtm, pro, international and the site root) is
    tried in that order with every stem; the first URL answering with HTTP
    status 200 wins. When the hit is outside the monograph section, the page
    text is searched with regexp_monograph_url and, if that matches, the
    monograph it names is preferred.

    Returns a "{{drugs.com|section|stem}}" template call, or "" when no
    page was found.
    """
    site = "http://www.drugs.com/"

    # drugs.com root links, in order of preference:
    roots = [("monograph","http://www.drugs.com/monograph/"), ("CDI","http://www.drugs.com/cdi/"), ("CONS","http://www.drugs.com/cons/"), ("MTM","http://www.drugs.com/mtm/"), ("pro","http://www.drugs.com/pro/"), ("international","http://www.drugs.com/international/"), ("parent","http://www.drugs.com/")]

    drugnames = [INN] + tradename.split(", ")

    # The name from the DrugBank link has to be added BEFORE the stems are
    # built; it used to be appended afterwards and was therefore never tried.
    if drugbank_drugs_com and drugbank_drugs_com.find(site) > -1:
        drugnames.append(drugbank_drugs_com.replace(site, "").replace(".html", ""))

    stems = []

    def add_stem(stem):
        # keep the first occurrence only, so no URL is requested twice
        if stem not in stems:
            stems.append(stem)

    for drugname in drugnames:
        drugname = drugname.lower()
        if not drugname.strip():
            # an empty trade name list yields a blank entry; skipping it
            # avoids bogus stems such as "-hydrochloride"
            continue
        if " " in drugname:
            add_stem(drugname.replace(" ", "_"))
            add_stem(drugname.replace(" ", "-"))
        else:
            add_stem(drugname)
            # also try common salts. Other salt suffixes (-chloride,
            # -sodium, -bromide, -maleate, -citrate) are currently not tried.
            add_stem(drugname + "-hydrochloride")
            add_stem(drugname + "-sulfate")

    for root_name, root_url in roots:
        for stem in stems:
            link = iriToUri(root_url + stem + ".html")
            response = urllib.urlopen(link)
            try:
                # test link status to make sure it is good before assigning
                # the parameter
                found = response.getcode() == 200
                page_text = ""
                if found and root_name != "monograph":
                    # reuse this response instead of fetching the page again
                    page_text = response.read()
            finally:
                response.close()

            if not found:
                continue

            if root_name != "monograph":
                result = regexp_monograph_url.search(page_text)
                if result:
                    # the page refers to a monograph: link to that instead
                    return "{{drugs.com|" + roots[0][0] + "|" + result.group('STEM') + "}}"
            return "{{drugs.com|" + root_name + "|" + stem + "}}"

    return ""

def unbalanced(text):
    """Report whether text contains unmatched square or curly brackets.

    The number of matches of each opening-bracket pattern is compared with
    the number of matches of the corresponding closing-bracket pattern, for
    single and double square brackets and single and double curly brackets.

    Returns True as soon as one pair of counts differs, False when all four
    pairs agree.
    """
    bracket_patterns = (
        (regexp_open_square_bracket,        regexp_close_square_bracket),
        (regexp_open_curly_bracket,         regexp_close_curly_bracket),
        (regexp_double_open_square_bracket, regexp_double_close_square_bracket),
        (regexp_double_open_curly_bracket,  regexp_double_close_curly_bracket),
    )

    for opening, closing in bracket_patterns:
        n_open  = len(opening.findall(text))
        n_close = len(closing.findall(text))
        if n_open != n_close:
            return True

    return False

def savepage(page, text, summary = '', minor = False, log_string = ""):
    """Save text to a page and log exceptions.

    Arguments:
    page       -- page object to save to (its put(), title() and aslink()
                  methods are used)
    text       -- new page text
    summary    -- edit summary; when not empty it is set with
                  wikipedia.setAction() before saving
    minor      -- passed to page.put() as minorEdit
    log_string -- prefix for the message written with wikipedia.output()

    Returns '' when the page was saved, otherwise a "# ..." line naming
    the page and the reason the save failed.
    """
    if summary != '':
        wikipedia.setAction(summary)
    try:
        page.put(text, minorEdit = minor)
        wikipedia.output('%s  \03{green}saving %s' % (log_string, page.title()) )
        return ''
    except wikipedia.LockedPage:
        wikipedia.output('%s    \03{red}cannot save %s because it is locked\03{default}' % (log_string, page.title()) )
        return '# %s: page was locked\n' % page.aslink()
    except wikipedia.EditConflict:
        wikipedia.output('%s    \03{red}cannot save %s because of edit conflict\03{default}' % (log_string, page.title()) )
        return '# %s: edit conflict occurred\n' % page.aslink()
    except wikipedia.SpamfilterError as error:
        wikipedia.output('%s    \03{red}cannot save %s because of spam blacklist entry %s\03{default}' % ((log_string, page.title(), error.url)) )
        return '# %s: spam blacklist entry\n' % page.aslink()
    except Exception:
        # Exception rather than a bare "except:", so that KeyboardInterrupt
        # and SystemExit still stop the bot instead of being reported as an
        # unknown save error.
        wikipedia.output('%s    \03{red}unknown error on saving %s\03{default}' % (log_string, page.title()) )
        return '# %s: unknown error occurred\n' % page.aslink()

def _process_article(article, drugbank_data, DrugBank_ID_INN):
    """Update the drugbox of one article; helper for run().

    Arguments:
    article         -- article title
    drugbank_data   -- DrugBank records keyed by drug name; the columns used
                       here are 0 (Merck trade names), 2 (MedlinePlus
                       accession number), 3 (Drugs.com link), 4 (KEGG),
                       6 (ChemSpider), 7 (PubChem) and 8 (DrugBank ID)
    DrugBank_ID_INN -- reverse lookup from DrugBank ID to drug name

    The article is skipped when bots are not allowed on it, when no drugbox
    is found or when the drugbox contains unmatched brackets.
    """
    log_string = "* [[" + article + "]], "

    site = wikipedia.getSite()
    page = wikipedia.Page(site, article)
    text = page.get(get_redirect = True)

    if not Allowbots(text):
        return

    begin, end, begin2, end2 = find_drugbox_from_text(text)
    if not end:
        print(log_string + "drugbox not found!")
        return
    parameters = text[begin:end]

    # Make sure that there are no unmatched square or curly brackets. If
    # found, abort, since these may indicate an error in the wiki markup
    # and may trigger an infinite loop elsewhere in this script.
    if unbalanced(parameters):
        print(log_string + "unmatched brackets found, article skipped!")
        return

    current_parameters = {}

    # first extract and assign nested templates commonly used in drugbox templates
    parameters = assign_nested_templates(parameters, current_parameters)

    # remove any embedded carriage returns from remaining templates
    parameters = remove_embedded_carriage_returns(parameters)

    # next, parse each line for parameters
    for line in parameters.splitlines():
        parse_line(line, current_parameters)

    # Look the drug up by article title first, then by the DrugBank ID
    # already present in the drugbox.
    INN = article
    if INN in drugbank_data:
        db_data = drugbank_data[INN]
    elif "DrugBank" in current_parameters and current_parameters['DrugBank'] in DrugBank_ID_INN:
        # No str() around the log text: it raised UnicodeEncodeError for
        # titles containing non-ASCII characters.
        log_string = log_string + "INN reset from " + INN
        INN = DrugBank_ID_INN[current_parameters['DrugBank']]
        log_string = log_string + " to " + INN + ", "
        db_data = drugbank_data[INN]
        if not "drug_name" in current_parameters:
            current_parameters['drug_name'] = INN
    else:
        db_data = []

    if "DrugBank" in current_parameters and current_parameters['DrugBank'] in DrugBank_ID_INN:
        if DrugBank_ID_INN[current_parameters['DrugBank']] == INN:
            log_string = log_string + "DrugBankID/INN OK!, "
        else:
            log_string = log_string + "DrugBankID/INN NOT OK!, "
    elif db_data and db_data[8] and not "DrugBank" in current_parameters:
        current_parameters['DrugBank'] = db_data[8]

    # while we are at it, populate the KEGG, ChemSpiderID and PubChem
    # fields if missing
    if db_data:
        for field, column in (("KEGG", 4), ("ChemSpiderID", 6), ("PubChem", 7)):
            if db_data[column] and not field in current_parameters:
                current_parameters[field] = db_data[column]

    # augment current tradename list with the ones supplied by the Merck Manual
    merck_tradename = ""
    if db_data and db_data[0]:
        merck_tradename = db_data[0]
    current_tradename = current_parameters.get('tradename', "")

    new_tradename = merged_tradenames(merck_tradename, current_tradename)
    if new_tradename:
        current_parameters['tradename'] = new_tradename

    # add MedlinePlus parameter (checked once; the identical second check
    # only repeated the HTTP request)
    if db_data and db_data[2] and test_MedlinePlus(db_data[2]):
        current_parameters['MedlinePlus'] = db_data[2]

    # add Drugs.com link
    tradename = current_parameters.get('tradename', "")
    drugbank_drugs_com = ""
    if db_data and db_data[3]:
        drugbank_drugs_com = db_data[3]

    result = test_Drugs_com(INN, tradename, drugbank_drugs_com)
    if result:
        current_parameters['Drugs.com'] = result

    # no accession number from DrugBank: fall back to the MedlinePlus search
    if not 'MedlinePlus' in current_parameters:
        opener = urllib.FancyURLopener({})
        stem = article.replace(" ", "+")
        link = "http://vsearch.nlm.nih.gov/vivisimo/cgi-bin/query-meta?&v:project=medlineplus&query=" + stem

        f = opener.open(link)
        try:
            text2 = f.read()
        finally:
            f.close()
        result = regexp_medlineplus_url.search(text2)
        if result:
            current_parameters['MedlinePlus'] = result.group('ACNO')

    new_text = text[:begin2] + build_new_drugbox(current_parameters) + text[end2:]

    if current_parameters:
        comment='populated new fields in {{drugbox}} and reordered per [[Wikipedia:Bots/Requests_for_approval/BogBot_2|bot approval]]. Report errors and suggestions to [[User_talk:BogBot]]'
        savepage(page, new_text, comment, False, log_string)
    else:
        print(", page not updated")

    # NOTE(review): stopme() is called after every processed article, as
    # before; presumably it is meant to run once after the whole run --
    # confirm against the pywikipedia documentation before moving it.
    wikipedia.stopme()


def run():
    """Populate and reorder the drugbox of every article in the title list.

    Loads the DrugBank data with drugbank(), builds a reverse lookup from
    DrugBank ID (column 8) to drug name, then processes each title read
    from drugbox_titles.txt, one title per line.
    """
    drugbank_data = drugbank()

    DrugBank_ID_INN = {}
    for k, v in drugbank_data.iteritems():
        DrugBank_ID_INN[v[8]] = k

    # list of articles to work on is generated by:
    # "python pagegenerators.py -namespace:0 -transcludes:Drugbox > drugbox_titles.txt"
    articles = codecs.open('/Users/BogBot/progs/pywikipedia/drugbox_titles.txt', mode = 'r', encoding='utf-8')

#    articles = ['Progesterone']

    try:
        for article in articles:
            article = article.rstrip('\n')

            # Titles read through codecs.open() are already unicode; this
            # conversion only matters when a hard-coded list of byte-string
            # titles is used instead (see the commented-out line above).
            if isinstance(article, basestring) and not isinstance(article, unicode):
                article = unicode(article, 'utf-8')

            _process_article(article, drugbank_data, DrugBank_ID_INN)
    finally:
        # close the title list even when an article raises
        if hasattr(articles, 'close'):
            articles.close()
        
# Start the bot. There is no __main__ guard, so this call runs whenever the
# module is executed, including when it is imported.
run()