Jump to content

User:GreenC bot/Job 18/source

From Wikipedia, the free encyclopedia
#!/usr/local/bin/gawk -bE     

#
# vebug - https://en.wikipedia.org/wiki/User:GreenC_bot/Job_18
#

# The MIT License (MIT)
#    
# Copyright (c) August 2019 User:GreenC (en.wikipedia.org)
#   
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR                   
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

# Set the bot's name before the @include'd libraries run their own BEGIN blocks
BEGIN {
  BotName = "vebug"
}

@include "botwiki.awk"
@include "library.awk"
@include "json.awk"

#
# Program entry point: parse command-line options, configure logging,
# locate the data directory, then hand off to main()
#
BEGIN {

  Mode = "bot"   # set to "find" and it will search only and exit with a 1 (found something) or 0 (found nothing)
                 #  in "find" mode, run via 'project -s' to search local cache for articles containing actionable matches
                 # set to anything else and it will process the article.

  IGNORECASE = 1

  # Shared regexes: optional whitespace run, and a <sup>..</sup> span
  ReSpace = "[\n\r\t]*[ ]*[\n\r\t]*[ ]*[\n\r\t]*"
  ReSups = "[<]sup[^>]*[>][^<]+[<][/]sup[>]"

  delete citeTable

  Optind = Opterr = 1
  while ((C = getopt(ARGC, ARGV, "hs:l:n:")) != -1) {
      opts++
      switch (C) {
        case "s":                #  -s <file>      article.txt source to process.
          articlename = verifyval(Optarg)
          break
        case "l":                #  -l <dir/>      Directory where logging is sent.. end with "/"
          logdir = verifyval(Optarg)
          break
        case "n":                #  -n <name>      Wikipedia name of article
          wikiname = verifyval(Optarg)
          break
        case "h":
          usage()
          exit
      }
  }

  if(articlename == "" || ! opts) {
    stdErr("Error in vebug.awk (1)")
    print "0"
    exit
  }

  # Log to the bit-bucket unless both a log directory and article name were given
  Logfile = "/dev/null"
  if(wikiname != "" && logdir != "") {
    if(substr(logdir, length(logdir), 1) != "/")
      logdir = logdir "/"
    Logfile = logdir "logvebug"
  }

  # Path to data directory ends in "/"
  DataDir = articlename
  gsub(regesc3(basename(articlename)) "$", "", DataDir)

  # Number of changes made to article
  Count = 0

  main()

}

#
# Run the program: read the article from disk, transform it via vebug(),
# save the results and the edit summary, print the change count (or "0")
#
function main(  article,articlenew,articlenewname,editsummaryname,plural) {

  checkexists(articlename, "vebug.awk main()", "exit")
  article = readfile(articlename)
  if(length(article) < 10) {
    print "0"
    exit
  }

  article = deflate(article)
  articlenew = vebug(article)

  # Nothing actionable found - report zero changes and stop
  if(article == articlenew || length(articlenew) <= 10 || Count == 0) {
    print "0"
    exit
  }

  # Save modified article to the data directory
  articlenewname = DataDir "article.vebug.txt"
  printf("%s", inflate(articlenew)) > articlenewname
  close(articlenewname)

  # Save edit summary (pluralized when more than one cite was restored)
  plural = (Count > 1) ? "s" : ""
  editsummaryname = DataDir "editsummary.vebug.txt"
  printf("Restore %s cite" plural " deleted by a bug in VisualEditor ([[User:GreenC_bot/Job_18|vebug bot]])", Count) > editsummaryname
  close(editsummaryname)

  print Count
  exit

}

#
# vebug - parse page, load data into citeTable[][], modify page, return it to main()
#
#  Regularizes several known malformed cite shapes left by the VisualEditor bug
#  into the canonical <sup>[[Title#cite%20note..|[N]]]</sup> form, then replaces
#  each with a reconstructed <ref> - or a {{verify source}} request when full
#  reconstruction is not possible. Increments the global Count once per cite.
#
#  origTable[] is deliberately global: it maps each regularized cite back to its
#  original wikitext and is also read by getPrimaryRevTime().
#
#  Fix: origcite, settype, tag and cites were missing from the locals list and
#  leaked as globals; they are now declared as function-locals.
#
function vebug(article,  c,field,sep,i,field2,sep2,j,d,k,dest,dest2,fieldi,field2txt,setdone,key,vertemp,re,foundit,origcite,settype,tag,cites) {

  # Special case
  # Regularize <sup>[[User:Claudia Diaz2/sandbox#cite%20note-5|[6]]<nowiki>]</nowiki></sup>
  c = patsplit(article, field, /[|][[][^]]*[]]{2}[<]nowiki[>][]][<][/]nowiki[>]/, sep)
  for(k = 1; k <= c; k++) {
    # reversing the preceding separator exposes the "[[Title#cite.." prefix at its tail
    if(match(reverse(sep[k-1]), /[^[]+[[]{2}/, dest) ) {
      origcite = field[k]
      gsub(/<\/?nowiki>/, "", field[k])
      field[k] = reverse(dest[0]) field[k]
      origTable[gsubi("^[<]nowiki[^>]*[>]|[<][/]nowiki[>]", "", field[k])] = reverse(dest[0]) origcite
      sep[k-1] = gsubs(reverse(dest[0]), "", sep[k-1])
    }
  }
  article = unpatsplit(field, sep)

  # Special case
  # Regularize [[Politics of Venezuela#cite%20note-19|<sup>[1</sup>]]
  c = patsplit(article, field, /[|][<]sup[>][[][0-9]*[<][/]sup[>][]]{2}/, sep)
  for(k = 1; k <= c; k++) {
    if(match(reverse(sep[k-1]), /[^[]+[[]{2}/, dest) ) {
      origcite = field[k]
      sub(/[<]sup[>][[][^<]*[<][/]sup[>]/, "[]", field[k])
      field[k] = reverse(dest[0]) field[k]
      origTable[gsubi("^[<]nowiki[^>]*[>]|[<][/]nowiki[>]", "", field[k])] = reverse(dest[0]) origcite
      sep[k-1] = gsubs(reverse(dest[0]), "", sep[k-1])
    }
  }
  article = unpatsplit(field, sep)

  # Special case
  # Regularize <sup>[[User:Claudia Diaz2/sandbox#cite%20note-5|<nowiki>6]</nowiki>]]</sup>
  c = patsplit(article, field, /[|][<]nowiki[>][^]]*[]]{1}[<][/]nowiki[>][]]{2}/, sep)
  for(k = 1; k <= c; k++) {
    if(match(reverse(sep[k-1]), /[^[]+[[]{2}/, dest) ) {
      origcite = field[k]
      sub(/<nowiki>[^]]*[^]]/, "[", field[k])
      sub(/<\/nowiki>/, "", field[k])
      field[k] = reverse(dest[0]) field[k]
      origTable[gsubi("^[<]nowiki[^>]*[>]|[<][/]nowiki[>]", "", field[k])] = reverse(dest[0]) origcite
      sep[k-1] = gsubs(reverse(dest[0]), "", sep[k-1])
    }
  }
  article = unpatsplit(field, sep)

  # Special case
  # Regularize [[2017 Women's March#cite%20note-FT%20100%2C000-13|<span>[13]</span>]]
  # works for <sup> or <span>
  split("sup span", tag, " ")
  for(k = 1; k <= 2; k++) {
    re = "[<]" tag[k] "[^>]*[>][[][0-9]+[]][<][/]" tag[k] "[>][]]{2}"
    c = patsplit(article, field, re, sep)
    for(i = 1; i <= c; i++) {
      if(match(reverse(sep[i-1]), /[|][^[]+[[]{2}/, dest) ) {
        origcite = field[i]
        re = "^[<]" tag[k] "[^>]*[>]|[<][/]" tag[k] "[>]"
        gsub(re, "", field[i])
        field[i] = "<sup>" reverse(dest[0]) field[i] "</sup>"
        origTable[gsubi("^[<]" tag[k] "[^>]*[>]|[<][/]" tag[k] "[>]","",field[i])] = reverse(dest[0]) origcite
        sep[i-1] = gsubs(reverse(dest[0]), "", sep[i-1])
      }
    }
    article = unpatsplit(field,sep)
  }

  # Special case
  # Convert cases not surrounded by <sup> or <span>
  #  Works on the reversed article; ">pus<" is "<sup>" reversed
  d = patsplit(article, field2, ReSups, sep2)           # Everything already surrounded by sup
  c = patsplit(reverse(article), field, /[]]{3}[0-9]{0,3}[[][|][0-9]{1,3}[-][^[]+[[]{2}/, sep)
  for(i = 1; i <= c; i++) {
    foundit = 0
    # skip cites already inside a <sup> span
    for(j = 1; j <= d; j++) {
      if(field2[j] ~ regesc3(reverse(field[i]))) {
        foundit = 1
      }
    }
    if( ! foundit) {
      origTable["<sup>" reverse(field[i]) "</sup>"] = reverse(field[i])
      field[i] = ">pus/<" field[i] ">pus<"
    }
  }
  article = reverse(unpatsplit(field, sep))

  # Standard: convert cites surrounded by <sup>..</sup>

  c = patsplit(article, field, ReSups, sep)
  for(i = 1; i <= c; i++) {

    if(field[i] !~ /[{][{][^{]*[{][{]/ && field[i] ~ /cite[%]20/) {  # skip embeded templates not found by deflate()

      sendlog(logdir "logsups", wikiname, field[i])

      # Encode embedded [0-9] so it can be parsed
      # <sup>[[Group of Eight#cite%20note-19|[19]]][[Group of Eight#cite%20note-20|[20]]]</sup> -->
      #   <sup>[[Group of Eight#cite%20note-19|VEBUGO19VEBUGC]][[Group of Eight#cite%20note-20|VEBUGO20VEBUGC]]</sup>
      fieldi = field[i]
      while(match(fieldi, "[[][0-9]+[]]", dest)) {
        if(match(dest[0], /[0-9]+/, dest2))
          field[i] = gsubs(dest[0], "VEBUGO" dest2[0] "VEBUGC", field[i])
        sub("[[][0-9]+[]]", "", fieldi)
      }

      # Populate citeTable[][]
      delete citeTable
      d = patsplit(field[i], field2, /[[]{2}[^]]+[]]{1,3}/, sep2)
      for(j = 1; j <= d; j++) {

        # Decoded
        field2txt = field2[j]
        field2txt = gsubs("VEBUGO", "[", field2txt)
        field2txt = gsubs("VEBUGC", "]", field2txt)
        citeTable[field2txt]["decoded"] = field2txt

        key = field2txt


        # Encoded
        citeTable[key]["encoded"] = field2[j]

        if( empty(origTable[key]))
          origTable[key] = key

        # Cite number
        getCiteNumbers(key)
        if(abort(key, "citenumber")) continue

        # Primary article
        getTitle(key, "artprimary")
        if(abort(key, "artprimary")) continue

        # Secondary article title eg. Group of Eight
        getTitle(key, "artsecondary")
        if(abort(key, "artsecondary")) continue

        # Time/revision it was last added to primary article
        getPrimaryRevTime(key)
        if(abort(key, "artprimaryrevid")) continue
        if(abort(key, "artprimarytimestamp")) continue

        # Time/revision it existed in secondary article
        getSecondaryRevTime(key)
        if(abort(key, "artsecondaryrevid")) continue
        if(abort(key, "artsecondarytimestamp")) continue
        if(abort(key, "artsecondarywikitext")) continue

        setdone = 0  # if 1, ref is established early in process due to missing data
        settype = 0  # which fallback path (1-4) produced the replacement; 0 = full reconstruction

        # Cite number ambiguous - ask a human to verify which cite is meant
        if(! empty(citeTable[key]["citenumbermismatch"])) {
          split(citeTable[key]["citenumbermismatch"], cites, /[|]/)
          vertemp = "{{verify source |date=" getDateToday() " |reason=This ref was deleted Special:Diff/" citeTable[key]["artprimaryrevid"] " by a bug in VisualEditor and later identified by a bot. The original cite can be found at Special:Permalink/" citeTable[key]["artsecondaryrevid"] " (or in a rev close to it) in either cite #" cites[1] " or cite #" cites[2] " - find and verify the cite and replace this template with it (1). [[User:GreenC_bot/Job_18]]}}"
          field2[j] = "<ref>Citation error. See inline comment how to fix. " vertemp "</ref>"
          settype = 1
          setdone = 1
        }

        # Get cite from secondary by its number
        if( ! setdone) {
          getSecondaryCiteByNumer(key)
          if( empty(citeTable[key]["citesecondaryplaintext"]) ) {
            vertemp = "{{verify source |date=" getDateToday() " |reason=This ref was deleted Special:Diff/" citeTable[key]["artprimaryrevid"] " by a bug in VisualEditor and later identified by a bot. The original cite can be found at Special:Permalink/" citeTable[key]["artsecondaryrevid"] " (or in a rev close to it) as cite #" citeTable[key]["citenumber"] " - find and verify the cite and replace this template with it (2). [[User:GreenC_bot/Job_18]]}}"
            field2[j] = "<ref>Citation error. See inline comment how to fix. " vertemp "</ref>"
            settype = 2
            setdone = 1

          }
        }

        # Upload page to wikipedia
        if( ! setdone) {
          if( generateCitationsPage(key) == 0) {
            vertemp = "{{verify source |date=" getDateToday() " |reason=This ref was deleted Special:Diff/" citeTable[key]["artprimaryrevid"] " by a bug in VisualEditor and later restored by a bot in plain-text form. The original cite can be found at Special:Permalink/" citeTable[key]["artsecondaryrevid"] " (or in a rev close to it) as cite #" citeTable[key]["citenumber"] " - find and verify the cite and replace this template with it (3). [[User:GreenC_bot/Job_18]]}}"
            field2[j] = "<ref>" citeTable[key]["citesecondaryplaintext"] " " vertemp "</ref>"
            settype = 3
            setdone = 1
          }
        }

        # Parse page
        if( ! setdone) {
          parseCitationsPage(key)
          if( empty(citeTable[key]["citesecondarywikicite"]) ) {
            vertemp = "{{verify source |date=" getDateToday() " |reason=This ref was deleted Special:Diff/" citeTable[key]["artprimaryrevid"] " by a bug in VisualEditor and later restored by a bot in plain-text form. The original cite can be found at Special:Permalink/" citeTable[key]["artsecondaryrevid"] " (or in a rev close it it) as cite #" citeTable[key]["citenumber"] " - find and verify the cite and replace this template with it (4). [[User:GreenC_bot/Job_18]]}}"
            field2[j] = "<ref>" citeTable[key]["citesecondaryplaintext"] " " vertemp "</ref>"
            settype = 4
            setdone = 1
          }
        }

        # Full reconstruction: restore the original wikitext cite
        if( ! setdone) {
          vertemp = "{{verify source |date=" getDateToday() " |reason=This ref was deleted Special:Diff/" citeTable[key]["artprimaryrevid"] " by a bug in VisualEditor and later restored by a bot from the original cite located at Special:Permalink/" citeTable[key]["artsecondaryrevid"] " cite #" citeTable[key]["citenumber"] " - verify the cite is accurate and delete this template. [[User:GreenC_bot/Job_18]]}}"
          field2[j] = "<ref>" citeTable[key]["citesecondarywikicite"] " " vertemp "</ref>"
          sendlog(logdir "logconvert", wikiname, key " ---- " citeTable[key]["citesecondarywikicite"] )
        }
        else {
          sendlog(logdir "logconvert", wikiname, key " ---- " settype )
        }

        Count++

      }

      field[i] = unpatsplit(field2,sep2)
      gsub(/^[<]sup[^>]*[>]|[<][/]sup[>]$/, "", field[i])

      # Save debugging info to table.out in data directory
      saveTable()

    }
  }

  article = unpatsplit(field, sep)

  # Check for missed sups
  if(article ~ /[#]cite[%]20/ || article ~ /VEBUG[OC]/ ) {
    sendlog(logdir "logmissed", wikiname, "Contains cite%20 or VEBUGO . aborting")
    print article > DataDir "article.abort.txt"
    Count = 0         # Abort changes to article
  }

  return article

}


#
# Get citations numbers
#
#   Populates:
#    citeTable[key]["citenumber"]  (a single cite number best guess, usually the second number)
#    citeTable[key]["citenumbermismatch"]  (two cites numbers sep by "|" if different, otherwise blank)
#
#   The decoded key looks like "[[Title#cite%20note-55|[note 1]]]" - the number
#   embedded in the anchor ("55") and the bracketed display label ("note 1")
#   may disagree; a disagreement is recorded in "citenumbermismatch"
#
function getCiteNumbers(key,  buf1,buf2,dest) {

        citeTable[key]["citenumber"] = ""
        citeTable[key]["citenumbermismatch"] = ""

        # -55|[note 1]]$  (get "note 1")
        # matching on the reversed string anchors the search at the cite's tail
        if(match(reverse(citeTable[key]["decoded"]), /^[]]{2}[^[]*[[][|]/, dest)) {
          gsub(/^[]]{2,3}|[[][|]$/, "", dest[0])
          buf1 = reverse(dest[0])
          if(! empty(strip(buf1)))
            citeTable[key]["citenumber"] = buf1
        }

        # -55|[note 1]]$  (get "55")
        # grab everything between the last "-" and the "|" (reversed view)
        if(match(reverse(citeTable[key]["decoded"]), /[|][^-]*[^-]/, dest)) {
          gsub(/^[|]/, "", dest[0])
          if(! empty(strip(dest[0])))
            buf2 = reverse(dest[0])
        }

        # Record when the two extracted numbers disagree ("anchor|display" order)
        if(! empty(citeTable[key]["citenumber"]) && ! empty(buf2)) {
          if(buf2 != citeTable[key]["citenumber"])
            citeTable[key]["citenumbermismatch"] = buf2 "|" citeTable[key]["citenumber"]
        }
        # Prefer the numeric anchor value when the display label is non-numeric eg. "note 1"
        if( ! isanumber(citeTable[key]["citenumber"]) && isanumber(buf2))
          citeTable[key]["citenumber"] = buf2

}
#
# Parse citations page User:GreenC/testcases/iabdebug
#
#   Populate citeTable[key]["citesecondarywikicite"] with the wikitext cite
#   whose plain-text rendering matches citeTable[key]["citesecondaryplaintext"]
#   Left blank when no record matches.
#
#   Fix: 'command' was missing from the locals list and leaked as a global;
#   the unused 'd' local (ignored split() return) was removed.
#
function parseCitationsPage(key, begin,i,a,b,fp,np,c,e,command) {

  citeTable[key]["citesecondarywikicite"] = ""

  # Convert page to plain-text
  command = "w3m -dump -cols 10000 'https://en.wikipedia.org/wiki/User:GreenC/testcases/iabdebug'"
  fp = sys2var(command)

  # Extract core surrounded by "%%%%" and "^^^^"
  begin = 0
  for(i = 1; i <= splitn(fp, a, i); i++) {
    if(a[i] ~ /^[%]{4}$/) {
      begin = 1
      continue
    }
    if(a[i] ~ /^[\\^]{4}$/) begin = 0
    if(begin)
      np = np "\n" a[i]
  }

  # Extract records surrounded by "@@@@" ; within a record the plain-text
  # cite and its wikitext form are separated by "++++"
  c = split(np, b, /[@]{4}\n/)
  for(i = 1; i <= c; i++) {
    if(! empty(strip(b[i]))) {

      split(strip(b[i]), e, /[+]{4}\n/)

      # Drop the literal "^[dead link]" marker w3m renders for dead-link templates
      e[1] = gsubs("^[dead link]", "", e[1])

      # print strip(e[1]) " = " citeTable[key]["citesecondaryplaintext"]

      if(strip(e[1]) == citeTable[key]["citesecondaryplaintext"]) {
        citeTable[key]["citesecondarywikicite"] = strip(e[2])
        break
      }
    }
  }


}

#
# Generate a list of citations in parsable format and upload to User:GreenC/testcases/iabdebug
#
#   Returns 1 when the upload succeeds (up to 3 attempts), otherwise 0
#
function generateCitationsPage(key,  total,segs,n,cite,brk,page,status) {

  brk = "\n<p>\n"

  # need to set date type so CS1|2 date display is consistent
  page = setdatetype(citeTable[key]["artsecondarywikitext"]) brk

  # Layout: the list is fenced by %%%% / ^^^^; each record starts with @@@@
  # and holds the plain cite and its <nowiki> wikitext separated by ++++
  page = page "%%%%" brk
  total = split(citeTable[key]["artsecondarywikitext"], segs, "<ref[^>]*>")
  for(n = 1; n <= total; n++) {
    cite = strip(substr(segs[n], 1, match(segs[n], "</ref>") - 1))
    if( ! empty(cite))
      page = page "@@@@" brk cite brk "++++" brk "<nowiki>" cite "</nowiki>" brk
  }
  page = page "^^^^" brk

  # Upload page - three attempts with a pause after each
  for(n = 1; n <= 3; n++) {
    status = sys2varPipe(page, "wikiget -E " shquote("User:GreenC/testcases/iabdebug") " -S " shquote(citeTable[key]["artprimary"] " -/- " citeTable[key]["artsecondary"]) " -P STDIN")
    sleep(5)
    if(status ~ "Success" || status ~ "No change")
      return 1
  }

  sendlog(logdir "syslog", wikiname, "Error: Unable to upload to User page, wikiget returns: " status)

  return 0

}

#
# Get plain-text version of given cite number in secondary article
#  populates citeTable[key]["citesecondaryplaintext"]
#  on error set to blank
#
function getSecondaryCiteByNumer(key,  command,fp,citenum,a,i) {

  citeTable[key]["citesecondaryplaintext"] = ""

  # Plain text of secondary article, rendered by w3m at the saved revision
  command = "w3m -dump -cols 10000 " shquote("https://en.wikipedia.org/w/index.php?title=" urlencodeawk(citeTable[key]["artsecondary"]) "&oldid=" citeTable[key]["artsecondaryrevid"] )
  fp = sys2var(command)

  # Get the cite # in plain-text
  #  Match the numbered line in the references dump eg. " 5. ^ ..."
  #  (single-digit numbers appear with a leading space - presumably w3m's
  #   column alignment; confirm if w3m output format changes)
  if( int(citeTable[key]["citenumber"]) < 10)
    citenum = "^[ ]" citeTable[key]["citenumber"] "[.][ ][\\^]"
  else
    citenum = "^" citeTable[key]["citenumber"] "[.][ ][\\^]"

  for(i = 1; i <= splitn(fp, a, i); i++) {
    if(a[i] ~ citenum) {
      sub(citenum, "", a[i])
      # Strip backlink markers eg. "^a b " left by w3m
      gsub(/[\\^][a-z]{1,3}[ ]/, "", a[i])

      # Strip the literal "^[dead link]" marker rendered for dead-link templates
      a[i] = gsubs("^[dead link]", "", a[i])

      citeTable[key]["citesecondaryplaintext"] = strip(a[i])
      # break # keep going to get past any duplicates in 'notes' section .. this is imperfect though
    }
  }

}


#
# What time and revision did the cite exist in the secondary article?
#  Walks the secondary article's revision history backwards to the first
#  revision at-or-before the time the cite was deleted from the primary.
#  populates:
#    citeTable[key]["artsecondaryrevid"]
#    citeTable[key]["artsecondarytimestamp"]
#    citeTable[key]["artsecondarywikitext"]
#  on error they are blank
#
#  Fix: the locals list declared 'unixprimary' while the code used
#  'unixPrimary', leaking it as a global; also removed the 'prevcontinue'
#  local which was assigned but never read in this function.
#
function getSecondaryRevTime(key,  jsona,i,cont,unixPrimary,command,j,arrevid,artimestamp,arcontinue,a,maxrevs) {

  citeTable[key]["artsecondaryrevid"] = ""
  citeTable[key]["artsecondarytimestamp"] = ""
  citeTable[key]["artsecondarywikitext"] = ""

  # Cannot proceed without knowing when the cite vanished from the primary
  if(! empty(citeTable[key]["artprimarytimestamp"]))
    unixPrimary = unixTime(citeTable[key]["artprimarytimestamp"])
  else
    return

  maxrevs = 20
  i       = 0
  cont    = 1

  while(cont) {
    i++
    if(i > maxrevs) {
      sendlog(logdir "syslog", wikiname, "Error: Exceeded " maxrevs " API requests in getSecondaryRevTime()")
      break  # sanity break
    }

    # First request starts at the newest revision; later requests resume from
    # the API's rvcontinue token
    if(empty(arcontinue["continue"]))
      command = "wget -q -O- " shquote("https://en.wikipedia.org/w/api.php?action=query&prop=revisions&titles=" urlencodeawk(citeTable[key]["artsecondary"]) "&rvlimit=50&rvslots=main&rvprop=timestamp%7Cuser%7Cids&rvdir=older&format=json")
    else
      command = "wget -q -O- " shquote("https://en.wikipedia.org/w/api.php?action=query&prop=revisions&titles=" urlencodeawk(citeTable[key]["artsecondary"]) "&rvlimit=50&rvslots=main&rvprop=timestamp%7Cuser%7Cids&rvdir=older&format=json&rvcontinue=" urlencodeawk(arcontinue["continue"]) )

    if(query_json(sys2var(command), jsona) >= 0) {

      # jsona["query","pages","1196634","revisions","1","revid"]=7741763
      splitja(jsona, arrevid, 5, "revid")
      # jsona["query","pages","1196634","revisions","1","timestamp"]=2004-11-22T05:21:18Z
      splitja(jsona, artimestamp, 5, "timestamp")
      # jsona["continue","rvcontinue"]=20041122055516|7769047
      splitja(jsona, arcontinue, 1, "rvcontinue")

      for(j = 1; j <= length(arrevid); j++) {
        # First revision at-or-before the deletion time in the primary
        if(unixTime(artimestamp[j]) <= unixPrimary) {
          if(unixTime(artimestamp[j]) == unixPrimary && citeTable[key]["artprimary"] == citeTable[key]["artsecondary"]) { # same article same diff get prev
            if(j == 50) break # size of block requested in API
            j++
          }
          delete a
          tup(getwikisource(citeTable[key]["artsecondary"], "dontfollow", "wikipedia.org", "en", arrevid[j]), a)
          if(a[1] != "REDIRECT" && ! empty(a[1])) {
            citeTable[key]["artsecondaryrevid"] = arrevid[j]
            citeTable[key]["artsecondarytimestamp"] = artimestamp[j]
            citeTable[key]["artsecondarywikitext"] = a[1]
            cont = 0
            break
          }
          else {
            if( empty(a[1]) && empty(a[2]) ) {  # revision is empty
              sendlog(logdir "syslog", wikiname, "Error: empty secondary revision (" citeTable[key]["artsecondary"] "): " artimestamp[j])
            }
            cont = 0
            break
          }
        }
      }
    }
  }

}

#
# Get article title (follow redirects)
#
#   key2 == "artsecondary": extract "[[Title" from the cite key itself and
#     resolve any redirect
#   key2 == "artprimary": resolve wikiname (the article being processed)
#   Populates citeTable[key1][key2]; left blank on no match
#
#   Fix: the second branch read 'else if(key2 = "artprimary")' - an assignment
#   rather than a comparison, which was always true and clobbered key2.
#
function getTitle(key1,key2,   dest,a) {

        citeTable[key1][key2] = ""

        if(key2 == "artsecondary") {
          # "[[Title" runs up to the "#cite%20note" anchor
          if(match(key1, /^[[][^#]+[^#]/, dest) > 0) {
            sub(/^[[]{2}/, "", dest[0])
            tup(getwikisource(dest[0], "dontfollow", "wikipedia.org", "en"), a)
            if(a[1] == "REDIRECT") {
              gsub(/^#REDIRECT[ ]*[[]{2}|[]]$/, "", a[2])
              citeTable[key1][key2] = a[2]
            }
            else
              citeTable[key1][key2] = dest[0]
          }
        }
        else if(key2 == "artprimary") {
          tup(getwikisource(wikiname, "dontfollow", "wikipedia.org", "en"), a)
          if(a[1] == "REDIRECT") {
            gsub(/^#REDIRECT [[]{2}|[]]$/, "", a[2])
            citeTable[key1][key2] = a[2]
          }
          else
            citeTable[key1][key2] = wikiname
        }
}

#
# What time and revision was the bug added to the primary article?
#  Walks the primary article's history backwards until the cite (per
#  origTable[key]) is no longer present; the revision AFTER that point
#  (the previous one examined) is when the bug was introduced.
#  populates:
#    citeTable[key]["artprimaryrevid"]
#    citeTable[key]["artprimarytimestamp"]
#  on error they are blank
#
#  Fixes: the locals list declared 'atimestamp' while the code used
#  'artimestamp', leaking it as a global; and the empty-revision branch read
#  citeTable[key]["secondary"] - a key that is never populated, so the
#  comparison was always true - corrected to "artsecondary".
#
function getPrimaryRevTime(key,  jsona,i,cont,command,arrevid,artimestamp,arcontinue,j,a,prevrevid,prevtimestamp,maxrevs,prevcontinue,jsonin) {

  citeTable[key]["artprimaryrevid"] = ""
  citeTable[key]["artprimarytimestamp"] = ""

  maxrevs = 20
  i       = 0
  cont    = 1

  while(cont) {
    i++
    if(i > maxrevs) {
      sendlog(logdir "syslog", wikiname, "Error: Exceeded " maxrevs " API requests in getPrimaryRevTime()")
      break  # sanity break
    }

    # First request starts at the newest revision; later requests resume from
    # the API's rvcontinue token
    if(empty(arcontinue["continue"]))
      command = "wget -q -O- " shquote("https://en.wikipedia.org/w/api.php?action=query&prop=revisions&titles=" urlencodeawk(citeTable[key]["artprimary"]) "&rvlimit=50&rvslots=main&rvprop=timestamp%7Cuser%7Cids&rvdir=older&format=json")
    else
      command = "wget -q -O- " shquote("https://en.wikipedia.org/w/api.php?action=query&prop=revisions&titles=" urlencodeawk(citeTable[key]["artprimary"]) "&rvlimit=50&rvslots=main&rvprop=timestamp%7Cuser%7Cids&rvdir=older&format=json&rvcontinue=" urlencodeawk(arcontinue["continue"]) )

    jsonin = sys2var(command)

    if(query_json(jsonin, jsona) >= 0) {

      # jsona["query","pages","1196634","revisions","1","revid"]=7741763
      splitja(jsona, arrevid, 5, "revid")
      # jsona["query","pages","1196634","revisions","1","timestamp"]=2004-11-22T05:21:18Z
      splitja(jsona, artimestamp, 5, "timestamp")
      # jsona["continue","rvcontinue"]=20041122055516|7769047
      splitja(jsona, arcontinue, 1, "rvcontinue")


      if(arcontinue["continue"] == prevcontinue) {  # reached last revision
        j = length(arrevid)
        delete a
        tup(getwikisource(citeTable[key]["artprimary"], "dontfollow", "wikipedia.org", "en", arrevid[j]), a)
        if(a[1] != "REDIRECT" && ! empty(a[1])) {
          citeTable[key]["artprimaryrevid"] = arrevid[j]
          citeTable[key]["artprimarytimestamp"] = artimestamp[j]
          citeTable[key]["artprimarywikitext"] = a[1]
          cont = 0
          break
        }
        else {
          sendlog(logdir "syslog", wikiname, "Error: empty primary revision: " artimestamp[j])
          cont = 0
          break
        }
      }

      for(j = 1; j <= length(arrevid); j++) {    # step through each revision for this batch

        tup(getwikisource(citeTable[key]["artprimary"], "dontfollow", "wikipedia.org", "en", arrevid[j]), a)

        if(a[1] != "REDIRECT" && ! empty(a[1])) {
          # cite no longer present at this revision - bug was introduced in the
          # previously-examined (newer) revision
          if(countsubstring(a[1], origTable[key]) == 0) {
            if(! empty(prevrevid)) {
              citeTable[key]["artprimaryrevid"] = prevrevid
              citeTable[key]["artprimarytimestamp"] = prevtimestamp
            }
            else {
              citeTable[key]["artprimaryrevid"] = arrevid[j]
              citeTable[key]["artprimarytimestamp"] = artimestamp[j]
            }
            cont = 0
            break
          }
          else {
            prevrevid = arrevid[j]
            prevtimestamp = artimestamp[j]
          }
        }
        else {
          if( empty(a[1]) && empty(a[2]) ) {  # revision is empty
            if( citeTable[key]["artprimary"] != citeTable[key]["artsecondary"] ) {
              citeTable[key]["artprimaryrevid"] = arrevid[j]
              citeTable[key]["artprimarytimestamp"] = artimestamp[j]
            }
            else
              sendlog(logdir "syslog", wikiname, "Error: empty primary revision: " artimestamp[j])
            cont = 0
            break
          }
        }
      }
      prevcontinue = arcontinue["continue"]
    }
  }
}

#
# Given a Wikipedia datestring, return unix-time string (seconds since 1970)
#  (delegates parsing to GNU date)
#
function unixTime(s,  cmd) {
  cmd = "date --date=" shquote(s) " +%s"
  return sys2var(cmd)
}

#
# Return todays date as "August 2019"
#  (month name and year per GNU date's %B %Y)
#
function getDateToday(  cmd) {
  cmd = "date +\"%B %Y\""
  return sys2var(cmd)
}

#
# Determine article date type {{set dmy dates}} etc
#  imported from medilibrary.nim
#
#  Returns "{{dmy}}" or "{{mdy}}" for the first matching {{use dmy/mdy dates}}
#  style template found in 'art', otherwise "" when neither appears
#
function setdatetype(art,   reDmy,reMdy,i,a) {

  # Match the {{use dmy dates}}, {{dmy|..}}, {{dmy}} and {{use-dmy..}} variants
  # (ReSpace allows arbitrary whitespace/newlines between template tokens)
  reDmy = "[{]{2}" ReSpace "use" ReSpace "dmy" ReSpace "d?a?t?e?s?|[{]{2}" ReSpace "dmy" ReSpace "[|]|[{]{2}" ReSpace "dmy" ReSpace "[}]|[{]{2}" "use[ -]?dmy"
  reMdy = "[{]{2}" ReSpace "use" ReSpace "mdy" ReSpace "d?a?t?e?s?|[{]{2}" ReSpace "mdy" ReSpace "[|]|[{]{2}" ReSpace "mdy" ReSpace "[}]|[{]{2}" "use[ -]?mdy"
  for(i = 1; i <= splitn(art, a, i); i++) {
    if(a[i] ~ reDmy)
      return "{{dmy}}"
    if(a[i] ~ reMdy)
      return "{{mdy}}"
  }
  return ""
}

#
# Abort check
#
#  Returns 1 (and logs to logabort) when citeTable[key1][key2] is blank,
#  otherwise returns 0
#
function abort(key1, key2) {

        # A populated value means no abort is needed
        if( ! empty( strip(citeTable[key1][key2]) ) )
          return 0

        if(key2 == "citesecondaryplaintext")
          sendlog(logdir "logabort", wikiname, key1 " ---- " citeTable[key1]["citesecondaryplaintext"] " ---- " key2 " missing")
        else
          sendlog(logdir "logabort", wikiname, key1 " ---- " key2 " missing")

        return 1
}

#
# Reverse string
#  Builds the result by walking the input from its last character to its first
#
function reverse(s,   i,r) {
   r = ""
   for(i = length(s); i > 0; i--)
      r = r substr(s, i, 1)
   return r
}


#
# Save debugging info to table.out in the data directory
#  Dumps origTable[] and every citeTable[][] entry; the bulky
#  "artsecondarywikitext" value is reported by length only.
#
#  Fixes: kk, ll and printtable leaked as globals (no locals declared);
#  the always-on 'printtable' flag was dead code and was removed; the
#  repeated output filename is computed once.
#
function saveTable(  kk,ll,out) {
      out = DataDir "table.out"
      for(kk in citeTable) {
        print "\n -------------- \n" >> out
        print "origTable[" kk "] = " origTable[kk] >> out
        for(ll in citeTable[kk]) {
          if(ll != "artsecondarywikitext")
            print "citeTable[" kk "][" ll "] = " citeTable[kk][ll] >> out
          else {
            print "citeTable[" kk "][" ll "] = " length(citeTable[kk][ll]) >> out
          }
        }
      }

}