Jump to content

Wikipedia:Bots/Requests for approval/BareRefBot/Code

From Wikipedia, the free encyclopedia
// 2.0 - 2022 February 17
/**
 * Look up the cached record for a URL in the `web` table.
 *
 * @param {string} url - URL to look up (bound as the query's $1 parameter).
 * @returns {?{url: *, title: *, isdead: *, work: *, metatitle: *, shouldnotplace: *}}
 *   The selected columns of the matching row, or null when the lookup does not
 *   yield exactly one row, when the row is flagged `shouldnotplace`, or when the
 *   lookup throws.
 */
function checkentry(url) {
    const lookupquery = "SELECT * FROM web WHERE url=$1"
    try {
        // NOTE(review): the result is used synchronously. If sql.query returns a Promise,
        // rowCount is undefined here and every lookup yields null - confirm the client is synchronous.
        const res = sql.query(lookupquery, [url])
        if (res.rowCount !== 1) {
            // Either no cached row or more than one (duplicate); both are unusable.
            return null
        }
        const row = res.rows[0]
        if (row.shouldnotplace) {
            // dup titles, etc... detected by grabber script
            return null
        }
        return {
            url: row.url,
            title: row.title,
            isdead: row.isdead,
            work: row.work,
            metatitle: row.metatitle,
            shouldnotplace: row.shouldnotplace
        }
    } catch (e) {
        // Still best-effort (the caller just skips the URL), but report the failure
        // so a broken database connection is not mistaken for "no cached entry".
        console.error("checkentry: lookup failed for " + url, e)
        return null
    }
}

/**
 * Report whether a page title looks like an error page, bot wall, parked domain
 * or other placeholder that must not be used as a citation title.
 *
 * @param {string} str - title to inspect; matching is case-insensitive.
 * @returns {boolean} true when the title contains any of the unsuitable phrases.
 */
function checktitle(str) {
    const badones = [
        "error",
        "not found",
        "sorry",
        "cookies",
        "404",
        "410",
        "just a moment",
        "unavailable",
        "not available",
        "untitled",
        "web server is down",
        "wayback machine",
        "archive.",
        "attention required",
        "paywall",
        "503",
        "too many requests",
        "under construction",
        "hugedomains",
        "godaddy",
        "are you a robot",
        "loading...",
        "account suspended",
        "domain for sale",
        "access denied",
        "browser settings",
        "suspended",
        "unsupported",
        "down for maintainence", // misspelling kept on purpose: it only matches pages that misspell it the same way
        "down for maintenance",
        "captcha"
    ]
    // Lowercase once instead of once per phrase.
    const lowered = str.toLowerCase()
    return badones.some((phrase) => lowered.includes(phrase))
}
/**
 * Escape a string so it can be embedded in a RegExp source and match literally.
 *
 * @param {string} str - text to escape.
 * @returns {string} `str` with every regular-expression metacharacter
 *   (including "|", which would otherwise act as alternation) prefixed by a backslash.
 */
function regexp_quote(str) {
    return str.replace(/[.?*+^$|[\]\\(){}-]/g, "\\$&")
}
/**
 * Fill in bare URLs found among the parser tokens of a single <ref>...</ref>.
 *
 * For each bare URL token with a usable cached entry (see checkentry), the token is
 * replaced in place by a parsed {{cite web}} template. When the cached entry is marked
 * dead, a {{Dead link}} tag is appended to the token list instead. A
 * {{bare url inline}} template that follows a filled URL in the same list is removed.
 *
 * @param {Array} refitem - token list to scan; modified in place. A string or
 *   null/undefined is ignored.
 * @param {string} [datefmt=""] - accepted from callers and handed down on recursion;
 *   not otherwise used by this function.
 * @returns {undefined}
 */
function traverse(refitem, datefmt = "") {
    // Recursion can reach a plain string or a slot emptied by `delete` below; neither has child tokens.
    if (refitem == null || typeof refitem == "string") {
        return
    }
    // URL fragments that are never filled: archive hosts (note the trailing "."), WebCite,
    // and sites whose fixes have not been integrated yet.
    const skippedfragments = ["archive.", "webcit", "youtube.com", "twitter.com", "facebook.com", "instagram.com"]
    // Characters that cause CS1 errors; a title or website containing any of them is left unfilled.
    const cs1badchars = /[\u00a0\u00ad\ufffd\u200a\u200b\u200d\u0009\u0010\u0013\u007f\u0000-\u001f\u0080-\u0094]/
    let traversedcount = -1 // index of the current token within refitem
    let removebaretemp = false // set once a URL was filled, so a following {{bare url inline}} gets removed
    for (const refobj of refitem) { // iterate over parser "objects" in the <ref></ref> in question
        traversedcount = traversedcount + 1
        if (refobj == null) {
            continue
        }
        if (typeof refobj == "string") {
            // Original guard against having recursed into a string.
            // NOTE(review): this also stops the scan at the first string child of a real token
            // list (e.g. whitespace between a URL and a template) - confirm that is intended.
            return
        }
        if (refobj.type == "url" && refobj.is_bare == true) {
            let usethisurl = refobj[0].toString()
            if (skippedfragments.some((fragment) => usethisurl.indexOf(fragment) >= 0)) {
                // These should either be in archive-url (out of scope) or are not handled yet.
                continue
            }
            let shoulddo = true
            for (const sibling of refitem) { // look at the whole ref again for anything that makes filling unsafe
                if (sibling == null) {
                    continue
                }
                if (typeof sibling == "string" && sibling.trim() != "") {
                    // Don't fix URLs surrounded by text, e.g. <ref>https://website.website is an amazing site</ref>
                    shoulddo = false
                    break
                }
                if (sibling.type == "transclusion" && sibling.name.toLowerCase() != "bare url inline") {
                    // An unrecognized template inside the ref might be out of scope; skip.
                    shoulddo = false
                    break
                }
            }
            if (!shoulddo) {
                continue
            }
            usethisurl = usethisurl.replaceAll("|", "%7C") // escape for CS1
            let parsethis = "{{cite web |url=" + usethisurl
            if (usethisurl.indexOf(".pdf") >= 0) {
                continue
            }
            const retstruct = checkentry(usethisurl)
            if (!retstruct || !retstruct.title) {
                // no match
                continue
            }
            let usethistitle = retstruct.title
            let usethiswebsite = ""
            const placeDead = Boolean(retstruct.isdead)
            if (retstruct.metatitle && retstruct.work && !placeDead) {
                // Sometimes the "metatitle" ends with the name of the work. The "work" field is taken to be
                // the website name, so only that known suffix is spliced off - never an arbitrary
                // "Title - Something" tail.
                let metatitle = retstruct.metatitle.replaceAll("|", "{{!}}")
                const work = retstruct.work.replaceAll("|", "{{!}}")
                const metatitle_lcase = metatitle.toLowerCase()
                const work_lcase = work.toLowerCase()
                const title_lcase = usethistitle.toLowerCase()
                if (metatitle.trim() != ""
                    && work.trim() != ""
                    && metatitle_lcase != work_lcase
                    && work_lcase != title_lcase
                    && (metatitle_lcase != title_lcase || metatitle_lcase.indexOf(work_lcase) > 0)
                    && work_lcase.indexOf(metatitle_lcase) < 0) {
                    if (metatitle_lcase.indexOf(work_lcase) > 0) {
                        // Website name is inside the title: strip "<separator> <work>" from the end.
                        // NOTE(review): pipes were already rewritten to {{!}} above, so the "|" alternative
                        // in this separator class can no longer match a pipe-separated title - confirm.
                        const regobj = new RegExp("[»|–—-]+\\s+" + regexp_quote(work) + "$")
                        metatitle = metatitle.replace(regobj, "")
                        if (metatitle.toLowerCase() != metatitle_lcase && metatitle.trim() != "") {
                            // Something was stripped and a title remains: use both, otherwise move on.
                            usethistitle = metatitle
                            usethiswebsite = work
                        }
                    } else {
                        usethistitle = metatitle
                        usethiswebsite = work
                    }
                } else {
                    // The website name could not be separated from the title.
                    // Per request, put the domain name instead; redundancy is better than nothing.
                    try {
                        usethiswebsite = new URL(usethisurl).hostname
                    } catch (e) {
                        // Not a parseable absolute URL; leave this reference alone.
                        continue
                    }
                }
            }

            if (usethistitle.length > 75 || usethiswebsite.length > 35) {
                // Some malformed websites have absurdly long titles. Don't fill these in.
                continue
            }
            if (checktitle(usethistitle)) {
                // Bad title: we don't know if the page is dead, just that something is wrong. Leave it alone.
                continue
            }
            if (usethisurl.toLowerCase().indexOf(usethistitle.toLowerCase()) >= 0) {
                // The title is just part of the URL (e.g. title "Wikipedia.Com" for wikipedia.com/fsdfdfsdf); don't fill.
                continue
            }
            if (usethistitle && usethistitle != "" && !placeDead) {
                usethistitle = usethistitle.replaceAll("|", "{{!}}") // escape pipes in titles
                if (cs1badchars.test(usethistitle) || cs1badchars.test(usethiswebsite)) {
                    continue
                }
                parsethis = parsethis + " |title=" + usethistitle.trim()
                if (usethiswebsite && usethiswebsite != "") {
                    // Only add "website=" when a value was retrieved.
                    parsethis = parsethis + " |website=" + usethiswebsite.trim()
                }
            } else if (placeDead) {
                // NOTE(review): the tag date is hard-coded and datefmt is not consulted - confirm before reuse.
                const deadtag = " {{Dead link|bot=BareRefBot|date=February 2022}}" // note space
                refitem.push(CeL.net.wiki.parser(deadtag).parse())
                continue
            }

            parsethis = parsethis + "}}"
            refitem[traversedcount] = CeL.net.wiki.parser(parsethis).parse()
            console.log("done with " + usethistitle)
            removebaretemp = true
        }
        if (refobj.type == "external_link") {
            continue
        }
        if (refobj.type == "transclusion" && refobj.name.toLowerCase() == "bare url inline" && removebaretemp) {
            delete refitem[traversedcount]
            removebaretemp = false
        }
        if (refitem.type == "tag_inner") {
            traverse(refitem[traversedcount], datefmt) // Deal with nested refs, and other parser strangeness.
        }
    }
}
/**
 * Process one wikitext file: parse it, run traverse() over the contents of every
 * <ref> tag, and write the result back to the same file.
 *
 * Nothing is written when the serialized result is empty or shorter than the input.
 *
 * @param {string} filename - path of the wikitext file to read and overwrite.
 * @returns {undefined}
 */
function main(filename) {
    const wikitxt = fs.readFileSync(filename).toString()
    const parsed_data = CeL.net.wiki.parser(wikitxt).parse()
    parsed_data.each("tag_inner", function refprocess(token, index, parent) {
        if (!parent || parent.tag != "ref") {
            // We don't want to convert non-ref bares (e.g. URLs out of nowhere and external link sections).
            return
        }
        // NOTE(review): `datetype` is not defined in the visible code - presumably declared elsewhere; confirm.
        traverse(token, datetype)
    })
    console.log("done")
    const writeto = parsed_data.toString()
    if (!writeto || writeto.trim().length == 0 || writeto.length < wikitxt.length) {
        // Should never be shorter; sanity check to prevent blanking. Leave the file as it is.
        return
    }
    writeFile(filename, writeto)
}