Wikipedia:Bots/Requests for approval/BHGbot 9/Step5 checker
Appearance
// AWB custom module to remove {{tl|Cleanup bare URLs}} when there are no remaining [[WP:Bare URLs|Bare URLs]]
// v0.07 18 October 2021
// -- BHG
//
// NOTE this version is hacked for testing purposes.
// It skips all pages except those which get to Step 5, then fail there.

/// <summary>
/// Builds the bot name-and-version label used as the prefix of every edit summary.
/// </summary>
/// <returns>The bot link, version and trial marker concatenated into one string.</returns>
public string botNV()
{
    string botName = "[[WP:BHGbot 9]]";
    string botVersion = "0.07 checker";
    // Set botTrial to "" once the trial is over.
    string botTrial = " Trial";
    return botName + "v" + botVersion + botTrial;
}

/// <summary>
/// AWB entry point. Decides whether a page tagged with {{Cleanup bare URLs}} (or an alias)
/// still contains bare URLs.
/// </summary>
/// <remarks>
/// This is the "Step 5 checker" build: pages that pass every test are skipped rather than
/// saved, and only pages that still show a URL after Step 5 are returned (not skipped),
/// with a "STEP 5 FAIL" banner prepended so they can be inspected.
/// </remarks>
/// <param name="ArticleText">Wikitext of the page.</param>
/// <param name="ArticleTitle">Page title (not used by this module).</param>
/// <param name="wikiNamespace">Namespace number of the page (not used by this module).</param>
/// <param name="Summary">Edit summary to use if the page is saved.</param>
/// <param name="Skip">Set to true when AWB should skip the page.</param>
/// <returns>The text AWB should use for the page.</returns>
public string ProcessArticle(string ArticleText, string ArticleTitle, int wikiNamespace, out string Summary, out bool Skip)
{
    Skip = false;
    Summary = botNV() + ": ";

    // Flip to true to get a diagnostic page back from every exit point instead of a skip.
    bool debugging = false;
    string debuggingEditSummary = "This is a test to debug " + botNV() + ". This edit should not have been saved, so please revert it";
    string successEditSummary = "Removed {{[[Template:Cleanup bare URLs|Cleanup bare URLs]]}}. This page currently has no bare URLs";

    // ---- Regexes ----

    // The banner template {{Cleanup bare URLs}} and its many aliases.
    string CleanupBareURLsTagMatcher = @"\s*\{\{ *([tT]emplate *: *)?([Cc]leanup[_ ]+bare[_ ]+URLs|[Bb]are[_ ]+|[Bb]are|[Bb]are[_ ]+link|[Bb]are[_ ]+linkname|[Bb]are[_ ]+links|[Bb]are[_ ]+references|[Bb]are[_ ]+refs|[Bb]are[_ ]+URL|[Bb]are[_ ]+URLs|[Bb]are-URLs|[Bb]arelinks|[Bb]areURL|[Bb]areURLs|[Cc]leanup[_ ]+bare-URLs|[Cc]leanup[_ ]+link[_ ]+rot|[Cc]leanup[_ ]+link-rot|[Cc]leanup-Bare[_ ]+URLs|[Cc]leanup-barelinks|[Cc]leanup-link[_ ]+rot|[Cc]leanup-link-rot|[Cc]leanup-linkrot|[Cc]UBURL|[Ll]ink[_ ]+rot|[Ll]INKROT|[Ll]R) *(\|[^\}]*)?\}\}";

    // The inline tag {{Bare URL inline}} and its aliases.
    string bareURLinlineTagMatcher = @"\s*\{\{ *([tT]emplate *: *)?([Bb]are[_ ]+URL[\- ]inline|[Ll]inkrot-inline|[Bb]are-inline|[Bb]are[_ ]+inline|[Bb]are[_ ]+url[_ ]+inline|[Bb]are-url[_ ]+inline|[Bb]are[_ ]+link[_ ]+inline|[Bb]are-link-inline|[Bb]are-url-inline|[Bb]are[_ ]+url) *(\|[^\}]*)?\}\}";

    // A <ref> whose whole content is a URL, optionally wrapped in single square brackets.
    string bareURLinlineRefMatcher = @"<ref[^>]*?>\s*\[?\s*https?:[^>< \|\[\]]+\s*\]?\s*<\s*/\s*ref\s*>";

    // A <ref>...</ref> pair with no nested tags inside it.
    string completeRefTagMatcher = @"<ref[^>]*?>[^<>]*<\s*/\s*ref\s*>";

    // Yes, this is crude, and will miss some cases such as cites using {{sfnref}},
    // but it will do for a start.
    string citeTemplateMatcher = @"\{\{ *([tT]emplate *: *)?([Cc](ite|itation))[^\]\{]*\}\}";

    string URLtemplateMatcher = @"{\{ *([tT]emplate *: *)?(URL|Websites|URLWww|URLUrlw|URLUrl|URLUR|URLSite|URLWebsite|URLپیوند وب)\s*(\|[^\}]*)?\}\}";

    string OfficialWebsiteOrOfficialURLtemplateMatcher = @"\{\{ *([tT]emplate *: *)?([Oo]fficial[_ ]+URL|[Oo]fficial[_ ]+website|[Cc]onditionalURL|[Cc]onditional[_ ]+URL|[Gg]et[_ ]+URL[_ ]+from[_ ]+WikiData|[Oo]fficialURL|[Oo]fficial[_ ]+url|[Oo]fficialSite|[Oo]fficial|[Cc]ompany[_ ]+Website|[Oo]fficial[_ ]+site|[Oo]fficial[_ ]+Website|[Oo]ffficial[_ ]+website|[Oo]fficial[_ ]+web[_ ]+site|[Oo]fficial[_ ]+homepage|[Hh]omepage|[Hh]ome[_ ]+page|[Oo]fficialwebsite|[Mm]ain[_ ]+website|[Oo]fficialsite|[Oo]fficial[_ ]+webpage|[Oo]fficial[_ ]+Site|[Oo]web)\s*(\|[^\}]*)?\}\}";

    // A template parameter such as "|url=https://example.com" or "|website=https://example.com".
    string URLparameterMatcher = @"\|\s*(website|url)\s*=\s*https?:[^\|\}]*";

    // A bracketed external link that carries a label, e.g. "[https://example.com foo]". A bit crude.
    string nonBareURLMatcher = @"\[\s*https?://[^>< \|\[\]]+\s+[^\]]+\]";

    // Currently unused. The leading guard is a negative LOOKBEHIND "(?<!...)"; the earlier
    // spelling "(?!<...)" was a lookahead for a literal '<' and never excluded anything.
    string BareURLMatcher = @"((?<!\[ *)https?://[^>< \|\[\]]+|\[ *https?:[^>< \|\[\]]+\s*\])";

    // Any URL that starts at a word boundary. The guard must be a lookbehind "(?<!\w)" so that
    // text such as "xhttp://..." (which MediaWiki does not autolink) is not counted as a URL.
    // Is this enough?
    string anyURLMatcher = @"(?<!\w)https?://\w";

    // STEP 1. Check that the page contains the banner template {{Cleanup bare URLs}},
    // or one of its many aliases. If not, skip the page.
    int CleanupBareURLsTagCount = Regex.Matches(ArticleText, CleanupBareURLsTagMatcher, RegexOptions.Singleline).Count;
    if (CleanupBareURLsTagCount == 0)
    {
        if (debugging)
        {
            Skip = false;
            Summary = debuggingEditSummary;
            return MakeDebugMsg(1, false, "Page contains no {{tl|CleanupBareURLsTagMatcher}} tag.", false, ArticleText);
        }
        Skip = true;
        return ArticleText;
    }

    // So we have a {{Cleanup bare URLs}} tag.
    // Create a copy of the page without the tag. This is the text that would be saved if no
    // bare URLs remain (in this checker build it is only used by the debugging output).
    string nuArticleText = Regex.Replace(ArticleText, CleanupBareURLsTagMatcher, "", RegexOptions.Singleline);

    // STEP 2. Count the number of {{Bare URL inline}} tags in the page, including aliases.
    int bareURLinlineTagCount = Regex.Matches(ArticleText, bareURLinlineTagMatcher, RegexOptions.Singleline).Count;

    // STEP 3. Count the number of <ref> tags whose only content is a bare URL.
    int bareURLrefCount = Regex.Matches(ArticleText, bareURLinlineRefMatcher, RegexOptions.Singleline).Count;

    // STEP 4. If the total matches of step 2 + step 3 is greater than zero, then skip the page:
    // it still has inline tags and/or bare URL refs, so the banner must stay.
    if ((bareURLinlineTagCount + bareURLrefCount) > 0)
    {
        if (debugging)
        {
            Skip = false;
            Summary = debuggingEditSummary;
            return MakeDebugMsg(4, false, "Page still has some bare URL refs.\n* bareURLinlineTagCount=" + bareURLinlineTagCount + "\n* bareURLrefCount=" + bareURLrefCount, false, ArticleText);
        }
        Skip = true;
        return ArticleText;
    }

    // STEP 5. Check for bare URLs not in ref tags.
    //
    // Work on a copy of the article from which we remove URLs which are known to be OK,
    // then check whether any URLs remain.
    RegexOptions step5Options = RegexOptions.Singleline | RegexOptions.IgnoreCase;
    string testArticleText = ArticleText;

    // STEP 5.A: remove all ref tags.
    // We have already checked for any bare URLs inside ref tags, so we can just remove all
    // ref tags and their contents.
    testArticleText = Regex.Replace(testArticleText, completeRefTagMatcher, "", step5Options);

    // STEP 5.B: remove all {{cite}} templates.
    // Anything inside a {{cite}} template is good, so just remove the whole template.
    testArticleText = Regex.Replace(testArticleText, citeTemplateMatcher, "", step5Options);

    // STEP 5.C: remove any {{URL}} templates.
    testArticleText = Regex.Replace(testArticleText, URLtemplateMatcher, "", step5Options);

    // STEP 5.D: remove any {{Official URL}} or {{Official website}} templates.
    testArticleText = Regex.Replace(testArticleText, OfficialWebsiteOrOfficialURLtemplateMatcher, "", step5Options);

    // STEP 5.E: remove any URL which is a value of a template parameter "url=" or "website=",
    // e.g. "|website=https://example.com" or "|url=https://example.com".
    testArticleText = Regex.Replace(testArticleText, URLparameterMatcher, "", step5Options);

    // STEP 5.F: remove any non-bare URLs, e.g. "[https://example.com foo]".
    testArticleText = Regex.Replace(testArticleText, nonBareURLMatcher, "", step5Options);

    // STEP 6: does the page still contain any URLs?
    int URLsremainingAfterRemovingNonBareURlsCount = Regex.Matches(testArticleText, anyURLMatcher, step5Options).Count;
    if (URLsremainingAfterRemovingNonBareURlsCount == 0)
    {
        // SUCCESS! No bare URLs, so the tag could be removed.
        if (debugging)
        {
            Skip = false;
            Summary = debuggingEditSummary;
            return MakeDebugMsg(6, true, "Page contains no [[WP:Bare URLs]].", true, nuArticleText);
        }
        // Checker build: deliberately skip successful pages and leave the text untouched.
        Skip = true;
        Summary = botNV() + ": " + successEditSummary;
        return ArticleText;
    }

    // FAILURE
    // If we get here, then the page still contains URLs that none of the Step 5 filters removed.
    Skip = false;
    return "STEP 5 FAIL \n\n\n" + ArticleText;
}

/// <summary>
/// Builds a diagnostic page: a banner describing which step was reached and how it went,
/// followed by a separator line and then the supplied page text.
/// </summary>
/// <param name="stepNum">Number of the step that produced this message.</param>
/// <param name="testsOK">True to report "Success", false to report "Fail".</param>
/// <param name="debugMessage">Free-text notes to include in the banner.</param>
/// <param name="textChanged">Whether <paramref name="pageText"/> differs from the original article.</param>
/// <param name="pageText">The article text to place below the separator line.</param>
/// <returns>The banner followed by the page text.</returns>
public string MakeDebugMsg(int stepNum, bool testsOK, string debugMessage, bool textChanged, string pageText)
{
    string retval = "DEBUGGING " + botNV() + ". --- This edit should NOT have been saved. Please revert.\n";
    retval += "\nSTEP: " + stepNum;
    retval += "\nSTATUS: " + (testsOK ? "Success" : "Fail");
    retval += "\nNOTES: " + debugMessage;
    retval += "\n\nArticle text follows below the line. ";
    retval += textChanged ? "This text has been modified" : "This is the original text, unmodified\n";
    retval += "\n\n____________________________________";
    // The banner promises the article text below the line; previously pageText was dropped.
    retval += "\n" + pageText;
    return retval;
}