User:AnomieBOT/source/tasks/TaskRedirectChecker.pm

From Wikipedia, the free encyclopedia
package tasks::TaskRedirectChecker;

=pod

=begin metadata

Bot:     AnomieBOT
Task:    TaskRedirectChecker
BRFA:    N/A
Status:  Begun 2010-06-16
Created: 2010-06-16

Check the permanent redirects under [[Special:PrefixIndex/User:AnomieBOT/req/|User:AnomieBOT/req/]] to validate the anchor still exists in the target page. If the anchor can be found in an archive subpage, the redirect will be updated. Otherwise, the bot will ask for help on its talk page.

Note this doesn't handle {{tl|anchor}} or the like, just TOC headers.

=end metadata

=cut

use utf8;
use strict;

use AnomieBOT::Task;
use vars qw/@ISA/;
@ISA=qw/AnomieBOT::Task/;

use Data::Dumper;

sub new {
    my $class=shift;
    my $self=$class->SUPER::new;
    bless $self, $class;
    return $self;
}

=pod

=for info
Per [[WP:BOT#Approval]], any bot or automated editing process that only
affects only the operators' user and talk pages (or subpages thereof),
and which are not otherwise disruptive, may be run without prior
approval.

=cut

sub approved {
    return 999;
}

sub run {
    my ($self, $api)=@_;
    my $res;

    $api->task('TaskRedirectChecker', 0, 10, qw(d::Timestamp d::Redirects d::Talk));

    my $starttime=time();
    my $lastrun=$api->store->{'lastrun'} // 0;
    my $t=$lastrun+86400-$starttime;
    return $t if $t>0;

    my $re=$api->redirect_regex();
    my $base=$api->user.'/req/';
    my $iter=$api->iterator(generator=>'allpages',gapprefix=>$base,gapnamespace=>2,gapfilterredir=>'redirects',prop=>'info|revisions',rvprop=>'content',rvslots=>'main');
    my @whine=();
    while(my $page=$iter->next){
        if(!$page->{'_ok_'}){
            $api->warn("Could not retrieve page from iterator: ".$page->{'error'}."\n");
            return 60;
        }

        my $txt=$page->{'revisions'}[0]{'slots'}{'main'}{'*'};
        next unless $txt=~/$re\[\[([^]#]+)#([^]]+)\]\]/;
        my ($title,$anchor)=($1,$2);

        # Ask MediaWiki to canonicalize the title for us, because the actual
        # normalization can depend on various factors.
        $res=$api->query(titles=>$title);
        if($res->{'code'} ne 'success'){
            $api->warn("Failed to get canonical name for $title: ".$res->{'error'}."\n");
            return 60;
        }
        $title=$res->{'query'}{'normalized'}[0]{'to'} // $title;

        # Add a "dummy" section for the anchor we're actually looking for,
        # because the encoded anchors returned in "sections" varies based on
        # server settings. Note this doesn't support {{anchor}} or the like.
        $anchor =~ s/\{/{/g;
        $res=$api->query(action=>'parse',title=>$title,text=>"__TOC__\n== XXX $anchor ==\n\n{{:$title}}",prop=>'sections');
        if($res->{'code'} ne 'success'){
            $api->warn("Failed to retrieve section list for $title: ".$res->{'error'});
            return 60;
        }
        my @s=map $_->{'anchor'}, @{$res->{'parse'}{'sections'}};
        my $anchorenc=shift @s; $anchorenc=~s/^XXX_//; # Pull out the dummy
        next if grep($_ eq $anchorenc, @s);

        # No anchor found, let's try looking for archives linked from edit
        # summaries since the last run.
        my %q=(titles=>$title,prop=>'revisions',rvprop=>'comment',rvlimit=>'100',rvend=>$api->timestamp2ISO($lastrun-86400));
        my %did_archives=();
        my $newtitle=undef;
        do {
            $res=$api->query(%q);
            if($res->{'code'} ne 'success'){
                $api->warn("Failed to retrieve history for $title: ".$res->{'error'});
                return 60;
            }
            if(exists($res->{'query-continue'})){
                $q{'rvcontinue'}=$res->{'query-continue'}{'revisions'}{'rvcontinue'};
            } else {
                delete $q{'rvcontinue'};
            }

            # For each revision...
            for my $r (@{(values %{$res->{'query'}{'pages'}})[0]{'revisions'}}) {
                # Catch items with no comment
                next unless ref($r) eq 'HASH';

                # For each wikilink in the edit summary...
                for my $a ($r->{'comment'}=~/\[\[([^]|#]+)/g){
                    next unless $a=~m!^\Q$title/\E!i; # Only consider subpages of the target
                    next if exists($did_archives{$a});
                    $did_archives{$a}=1;

                    # Get section list for the linked page
                    my $res2=$api->query(action=>'parse',text=>"__TOC__\n{{:$a}}",prop=>'sections');
                    if($res2->{'code'} ne 'success'){
                        $api->warn("Failed to retrieve section list for $a: ".$res->{'error'});
                        return 60;
                    }
                    my @s=map $_->{'anchor'}, @{$res2->{'parse'}{'sections'}};
                    next unless grep($_ eq $anchorenc, @s);
                    # Found one! Save the name and exit the loop
                    $newtitle=$a;
                    last;
                }
                last if defined($newtitle);
            }
        } while(!defined($newtitle) && exists($q{'rvcontinue'}));

        if(defined($newtitle)){
            $txt=~s/($re)\[\[[^]#]+/$1\[[$newtitle/;
            my $tok=$api->edittoken($page->{'title'}, EditRedir=>1);
            if($tok->{'code'} eq 'shutoff'){
                $api->warn("Task disabled: ".$tok->{'content'}."\n");
                return 300;
            }
            if($tok->{'code'} ne 'success'){
                $api->warn("Failed to retrieve edit token for $page: ".$tok->{'error'});
                return 60;
            }
            if(exists($tok->{'missing'})){
                $api->warn("Page $page->{title} does not exist, WTF?");
                return 60;
            }
            if($tok->{'lastrevid'} ne $page->{'lastrevid'}){
                $api->warn("$page->{title} edited since loaded");
                return 60;
            }
            $res=$api->edit($tok, $txt, "Updating redirect to archived discussion",0,1);
            if($res->{'code'} ne 'success'){
                $api->warn("Write for $page->{title} failed: ".$res->{'error'});
                return 60;
            }
            $api->log("Updated redirect for $page->{title} from $title to $newtitle");
        } else {
            push @whine, $page->{'title'};
        }
    }

    if(@whine){
        $api->log("Bot needs help on pages: ".join(' ', @whine));
        my $res=$api->whine("Broken task redirects", "When linking to discussions from an edit summary, we use redirects under [[Special:PrefixIndex/User:$base|User:$base]] so the links don't die when the discussion is archived. The following redirect pages seem to have become invalid, and I can't find an archive subpage to automatically update them with. Please fix them manually. Thanks.\n* [[:".join("]]\n* [[:", @whine)."]]\n", NoSmallPrint=>1);
        if($res->{'code'} ne 'success'){
            $api->warn("Failed to post to my talkpage: ".$res->{'error'});
            return 60;
        }
    }

    # We processed all pages, calculate the number of seconds until the next
    # time we're needed.
    $api->store->{'lastrun'}=$starttime;
    return $starttime+86400-time();
}

1;