# User:AnomieBOT/source/tasks/ReplaceExternalLinks.pm
#
# From Wikipedia, the free encyclopedia
package tasks::ReplaceExternalLinks;

=pod

=for warning
Due to breaking changes in AnomieBOT::API, this task will probably not run
anymore. If you really must run it, try getting a version from before
2009-03-23.

=begin metadata

Bot:     AnomieBOT
Task:    ReplaceExternalLinks
BRFA:    Wikipedia:Bots/Requests for approval/AnomieBOT 9
Status:  Completed 2008-11-12
Created: 2008-11-08

Replace links to the domains w*.allmusic.com with just "allmusic.com", as those other domains no longer function.

=end metadata

=cut

use utf8;
use strict;

use AnomieBOT::Task;
use vars qw/@ISA/;
@ISA=qw/AnomieBOT::Task/;

# Constructor. Builds the object through the parent class's new() (called
# with no arguments) and re-blesses it into the class new() was invoked on.
# Any extra arguments passed to new() are ignored.
sub new {
    my ($class)=@_;
    my $task=$class->SUPER::new();
    return bless($task, $class);
}

=pod

=for info
Approved 2008-11-11, completed 2008-11-12<br />[[Wikipedia:Bots/Requests for approval/AnomieBOT 9]]

=cut

# Approval indicator for this task; always -1.
# NOTE(review): presumably -1 marks the task as no longer to be run (the POD
# above records it as completed 2008-11-12) -- confirm against
# AnomieBOT::Task.
sub approved {
    my $status=-1;
    return $status;
}

# Run one pass of the task: for every known-dead allmusic.com host, list the
# pages that link to it (generator=exturlusage), rewrite the links in each
# page's text to plain "http://allmusic.com", and save the page when the text
# changed.
#
# Parameters:
#   $self - this task object; warn(), strip_nowiki() and replace_nowiki() are
#           called on it.
#   $api  - the API handle; task(), read_throttle(), edit_throttle(), query(),
#           fetch(), store(), edittoken() and edit() are called on it.
#
# Returns a number which, judging by the comments in this sub, is the delay
# in seconds before the task should be run again:
#   0   - the 5 minute budget was used up after handling a page
#   60  - the link-usage query failed
#   300 - the edit token request reported 'shutoff'
#   600 - every query was walked to the end
sub run {
    my ($self, $api)=@_;

    $api->task('ReplaceExternalLinks');
    $api->read_throttle(0);
    $api->edit_throttle(10);

    # Spend a max of 5 minutes on this task before restarting
    my $endtime=time()+300;

    # Replacements
    my @euqueries=(
        'wc)(.allmusic.com',
        'wc01.allmusic.com',
        'wc02.allmusic.com',
        'wc03.allmusic.com',
        'wc04.allmusic.com',
        'wc05.allmusic.com',
        'wc06.allmusic.com',
        'wc07.allmusic.com',
        'wc08.allmusic.com',
        'wc09.allmusic.com',
        'wc10.allmusic.com',
        'wm01.allmusic.com',
        'wm02.allmusic.com',
        'wm03.allmusic.com',
        'wm04.allmusic.com',
        'wm05.allmusic.com',
        'wm06.allmusic.com',
        'wm07.allmusic.com',
        'wm08.allmusic.com',
        'wm09.allmusic.com',
        'wm10.allmusic.com',
        'wm11.allmusic.com',
    );
    # Each rule is [ pattern, replacement, edit-summary fragment ]. The
    # lookahead makes sure the match ends at the end of the host name.
    my @repl=(
        [ qr{\bhttp://(?:w[cm][0-9][0-9]|wc\)\()\.allmusic\.com(?=[][/<>"\x00-\x20\x7F]|$)}i, 'http://allmusic.com', 'updating broken allmusic.com links' ],
        [ qr{\bhttp://www\.allmusic\.com(?=[][/<>"\x00-\x20\x7F]|$)}i, 'http://allmusic.com', 'changing www.allmusic.com to allmusic.com' ],
    );
    my $req=" per [[WP:BOTREQ#Allmusic links|request]]";

    my %q=(
        generator => 'exturlusage',
        geulimit  => 'max',
        prop      => 'info',
    );
    foreach my $euquery (@euqueries){
        $q{'geuquery'}=$euquery;
        delete $q{'geuoffset'};

        # Get the list of pages to check, one batch per iteration
        do {
            my $res=$api->query(%q);
            if($res->{'code'} ne 'success'){
                $self->warn("Failed to retrieve usage list for $euquery: ".$res->{'error'}."\n");
                return 60;
            }

            # Only continue when the response really carries an offset for
            # this generator. A 'query-continue' without one (or with an
            # undefined one) must not leave an undef 'geuoffset' behind,
            # otherwise the same batch would be requested forever.
            my $cont=$res->{'query-continue'};
            my $next_offset=$cont ? $cont->{'exturlusage'}{'geuoffset'} : undef;
            if(defined($next_offset)){
                $q{'geuoffset'}=$next_offset;
            } else {
                delete $q{'geuoffset'};
            }

            # A named loop variable is used instead of $_ so that none of the
            # method calls below can clobber the current page record.
            foreach my $page (values %{$res->{'query'}{'pages'}}){
                my $pageid=$page->{'pageid'};
                my $revid=$page->{'lastrevid'};

                # Skip pages whose stored "checked" revision is at least as
                # new as the current one.
                my $checked=$api->fetch($pageid);
                next if(defined($checked) && $$checked>=$revid);

                my $title=$page->{'title'};

                $self->warn("Checking external links in $title\n");

                # Ok, check the page
                my $tok=$api->edittoken($title, EditRedir => 1);
                if($tok->{'code'} eq 'shutoff'){
                    $self->warn("Task disabled: ".$tok->{'content'}."\n");
                    return 300;
                }
                if($tok->{'code'} ne 'success'){
                    $self->warn("Failed to get edit token for $title: ".$tok->{'error'}."\n");
                    next;
                }
                next if exists($tok->{'missing'});
                $revid=$tok->{'lastrevid'};

                # Get page text
                my $intxt=$tok->{'revisions'}[0]{'*'};

                # Perform the replacements. The text is passed through
                # strip_nowiki() first and replace_nowiki() afterwards
                # (presumably so links inside <nowiki> are left alone --
                # confirm against AnomieBOT::Task).
                my ($outtxt,$nowiki)=$self->strip_nowiki($intxt);
                my @s=();
                foreach my $rule (@repl){
                    my ($re, $replacement, $s)=@$rule;
                    my $old=$outtxt;
                    $outtxt=~s/$re/$replacement/g;
                    # Remember the summary fragment of every rule that
                    # actually changed something.
                    push @s, $s if $outtxt ne $old;
                }
                $outtxt=$self->replace_nowiki($outtxt, $nowiki);

                # Need to edit?
                if($outtxt ne $intxt){
                    if(!@s){
                        $self->warn("No summary for $title even though changes were made, WTF?\n");
                        next;
                    }
                    # Join the fragments as "a and b" or "a, b, and c".
                    $s[-1]='and '.$s[-1] if @s>1;
                    my $summary=ucfirst(join((@s>2)?', ':' ', @s)).$req;
                    $self->warn("$summary in $title\n");
                    # NOTE(review): the two trailing 1s are flags whose
                    # meaning is defined by AnomieBOT::API::edit -- confirm.
                    my $r=$api->edit($tok, $outtxt, $summary, 1, 1);
                    if($r->{'code'} ne 'success'){
                        $self->warn("Write failed on $title: ".$r->{'error'}."\n");
                        next;
                    }
                    $revid=$r->{'edit'}{'newrevid'};
                } else {
                    $self->warn("Nothing to do in $title\n");
                }

                # Save checked revision
                $api->store($pageid, \$revid);

                # If we've been at it long enough, let another task have a
                # go. This is deliberately checked only after a page has been
                # fully handled and recorded.
                return 0 if time()>=$endtime;
            }
        } while(exists($q{'geuoffset'}));
    }

    # No more pages to check, try again in 10 minutes or so in case of errors.
    return 600;
}

1;