User:AnomieBOT/source/tasks/ReplaceExternalLinks2.pm

From Wikipedia, the free encyclopedia
package tasks::ReplaceExternalLinks2;

=pod

=for warning
Due to breaking changes in AnomieBOT::API, this task will probably not run
anymore. If you really must run it, try getting a version from before
2018-08-12.

=begin metadata

Bot:     AnomieBOT
Task:    ReplaceExternalLinks2
BRFA:    Wikipedia:Bots/Requests for approval/AnomieBOT 44
Status:  Completed 2012-04-04
Created: 2010-09-20

Process pages with geocities.com or oocities.com links to revert oocities.com
spam, add archiveurl for geocities cites on archive.org or webcitation.org,
change archived geocities links to archive.org or webcitation.org, and tag
unarchived geocities links with {{tl|dead link}}.

=end metadata

=cut

use utf8;
use strict;

use Data::Dumper;
use POSIX;
use Date::Parse;
use LWP::UserAgent;
use XML::LibXML;
use HTML::Entities ();
use URI;
use AnomieBOT::Task qw/:time/;
# Inherit from the AnomieBOT task base class.
use vars qw/@ISA/;
@ISA=qw/AnomieBOT::Task/;

# Marker to indicate where {{dead links}} should be removed.
# The control characters \x02/\x03 cannot legitimately occur in
# wikitext, so this token can be spliced into page text temporarily and
# reliably stripped again before saving.
my $rmdl="\x02*\x03";

# Construct the task object: set up the lazily-created page iterator
# slot, the HTTP client used to probe archive.org / webcitation.org,
# and a lenient XML parser for WebCite's query API responses.
sub new {
    my $class=shift;

    # Bless immediately rather than at the end; bless() returns the ref.
    my $self=bless $class->SUPER::new(), $class;

    # The exturlusage iterator is created on demand in run().
    $self->{'iter'}=undef;

    # Keep-alive UA for archive lookups.
    $self->{'ua'}=LWP::UserAgent->new(
        agent=>"AnomieBOT link checker for en.wikipedia.org (https://en.wikipedia.org/wiki/Wikipedia:Bots/Requests_for_approval/AnomieBOT_44)",
        keep_alive=>300,
    );

    # Unfortunately, webcite seems to like quoting back the url without
    # encoding ampersands in certain error messages, so parse in
    # recovery mode instead of failing on malformed XML.
    $self->{'xml'}=XML::LibXML->new(recover=>1);

    return $self;
}

=pod

=for info
Approved 2010-10-07.<br />[[Wikipedia:Bots/Requests for approval/AnomieBOT 44]]

=cut

# Report the task's approval status to the framework.
# NOTE(review): -1 presumably means "approved but no longer running"
# (task completed 2012-04-04 per the metadata) — confirm against
# AnomieBOT::Task's approved() convention.
sub approved {
    my $status=-1;
    return $status;
}

# Task entry point.  Walks mainspace pages containing geocities.com or
# oocities.com external links and, per page:
#   1. reverts oocities.com spam back to geocities.com,
#   2. adds archiveurl/archivedate to geocities citation templates for
#      which an archive.org/webcitation.org copy exists,
#   3. rewrites archived bracketed/bare geocities links to the archive,
#   4. tags unarchived geocities links with {{dead link}}.
# Returns the number of seconds to wait before re-invoking the task:
# 0 to yield to other tasks, 60 on a transient failure, 300 when the
# shutoff page triggered, 600 once the page list has been exhausted.
sub run {
    my ($self, $api)=@_;
    my $res;

    $api->task('ReplaceExternalLinks2', 0, 10, qw/d::Redirects d::Templates d::Nowiki/);

    # Edit-summary suffix pointing editors at the task's shutoff page.
    my $screwup='Errors? [[User:'.$api->user.'/shutoff/ReplaceExternalLinks2]]';

    # Spend a max of 5 minutes on this task before restarting
    my $endtime=time()+300;

    # Get list of citation templates, with redirects resolved so any
    # alias of these names is recognized by the template callbacks below.
    my %templates=$api->redirects_to_resolved(
        'Template:Citation',
        'Template:Citation metadata',
        'Template:Cite api',
        'Template:Cite book',
        'Template:Cite conference',
        'Template:Cite IETF',
        'Template:Cite interview',
        'Template:Cite journal',
        'Template:Cite mailing list',
        'Template:Cite news',
        'Template:Cite press release',
        'Template:Cite video',
        'Template:Cite web',
        'Template:Unicite',
        'Template:Vancite conference',
        'Template:Vancite journal',
        'Template:Vancite news',
        'Template:Vancite web',
        'Template:Vcite conference',
        'Template:Vcite journal',
        'Template:Vcite news',
        'Template:Vcite web',
    );
    # An empty-string key signals a lookup failure.
    if(exists($templates{''})){
        $api->warn("Failed to get citation template redirects: ".$templates{''}{'error'}."\n");
        return 60;
    }

    # Get regex for finding {{dead link}}
    my %dl=$api->redirects_to_resolved(
        'Template:Dead link',
    );
    if(exists($dl{''})){
        $api->warn("Failed to get dead link template redirects: ".$dl{''}{'error'}."\n");
        return 60;
    }
    # Build a regex matching a transclusion of {{dead link}} or any of
    # its redirects: quotemeta each name, strip the "Template:" prefix
    # while making the first letter case-insensitive, and let spaces
    # match either spaces or underscores.
    my $dlre='{{(?i:\s*Template\s*:)?\s*(?:'.join('|',map { $_="\Q$_\E"; s/^Template\\:(.)/(?i:$1)/; s/\\ /[ _]/g; $_; } keys %dl).')(?>\s*(?s:\|.*?)?}})';
    $dlre=qr/$dlre/;

    # Create (or resume, across restarts) the iteration over mainspace
    # pages that use geocities/oocities external links.
    if(!defined($self->{'iter'})){
        $self->{'iter'}=$api->iterator(
            list        => 'exturlusage',
            euprop      => 'title',
            euquery     => ['*.oocities.com','*.geocities.com'],
            eunamespace => '0',
            eulimit     => '1000', # exturlusage has issues with big lists
        );
    }
    while(my $pg=$self->{'iter'}->next){
        if(!$pg->{'_ok_'}){
            $api->warn("Failed to retrieve page list for ".$self->{'iter'}->iterval.": ".$pg->{'error'}."\n");
            return 60;
        }

        return 0 if $api->halting;
        my $page=$pg->{'title'};
        # EditRedir: process redirect pages too rather than following them.
        my $tok=$api->edittoken($page, EditRedir => 1);
        if($tok->{'code'} eq 'shutoff'){
            $api->warn("Task disabled: ".$tok->{'content'}."\n");
            return 300;
        }
        if($tok->{'code'} ne 'success'){
            $api->warn("Failed to get edit token for $page: ".$tok->{'error'}."\n");
            next;
        }
        if(exists($tok->{'missing'})){
            $api->warn("WTF? $page does not exist?\n");
            next;
        }

        # Setup flags:
        #   oo   - oocities.com spam was reverted
        #   cite - archiveurl added to a citation template
        #   link - an archived raw link was rewritten
        #   404  - a dead link was tagged
        #   fail - a remote archive lookup failed; abort and retry later
        $self->{'flags'}={oo=>0,cite=>0,link=>0,404=>0,fail=>0};

        my $intxt=$tok->{'revisions'}[0]{'*'};
        my $outtxt=$intxt;

        # Despam
        # NOTE(review): the dots in "oocities.com" are unescaped here, so
        # this also matches e.g. "oocitiesXcom" — harmless in practice,
        # but \Q...\E would be tighter.
        $outtxt=~s{(?<=[./])oocities.com}{geocities.com}g;
        $self->{'flags'}{'oo'}=1 if $intxt ne $outtxt;

        # Replace the links. First, do citation templates.
        my $nowiki;
        $outtxt=$api->process_templates($outtxt, sub {
            # Callback args: (name, params, wikitext, data, oname).
            # Returning undef leaves the template unchanged.
            return undef if $self->{'flags'}{'fail'};
            my $name=shift;
            my $params=shift;
            my $wikitext=shift;
            my $data=shift;
            my $oname=shift;

            return undef unless exists($templates{"Template:$name"});

            # Rebuild the template verbatim while collecting the fields
            # needed to pick an archive snapshot date.
            my $ret="{{$oname";
            my $archived=0;
            my $url='';
            my ($accessdate,$date,$year,$month);
            $year=$month='XXX';
            foreach ($api->process_paramlist(@$params)){
                $_->{'name'}=~s/^\s+|\s+$//g;
                $_->{'value'}=~s/^\s+|\s+$//g;
                if($_->{'name'} eq 'url'){
                    $url=$_->{'value'};
                } elsif($_->{'name'} eq 'accessdate'){
                    $accessdate=str2time($_->{'value'});
                } elsif($_->{'name'} eq 'date'){
                    $date=str2time($_->{'value'});
                } elsif($_->{'name'} eq 'year' && $_->{'value'}=~/^\d+$/){
                    $year=$_->{'value'};
                } elsif($_->{'name'} eq 'month'){
                    $month=$_->{'value'};
                } elsif($_->{'name'} eq 'archiveurl'){
                    $archived=1;
                }
                $ret.='|'.$_->{'text'};
            }
            my $r404='';
            if(!$archived && $url=~m!^http://(?:[\w\d-]+\.)*geocities\.com!){
                my ($u,$dt);
                # Best guess at when the link was accessed, falling back
                # through accessdate, date, month/year, year, then now.
                $dt=$accessdate // $date // str2time("1 $month $year") // str2time("1 June $year") // time();
                # $u/$dt become the archive URL/date; $r404 is '' on
                # success or a {{dead link}} tag when no archive exists.
                ($u,$dt,$r404)=chkExtLink($self,$api,0,$url, $dt);
                return undef if($self->{'flags'}{'fail'});
                $ret.="|archiveurl=$u|archivedate=$dt" unless $r404;
                if(!$r404){
                    # Archive found: drop any removal markers already in
                    # the rebuilt text, then emit one after the template
                    # so a trailing {{dead link}} is stripped later.
                    $ret=~s/$rmdl//g;
                    $r404=$rmdl;
                }
            }
            $ret.="}}".$r404;
            return $ret;
        });
        return 60 if($self->{'flags'}{'fail'});

        # Next, strip for raw link processing
        # Regular expressions are adapted from those MediaWiki uses to
        # recognize external links.
        ($outtxt,$nowiki)=$api->strip_nowiki($outtxt);
        ($outtxt,$nowiki)=$api->strip_templates($outtxt, sub {
            my $name=shift;
            return exists($templates{"Template:$name"});
        }, {}, $nowiki);

        # Strip out ref tags, then replace any links with a guess at access
        # date.
        ($outtxt,$nowiki)=$api->strip_regex(qr!<ref[ >].*?</ref>!, $outtxt, $nowiki);
        # Patterns indicating an archive link already accompanies a link.
        my @arc=qw/[aA]rchive webcitation\.org [wW]ayback/;
        my $arc='(?:'.join('|',@arc).')';
        while(my ($k,$v)=each %$nowiki){
            # Only process stripped <ref> bodies with no archive mention.
            next unless $v=~/^<ref/;
            next if $v=~/$arc/;
            my ($dt,$nw);

            # We have to re-strip here, because the saved values here are
            # automatically unstripped.
            ($v,$nw)=$api->strip_nowiki($v);
            ($v,$nw)=$api->strip_templates($v, sub {
                my $name=shift;
                return exists($templates{"Template:$name"});
            }, {}, $nw);

            # Guess the access date from "accessed/retrieved <date>" text.
            $dt=str2time($1) if $v=~/(?:accessed|retrieved)(?: +on)? +(\d{4}-\d{2}-\d{2}|\d+ \w+,? \d{4}|\w+ \d+,? \d{4})/i;

            # Bracketed geocities links first (fmt 1)...
            $v=~s{\[(http://(?:[\w\d-]+\.)*geocities\.com(?:[/:][^][<>\x22\x00-\x20\x7F]+)?)( *[^\]\x00-\x08\x0a-\x1F]*?)\]}{ chkExtLink($self,$api,1,$1,$dt // time(),$2) }ge;
            return 60 if($self->{'flags'}{'fail'});
            # ...then hide remaining bracketed links so the bare-link
            # regex below cannot match inside them, then bare links (fmt 2).
            ($v,$nw)=$api->strip_regex(qr{\[http://[^][<>\x22\x00-\x20\x7F]+ *[^\]\x00-\x08\x0a-\x1F]*?\]}, $v, $nw);
            $v=~s{\b(http://[^][<>\x22\x00-\x20\x7F]+)}{ chkExtLink($self,$api,2,$1,$dt // time()) }ge;
            return 60 if($self->{'flags'}{'fail'});
            $v=$api->replace_stripped($v,$nw);
            $nowiki->{$k}=$v;
        }

        # Fix any bracketed external link that doesn't have "Archive" or the
        # like in the line after it.
        $outtxt=~s{\[(http://(?:[\w\d-]+\.)*geocities\.com(?:[/:][^][<>\x22\x00-\x20\x7F]+)?)( *[^\]\x00-\x08\x0a-\x1F]*?)\](?!.*$arc)}{ chkExtLink($self,$api,1,$1,time(),$2) }ge;
        return 60 if($self->{'flags'}{'fail'});

        # Hide all bracketed external links. We have to keep track of the
        # replacement token for the ones that have "Archive" etc in their
        # display text.
        ($outtxt,$nowiki)=$api->strip_regex(qr{\[http://[^][<>\x22\x00-\x20\x7F]+ *[^\]\x00-\x08\x0a-\x1F]*?\]}, $outtxt, $nowiki);
        while(my ($k,$v)=each %$nowiki){
            # Add strip tokens for archive-mentioning links so the
            # negative lookahead below still "sees" them.
            push @arc, $k if $v=~m!^\[http://[^][<>\x22\x00-\x20\x7F]+ *.*$arc!;
        }
        $arc='(?:'.join('|',@arc).')';

        # Fix any bare external link that doesn't have "Archive" or the like in
        # the line after it.
        $outtxt=~s{\b(http://[^][<>\x22\x00-\x20\x7F]+)(?!.*$arc)}{ chkExtLink($self,$api,2,$1,time()) }ge;
        return 60 if($self->{'flags'}{'fail'});

        # Unstrip
        $outtxt=$api->replace_stripped($outtxt,$nowiki);

        # rm marked {{dead link}} templates (and $rmdl markers)
        $outtxt=~s/\Q$rmdl\E(?:\s*$dlre)*//g;

        # rm duplicate {{dead link}} templates too
        $outtxt=~s/$dlre+($dlre)/$1/g;

        if($outtxt ne $intxt){
            # Assemble a human-readable edit summary from the flags set
            # during processing.
            my @summary=();
            push @summary, 'reverting oocities.com spam' if $self->{'flags'}{'oo'};
            push @summary, 'adding archiveurl for archived geocities cites' if $self->{'flags'}{'cite'};
            push @summary, 'changing archived geocities links' if $self->{'flags'}{'link'};
            push @summary, 'tagging dead geocities links' if $self->{'flags'}{'404'};
            unless(@summary){
                # Text changed but no flag fired: something unexpected
                # happened, so refuse to save.
                $api->warn("Changes made with no summary for $page, not editing");
                next;
            }
            # "x, y, and z" for 3+ items, "x and y" for 2, "x" for 1.
            $summary[$#summary]='and '.$summary[$#summary] if @summary>1;
            my $summary=ucfirst(join((@summary>2)?', ':' ', @summary));
            $api->log("$summary in $page");
            my $r=$api->edit($tok, $outtxt, "$summary. $screwup", 1, 1);
            if($r->{'code'} ne 'success'){
                $api->warn("Write failed on $page: ".$r->{'error'}."\n");
                next;
            }
        }

        # If we've been at it long enough, let another task have a go.
        return 0 if time()>=$endtime;
    }

    # Iterator exhausted: reset it and come back in 10 minutes in case
    # new qualifying pages appear.
    $api->log("May be DONE!");
    $self->{'iter'}=undef;
    return 600;
}

# Look up an archived copy of a geocities URL and produce the rewritten
# link text (or citation fields).
#
# Arguments: ($self, $api, $fmt, $url, $date, [$txt])
#   $fmt  - 0: citation template (list return), 1: bracketed external
#           link ($txt = display text), 2: bare external link.
#   $url  - the URL to check (fmt 2 may pass a non-geocities URL, which
#           is returned unchanged after MediaWiki-style trimming).
#   $date - unix timestamp; the archive snapshot closest to this date
#           is chosen.
#
# Archive lookups hit archive.org (screen-scrape) and webcitation.org
# (XML query API); results are cached in $api->store keyed by URL.  On
# a retryable remote failure the 'fail' flag is set and the sub calls
# itself with only $self so the early-out branch builds the failure
# return value in the correct (list/scalar) context.
#
# Returns for fmt 0 the list ($archiveurl, $archivedate, $deadlink_tag);
# for fmt 1/2 the replacement wikitext, with $rmdl markers appended so
# run() can strip now-redundant {{dead link}} tags.
sub chkExtLink {
    my $self=shift;
    # Early-out once any lookup has failed this round; the caller checks
    # the 'fail' flag and discards these placeholder values.
    if($self->{'flags'}{'fail'}){
        return wantarray?('fail','fail','fail'):'fail';
    }

    my $api=shift;
    my $fmt=shift;
    my $url=shift;
    my $date=shift;
    my $txt='';

    if($fmt==2){
        # Duplicate Mediawiki post-processing of bare external links:
        # split trailing markup/punctuation off the URL into $txt so it
        # is re-emitted outside the link.
        $txt=$1.$txt if $url=~s/((?:[<>]|&[lg]t;).*$)//;
        my $sep=',;\.:!?';
        # A closing paren only ends the URL if it has no opening paren.
        $sep.=')' unless $url=~/\(/;
        $txt=$1.$txt if $url=~s/([$sep]+$)//;

        # There shouldn't be a template inside the url
        $txt=$1.$txt if $url=~s/(\{\{.*$)//;

        # Non-geocities bare links pass through unmodified.
        return $url.$txt unless $url=~m!^http://(?:[\w\d-]+\.)*geocities\.com[/:]!;
    }

    # Get archive link and date
    my @archives;
    my ($u, $dt);
    if(exists($api->store->{$url})){
        # Cached result from a previous check of this URL.
        @archives=@{$api->store->{$url}};
    } else {
        # Wayback wildcard query; the s!!! collapses the doubled scheme
        # ("/web/*/http://host/..." -> "/web/*/host/...").
        ($u="http://web.archive.org/web/*/$url")=~s!/http://!/!;
        $api->log("... Checking $u");

        # Screen-scrape archive.org
        my $r=$self->{'ua'}->get($u);
        if($r->is_success){
            foreach $_ ($r->decoded_content=~m!href="(http://web.archive.org/web/\d+/[^\x22]*)"!g) {
                $_ = HTML::Entities::decode($_);
                $api->log("... Got $_");

                # Snapshot timestamp is embedded in the URL path as
                # /web/YYYYMMDDHHMMSS/.
                if(m!^http://web.archive.org/web/(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})!){
                    # timegm comes from AnomieBOT::Task qw/:time/ —
                    # presumably a Time::Local-style UTC converter.
                    $dt=timegm($6,$5,$4,$3,$2-1,$1-1900);
                } else {
                    $dt=time();
                }
                push @archives, [$dt, $_];
            }
        } elsif($r->code eq '404'){
            # No snapshots; not an error, just no archive.
            $api->log("... Failed with ".$r->code);
        } elsif($r->code eq '403' && $r->decoded_content=~m!<p class="mainTitle">Blocked Site Error.<br><br>\s*</p>\s*<p class="mainBigBody"><i>\Q$url\E</i> is not available in the Wayback Machine!){
            # Site blocked from the Wayback Machine (e.g. by robots.txt);
            # treat like "no archive" rather than a retryable failure.
            $api->log("... Failed with 403 'not available in the Wayback Machine'");
        } else {
            $api->log("... Failed with ".$r->code.", will retry later");
            $self->{'flags'}{'fail'}=1;
            return chkExtLink($self);
        }

        # check webcite too
        $u=URI->new('http://www.webcitation.org/query');
        $u->query_form(url=>$url,returnxml=>1);
        $u=$u->as_string;
        $api->log("... Checking $u");
        $r=$self->{'ua'}->get($u);
        if($r->is_success){
            my $xml=$self->{'xml'}->load_xml(string=>$r->decoded_content);
            if($xml){
                foreach $_ (@{$xml->find('//result[@status=\'success\']')}){
                    $dt=$_->find('./timestamp');
                    my $uu=URI->new('http://www.webcitation.org/query');
                    $uu->query_form(url=>$url,date=>$dt);
                    $uu=$uu->as_string;
                    # Not exactly RFC-compliant, but it works fine
                    $uu=~s/%3A/:/g; $uu=~s/%2F/\//g;
                    $api->log("... Got $uu");
                    push @archives, [str2time($dt) // time(), $uu];
                }
            } else {
                $api->log("... Invalid XMl data");
                $self->{'flags'}{'fail'}=1;
                return chkExtLink($self);
            }
        } elsif($r->code eq '404'){
            $api->log("... Failed with ".$r->code);
        } else {
            $api->log("... Failed with ".$r->code.", will retry later");
            $self->{'flags'}{'fail'}=1;
            return chkExtLink($self);
        }

        # Cache the (possibly empty) snapshot list for this URL.
        $api->store->{$url}=\@archives;
    }

    # Then pull the closest archive to the accessdate or whatever.
    # $r404 starts as a {{dead link}} tag and is cleared as soon as any
    # snapshot is found; $diff tracks the best |snapshot - $date| so far.
    my ($diff,$r404)=(1e100,'{{dead link|date='.strftime('%B %Y', gmtime).'|bot='.$api->user.'}}');
    $u=undef;
    foreach $_ (@archives){
        if(abs($_->[0] - $date) < $diff){
            $diff=abs($_->[0] - $date);
            ($dt,$u)=@$_;
            $r404='';
        }
    }

    # Record which kind of change was made for the edit summary.
    if($r404){
        $self->{'flags'}{'404'}=1;
    } elsif($fmt==0){
        $self->{'flags'}{'cite'}=1;
    } else {
        $self->{'flags'}{'link'}=1;
    }

    if($fmt==0){ # cite template
        return ($u,strftime('%Y-%m-%d',gmtime($dt // 0)),$r404);
    } elsif($fmt==1){ # Bracketed external link
        # Display text is only shifted here, after the common processing.
        my $txt=shift;
        return $r404?"[$url$txt]$r404":"[$u$txt]$rmdl";
    } elsif($fmt==2){ # Bare external link
        return ($r404?"[$url $url]$r404":"$u$rmdl").$txt.$rmdl;
    } else {
        return undef;
    }
}

1;