User:AnomieBOT/source/tasks/ReplaceExternalLinks5.pm

From Wikipedia, the free encyclopedia
package tasks::ReplaceExternalLinks5;

=pod

=begin metadata

Bot:      AnomieBOT
Task:     ReplaceExternalLinks5
BRFA:     Wikipedia:Bots/Requests for approval/AnomieBOT 60
Status:   Approved 2011-12-04
Created:  2011-11-30
OnDemand: true

Add archiveurl for dead or dying links, when an archive can be found at
archive.org or webcitation.org, and optionally tag unarchived links with
{{tl|dead link}} or a similar template.

=end metadata

=cut

use utf8;
use strict;

use Data::Dumper;
use POSIX;
use Date::Parse;
use LWP::UserAgent;
use XML::LibXML;
use HTML::Entities ();
use URI;
use AnomieBOT::Task qw/:time/;
use vars qw/@ISA/;
@ISA=qw/AnomieBOT::Task/;

my $req='[[User:AnomieBOT/req/Gamepro links|request]]';

# Useful character sets
my $chars='[^][<>"\x00-\x20\x7F\p{Zs}]';
my $dchars='[^][<>"\x00-\x20\x7F\p{Zs}/.]';
my $portre=qr!(?::\d+)?!;

# Template for marking dead links. Set undef for no tagging.
my $deadlink=undef; #'dead link';

# euquery values to search for
my @euquery=('*.gamepro.com');

# Regular expression matching links to replace. No protocol.
my $linkre=qr!(?:$dchars+\.)*(?i:gamepro\.com)$portre\/!;

# Description of links
my $desc='Gamepro';

###########################
# Marker to indicate where {{dead links}} should be removed
my $rmdl="\x02*\x03";

# Placeholder for when 
my $nodl="\x02x\x03";

# The text part of a bracketed link
my $btext=qr/ *[^\]\x00-\x08\x0a-\x1F]*?/;

# Protocol re
my $proto1=qr!(?:https?:)?//!;
my $proto2=qr!https?://!;

sub new {
    my $class=shift;
    my $self=$class->SUPER::new();
    $self->{'iter'}=undef;
    $self->{'ua'}=LWP::UserAgent->new(
        agent=>"AnomieBOT link checker for en.wikipedia.org (https://en.wikipedia.org/wiki/Wikipedia:Bots/Requests_for_approval/AnomieBOT_60)",
        keep_alive=>300,
    );
    # Unfortunately, webcite seems to like quoting back the url without
    # encoding ampersands in certain error messages.
    $self->{'xml'}=XML::LibXML->new(recover=>1);
    $self->{'protocols'}=[];
    bless $self, $class;
    return $self;
}

=pod

=for info
Approved 2011-12-04.<br />[[Wikipedia:Bots/Requests for approval/AnomieBOT 60]]

=cut

sub approved {
    return -1;
}

sub run {
    my ($self, $api)=@_;
    my $res;

    $api->task('ReplaceExternalLinks5', 0, 10, qw/d::Redirects d::Templates d::Nowiki/);

    my $screwup='Errors? [[User:'.$api->user.'/shutoff/ReplaceExternalLinks5]]';

    # Spend a max of 5 minutes on this task before restarting
    my $endtime=time()+300;

    # Get list of citation templates
    my %templates=$api->redirects_to_resolved(
        'Template:Citation',
        'Template:Citation metadata',
        'Template:Cite api',
        'Template:Cite book',
        'Template:Cite conference',
        'Template:Cite IETF',
        'Template:Cite interview',
        'Template:Cite journal',
        'Template:Cite mailing list',
        'Template:Cite news',
        'Template:Cite press release',
        'Template:Cite video',
        'Template:Cite web',
        'Template:Unicite',
        'Template:Vancite conference',
        'Template:Vancite journal',
        'Template:Vancite news',
        'Template:Vancite web',
        'Template:Vcite conference',
        'Template:Vcite journal',
        'Template:Vcite news',
        'Template:Vcite web',
    );
    if(exists($templates{''})){
        $api->warn("Failed to get citation template redirects: ".$templates{''}{'error'}."\n");
        return 60;
    }

    # Get regex for finding {{dead link}}
    my (%dl,$dlre);
    if(defined($deadlink)){
        %dl=$api->redirects_to_resolved($deadlink);
        if(exists($dl{''})){
            $api->warn("Failed to get dead link template redirects: ".$dl{''}{'error'}."\n");
            return 60;
        }
        $dlre='{{(?i:\s*Template\s*:)?\s*(?:'.join('|',map { $_="\Q$_\E"; s/^Template\\:(.)/(?i:$1)/; s/\\ /[ _]/g; $_; } keys %dl).')(?>\s*(?s:\|.*?)?}})';
        $dlre=qr/$dlre/;
    } else {
        %dl=();
        $dlre=qr/(*F)x/;
    }

    $self->{'protocols'}=[qw/http https/] unless @{$self->{'protocols'}};
    while(@{$self->{'protocols'}}){
        if(!defined($self->{'iter'})){
            $self->{'iter'}=$api->iterator(
                list        => 'exturlusage',
                euprop      => 'title',
                euquery     => [@euquery],
                euprotocol  => shift @{$self->{'protocols'}},
                eunamespace => '0',
                eulimit     => '1000', # exturlusage has issues with big lists
            );
        }
        while(my $pg=$self->{'iter'}->next){
            if(!$pg->{'_ok_'}){
                $api->warn("Failed to retrieve page list for ".$self->{'iter'}->iterval.": ".$pg->{'error'}."\n");
                return 60;
            }

            return 0 if $api->halting;
            my $page=$pg->{'title'};
            my $tok=$api->edittoken($page, EditRedir => 1);
            if($tok->{'code'} eq 'shutoff'){
                $api->warn("Task disabled: ".$tok->{'content'}."\n");
                return 300;
            }
            if($tok->{'code'} ne 'success'){
                $api->warn("Failed to get edit token for $page: ".$tok->{'error'}."\n");
                next;
            }
            if(exists($tok->{'missing'})){
                $api->warn("WTF? $page does not exist?\n");
                next;
            }

            # Setup flags
            $self->{'flags'}={cite=>0,link=>0,404=>0,fail=>0};

            my $intxt=$tok->{'revisions'}[0]{'slots'}{'main'}{'*'};
            my $outtxt=$intxt;

            # Replace the links. First, do citation templates.
            my $nowiki;
            $outtxt=$api->process_templates($outtxt, sub {
                return undef if $self->{'flags'}{'fail'};
                my $name=shift;
                my $params=shift;
                my $wikitext=shift;
                my $data=shift;
                my $oname=shift;

                return undef unless exists($templates{"Template:$name"});

                my $ret="{{$oname";
                my $archived=0;
                my $url='';
                my ($accessdate,$date,$year,$month);
                $year=$month='XXX';
                foreach ($api->process_paramlist(@$params)){
                    $_->{'name'}=~s/^\s+|\s+$//g;
                    $_->{'value'}=~s/^\s+|\s+$//g;
                    if($_->{'name'} eq 'url'){
                        $url=$_->{'value'};
                    } elsif($_->{'name'} eq 'accessdate'){
                        $accessdate=str2time($_->{'value'});
                    } elsif($_->{'name'} eq 'date'){
                        $date=str2time($_->{'value'});
                    } elsif($_->{'name'} eq 'year' && $_->{'value'}=~/^\d+$/){
                        $year=$_->{'value'};
                    } elsif($_->{'name'} eq 'month'){
                        $month=$_->{'value'};
                    } elsif($_->{'name'} eq 'archiveurl'){
                        $archived=1;
                    }
                    $ret.='|'.$_->{'text'};
                }
                my $r404='';
                if(!$archived && $url=~m!^$proto1$linkre!){
                    my ($u,$dt);
                    $dt=$accessdate // $date // str2time("1 $month $year") // str2time("1 June $year") // time();
                    ($u,$dt,$r404)=chkExtLink($self,$api,0,$url, $dt);
                    return undef if($self->{'flags'}{'fail'});
                    $ret.="|archiveurl=$u|archivedate=$dt" unless $r404;
                    if(!$r404){
                        $ret=~s/$rmdl//g;
                        $r404=$rmdl;
                    }
                }
                $ret.="}}".$r404;
                return $ret;
            });
            return 60 if($self->{'flags'}{'fail'});

            # Next, strip for raw link processing
            # Regular expressions are adapted from those MediaWiki uses to
            # recognize external links.
            ($outtxt,$nowiki)=$api->strip_nowiki($outtxt);
            ($outtxt,$nowiki)=$api->strip_templates($outtxt, sub {
                my $name=shift;
                return exists($templates{"Template:$name"});
            }, {}, $nowiki);

            # Strip out ref tags, then replace any links with a guess at access
            # date.
            ($outtxt,$nowiki)=$api->strip_regex(qr!<ref[ >].*?</ref>!, $outtxt, $nowiki);
            my @arc=qw/[aA]rchive webcitation\.org [wW]ayback/;
            my $arc='(?:'.join('|',@arc).')';
            while(my ($k,$v)=each %$nowiki){
                next unless $v=~/^<ref/;
                next if $v=~/$arc/;
                my ($dt,$nw);

                # We have to re-strip here, because the saved values here are
                # automatically unstripped.
                ($v,$nw)=$api->strip_nowiki($v);
                ($v,$nw)=$api->strip_templates($v, sub {
                    my $name=shift;
                    return exists($templates{"Template:$name"});
                }, {}, $nw);

                $dt=str2time($1) if $v=~/(?:accessed|retrieved)(?: +on)? +(\d{4}-\d{2}-\d{2}|\d+ \w+,? \d{4}|\w+ \d+,? \d{4})/i;

                $v=~s{\[($proto1$linkre$chars*)($btext)\]}{ chkExtLink($self,$api,1,$1,$dt // time(),$2) }ge;
                return 60 if($self->{'flags'}{'fail'});
                ($v,$nw)=$api->strip_regex(qr{\[$proto1$chars+$btext\]}, $v, $nw);
                $v=~s{\b($proto2$linkre$chars*)}{ chkExtLink($self,$api,2,$1,$dt // time()) }ge;
                return 60 if($self->{'flags'}{'fail'});
                $v=$api->replace_stripped($v,$nw);
                $nowiki->{$k}=$v;
            }

            # Fix any bracketed external link that doesn't have "Archive" or the
            # like in the line after it.
            $outtxt=~s{\[($proto1$linkre$chars*)($btext)\](?!.*$arc)}{ chkExtLink($self,$api,1,$1,time(),$2) }ge;
            return 60 if($self->{'flags'}{'fail'});

            # Hide all bracketed external links. We have to keep track of the
            # replacement token for the ones that have "Archive" etc in their
            # display text.
            ($outtxt,$nowiki)=$api->strip_regex(qr{\[$proto1$chars+$btext\]}, $outtxt, $nowiki);
            while(my ($k,$v)=each %$nowiki){
                push @arc, $k if $v=~m!^\[$proto1$chars+ *.*$arc!;
            }
            $arc='(?:'.join('|',@arc).')';

            # Fix any bare external link that doesn't have "Archive" or the like in
            # the line after it.
            $outtxt=~s{\b($proto2$linkre$chars+)(?!.*$arc)}{ chkExtLink($self,$api,2,$1,time()) }ge;
            return 60 if($self->{'flags'}{'fail'});

            # Unstrip
            $outtxt=$api->replace_stripped($outtxt,$nowiki);

            # Remove "no-dead-link" markers
            $outtxt=~s/\Q$nodl\E//g;

            # rm marked {{dead link}} templates (and $rmdl markers)
            $outtxt=~s/\Q$rmdl\E(?:\s*$dlre)*//g;

            # rm duplicate {{dead link}} templates too
            $outtxt=~s/$dlre+($dlre)/$1/g;

            if($outtxt ne $intxt){
                my @summary=();
                push @summary, "adding archiveurl for archived $desc cites" if $self->{'flags'}{'cite'};
                push @summary, "changing archived $desc links" if $self->{'flags'}{'link'};
                push @summary, "tagging dead $desc links" if($self->{'flags'}{'404'} && defined($deadlink));
                unless(@summary){
                    $api->warn("Changes made with no summary for $page, not editing");
                    next;
                }
                $summary[$#summary]='and '.$summary[$#summary] if @summary>1;
                my $summary=ucfirst(join((@summary>2)?', ':' ', @summary));
                $summary.=" per $req";
                $api->log("$summary in $page");
                my $r=$api->edit($tok, $outtxt, "$summary. $screwup", 1, 1);
                if($r->{'code'} ne 'success'){
                    $api->warn("Write failed on $page: ".$r->{'error'}."\n");
                    next;
                }
            }

            # If we've been at it long enough, let another task have a go.
            return 0 if time()>=$endtime;
        }
        $self->{'iter'}=undef;
    }

    $api->log("May be DONE!");
    return 600;
}

sub chkExtLink {
    my $self=shift;
    if($self->{'flags'}{'fail'}){
        return wantarray?('fail','fail','fail'):'fail';
    }

    my $api=shift;
    my $fmt=shift;
    my $url=shift;
    my $date=shift;
    my $txt='';

    if($fmt==2){
        # Duplicate Mediawiki post-processing of bare external links
        $txt=$1.$txt if $url=~s/((?:[<>]|&[lg]t;).*$)//;
        my $sep=',;\.:!?';
        $sep.=')' unless $url=~/\(/;
        $txt=$1.$txt if $url=~s/([$sep]+$)//;

        # There shouldn't be a template inside the url
        $txt=$1.$txt if $url=~s/(\{\{.*$)//;

        return $url.$txt unless $url=~m!^$proto2$linkre!;
    }

    # Get archive link and date
    my @archives;
    my ($u, $dt);
    if(exists($api->store->{$url})){
        @archives=@{$api->store->{$url}};
    } else {
        ($u="http://web.archive.org/web/*/$url")=~s!/$proto1!/!;
        $api->log("... Checking $u");

        # Screen-scrape archive.org
        my $r=$self->{'ua'}->get($u);
        if($r->is_success){
            foreach $_ ($r->decoded_content=~m!href="(http://web.archive.org/web/\d+/[^\x22]*)"!g) {
                $_ = HTML::Entities::decode($_);
                $api->log("... Got $_");

                if(m!^http://web.archive.org/web/(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})!){
                    $dt=timegm($6,$5,$4,$3,$2-1,$1-1900);
                } else {
                    $dt=time();
                }
                push @archives, [$dt, $_];
            }
        } elsif($r->code eq '404'){
            $api->log("... Failed with ".$r->code);
        } elsif($r->code eq '403' && $r->decoded_content=~m!<p class="mainTitle">Blocked Site Error.<br><br>\s*</p>\s*<p class="mainBigBody"><i>\Q$url\E</i> is not available in the Wayback Machine!){
            $api->log("... Failed with 403 'not available in the Wayback Machine'");
        } else {
            $api->log("... Failed with ".$r->code.", will retry later");
            $self->{'flags'}{'fail'}=1;
            return chkExtLink($self);
        }

        # check webcite too
        $u=URI->new('http://www.webcitation.org/query');
        $u->query_form(url=>$url,returnxml=>1);
        $u=$u->as_string;
        $api->log("... Checking $u");
        $r=$self->{'ua'}->get($u);
        if($r->is_success){
            my $xml=$self->{'xml'}->load_xml(string=>$r->decoded_content);
            if($xml){
                foreach $_ (@{$xml->find('//result[@status=\'success\']')}){
                    $dt=$_->find('./timestamp');
                    my $uu=URI->new('http://www.webcitation.org/query');
                    $uu->query_form(url=>$url,date=>$dt);
                    $uu=$uu->as_string;
                    # Not exactly RFC-compliant, but it works fine
                    $uu=~s/%3A/:/g; $uu=~s/%2F/\//g;
                    $api->log("... Got $uu");
                    push @archives, [str2time($dt) // time(), $uu];
                }
            } else {
                $api->log("... Invalid XML data");
                $self->{'flags'}{'fail'}=1;
                return chkExtLink($self);
            }
        } elsif($r->code eq '404'){
            $api->log("... Failed with ".$r->code);
        } else {
            $api->log("... Failed with ".$r->code.", will retry later");
            $self->{'flags'}{'fail'}=1;
            return chkExtLink($self);
        }

        $api->store->{$url}=\@archives;
    }

    # Then pull the closest archive to the accessdate or whatever.
    my ($diff,$r404)=(1e100,defined($deadlink)?"{{$deadlink|date=".strftime('%B %Y', gmtime)."|bot=".$api->user."}}":$nodl);
    $u=undef;
    foreach $_ (@archives){
        if(abs($_->[0] - $date) < $diff){
            $diff=abs($_->[0] - $date);
            ($dt,$u)=@$_;
            $r404='';
        }
    }

    if($r404){
        $self->{'flags'}{'404'}=1;
    } elsif($fmt==0){
        $self->{'flags'}{'cite'}=1;
    } else {
        $self->{'flags'}{'link'}=1;
    }

    if($fmt==0){ # cite template
        return ($u,strftime('%Y-%m-%d',gmtime($dt // 0)),$r404);
    } elsif($fmt==1){ # Bracketed external link
        my $txt=shift;
        return $r404?"[$url$txt]$r404":"[$u$txt]$rmdl";
    } elsif($fmt==2){ # Bare external link
        return ($r404?"[$url $url]$r404":"$u$rmdl").$txt.$rmdl;
    } else {
        return undef;
    }
}

1;