User:Coren/csb2.js

From Wikipedia, the free encyclopedia
Note: After saving, you have to bypass your browser's cache to see the changes. Google Chrome, Firefox, Microsoft Edge and Safari: Hold down the ⇧ Shift key and click the Reload toolbar button. For details and instructions about other browsers, see Wikipedia:Bypass your cache.
//<syntaxhighlight language=perl>
#! /usr/bin/perl

use LWPx::ParanoidAgent;
use HTTP::Cookies;
use URI::Escape;
use Text::Align::WagnerFischer;


# Shared HTTP client state used by every WP*/YahooFind helper below.
# Cookies are kept in a file under $HOME and written back automatically
# (autosave => 1).
$cookie_jar = HTTP::Cookies->new(
    file     => "$ENV{'HOME'}/lwp_cookies.dat",
    autosave => 1,
);

# User agent with a 20 second timeout, identified as CorenSearchBot/1.0.
$ua = LWPx::ParanoidAgent->new(timeout => 20);
$ua->agent("CorenSearchBot/1.0 ");
$ua->cookie_jar($cookie_jar);

# Doing($msg)
# Print one progress line on STDOUT: a tab, the message, a newline.
sub Doing($) {
    my $msg = shift;
    print "\t" . $msg . "\n";
}

# significant($text)
#
# Split $text on newlines and return only the lines that carry enough
# "real" words to be worth searching for.
#
# A line is dropped when it
#   - contains " Category " or " Categories ",
#   - contains "align", or
#   - scores fewer than 3, where the score is the number of runs of five or
#     more lower-case letters minus 2 for every "*" that sits directly
#     between two word characters.
#
# Returns the surviving lines, in their original order.
sub significant($) {
    my @kept;
    foreach my $line (split "\n", $_[0]) {
        next if $line =~ m/ Categor(y|ies) /;
        next if $line =~ m/align/;

        # Count the matches with explicit patterns in list context.  The
        # previous code re-ran "the last successful pattern" through an
        # empty m//g, which only works while no other successful match
        # happens in between.
        my $long_words = () = $line =~ m/\b[a-z]{5,}\b/g;
        my $stars      = () = $line =~ m/\b\*\b/g;

        next if $long_words - 2 * $stars < 3;
        push @kept, $line;
    }
    return @kept;
}

# complete($text)
# Split $text on newlines and return every line except those containing
# " Category " or " Categories ".
sub complete($) {
    my ($text) = @_;
    my @kept = grep { $_ !~ m/ Categor(y|ies) / } split "\n", $text;
    return @kept;
}

# tokenize(@lines)
# Break every line on single spaces and return, in order, the pieces that
# are longer than three characters.
sub tokenize(@) {
    my @tokens = grep { length($_) > 3 } map { split / /, $_ } @_;
    return @tokens;
}

# statementize($text)
#
# Flatten already de-marked-up text into newline-separated statements and
# return the result.  The argument is copied; neither it nor the caller's
# $_ is modified (the previous version assigned to the global $_).
#
# NOTE(review): "!-?" in the tr/// below is a character RANGE (0x21..0x3F),
# so it blanks digits and most punctuation, including "." and "*".  As
# written, the later substitutions that look for "." or "*" therefore have
# nothing left to match.  This is kept as-is because the scoring elsewhere
# depends on the current output; confirm whether only "!", "-" and "?" were
# meant.
sub statementize($) {
    my ($text) = @_;

    # foreach aliases $_ to our private copy for the duration of the block.
    for ($text) {
        s/---*/ /g;                 # runs of two or more dashes -> one space
        tr/!-?/ /;                  # see NOTE(review) above
#s/  */ /g;
        s/^ *//g;                   # leading spaces
        s/ *$//g;                   # trailing spaces
        s/\*([^ .])/$1/g;
        s/\.  */.\n/g;              # break after ". "
#while(s/([^. \n]) *([A-Z][a-zA-Z0-9_]*)/\1 */gs) { }
#while(s/\*  *\*/* /gs) { }
        s/\.([A-Z])/\n$1/sg;
        s/  *\././g;
        s/\n\n*/\n/gs;              # collapse blank lines
        s/\.\n/\n/gs;
    }
    return $text;
}

# normalizewikitext($wikitext)
#
# Strip wiki markup from $wikitext and return the result of passing what is
# left through statementize().  The patterns look for "&lt;"/"&gt;", i.e.
# the input is expected to still be entity-encoded.
#
# The argument is copied and all work happens on that copy; the caller's $_
# is left alone (the previous version assigned to the global $_).
sub normalizewikitext($) {
    my ($text) = @_;
    my $statements;

    # foreach aliases $_ to our private copy for the duration of the block.
    for ($text) {
        tr/*#/::/;                                  # list markers -> ":"
        s/&lt;ref&gt;.*?&lt;\/ref&gt;/ /igs;       # <ref>...</ref>
        s/&lt;.*?&gt;/ /igs;                        # any other encoded tag
        s/&[^;]*;/ /gs;                             # remaining entities
        while (s/('''*)(.*?)\1/ $2 /gs) { }         # ''italic'' / '''bold'''
        s/\[\[([^|\]]*)]]/ $1 /gs;                  # [[target]]
        s/\[\[.*?\|(.*?)]]/ $1 /gs;                 # [[target|label]]
        s/\[[^ ]* (.*?)]/ $1 /gs;                   # [url label]
        s/\[.*?]/ /gs;                              # any other [...]
        s/^(===*)(.*?)\1/$2. /g;                    # heading at start of text
        # Braces are escaped: an unescaped "{" inside a pattern is
        # deprecated or rejected by newer perls.
        s/\{\{.*?\}\}/ /gs;                         # {{templates}}

        # Called while $_ is still aliased to our copy, so that a
        # statementize() which writes to $_ cannot reach the caller's $_.
        $statements = statementize($_);
    }
    return $statements;
}

# normalizewebtext($html)
#
# Replace every <...> tag and every &...; entity in $html by a space and
# return the result of passing what is left through statementize().
#
# The argument is copied and all work happens on that copy; the caller's $_
# is left alone (the previous version assigned to the global $_).
sub normalizewebtext($) {
    my ($text) = @_;
    my $statements;

    # foreach aliases $_ to our private copy for the duration of the block.
    for ($text) {
        s/<.*?>/ /igs;
        s/\&.*?;/ /gs;

        # Called while $_ is still aliased to our copy, so that a
        # statementize() which writes to $_ cannot reach the caller's $_.
        $statements = statementize($_);
    }
    return $statements;
}

# WPRequest(@params)
# POST the already-encoded "key=value" strings in @params, joined with "&",
# to the English Wikipedia api.php through the shared $ua.
# Returns the response body on success, undef otherwise.
sub WPRequest(@) {
    my @params = @_;

    my $post = HTTP::Request->new(POST => 'http://en.wikipedia.org/w/api.php');
    $post->content_type('application/x-www-form-urlencoded');
    $post->content(join '&', @params);

    my $reply = $ua->request($post);
    return undef unless $reply->is_success;
    return $reply->content;
}

# WPLogin($uname, $pwd)
#
# Submit the Special:Userlogin form with the given credentials through the
# shared $ua and store any cookies from the response in $cookie_jar.
#
# Always returns the string "Ok".
# NOTE(review): the response is not inspected, so a rejected login is not
# detected here.
sub WPLogin($$) {
    my ($uname, $pwd) = @_;
    # Both values go into a form-encoded body, so both must be escaped
    # (previously only the password was).
    $uname = uri_escape($uname);
    $pwd   = uri_escape($pwd);

    my $req = HTTP::Request->new(POST => 'http://en.wikipedia.org/w/index.php?title=Special:Userlogin&action=submitlogin&type=login');
    $req->content_type('application/x-www-form-urlencoded');
    $req->content("wpName=$uname&wpPassword=$pwd&wpRemember=1&wpLoginattempt=Log+in");
    my $res = $ua->request($req);

    # extract_cookies() reads Set-Cookie headers from a RESPONSE; it used to
    # be handed the request, which carries none.
    $cookie_jar->extract_cookies($res);
    return "Ok";
}

# WPStartEdit($title)
#
# Fetch the edit form for $title and pull out what is needed to save it.
#
# Returns, when the HTTP request succeeded, the list
#   ($escaped_title, $edit_token, $more, $text)
# where $escaped_title is $title after uri_escape, $more is a string of
# "&wpStarttime=..." / "&wpEdittime=..." form fields (whichever were found)
# and $text is the textarea content with &lt; &gt; &quot; &amp; decoded.
# Any piece not found in the page is undef.  Returns undef when the request
# failed.
sub WPStartEdit($) {
    my ($title) = @_;
    $title = uri_escape($title);

    my $res  = $ua->request(
        HTTP::Request->new(GET => "http://en.wikipedia.org/w/index.php?title=$title&action=edit")
    );
    my $page = $res->content;

    # Current wikitext, as shown in the edit box.
    my $txt;
    if ($page =~ m/<textarea[^>]*>(.*)<\/textarea>/s) {
        $txt = $1;
    }
    # &amp; is decoded last so that "&amp;lt;" ends up as "&lt;", not "<".
    $txt =~ s/&lt;/</gs;
    $txt =~ s/&gt;/>/gs;
    $txt =~ s/&quot;/"/gs;
    $txt =~ s/&amp;/\&/gs;

    # Hidden form fields that have to be echoed back when saving.
    my $et;
    if ($page =~ m/<input type='hidden' value="([^"]*?)" name="wpEditToken" \/>/s) {
        $et = $1;
    }
    my $more;
    if ($page =~ m/<input type='hidden' value="([^"]*?)" name="wpStarttime" \/>/s) {
        $more .= '&wpStarttime=' . uri_escape($1);
    }
    if ($page =~ m/<input type='hidden' value="([^"]*?)" name="wpEdittime" \/>/s) {
        $more .= '&wpEdittime=' . uri_escape($1);
    }

    return undef unless $res->is_success;
    return ($title, $et, $more, $txt);
}

# WPTryEdit($title, $et, $more, $txt, $es)
#
# Submit an edit.
#   $title - page title, already URI-escaped (as returned by WPStartEdit)
#   $et    - edit token
#   $more  - extra pre-encoded form fields, each starting with "&"
#   $txt   - new page text
#   $es    - edit summary
#
# Returns undef when the response contains "<textarea" (the edit form came
# back instead of the edit being accepted), 1 otherwise.
# NOTE(review): a response without a textarea counts as success even if the
# HTTP request itself failed; callers retry only on undef.
sub WPTryEdit($$$$$) {
    my ($title, $et, $more, $txt, $es) = @_;

    my @fields = (
        'wpSection=',
        'wpSummary='   . uri_escape($es),
        'wpSave=wpSave',
        'wpEditToken=' . uri_escape($et),
        'wpTextbox1='  . uri_escape($txt),
    );

    my $req = HTTP::Request->new(POST => "http://en.wikipedia.org/w/index.php?title=$title&action=submit");
    $req->content_type('application/x-www-form-urlencoded');
    $req->content(join('&', @fields) . $more);
    my $res = $ua->request($req);

    # The old code also captured a fresh wpEditToken from the response into
    # the local $et and then never used it; that dead match is gone.
    return undef if $res->content =~ m/<textarea/;
    return 1;
}

# WPArticle($title)
#
# Ask the API for one revision of $title with its content.
# Returns the text between <rev> and </rev> when the reply contains such an
# element; otherwise returns whatever WPRequest returned (the raw reply, or
# undef on failure).  No entity decoding is done here.
sub WPArticle($) {
    my ($title) = @_;

    my @query = (
        'action=query',
        'prop=revisions',
        'titles=' . uri_escape($title),
        'rvprop=content',
        'rvlimit=1',
        'format=xml',
    );
    my $reply = WPRequest(@query);

    if ($reply =~ m/<rev>(.*?)<\/rev>/s) {
        return $1;
    }
    return $reply;
}

# WPNewPages()
#
# Fetch up to 500 recent changes in namespace 0 and return the titles of
# the entries of type="1" whose revid is greater than the global
# $last_revid, in the order the API lists them.  Scanning stops at the
# first entry whose revid is not greater than $last_revid.  Afterwards
# $last_revid is raised to the highest revid returned.
#
# NOTE(review): titles are taken verbatim from the XML attribute; entity
# references in them (e.g. &amp;) are not decoded here.
sub WPNewPages() {
    my $list = WPRequest('action=query',
                         'list=recentchanges',
                         'rclimit=500',
                         'rcnamespace=0',
                         'format=xml');
    my @news;
    my $maxrid = 0;

    # One explicit m//g loop; the old code matched once in an if() and then
    # continued through an empty pattern ("last successful regex").
    while ($list =~ m/<rc type="1" .*? title="([^"]*)" .*? revid="([0-9]+)"/g) {
        my ($title, $revid) = ($1, $2);
        last if $revid <= $last_revid;
        $maxrid = $revid if $revid > $maxrid;
        push @news, $title;
    }

    $last_revid = $maxrid if $maxrid > $last_revid;
    return @news;
}

# WPCreator($title)
#
# Ask the API for the user of the oldest revision of $title
# (rvdir=newer, rvlimit=1).
# Returns the user name, or undef when the reply holds no matching
# <rev user="..." /> element.
sub WPCreator($) {
    my ($title) = @_;

    my @query = (
        'action=query',
        'prop=revisions',
        'titles=' . uri_escape($title),
        'rvprop=user',
        'rvlimit=1',
        'rvdir=newer',
        'format=xml',
    );
    my $reply = WPRequest(@query);

    if ($reply =~ m/<rev user="([^"]*?)" \/>/s) {
        return $1;
    }
    return undef;
}


# YahooFind($query)
#
# Run $query against the Yahoo web search service (5 results, English) and
# return the result URLs found in the reply.  <Cache> elements are removed
# before the <Url> elements are collected, and one trailing "/" is dropped
# from each URL.
sub YahooFind($) {
    my $query = join(' ', @_);

    my $req = HTTP::Request->new(GET => 'http://search.yahooapis.com/WebSearchService/V1/webSearch?appid=SANITIZED&query='.uri_escape($query).'&results=5&language=en');
    my $res = $ua->request($req);

    my $r = $res->content;
    $r =~ s/<Cache>.*?<\/Cache>//sg;
    my @re = $r =~ m/<Url>([^<]*?)\/?<\/Url>/gs;

    # "$#re+1" inside a string interpolates only $#re and printed e.g.
    # "found 4+1 results"; report the real count.
    Doing("Search \"$query\" found " . scalar(@re) . " results");
    return @re;
}

# top3($query)
#
# Search for $query and file the first three results into the candidate
# lists @web and @enwiki.  Those are the dynamically scoped arrays that
# findmatches() sets up with local() before calling this.
#
#   - URLs containing ".pdf" (any case) and URLs already in @web are skipped.
#   - en.wikipedia.org /wiki/ URLs are turned back into page titles and
#     added to @enwiki (once each).
#   - URLs whose host matches a pattern in @exclude are skipped.
#   - Everything else is appended to @web; the sub returns as soon as
#     $#web exceeds 5.
#
# Returns nothing useful.
sub top3($) {
    my ($q) = @_;

    # This was "my @uri, YahooFind($q);" - a comma instead of an
    # assignment, which left @uri empty so no result was ever examined.
    my @uri = YahooFind($q);
    $#uri = 2 if $#uri > 2;

    SITE:
    foreach my $uri (@uri) {
        next if $uri =~ m/\.[pP][Dd][Ff]/;
        foreach my $seen (@web) {
            next SITE if $seen eq $uri;
        }

        # Host part of the URL.  No trailing "/" is required: YahooFind
        # strips it from bare-domain URLs, and requiring it left $site
        # undefined for those, letting them bypass @exclude.
        my $site;
        $site = $1 if $uri =~ m{^[^:]*://([^/]*)};

        if ($site eq 'en.wikipedia.org' and $uri =~ m{/wiki/}) {
            $uri =~ s{.*/wiki/(.*)}{$1};
            $uri = uri_unescape($uri);
            $uri =~ tr/_/ /;
            foreach my $seen (@enwiki) {
                next SITE if $seen eq $uri;
            }
            push @enwiki, $uri;
            next SITE;
        }

        foreach my $re (@exclude) {
            next SITE if $site =~ $re;
        }
        push @web, $uri;
        return if $#web > 5;
    }
}

# findmatches($title)
#
# Look for an existing web page or Wikipedia article that the article
# $title appears to have been copied from.
#
# Steps:
#   1. Fetch the article; tokenize it (at most 201 tokens are kept) and pick
#      out its significant lines.
#   2. Build search queries from lines number 1, 7 and second-to-last
#      (0-based) plus one query for the bare title (with any "(...)"
#      qualifier removed), and collect candidates through top3().
#   3. Fetch each candidate, tokenize it and align it against the article
#      with Text::Align::WagnerFischer; keep the lowest-scoring candidate
#      that is under the applicable threshold.
#
# Returns
#   undef                     when the article has fewer than 6 tokens or no
#                             significant line;
#   ($why, $what, $score/10)  when a candidate scored below
#                             $config{MinScore}; $why is 'pageincluded',
#                             'pageincludes' or 'wikipage', $what the URL or
#                             article title;
#   ('', '', 1000)            when nothing scored low enough.
sub findmatches($) {
    my $article = WPArticle($_[0]);
    my @atokens = tokenize(complete(normalizewikitext($article)));
#print "article <", join(' ', @atokens), ">\n";
    my @paras   = significant(normalizewikitext($article));

    my $why      = undef;
    my $score    = $config{MinScore};
    my $what     = undef;
    my $what_ok;                # best candidate seen, accepted or not
    my $score_ok = 50000;

    # Candidate lists filled in by top3(); dynamically scoped on purpose.
    local @web;
    local @enwiki;

    return undef if $#atokens < 5;
    $#atokens = 200 if $#atokens > 200;
    return undef if $#paras < 0;

    my $title = $_[0];
    $title =~ s/\(.*?\) *//;

    my $ln = 0;
    foreach my $l (@paras) {
        if ($ln == 1 or $ln == 7 or $ln == ($#paras - 1)) {
            # Everything after the first space of the line.
            if ($l =~ m/ (.*)\.?/) {
                my @tq = split ' ', $1;
                my @q;
                my $num = 0;
                foreach my $w (@tq) {
                    push @q, $w if $w =~ m/[a-zA-Z0-9*]/;
                    $num++ if not $w eq '*';
                    last if $num > 9;       # roughly ten real words per query
                }
                my $q = join ' ', @q;
                top3("\"$title\" $q");
            }
        }
        $ln++;
    }
    top3("\"$title\"");

    foreach my $uri (@web) {
        Doing("checking $uri");
        my @src = eval {
            local $SIG{ALRM} = sub { die "alarm\n" };
            alarm 25;
            # The watchdog now covers the fetch itself; "alarm 0" used to
            # run before the request was even sent.
            my $res = $ua->request(HTTP::Request->new(GET => $uri));
            alarm 0;
            if ($res->is_success) {
                my @page = tokenize(complete(normalizewebtext($res->content)));
#print "webpage <", join(' ', @page), ">\n";
                return @page if $#page > 9;
            }
            return undef;
        };
        # Make sure no alarm stays armed if the eval died for another reason.
        alarm 0;
        # Covers a timeout or other die (empty list), a failed fetch (single
        # undef) and pages that are too short.
        next if $#src < 10;

        # Bound the size of the alignment problem.
        $#src = 100000/$#atokens  if $#src*$#atokens > 100000;

        my $alignment = Text::Align::WagnerFischer->new(
                                                    left => \@src,
                                                    right => \@atokens,
                                                    weights => [0,1,2]
                                                   );

        my $maybe = 'pageincluded';
        my $dif   = abs($#src - $#atokens);
        my $sina  = ($alignment->cost()-$dif)*1000/$#src;
        my $ains  = ($alignment->cost()-$dif)*1000/$#atokens;
        Doing("$#src/$#atokens $dif gives cost ".($alignment->cost()-$dif)." for $sina/$ains");
        if ($ains > $sina) {
            $maybe = 'pageincludes';
            $sina  = $ains;
        }

        # Articles shorter than 30 tokens get a proportionally scaled
        # threshold.
        my $need = $config{MinScore};
        $need = ($need*$#atokens)/30 if $#atokens < 30;
        if ($sina < $need and $sina < $score) {
            $why   = $maybe;
            $score = $sina;
            $what  = $uri;
        }
        if ($sina < $score_ok) {
            $score_ok = $sina;
            $what_ok  = $uri;
        }
    }

    foreach my $uri (@enwiki) {
        next if $uri eq $_[0];          # never compare the article to itself
        my $test = WPArticle($uri);
        my @src = tokenize(complete(normalizewikitext($test)));
        next if $#src < 10;
        my $alignment = Text::Align::WagnerFischer->new(
                                                        left => \@src,
                                                        right => \@atokens,
                                                        weights => [-1,1,2]
                                                       );
        my $sina = $alignment->cost()*1000/$#src;
        my $ains = $alignment->cost()*1000/$#atokens;
        $sina = $ains if $ains < $sina;
        if ($sina < -400 and $sina < $score) {
            $why   = 'wikipage';
            $what  = $uri;
            $score = $sina;
        }
        if ($sina < $score_ok) {
            $score_ok = $sina;
            $what_ok  = $uri;
        }
    }

    return ($why, $what, ($score)/10) if $score < $config{MinScore};
    Doing("Best match was $what_ok with $score_ok");
    return ('', '', 1000);
}

# TagPage($title, $type, $what)
#
# Put a {{csb-$type}} tag at the top of the article $title, leave a notice
# on the creator's talk page and add an entry to the page named by
# $config{ReportTo}.
#   $type - 'pageincluded', 'pageincludes' or 'wikipage' (from findmatches)
#   $what - URL or article title the text appears to come from
#
# Returns a short reason string when the article is NOT tagged (creator is
# listed in @allies, page is a redirect, already attributed/tagged/
# speedied, or shorter than 20 characters), and undef once the tag has been
# placed.  Every edit is retried until WPTryEdit reports success.
#
# NOTE(review): the notice and report strings carry a fixed
# "22:41, 18 August 2007 (UTC)" timestamp (and a fixed signature in the
# notice); confirm whether wiki signature markup was intended instead.
sub TagPage($$$) {
    my ($title, $type, $what) = @_;
    my $tag = "{{csb-$type|1=$what}}";

    my $user = WPCreator($title);
    foreach my $ally (@allies) {
        return "creator trusted" if $user eq $ally;
    }
    $user = "User talk:$user" if defined $user;

    while (1) {
        my ($ttl, $token, $more, $text) = WPStartEdit($title);
        # Literal braces are escaped: an unescaped "{" inside a pattern is
        # deprecated or rejected by newer perls.
        return "article is (now) a redirect"    if $text =~ m/^#REDIRECT/;
        return "attributed"                     if $text =~ m/\{\{DANFS\}\}/i;
        return "attributed"                     if $text =~ m/\{\{[cC]atholic\}\}/i;
        return "speedied"                       if $text =~ m/\{\{db/;
        return "marked copyvio"                 if $text =~ m/\{\{copyvio/;
        return "already tagged"                 if $text =~ m/\{\{csb-/;
        return "page gone"                      if length($text)<20;

        $text = "$tag\n\n" . $text;
        # Start over with a fresh copy of the page if the save did not go
        # through.
        next unless WPTryEdit($ttl, $token, $more, $text, "Tagging for copyvio of $what");

        # Notify the creator, if one is known.
        while (defined $user) {
            ($ttl, $token, $more, $text) = WPStartEdit($user);
            $text .= "\n{{subst:csb-notice-$type|$title|url=$what}} &mdash;&nbsp;[[User:Coren|Coren]]&nbsp;<sup>[[User Talk:Coren|(talk)]]</sup> 22:41, 18 August 2007 (UTC)\n";
            last if WPTryEdit($ttl, $token, $more, $text, "Notifying user of copyvio on $title");
        }

        # Add the article to the report page unless it is already listed.
        while (1) {
            ($ttl, $token, $more, $text) = WPStartEdit($config{ReportTo});
            # \Q...\E: the title is data, not a pattern.  Titles such as
            # "C++" used to make this qr// die and abort the bot.
            my $re = qr/\[\[\Q$title\E]]/s;
            last if $text =~ $re;
            if ($type eq 'wikipage') {
                $text .= "* [[$title]] &mdash; [[$what]]. Reported by [[User:CorenSearchBot|CSBot]] at 22:41, 18 August 2007 (UTC)\n";
            } else {
                $text .= "* [[$title]] &mdash; [$what $what]. Reported by [[User:CorenSearchBot|CSBot]] at 22:41, 18 August 2007 (UTC)\n";
            }
            last if WPTryEdit($ttl, $token, $more, $text, "Adding violation on $title");
        }
        return undef;
    }
}

# configstatus()
#
# (Re)load the bot's settings from its wiki pages into three globals:
#   %config  - "Name=value" lines of User:CorenSearchBot/config
#   @exclude - one case-insensitive, end-anchored pattern per line of
#              User:CorenSearchBot/exclude that ends in ".xx" to ".xxxx"
#              (lower-case letters)
#   @allies  - the text after the leading space(s) of each line of
#              User:CorenSearchBot/allies
# All three are emptied first.
#
# NOTE(review): exclude entries are compiled as patterns without quoting,
# so "." in a host name matches any character; confirm whether entries are
# meant to be plain host names.
sub configstatus() {
    undef %config;
    undef @exclude;
    undef @allies;

    foreach my $line (split "\n", WPArticle("User:CorenSearchBot/config")) {
        $config{$1} = $2  if $line =~ m/ *([A-Za-z]+)=(.*)/;
    }
    foreach my $line (split "\n", WPArticle("User:CorenSearchBot/exclude")) {
        push @exclude, qr/$1$/i  if $line =~ m/ *([^=]*\.[a-z]{2,4})$/;
    }
    foreach my $line (split "\n", WPArticle("User:CorenSearchBot/allies")) {
        push @allies, $1  if $line =~ m/  *([^=]*)$/;
    }
}

# ---------------------------------------------------------------------------
# Main program: log in, load the configuration, then loop forever checking
# newly created pages and, when there are none, manually requested pages.
# ---------------------------------------------------------------------------

# Queue of page titles waiting for an automatic check.
my @npq;

my $ok = WPLogin('CorenSearchBot', SANITIZED);

configstatus();
print "Configuration read.\n";
print "(", scalar(@exclude), " exclusions)\n";
print "(", scalar(@allies), " allies)\n";
print "Report to '$config{ReportTo}'\n";
print "Is a copy below $config{MinScore}\n";
print "\n";

# Titles given on the command line are checked first.
push @npq, @ARGV;

# Stack of manually requested page titles.
my @manuals;

for (;;) {
    # Refill the queue when at most one page is left in it.
    if (@npq < 2) {
        print "Fetching new pages\n";
        push @npq, WPNewPages();
        print scalar(@npq), " page(s) to check. (last revid $last_revid)\n";

        if (!@npq) {
            # Nothing new: turn to the manual request page.
            if (!@manuals) {
                foreach my $line (split "\n", WPArticle("User:CorenSearchBot/manual")) {
                    push @manuals, $1  if $line =~ m/\[\[([^]]*)]]$/;
                }
                # Requests were picked up: empty the "Unprocessed requests"
                # section, retrying until the edit is accepted.
                while (@manuals) {
                    my ($ttl, $token, $more, $text) = WPStartEdit("User:CorenSearchBot/manual");
                    $text =~ s/==Unprocessed requests==.*==Recent Results==/==Unprocessed requests==\n\n==Recent Results==/s;
                    last if WPTryEdit($ttl, $token, $more, $text, "Removing pending requests");
                }
            }

            if (@manuals) {
                # Handle one manual request per pass through the main loop.
                my $page = pop @manuals;
                print "Manually checking [[$page]]\n";
                my ($why, $what, $score) = findmatches($page);
                $score = int(100-$score);

                my $result;
                if (defined $why and not $why eq '') {
                    print "\t\033[31;1m[[$page]] is $why of [$what] with confidence $score\033[0m\n";
                    $result = "{{User:CorenSearchBot/result-yes|$page|$score|22:41, 18 August 2007 (UTC)|url=$what}}\n";
                }
                elsif ($score > -10) {
                    $result = "{{User:CorenSearchBot/result-unknown|$page|22:41, 18 August 2007 (UTC)}}\n";
                }
                else {
                    $result = "{{User:CorenSearchBot/result-no|$page|22:41, 18 August 2007 (UTC)}}\n";
                }

                # Append the verdict to the results page, retrying until the
                # edit is accepted.
                while (1) {
                    my ($ttl, $token, $more, $text) = WPStartEdit("User:CorenSearchBot/results");
                    $text .= $result;
                    last if WPTryEdit($ttl, $token, $more, $text, "Posting result of manual check");
                }
            }
            else {
                # Completely idle: wait, then pick up configuration changes.
                print "Sleeping.\n";
                sleep 20;
                configstatus();
            }
        }
    }

    # Check the next queued page, if any.
    if (@npq) {
        my $page = shift @npq;
        print "Checking [[$page]]\n";
        my ($why, $what, $score) = findmatches($page);
        if (defined $why and not $why eq '') {
            $score = int(100-$score);
            print "\t\033[31;1m[[$page]] is $why of [$what] with confidence $score\033[0m\n";
            my $res = TagPage($page, $why, $what);
            if (defined $res) {
                print "\tTagging: $res\n";
            }
            else {
                print "\tTags placed\n";
            }
        }
    }
}
//</syntaxhighlight>