Jump to content

User:Lar/ClassificationTableGen

From Wikipedia, the free encyclopedia

Old version at: User:Lar/ClassificationTableGen/Backlev

Perl code: This code generated User:Lar/Sandbox2 (version 8).. There is a lot of work to do on it yet but if you stumble across this, feedback welcome. Not ready for public release yet (if ever).

Signature shows update time/date ++Lar: t/c 03:34, 27 April 2006 (UTC)

note: requires perl 5.8.... only tested on Wintel platform.

Helper module, based on Pearle Wisebot code

[edit]

... this module needs some better commenting and trimming out of code not needed for what I need. It ALSO needs the user/cookie/token stuff made generic. It is hard coded to use my userid so may not work for anyone else (no you can't have my cookie file. Read the User:Pearle pages on how to set up your own and look for Lar in there and change it to your userid.

Filename WP_util_pearlebot.pm

package WP_util_pearlebot;  # assumes WP_util_pearlebot.pm

# based on boilerplate module declaration found here:
#   http://perldoc.perl.org/perlmod.html#Perl-Modules-module
#
# based on code that is part of the "Pearle Wisebot"
#   http://en.wikipedia.org/wiki/User:Pearle
#   http://en.wikipedia.org/wiki/User:Pearle/pearle-documentation.txt
#   http://en.wikipedia.org/wiki/User:Pearle/pearle.pl
# which was created by [[User:Beland]]: 
#  http://en.wikipedia.org/wiki/User:Beland

# Mods by Larry Pieniazek ( [[user:Lar]] )

use strict;
use warnings;

use Data::Dumper;
use Getopt::Std;

use Time::HiRes;


use utf8;
#use encoding 'utf8';

# Initialization
use LWP;
use HTTP::Cookies;
use HTTP::Request::Common qw(POST);
use HTML::Entities;



    BEGIN {
        use Exporter   ();
        our ($VERSION, @ISA, @EXPORT, @EXPORT_OK, %EXPORT_TAGS);

        # set the version for version checking
        $VERSION     = 1.00;
        # if using RCS/CVS, this may be preferred
        # $VERSION = sprintf "%d.%03d", q$Revision: 1.1 $ =~ /(\d+)/g;

        @ISA         = qw(Exporter);
        @EXPORT      = qw(&func1 &func2 &func4);
        %EXPORT_TAGS = ( );     # eg: TAG => [ qw!name1 name2! ],

        # your exported package globals go here,
        # as well as any optionally exported functions
        # @EXPORT_OK   = qw($Var1 %Hashit &func3);
        # we do not have any externals
    }
    our @EXPORT_OK;

    # exported package globals go here
    # our $Var1;
    # our %Hashit;

    # non-exported package globals go here


    # initialize package globals, first exported ones


    # then the others (which are still accessible as $WP_util_pearlebot::stuff)


    # all file-scoped lexicals must be created before
    # the functions below that use them.

    # file-private lexicals go here


    # here's a file-private function as a closure,
    # callable as &$priv_func;  it cannot be prototyped.
    # my $priv_func = sub {
        # stuff goes here.
    # };

    # make all your functions, whether exported or not;
    # remember to put something interesting in the {} stubs
    sub myLog;
    sub getPage;
    sub postPage;
    sub retry;
    sub printWikitext;
    sub test;


    END { }       # module clean-up code here (global destructor)

    ## YOUR CODE GOES HERE


# LWP:UserAgent is a library which allows us to create a "user agent"
# object that handles the low-level details of making HTTP requests.

$::ua = LWP::UserAgent->new(timeout => 300);
$::ua->agent("LarUtil/0.1");
$::ua->cookie_jar(HTTP::Cookies->new(file => "cookies.lar.txt",
				     autosave => 1));
# $::ua->cookie_jar->load();

# $::ua->

#
#  $ua = LWP::UserAgent->new;
#  $req = HTTP::Request->new(GET => 'http://www.linpro.no/secret/');
#  $req->authorization_basic('aas', 'mypassword');
#  print $ua->request($req)->as_string;

# Hot pipes
$| = 1;

#set default speedlimit

 $::speedLimit = 10;

##---

## test();



sub test
{
    my ($target, $text, $editTime, $startTime, $token);

    #$target = "Special:Userlogin";
    #($text, $editTime, $startTime, $token) = getPage($target);

    # temporary
    $::nullOK = "yes";



    $target = "Wikipedia:Sandbox";
    ($text, $editTime, $startTime, $token) = getPage($target);
    print $text;

#    die ("nopost Test complete.");

    $text .= "\nEat my electrons! -- testing Pearle clone ([[User:Lar]]) \n";
    print "---\n";
    postPage ($target, $editTime, $startTime, $token, $text, "Test 028", "najor");  # (no it is not minor)
    die ("Test complete.");
}

##---


sub getPage
{

    my ($target, $request, $response, $reply, $text, $text2,
	$editTime, $startTime, $attemptStartTime, $attemptFinishTime,
	$token);

    $target = $_[0];

    if ($target =~ m/^\s*$/)
    {
	myLog("getPage: Null target.");
	die("getPage: Null target.");
    }

    # urlSafe ($target);

    # Monitor wiki server responsiveness
    $attemptStartTime = Time::HiRes::time();

    # Create a request-object
    print "GET http://en.wikipedia.org/w/wiki.phtml?title=${target}&action=edit\n";
    myLog("GET http://en.wikipedia.org/w/wiki.phtml?title=${target}&action=edit\n");
    
    $request = HTTP::Request->new(GET => "http://en.wikipedia.org/w/wiki.phtml?title=${target}&action=edit");

    $response = $::ua->request($request);

    if ($response->is_success)
    {
	$reply = $response->content;

	# Monitor wiki server responsiveness
	$attemptFinishTime = Time::HiRes::time();
	retry ("success", "getPage", sprintf("%.3f", $attemptFinishTime-$attemptStartTime));
	
	# This detects whether or not we're logged in.
	unless ($reply =~ m%<a href="/wiki/User_talk:Lar">My talk</a>%)
	{
	    # We've lost our identity.
	    myLog ("Wiki server is not recognizing me (1).\n---\n${reply}\n---\n");
	    die ("Wiki server is not recognizing me (1).\n");
	}      
        
        my $saveReply=$reply;
        myLog ("Dump reply prior to regex processing in getPage... \n---\n${saveReply}\n---\n");
        
	$reply =~ m%<textarea\s+tabindex='1'\s+accesskey=","\s+name="wpTextbox1"\s+id="wpTextbox1"\s+rows='25'\s+cols='80'\s+>(.*?)</textarea>%s;
	$text = $1;

        
#	$reply =~ m%<textarea\s+tabindex='1'\s+accesskey=","\s+name="wpTextbox1"\s+id="wpTextbox1"\s+rows='25'\s+cols='80'\s+>(.*?)</textarea>%s;
#	$reply =~ m%<textarea\s*tabindex='1'\s*accesskey=","\s*name="wpTextbox1"\s*rows='25'\s*cols='80'\s*>(.*?)</textarea>%s;
#	$text = $1;

#        print "debug: 1: ".$1."\n";

	$reply =~ m/value="(\d+)" name="wpEdittime"/;
	$editTime = $1;

	# Added 22 Aug 2005 to correctly handle articles that have
	# been undeleted
	$reply =~ m/value="(\d+)" name="wpStarttime"/;
	$startTime = $1;

	# Added 9 Mar 2005 after recent software change.
	$reply =~ m/value="(\w+)" name="wpEditToken"/;
	$token = $1;
	###

	if (($text =~ m/^\s*$/)
	    and ($::nullOK ne "yes"))
	{
	    myLog ("getPage($target): Null text!\n");
	    myLog "\n---\n$reply\n---\n";
	    die ("getPage($target): Null text!\n");
	}

	if (($editTime =~ m/^\s*$/)
	    and ($::nullOK ne "yes"))
	{
	    myLog ("getPage($target): Null time!\n");
	    myLog "\n---\n$reply\n---\n";
	    die ("getPage($target): Null time!\n");
	}

	if (($text =~ m/>/) or
	    ($text =~ m/</))
	{
	    print $text;
	    myLog "\n---\n$text\n---\n";
	    myLog ("getPage($target): Bad text suck!\n");
	    die ("getPage($target): Bad text suck!\n");
	}
	
	# Change ( " -> " ) etc
	# This function is from HTML::Entities.
	decode_entities($text);

	# This may or may not actually work
	$::ua->cookie_jar->save();

	return ($text, $editTime, $startTime, $token);
    } 
    else 
    {
	myLog ("getPage($target): HTTP ERR (".$response->status_line.") http://en.wikipedia.org/w/wiki.phtml?title=${target}&action=edit\n".$response->content."\n");
	print ("getPage($target): HTTP ERR (".$response->status_line.") http://en.wikipedia.org/w/wiki.phtml?title=${target}&action=edit\n".$response->content."\n");
	# 50X HTTP errors mean there is a problem connecting to the wiki server
	if (($response->status_line =~ m/^500/)
	    or ($response->status_line =~ m/^502/)
	    or ($response->status_line =~ m/^503/))
	{
	    return(retry("getPage", @_));
	}
	else
	{
	    # Unhandled HTTP response
	    die ("getPage($target): HTTP ERR (".$response->status_line.") http://en.wikipedia.org/w/wiki.phtml?title=${target}&action=edit\n");
	}
    }
}

sub postPage
{
    my ($request, $response, $pageName, $textToPost, $summaryEntry,
	$editTime, $startTime, $actual, $expected, $attemptStartTime,
	$attemptFinishTime, $date, $editToken, $minor);

    $pageName = $_[0];
    $editTime = $_[1];
    $startTime = $_[2];
    $editToken = $_[3];
    $textToPost = $_[4];
    $summaryEntry = $_[5]; # Max 200 chars!
    $minor = $_[6];

    $summaryEntry = substr($summaryEntry, 0, 200);

    if ($pageName eq "")
    {
	myLog ("postPage(): Empty pageName.\n"); 
	die ("postPage(): Empty pageName.\n"); 
    }

    if ($summaryEntry eq "")
    {
	$summaryEntry = "Automated editing.";
    }
    
    # Monitor server responsiveness
    $attemptStartTime = Time::HiRes::time();

    if ($minor eq "yes")
    {
	$request = POST "http://en.wikipedia.org/w/wiki.phtml?title=${pageName}&action=submit",
	[wpTextbox1 => $textToPost,
	 wpSummary => $summaryEntry,
	 wpSave => "Save page",
	 wpMinoredit => "on",
	 wpEditToken => $editToken,
	 wpStarttime => $startTime,
	 wpEdittime => $editTime];
	# Optional: wpWatchthis
    }
    else
    {
	$request = POST "http://en.wikipedia.org/w/wiki.phtml?title=${pageName}&action=submit",
	[wpTextbox1 => $textToPost,
	 wpSummary => $summaryEntry,
	 wpSave => "Save page",
	 wpEditToken => $editToken,
	 wpStarttime => $startTime,
	 wpEdittime => $editTime];
	# Optional: wpWatchthis, wpMinoredit
    }

    # ---
    ## If posts are failing, you can uncomment the below to see what
    ## HTTP request is being made.
    # myLog($request->as_string());
    # print $request->as_string();	$::speedLimit = 60 * 10;
    # print $::ua->request($request)->as_string;
    # ---

    myLog("POSTing...");
    print "POSTing...";
    # Pass request to the user agent and get a response back
    $response = $::ua->request($request);
    myLog("POSTed.\n");
    print "POSTed.\n";


    if ($response->content =~ m/Please confirm that really want to recreate this article./)
    {
	myLog ($response->content."\n");
	die ("Deleted article conflict! See log!");
    }


    # Check the outcome of the response
    if (($response->is_success) or ($response->is_redirect))
    {
	# Monitor server responsiveness
	$attemptFinishTime = Time::HiRes::time();
	retry ("success", "postPage", sprintf("%.3f", $attemptFinishTime-$attemptStartTime));


	$expected = "302 Moved Temporarily";
	$actual = $response->status_line;
	if (($expected ne $actual)
	    and ($actual ne "200 OK"))
	{
	    myLog ("postPage(${pageName}, $editTime)#1 - expected =! actual\n");
	    myLog ($request->as_string());
	    myLog ("EXPECTED: '${expected}'\n");
	    myLog ("  ACTUAL: '${actual}'\n");

	    die ("postPage(${pageName}, $editTime)#1 - expected =! actual - see log\n");
	}

	$expected = "http://en.wikipedia.org/wiki/${pageName}";
	$expected =~ s/\'/%27/g;
	$expected =~ s/\*/%2A/g;
	# $expected = urlEncode($expected);

	$actual = $response->headers->header("Location");


	if (($expected ne $actual)
 	    and !(($actual eq "") and ($response->status_line eq "200 OK")))
 	{
 	    myLog ("postPage(${pageName}, $editTime)#2 - expected =! actual\n");
	    myLog ("EXPECTED: '${expected}'\n");
	    myLog ("  ACTUAL: '${actual}'\n");
	    die ("postPage(${pageName}, $editTime)#2 - expected =! actual - see log\n");
	}


	if ($response->content =~ m/<h1 class="firstHeading">Edit conflict/)
	{
	    myLog ("Edit conflict on '$pageName' at '$editTime'!\n");
	    die ("Edit conflict on '$pageName' at '$editTime'!\n");
	}

	$::ua->cookie_jar->save();
	return ($response->content);
    }
    else
    {
	$date = `date`;
	$date =~ s/\n//g;
	myLog ("Bad response to POST to $pageName at $date.\n".$response->status_line."\n".$response->content."\n");

	# 50X HTTP errors mean there is a problem connecting to the wiki server
	if (($response->status_line =~ m/^500/)
	    or ($response->status_line =~ m/^502/)
	    or ($response->status_line =~ m/^503/))
	{
	    print "Bad response to POST to $pageName at $date.\n".$response->status_line."\n".$response->content."\n";
	    return(retry("postPage", @_));
	}
	else
	{
	    # Unhandled HTTP response
	    die ("Bad response to POST to $pageName at $date.\n".$response->status_line."\n");
	}
    }
}

 
 
sub myLog
{
    open (LOG, ">>pearle-wisebot.ersatz.log.txt") 
	|| die "Could not append to log!";
    print LOG $_[0];
    close (LOG);
}



# A call to this recursive function handles any retries necessary to
# wait out network or server problems.  It's a bit of a hack.
sub retry
{

    my ($callType, @args, $i, $normalDelay, $firstRetry,
	$secondRetry, $thirdRetry);

    ($callType, @args) = @_;

    ### ATTENTION ###
    # Increasing the speed of the bot to faster than 1 edit every 10
    # seconds violates English Wikipedia rules as of April, 2005, and
    # will cause your bot to be banned.  So don't change $normalDelay
    # unless you know what you are doing.  Other sites may have
    # similar policies, and you are advised to check before using your
    # bot at the default speed.
    #################

    # HTTP failures are usually an indication of high server load.
    # The retry settings here are designed to give human editors
    # priority use of the server, by allowing it ample recovering time
    # when load is high.

    # Time to wait before retry on failure, in seconds
    $normalDelay = 10;       # Normal interval between edits is 10 seconds
    $firstRetry = 60;        # First delay on fail is 1 minute
    $secondRetry = 60 * 10;  # Second delay on fail is 10 minutes
    $thirdRetry = 60 * 60;   # Third delay on fail is 1 hour
    
    # SUCCESS CASE
    # e.g. retry ("success", "getPage", "0.23");
    if ($callType eq "success")
    {
	myLog("Response time for ".$args[0]." (sec): ".$args[1]."\n");
	$::retryDelay = $normalDelay;

	if ($args[0] eq "postPage")
	{
	    # If the response time is greater than 20 seconds...
	    if ($args[1] > 20)
	    {
		print "Wikipedia is very slow.  Increasing minimum wait to 10 min...\n";
		myLog("Wikipedia is very slow.  Increasing minimum wait to 10 min...\n");
		
		$::speedLimit = 60 * 10;
	    }

	    # If the response time is between 10 and 20 seconds...
	    elsif ($args[1] > 10)
	    {
		print "Wikipedia is somewhat slow.  Setting minimum wait to 60 sec...\n";
		myLog("Wikipedia is somewhat slow.  Setting minimum wait to 60 sec...\n");
		
		$::speedLimit = 60;
	    }

	    # If the response time is less than 10 seconds...
	    else
	    {
		if ($::speedLimit > 10)
		{
		    print "Returning to normal minimum wait time.\n";
		    myLog("Returning to normal minimum wait time.\n");
		    $::speedLimit = 10;
		}
	    }
	}
	return();
    }

    # e.g. retry ("getPage", "George_Washington")
    # FAILURE CASES
    elsif (($::retryDelay == $normalDelay)
	   or ($::retryDelay == 0))
    {
	print "First retry for ".$args[0]."\n";
	myLog("First retry for ".$args[0]."\n");
	$::retryDelay = $firstRetry;
	$::speedLimit = 60 * 10;
    }
    elsif ($::retryDelay == $firstRetry)
    {
	print "Second retry for ".$args[0]."\n";
	myLog("Second retry for ".$args[0]."\n");
	$::retryDelay = $secondRetry;
	$::speedLimit = 60 * 10;
    }
    elsif ($::retryDelay == $secondRetry)
    {
	print "Third retry for ".$args[0]."\n";
	myLog("Third retry for ".$args[0]."\n");
	$::retryDelay = $thirdRetry;
	$::speedLimit = 60 * 10;
    }
    elsif ($::retryDelay == $thirdRetry)
    {
	print "Nth retry for ".$args[0]."\n";
	myLog("Nth retry for ".$args[0]."\n");
	$::retryDelay = $thirdRetry;
	$::speedLimit = 60 * 10;
    }
    else
    {
	die ("retry(): Internal error - unknown delay factor '".$::retryDelay."'\n");
    }

    # DEFAULT TO FAILURE CASE HANDLING
    
    $i = $::retryDelay;
    while ($i >= 0)
    {
	sleep (1);
	print STDERR "Waiting $i seconds for retry...\r";
	$i--;
    }
    print "                                     \r";

    # DO THE ACTUAL RETRY
    if ($callType eq "getPage")
    {
	return(getPage(@args));
    }
    elsif ($callType eq "postPage")
    {
	return(postPage(@args));
    }
    elsif ($callType eq "getCategoryArticles")
    {
	return(getCategoryArticles(@args));
    }
    elsif ($callType eq "getSubcategories")
    {
	return(getSubcategories(@args));
    }
    elsif ($callType eq "getURL")
    {
	return(getURL(@args));
    }
    else
    {
	myLog ("retry(): Unknown callType: $callType\n");
	die ("retry(): Unknown callType: $callType\n");
    }
}



# perl pearle.pl PRINT_WIKITEXT Article_you_want_to_get
## Warning: Saves to a file in the current directory with the same name
## as the article, plus another file with the .html extention.
sub printWikitext
{
    my ($editTime, $startTime, $text, $target, $token);

    $target = $_[0];

    $target =~ s/^\[\[://;
    $target =~ s/\]\]$//;

    ($text, $editTime, $startTime, $token) = getPage($target);

    # Save the wikicode version to a file.
    open (WIKITEXT, ">./${target}");
    print WIKITEXT $text;
    close (WIKITEXT);

    # Save the HTML version to a file.
    print `wget http://en.wikipedia.org/wiki/${target} -O ./${target}.html`;
}


    1;  # don't forget to return a true value from the file

Main code

[edit]

Invoke as (for example)

perl genClassTable.pl -d 2 -C Wikipedia:WikiProject_The_Beatles/Categories -a leaveOrdered.txt -q C:\shortprg\AWB\enwiki-20060303-categorylinks.sql -o bigone2c.txt >runlog2c.txt

Fillename: genClassTable.pl


#!/usr/bin/perl -w
#---------------------------------------------------------------------------#
# process files and generate a category table                       
# Author: Larry Pieniazek (IBM/Ascential Software) as hobby project
# Adapted from stuff I cribbed from all over.  
# (c)Larry Pieniazek 2006. This library is free software; you can redistribute it 
# and/or modify it under the same terms as Perl itself.  
# additionally, can be redistributed and modified under GFDL or CC-SA as you choose        
# 
# Abstract:
#	This perlscript is designed to parse category SQL dumps from wikipedia
#	which are found here: http://download.wikimedia.org/enwiki/
#   For example the 23 March dump is called 
#	  http://download.wikimedia.org/enwiki/20060323/enwiki-20060323-categorylinks.sql.gz
#   
#	The parsing is to generate article classification tables such as those found at
#	  http://en.wikipedia.org/wiki/Wikipedia_talk:WikiProject_The_Beatles/Article_Classification
#
#   In addition to the dump (currently must have been converted to linefeed delimited tuples)
#	the other input is a list of categories of interest, one per line.
#
#---------------------------------------------------------------------------#
use strict;

use WP_util_pearlebot;
use WP_util_ClassTable;

use Getopt::Std;
    

#---------------------------------------------------------------------------#
# Main routine -
#	process options
#	read in categories desired
#	build hash of articles by parsing SQL file
#	write out table file using hash           
#---------------------------------------------------------------------------#
# main
    my $rc=0;
    # print "prior to getopts\n";
	
    getopts('hvd:q:a:C:c:o:', \%WP_util_ClassTable::options) or WP_util_ClassTable::Usage(); # debug also d

    # print "post getopts, pre process\n";
    WP_util_ClassTable::ProcessOptions();
    		    		
    if ($WP_util_ClassTable::debug>1) { print "post process, pre read cat\n";	}
    
    if (defined $WP_util_ClassTable::options{'a'}) { # using page with art special key list
         WP_util_ClassTable::ReadArtKeyFile(); 
    } else { # no list to read so make it empty
        %WP_util_ClassTable::artSpecialKeyHash = ();
    }
    
    if (defined $WP_util_ClassTable::options{'C'}) { # using page with cat lists
        # $rc=FetchCatPage("Wikipedia:WikiProject_The Beatles/Categories");
        $rc=WP_util_ClassTable::FetchCatPage($WP_util_ClassTable::catArtPage);
        if ($rc) { die "error fetching category list from Wikipedia\n"; }
    } else {
        $rc=WP_util_ClassTable::ReadCatFile();
        if ($rc) { die "error reading category list\n"; }
    }

    $rc=WP_util_ClassTable::ParseSQL();
    if ($rc) { die "error reading SQL or building structure\n"; }
    
#    $rc=WP_util_ClassTable::WriteHash();
#    if ($rc) { die "error writing hash\n"; }    

    # exit 0;

    $rc=WP_util_ClassTable::WriteTable();
    if ($rc) { die "error building table\n"; }

exit 0;