User:OrphanBot/libBot.pl

From Wikipedia, the free encyclopedia

#!/usr/bin/perl

# libBot: A library of useful routines for running a bot

use strict;
use warnings;

require "libPearle2.pl";

my $test_only = 0;
my $username = "";

sub config
{
	my %params = @_;
	
	$test_only = $params{test_only} if(defined($params{test_only}));
	$username = $params{username} if(defined($params{username}));
}

# Log a warning on the talk page of the bot
sub userwarnlog
{
	my ($text, $editTime, $startTime, $token, $user, $summary, $session);
	$user = $_[1];
	$user = $username if(!defined($user));
	$summary = $_[2];
	$summary = "Logging warning message" if(!defined($summary));
	$session = $_[3];
	
	if(defined($session))
	{
		# We've been handed an editing session
		($text, $editTime, $startTime, $token) = @{$session};
		Pearle::myLog("Warning with existing edit session\n");
	}
	else
	{
		($text, $editTime, $startTime, $token) = Pearle::getPage("User talk:$user");
	}
	
	if($test_only)
	{
		print STDERR $_[0];
		return;
	}
	
	if($text =~ /^#redirect/i)
	{
		userwarnlog("*User talk page [[User talk:$user]] is a redirect\n");
		return;
	}
	$text .= $_[0];
	Pearle::postPage("User talk:$user", $editTime, $startTime, $token, $text, $summary, "no");
	print STDERR $_[0];
}

# Log a notification message to the console
sub notelog
{
	print STDERR @_;
}

# Fix all wikilinks in a string so that they shows as a link, not inline, if it's for a category or image
sub FixupLinks
{
	my $link = shift;
	$link =~ s/\[\[(Category|Image)/[[:$1/g;
	return $link;
}

# Make a string into a Wikipedia-compatible regex
sub MakeWikiRegex
{
	my $string = shift;
	# Escape metacharacters
	$string =~ s/\\/\\\\/g;
	$string =~ s/\./\\\./g;
	$string =~ s/\(/\\\(/g;
	$string =~ s/\)/\\\)/g;
	$string =~ s/\[/\\\[/g;
	$string =~ s/\]/\\\]/g;
	$string =~ s/\+/\\\+/g;
	$string =~ s/\*/\\\*/g;
	$string =~ s/\?/\\\?/g;
	$string =~ s/\^/\\\^/g;
	$string =~ s/\$/\\\$/g;
	# Process the string to match both with spaces and with underscores
	$string =~ s/[ _]/[ _]+/g;

	# Process the string to match both upcase and lowercase first characters
	if($string =~ /^[A-Za-z]/)
	{       
		$string =~ s/^(.)/"[$1".lc($1)."]"/e;
	}
	return $string;
}

# Check for new talk page messages
sub DoIHaveMessages
{
	my $text = shift;
	if($text =~ /<div class="usermessage">You have/)
	{
		return 1;
	}
	else
	{
		return 0;
	}
}


sub GetPageList
{
	my $image = shift;
	my $image_text = shift;
	my @pages = ();
	# Extract the page links
	# <ul><li><a href="/wiki/Lee_Hyori" title="Lee Hyori">Lee Hyori</a></li>
	# <li><a href="/wiki/Daesung_Entertainment" title="Daesung Entertainment">Daesung Entertainment</a></li>
	# </ul>
	while($image_text =~ /<li><a href="(\/wiki\/[^"]+)" title="([^"]+)">/g)
	{
		my $title;
		$title = $2;
		# Unescape any HTML entities in the title
		$title =~ s/</</g;
		$title =~ s/>/>/g;
		$title =~ s/"/"/g;
		$title =~ s/&/&/g;

		notelog("Matched article $title\n");

		# Filter out bad namespaces
		if($title =~ /^(User:|Talk:|User talk:|Template talk:|Image:|Image talk:|Category talk:|Wikipedia:|Wikipedia talk:|Portal talk:)/)	# Leave these alone
		{
			notelog("Ignoring [[$title]] due to namespace\n");
		}
		elsif($title =~ /^Special:/)
		{
			# Ignore Special: pages completely
		}
		elsif($title =~ /^(MediaWiki:|MediaWiki talk:|Template:|Help:|Help talk:)/)		# Log a warning about these, but otherwise leave them alone
		{
			userwarnlog("*Found image [[:$image]] in [[$title]]\n");
		}
		else	# Good namespaces: article, Category:, Portal:
		{
			push @pages, $title;
		}
	}
	return @pages;
}

# Get all pages.  Don't filter for bad namespaces.
sub GetFullPageList
{
	my $image = shift;
	my $image_text = shift;
	my @pages = ();
	# Extract the page links
	# <ul><li><a href="/wiki/Lee_Hyori" title="Lee Hyori">Lee Hyori</a></li>
	# <li><a href="/wiki/Daesung_Entertainment" title="Daesung Entertainment">Daesung Entertainment</a></li>
	# </ul>
	while($image_text =~ /<li><a href="(\/wiki\/[^"]+)" title="([^"]+)">/g)
	{
		my $title;
		$title = $2;
		# Unescape any HTML entities in the title
		$title =~ s/</</g;
		$title =~ s/>/>/g;
		$title =~ s/"/"/g;
		$title =~ s/&/&/g;

		notelog("Matched article $title\n");

		push @pages, $title;
	}
	return @pages;
}

sub SaveImage
{
	my $image = shift;
	my $image_text = shift;
	my $image_path = shift;
	
	my $image_url;
	
	($image_url) = $image_text =~ /<a href="(http:\/\/upload\.wikimedia\.org\/wikipedia\/en\/[^"]+)"/;
	if(defined($image_url))
	{
		my $filename;
		my $image_data;
		notelog("Fetching image $image_url\n");
		($filename) = $image_url =~ /(\/[^\/]+)$/;
		$filename = $image_path . $filename;
		if(! -e $filename)
		{
			if($test_only)
			{
				notelog("Would save to $filename...");
			}
			else
			{
				$image_url = Pearle::urlDecode($image_url);
				$image_data = Pearle::getURL($image_url);
				notelog("Saving to $filename...");
				if(defined($filename) and $filename)
				{
					open OUTFILE, ">", $filename;
					print OUTFILE $image_data;
					close OUTFILE;
					notelog("Image saved\n");
					Pearle::myLog("Image $image saved as $filename\n");
				}
				else
				{
					notelog("Failed\n");
				}
			}
		}
		else
		{
			notelog("File already exists\n");
		}
	}			
}

sub RemoveImageFromPage
{
	my $image = shift;
	my $page = shift;
	my $image_regex = shift;
	my $removal_prefix = shift;
	my $removal_comment = shift;

	my ($text, $editTime, $startTime, $token);
	my ($match1, $match2);
	my $old_length;
	my $new_length;
	my $change_len;
	my $match_len;

	# Fetch an article page
	($text, $editTime, $startTime, $token) = Pearle::getPage($page);
	
	if(!defined($text))
	{
		Pearle::myLog("Error: Bad edit page [[$page]]\n");
		userwarnlog(FixupLinks("*Error: Bad edit page [[$page]]\n"));
		sleep(300);
		return 0;
	}
	
	if($text =~ /^\s*$/)
	{
		# Might be protected instead of empty
		Pearle::myLog("Error: Empty page [[$page]]\n");
		userwarnlog(FixupLinks("*Error: Empty page [[$page]]\n"));
		sleep(300);
		return 0;
	}
	
	if($text =~ /^#redirect/i)
	{
		Pearle::myLog("Redirect found for page [[$page]] (image [[:$image]])\n");
		userwarnlog(FixupLinks("*Redirect found for page [[$page]] (image [[:$image]])\n"));
		return 0;
	}

	# Remove the image
	my $regex3 = "(\\[\\[${image_regex}.*?(\\[\\[.*?\\]\\].*?|)+\\]\\][ \\t]*)";	# Regex to match images
	my $regex3ex = "\\w[ \\t]*${regex3}[ \\t]*\\w";									# Regex to try to spot inline images
	my $regex3c = "<!--.*${regex3}.*-->";											# Regex to spot images in comments
	my $regex3g = "(${image_regex}.*)";												# Regex to match gallery images
	my $regex3gc = "<!--.*${regex3g}-->";											# Regex to spot gallery images in comments
	my ($raw_image) = $image =~ /Image:(.*)/;	
	my $regex4a = "([Cc]over\\s*=\\s*)" . MakeWikiRegex($raw_image);
	my $regex4b = "(image_skyline\\s*=\\s*)" . MakeWikiRegex($raw_image);
	my $regex4i = "(image\\s*=\\s*)" . MakeWikiRegex($raw_image);						# Regex to match "image = " sections in infoboxes
	my $regex4p = "(picture\\s*=\\s*)" . MakeWikiRegex($raw_image);					# Regex to match "picture = " sections in infoboxes

	my $regex4m = "\\[\\[[ _]*[Mm]edia[ _]*:[ _]*" . MakeWikiRegex($raw_image) . "[ _]*\\|([^]]*)\\]\\]";	# Regex to match inline Media: links
	my $regex4g =  "(img\\s*=\\s*)" . MakeWikiRegex($raw_image);	# Regex to match "img = " sections in infoboxes
	Pearle::myLog("Regex 3: $regex3\n");
	notelog("Regex 3: $regex3\n");
	notelog("Regex 3 extended: $regex3ex\n");
	notelog("Regex 3 gallery: $regex3g\n");
	Pearle::myLog("Raw regex: $raw_image\n");
	notelog("Regex 4 Album: $regex4a\n");
	notelog("Regex 4 City: $regex4b\n");
	notelog("Regex 4 Image: $regex4i\n");
	notelog("Regex 4 Media: $regex4m\n");
	notelog("Regex 4 Picture: $regex4p\n");
	notelog("Regex 4 Img: $regex4g\n");
	
	if($text =~ /$regex3ex/)
	{
		Pearle::myLog("Possible inline image in [[$page]]\n");
		userwarnlog(FixupLinks("*Possible inline image [[:$image]] in [[$page]]\n"));
		return 0;	# Can't do gallery matching because that also matches regular images, and odds are, we don't have an infobox
	}
	
	if($text =~ /$regex3c/ or $text =~ /$regex3gc/)
	{
		Pearle::myLog("Image in comment in [[$page]]\n");
#		userwarnlog(FixupLinks("*Image in comment in [[$page]]\n"));
		return 0;	# Can't do gallery matching because that also matches regular images
	}
	
	$text =~ /$regex3/;
	$match_len = length($1);
	$match2 = $text =~ s/$regex3/<!-- $removal_prefix $1 -->/g;

	$new_length = length($text);
	print "Num: $match2 Len: $match_len\n";
	if($match2)
	{
		# If a whole lot of text was removed, log a warning
		if($match_len > (500 + length($image)))
		{
			userwarnlog(FixupLinks("*Long caption of $match_len bytes replaced in [[$page]]\n"));
			if($match_len > (1000 + length($image)))
			{
				notelog("Unusually long caption found.  Exiting.\n");
				Pearle::myLog("Unusually long caption of $match_len found in [[$page]] ($match2 matches).\n");
				exit;
			}
		}
		if($match_len < (4 + length($image)))
		{
			notelog("*Short replacement of $match_len bytes in [[$page]]\n");
			Pearle::myLog("Short replacement of $match_len bytes (min " . (length($image) + 4) . ") in [[$page]] ($match2 matches).  Exiting.\n");
			Pearle::myLog("Text:\n$text\n");
			exit;
		}
		# If many matches, log a warning
		if($match2 > 2)
		{
			Pearle::myLog("More than one match ($match2) in page [[$page]]\n");
#			userwarnlog(FixupLinks("*More than one match ($match2) in page [[$page]]\n"));
		}
		if($match2 > 100)
		{
			Pearle::myLog("Too many matches ($match2) in page [[$page]].  Skipping.\n");
			userwarnlog("Too many matches ($match2) in page [[$page]].  Skipping.\n");
			return 0;
		}
		# If there might be a reference, log a warning
#		if($text =~ /(?:see (?:image|picture|graph|diagram|right|left)|\(left\)|\(right\)|\(below\)|\(above\))/)
#		{
#			Pearle::myLog("Possible image reference in page [[$page]]\n");
#			userwarnlog("*Possible image reference in page [[$page]]\n");
#		}
		if($text =~ /-->\]/)
		{
			Pearle::myLog("Possible bracket mixup in page [[$page]]\n");
			userwarnlog(FixupLinks("*Possible bracket mixup in page [[$page]]\n"));
		}
#		if($text =~ /\[\[(?: |)<!--/)
#		{
#			Pearle::myLog("Possible multiline image in page [[$page]]\n");
#			userwarnlog(FixupLinks("*Possible multiline image in page [[$page]]\n"));
#		}
	}
	elsif($text =~ /<gallery/)
	{
		Pearle::myLog("*Possible image gallery in page [[$page]]\n");
		if($text =~ s/$regex3g/<!-- $removal_prefix $1 -->/)
		{
			$match2 += 1;
		}
	}

	if($match2 > 0)
	{
		if($text =~ /\[\[(?: |)<!--/)
		{
			Pearle::myLog("Possible multiline image in page [[$page]]\n");
			userwarnlog(FixupLinks("*Possible multiline image in page [[$page]]\n"));
		}
	}

	# Infobox removal
	if($text =~ /{{Album[ _]infobox|{{Infobox[ _]Album/i)
	{
		if($text =~ s/$regex4a/$1/)
		{
			Pearle::myLog("*Album infobox in page [[$page]]\n");
			$match2 += 1;
		}
	}
	if($text =~ /{{Infobox[ _]City/i)
	{
		if($text =~ s/$regex4b/$1/)
		{
			Pearle::myLog("*City infobox in page [[$page]]\n");
			$match2 += 1;
		}
	}
	if($text =~ /{{Taxobox/i)
	{
		if($text =~ s/$regex4i/$1/)
		{
			Pearle::myLog("*Taxobox in page [[$page]]\n");
			$match2 += 1;
		}
	}
	if($text =~ /{{NFL[ _]player/i)
	{
		if($text =~ s/$regex4i/$1/i)
		{
			Pearle::myLog("*NFL Playerbox in page [[$page]]\n");
			$match2 += 1;
		}
	}
	if($text =~ /{{Infobox[ _]President/i)
	{
		if($text =~ s/$regex4i/$1/i)
		{
			Pearle::myLog("*Presidentbox in page [[$page]]\n");
#			userwarnlog("*Presidentbox in page [[$page]]\n");
			$match2 += 1;
		}
	}
	if($text =~ /{{Infobox[ _]Cricketer/i)
	{
		if($text =~ s/$regex4p/picture = cricket no pic.png/i)
		{
			Pearle::myLog("*Cricketer in page [[$page]]\n");
#			userwarnlog("*Cricketer in page [[$page]]\n");
			$match2 += 1;
		}
	}
	if($text =~ /{{Infobox[ _]Celebrity/)
	{
		if($text =~ s/$regex4i/$1/i)
		{
			Pearle::myLog("*Celebrity in page [[$page]]\n");
			$match2 += 1;
		}
	}
	if($text =~ /{{Infobox[ _]Wrestler/)
	{
		if($text =~ s/$regex4i/$1/i)
		{
			Pearle::myLog("*Wrestler in page [[$page]]\n");
			$match2 += 1;
		}
	}
	if($text =~ /{{Infobox musical artist 2/)
	{
		if($text =~ s/$regex4g/$1/i)
		{
			Pearle::myLog("*InfoMusArt2 in page [[$page]]\n");
			$match2 += 1;
		}
	}
	if($text =~ /{{Infobox Model/)
	{
		if($text =~ s/$regex4i/$1/i)
		{
			Pearle::myLog("*Model in page [[$page]]\n");
			$match2 += 1;
		}
	}

	if($match2)	# No need to null-edit articles anymore
	{
		if($test_only)
		{
			notelog("Test removal from page succeeded\n");
		}
		else
		{
			# Submit the changes
			Pearle::postPage($page, $editTime, $startTime, $token, $text, $removal_comment, "no");
		}
	}
	
	return ($match2)
}

# Returns 1 if the user has been notified, or a reference to the userpage edit session if they haven't
sub isNotified
{
	my $image_text = shift;
	my $uploader = shift;
	my $image_regex = shift;
	my $image_name = shift;
	my $notes_ref = shift;
	my $donts_ref = shift;

	# Check notification list
	if($notes_ref->{"$uploader,$image_name"})
	{
		notelog("Already notified for this image\n");
		return 1;
	}

	if($donts_ref->{$uploader})
	{
		notelog("On exception list\n");
		Pearle::myLog("On exception list: $uploader\n");
		return 1;
	}
	
	# Check uploader's talkpage
	my ($text, $editTime, $startTime, $token) = Pearle::getPage("User talk:$uploader");
	if($text =~ /$image_regex/)
	{
		notelog("Already notified by someone else\n");
		$donts_ref->{"$uploader,$image_name"} = 1;
		return 1;
	}
	else
	{
		print "Not already notified\n";
		return [$text, $editTime, $startTime, $token];
	}
}

sub isDated
{
	my $image_text = shift;
	if($image_text =~ /\((\d\d?) (\w*) (\d\d\d\d)\)/)	# Dated template
	{
		print "Dated tag $1 $2 $3\n";
		return 1;
	}
	# as of 6 October 2006">
	elsif($image_text =~ /as of (\d\d?) (\w*) (\d\d\d\d)/) # Template borked, working off category
	{
		print "Template borked; category $1 $2 $3\n";
		return 1;
	}
	elsif($image_text =~ /{{{day}}} {{{month}}} \d\d\d\d/ or $image_text =~ /\( 2006\)/)	# Generic template
	{
		print "Generic tag\n";
		return 0;
	}
	else
	{
		print "No tag match\n";
		return 0;
	}
}

# Return the tag date if there is one, the upload date if not
# Returns in (day, month, year) format
sub getDate
{
	my $image_text = shift;
	if($image_text =~ /\((\d\d?) (\w*) (\d\d\d\d)\)/)
	{
		print "Template date $1-$2-$3\n";
		return ($1, $2, $3);
	}
	elsif($image_text =~ /as of (\d\d?) (\w*) (\d\d\d\d)/) # Template borked, working off category
	{
		print "Category date $1-$2-$3\n";
		return ($1, $2, $3);
	}
	elsif($image_text =~ />\d\d?:\d\d, (\d\d?) (\w*) (\d\d\d\d)</)
	{
		print "Upload date $1-$2-$3\n";
#		return ($1, $2, $3);
		# For now, be conservative:
		my ($year, $month, $day) = Today();
		return ($day, Month_to_Text($month), $year);
	}
	else
	{
		print "No date\n";
		return (1, "January", 2006);
	}
}

# Return a list of upload dates
sub getUploadDates
{
	my @dates;
	my $image_text = shift;
	while($image_text =~ />\d\d?:\d\d, (\d\d?) (\w*) (\d\d\d\d)</g)
	{
		push @dates, [$1, $2, $3];
	}
	return @dates;
}

sub getLastEditDate
{
	my ($day, $month, $year);
	my $image = shift;
	
	my @history = Pearle::parseHistory($image);
	(undef, $day, $month, $year) = @{$history[0]};
	
	return ($day, $month, $year);
}

# Find the most recent non-vandal, non-revert uploader
sub getUploader
{
	my $image_text = shift;
	my ($uploader, $dims, $bytes, $comment);
	my @uploaders;
	my $uploader_data;
	my $i = 0;
	
	# title="User:Jamie100">Jamie100</a> (<a href="/wiki/User_talk:Jamie100" title="User talk:Jamie100">Talk</a>) . . 424x216 (25800 bytes) <span class='comment'>(Reverted to earlier revision)</span></li>
	
#	while($image_text =~ />([^<]+?)<\/a> \(<a href="[^"]+?" (?:class="new" |)title="[^"]+?">Talk<\/a>\) \. \. (\d+x\d+) \(([0-9,]+) bytes\)(?: <span class="comment">([^<]*)|)</g)
	while($image_text =~ />([^<]+?)<\/a> \(<a href="[^"]+?" (?:class="new" |)title="[^"]+?">Talk<\/a> \| <a href="[^"]*" title="[^"]*">contribs<\/a>\) \. \. (\d+.+?\d+) \(([0-9,]+) bytes\)(?: <span class="comment">([^<]*)|)</g)
	{
		($uploader, $dims, $bytes, $comment) = ($1, $2, $3, $4);
		$bytes =~ s/,//g;						# Remove commas to turn into a real number
		$comment = "" if(!defined($comment));	# Reduce warnings
		push @uploaders, [$uploader, $dims, $bytes, $comment];
		notelog("Uploader found: $uploader, $dims, $bytes, $comment\n");
		$i++;
		die "Too many uploaders: $i\n" if($i > 100);
	}
	my $max = scalar(@uploaders);
	print $max, "\n";
	for($i = 0; $i < $max; $i++)
	{
		$uploader = $uploaders[$i][0];
		if($uploaders[$i][3] =~ /Reverted/)
		{
			$dims = $uploaders[$i][1];
			$bytes = $uploaders[$i][2];
			notelog("Revert found: $uploader, $dims, $bytes\n");
			$i++;
			while(($dims ne $uploaders[$i][1] or $bytes ne $uploaders[$i][2]) and $i < $max)
			{
				notelog("Reversion data: $uploaders[$i][1], $uploaders[$i][2], $i\n");
				$uploader = $uploaders[$i][0];
				$i++;
			}
		}
		elsif($uploaders[$i][3] =~ /optimi(z|s)|adjust|tweak|scale|crop|change|resize/i)
		{
			notelog("Optimize found.  Skipping.\n");
		}
		else
		{
			notelog("Uploader: $uploader ($i)\n");
			last;
		}
	}
	$uploader = undef if($i >= $max);
	
	print "Uploader: $uploader\n";
	return $uploader;
}

# See if the specified category exists, and if not, create it
sub checkImageCategory
{
	my $cat;
	my ($text, $editTime, $startTime, $token);
	$cat = "Category:Images with unknown source as of $_[0] $_[1] $_[2]";
	
	($text, $editTime, $startTime, $token) = Pearle::getPage($cat);
	if($text !~ /\[\[[Cc]ategory:[Ii]mages with unknown source/)
	{
		$text .= "\n[[Category:Images with unknown source| ]]\n";
		if($test_only)
		{
			notelog("Would create category [[:$cat]]\n");
		}
		else
		{
			Pearle::postPage($cat, $editTime, $startTime, $token, $text, "Created category", "no");
			userwarnlog("*Created category [[:$cat]]\n");
		}
	}
}


sub loadNotificationList
{
	my $file = shift;
	my %notelist;
	my $i = 0;
	notelog("File: $file\n");
	open INFILE, "<", $file;
	while(<INFILE>)
	{
		$_ =~ s/\s*#.*$//g;
		chomp;
		$notelist{$_} = 1;
		$i++;
	}
	close INFILE;
	notelog("$i notifications loaded\n");
	return %notelist;
}

sub saveNotificationList
{
	return if($test_only);
	
	my $file = shift;
	my %notelist = @_;
	my $key;
	
	open OUTFILE, ">", $file;
	foreach $key (keys(%notelist))
	{
		print OUTFILE "$key\n";
	}
	close OUTFILE;
}

1;