User:OrphanBot/tagbot.pl

From Wikipedia, the free encyclopedia

Source code for OrphanBot's upload-tagging task. Requires libBot.pm and Pearle.pm.

#!/usr/bin/perl

# Tagbot
#
# A bot to identify and tag recently-uploaded images that have no image description page, source information, or copyright tag.


use strict;
use warnings;

use Date::Calc qw(Month_to_Text Today);
use Array::Utils;
use utf8;

use Data::Dumper;

use libBot;

binmode STDOUT, ":utf8";

my $permit_interruptions = 0;	# Allow talkpage messages to stop the bot?  Checked in the main loop together with DoIHaveMessages().

my ($cur_y, $cur_m, $cur_d);	# Current year, month (converted to its name during initialization) and day; substituted into tag text by tokenSubst

my %users_notified;				# List of users notified.  0, undef = no; 1 = notified once; 2 = notified and second notice.  NOTE(review): not referenced elsewhere in the visible source -- confirm it is still needed
my %notifications;				# List of user,image pairs, used to ensure that no user is ever notified about an image twice.  NOTE(review): not referenced elsewhere in the visible source -- confirm it is still needed
my %dont_notify = ();				# List of users to never notify
my %banned_users = ();				# List of users banned from uploading
my %exempt_users = ();				# List of users exempt from inspection

# For each tag category below, the array holds the entries read from the
# category's list file and the scalar of the same name holds the regex
# alternation built from that array by processTagList.
my %unknown_tags;				# List of tags found that are not in either the "good" or "bad" list
my @sourcereq_tags;				# List of tags that require a separate source
my $sourcereq_tags;
my @nosource_tags;				# List of self-sourcing tags
my $nosource_tags;
my @deletion_tags;				# Tags that will eventually lead to the deletion of the image
my $deletion_tags;
my @forbidden_tags;				# List of tags that should never be seen
my $forbidden_tags;
my @deprecated_tags;				# List of tags that shouldn't be used any more
my $deprecated_tags;	
my @nontags;					# List of tags that aren't copyright tags
my $nontags;
my @source_tags;					# List of tags that provide source but not copyright status
my $source_tags;

# Read a tag list file and return its entries.
#
# Parameters:
#   $filename - path of the list file: one entry per line, with "#" starting
#               a comment that runs to the end of the line
# Returns:
#   The list of non-empty entries, with comments and surrounding whitespace
#   removed.  If the file cannot be opened, a warning naming the file is
#   issued and an empty list is returned.
sub loadTagList
{
	my $filename = shift;
	my @list = ();

	# A lexical handle and a named line variable keep this sub from touching
	# the package-global INFILE handle or the caller's $_.
	my $fh;
	if(!open($fh, "<", $filename))
	{
		warn "Cannot open tag list '$filename': $!\n";
		return @list;
	}
	while(my $line = <$fh>)
	{
		$line =~ s/#.*//;		# Remove comments
		$line =~ s/^\s*//;		# Remove leading whitespace
		$line =~ s/\s*$//;		# Remove trailing whitespace (including the newline)
		push @list, $line if(length($line) > 0);
	}
	close $fh;
	return @list;
}

# Build a regex alternation string from a list of tag names.
#
# Parameters:
#   @_ - tag names as returned by loadTagList.  "*" is a wildcard; every other
#        character is matched literally.
# Returns:
#   A string of the form "(tag1|tag2|...)" suitable for interpolation into a
#   pattern such as /:$tags$/i.
sub processTagList
{
	my @patterns;
	foreach my $tag (@_)
	{
		# Escape every regex metacharacter, not just "(", ")" and ".", so
		# names containing "+", "?", "[", "^", "$" or "\" match themselves.
		my $pattern = quotemeta($tag);
		# quotemeta turned each "*" into "\*"; turn those into a
		# non-greedy wildcard.
		$pattern =~ s/\\\*/.*?/g;
		push @patterns, $pattern;
	}
	return "(" . join("|", @patterns) . ")";
}


# Load every tag category from its list file and build the matching regex
# alternation.  Each row: list file, destination array, destination pattern
# string, and the label printed once the category has been loaded.
foreach my $tag_set (
	["sourcereq.tags",  \@sourcereq_tags,  \$sourcereq_tags,  "Sourcereq"],
	["nosource.tags",   \@nosource_tags,   \$nosource_tags,   "Nosource"],
	["forbidden.tags",  \@forbidden_tags,  \$forbidden_tags,  "Forbid"],
	["deletion.tags",   \@deletion_tags,   \$deletion_tags,   "Deletion"],
	["deprecated.tags", \@deprecated_tags, \$deprecated_tags, "Deprecated"],
	["nontags.tags",    \@nontags,         \$nontags,         "Nontags"],
	["source.tags",     \@source_tags,     \$source_tags,     "Sourcetags"],
)
{
	my ($list_file, $list_ref, $pattern_ref, $label) = @$tag_set;

	@$list_ref = loadTagList($list_file);
	$$pattern_ref = processTagList(@$list_ref);
	print "${label}: Loaded\n\n";
}


# Fill in the placeholder tokens of a message or tag template.
#
# Parameters:
#   $string - text that may contain <IMAGE>, <DAY>, <MONTH> and <YEAR>
#   $image  - optional image title; <IMAGE> is left alone when it is undef
# Returns:
#   The text with <IMAGE> replaced first (if an image was given), then <DAY>,
#   <MONTH> and <YEAR> replaced from the file-level $cur_d, $cur_m, $cur_y.
sub tokenSubst
{
	my ($text, $image_name) = @_;

	if(defined($image_name))
	{
		$text =~ s/<IMAGE>/$image_name/g;
	}

	# Applied one after another, in this order, exactly like the token list reads
	foreach my $pair (["DAY", $cur_d], ["MONTH", $cur_m], ["YEAR", $cur_y])
	{
		my ($token, $value) = @$pair;
		$text =~ s/<$token>/$value/g;
	}

	return $text;
}

# Read a user list file and return it as a user => reason mapping.
#
# Parameters:
#   $file - path of the list file.  Each useful line is a user name, one or
#           more tabs, then a reason; "#" starts a comment.  Lines that do
#           not have that shape are skipped.
# Returns:
#   A hash (as a flat list) mapping user name to reason.  If the file cannot
#   be opened, the failure is logged and an empty list is returned.
sub loadUserList
{
	my $file = shift;
	my %notelist;
	my $i = 0;
	Pearle::myLog(4, "File: $file\n");

	# A lexical handle and a named line variable keep this sub from touching
	# the package-global INFILE handle or the caller's $_.
	my $fh;
	if(!open($fh, "<", $file))
	{
		Pearle::myLog(1, "Could not open user list $file: $!\n");
		return %notelist;
	}
	while(my $line = <$fh>)
	{
		$line =~ s/\s*#.*$//g;		# Remove comments and the whitespace before them
		chomp $line;
		my ($user, $reason) = $line =~ /([^\t]*)\t+(.*)/;
		next if(!defined($user) or !defined($reason));
		$notelist{$user} = $reason;
		$i++;
	}
	close $fh;
	Pearle::myLog(3, "$i notifications loaded\n");
	return %notelist;

}

# Initialize
# Today's date in GMT; the month number is replaced by its name
($cur_y, $cur_m, $cur_d) = Today(1);
$cur_m = Month_to_Text($cur_m);

# Set up the wiki access layer: credentials, log file and cookie jar
Pearle::init("username", "password", "tagbot.log","cookies.tagbot.txt");
Pearle::config(
	nullOK      => 1,
	sanityCheck => 1,
	loglevel    => 3,
	printlevel  => 4,
	testmode    => 0,
);
config(username => 'username');

Pearle::myLog(2, "Beginning execution\n");

# User lists consulted while processing uploads
%dont_notify = loadNotificationList("orphanbot.whitelist");
%banned_users = loadUserList("banneduser.list");
%exempt_users = loadUserList("exemptuser.list");

# Nothing can be done without a session
Pearle::login() or exit;

# Get the day's uploads
my @articles = Pearle::getLogArticles(log => 'upload', limit => 150);
# Chop off the 20 most recent log entries
splice(@articles, 0, 20);
Pearle::myLog(3, scalar(@articles) . " images found\n");


# Report whether $text contains a template parameter named $param whose value
# holds at least four characters other than "|" and "}".
#
# Parameters:
#   $text  - wikitext of the image description page (may be undef)
#   $param - parameter name to look for; matched literally, case-insensitively
# Returns:
#   1 if such a filled-in parameter was found, 0 otherwise (including when
#   $text is undef).
sub _paramFilledIn
{
	my ($text, $param) = @_;
	return 0 if(!defined($text));
	return ($text =~ /\|\s*\Q$param\E\s*=\s*[^|}]{4,}/i) ? 1 : 0;
}

# Add problem tags to an image description page and, unless IsNotified says
# otherwise, leave a message on the uploader's talk page.
#
# Parameters:
#   $image         - title of the image page
#   $uploader      - name of the uploading user
#   $image_tags    - text to add to the image page; <DAY>/<MONTH>/<YEAR> tokens are filled in
#   $image_summary - summary passed to wikilog for the image page
#   $talk_message  - text to add to the talk page; <IMAGE> and date tokens are filled in
#   $talk_summary  - summary passed to wikilog for the talk page
# Returns:
#   Nothing useful.
sub _tagAndNotify
{
	my ($image, $uploader, $image_tags, $image_summary, $talk_message, $talk_summary) = @_;

	wikilog($image, tokenSubst($image_tags), $image_summary);
	if(!IsNotified($uploader, undef, $image, undef, \%dont_notify))
	{
		Pearle::myLog(2, "Warning user $uploader\n");
		wikilog("User talk:$uploader", tokenSubst($talk_message, $image), $talk_summary);
	}
	return;
}

# Main loop: examine each upload log entry in turn.  For every image this
#  1. skips uploads that can be dismissed from the log entry alone,
#  2. fetches the description page text and its templates,
#  3. works out whether the page has a source, a license and a license tag,
#  4. tags the image and warns the uploader when something is missing.
foreach my $log_entry (@articles)
{
	my $image = $log_entry->[0];
	my $uploader = $log_entry->[1];
	my $summary = $log_entry->[2] || "";
	
	print "$image\n";
	print "$uploader\n";
	
	Pearle::myLog(2, "Processing image $image\n");
	
	# Basic checks that can be done from the log alone
	
	# Non-terminating check: Was the image uploaded by a blacklisted user?
	if($banned_users{$uploader})
	{
		botwarnlog("*Image [[:$image]] uploaded by blacklisted user [[User:$uploader]]\n");
		Pearle::myLog(3, "Upload by banned user $uploader\n");
	}
	
	# Terminating check: Is the user on the whitelist?
	if($exempt_users{$uploader})
	{
		Pearle::myLog(2, "Upload by exempt user $uploader found.\n");
		next;
	}
	
	# Terminating check: Is the upload a modification?
	if($summary =~ /optimi(z|s)ed using (optipng|PNGCrusher)/i)
	{
		Pearle::myLog(2, "Optimize upload found for image $image\n");
		next;
	}
	if($summary =~ /tweak|crop|scale|adjust|change|resize|corrected|correcting/i)
	{
		Pearle::myLog(2, "Tweak found for image $image\n");
		next;
	}	
	
	# Terminating check: Is the upload a revert?
	if($summary =~ /Reverted to earlier revision|Reverted to version/)
	{
		Pearle::myLog(2, "Revert upload found for image $image\n");
		next;
	}
	
	# Get page data
	my $image_data = Pearle::APIQuery(titles => $image, prop => ['templates', 'revisions'],
	                                  tllimit => 500,						# All the templates
	                                  rvprop => ['content'],					# Article body
	                                  meta => 'userinfo', uiprop => ['hasmsg'], 			# Check for talkpage messages
	                                  redirects => 1,						# Resolve redirects
	                                  );
	
	if(!defined($image_data))
	{
		Pearle::myLog(1, "Server did not return an appropriate response.\n");
		next;
	}
	my $parsed_xml = Pearle::getXMLParser()->XMLin($image_data, ForceArray => ['tl'] );
	Pearle::myLog(4, Dumper($parsed_xml));
	
	my $page_text = GetPageText($parsed_xml);
	my @templates = GetPageTemplates($parsed_xml);
	
	# Remove non-tags from template list
	# TODO: Remove redlinks
	@templates = grep {$_ !~ /:$nontags$/i} @templates;
	
	# Literal braces are escaped: newer Perls deprecate or reject an
	# unescaped "{" in a pattern.
	my $stripped_page_text = $page_text || "";
	$stripped_page_text =~ s/^==.*?==//gm;			# Remove section headers
	$stripped_page_text =~ s/\n//g;				# Remove newlines
	$stripped_page_text =~ s/\{\{\{[^}]+\}\}\}//g;		# Remove template parameters
	$stripped_page_text =~ s/\{\{[^}]+\}\}//gi;		# Remove templates
	
	Pearle::myLog(4, "Templates: " . join(", ", @templates) . "\n");
	Pearle::myLog(4, "Stripped text: $stripped_page_text\n");
	print "=============================================================================\n";

	# Check for interruptions
	if($permit_interruptions and DoIHaveMessages($image_data))
	{
		Pearle::myLog(0, "Talkpage message found; exiting on image $image.\n");
		last;
	}
	
	# Sanity check: Does the image still exist?
	if(defined($parsed_xml->{query}->{pages}->{page}->{missing}))
	{
		Pearle::myLog(2, "Image $image has already been deleted\n");
		next;
	}
		
	# Sanity check: Is the image marked for deletion?
	if(grep {$_ =~ /:$deletion_tags$/i} @templates)
	{
		# We don't do anything with images already marked for deletion.  There are just too many corner cases and wasted-effort conditions.
		Pearle::myLog(2, "Deletion tag found\n");
		next;
	}

	# Check for red flag: "Google Image" (matches 'image', 'images', 'imagesearch')
	if(defined($page_text) and $page_text =~ /google image/i)
	{
		Pearle::myLog(2, "Image $image has red-flag keyword 'google image'\n");
		botwarnlog("* Image [[:$image]] has red-flag keyword 'google image'\n");
	}

	# Check for red flag: those goddamn navboxes
	if(grep {$_ =~ /navbox/i} @templates)
	{
		botwarnlog("* Navbox found on [[:$image]]\n");
	}
	
	######### Check for source, license, and tag ###################
	
	# Meanings: "undef" = we don't know, "0" = definitely no, "1" = probably yes
	my $has_source = undef;
	my $has_license = undef;
	my $has_tag = undef;
	
	# Does the image lack a description page?
	if(!defined($page_text) or $page_text =~ /^\s*$/)
	{
		Pearle::myLog(3, "Empty IDP\n");
		$has_source = 0;
		$has_license = 0;
		$has_tag = 0;
	}
	
	# Does the image have a source-providing template?
	if(my @tags = grep {$_ =~ /:$source_tags$/i} @templates)
	{
		Pearle::myLog(3, "Source-providing template @tags found\n");
		$has_source = 1;
		
		# Remove the tags from the candidate set: they can't keep an image from being "untagged" or having unknown tags
		@templates = Array::Utils::array_diff(@templates, @tags);
	}

	# Does the image have a self-sourcing tag?
	if(my @tags = grep {$_ =~ /:$nosource_tags$/i} @templates)
	{
		Pearle::myLog(3, "Self-sourcing tag @tags found\n");
		$has_source = 1;
		$has_license = 1;
		$has_tag = 1;
	}
	
	# Does the image have a sourcereq tag?
	if(my @tags = grep {$_ =~ /:$sourcereq_tags$/i} @templates)
	{
		Pearle::myLog(3, "Sourcereq tag @tags found\n");
		$has_license = 1;
		$has_tag = 1;
	}
	
	# Handle those damned "Information" and "Non-free use rationale" tags
	if(grep {$_ =~ /:Information$/} @templates)
	{
		Pearle::myLog(3, "Has an Information template\n");
		# Remove the template from the list
		@templates = grep {$_ !~ /:Information$/} @templates;
		# Attempt to parse an "information" template:
		# a filled-in "source" or "author" parameter is taken as a source
		foreach my $param (qw(source author))
		{
			if(_paramFilledIn($page_text, $param))
			{
				Pearle::myLog(3, "Assuming source in {{Information:$param}}\n");
				$has_source = 1;
			}
		}
		
		if(_paramFilledIn($page_text, "permission"))
		{
			# If there's a filled-in "permission" parameter, assume a license (but not a tag)
			Pearle::myLog(3, "Assuming license in {{Information:permission}}\n");
			$has_license = 1;
		}

		if(_paramFilledIn($page_text, "flickr_url"))
		{
			# If there's a filled-in "flickr_url" parameter, assume a source
			Pearle::myLog(3, "Assuming source in {{Flickr:flickr_url}}\n");
			$has_source = 1;
		}
	}

	if((grep {$_ =~ /:Non-free media rationale$/} @templates) or 
	   (grep {$_ =~ /:Non-free use rationale$/} @templates))
	{
		Pearle::myLog(3, "Has a non-free use rationale template\n");
		# Remove the template from the list
		@templates = grep {$_ !~ /:Non-free media rationale$/} @templates;
		@templates = grep {$_ !~ /:Non-free use rationale$/} @templates;
		# Attempt to parse a "non-free use rationale" template or derivative:
		# any of these parameters, when filled in, is taken as a source
		foreach my $param (qw(source publisher owner website distributor))
		{
			if(_paramFilledIn($page_text, $param))
			{
				Pearle::myLog(3, "Assuming source in {{Non-free * rationale:$param}}\n");
				$has_source = 1;
			}
		}
		
		$has_license = 1;	# Assume that it's licensed as "fair use"
	}
	
	if(grep {$_ =~ /:Non-free image data$/} @templates)
	{
		Pearle::myLog(3, "Has a non-free image data template\n");
		# Remove the template from the list
		@templates = grep {$_ !~ /:Non-free image data$/} @templates;
		# Attempt to parse a "non-free image data" template
		if(_paramFilledIn($page_text, "source"))
		{
			# If there's a filled-in "source" parameter, assume a source
			Pearle::myLog(3, "Assuming source in {{Non-free image data:source}}\n");
			$has_source = 1;
		}
		
		$has_license = 1;	# Assume that it's licensed as "fair use"
	}
	if(grep {$_ =~ /:spoken article entry$/i} @templates)
	{
		Pearle::myLog(3, "Has a Spoken Article template\n");
		# Remove the template from the list
		@templates = grep {$_ !~ /:Spoken article entry$/i} @templates;
		# Attempt to parse.  Unlike the checks above, any non-blank value on the same line counts.
		if(defined($page_text) and $page_text =~ /\|\s*user_name\s*=[ \t]*\S+/i)
		{
			# If there's a filled-in "user_name" parameter, assume a source
			Pearle::myLog(3, "Assuming source in {{Spoken article entry:user_name}}\n");
			$has_source = 1;
		}
		$has_license = 1;	# Assume that it's GFDL
	}
	
	# Is the image description page lacking in tags?
	# This is checked here because we may have removed "information" or "non-free use rationale" templates from the list earlier
	# We want those in the list before here because it makes detecting them for parsing easier, but we don't want them in the list
	# here so we can say for sure that the page is untagged.
	if(scalar(@templates) == 0)
	{
		Pearle::myLog(3, "No templates found\n");
		$has_tag = 0;
	}
	

	# Does it have source information outside of the templates?
	if(length($stripped_page_text) >= 7) # Page text with headers, newlines and templates stripped is at least seven bytes ("my work")
	{
		# TODO: Better source checking
		if(!defined($has_source) or $has_source == 0)
		{
			Pearle::myLog(3, "Assuming page has source\n");
			$has_source = 1;
		}
		if(!defined($has_license) or $has_license == 0)
		{
			Pearle::myLog(3, "Assuming page has license\n");
			$has_license = 1;
		}
	}
	else
	{
		if(!defined($has_source))
		{
			# If we still don't know if it has a source, it's safe to assume it doesn't.
			Pearle::myLog(3, "Assuming page doesn't have source\n");
			$has_source = 0;
		}
		if(!defined($has_license))
		{
			# If we still don't know if it has a license, we'll assume it doesn't
			Pearle::myLog(3, "Assuming page doesn't have license information\n");
			$has_license = 0;
		}
	}
	# From here on $has_source and $has_license are always defined; only $has_tag may still be undef.

	########## Check for exceptional conditions ##########
	
	# Terminating check: Is the image using a deprecated tag, and doesn't have any other license tag?
	# $has_tag is tested for definedness first so an "unknown" (undef) value is not compared numerically.
	if((!defined($has_tag) or $has_tag != 1) and (grep {$_ =~ /:$deprecated_tags$/i} @templates))
	{
		Pearle::myLog(2, "Image has deprecated tag\n");
		# Mark as no-license
		_tagAndNotify($image, $uploader,
			"\n{{no copyright information|month=<MONTH>|day=<DAY>|year=<YEAR>}}",
			"Obsolete or deprecated tag",
			"\n{{subst:User:OrphanBot/deprecated|<IMAGE>}} --~~~~",
			"Image with obsolete or deprecated license");
		Pearle::limit();
		next;
	}
	
	# Terminating check: Does the image have a forbidden tag?
	if(my @tags = grep {$_ =~ /:$forbidden_tags$/i} @templates)
	{
		# Doesn't matter what else is on the page, the image requires human handling
		Pearle::myLog(2, "Forbidden tag $tags[0] found on image [[:$image]]\n");
		botwarnlog("*Forbidden tag {{tl|$tags[0]}} found on image [[:$image]]\n");
		Pearle::limit();
		next;
	}
	
	# TODO: Terminating check: Malformed fair-use rationale
	
	########## Process #####################################

	# We've found an image with only unknown templates
	if(!defined($has_tag))
	{
		# We don't know if it has any tags or not, and so we cannot deduce the license or source status
		# We know an image doesn't have tags if:
		# * It has no templates
		# * or all templates are on the "nontags" list
		# We know an image has tags if:
		# * We found a tag we know about
		print "Has unknown tags\n";
	
		# Report each unknown tag only the first time it is seen in this run
		my @new_unknown_tags = grep {!defined($unknown_tags{$_})} @templates;
		if(scalar(@new_unknown_tags) > 0)
		{
			foreach my $unknown_tag (@new_unknown_tags)
			{
				Pearle::myLog(2, "Unknown tag {{$unknown_tag}} found\n");
				botwarnlog("* Unknown tag [[$unknown_tag]] found\n");
				$unknown_tags{$unknown_tag} = 1;
			}
			Pearle::limit();
		}
	}
	elsif($has_tag == 0)
	{
		if($has_source == 0)
		{
			if($has_license == 0)
			{
				# Tag as "no source" and "no license"
				_tagAndNotify($image, $uploader,
					"\n{{no copyright holder|month=<MONTH>|day=<DAY>|year=<YEAR>}}\n{{no copyright information|month=<MONTH>|day=<DAY>|year=<YEAR>}}",
					"Image has no source or license information",
					"{{subst:User:OrphanBot/nosource nolicense|<IMAGE>}} --~~~~\n",
					"You've uploaded an image with no source or license information");
				Pearle::myLog(2, "No source, no license\n");
				Pearle::limit();
			}
			else
			{
				# No license tag, and it either has a license or we don't know if it has a license
				# Tag as "no source" and "untagged"
				_tagAndNotify($image, $uploader,
					"\n{{no copyright holder|month=<MONTH>|day=<DAY>|year=<YEAR>}}\n{{untagged|month=<MONTH>|day=<DAY>|year=<YEAR>}}",
					"Image has no source or license tag",
					"{{subst:User:OrphanBot/nosource untagged|<IMAGE>}} --~~~~\n",
					"You've uploaded an image with no source or license tag");
				Pearle::myLog(2, "No source, untagged\n");
				Pearle::limit();
			}
		}
		else
		{
			if($has_license == 0)
			{
				# Tag as "no license"
				_tagAndNotify($image, $uploader,
					"\n{{no copyright information|month=<MONTH>|day=<DAY>|year=<YEAR>}}",
					"Image has no license information",
					"{{subst:User:OrphanBot/nolicense|<IMAGE>}} --~~~~\n",
					"You've uploaded an image with no license information");
				Pearle::myLog(2, "No license\n");
				Pearle::limit();
			}
			else
			{
				# Tag as "untagged"
				_tagAndNotify($image, $uploader,
					"\n{{untagged|month=<MONTH>|day=<DAY>|year=<YEAR>}}",
					"Image has no license tag",
					"{{subst:User:OrphanBot/untagged-new|<IMAGE>}} --~~~~\n",
					"You've uploaded an image with no license tag");
				Pearle::myLog(2, "Untagged\n");
				Pearle::limit();
			}
		}
	}
	else
	{
		# If it has a tag, it has a license
		if($has_source == 0)
		{
			# Tag as "no source"
			_tagAndNotify($image, $uploader,
				"\n{{no copyright holder|month=<MONTH>|day=<DAY>|year=<YEAR>}}",
				"Image has no source information",
				"{{subst:User:OrphanBot/nosource-new|<IMAGE>}} --~~~~\n",
				"You've uploaded an image with no source information");
			Pearle::myLog(2, "No source\n");
			Pearle::limit();
		}
		else
		{
			# Everything's fine
			Pearle::myLog(2, "Image has no problems\n");
		}
	}
	
	sleep(2);
}

Pearle::myLog(2, "Finished with upload set\n");