User:FairuseBot/Pearle/WikiPage.pm

From Wikipedia, the free encyclopedia
### IMPORTANT ###

# This code is released into the public domain.

### RECENT CHANGES ###
#  6 Aug 2007: Created
# 21 Aug 2007: Added comment folding/unfolding
# 21 Oct 2007: Fixed and tested comment folding
# 25 Oct 2007: Added link canonicalization

# Notes on editable markup:
#  * Multi-character symbols are replaced with single-character placeholders 
#    from the Unicode "control symbols" set (U+0001 to U+001F).
#  * Comments are replaced with single-character placeholders from the 
#    Unicode fifteenth-plane private-use area (U+F0000 to U+FFFFF).

package Pearle::WikiPage;

use strict;
use warnings;

use URI::Escape;
use Encode;

########## Constructor ###############################################
sub new
{
	my $class = shift;
	my %params = @_;
	
	my $self = {
		text => '',				# Page text
		title => '',			# Page title
		
		# Internal variables
		editTime => undef,		# editTime parameter used when editing a page
		startTime => undef,		# startTime parameter used when editing a page
		editToken => undef,		# editToken parameter used when editing a page
		
		# Comment-folding
		_comments_folded => 0,		# Are comments presently folded?
		_comment_fold_lookup => {},	# Lookup table of proxy,comment pairs
		_comment_fold_proxy => 0xF0000,	# Next proxy character to use
		
		# Single-character markup representations
		_linkstart => "\x01",
		_linkend => "\x02",
		_transclusionstart => "\x03",
		_transclusionend => "\x04",
		
	};
	
	foreach my $key (keys(%params))
	{
		if($key eq 'text')
		{
			$self->{text} = $params{text};
		}
		elsif($key eq 'title')
		{
			$self->{title} = $params{title};
		}
		elsif($key eq 'editTime')
		{
			$self->{editTime} = $params{editTime};
		}
		elsif($key eq 'startTime')
		{
			$self->{startTime} = $params{startTime};
		}
		elsif($key eq 'editToken')
		{
			$self->{editToken} = $params{editToken};
		}
	}

	bless($self, $class);
	return $self;
}
########## Accessor functions ########################################

# Return the text with modifications to make it easier to operate on
#
# NOTE: Don't try to print this.  In order to make editing easier,
#  various multi-character markup sequences have been replaced with
#  very non-printable characters.
sub getEditableText
{
	my $self = shift;
	$self->foldComments();
	return $self->makeEditableMarkup($self->{text});
}

sub setEditableText
{
	my $self = shift;
	$self->{text} = shift;
}

# Return the text in WikiMarkup format
sub getWikiText
{
	my $self = shift;
	$self->unfoldComments();
	return $self->makeWikiMarkup($self->{text});
}

sub getTitle
{
	my $self = shift;
	return $self->{title};
}

sub setTitle
{
	die "Setting the title of a WikiPage is not supported.\n";
}

sub getEditToken
{
	my $self = shift;
	return $self->{editToken};
}

sub getStartTime
{
	my $self = shift;
	return $self->{startTime};
}

sub getEditTime
{
	my $self = shift;
	return $self->{editTime};
}

########## Verbs #####################################################

# Convert to editable representation
sub makeEditableMarkup
{
	my $self = shift;
	my $text = shift;

#	$text =~ s/\[\[\[/\x01[/g;			# Triple opening brackets: not valid wikimarkup
	$text =~ s/\[\[/\x01/g;				# Double opening brackets: the start of an internal link or inline image
	$text =~ s/\]\]\]\]/\x02\x02/g;	# Quadruple closing brackets: The end of an image caption containing an internal link
	$text =~ s/\]\]\]/]\x02/g;			# Triple closing brackets: an image caption containing an external link
	$text =~ s/\]\]/\x02/g;				# Double closing brackets: the end of an internal link or image
	$text =~ s/\{\{/\x03/g;				# Double opening braces: the start of a transclusion
	$text =~ s/\}\}/\x04/g;				# Double closing braces: the end of a transclusion
	
	return $text;
}

# Convert to WikiMarkup representation
sub makeWikiMarkup
{
	my $self = shift;
	my $text = shift;

	$text =~ s/\x01/[[/g;
	$text =~ s/\x02/]]/g;
	$text =~ s/\x03/{{/g;
	$text =~ s/\x04/}}/g;
	
	return $text;
}

# Replace comments with single-character proxies.
sub foldComments
{
	my $self = shift;
	my $text = $self->{text};
	
	while($text =~ /(<!--.*?-->)/s)
	{
		my $proxy_char = chr $self->{_comment_fold_proxy};
		$self->{_comment_fold_lookup}->{$proxy_char} = $1;
		my $comment = escapeRegex($1);
		$text =~ s/$comment/$proxy_char/;

		$self->{_comment_fold_proxy} += 1;
		die "Too many comments in page" if $self->{_comment_fold_proxy} > 0xFFFFF;	# More than 65535 comments in the page
	}
	$self->{text} = $text;
	return $text;
}

# Replace proxies with the original comments
sub unfoldComments
{
	my $self = shift;
	my $text = $self->{text};

	while (my ($proxy_char,$link) = each(%{$self->{_comment_fold_lookup}}))
	{
		$text =~ s/$proxy_char/$link/g;
	}
	$self->{text} = $text;
	return $text;
}

sub canonicalizeLinks
{
	my $self = shift;
	my %link_lookup;
	# NOTE: Order of the following two lines is important, since getEditableText modifies $self->{_comment_fold_proxy}
	my $text = $self->getEditableText();
	my $link_proxy = $self->{_comment_fold_proxy};
	
	# Extract the links beginnings into a lookup table
	while($text =~ /(\x01.*?[|\x02])/)
	{
		my $proxy_char = chr $link_proxy;
		$link_lookup{$proxy_char} = $1;
		my $link = escapeRegex($1);
		$text =~ s/$link/$proxy_char/;
#		print "$link_proxy $link_lookup{$proxy_char}\n";

		$link_proxy += 1;
		die "Too many links in page" if($link_proxy > 0xFFFFF);
	}

	# Canonicalize link beginnings
	while (my ($proxy_char,$link) = each %link_lookup)
	{
		next if $link =~ /http:/;		# Skip if it's a badly-formatted external link
		$link = unescapeUTF8URL($link);												# Convert URL-encoded UTF8 to Perl chars
		$link =~ s/_/ /g;															# Underscores to spaces
		$link =~ s/  / /g;															# Collapse multiple spaces
		$link =~ s/[\x{200E}\x{200F}\x{202A}\x{202B}\x{202C}\x{202D}\x{202E}]//g;	# Kill Unicode BiDi markers
		# TODO: Decode HTML entities (&#x45; &#69; &Aacute
		# Trim spaces
		$link =~ s/^\x01 /\x01/;
		$link =~ s/ \|$/|/;
		$link =~ s/ \x02$/\x02/;
		# TODO: Trim internal spaces for namespaced links
#		print URI::Escape::uri_escape_utf8($link), "\n";
		$link_lookup{$proxy_char} = $link;
	}
	
	# Put link beginnings back in the text

	while (my ($proxy_char,$link) = each %link_lookup)
	{
		$text =~ s/$proxy_char/$link/g;
	}

	$self->setEditableText($text);
#	exit;
}

########## Utilities #################################################

# Escape a string so that it's a literal match in a regex
sub escapeRegex
{
	my $string = shift;
	$string =~ s/\\/\\\\/g;
	$string =~ s/\./\\\./g;
	$string =~ s/\(/\\\(/g;
	$string =~ s/\)/\\\)/g;
	$string =~ s/\[/\\\[/g;
	$string =~ s/\{/\\\{/g;
	$string =~ s/\+/\\\+/g;
	$string =~ s/\*/\\\*/g;
	$string =~ s/\?/\\\?/g;
	$string =~ s/\^/\\\^/g;
	$string =~ s/\$/\\\$/g;
	$string =~ s/\|/\\\|/g;
	return $string;
}

sub unescapeUTF8URL
{
	# Since nobody seems to have a module to unescape a UTF8-encoded URL-escaped string...
	my $string = shift;
	my @chars = split //, $string;
	my $result_string = '';
	
	for(my $i = 0; $i < scalar(@chars); $i++)
	{
		my $partial_string = '';
		if($chars[$i] eq '%')
		{
			while(1)
			{
				# If the next two chars are hex values, stuff them in $partial_string
				if($chars[$i+1] =~ /[0-9a-f]/i and $chars[$i+2] =~ /[0-9a-f]/i)
				{
					$partial_string .= $chars[$i] . $chars[$i+1] . $chars[$i+2];
					$i += 3;
				}
				else
				{
					# Literal percent
					$result_string .= $chars[$i];
					$i++;
					last;
				}
				if($chars[$i] ne '%')
				{
					last;
				}
			}
			if($partial_string ne '')
			{
				$result_string .= decode("utf8", URI::Escape::uri_unescape($partial_string));
			}
			$i--;
		}
		else
		{
			# Literal char, already in unicode
			$result_string .= $chars[$i];
		}
	}
	return $result_string;
}

1;