Jump to content

User:Bo Lindbergh/dabalyze

From Wikipedia, the free encyclopedia

This is a Perl script for finding links to disambiguation pages in Wikipedia by analyzing database dumps in the new XML format. It may not work properly in a non-Unix environment. Save it as "dabalyze" in a convenient directory. Instructions follow below.

#! /usr/bin/perl

use strict;

my %interesting=
    ('' => {
	name		=> 'article',
	filename	=> 'articles.txt',
	cutoff		=> 100},
     'Template'	=> {
	 name		=> 'template',
	 filename	=> 'templates.txt',
	 cutoff		=> 0,
	 list		=> 1});

my $exp_re=qr/\(disambiguation\)$/;

my @templates=split(/\n/,<<__EOT__);
2LA
2LCdisambig
4LA
5LA
Albumdis
Dab
Dambig
Disam
Disamb
Disambig
Disambig-cleanup
Disambiguate
Disambiguation
Exp-dab
Geodis
Hndis
Hurricane disambig
Hurricanedis
Interstatedis
Listdis
LND
Miscdis
Nocatdab
Numberdis
Phrasedis
Rdab
Roadis
Songdis
Substadis
Tla-dab
TLAdisambig
TLAdisambiguation
Townshipdis
__EOT__

foreach my $template (@templates) {
    $template =~ s/^([[:alpha:]])/[$1\L$1]/;
}

my $tmpl_re=join('|',sort({$b cmp $a} @templates));

my $dab_re=qr/{{(?i:msg:)?\s*(?i:template\s*:\s*)?($tmpl_re)\s*}}/;

my($ns_re,%ns_canon);

my $want_progress=@ARGV>0 && $ARGV[0] eq '-p';
my $last_progress=-1;

sub pageloop (&)
{
    my($handler)=@_;
    my($size);
    local $/="</page>\x0A";

    $size=-s PAGES;
    while (defined(my $page=<PAGES>)) {
	my($nstitle,$ns,$title);

	$page =~ /^\s*<page>/ or last;
	($nstitle)=($page =~ m{<title>([^<]+)</title>})
	    or die "Can't find page title";
	if ($nstitle =~ /^($ns_re):(.+)$/) {
	    $ns=$1;
	    $title=$2;
	} else {
	    $ns='';
	    $title=$nstitle;
	}
	$page =~ m{</text>} or next;
	substr($page,$-[0])='';
	$page =~ /<text xml:space="preserve">/
	    or die "Can't find start of text for page $nstitle";
	substr($page,0,$+[0])='';
	$handler->($nstitle,$ns,$title,$page);
	if ($want_progress) {
	    my $progress=int(tell(PAGES)/$size*1000);
	    if ($progress!=$last_progress) {
		$last_progress=$progress;
		printf STDERR "\r0.%.3u",$progress;
	    }
	}
    }
    if ($want_progress) {
	print STDERR "\r";
    }
}

sub mungtarget ($$$ )
{
    my(undef,$source,$sub)=@_;

    for my $target ($_[0]) {
	$target =~ tr/\t\n\r/   /;
	$target =~ s/^ +//;
	$target =~ s/ +$//;
	$target =~ s/ {2,}/ /g;
	if ($sub && $target =~ m{^/}) {
	    $target=$source.$target;
	} elsif ($target =~ /^:*($ns_re) *: *(.+)$/i) {
	    $target=$2;
	    utf8::decode($target);
	    $target=ucfirst($target);
	    utf8::encode($target);
	    $target=$ns_canon{lc($1)}.":".$target;
	} elsif ($target =~ /^:*(.+)$/i) {
	    $target=$1;
	    utf8::decode($target);
	    $target=ucfirst($target);
	    utf8::encode($target);
	} else {
	    # a malformed link, usually empty brackets
	}
    }
}

my(%dab,%redir,@circular);

sub pass1 ()
{
    print STDERR "Analysis pass 1\n";
    {
	my($siteinfo,@namespaces);
	local $/="</siteinfo>\x0A";

	$siteinfo=<PAGES>;
	@namespaces=
	    $siteinfo =~ m{<namespace key="-?\d+">([^<]+)</namespace>}g;
	$ns_re=join('|',map(quotemeta($_),sort({$b cmp $a} @namespaces)));
	foreach my $ns (@namespaces) {
	    $ns_canon{lc($ns)}=$ns;
	}
    }
    pageloop {
	my($nstitle,$ns,$title)=splice(@_,0,3);

	for my $text ($_[0]) {
	    my $sub=$interesting{$ns}->{subpages};

	    if ($ns eq '' && $text =~ $dab_re) {
		$dab{$nstitle}=1;
	    }
	    if ($text =~ /^#redirect.*\[\[([^\]\|]+)/i) {
		my($target,$back);

		$target=$1;
		mungtarget($target,$nstitle,$sub);
		while ($target ne $nstitle) {
		    my($newtarget);

		    $newtarget=$redir{$target};
		    last unless defined($newtarget);
		    $target=$newtarget;
		}
		if ($target eq $nstitle) {
		    push(@circular,$nstitle);
		} else {
		    $redir{$nstitle}=$target;
		}
	    }
	}
    };
    foreach my $target (keys(%redir)) {
	my(@chain);

	for (;;) {
	    my $newtarget=$redir{$target};
	    last unless defined($newtarget);
	    push(@chain,$target);
	    $target=$newtarget;
	}
	pop(@chain);
	foreach my $source (@chain) {
	    $redir{$source}=$target;
	}
    }

    print STDERR "    ".keys(%dab)." total disambiguation pages\n";
    print STDERR "\n";
}

my %stats=map {
    ($_,{});
} keys(%interesting);

my %lists=map {
    ($_,{});
} grep {
    $interesting{$_}->{list};
} keys(%interesting);

sub pass2 ()
{
    my(%linked);

    print STDERR "Analysis pass 2\n";
    {
	local $/="</siteinfo>\x0A";

	<PAGES>;
    }
    pageloop {
	my($nstitle,$ns,$title)=splice(@_,0,3);

	for my $text ($_[0]) {
	    my($stats,$lists,$sub);

	    $stats=$stats{$ns};
	    $lists=$lists{$ns};
	    $sub=$interesting{$ns}->{subpages};
	    if ($stats) {
		my(%seen);

		while ($text =~ /\[\[([^\]\|]+)/g) {
		    my($target,$final);

		    $target=$1;
		    mungtarget($target,$nstitle,$sub);
		    next if $target =~ $exp_re;
		    $final=$redir{$target};
		    $final=$target unless defined($final);
		    if ($dab{$final} && !$seen{$final}++) {
			$linked{$final}=1;
			$stats->{$final}++;
			if ($lists) {
			    push(@{$lists->{$final}},$nstitle);
			}
		    }
		}
	    }
	}
    };
    print STDERR "    ".keys(%linked)." linked disambiguation pages\n";
    foreach my $ns (sort(keys(%stats))) {
	print STDERR ("    ".keys(%{$stats{$ns}})." in the ".
		      $interesting{$ns}->{name}." namespace\n");
    }
    print STDERR "\n";
}

sub wikilink ($ )
{
    my($target)=@_;

    if (exists($redir{$target})) {
	"[{{SERVER}}{{localurl:$target|redirect=no}} $target]";
    } elsif ($target =~ m{/\.{1,2}(?:$|/)}) {
	"[{{SERVER}}{{localurl:$target}} $target]";
    } elsif ($target =~ m{^/}) {
	"[[:$target]]";
    } else {
	"[[$target]]";
    }
}

sub report ()
{
    print STDERR "Report generation\n";

    foreach my $target (@circular) {
	$redir{$target}=$target;
    }

    while (my($ns,$stats)=each(%stats)) {
	my($filename,$cutoff)=@{$interesting{$ns}}{qw(filename cutoff)};
	my $lists=$lists{$ns};
	my @nstitles=sort {
	    $stats->{$b}<=>$stats->{$a} || $a cmp $b;
	} grep {
	    $stats->{$_}>=$cutoff;
	} keys(%{$stats});
	my $total=0;

	open(REPORT,'>',$filename)
	    or die "Can't create $filename: $!";
	binmode(REPORT);
	print REPORT "\xEF\xBB\xBF";
	foreach my $nstitle (@nstitles) {
	    $total+=$stats->{$nstitle};
	}
	print REPORT "Total link count: $total\n";
	foreach my $nstitle (@nstitles) {
	    print REPORT ("# ",wikilink($nstitle),": ",$stats->{$nstitle},
			  " [[Special:Whatlinkshere/",$nstitle,"|links]]\n");
	    if ($lists) {
		foreach my $source (sort(@{$lists->{$nstitle}})) {
		    print REPORT "#* ",wikilink($source),"\n";
		}
	    }
	}
	close(REPORT);
	print STDERR "    ".@nstitles." entries written to $filename\n";
    }

    if (@circular) {
	@circular=sort(@circular);
	open(REPORT,'>','circular.txt')
	    or die "Can't create circular.txt: $!";
	binmode(REPORT);
	print REPORT "\xEF\xBB\xBF";
	foreach my $target (@circular) {
	    print REPORT "* ",wikilink($target),"\n";
	}
	close(REPORT);
	print STDERR "    ".@circular." entries written to circular.txt\n";
    } else {
	unlink('circular.txt');
    }
}

open(PAGES,'<','pages_current.xml')
    or die "Can't open pages_current.xml: $!";
binmode(PAGES);
pass1();
seek(PAGES,0,0);
pass2();
close(PAGES);
report();
input
The script expects to find the file "pages_current.xml" in the current directory. You can get this by downloading and uncompressing http://download.wikimedia.org/wikipedia/en/pages_current.xml.bz2
output
The script generates two text files named "articles.txt" and "templates.txt" in the current directory. The first one contains a list of disambiguation pages linked to by articles, suitable for inclusion in Wikipedia:Disambiguation pages with links. The second one contains a list of disambiguation pages linked to by templates; this is intended for a hypothetical sub-project concentrating on the template namespace. Note that the files use UTF-8 encoding; any text editor you use for copying and pasting into Wikipedia must be able to handle that.
Since the script has to handle circular redirects anyway, it generates a list of them in the file "circular.txt".
diagnostics
A successful run generates diagnostic output similar to the following:
Analysis pass 1
    41868 total disambiguation pages

Analysis pass 2
    30385 linked disambiguation pages
    30369 in the article namespace
    880 in the template namespace

Report generation
    514 entries written to articles.txt
    880 entries written to templates.txt
    100 entries written to circular.txt

The total running time is about 32 minutes on an 867 MHz PowerPC G4 (based on the database dump dated 2005-10-20).