#!/usr/bin/perl
# A perl script to convert Wikipedia {{ref}}-style references to use the
# <ref> feature.
#
# Usage:
# wp-refconvert article.txt
# creates a new file called article-new.txt, containing the new version
# of the article.
# Pragmas first.  strict and warnings catch misspelled variables and
# uses of undefined values; utf8 declares that this source file is UTF-8.
use strict;
use warnings;
use utf8;
# Treat the standard streams as UTF-8, so that names quoted in diagnostics
# are written out correctly.
binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");
# Program name, used as the prefix of the fatal error messages.
my $prog = "wp-refconvert";
##############################################################################
# Global Data
##############################################################################
# Counters.  $numRefs is bumped by addRef for every plain {{ref|...}} seen
# while parsing, $numNotes by addNote for every note it accepts, and
# $fixedRefs by fixRef for every plain {{ref|...}} rewritten while editing.
my $numRefs = 0;
my $numNotes = 0;
my $fixedRefs = 0;
# Table of references, indexed by reference number minus one.  Each entry
# is a hash ref with the keys 'name', 'count' (how many times the reference
# is cited), 'usecount' (how many of those citations fixRef has rewritten;
# only advanced for references cited more than once) and 'text' (the note
# body, filled in by addNote).
my @references;
##############################################################################
# Article Parsing
##############################################################################
# First pass over the article: gather every {{ref...}} citation and every
# {{note...}} line into the reference table.
#   $file - path of the article to scan
# Dies if the article cannot be opened; addRef and addNote die on
# references or notes they cannot make sense of.
sub readArticle {
    my ( $file ) = @_;

    open(my $in, "<:utf8", $file)
        or die("$prog: can't open $file\n");

    while (defined(my $line = <$in>)) {
        # Citations: every {{ref...}} template on the line is registered.
        foreach my $cite ($line =~ m/\{\{ref[^}]+\}\}/g) {
            addRef($cite);
        }

        # Notes: /c keeps the match position just past the last note
        # template, so the \G match below picks up the rest of the line
        # as the note text.
        my @found = ($line =~ m/ *(\{\{note[^}]+\}\})/gc);
        next unless @found;

        my ( $text ) = ($line =~ m/\G *(.*)$/);
        addNote($found[0], $text);
    }

    close($in);
}
# Record one citation template found while parsing the article.
#   $ref - the complete template text, in one of the forms
#            {{ref|NAME}}                     numbered in order of appearance
#            {{ref_num|NAME|NUMBER}}          explicitly numbered
#            {{ref_label|NAME|NUMBER|LABEL}}  explicitly numbered; the label
#                                             is parsed but not used
# The first citation of a number creates its entry in @references; each
# later citation of the same number bumps that entry's 'count'.
# Dies if the template has none of the forms above, if NUMBER is not a
# positive integer, or if one number is cited under two different names.
sub addRef {
    my ( $ref ) = @_;
    my ( $n, $k );
    if (($n) = ( $ref =~ /^\{\{ref\|([^}|]+)\}\}$/)) {
        $k = ++$numRefs;
    } elsif (($n, $k) = ( $ref =~ /^\{\{ref_num\|([^}|]+)\|([^}|]+)\}\}$/)) {
        # Name and number both come straight from the template.
    } elsif (($n, $k) = ( $ref =~ /^\{\{ref_label\|([^}|]+)\|([^}|]+)\|([^}|]+)\}\}$/)) {
        # The third capture (the label) is deliberately dropped.
    } else {
        die("$prog: unknown reference style \"$ref\"\n");
    }

    # Without this check a non-numeric or zero number turns into index -1
    # below, which silently aliases the last table entry (or aborts with an
    # obscure message when the table is still empty).
    my ( $num ) = ( $k =~ /^\s*(\d+)\s*$/ );
    if (!defined($num) || $num < 1) {
        die("$prog: bad reference number \"$k\" in \"$ref\"\n");
    }
    $k = $num + 0;

    my $record = $references[$k - 1];
    if (!defined($record)) {
        $references[$k - 1] = { 'name' => $n, 'count' => 1, 'usecount' => 0 };
    } else {
        if ($record->{'name'} ne $n) {
            die(sprintf "$prog: ref mismatch: ref %d=%s; also cited as %s\n", $k, $record->{'name'}, $n);
        }
        ++$record->{'count'};
    }
}
# Attach the text of one note to the reference with the same number.
#   $note - the note template, either {{note|NAME}} (numbered in order of
#           appearance) or {{note_label|NAME|NUMBER|LABEL}} (NUMBER must
#           equal the running note count; the label is not used)
#   $text - the note body to store in the reference's 'text' field
# Dies if the template has neither form, if an explicit number is out of
# sequence, if no reference carries that number, or if the reference's
# name differs from the note's.
sub addNote {
    my ( $note, $text ) = @_;
    my ( $n, $k );

    if ($note =~ /^\{\{note\|([^}|]+)\}\}$/) {
        $n = $1;
        $k = ++$numNotes;
    }
    elsif ($note =~ /^\{\{note_label\|([^}|]+)\|([^}|]+)\|([^}|]+)\}\}$/) {
        ( $n, $k ) = ( $1, $2 );
        # The counter advances before the comparison, so the message
        # reports the sequence number this note was expected to have.
        ++$numNotes;
        die("$prog: note number mismatch: seq = $numNotes, explicit = $k\n")
            if ($k != $numNotes);
    }
    else {
        die("$prog: unknown note style \"$note\"\n");
    }

    my $record = $references[$k - 1];
    defined($record)
        or die("$prog: note mismatch: no ref $k ($n)\n");
    ($record->{'name'} eq $n)
        or die(sprintf "$prog: note mismatch: ref %d=%s; note=%s\n", $k, $record->{'name'}, $n);

    $record->{'text'} = $text;
}
##############################################################################
# Article Editing
##############################################################################
# Second pass: copy the article to a new file, rewriting citations.
#   $file   - path of the article to read
#   $output - path of the file to create
# Every {{ref...}} template is replaced by whatever fixRef returns for it.
# Lines beginning with "# {{note" are dropped; the first such line is
# replaced by a single "<references/>" line.
# NOTE(review): readArticle collects notes found anywhere on a line, while
# only lines starting with "# {{note" are removed here - confirm that this
# narrower pattern is intended.
# Dies if either file cannot be opened or if writing the output fails.
sub editArticle {
    my ( $file, $output ) = @_;
    my $doneRefs = 0;
    open(my $in, "<:utf8", $file) || die("$prog: can't open $file: $!\n");
    open(my $out, ">:utf8", $output) || die("$prog: can't create $output: $!\n");
    while (defined(my $line = <$in>)) {
        if ($line =~ /^\# \{\{note/) {
            if (!$doneRefs) {
                print {$out} "<references/>\n"
                    or die("$prog: error writing $output: $!\n");
                $doneRefs = 1;
            }
        } else {
            $line =~ s/(\{\{ref[^}]+\}\})/fixRef($1)/ge;
            print {$out} $line
                or die("$prog: error writing $output: $!\n");
        }
    }
    close($in);
    # Buffered write errors only surface when the handle is closed, so an
    # unchecked close would report a truncated output file as success.
    close($out) || die("$prog: error writing $output: $!\n");
}
# Produce the <ref> markup that replaces one citation template.
#   $ref - the complete template text: {{ref|NAME}}, {{ref_num|NAME|NUMBER}}
#          or {{ref_label|NAME|NUMBER|LABEL}} (the label is not used)
# Returns
#   <ref>TEXT</ref>               for a reference cited once,
#   <ref name="NAME">TEXT</ref>   for the first citation of a shared one,
#   <ref name="NAME"/>            for its later citations.
# Plain {{ref|...}} templates are numbered in order of appearance using
# $fixedRefs.
# Dies if the template has none of the forms above, if NUMBER is not a
# positive integer, or if @references has no entry for the number.  A
# reference without note text is reported on STDERR and rendered with
# empty text.
sub fixRef {
    my ( $ref ) = @_;
    my ( $n, $k );
    if (($n) = ( $ref =~ /^\{\{ref\|([^}|]+)\}\}$/)) {
        $k = ++$fixedRefs;
    } elsif (($n, $k) = ( $ref =~ /^\{\{ref_num\|([^}|]+)\|([^}|]+)\}\}$/)) {
        # Name and number both come straight from the template.
    } elsif (($n, $k) = ( $ref =~ /^\{\{ref_label\|([^}|]+)\|([^}|]+)\|([^}|]+)\}\}$/)) {
        # The third capture (the label) is deliberately dropped.
    } else {
        die("$prog: unknown reference style \"$ref\"\n");
    }

    # A non-numeric or zero number would become index -1 below and quietly
    # pick up the last table entry instead of failing.
    my ( $num ) = ( $k =~ /^\s*(\d+)\s*$/ );
    if (!defined($num) || $num < 1) {
        die("$prog: bad reference number \"$k\" in \"$ref\"\n");
    }
    $k = $num + 0;

    my $record = $references[$k - 1];
    if (!defined($record)) {
        die("$prog: ref mismatch: no ref $k ($n)\n");
    }

    # 'usecount' is only advanced for shared references; every citation
    # after the first just points back at the named one.
    my $shared = ($record->{'count'} > 1);
    if ($shared && $record->{'usecount'}++ > 0) {
        return sprintf "<ref name=\"%s\"/>", $record->{'name'};
    }

    # No note was recorded for this reference: keep producing the empty
    # element, but say so instead of dropping the citation silently.
    my $text = $record->{'text'};
    if (!defined($text)) {
        print STDERR "$prog: warning: no note text for ref $k ($n)\n";
        $text = "";
    }

    if ($shared) {
        return sprintf "<ref name=\"%s\">%s</ref>", $record->{'name'}, $text;
    }
    return sprintf "<ref>%s</ref>", $text;
}
##############################################################################
# Diagnostics
##############################################################################
# Debugging aid: list the first $numRefs entries of the reference table on
# STDERR, one per line, as number, name, citation count and note text.  A
# summary line comes first when the ref and note counters disagree.
sub dumpRefs {
    if ($numRefs != $numNotes) {
        printf STDERR "## %d refs; %d notes\n", $numRefs, $numNotes;
    }

    my $format = "[%2d] %-12s (%2d) %s\n";
    foreach my $idx (0 .. $numRefs - 1) {
        my $record = $references[$idx];
        my $name   = $record->{'name'};
        my $count  = $record->{'count'};
        my $text   = $record->{'text'};
        printf STDERR $format, $idx + 1, $name, $count, $text;
    }
}
##############################################################################
# Main
##############################################################################
# Convert one article.
#   @args - command-line arguments; the first is the path of the article,
#           any further arguments are ignored
# The converted text goes to a new file whose name is derived from the
# article's: "-new" is inserted before the extension ("article.txt"
# becomes "article-new.txt"), or ".new" is appended when the file name has
# no extension.
# Returns 0, for use as the exit status.  Dies with a usage message when no
# article is given; the parsing and editing passes die on their own errors.
sub main {
    my ( @args ) = @_;
    my $article = $args[0];
    if (!defined($article) || $article eq "") {
        die("usage: $prog article.txt\n");
    }

    # First, parse the article.
    readArticle($article);
    # dumpRefs();

    # [^/]+ keeps the match inside the final path component, so a dot in a
    # directory name is never taken for the extension.
    my $newvers = $article;
    ($newvers =~ s{\.([^/]+)$}{-new.$1}) || ($newvers .= ".new");

    printf STDERR "## edit %s -> %s\n", $article, $newvers;
    editArticle($article, $newvers);
    return 0;
}
# Run main on the command-line arguments and use its return value as the
# process exit status.
exit(main(@ARGV));