# User:Johantheghost/wp-refconvert.pl
#
# From Wikipedia, the free encyclopedia
#!/usr/bin/perl

# A perl script to convert Wikipedia {{ref}}-style references to use the
# <ref> feature.
#
# Usage:
#     wp-refconvert article.txt
# creates a new file called article-new.txt, containing the new version
# of the article.

use strict;
use warnings;
use utf8;

# Treat the standard streams as UTF-8 so that text containing non-ASCII
# characters is read and written correctly.
binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");

# Program name, used as the prefix of error messages.
my $prog = "wp-refconvert";


##############################################################################
# Global Data
##############################################################################

# Sequential counters, all starting at zero:
#   $numRefs   - plain {{ref|...}} templates numbered so far by addRef()
#   $numNotes  - note templates seen so far by addNote()
#   $fixedRefs - plain {{ref|...}} templates numbered so far by fixRef()
my ( $numRefs, $numNotes, $fixedRefs ) = ( 0, 0, 0 );

# Table of reference records, indexed by (reference number - 1).  Each
# record is a hash with the keys 'name', 'count', 'usecount' and, once the
# matching note has been read, 'text'.
my @references = ();


##############################################################################
# Article Parsing
##############################################################################

# Scan an article and collect its references and notes.
#
#   $file - path of the article to read (decoded as UTF-8)
#
# Every {{ref...}} template found is passed to addRef(); for a line holding
# {{note...}} templates, the first template and the text following the last
# one are passed to addNote().
#
# Dies if the file cannot be opened, or if addRef() / addNote() rejects
# something found in it.
sub readArticle {
  my ( $file ) = @_;

  open(my $in, "<:utf8", $file) or die("$prog: can't open $file\n");

  while (defined(my $line = <$in>)) {
    # A line is expected to hold either references or a note, not both.

    # Register each reference template in order of appearance.
    for my $template ($line =~ m/\{\{ref[^}]+\}\}/g) {
      addRef($template);
    }

    # /c stops the final failed match from resetting pos(), so the \G
    # match below resumes where the scan for note templates stopped.
    my @notes = $line =~ m/ *(\{\{note[^}]+\}\})/gc;
    next unless @notes;

    # The note text is the rest of the line, minus leading spaces.
    my ( $text ) = $line =~ m/\G *(.*)$/;
    addNote($notes[0], $text);
  }

  close($in);
}


# Record one occurrence of a reference template in the @references table.
#
#   $ref - the complete template text, one of:
#            {{ref|NAME}}                  numbered sequentially via $numRefs
#            {{ref_num|NAME|NUM}}          explicit reference number
#            {{ref_label|NAME|NUM|LABEL}}  explicit number; LABEL is ignored
#
# The first occurrence of a number creates its record; later occurrences
# must carry the same name and only increase the record's 'count'.
#
# Dies on an unrecognised template, on a reference number that is not a
# positive integer, or when one number is used with two different names.
sub addRef {
  my ( $ref ) = @_;

  my ( $name, $num );
  if ($ref =~ /^\{\{ref\|([^}|]+)\}\}$/) {
    $name = $1;
    $num = ++$numRefs;
  } elsif ($ref =~ /^\{\{ref_num\|([^}|]+)\|([^}|]+)\}\}$/) {
    ( $name, $num ) = ( $1, $2 );
  } elsif ($ref =~ /^\{\{ref_label\|([^}|]+)\|([^}|]+)\|[^}|]+\}\}$/) {
    ( $name, $num ) = ( $1, $2 );
  } else {
    die("$prog: unknown reference style \"$ref\"\n");
  }

  # The number indexes @references directly, so it has to be a positive
  # integer: anything else would numify to 0 and address the table from
  # its end (index -1).
  my ( $digits ) = $num =~ /^\s*([0-9]+)\s*$/;
  if (!defined($digits) || $digits < 1) {
    die("$prog: bad reference number \"$num\" in \"$ref\"\n");
  }
  $num = $digits + 0;

  my $record = $references[$num - 1];
  if (!defined($record)) {
    # First sighting of this reference number.
    $references[$num - 1] = { 'name' => $name, 'count' => 1, 'usecount' => 0 };
    return;
  }

  if ($record->{'name'} ne $name) {
    die(sprintf("%s: ref mismatch: ref %d=%s; new ref=%s\n",
                $prog, $num, $record->{'name'}, $name));
  }
  ++$record->{'count'};
  return;
}


# Attach the text of a note to the reference record it belongs to.
#
#   $note - the complete template text, either {{note|NAME}} or
#           {{note_label|NAME|NUM|LABEL}}
#   $text - the note's text, stored under the record's 'text' key
#
# Notes are counted in $numNotes as they arrive; that running count selects
# the reference record, and a labelled note's explicit number must agree
# with it.
#
# Dies on an unrecognised template, on a number that disagrees with the
# running count, or when no reference with that number and name exists.
sub addNote {
  my ( $note, $text ) = @_;

  my ( $name, $num );
  if ($note =~ /^\{\{note\|([^}|]+)\}\}$/) {
    # Plain note: numbered by its position in the article.
    $name = $1;
    $num = ++$numNotes;
  } elsif ($note =~ /^\{\{note_label\|([^}|]+)\|([^}|]+)\|([^}|]+)\}\}$/) {
    # Labelled note: the explicit number must match its position.
    ( $name, $num ) = ( $1, $2 );
    ++$numNotes;
    die("$prog: note number mismatch: seq = $numNotes, explicit = $num\n")
      if $num != $numNotes;
  } else {
    die("$prog: unknown note style \"$note\"\n");
  }

  my $record = $references[$num - 1];
  defined($record)
    or die("$prog: note mismatch: no ref $num ($name)\n");
  $record->{'name'} eq $name
    or die(sprintf "$prog: note mismatch: ref %d=%s; note=%s\n", $num, $record->{'name'}, $name);

  $record->{'text'} = $text;
}


##############################################################################
# Article Editing
##############################################################################

# Write a converted copy of an article.
#
#   $file   - path of the original article (read as UTF-8)
#   $output - path of the converted article to create (written as UTF-8)
#
# Every {{ref...}} template is replaced by the markup that fixRef() returns.
# Lines beginning with "# {{note" are dropped, and a single <references/>
# tag is written in place of the first of them.  All other text is copied
# through unchanged.
#
# Dies if either file cannot be opened, if fixRef() rejects a reference, or
# if the output cannot be completely written.
sub editArticle {
  my ( $file, $output ) = @_;

  my $doneRefs = 0;

  open(my $in, "<:utf8", $file) || die("$prog: can't open $file\n");
  open(my $out, ">:utf8", $output) || die("$prog: can't create $output\n");

  while (my $line = <$in>) {
    if ($line =~ /^\# \{\{note/) {
      # Note lines are not copied; only the first one leaves a
      # <references/> placeholder behind.
      if (!$doneRefs) {
        print $out "<references/>\n";
        $doneRefs = 1;
      }
    } else {
      $line =~ s/(\{\{ref[^}]+\}\})/fixRef($1)/ge;
      print $out $line;
    }
  }

  close($in);

  # Buffered write errors (a full disk, say) only surface when the handle is
  # closed, so an unchecked close could leave a silently truncated article.
  close($out) || die("$prog: error writing $output: $!\n");
}


# Convert one reference template into <ref> markup.
#
#   $ref - the complete template text, one of:
#            {{ref|NAME}}                  numbered sequentially via $fixedRefs
#            {{ref_num|NAME|NUM}}          explicit reference number
#            {{ref_label|NAME|NUM|LABEL}}  explicit number; LABEL is ignored
#
# Returns the replacement text:
#   <ref>TEXT</ref>              for a reference whose 'count' is not above 1
#   <ref name="NAME">TEXT</ref>  for the first use of a repeated reference
#   <ref name="NAME"/>           for each later use of a repeated reference
#
# Dies on an unrecognised template, on a reference number that is not a
# positive integer, or when the number has no record in @references.  Warns
# (and emits empty text) when the record has no note text.
sub fixRef {
  my ( $ref ) = @_;

  my ( $name, $num );
  if ($ref =~ /^\{\{ref\|([^}|]+)\}\}$/) {
    $name = $1;
    $num = ++$fixedRefs;
  } elsif ($ref =~ /^\{\{ref_num\|([^}|]+)\|([^}|]+)\}\}$/) {
    ( $name, $num ) = ( $1, $2 );
  } elsif ($ref =~ /^\{\{ref_label\|([^}|]+)\|([^}|]+)\|[^}|]+\}\}$/) {
    ( $name, $num ) = ( $1, $2 );
  } else {
    die("$prog: unknown reference style \"$ref\"\n");
  }

  # The number indexes @references directly, so it has to be a positive
  # integer: anything else would numify to 0 and pick up the table's last
  # record (index -1) instead of failing.
  my ( $digits ) = $num =~ /^\s*([0-9]+)\s*$/;
  if (!defined($digits) || $digits < 1) {
    die("$prog: bad reference number \"$num\" in \"$ref\"\n");
  }
  $num = $digits + 0;

  my $record = $references[$num - 1];
  if (!defined($record)) {
    die("$prog: ref mismatch: no ref $num ($name)\n");
  }

  # Later uses of a repeated reference only point back at the first one.
  my $repeated = $record->{'count'} > 1;
  if ($repeated && $record->{'usecount'}++ > 0) {
    return sprintf "<ref name=\"%s\"/>", $record->{'name'};
  }

  # From here on the note text is needed.  A reference without a note would
  # otherwise turn into an empty <ref></ref> without anyone noticing.
  my $text = $record->{'text'};
  if (!defined($text)) {
    warn("$prog: warning: no note text found for ref $num ($name)\n");
    $text = "";
  }

  if ($repeated) {
    return sprintf "<ref name=\"%s\">%s</ref>", $record->{'name'}, $text;
  }
  return sprintf "<ref>%s</ref>", $text;
}


##############################################################################
# Diagnostics
##############################################################################

# Debugging aid: print the reference table to STDERR.
#
# A summary line is printed first when the counts in $numRefs and $numNotes
# differ; then one line per entry for the first $numRefs slots of
# @references, showing its number, name, use count and note text.
sub dumpRefs {
  unless ($numNotes == $numRefs) {
    printf STDERR "## %d refs; %d notes\n", $numRefs, $numNotes;
  }

  foreach my $slot (0 .. $numRefs - 1) {
    # Work on a copy of the slot so that an empty slot is not turned into
    # a record inside @references by the lookups below.
    my $entry = $references[$slot];
    my ( $name, $uses, $text ) =
      ( $entry->{'name'}, $entry->{'count'}, $entry->{'text'} );
    printf STDERR "[%2d] %-12s (%2d) %s\n", $slot + 1, $name, $uses, $text;
  }
}


##############################################################################
# Main
##############################################################################

# Program entry point.
#
#   @args - command-line arguments; $args[0] is the article to convert
#
# Parses the article, then writes the converted version to a new file whose
# name is derived from the article's: "-new" is inserted before the first
# "." that follows the last "/" ("article.txt" becomes "article-new.txt"),
# or ".new" is appended when there is no such ".".
#
# Returns 0, which the caller uses as the process exit status.  Dies with a
# usage message when no article is given.
sub main {
  my ( @args ) = @_;

  my $article = $args[0];
  if (!defined($article) || $article eq "") {
    die("Usage: $prog article.txt\n");
  }

  # First, parse the article.
  readArticle($article);

  # dumpRefs();

  # Then write the edited copy next to the original.
  my $newvers = $article;
  ($newvers =~ s{\.([^/]+)$}{-new.$1}) || ($newvers .= ".new");
  printf STDERR "## edit %s -> %s\n", $article, $newvers;
  editArticle($article, $newvers);

  return 0;
}


exit(main(@ARGV));