Wikipedia:WikiProject Astronomical objects/Stub processing/tools/stub triage perl script

Originally written by User:Christopher Thomas. By posting it here, I recognize and acknowledge its release under appropriate Wikipedia licenses. --Christopher Thomas (talk) 19:27, 22 January 2012 (UTC)

#!/usr/bin/perl
#
# Stub Triage Script - Triage Processing
# Written by Christopher Thomas per WT:ASTRO thread discussion.
#
# Usage:  ProcessStubs.pl <page list> <output filename>
#
# This script examines a series of Wikipedia pages and produces a
# wiki-markup table listing the entries and indicating which are stubs.
# Auxiliary information (presence of references, infoboxes) is also 
# included.
#
# This is intended to be made more flexible in the future. Right now, all 
# examination is hard-coded.
#
# The page list must be in the format produced by GetStubList.pl.
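# Each list line is assumed to look like (illustrative example):
#   /wiki/Example_article   Example article
# i.e. a "/wiki/..." path, whitespace, then the page's display label.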
#
# Long lists are split into many smaller tables, all collapsible.
#
# This script worked in January 2012. Wiki changes may break it later!
#


#
# Includes
#

use strict;



#
# Constants
#


# Various magic values.

# Max number of entries per table.
my ($tsize);
$tsize = 100;

# Character count threshold for being "short".
# FIXME - This is a very mushy boundary!
my ($shortsize);
$shortsize = 400;



#
# Functions
#


# Displays a help screen.
# No arguments.
# No return value.

sub PrintHelp
{
  print << "Endofblock"

Stub Triage Script - Triage Processing
Written by Christopher Thomas per WT:ASTRO thread discussion.

Usage:  ProcessStubs.pl <page list> <output filename>

This script examines a series of Wikipedia pages and produces a
wiki-markup table listing the entries and indicating which are stubs.
Auxiliary information (presence of references, infoboxes) is also 
included.

This is intended to be made more flexible in the future. Right now, all 
examination is hard-coded.

The page list must be in the format produced by GetStubList.pl.

Long lists are split into many smaller tables, all collapsible.

This script worked in January 2012. Wiki changes may break it later!

Endofblock
}



# Constructs a table header.
# FIXME - Lots of magic in here.
# Arg 0 is the index of the first entry in the table.
# Returns the string to be emitted.

sub MakeTableHeader
{
  my ($nidx);
  my ($result);

  # Process args.

  $nidx = $_[0];

  if (!(defined $nidx))
  {
    print "### [MakeTableHeader]  Bad arguments.\n";

    # Pick something to emit.
    $nidx = '-bogus-';
  }


  $result = "\n".'{| class="wikitable collapsible collapsed"'."\n"
    . "|-\n"
    . "! colspan=7 | Stubs starting from item $nidx\n"
    . "|-\n"
    . "! Reviewed\n"
    . "! Article\n"
    . "! Length\n"
    . "! Refs\n"
    . "! ExLinks\n"
    . "! Infobox\n"
    . "! JPL\n"
    ;

  # Done.
  return $result;
}



# Constructs a table footer.
# No arguments.
# Returns the string to be emitted.

sub MakeTableFooter
{
  my ($result);

  $result = "|}\n\n";

  return $result;
}



# Builds a statistics line for a given article URL.
# FIXME - Lots of magic in here.
# Arg 0 is the URL/label pair string from the page list.
# Returns the string to be emitted for this table row.

sub MakeStatLineForURL
{
  my ($nstring, $result);
  my ($url, $name);
  my ($pstats_p);


  $nstring = $_[0];
  $result = "";


  if (!(defined $nstring))
  {
    print "### [MakeStatLineForURL]  Bad arguments.\n";
  }
  elsif (!($nstring =~ m/^(\S*)\s+(.*\S)/))
  {
    print "### [MakeStatLineForURL]  Unable to parse URL/label string.\n";
  }
  else
  {
    #
    # Process name and URL.


    $url = $1;
    $name = $2;

    # Complete the URL. It starts with "/wiki" now.
    $url = 'http://en.wikipedia.org' . $url;


    # FIXME - Non-English characters will be mangled, so extract the
    # true name from the URL if possible.
    if ($url =~ m/wikipedia\.org\/wiki\/(.*\S)/)
    {
      $name = $1;
      $name =~ s/_/\ /g;
    }
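    # Note: percent-escapes ("%xx") in the URL are not decoded here, so
    # some extracted names may still contain escape sequences.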

# FIXME - Diagnostics
#print "Name \"$name\", URL \"$url\".\n";


    #
    # Get information hash for this page.

    $pstats_p = {};

    ComputePageStats($url, $pstats_p);


    #
    # Emit table row.

    $result = "|-\n"
      . '| <!-- Add "tick", "cross", or other template here. -->'."\n"
      . '| {{article|' . $name . '}}' . "\n"
      . '| ' . $$pstats_p{length} . "\n"
      . '| ' . $$pstats_p{refcount} . "\n"
      . '| ' . $$pstats_p{excount} . "\n"
      . '| ' . $$pstats_p{hasinfo} . "\n"
      . '| ' . $$pstats_p{jpl} . "\n"
      ;
  }


  return $result;
}



# Fetches Wikipedia markup source for a given page URL.
# Arg 0 is the URL used to view the page.
# Arg 1 points to an array to store source in.
# No return value.
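# Relies on the "lynx" text browser being installed and on the PATH; the
# markup is scraped from the edit form's "wpTextbox1" textarea.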

sub FetchWikiSource
{
  my ($url, $src_p);
  my (@rawdata, $ridx, $sidx, $thisline);
  my ($insource, $done);

  $url = $_[0];
  $src_p = $_[1];

  if (!( (defined $url) && (defined $src_p) ))
  {
    print "### [FetchWikiSource]  Bad arguments!\n";
  }
  else
  {
    # No matter what, delay so that we don't hammer the wiki.
    sleep(1);

    # Initialize.
    @rawdata = ();

    # Turn this into an "edit page" URL, and fetch it.

    if ($url =~ m/wiki\/(\S+)/)
    {
      $url = 'http://en.wikipedia.org/w/index.php?title='
        . $1 . '&action=edit';

      # FIXME - Doing this the messy but easy way.
      @rawdata = `lynx --source \"$url\"`;
    }


    # We now have either a blank array (on failure) or a raw html array.
    # Scan for useful information.

    $insource = 0;
    $done = 0;
    $sidx = 0;
    @$src_p = ();

    for ($ridx = 0;
      (!$done) && (defined ($thisline = $rawdata[$ridx]));
      $ridx++)
    {
      if ($insource)
      {
        # Looking for the end of the wiki markup textarea.
        # Saving everything in the meantime.

        if ($thisline =~ m/^(.*)\<\/textarea/i)
        {
          $thisline = $1;
          $insource = 0;
          $done = 1;

          if ($thisline =~ m/\S/)
          {
            $$src_p[$sidx] = $thisline;
            $sidx++;
          }
        }
        # FIXME - Force sanity.
        elsif ($thisline =~ m/\<\/textarea/i)
        {
          $insource = 0;
          $done = 1;

          print "### Un-caught end of text area (shouldn't happen).\n";
        }
        else
        {
          $$src_p[$sidx] = $thisline;
          $sidx++;
        }
      }
      else
      {
        # Looking for the wiki markup textarea.

        if ($thisline =~ m/\<textarea .* name=\"wpTextbox1\"\>(.*)/i)
        {
          $thisline = $1;
          $insource = 1;

          if ($thisline =~ m/\S/)
          {
            $$src_p[$sidx] = $thisline;
            $sidx++;
          }
        }
      }
    }
  }

  # Done.
}



# Fetches a wikipedia page and computes its stub-related statistics.
# FIXME - Lots of magic in here.
# Arg 0 is the URL to fetch (complete).
# Arg 1 points to a hash to store statistics in.
# No return value.

sub ComputePageStats
{
  my ($url, $stats_p);
  my (@pagedata, $thisline, $lidx);
  my ($state);
  my ($charcount, $refcount, $excount, $hasinfo, $jplurl);

  $url = $_[0];
  $stats_p = $_[1];

  if (!( (defined $url) && (defined $stats_p) ))
  {
    print "### [ComputePageStats]  Bad arguments.\n";
  }
  else
  {
# FIXME - Diagnostics.
print "Fetching \"$url\".\n";

    # Fetch wikipedia markup source for this page.
    @pagedata = ();
    FetchWikiSource($url, \@pagedata);

    # Initialize stats.
    $charcount = 0;
    $refcount = 0;
    $excount = 0;
    $hasinfo = 0;
    $jplurl = undef;


    #
    # Crawl through the page, updating statistics.

    # FIXME - This is really fragile!
    # Among other things, it'll choke on nested infoboxes and
    # templates or links that are split across lines.

    # Fortunately, the mass-created articles tend to be well-formed.
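
    # Parser states:
    #   'top'     - article body; count characters and close-ref tags
    #   'infobox' - inside an infobox; skip until the closing "}}"
    #   'refs'    - References section; contents ignored
    #   'also'    - "See also" section; contents ignored
    #   'links'   - External links section; count "[http" entries
    #   'done'    - first category link seen; stop scanning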

    $state = 'top';

    for ($lidx = 0;
      ($state ne 'done') && (defined ($thisline = $pagedata[$lidx]));
      $lidx++)
    {
      # No matter what state we're in, flag JPL URLs.
      # We have to do this before eating templates, as they're often
      # within {{cite}} templates.

      if ($thisline =~ m/(http:\/\/ssd\.jpl\.nasa\.gov\S+)/i)
      {
        # FIXME - Overwrite any previous JPL URLs.
        $jplurl = $1;


        # Clip pipes or end braces.

        if ($jplurl =~ m/(.*?)\|/)
        {
          $jplurl = $1;
        }

        if ($jplurl =~ m/(.*?)\]/)
        {
          $jplurl = $1;
        }


        # URL should be trimmed now.
      }


      # FIXME - Eat any single-line template.
      # There are way too many of these, and they break infobox
      # recognition.
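      # Each pass removes one "{{...}}" span from the line; as the FIXME
      # above notes, templates nested on a single line may not survive this.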

      while ($thisline =~ m/(.*)(\{\{[^{]+\}\})(.*)/)
      {
        $thisline = $1 . $3;

# FIXME - Diagnostics.
#print "Pruning \"$2\".\n";
      }


      # Take action depending on state.

      if ('top' eq $state)
      {
        # At the top level.
        # We're either seeing content, or the start of a different type
        # of section.

        if ($thisline =~ m/\{\{infobox/i)
        {
          $hasinfo = 1;

          $state = 'infobox';
        }
        elsif ($thisline =~ m/==\s*references/i)
        {
          $state = 'refs';
        }
        elsif ($thisline =~ m/==\s*see also/i)
        {
          $state = 'also';
        }
        elsif ($thisline =~ m/==\s*external links/i)
        {
          $state = 'links';
        }
        elsif ($thisline =~ m/\[\[Category\:/i)
        {
          $state = 'done';
        }
        else
        {
          # This seems to be content.

# FIXME - Emit content, for debugging.
#print "-- $thisline"; # Already has a newline.

          # Count characters.
          if ($thisline =~ m/(\S.*\S)/)
          {
            $charcount += length($1);
          }

          # Make note of references.
          # Count close-ref tags to get a more accurate count.
          # FIXME - HTML source seems to turn < into "&lt;".
          if ($thisline =~ m/\/ref\>/i)
          {
            $refcount++;
          }
        }
      }
      elsif ('infobox' eq $state)
      {
        # We don't care what's in the infobox; just when it ends.

        if ($thisline =~ m/\}\}/)
        {
          $state = 'top';
        }
      }
      elsif ('refs' eq $state)
      {
        # We don't care what's in the references section.
        # It should just be a "{{reflist}}" template.

        if ($thisline =~ m/==\s*external links/i)
        {
          $state = 'links';
        }
        elsif ($thisline =~ m/==\s*see also/i)
        {
          $state = 'also';
        }
        elsif ($thisline =~ m/\[\[Category\:/i)
        {
          $state = 'done';
        }
      }
      elsif ('also' eq $state)
      {
        # We don't care what's in the "see also" section.
        # In theory it's content, in practice it bloats the stats.

        if ($thisline =~ m/==\s*external links/i)
        {
          $state = 'links';
        }
        elsif ($thisline =~ m/==\s*references/i)
        {
          $state = 'refs';
        }
        elsif ($thisline =~ m/\[\[Category\:/i)
        {
          $state = 'done';
        }
      }
      elsif ('links' eq $state)
      {
        # Look for URLs in this section.

        if ($thisline =~ m/==\s*references/i)
        {
          $state = 'refs';
        }
        elsif ($thisline =~ m/==\s*see also/i)
        {
          $state = 'also';
        }
        elsif ($thisline =~ m/\[\[Category\:/i)
        {
          $state = 'done';
        }
        elsif ($thisline =~ m/\[http/i)
        {
          $excount++;
        }
      }
      else
      {
        print "### [ComputePageStats]  Bogus state \"$state\".\n";
        $state = 'done';
      }
    }



    #
    # Save statistics.
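    # Length gets "{{tick}}" plus the count above the short threshold, or a
    # "{{warnsign}}" at or below it; the JPL column is a bare external link
    # when a JPL URL was found, and "{{cross}}" otherwise.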

    $$stats_p{length} = '{{tick}}' . $charcount;
    if ($charcount <= $shortsize)
    {
      $$stats_p{length} = '{{warnsign|' . $charcount . '}}';
    }

    $$stats_p{refcount} = $refcount;
    $$stats_p{excount} = $excount;

    $$stats_p{hasinfo} = 'N';
    if ($hasinfo)
    {
      $$stats_p{hasinfo} = 'Y';
    }

    $$stats_p{jpl} = '{{cross}}';
    if (defined $jplurl)
    {
      $$stats_p{jpl} = '['. $jplurl . ']';
    }


    # Done.
  }
}



#
# Main Program
#

my ($lname, $oname);
my ($thisname, $ncount, $nidx, $intable);


$lname = $ARGV[0];
$oname = $ARGV[1];

if ( (!(defined $lname)) || (!(defined $oname)) || (defined $ARGV[2]) )
{
  PrintHelp();
}
elsif (!open(NFILE, "<$lname"))
{
  print "### Unable to read from \"$lname\".\n";
}
else
{
  if (!open(OFILE, ">$oname"))
  {
    print "### Unable to write to \"$oname\".\n";
  }
  else
  {
    # Walk through the names file, processing pages.

    $ncount = 0;
    $intable = 0;

    while (defined ($thisname = <NFILE>))
    {
      $ncount++;


      # Emit this line.
      # Start a new table if necessary.

      if (!$intable)
      {
        # Diagnostics.
        print "-- Starting table at entry $ncount.\n";

        print OFILE MakeTableHeader($ncount);

        $intable = 1;
      }

      print OFILE MakeStatLineForURL($thisname);


      # End the table if it's reached the size limit.

      $nidx = $ncount % $tsize;

      if (0 == $nidx)
      {
        # Sanity.
        if (!$intable)
        {
          print "### Ending a table we didn't start? (count = $ncount)\n";
        }

        print OFILE MakeTableFooter();

        $intable = 0;
      }
    }


    # We've finished processing the names list.
    # Print a footer if we have to.

    if ($intable)
    {
      print OFILE MakeTableFooter();

      $intable = 0;
    }


    # Diagnostics.
    print "-- Done.\n";


    # Close the output file no matter what.
    close(OFILE);
  }

  # Close the names file no matter what.
  close(NFILE);
}
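
# Example invocation (hypothetical filenames):
#   ./ProcessStubs.pl stublist.txt stub_tables.txt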



#
# This is the end of the file.
#