User:Wizzy/badwords

#! /usr/bin/perl

use strict;
use File::Find ();
use English;

# Expect exactly two arguments: the directory tree to scan and the word list.
if (@ARGV != 2) {
    print "Usage: $0 <wikipedia directory> <badwords file>\n";
    exit 1;
}

my @dirs = ($ARGV[0]);
my $badwordsfile = $ARGV[1];

# Load the word list, one entry per line.  The list lives in the SECOND
# argument; the first argument is the directory to scan.  Three-argument
# open keeps odd characters in the file name from being read as a mode.
open(my $badwords_fh, '<', $badwordsfile)
    || die("can't open badwords file $badwordsfile: $!");
my @badwords = <$badwords_fh>;
close($badwords_fh);
chomp(@badwords);

# Number of reported hits per word-list entry, keyed by the entry exactly as
# it appears in the badwords file.
my %count;


# Compiled search patterns, keyed by the raw word-list entry, so each entry
# is compiled once instead of once per scanned line.
my %regex_for;

# Return the compiled, word-boundary-anchored pattern for one word-list entry.
# An entry written as /.../ is used as a regular expression; any other entry
# is matched literally (metacharacters quoted).
sub _badword_regex {
    my ($badword) = @_;
    return $regex_for{$badword} ||= do {
        my $body = $badword =~ m:^/(.*)/$: ? $1 : quotemeta($badword);
        qr/\b$body\b/;
    };
}

# File::Find callback.  For every file whose name ends in "html":
#   * remember the page title taken from the
#     "<title>... - Wikipedia, the free encyclopedia</title>" line;
#   * starting with the line that contains "</head>", test each line against
#     every entry in @badwords;
#   * for each hit print "<before:match:after>\ttitle" (up to 15 characters
#     of context on either side) and bump that entry's tally in %count.
# An entry that also matches the page title is skipped for that page.
# Dies if a matching file cannot be opened.
sub wanted {
    return unless /^.*html\z/s;

    my $path    = $_;
    my $title   = '';    # stays empty when the page has no recognisable title
    my $in_body = 0;

    # Three-argument open with a lexical handle: file names containing
    # leading/trailing whitespace or mode characters are opened verbatim.
    open(my $fh, '<', $path) || die("Can't open file $path: $!");

    while (my $line = <$fh>) {
        if ($line =~ m%<title>(.*)- Wikipedia, the free encyclopedia</title>%) {
            $title = $1;
        }
        $in_body = 1 if $line =~ m%</head>%;
        next if !$in_body;

        for my $badword (@badwords) {
            # An empty entry would match at every word boundary.
            next if $badword eq '';

            my $regex = _badword_regex($badword);
            next if $title =~ $regex;    # skip this if it matches the title

            if ($line =~ $regex) {
                my $prematch  = substr($PREMATCH, -15);
                my $postmatch = substr($POSTMATCH, 0, 15);
                # For a literal entry $MATCH is the entry text itself.
                print "<$prematch:$MATCH:$postmatch>\t$title\n";
                $count{$badword}++;
            }
        }
    }

    close($fh);
    return;
}

# Typeglob aliases for File::Find's package variables.  Nothing visible in
# this script reads $name, $dir or $prune; the aliases are kept unchanged in
# case something outside this file depends on them.
use vars qw/*name *dir *prune/;
*name   = *File::Find::name;
*dir    = *File::Find::dir;
*prune  = *File::Find::prune;

# Walk every requested directory tree, calling wanted() for each entry.
File::Find::find({wanted => \&wanted}, @dirs);

print "===================================\n";

# Summary: one "count<TAB>entry" line per entry that was reported, least
# frequent first.  Entries with equal counts are ordered by name so the
# output does not depend on hash ordering.
foreach my $key (sort { $count{$a} <=> $count{$b} or $a cmp $b } keys %count) {
    print "$count{$key}\t$key\n";
}

exit;