#! /usr/bin/perl
use strict;
use File::Find ();
use English;
# Command-line handling: exactly two arguments are required -- the directory
# tree to scan and the file that lists the bad words.
if (@ARGV != 2) {
    print "Usage: $0 <wikipedia directory> <badwords file>\n";
    exit 1;
}
my @dirs = ($ARGV[0]);
my $badwordsfile = $ARGV[1];

# Load the bad-words list, one entry per line.  The list must be read from
# the second argument; the first argument is the directory to scan.
# Three-argument open with a lexical handle keeps characters in the file name
# from being interpreted as an open mode.
open(my $badwords_fh, '<', $badwordsfile)
    or die("can't open badwords file $badwordsfile: $!");
my @badwords = <$badwords_fh>;
close($badwords_fh);
chomp(@badwords);

# Drop blank lines: an empty entry carries no word to look for.
@badwords = grep { length } @badwords;

# Number of matching lines recorded per badwords entry; filled in by wanted().
my %count;
# Cache of compiled matchers keyed by badwords entry, so each entry's regex is
# built once instead of once per line of every file.
my %regex_for;

# Return the compiled regex for one badwords entry.
# An entry written as /.../ is used as a regular expression; any other entry
# is matched as literal text.  Either way the match must begin and end at a
# word boundary.
sub _badword_regex {
    my ($badword) = @_;
    if (!exists $regex_for{$badword}) {
        if ($badword =~ m:^/(.*)/$:) {
            my $pattern = $1;
            # Group the pattern so both \b anchors apply to every alternative
            # of an entry such as /foo|bar/.
            $regex_for{$badword} = qr/\b(?:$pattern)\b/;
        } else {
            # \E is required here: without it \Q also quotes the trailing \b,
            # turning it into a literal backslash followed by "b".
            $regex_for{$badword} = qr/\b\Q$badword\E\b/;
        }
    }
    return $regex_for{$badword};
}

# File::Find callback, invoked with $_ set to the current entry's name.
# For every file whose name ends in "html", scan the lines from the one
# containing </head> onwards for each entry in @badwords.  A hit prints
#   <up to 15 chars before:matched text:up to 15 chars after>\t<title>
# and increments $count{$badword}.  Entries that also match the page title
# are skipped for that page.  Dies if a matching file cannot be opened.
sub wanted {
    return unless /^.*html\z/s;

    my $file = $_;
    # Start with an empty title so pages without a recognised <title> line
    # are handled without touching an undefined value.
    my $title = '';
    my $endhead = 0;

    open(my $fh, '<', $file) or die("Can't open file $file: $!");
    while (my $line = <$fh>) {
        if ($line =~ m%<title>(.*)- Wikipedia, the free encyclopedia</title>%) {
            $title = $1;
        }
        if ($line =~ m%</head>%) {
            $endhead = 1;
        }
        # Ignore everything before the end of the document head.
        next if !$endhead;

        for my $badword (@badwords) {
            my $regex = _badword_regex($badword);
            next if $title =~ $regex; # skip this if it matches the title
            if ($line =~ $regex) {
                # Show at most 15 characters of context on each side.
                my $prematch = substr($PREMATCH, -15);
                my $postmatch = substr($POSTMATCH, 0, 15);
                # For a literal entry $MATCH is exactly the entry itself.
                print "<$prematch:$MATCH:$postmatch>\t$title\n";
                $count{$badword}++;
            }
        }
    }
    close($fh);
    return;
}
# for the convenience of &wanted calls, including -eval statements:
# (aliases File::Find's $name, $dir and $prune package variables into this
# package so they can be referred to without the File::Find:: prefix)
use vars qw/*name *dir *prune/;
*name = *File::Find::name;
*dir = *File::Find::dir;
*prune = *File::Find::prune;
# Traverse desired filesystems
File::Find::find({wanted => \&wanted}, @dirs);
print "===================================\n";
# Report the tally in ascending order of hit count.  Entries with equal
# counts are ordered by name so the report is identical from run to run
# instead of following the hash's internal key order.
foreach my $key (sort { $count{$a} <=> $count{$b} or $a cmp $b } keys %count) {
    print "$count{$key}\t$key\n";
}
exit;