7. improved the regex matching of badwords (more efficient, better coverage)
[spider.git] / perl / BadWords.pm
index 2336bb0468e22b1b95e3b517f5750c6397dd9fc2..36db8ffba44384d76a3bc8acbc9789d0447cbbef 100644 (file)
@@ -12,12 +12,22 @@ use strict;
 
 use DXUtil;
 use DXVars;
+use DXHash;
+use DXDebug;
+
 use IO::File;
 
-use vars qw(%badwords $fn);
+use vars qw($badword @regex);
+
+my $oldfn = "$main::data/badwords";
+my $regex = "$main::data/badw_regex";
+my $bwfn = "$main::data/badword";
 
-$fn = "$main::data/badwords";
-%badwords = ();
+# copy issue ones across
+filecopy("$regex.issue", $regex) unless -e $regex;
+filecopy("$bwfn.issue", $bwfn) unless -e $bwfn;
+
+$badword = new DXHash "badword";
 
 use vars qw($VERSION $BRANCH);
 $VERSION = sprintf( "%d.%03d", q$Revision$ =~ /(\d+)\.(\d+)/ );
@@ -29,32 +39,78 @@ $main::branch += $BRANCH;
 sub load
 {
        my @out;
-       return unless -e $fn;
-       my $fh = new IO::File $fn;
+       my $fh = new IO::File $oldfn;
        
        if ($fh) {
-               %badwords = ();
                while (<$fh>) {
                        chomp;
                        next if /^\s*\#/;
                        my @list = split " ";
                        for (@list) {
-                               $badwords{lc $_}++;
+                               $badword->add($_);
+                       }
+               }
+               $fh->close;
+               $badword->put;
+               unlink $oldfn;
+       }
+       push @out, create_regex(); 
+       return @out;
+}
+
+sub create_regex
+{
+       my @out;
+       @regex = ();
+       
+       my $fh = new IO::File $regex;
+       
+       if ($fh) {
+               while (<$fh>) {
+                       chomp;
+                       next if /^\s*\#/;
+                       my @list = split " ";
+                       for (@list) {
+                               # create a closure for each word so that it matches stuff with spaces/punctuation
+                               # and repeated characters in it
+                               my $w = uc $_;
+                               my @l = map { $_ eq 'I' ? '[I1]' : ($_ eq 'O' ? '[O0]' : $_) }split //, $w;
+                               my $e = join '+[\s\W]+', @l;
+                               my $s = eval qq{sub { return \$_[0] =~ /$e+/ ? '$w' : () } };
+                               push @regex, $s unless $@;
+                               dbg("create_regex: $@") if $@;
                        }
                }
                $fh->close;
        } else {
-               my $l = "can't open $fn $!";
-               dbg('err', $l);
+               my $l = "can't open $regex $!";
+               dbg($l);
                push @out, $l;
        }
+       
        return @out;
 }
 
 # check the text against the badwords list
 sub check
 {
-       return grep { $badwords{$_} } split(/\b/, lc shift);
+       my $s = uc shift;
+       my @out;
+       
+       for (@regex) {
+               push @out, &$_($s);
+       }
+       
+       return @out if @out;
+       
+       for (split(/\s+/, $s)) {
+               s/[^\w]//g;
+               push @out, $_ if $badword->in($_);
+               s/\'?S$//;
+               push @out, $_ if $badword->in($_);
+       }
+
+       return @out;
 }
 
 1;