7. improved the regex matching of badwords (more efficient, better coverage)

[spider.git] / perl / BadWords.pm
diff --git a/perl/BadWords.pm b/perl/BadWords.pm

index 9814e3fadd5ef712d25315887d1582e2c509d338..36db8ffba44384d76a3bc8acbc9789d0447cbbef 100644 (file)
--- a/perl/BadWords.pm
+++ b/perl/BadWords.pm
@@ -13,11 +13,20 @@ use strict;
  use DXUtil;
  use DXVars;
  use DXHash;
+use DXDebug;
+
  use IO::File;
  
-use vars qw($badword);
+use vars qw($badword @regex);
  
  my $oldfn = "$main::data/badwords";
+my $regex = "$main::data/badw_regex";
+my $bwfn = "$main::data/badword";
+
+# copy issue ones across
+filecopy("$regex.issue", $regex) unless -e $regex;
+filecopy("$bwfn.issue", $bwfn) unless -e $bwfn;
+
  $badword = new DXHash "badword";
  
  use vars qw($VERSION $BRANCH);
@@ -30,7 +39,6 @@ $main::branch += $BRANCH;
  sub load
  {
         my @out;
-       return unless -e $oldfn;
         my $fh = new IO::File $oldfn;
         
         if ($fh) {
@@ -45,11 +53,41 @@ sub load
                 $fh->close;
                 $badword->put;
                 unlink $oldfn;
+       }
+       push @out, create_regex(); 
+       return @out;
+}
+
+sub create_regex
+{
+       my @out;
+       @regex = ();
+       
+       my $fh = new IO::File $regex;
+       
+       if ($fh) {
+               while (<$fh>) {
+                       chomp;
+                       next if /^\s*\#/;
+                       my @list = split " ";
+                       for (@list) {
+                               # create a closure for each word so that it matches stuff with spaces/punctuation
+                               # and repeated characters in it
+                               my $w = uc $_;
+                               my @l = map { $_ eq 'I' ? '[I1]' : ($_ eq 'O' ? '[O0]' : $_) }split //, $w;
+                               my $e = join '+[\s\W]+', @l;
+                               my $s = eval qq{sub { return \$_[0] =~ /$e+/ ? '$w' : () } };
+                               push @regex, $s unless $@;
+                               dbg("create_regex: $@") if $@;
+                       }
+               }
+               $fh->close;
         } else {
-               my $l = "can't open $oldfn $!";
+               my $l = "can't open $regex $!";
                 dbg($l);
                 push @out, $l;
         }
+       
         return @out;
  }
  
@@ -57,32 +95,22 @@ sub load
  sub check
  {
         my $s = uc shift;
+       my @out;
+       
+       for (@regex) {
+               push @out, &$_($s);
+       }
+       
+       return @out if @out;
         
         for (split(/\s+/, $s)) {
                 s/[^\w]//g;
-               return $_ if $badword->in($_);
+               push @out, $_ if $badword->in($_);
                 s/\'?S$//;
-               return $_ if $badword->in($_);
-       }
-       
-       # look for a few of the common ones with spaces and stuff
-       if ($s =~ /F[\s\W]*U[\s\W]*C[\s\W]*K/) {
-               return "FUCK";
-       } elsif ($s =~ /C[\s\W]*U[\s\W]*N[\s\W]*T/) {
-               return "CUNT";
-       } elsif ($s =~ /W[\s\W]*A[\s\W]*N[\s\W]*K/) {
-               return "WANK";
-       } elsif ($s =~ /C[\s\W]*[0O][\s\W]*C[\s\W]*K/) {
-               return "COCK";
-       } elsif ($s =~ /S[\s\W]*H[\s\W]*[I1][\s\W]*T/) {
-               return "SHIT";
-       } elsif ($s =~ /P[\s\W]*[I1][\s\W]*S[\s\W]*S/) {
-               return "PISS";
-       } elsif ($s =~ /B[\s\W]*[O0][\s\W]*L[\s\W]*L[\s\W]*[O0][\s\W]*[CK]/) {
-               return "BOLLOCKS";
+               push @out, $_ if $badword->in($_);
         }
-       
-       return ();
+
+       return @out;
  }
  
  1;