7. improved the regex matching of badwords (more efficient, better coverage)

author minima <minima>

Thu, 4 Oct 2001 13:53:47 +0000 (13:53 +0000)

committer minima <minima>

Thu, 4 Oct 2001 13:53:47 +0000 (13:53 +0000)
author minima <minima>
Thu, 4 Oct 2001 13:53:47 +0000 (13:53 +0000)
committer minima <minima>
Thu, 4 Oct 2001 13:53:47 +0000 (13:53 +0000)
diff --git a/Changes b/Changes

index 0890d5ed22a7680d78effc70b399fb91e16cf073..57218b5ad0d0b6c73051e652081741fd67a51340 100644 (file)
--- a/Changes
+++ b/Changes
@@ -6,6 +6,9 @@ last port of call).
  4. store deleted status across restarts!
  5. make callsign checking more rigorous
  6. dup check PC49 (kill full)
+7. improved the regex matching of badwords (more efficient, better coverage)
+8. added default badword and badw_regex tables (as .issue files) which will
+activate unless there is one there already.
  03Oct01=======================================================================
  1. don't allow @WWW to become a 'TO' field...
  2. handle @gb7tlh.#35.eu type addresses as well
diff --git a/perl/BadWords.pm b/perl/BadWords.pm

index 9814e3fadd5ef712d25315887d1582e2c509d338..36db8ffba44384d76a3bc8acbc9789d0447cbbef 100644 (file)
--- a/perl/BadWords.pm
+++ b/perl/BadWords.pm
@@ -13,11 +13,20 @@ use strict;
  use DXUtil;
  use DXVars;
  use DXHash;
+use DXDebug;
+
  use IO::File;
  
-use vars qw($badword);
+use vars qw($badword @regex);
  
  my $oldfn = "$main::data/badwords";
+my $regex = "$main::data/badw_regex";
+my $bwfn = "$main::data/badword";
+
+# copy issue ones across
+filecopy("$regex.issue", $regex) unless -e $regex;
+filecopy("$bwfn.issue", $bwfn) unless -e $bwfn;
+
  $badword = new DXHash "badword";
  
  use vars qw($VERSION $BRANCH);
@@ -30,7 +39,6 @@ $main::branch += $BRANCH;
  sub load
  {
         my @out;
-       return unless -e $oldfn;
         my $fh = new IO::File $oldfn;
         
         if ($fh) {
@@ -45,11 +53,41 @@ sub load
                 $fh->close;
                 $badword->put;
                 unlink $oldfn;
+       }
+       push @out, create_regex(); 
+       return @out;
+}
+
+sub create_regex
+{
+       my @out;
+       @regex = ();
+       
+       my $fh = new IO::File $regex;
+       
+       if ($fh) {
+               while (<$fh>) {
+                       chomp;
+                       next if /^\s*\#/;
+                       my @list = split " ";
+                       for (@list) {
+                               # create a closure for each word so that it matches stuff with spaces/punctuation
+                               # and repeated characters in it
+                               my $w = uc $_;
+                               my @l = map { $_ eq 'I' ? '[I1]' : ($_ eq 'O' ? '[O0]' : $_) }split //, $w;
+                               my $e = join '+[\s\W]+', @l;
+                               my $s = eval qq{sub { return \$_[0] =~ /$e+/ ? '$w' : () } };
+                               push @regex, $s unless $@;
+                               dbg("create_regex: $@") if $@;
+                       }
+               }
+               $fh->close;
         } else {
-               my $l = "can't open $oldfn $!";
+               my $l = "can't open $regex $!";
                 dbg($l);
                 push @out, $l;
         }
+       
         return @out;
  }
  
@@ -57,32 +95,22 @@ sub load
  sub check
  {
         my $s = uc shift;
+       my @out;
+       
+       for (@regex) {
+               push @out, &$_($s);
+       }
+       
+       return @out if @out;
         
         for (split(/\s+/, $s)) {
                 s/[^\w]//g;
-               return $_ if $badword->in($_);
+               push @out, $_ if $badword->in($_);
                 s/\'?S$//;
-               return $_ if $badword->in($_);
-       }
-       
-       # look for a few of the common ones with spaces and stuff
-       if ($s =~ /F[\s\W]*U[\s\W]*C[\s\W]*K/) {
-               return "FUCK";
-       } elsif ($s =~ /C[\s\W]*U[\s\W]*N[\s\W]*T/) {
-               return "CUNT";
-       } elsif ($s =~ /W[\s\W]*A[\s\W]*N[\s\W]*K/) {
-               return "WANK";
-       } elsif ($s =~ /C[\s\W]*[0O][\s\W]*C[\s\W]*K/) {
-               return "COCK";
-       } elsif ($s =~ /S[\s\W]*H[\s\W]*[I1][\s\W]*T/) {
-               return "SHIT";
-       } elsif ($s =~ /P[\s\W]*[I1][\s\W]*S[\s\W]*S/) {
-               return "PISS";
-       } elsif ($s =~ /B[\s\W]*[O0][\s\W]*L[\s\W]*L[\s\W]*[O0][\s\W]*[CK]/) {
-               return "BOLLOCKS";
+               push @out, $_ if $badword->in($_);
         }
-       
-       return ();
+
+       return @out;
  }
  
  1;
diff --git a/perl/DXUtil.pm b/perl/DXUtil.pm

index 018a404a6249baa43fe2987ff24520c25cf33d59..4246754832f4eb80fe99f7e5a77944e31b32844d 100644 (file)
--- a/perl/DXUtil.pm
+++ b/perl/DXUtil.pm
@@ -10,6 +10,7 @@ package DXUtil;
  
  use Date::Parse;
  use IO::File;
+use File::Copy;
  use Data::Dumper;
  
  use strict;
@@ -26,6 +27,7 @@ require Exporter;
  @ISA = qw(Exporter);
  @EXPORT = qw(atime ztime cldate cldatetime slat slong yesno promptf 
                          parray parraypairs phex shellregex readfilestr writefilestr
+                        filecopy
               print_all_fields cltounix unpad is_callsign is_latlong
                          is_qra is_freq is_digits is_pctext is_pcflag insertitem deleteitem
              );
@@ -321,6 +323,11 @@ sub writefilestr
         }
  }
  
+sub filecopy
+{
+       copy(@_) or return $!;
+}
+
  # remove leading and trailing spaces from an input string
  sub unpad
  {
author	minima <minima>
	Thu, 4 Oct 2001 13:53:47 +0000 (13:53 +0000)
committer	minima <minima>
	Thu, 4 Oct 2001 13:53:47 +0000 (13:53 +0000)
Changes		patch \| blob \| history
perl/BadWords.pm		patch \| blob \| history
perl/DXUtil.pm		patch \| blob \| history