7. improved the regex matching of badwords (more efficient, better coverage)

author minima <minima>

Thu, 4 Oct 2001 13:53:47 +0000 (13:53 +0000)

committer minima <minima>

Thu, 4 Oct 2001 13:53:47 +0000 (13:53 +0000)
author minima <minima>
Thu, 4 Oct 2001 13:53:47 +0000 (13:53 +0000)
committer minima <minima>
Thu, 4 Oct 2001 13:53:47 +0000 (13:53 +0000)
diff --git a/Changes b/Changes

index 0890d5ed22a7680d78effc70b399fb91e16cf073..57218b5ad0d0b6c73051e652081741fd67a51340 100644 (file)
--- a/Changes
+++ b/Changes
@@ -6,6 +6,9 @@ last port of call).
  4. store deleted status across restarts!
  5. make callsign checking more rigorous
  6. dup check PC49 (kill full)
  4. store deleted status across restarts!
  5. make callsign checking more rigorous
  6. dup check PC49 (kill full)
+7. improved the regex matching of badwords (more efficient, better coverage)
+8. added default badword and badw_regex tables (as .issue files) which will
+activate unless there is one there already.
  03Oct01=======================================================================
  1. don't allow @WWW to become a 'TO' field...
  2. handle @gb7tlh.#35.eu type addresses as well
  03Oct01=======================================================================
  1. don't allow @WWW to become a 'TO' field...
  2. handle @gb7tlh.#35.eu type addresses as well
diff --git a/perl/BadWords.pm b/perl/BadWords.pm

index 9814e3fadd5ef712d25315887d1582e2c509d338..36db8ffba44384d76a3bc8acbc9789d0447cbbef 100644 (file)
--- a/perl/BadWords.pm
+++ b/perl/BadWords.pm
@@ -13,11 +13,20 @@ use strict;
  use DXUtil;
  use DXVars;
  use DXHash;
  use DXUtil;
  use DXVars;
  use DXHash;
+use DXDebug;
+
  use IO::File;
  
  use IO::File;
  
-use vars qw($badword);
+use vars qw($badword @regex);
  
  my $oldfn = "$main::data/badwords";
  
  my $oldfn = "$main::data/badwords";
+my $regex = "$main::data/badw_regex";
+my $bwfn = "$main::data/badword";
+
+# copy issue ones across
+filecopy("$regex.issue", $regex) unless -e $regex;
+filecopy("$bwfn.issue", $bwfn) unless -e $bwfn;
+
  $badword = new DXHash "badword";
  
  use vars qw($VERSION $BRANCH);
  $badword = new DXHash "badword";
  
  use vars qw($VERSION $BRANCH);
@@ -30,7 +39,6 @@ $main::branch += $BRANCH;
  sub load
  {
         my @out;
  sub load
  {
         my @out;
-       return unless -e $oldfn;
         my $fh = new IO::File $oldfn;
         
         if ($fh) {
         my $fh = new IO::File $oldfn;
         
         if ($fh) {
@@ -45,11 +53,41 @@ sub load
                 $fh->close;
                 $badword->put;
                 unlink $oldfn;
                 $fh->close;
                 $badword->put;
                 unlink $oldfn;
+       }
+       push @out, create_regex(); 
+       return @out;
+}
+
+sub create_regex
+{
+       my @out;
+       @regex = ();
+       
+       my $fh = new IO::File $regex;
+       
+       if ($fh) {
+               while (<$fh>) {
+                       chomp;
+                       next if /^\s*\#/;
+                       my @list = split " ";
+                       for (@list) {
+                               # create a closure for each word so that it matches stuff with spaces/punctuation
+                               # and repeated characters in it
+                               my $w = uc $_;
+                               my @l = map { $_ eq 'I' ? '[I1]' : ($_ eq 'O' ? '[O0]' : $_) }split //, $w;
+                               my $e = join '+[\s\W]+', @l;
+                               my $s = eval qq{sub { return \$_[0] =~ /$e+/ ? '$w' : () } };
+                               push @regex, $s unless $@;
+                               dbg("create_regex: $@") if $@;
+                       }
+               }
+               $fh->close;
         } else {
         } else {
-               my $l = "can't open $oldfn $!";
+               my $l = "can't open $regex $!";
                 dbg($l);
                 push @out, $l;
         }
                 dbg($l);
                 push @out, $l;
         }
+       
         return @out;
  }
  
         return @out;
  }
  
@@ -57,32 +95,22 @@ sub load
  sub check
  {
         my $s = uc shift;
  sub check
  {
         my $s = uc shift;
+       my @out;
+       
+       for (@regex) {
+               push @out, &$_($s);
+       }
+       
+       return @out if @out;
         
         for (split(/\s+/, $s)) {
                 s/[^\w]//g;
         
         for (split(/\s+/, $s)) {
                 s/[^\w]//g;
-               return $_ if $badword->in($_);
+               push @out, $_ if $badword->in($_);
                 s/\'?S$//;
                 s/\'?S$//;
-               return $_ if $badword->in($_);
-       }
-       
-       # look for a few of the common ones with spaces and stuff
-       if ($s =~ /F[\s\W]*U[\s\W]*C[\s\W]*K/) {
-               return "FUCK";
-       } elsif ($s =~ /C[\s\W]*U[\s\W]*N[\s\W]*T/) {
-               return "CUNT";
-       } elsif ($s =~ /W[\s\W]*A[\s\W]*N[\s\W]*K/) {
-               return "WANK";
-       } elsif ($s =~ /C[\s\W]*[0O][\s\W]*C[\s\W]*K/) {
-               return "COCK";
-       } elsif ($s =~ /S[\s\W]*H[\s\W]*[I1][\s\W]*T/) {
-               return "SHIT";
-       } elsif ($s =~ /P[\s\W]*[I1][\s\W]*S[\s\W]*S/) {
-               return "PISS";
-       } elsif ($s =~ /B[\s\W]*[O0][\s\W]*L[\s\W]*L[\s\W]*[O0][\s\W]*[CK]/) {
-               return "BOLLOCKS";
+               push @out, $_ if $badword->in($_);
         }
         }
-       
-       return ();
+
+       return @out;
  }
  
  1;
  }
  
  1;
diff --git a/perl/DXUtil.pm b/perl/DXUtil.pm

index 018a404a6249baa43fe2987ff24520c25cf33d59..4246754832f4eb80fe99f7e5a77944e31b32844d 100644 (file)
--- a/perl/DXUtil.pm
+++ b/perl/DXUtil.pm
@@ -10,6 +10,7 @@ package DXUtil;
  
  use Date::Parse;
  use IO::File;
  
  use Date::Parse;
  use IO::File;
+use File::Copy;
  use Data::Dumper;
  
  use strict;
  use Data::Dumper;
  
  use strict;
@@ -26,6 +27,7 @@ require Exporter;
  @ISA = qw(Exporter);
  @EXPORT = qw(atime ztime cldate cldatetime slat slong yesno promptf 
                          parray parraypairs phex shellregex readfilestr writefilestr
  @ISA = qw(Exporter);
  @EXPORT = qw(atime ztime cldate cldatetime slat slong yesno promptf 
                          parray parraypairs phex shellregex readfilestr writefilestr
+                        filecopy
               print_all_fields cltounix unpad is_callsign is_latlong
                          is_qra is_freq is_digits is_pctext is_pcflag insertitem deleteitem
              );
               print_all_fields cltounix unpad is_callsign is_latlong
                          is_qra is_freq is_digits is_pctext is_pcflag insertitem deleteitem
              );
@@ -321,6 +323,11 @@ sub writefilestr
         }
  }
  
         }
  }
  
+sub filecopy
+{
+       copy(@_) or return $!;
+}
+
  # remove leading and trailing spaces from an input string
  sub unpad
  {
  # remove leading and trailing spaces from an input string
  sub unpad
  {
author	minima <minima>
	Thu, 4 Oct 2001 13:53:47 +0000 (13:53 +0000)
committer	minima <minima>
	Thu, 4 Oct 2001 13:53:47 +0000 (13:53 +0000)
Changes		patch \| blob \| history
perl/BadWords.pm		patch \| blob \| history
perl/DXUtil.pm		patch \| blob \| history