perl/BadWords.pm

   1 #
   2 # Search for bad words in strings
   3 #
   4 # Copyright (c) 2000 Dirk Koopman
   5 #
   6 # $Id$
   7 #
   8
   9 package BadWords;
  10
  11 use strict;
  12
  13 use DXUtil;
  14 use DXVars;
  15 use DXHash;
  16 use DXDebug;
  17
  18 use IO::File;
  19
  20 use vars qw($badword @regex);
  21
  22 my $oldfn = "$main::data/badwords";
  23 my $regex = "$main::data/badw_regex";
  24 my $bwfn = "$main::data/badword";
  25
  26 # copy issue ones across
  27 filecopy("$regex.issue", $regex) unless -e $regex;
  28 filecopy("$bwfn.issue", $bwfn) unless -e $bwfn;
  29
  30 $badword = new DXHash "badword";
  31
  32 use vars qw($VERSION $BRANCH);
  33 $VERSION = sprintf( "%d.%03d", q$Revision$ =~ /(\d+)\.(\d+)/ );
  34 $BRANCH = sprintf( "%d.%03d", q$Revision$ =~ /\d+\.\d+\.(\d+)\.(\d+)/ ) || 0;
  35 $main::build += $VERSION;
  36 $main::branch += $BRANCH;
  37
  38 # load the badwords file
  39 sub load
  40 {
  41         my @out;
  42         my $fh = new IO::File $oldfn;
  43
  44         if ($fh) {
  45                 while (<$fh>) {
  46                         chomp;
  47                         next if /^\s*\#/;
  48                         my @list = split " ";
  49                         for (@list) {
  50                                 $badword->add($_);
  51                         }
  52                 }
  53                 $fh->close;
  54                 $badword->put;
  55                 unlink $oldfn;
  56         }
  57         push @out, create_regex();
  58         return @out;
  59 }
  60
  61 sub create_regex
  62 {
  63         my @out;
  64         @regex = ();
  65
  66         my $fh = new IO::File $regex;
  67
  68         if ($fh) {
  69                 while (<$fh>) {
  70                         chomp;
  71                         next if /^\s*\#/;
  72                         my @list = split " ";
  73                         for (@list) {
  74                                 # create a closure for each word so that it matches stuff with spaces/punctuation
  75                                 # and repeated characters in it
  76                                 my $w = uc $_;
  77                                 my @l = map { $_ eq 'I' ? '[I1]' : ($_ eq 'O' ? '[O0]' : $_) }split //, $w;
  78                                 my $e = join '+[\s\W]+', @l;
  79                                 my $s = eval qq{sub { return \$_[0] =~ /$e+/ ? '$w' : () } };
  80                                 push @regex, $s unless $@;
  81                                 dbg("create_regex: $@") if $@;
  82                         }
  83                 }
  84                 $fh->close;
  85         } else {
  86                 my $l = "can't open $regex $!";
  87                 dbg($l);
  88                 push @out, $l;
  89         }
  90
  91         return @out;
  92 }
  93
  94 # check the text against the badwords list
  95 sub check
  96 {
  97         my $s = uc shift;
  98         my @out;
  99
 100         for (@regex) {
 101                 push @out, &$_($s);
 102         }
 103
 104         return @out if @out;
 105
 106         for (split(/\s+/, $s)) {
 107                 s/[^\w]//g;
 108                 push @out, $_ if $badword->in($_);
 109                 s/\'?S$//;
 110                 push @out, $_ if $badword->in($_);
 111         }
 112
 113         return @out;
 114 }
 115
 116 1;