X-Git-Url: http://dxcluster.org/gitweb/gitweb.cgi?a=blobdiff_plain;f=perl%2FBadWords.pm;h=b598c385d4e940f45716a4f2f35c303ad9b46e1e;hb=c77ea09e91a5f7c3052e3e30dfb48fcaad1e0dcd;hp=ff9dd04c8b4e3b70a634828862420fcac1ccaa8a;hpb=8e0eef80216fbb2bca3606daf5797e39b2889d7a;p=spider.git diff --git a/perl/BadWords.pm b/perl/BadWords.pm index ff9dd04c..b598c385 100644 --- a/perl/BadWords.pm +++ b/perl/BadWords.pm @@ -12,43 +12,107 @@ use strict; use DXUtil; use DXVars; +use DXHash; +use DXDebug; + use IO::File; -use vars qw(%badwords $fn); +use vars qw($badword $regexcode); + +my $oldfn = "$main::data/badwords"; +my $regex = "$main::data/badw_regex"; +my $bwfn = "$main::data/badword"; + +# copy issue ones across +filecopy("$regex.issue", $regex) unless -e $regex; +filecopy("$bwfn.issue", $bwfn) unless -e $bwfn; + +$badword = new DXHash "badword"; -$fn = "$main::data/badwords"; -%badwords = (); +use vars qw($VERSION $BRANCH); +$VERSION = sprintf( "%d.%03d", q$Revision$ =~ /(\d+)\.(\d+)/ ); +$BRANCH = sprintf( "%d.%03d", q$Revision$ =~ /\d+\.\d+\.(\d+)\.(\d+)/ ) || 0; +$main::build += $VERSION; +$main::branch += $BRANCH; # load the badwords file sub load { my @out; - return unless -e $fn; - my $fh = new IO::File $fn; + my $fh = new IO::File $oldfn; if ($fh) { - %badwords = (); while (<$fh>) { chomp; next if /^\s*\#/; my @list = split " "; for (@list) { - $badwords{lc $_}++; + $badword->add($_); } } $fh->close; + $badword->put; + unlink $oldfn; + } + push @out, create_regex(); + return @out; +} + +sub create_regex +{ + my @out; + my $fh = new IO::File $regex; + + if ($fh) { + my $s = "sub { my \$str = shift; my \@out; \n"; + while (<$fh>) { + chomp; + next if /^\s*\#/; + my @list = split " "; + for (@list) { + # create a closure for each word so that it matches stuff with spaces/punctuation + # and repeated characters in it + my $w = uc $_; + my @l = split //, $w; + my $e = join '+[\s\W]*', @l; + $s .= "push \@out, \$1 if \$str =~ /($e)/;\n"; + } + } + $s .= "return \@out;\n}"; + $regexcode = eval $s; + dbg($s) if isdbg('badword'); + if ($@) { + @out = ($@); + dbg($@); + return @out; + } + $fh->close; } else { - my $l = "can't open $fn $!"; - dbg('err', $l); + my $l = "can't open $regex $!"; + dbg($l); push @out, $l; } + return @out; } # check the text against the badwords list sub check { - return grep { $badwords{$_} } split(/\b/, lc shift); + my $s = uc shift; + my @out; + + dbg($s) if isdbg('badword'); + push @out, &$regexcode($s) if $regexcode; + + return @out if @out; + + for (split(/\s+/, $s)) { + s/\'?S$//; + push @out, $_ if $badword->in($_); + } + + return @out; } 1;