X-Git-Url: http://dxcluster.org/gitweb/gitweb.cgi?a=blobdiff_plain;f=perl%2FBadWords.pm;h=db33d7a1c4ebaeb34127c79bb1f5c1463dc35463;hb=985363e6626057fc4efb8b75a94a18aeeae73e54;hp=e7d1169e3a390abd5845f5922bb299eaa79cefdc;hpb=17f0b57add792391822d38116e89b33c1df4e2dd;p=spider.git diff --git a/perl/BadWords.pm b/perl/BadWords.pm index e7d1169e..db33d7a1 100644 --- a/perl/BadWords.pm +++ b/perl/BadWords.pm @@ -13,11 +13,20 @@ use strict; use DXUtil; use DXVars; use DXHash; +use DXDebug; + use IO::File; -use vars qw($badword); +use vars qw($badword @regex); my $oldfn = "$main::data/badwords"; +my $regex = "$main::data/badw_regex"; +my $bwfn = "$main::data/badword"; + +# copy issue ones across +filecopy("$regex.issue", $regex) unless -e $regex; +filecopy("$bwfn.issue", $bwfn) unless -e $bwfn; + $badword = new DXHash "badword"; use vars qw($VERSION $BRANCH); @@ -30,7 +39,6 @@ $main::branch += $BRANCH; sub load { my @out; - return unless -e $oldfn; my $fh = new IO::File $oldfn; if ($fh) { @@ -45,18 +53,62 @@ sub load $fh->close; $badword->put; unlink $oldfn; + } + push @out, create_regex(); + return @out; +} + +sub create_regex +{ + my @out; + @regex = (); + + my $fh = new IO::File $regex; + + if ($fh) { + while (<$fh>) { + chomp; + next if /^\s*\#/; + my @list = split " "; + for (@list) { + # create a closure for each word so that it matches stuff with spaces/punctuation + # and repeated characters in it + my $w = uc $_; + my @l = split //, $w; + my $e = join '+[\s\W]+', @l; + my $s = eval qq{sub { return \$_[0] =~ /$e+/ ? '$w' : () } }; + push @regex, $s unless $@; + dbg("create_regex: $@") if $@; + } + } + $fh->close; } else { - my $l = "can't open $oldfn $!"; + my $l = "can't open $regex $!"; dbg($l); push @out, $l; } + return @out; } # check the text against the badwords list sub check { - return grep { $badword->in($_) } split(/\b/, lc shift); + my $s = uc shift; + my @out; + + for (@regex) { + push @out, &$_($s); + } + + return @out if @out; + + for (split(/\s+/, $s)) { + s/\'?S$//; + push @out, $_ if $badword->in($_); + } + + return @out; } 1;