]> dxcluster.org Git - spider.git/blob - perl/BadWords.pm
add cmd and text xml handlers
[spider.git] / perl / BadWords.pm
1 #
2 # Search for bad words in strings
3 #
4 # Copyright (c) 2000 Dirk Koopman
5 #
6 # $Id$
7 #
8
9 package BadWords;
10
11 use strict;
12
13 use DXUtil;
14 use DXVars;
15 use DXHash;
16 use DXDebug;
17
18 use IO::File;
19
20 use vars qw($badword $regexcode);
21
22 my $oldfn = "$main::data/badwords";
23 my $regex = "$main::data/badw_regex";
24 my $bwfn = "$main::data/badword";
25
26 # copy issue ones across
27 filecopy("$regex.gb.issue", $regex) unless -e $regex;
28 filecopy("$bwfn.issue", $bwfn) unless -e $bwfn;
29
30 $badword = new DXHash "badword";
31
32 use vars qw($VERSION $BRANCH);
33 $VERSION = sprintf( "%d.%03d", q$Revision$ =~ /(\d+)\.(\d+)/ );
34 $BRANCH = sprintf( "%d.%03d", q$Revision$ =~ /\d+\.\d+\.(\d+)\.(\d+)/  || (0,0));
35 $main::build += $VERSION;
36 $main::branch += $BRANCH;
37
38 # load the badwords file
39 sub load
40 {
41         my @out;
42         my $fh = new IO::File $oldfn;
43         
44         if ($fh) {
45                 while (<$fh>) {
46                         chomp;
47                         next if /^\s*\#/;
48                         my @list = split " ";
49                         for (@list) {
50                                 $badword->add($_);
51                         }
52                 }
53                 $fh->close;
54                 $badword->put;
55                 unlink $oldfn;
56         }
57         push @out, create_regex(); 
58         return @out;
59 }
60
61 sub create_regex
62 {
63         my @out;
64         my $fh = new IO::File $regex;
65         
66         if ($fh) {
67                 my $s = "sub { my \$str = shift; my \@out; \n";
68                 while (<$fh>) {
69                         chomp;
70                         next if /^\s*\#/;
71                         my @list = split " ";
72                         for (@list) {
73                                 # create a closure for each word so that it matches stuff with spaces/punctuation
74                                 # and repeated characters in it
75                                 my $w = uc $_;
76                                 my @l = split //, $w;
77                                 my $e = join '+[\s\W]*', @l;
78                                 $s .= "push \@out, \$1 if \$str =~ /\\b($e)/;\n";
79                         }
80                 }
81                 $s .= "return \@out;\n}";
82                 $regexcode = eval $s;
83                 dbg($s) if isdbg('badword');
84                 if ($@) {
85                         @out = ($@);
86                         dbg($@);
87                         return @out;
88                 }
89                 $fh->close;
90         } else {
91                 my $l = "can't open $regex $!";
92                 dbg($l);
93                 push @out, $l;
94         }
95         
96         return @out;
97 }
98
99 # check the text against the badwords list
100 sub check
101 {
102         my $s = uc shift;
103         my @out;
104
105         push @out, &$regexcode($s) if $regexcode;
106         
107         return @out if @out;
108         
109         for (split(/\b/, $s)) {
110                 push @out, $_ if $badword->in($_);
111         }
112
113         return @out;
114 }
115
116 1;