35
36:- module(content_filter,
37 [ eval_content/3 38 ]). 39:- use_module(library(porter_stem)). 40:- use_module(library(apply)). 41:- use_module(library(debug)).
49:- dynamic
50 black/1,
51 white/1,
52 wl_loaded/0.
59eval_content(Text, WordCount, Score) :-
60 read_word_lists,
61 tokenize_atom(Text, Tokens),
62 wordlist(Tokens, Words),
63 length(Words, WordCount),
64 foldl(acc_score, Words, 0-0, Score0-_Acc),
65 Score is max(-100, min(100, Score0)).
66
67acc_score(Word, V0-A0, V-A) :-
68 downcase_atom(Word, Lower),
69 score(Lower, WScore),
70 A is min(0, A0//2 + WScore),
71 V is V0+A+WScore,
72 debug(spam, '~w: ~w, ~w -> ~w', [Word, WScore, V0-A0, V-A]).
73
74score(Word, 20) :-
75 current_predicate(Word, system:_),
76 !.
77score(Word, Score) :-
78 black(Word),
79 !,
80 Score = -50.
81score(Word, Score) :-
82 white(Word),
83 !,
84 Score = 10.
85score(Word, Score) :-
86 glued_identifier(Word),
87 !,
88 Score = 15.
89score(_, -5).
90
91glued_identifier(Word) :-
92 id_part(Word),
93 !.
94glued_identifier(Word) :-
95 atom_length(Word, Len),
96 Len > 25,
97 !,
98 fail.
99glued_identifier(Word) :-
100 ( sub_atom(Word, _, _, _, '_'),
101 atomic_list_concat(Parts, '_', Word),
102 Parts = [_,_|_]
103 -> !
104 ),
105 maplist(id_part, Parts).
106glued_identifier(Word) :-
107 atom_concat(Pre, Rest, Word),
108 atom_length(Pre, L),
109 L > 2,
110 id_part(Pre),
111 glued_identifier(Rest),
112 !.
113glued_identifier(Word) :-
114 ( atom_concat(Pre, Rest, Word),
115 atom_number(Rest, _)
116 -> id_part(Pre)
117 ).
118
119id_part(Part) :-
120 atom_length(Part, 1),
121 !.
122id_part(Part) :-
123 atom_number(Part, _).
124id_part(Part) :-
125 downcase_atom(Part, Word),
126 white(Word),
127 !,
128 \+ black(Word).
136wordlist([], []).
137wordlist([H|T0], Words) :-
138 single_char(H),
139 !,
140 single_chars(T0, Chars, T),
141 ( make_word([H|Chars], Word)
142 -> Words = [Word|TW]
143 ; TW = Words
144 ),
145 wordlist(T, TW).
146wordlist([H|T], Words) :-
147 number(H),
148 !,
149 wordlist(T, Words).
150wordlist([H|T0], [H|T]) :-
151 wordlist(T0, T).
152
153single_chars([H|T0], [H|T], Rest) :-
154 single_char(H),
155 !,
156 single_chars(T0, T, Rest).
157single_chars(List, [], List).
158
159single_char(H) :-
160 atom(H),
161 !,
162 atom_length(H, 1).
163single_char(H) :-
164 integer(H),
165 between(0, 9, H).
166
167make_word(List, Word) :-
168 separated(List, _Sep, Chars),
169 wordy(Chars),
170 atomic_list_concat(Chars, Word).
171make_word(List, Word) :-
172 wordy(List),
173 !,
174 atomic_list_concat(List, Word).
175
176separated([H,Sep|T0], Sep, [H|T]) :-
177 char_type(Sep, punct),
178 separated_([Sep|T0], Sep, T).
179
180separated_([], _, []).
181separated_([Sep,H|T0], Sep, [H|T]) :-
182 separated_(T0, Sep, T).
183
184wordy(Chars) :-
185 wordy(Chars, 0, V),
186 V >= 3.
187
188wordy([H|T], V0, V) :-
189 char_type(H, alnum),
190 !,
191 V1 is V0 + 1,
192 wordy(T, V1, V).
193wordy([_|T], V0, V) :-
194 V1 is V0 - 1,
195 wordy(T, V1, V).
196
197
198 201
202read_word_lists :-
203 wl_loaded,
204 !.
205read_word_lists :-
206 with_mutex(content_filter, read_word_lists_sync).
207
208read_word_lists_sync :-
209 wl_loaded,
210 !.
211read_word_lists_sync :-
212 read_word_list(wordlist('words'), white),
213 read_word_list(wordlist('whitelist.txt'), white),
214 read_word_list(wordlist('bad-words.txt'), black),
215 read_word_list(wordlist('bad-words-google.txt'), black),
216 assertz(wl_loaded).
217
218:- multifile user:file_search_path/2. 219user:file_search_path(wordlist, '/usr/share/dict').
220user:file_search_path(wordlist, Dir) :-
221 source_file(read_word_lists, SrcFile),
222 file_directory_name(SrcFile, Dir).
228read_word_list(File, List) :-
229 absolute_file_name(File, Path, [access(read), file_errors(fail)]),
230 !,
231 setup_call_cleanup(
232 open(Path, read, In, [encoding(utf8)]),
233 ( lazy_list(lazy_read_lines(In, [as(atom)]), Words),
234 forall(member(Word, Words),
235 assert_word(List, Word))
236 ),
237 close(In)).
238read_word_list(_, _).
239
240assert_word(black, Word0) :- downcase_atom(Word0, Word), assertz(black(Word)).
241assert_word(white, Word0) :- downcase_atom(Word0, Word), assertz(white(Word))
Ban list content filter