View source with raw comments or as raw
    1/*  Part of SWISH
    2
    3    Author:        Jan Wielemaker
    4    E-mail:        J.Wielemaker@cs.vu.nl
    5    WWW:           http://www.swi-prolog.org
    6    Copyright (C): 2017, VU University Amsterdam
    7			 CWI Amsterdam
    8    All rights reserved.
    9
   10    Redistribution and use in source and binary forms, with or without
   11    modification, are permitted provided that the following conditions
   12    are met:
   13
   14    1. Redistributions of source code must retain the above copyright
   15       notice, this list of conditions and the following disclaimer.
   16
   17    2. Redistributions in binary form must reproduce the above copyright
   18       notice, this list of conditions and the following disclaimer in
   19       the documentation and/or other materials provided with the
   20       distribution.
   21
   22    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   23    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   24    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   25    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
   26    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   27    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   28    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   29    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   30    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   31    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
   32    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   33    POSSIBILITY OF SUCH DAMAGE.
   34*/
   35
   36:- module(content_filter,
   37          [ eval_content/3              % +Text, -Wordcount, -Score
   38          ]).   39:- use_module(library(porter_stem)).   40:- use_module(library(apply)).   41:- use_module(library(debug)).

Ban list content filter

See also
- https://www.cs.cmu.edu/~biglou/resources/bad-words.txt
- https://www.freewebheaders.com/full-list-of-bad-words-banned-by-google/ */
   49:- dynamic
   50    black/1,
   51    white/1,
   52    wl_loaded/0.
 eval_content(+Text, -WordCount, -Score) is det
Evaluate the content of Text. WordCount is the number of non-trivial words and Score is the evaluation.
   59eval_content(Text, WordCount, Score) :-
   60    read_word_lists,
   61    tokenize_atom(Text, Tokens),
   62    wordlist(Tokens, Words),
   63    length(Words, WordCount),
   64    foldl(acc_score, Words, 0-0, Score0-_Acc),
   65    Score is max(-100, min(100, Score0)).
   66
   67acc_score(Word, V0-A0, V-A) :-
   68    downcase_atom(Word, Lower),
   69    score(Lower, WScore),
   70    A is min(0, A0//2 + WScore),
   71    V is V0+A+WScore,
   72    debug(spam, '~w: ~w, ~w -> ~w', [Word, WScore, V0-A0, V-A]).
   73
   74score(Word, 20) :-
   75    current_predicate(Word, system:_),
   76    !.
   77score(Word, Score) :-
   78    black(Word),
   79    !,
   80    Score = -50.
   81score(Word, Score) :-
   82    white(Word),
   83    !,
   84    Score = 10.
   85score(Word, Score) :-
   86    glued_identifier(Word),
   87    !,
   88    Score = 15.
   89score(_, -5).
   90
   91glued_identifier(Word) :-
   92    id_part(Word),
   93    !.
   94glued_identifier(Word) :-
   95    atom_length(Word, Len),
   96    Len > 25,
   97    !,
   98    fail.
   99glued_identifier(Word) :-
  100    (   sub_atom(Word, _, _, _, '_'),
  101        atomic_list_concat(Parts, '_', Word),
  102        Parts = [_,_|_]
  103    ->  !
  104    ),
  105    maplist(id_part, Parts).
  106glued_identifier(Word) :-
  107    atom_concat(Pre, Rest, Word),
  108    atom_length(Pre, L),
  109    L > 2,
  110    id_part(Pre),
  111    glued_identifier(Rest),
  112    !.
  113glued_identifier(Word) :-
  114    (   atom_concat(Pre, Rest, Word),
  115        atom_number(Rest, _)
  116    ->  id_part(Pre)
  117    ).
  118
  119id_part(Part) :-
  120    atom_length(Part, 1),
  121    !.
  122id_part(Part) :-
  123    atom_number(Part, _).
  124id_part(Part) :-
  125    downcase_atom(Part, Word),
  126    white(Word),
  127    !,
  128    \+ black(Word).
 wordlist(+Tokens, -WordList) is det
Filter the token list. Removes numbers and joins typical escape patterns such as 't h i s' or 't.h.i.s'.
  136wordlist([], []).
  137wordlist([H|T0], Words) :-
  138    single_char(H),
  139    !,
  140    single_chars(T0, Chars, T),
  141    (   make_word([H|Chars], Word)
  142    ->  Words = [Word|TW]
  143    ;   TW = Words
  144    ),
  145    wordlist(T, TW).
  146wordlist([H|T], Words) :-
  147    number(H),
  148    !,
  149    wordlist(T, Words).
  150wordlist([H|T0], [H|T]) :-
  151    wordlist(T0, T).
  152
  153single_chars([H|T0], [H|T], Rest) :-
  154    single_char(H),
  155    !,
  156    single_chars(T0, T, Rest).
  157single_chars(List, [], List).
  158
  159single_char(H) :-
  160    atom(H),
  161    !,
  162    atom_length(H, 1).
  163single_char(H) :-
  164    integer(H),
  165    between(0, 9, H).
  166
  167make_word(List, Word) :-
  168    separated(List, _Sep, Chars),
  169    wordy(Chars),
  170    atomic_list_concat(Chars, Word).
  171make_word(List, Word) :-
  172    wordy(List),
  173    !,
  174    atomic_list_concat(List, Word).
  175
  176separated([H,Sep|T0], Sep, [H|T]) :-
  177    char_type(Sep, punct),
  178    separated_([Sep|T0], Sep, T).
  179
  180separated_([], _, []).
  181separated_([Sep,H|T0], Sep, [H|T]) :-
  182    separated_(T0, Sep, T).
  183
  184wordy(Chars) :-
  185    wordy(Chars, 0, V),
  186    V >= 3.
  187
  188wordy([H|T], V0, V) :-
  189    char_type(H, alnum),
  190    !,
  191    V1 is V0 + 1,
  192    wordy(T, V1, V).
  193wordy([_|T], V0, V) :-
  194    V1 is V0 - 1,
  195    wordy(T, V1, V).
  196
  197
  198		 /*******************************
  199		 *           WORD LISTS		*
  200		 *******************************/
  201
  202read_word_lists :-
  203    wl_loaded,
  204    !.
  205read_word_lists :-
  206    with_mutex(content_filter, read_word_lists_sync).
  207
  208read_word_lists_sync :-
  209    wl_loaded,
  210    !.
  211read_word_lists_sync :-
  212    read_word_list(wordlist('words'), white),
  213    read_word_list(wordlist('whitelist.txt'), white),
  214    read_word_list(wordlist('bad-words.txt'), black),
  215    read_word_list(wordlist('bad-words-google.txt'), black),
  216    assertz(wl_loaded).
  217
  218:- multifile user:file_search_path/2.  219user:file_search_path(wordlist, '/usr/share/dict').
  220user:file_search_path(wordlist, Dir) :-
  221    source_file(read_word_lists, SrcFile),
  222    file_directory_name(SrcFile, Dir).
 read_word_list(+FileSpec, +List) is det
Read a list of words into a fact.
  228read_word_list(File, List) :-
  229    absolute_file_name(File, Path, [access(read), file_errors(fail)]),
  230    !,
  231    setup_call_cleanup(
  232        open(Path, read, In, [encoding(utf8)]),
  233        (   lazy_list(lazy_read_lines(In, [as(atom)]), Words),
  234            forall(member(Word, Words),
  235                   assert_word(List, Word))
  236        ),
  237        close(In)).
  238read_word_list(_, _).
  239
  240assert_word(black, Word0) :- downcase_atom(Word0, Word), assertz(black(Word)).
  241assert_word(white, Word0) :- downcase_atom(Word0, Word), assertz(white(Word))