virgil/commit
ADD drug spell checking, based on aspell
author | Michiel Hildebrand |
---|---|
Tue Mar 26 13:46:48 2013 +0100 | |
committer | Michiel Hildebrand |
Tue Mar 26 13:48:06 2013 +0100 | |
commit | 31c1d8bc578957bc4534fdc3e835e65ce9625c55 |
tree | 087d5709f99923598a618125fd11fab7b1c89123 |
parent | d3d76a42ee42a650fca15080533b21c201956ada |
Diff style: patch stat
diff --git a/lib/aspell.pl b/lib/aspell.pl new file mode 100644 index 0000000..a577fb0 --- /dev/null +++ b/lib/aspell.pl @@ -0,0 +1,139 @@ +:- module(aspell, + [create_aspell_process/2, + cleanup_aspell_process/1, + cleanup_aspell/0, + current_aspell_process/2, + aspell/3 + ]). + +:- use_module(library(process)). + +:- dynamic + aspell_process/3. + +%% create_aspell_process(+Options, -PID) +% +% Creates a new aspell process with id PID. +% A new process is created for each unique option-list. +% +% If a process already exists for the option-list it simply +% returns the PID of that process. + +create_aspell_process(Options, PID) :- + current_aspell_process(Options, PID), + !. +create_aspell_process(Options, PID) :- + process_create(path(aspell), [pipe|Options], + [ stdin(pipe(In)), + stdout(pipe(Out)), + process(PID) + ]), + assert(aspell_process(PID, in, In)), + assert(aspell_process(PID, out, Out)), + assert(aspell_process(PID, options, Options)). + +%% cleanup_aspell_process(+PID) +% +% Kills the process and removes the administration. + +cleanup_aspell_process(PID) :- + is_process(PID), + aspell_process(PID, in, In), + aspell_process(PID, out, Out), + !, + retractall(aspell_process(PID,_,_)), + catch(close(In), _, true), + catch(close(Out), _, true), + process_kill(PID). +cleanup_aspell_process(_PID). + +cleanup_aspell :- + forall(aspell_process(PID, in, _), + cleanup_aspell_process(PID)). + + +%% current_aspell_process(+Options, ?PID) +% +% True if a process exists with Options and PID. + +current_aspell_process(Options, PID) :- + aspell_process(PID, options, Options). + + +%% aspell(+ProcessID, +Word, -SuggestionList) +% +% SuggestionList contains all candidate corrections provided by +% aspell. +% +% Options is an option-list for aspell. + +aspell(PID, Word, Suggestions) :- + is_process(PID), + aspell_process(PID, in, In), + aspell_process(PID, out, Out), + catch(format(In, '~w~n', [Word]), _, + (cleanup_aspell_process(PID), + fail)), + catch(flush_output(In), _, + (cleanup_aspell_process(PID), + fail + )), + !, + read_suggestions(Out, Suggestions). +aspell(PID, _, _) :- + existence_error('aspell process', PID). + +read_suggestions(Out, Suggestions) :- + read_line_to_codes(Out, Line), + read_suggestions(Line, Out, Suggestions). + +read_suggestions([], _, []) :- !. +read_suggestions(Line, Out, Suggestions) :- + phrase(suggestion_line(Suggestions), Line), + !, + read_line_to_codes(Out,Next), + read_to_end(Next,Out). +read_suggestions(_Line,Out, Suggestions) :- + read_line_to_codes(Out, NextLine), + read_suggestions(NextLine, Out, Suggestions). + +suggestion_line(Suggestions) --> + "& ", + !, + codes_to_suggestions(Suggestions). + +codes_to_suggestions(Suggestions) --> + ": ", + !, + codes_suggest_list(Suggestions). +codes_to_suggestions(Suggestions) --> + [_], + !, + codes_to_suggestions(Suggestions). +codes_to_suggestions([]) --> + "". + +codes_suggest_list([H|T]) --> + suggestion_codes(C), + { C \== [], !, + atom_codes(H, C) + }, + codes_suggest_list(T). +codes_suggest_list([]) --> + "". + + +suggestion_codes([]) --> + ", ", + !. +suggestion_codes([H|T]) --> + [H], + !, + suggestion_codes(T). +suggestion_codes([]) --> + []. + +read_to_end([], _) :- !. +read_to_end(_, S) :- + read_line_to_codes(S, L), + read_to_end(L, S). diff --git a/lib/drug_spell_check.pl b/lib/drug_spell_check.pl new file mode 100644 index 0000000..65226a9 --- /dev/null +++ b/lib/drug_spell_check.pl @@ -0,0 +1,76 @@ +:- module(drug_spell_check, + [correct_drug_names/0 + ]). + + +:- use_module(library(aspell)). +:- use_module(library(semweb/rdf_label)). +:- use_module(library(semweb/rdf_litindex)). + +user:file_search_path(dict, '../dict'). + +correct_drug_names :- + absolute_file_name(dict(.), DictDir), + atom_concat('--dict-dir=',DictDir,DictOpt), + create_aspell_process([DictOpt, + '--master=drugbank'], + PID), + findall(Lit, rdf(_,aers:drugname,Lit), Drugs0), + sort(Drugs0, Drugs), + cleanup_lit(Drugs, Drugs1), + length(Drugs, UniqueCount), + length(Drugs1, CleanupCount), + debug(drugcorrect, '~w unique drug names', [UniqueCount]), + debug(drugcorrect, '~w after cleanup', [CleanupCount]), + spell_check(Drugs1, PID, Suggestions), + length(Suggestions, SuggestCount), + debug(drugcorrect, '~w corrected', [SuggestCount]), + maplist(assert_suggestion, Suggestions). + +cleanup_lit([], []). +cleanup_lit([Lit|T], [A-Lit|Rest]) :- + literal_text(Lit, H), + atom_length(H, Length), + Length > 2, + tokenize_atom(H, [A0]), + downcase_atom(A0, A), + !, + cleanup_lit(T, Rest). +cleanup_lit([_|T], Rest) :- + cleanup_lit(T, Rest). + +spell_check([], _, []). +spell_check([A-Lit|T], PID, [Lit-Suggestion|Rest]) :- + aspell(PID, A, Suggestions), + Suggestions = [Suggestion|_], % we only keep the first suggestion + !, + %debug(drugcorrect, '~w -> ~w', [A,Suggestion]), + spell_check(T, PID, Rest). +spell_check([_|T], PID, Rest) :- + spell_check(T, PID, Rest). + +assert_suggestion(Lit-Suggestion) :- + forall(rdf(R,aers:drugname,Lit), + rdf_assert(R,aers:drugname_corrected,literal(Suggestion))). + + + +in_drugbank(Q) :- + tokenize_atom(Q,DL), + member(Word,DL), + find_drug_by_name(Word, _). + +find_drug_by_name(Q, Drug) :- + rdf_find_literals(case(Q), Literals), + member(Lit, Literals), + drug_name(Lit, Drug). + +drug_name(L, Drug) :- + rdf(Drug,rdfs:label,literal(L)), + rdf(Drug,rdf:type,drugbank:'drugbank/drugs'), + !. +drug_name(L, Drug) :- + rdf(Drug,drugbank:'drugbank/synonym',literal(L)), + !. +drug_name(L, Drug) :- + rdf(Drug,drugbank:'drugbank/brandName',literal(L)).