virgil/commit

ADD drug spell checking, based on aspell

authorMichiel Hildebrand
Tue Mar 26 13:46:48 2013 +0100
committerMichiel Hildebrand
Tue Mar 26 13:48:06 2013 +0100
commit31c1d8bc578957bc4534fdc3e835e65ce9625c55
tree087d5709f99923598a618125fd11fab7b1c89123
parentd3d76a42ee42a650fca15080533b21c201956ada
Diff style: patch stat
diff --git a/lib/aspell.pl b/lib/aspell.pl
new file mode 100644
index 0000000..a577fb0
--- /dev/null
+++ b/lib/aspell.pl
@@ -0,0 +1,139 @@
+:- module(aspell,
+	  [create_aspell_process/2,
+	   cleanup_aspell_process/1,
+	   cleanup_aspell/0,
+	   current_aspell_process/2,
+	   aspell/3
+	  ]).
+
+:- use_module(library(process)).
+
+:- dynamic
+	aspell_process/3.
+
+%%	create_aspell_process(+Options, -PID)
+%
+%	Creates a new aspell process with id PID.
+%	A new process is created for each unique option-list.
+%
+%	If a process already exists for the option-list it simply
+%	returns the PID of that process.
+
+create_aspell_process(Options, PID) :-
+	current_aspell_process(Options, PID),
+	!.
+create_aspell_process(Options, PID) :-
+	process_create(path(aspell), [pipe|Options],
+		  [ stdin(pipe(In)),
+		    stdout(pipe(Out)),
+		    process(PID)
+		  ]),
+	assert(aspell_process(PID, in, In)),
+	assert(aspell_process(PID, out, Out)),
+	assert(aspell_process(PID, options, Options)).
+
+%%	cleanup_aspell_process(+PID)
+%
+%	Kills the process and removes the administration.
+
+cleanup_aspell_process(PID) :-
+	is_process(PID),
+	aspell_process(PID, in, In),
+	aspell_process(PID, out, Out),
+	!,
+	retractall(aspell_process(PID,_,_)),
+	catch(close(In), _, true),
+	catch(close(Out), _, true),
+	process_kill(PID).
+cleanup_aspell_process(_PID).
+
+cleanup_aspell :-
+	forall(aspell_process(PID, in, _),
+	       cleanup_aspell_process(PID)).
+
+
+%%	current_aspell_process(+Options, ?PID)
+%
+%	True if a process exists with Options and PID.
+
+current_aspell_process(Options, PID) :-
+	aspell_process(PID, options, Options).
+
+
+%%	aspell(+ProcessID, +Word, -SuggestionList)
+%
+%	SuggestionList contains all candidate corrections provided by
+%	aspell.
+%
+%       Options is an option-list for aspell.
+
+aspell(PID, Word, Suggestions) :-
+	is_process(PID),
+	aspell_process(PID, in, In),
+	aspell_process(PID, out, Out),
+	catch(format(In, '~w~n', [Word]), _,
+	      (cleanup_aspell_process(PID),
+	       fail)),
+	catch(flush_output(In), _,
+	      (cleanup_aspell_process(PID),
+	       fail
+	      )),
+	!,
+	read_suggestions(Out, Suggestions).
+aspell(PID, _, _) :-
+	existence_error('aspell process', PID).
+
+read_suggestions(Out, Suggestions) :-
+	read_line_to_codes(Out, Line),
+	read_suggestions(Line, Out, Suggestions).
+
+read_suggestions([], _, []) :- !.
+read_suggestions(Line, Out, Suggestions) :-
+	phrase(suggestion_line(Suggestions), Line),
+	!,
+	read_line_to_codes(Out,Next),
+        read_to_end(Next,Out).
+read_suggestions(_Line,Out, Suggestions) :-
+	read_line_to_codes(Out, NextLine),
+	read_suggestions(NextLine, Out, Suggestions).
+
+suggestion_line(Suggestions) -->
+	"& ",
+	!,
+	codes_to_suggestions(Suggestions).
+
+codes_to_suggestions(Suggestions) -->
+	": ",
+	!,
+	codes_suggest_list(Suggestions).
+codes_to_suggestions(Suggestions) -->
+	[_],
+	!,
+	codes_to_suggestions(Suggestions).
+codes_to_suggestions([]) -->
+	"".
+
+codes_suggest_list([H|T]) -->
+	suggestion_codes(C),
+	{ C \== [], !,
+	  atom_codes(H, C)
+	},
+	codes_suggest_list(T).
+codes_suggest_list([]) -->
+	"".
+
+
+suggestion_codes([]) -->
+	", ",
+	!.
+suggestion_codes([H|T]) -->
+	[H],
+	!,
+	suggestion_codes(T).
+suggestion_codes([]) -->
+	[].
+
+read_to_end([], _) :- !.
+read_to_end(_, S) :-
+	read_line_to_codes(S, L),
+	read_to_end(L, S).
diff --git a/lib/drug_spell_check.pl b/lib/drug_spell_check.pl
new file mode 100644
index 0000000..65226a9
--- /dev/null
+++ b/lib/drug_spell_check.pl
@@ -0,0 +1,76 @@
+:- module(drug_spell_check,
+	  [correct_drug_names/0
+	  ]).
+
+
+:- use_module(library(aspell)).
+:- use_module(library(semweb/rdf_label)).
+:- use_module(library(semweb/rdf_litindex)).
+
+user:file_search_path(dict, '../dict').
+
+correct_drug_names :-
+	absolute_file_name(dict(.), DictDir),
+	atom_concat('--dict-dir=',DictDir,DictOpt),
+	create_aspell_process([DictOpt,
+			       '--master=drugbank'],
+			      PID),
+	findall(Lit, rdf(_,aers:drugname,Lit), Drugs0),
+	sort(Drugs0, Drugs),
+	cleanup_lit(Drugs, Drugs1),
+	length(Drugs, UniqueCount),
+	length(Drugs1, CleanupCount),
+	debug(drugcorrect, '~w unique drug names', [UniqueCount]),
+	debug(drugcorrect, '~w after cleanup', [CleanupCount]),
+	spell_check(Drugs1, PID, Suggestions),
+	length(Suggestions, SuggestCount),
+	debug(drugcorrect, '~w corrected', [SuggestCount]),
+	maplist(assert_suggestion, Suggestions).
+
+cleanup_lit([], []).
+cleanup_lit([Lit|T], [A-Lit|Rest]) :-
+	literal_text(Lit, H),
+	atom_length(H, Length),
+	Length > 2,
+	tokenize_atom(H, [A0]),
+	downcase_atom(A0, A),
+	!,
+	cleanup_lit(T, Rest).
+cleanup_lit([_|T], Rest) :-
+	cleanup_lit(T, Rest).
+
+spell_check([], _, []).
+spell_check([A-Lit|T], PID, [Lit-Suggestion|Rest]) :-
+	aspell(PID, A, Suggestions),
+	Suggestions = [Suggestion|_], % we only keep the first suggestion
+	!,
+	%debug(drugcorrect, '~w -> ~w', [A,Suggestion]),
+	spell_check(T, PID, Rest).
+spell_check([_|T], PID, Rest) :-
+	spell_check(T, PID, Rest).
+
+assert_suggestion(Lit-Suggestion) :-
+	forall(rdf(R,aers:drugname,Lit),
+	       rdf_assert(R,aers:drugname_corrected,literal(Suggestion))).
+
+
+
+in_drugbank(Q) :-
+	tokenize_atom(Q,DL),
+	member(Word,DL),
+	find_drug_by_name(Word, _).
+
+find_drug_by_name(Q, Drug) :-
+	rdf_find_literals(case(Q), Literals),
+	member(Lit, Literals),
+	drug_name(Lit, Drug).
+
+drug_name(L, Drug) :-
+	rdf(Drug,rdfs:label,literal(L)),
+	rdf(Drug,rdf:type,drugbank:'drugbank/drugs'),
+	!.
+drug_name(L, Drug) :-
+	rdf(Drug,drugbank:'drugbank/synonym',literal(L)),
+	!.
+drug_name(L, Drug) :-
+	rdf(Drug,drugbank:'drugbank/brandName',literal(L)).