virgil/commit

Experiments in drug name normalisation

author Michiel Hildebrand
Fri Jul 12 21:20:13 2013 +0200
committer Michiel Hildebrand
Fri Jul 12 21:20:13 2013 +0200
commit ec55a0559e7c185a742a3d597a7078da7d18b963
tree fe89de1a432408e2c7ee4715856b5095c20d4d11
parent cc3d014dcbfd846d10637f75839dd40355813a27
Diff style: patch stat
diff --git a/lib/drug_normalise.pl b/lib/drug_normalise.pl
index eb67c8e..0809aaa 100644
--- a/lib/drug_normalise.pl
+++ b/lib/drug_normalise.pl
@@ -2,6 +2,11 @@
 	  [drug_normalise/2
 	  ]).
 
+:- use_module(library(semweb/rdf_label)).
+:- use_module(library(semweb/rdf_litindex)).
+:- use_module(library(aspell)).
+
+
 drug_normalise(DrugName, Normalised) :-
 	tokenize_atom(DrugName, Tokens),
 	list_to_set(Tokens, TokenSet0),
@@ -29,3 +34,150 @@ punct('_').
 punct(')').
 punct('(').
 punct('*').
+
+
+
+
+
+		 /*******************************
+		 *	 DRUG NAME LINKING	*
+		 *******************************/
+
+
+%	normalise_drug_names
+%
+%	Run the drug name linking experiment. Collects the AERS drug
+%	names, then tries a sequence of increasingly permissive match
+%	stages. Each stage only sees the names that all earlier stages
+%	left unmatched. Per-stage counts and percentages (relative to
+%	the total number of drug names) are printed on the debug topic
+%	`drug`, which is enabled as a side effect.
+
+normalise_drug_names :-
+	debug(drug),
+	aers_drugnames(DrugNames),
+	length(DrugNames, DrugCount),
+	debug(drug, 'drug count ~w', [DrugCount]),
+
+	init_spell_check(Spellchecker),
+
+	% match on drug name
+	drug_link(DrugNames, exact(name), ExactPairs, Rest1),
+	report_matches('exact matches on name',
+		       ExactPairs, DrugCount, ExactCount),
+
+	drug_link(Rest1, subword(name), SubwordPairs, Rest2),
+	report_matches('subword matches on name',
+		       SubwordPairs, DrugCount, SubwordCount),
+
+	% match on brand name
+	drug_link(Rest2, exact(brand), ExactBrandPairs, Rest3),
+	report_matches('exact matches on brandname',
+		       ExactBrandPairs, DrugCount, ExactBrandCount),
+
+	drug_link(Rest3, subword(brand), SubwordBrandPairs, Rest4),
+	report_matches('subword matches on brandname',
+		       SubwordBrandPairs, DrugCount, SubwordBrandCount),
+
+	% spelling variations
+	drug_link(Rest4, exact(name, spelling(Spellchecker)),
+		  ExactSpellingPairs, Rest5),
+	report_matches('exact matches with spelling correction on name',
+		       ExactSpellingPairs, DrugCount, ExactSpellingCount),
+
+	% This stage was previously reported under the "exact" label,
+	% making the two spelling stages indistinguishable in the log.
+	drug_link(Rest5, subword(name, spelling(Spellchecker)),
+		  SubwordSpellingPairs, Rest6),
+	report_matches('subword matches with spelling correction on name',
+		       SubwordSpellingPairs, DrugCount, SubwordSpellingCount),
+
+	drug_link(Rest6, exact(brand, spelling(Spellchecker)),
+		  ExactBrandSpellingPairs, _Rest7),
+	report_matches('exact matches with spelling correction on brand',
+		       ExactBrandSpellingPairs, DrugCount,
+		       ExactBrandSpellingCount),
+
+	MatchCount is ExactCount + SubwordCount +
+		      ExactBrandCount + SubwordBrandCount +
+		      ExactSpellingCount + SubwordSpellingCount +
+		      ExactBrandSpellingCount,
+	match_percentage(MatchCount, DrugCount, MatchPerc),
+	debug(drug, 'total matched ~w ~1f%', [MatchCount, MatchPerc]).
+
+%	report_matches(+Label, +Pairs, +Total, -Count)
+%
+%	Count is the length of Pairs. Prints Label, Count and the
+%	percentage of Total that Count represents on debug topic `drug`.
+
+report_matches(Label, Pairs, Total, Count) :-
+	length(Pairs, Count),
+	match_percentage(Count, Total, Perc),
+	debug(drug, '~w ~w ~1f%', [Label, Count, Perc]).
+
+%	match_percentage(+Count, +Total, -Perc)
+%
+%	Perc is Count as a percentage of Total. An empty data set
+%	(Total = 0) yields 0.0 instead of raising a division-by-zero
+%	evaluation error.
+
+match_percentage(_, 0, 0.0) :- !.
+match_percentage(Count, Total, Perc) :-
+	Perc is Count/Total*100.
+
+
+%	aers_drugnames(-DrugNames)
+%
+%	DrugNames is the list of all solutions of drugname/1 in
+%	standard order with duplicates removed. It is the empty list
+%	when drugname/1 has no solutions.
+
+aers_drugnames(DrugNames) :-
+	findall(Name, drugname(Name), AllNames),
+	sort(AllNames, DrugNames).
+
+%	drugname(-Name)
+%
+%	Name is the text of a literal that is the object of some
+%	aers:drugname triple. Enumerates one Name per matching triple
+%	on backtracking, so the same text may be produced repeatedly.
+
+drugname(Name) :-
+	rdf(_Subject, aers:drugname, Literal),
+	literal_text(Literal, Name).
+
+
+%	drug_link(+Names, +MatchType, -Pairs, -Unmatched)
+%
+%	Split Names into those that drug_match/3 can link using
+%	MatchType and those it cannot. Pairs holds a Name-Drug pair for
+%	every linked name, keeping only the first match found for each;
+%	Unmatched holds the remaining names. Both lists preserve the
+%	order of Names.
+
+drug_link([], _, [], []).
+drug_link([Name|Names], MatchType, Pairs, Unmatched) :-
+	(   drug_match(MatchType, Name, Drug)
+	->  % commit to the first match for this name
+	    Pairs = [Name-Drug|Pairs1],
+	    Unmatched = Unmatched1
+	;   Pairs = Pairs1,
+	    Unmatched = [Name|Unmatched1]
+	),
+	drug_link(Names, MatchType, Pairs1, Unmatched1).
+
+%	drug_match(+MatchType, +Name, -Drug)
+%
+%	Drug is a resource term (as produced by drug_literal/3) that
+%	matches Name according to MatchType. Kind is passed through to
+%	drug_literal/3 and selects the property to match on.
+%
+%	  * exact(Kind)
+%	  Look up Name as a whole.
+%	  * subword(Kind)
+%	  Look up the individual cleaned tokens of Name.
+%	  * exact(Kind, spelling(Checker))
+%	  Look up each alternative that aspell/3 returns for Name.
+%	  * subword(Kind, spelling(Checker))
+%	  Look up each alternative that aspell/3 returns for the
+%	  individual cleaned tokens of Name.
+
+drug_match(exact(Kind), Name, Drug) :-
+	drug_literal(Kind, literal(exact(Name),_), Drug).
+drug_match(subword(Kind), Name, Drug) :-
+	drug_tokenize(Name, NameTokens),
+	token_match(NameTokens, Kind, Drug).
+drug_match(exact(Kind, spelling(Checker)), Name, Drug) :-
+	aspell(Checker, Name, Candidates),
+	member(Candidate, Candidates),
+	drug_literal(Kind, literal(exact(Candidate),_), Drug).
+drug_match(subword(Kind, spelling(Checker)), Name, Drug) :-
+	drug_tokenize(Name, NameTokens),
+	token_match_spelling(NameTokens, Kind, Checker, Drug).
+
+%	token_match(+Tokens, +Type, -Drug)
+%
+%	Drug is found by drug_literal/3 for some member of Tokens.
+%	Tokens are tried from left to right; fails if no token matches.
+
+token_match(Tokens, Type, Drug) :-
+	member(Token, Tokens),
+	drug_literal(Type, literal(exact(Token),_), Drug).
+
+%	token_match_spelling(+Tokens, +Type, +Checker, -Drug)
+%
+%	As token_match/3, but instead of looking up each token itself
+%	it looks up the alternatives that aspell/3 returns for the
+%	token. Tokens are tried from left to right, and for each token
+%	the alternatives are tried in the order aspell/3 lists them.
+
+token_match_spelling(Tokens, Type, Checker, Drug) :-
+	member(Token, Tokens),
+	aspell(Checker, Token, Candidates),
+	member(Candidate, Candidates),
+	drug_literal(Type, literal(exact(Candidate),_), Drug).
+
+%	drug_literal(?Kind, ?Pattern, -Drug)
+%
+%	Drug wraps a resource that has Pattern as the object of the
+%	property selected by Kind:
+%
+%	  * name
+%	  ob:ingredientName, giving ob_ingredient(Resource)
+%	  * brand
+%	  ob:tradeName, giving ob_product(Resource)
+%
+%	The DrugBank variants below are currently disabled.
+
+drug_literal(name, Pattern, ob_ingredient(Resource)) :-
+	rdf(Resource, ob:ingredientName, Pattern).
+%drug_literal(name, Literal, db_drug(Drug)) :-
+%	rdf(Drug, drugbank:'drugbank/genericName', Literal).
+drug_literal(brand, Pattern, ob_product(Resource)) :-
+	rdf(Resource, ob:tradeName, Pattern).
+%drug_literal(brand, Literal, db_brand(Drug)) :-
+%	rdf(Drug, drugbank:'drugbank/brandName', Literal).
+
+%	drug_tokenize(+Drug, -Tokens)
+%
+%	Tokens is the result of splitting Drug with tokenize_atom/2
+%	and then dropping the tokens rejected by cleanup_tokens/2.
+
+drug_tokenize(Drug, Tokens) :-
+	tokenize_atom(Drug, RawTokens),
+	cleanup_tokens(RawTokens, Tokens).
+
+%	cleanup_tokens(+Tokens, -Cleaned)
+%
+%	Cleaned is Tokens without the elements whose lowercase form
+%	satisfies bad_token/1. Retained tokens keep their original
+%	case and relative order.
+
+cleanup_tokens([], []).
+cleanup_tokens([Token|Tokens], Cleaned) :-
+	downcase_atom(Token, Lower),
+	(   bad_token(Lower)
+	->  Cleaned = Cleaned1
+	;   Cleaned = [Token|Cleaned1]
+	),
+	cleanup_tokens(Tokens, Cleaned1).
+
+%	bad_token(?Token)
+%
+%	True if Token should be ignored when matching drug names:
+%	one of the fixed words listed below, a number, or an atom
+%	whose text reads as a number.
+
+bad_token(mg).
+bad_token(tablet).
+bad_token(unit).
+bad_token(dose).
+bad_token(inc).
+bad_token(unknown).
+bad_token(m2).
+bad_token(solution).
+bad_token(form).
+bad_token(day).
+bad_token(days).
+bad_token(Token) :-
+	(   number(Token)
+	->  true
+	;   atom(Token),
+	    atom_number(Token, _)
+	),
+	!.
+
+
+
+%	init_spell_check(-PID)
+%
+%	Start an aspell process through create_aspell_process/2 and
+%	unify PID with its handle. The process is given the directory
+%	that the file search path alias `dict` resolves to as its
+%	--dict-dir and `drugbank` as its --master word list.
+
+init_spell_check(PID) :-
+	absolute_file_name(dict(.), DictDir),
+	format(atom(DictDirOption), '--dict-dir=~w', [DictDir]),
+	AspellOptions = [ DictDirOption,
+			  '--master=drugbank'
+			],
+	create_aspell_process(AspellOptions, PID).