virgil/commit
Experiments in drug name normalisation
author | Michiel Hildebrand |
---|---|
Fri Jul 12 21:20:13 2013 +0200 | |
committer | Michiel Hildebrand |
Fri Jul 12 21:20:13 2013 +0200 | |
commit | ec55a0559e7c185a742a3d597a7078da7d18b963 |
tree | fe89de1a432408e2c7ee4715856b5095c20d4d11 |
parent | cc3d014dcbfd846d10637f75839dd40355813a27 |
Diff style: patch stat
diff --git a/lib/drug_normalise.pl b/lib/drug_normalise.pl
index eb67c8e..0809aaa 100644
--- a/lib/drug_normalise.pl
+++ b/lib/drug_normalise.pl
@@ -2,6 +2,11 @@
 	  [drug_normalise/2
 	  ]).
 
+:- use_module(library(semweb/rdf_label)).
+:- use_module(library(semweb/rdf_litindex)).
+:- use_module(library(aspell)).
+
+
 drug_normalise(DrugName, Normalised) :-
     tokenize_atom(DrugName, Tokens),
     list_to_set(Tokens, TokenSet0),
@@ -29,3 +34,150 @@ punct('_').
 punct(')').
 punct('(').
 punct('*').
+
+
+
+
+
+		 /*******************************
+		 *	       C		*
+		 *******************************/
+
+
+%%	normalise_drug_names is det.
+%
+%	Run the linking experiment: collect all distinct AERS drug
+%	names and try to link them to a drug resource in a cascade of
+%	stages. Each stage only receives the names that every earlier
+%	stage failed to match. Per-stage and total match counts are
+%	reported on the debug topic =drug=, which this predicate
+%	enables as a side effect. The matched pairs themselves are
+%	only counted, not stored.
+normalise_drug_names :-
+    debug(drug),
+    aers_drugnames(DrugNames),
+    length(DrugNames, DrugCount),
+    debug(drug, 'drug count ~w', [DrugCount]),
+
+    init_spell_check(Spellchecker),
+    Spelling = spelling(Spellchecker),
+
+    % match on drug name
+    drug_link_stage(DrugNames, exact(name),
+                    'exact matches on name',
+                    DrugCount, ExactCount, Rest1),
+    drug_link_stage(Rest1, subword(name),
+                    'subword matches on name',
+                    DrugCount, SubwordCount, Rest2),
+
+    % match on brand name
+    drug_link_stage(Rest2, exact(brand),
+                    'exact matches on brandname',
+                    DrugCount, ExactBrandCount, Rest3),
+    drug_link_stage(Rest3, subword(brand),
+                    'subword matches on brandname',
+                    DrugCount, SubwordBrandCount, Rest4),
+
+    % spelling variations
+    drug_link_stage(Rest4, exact(name, Spelling),
+                    'exact matches with spelling correction on name',
+                    DrugCount, ExactSpellingCount, Rest5),
+    % This stage used to be logged with the "exact matches" label of
+    % the previous stage, which made the two lines indistinguishable.
+    drug_link_stage(Rest5, subword(name, Spelling),
+                    'subword matches with spelling correction on name',
+                    DrugCount, SubwordSpellingCount, Rest6),
+    drug_link_stage(Rest6, exact(brand, Spelling),
+                    'exact matches with spelling correction on brand',
+                    DrugCount, ExactBrandSpellingCount, _Rest7),
+
+    MatchCount is ExactCount+SubwordCount+ExactBrandCount+SubwordBrandCount+ExactSpellingCount+ExactBrandSpellingCount+SubwordSpellingCount,
+    match_percentage(MatchCount, DrugCount, MatchPerc),
+    debug(drug, 'total matched ~w ~1f%', [MatchCount, MatchPerc]).
+
+%%	drug_link_stage(+Names, +MatchType, +Label, +Total, -Count, -Rest) is det.
+%
+%	Run one stage of the cascade: link Names using MatchType,
+%	unify Count with the number of linked names and Rest with the
+%	names that were not linked, and log "Label Count Percentage%"
+%	on the debug topic =drug=. Total is the size of the complete
+%	name set and is the base of the percentage.
+drug_link_stage(Names, MatchType, Label, Total, Count, Rest) :-
+    drug_link(Names, MatchType, Pairs, Rest),
+    length(Pairs, Count),
+    match_percentage(Count, Total, Perc),
+    debug(drug, '~w ~w ~1f%', [Label, Count, Perc]).
+
+%%	match_percentage(+Count, +Total, -Perc) is det.
+%
+%	Perc is Count as a percentage of Total. An empty name set
+%	(Total is 0) yields 0.0 instead of raising an arithmetic
+%	error on the division.
+match_percentage(Count, Total, Perc) :-
+    (   Total =:= 0
+    ->  Perc = 0.0
+    ;   Perc is Count/Total*100
+    ).
+
+
+%%	aers_drugnames(-DrugNames) is det.
+%
+%	DrugNames is the sorted, duplicate-free list of all names
+%	produced by drugname/1.
+aers_drugnames(DrugNames) :-
+    findall(D,drugname(D),Ds),
+    sort(Ds, DrugNames).
+
+%%	drugname(-Name) is nondet.
+%
+%	Name is the text of the object of some aers:drugname triple.
+drugname(Name) :-
+    rdf(_,aers:drugname,D),
+    literal_text(D,Name).
+
+
+%%	drug_link(+Names, +MatchType, -Pairs, -Rest) is det.
+%
+%	Split Names into Pairs (Name-Drug, for each name that
+%	drug_match/3 can link using MatchType) and Rest (the names it
+%	cannot link). Only the first solution of drug_match/3 is kept
+%	for a name; both output lists preserve the order of Names.
+drug_link([], _, [], []).
+drug_link([H|T], MatchType, [H-Drug|Drugs], Rest) :-
+    drug_match(MatchType, H, Drug),
+    !,
+    drug_link(T, MatchType, Drugs, Rest).
+drug_link([H|T], MatchType, Drugs, [H|Rest]) :-
+    drug_link(T, MatchType, Drugs, Rest).
+
+%%	drug_match(+MatchType, +Name, -Drug) is nondet.
+%
+%	Drug is a resource term (see drug_literal/3) that matches
+%	Name. MatchType is one of:
+%
+%	  * exact(Type)
+%	  The whole Name matches a literal of Type (=name= or =brand=).
+%	  * subword(Type)
+%	  Some token of Name, after drug_tokenize/2, matches.
+%	  * exact(Type, spelling(Checker))
+%	  One of the suggestions aspell/3 returns for the whole Name
+%	  matches.
+%	  * subword(Type, spelling(Checker))
+%	  One of the suggestions aspell/3 returns for some token of
+%	  Name matches.
+%
+%	NOTE(review): literals are queried with literal(exact(Text),_);
+%	whether that is case-sensitive depends on rdf_db - confirm
+%	against the rdf/3 documentation.
+drug_match(exact(Type), Name, Drug) :-
+    drug_literal(Type, literal(exact(Name),_), Drug).
+drug_match(subword(Type), Name, Drug) :-
+    drug_tokenize(Name, Tokens),
+    token_match(Tokens, Type, Drug).
+drug_match(exact(Type, spelling(Checker)), Name, Drug) :-
+    aspell(Checker, Name, Suggestions),
+    member(Suggestion, Suggestions),
+    drug_match(exact(Type), Suggestion, Drug).
+drug_match(subword(Type, spelling(Checker)), Name, Drug) :-
+    drug_tokenize(Name, Tokens),
+    token_match_spelling(Tokens, Type, Checker, Drug).
+
+%%	token_match(+Tokens, +Type, -Drug) is nondet.
+%
+%	Drug is linked to a literal of Type that matches one of
+%	Tokens. Tokens are tried from left to right.
+token_match(Tokens, Type, Drug) :-
+    member(Token, Tokens),
+    drug_literal(Type, literal(exact(Token),_), Drug).
+
+%%	token_match_spelling(+Tokens, +Type, +Checker, -Drug) is nondet.
+%
+%	As token_match/3, but instead of the token itself each
+%	suggestion that aspell/3 returns for the token is looked up.
+token_match_spelling(Tokens, Type, Checker, Drug) :-
+    member(Token, Tokens),
+    aspell(Checker, Token, Suggestions),
+    member(Suggestion, Suggestions),
+    drug_literal(Type, literal(exact(Suggestion),_), Drug).
+
+%%	drug_literal(+Type, +Literal, -Drug) is nondet.
+%
+%	Look up Literal in the RDF store. For Type =name= the
+%	ob:ingredientName property is queried and Drug is
+%	ob_ingredient(Resource); for Type =brand= the ob:tradeName
+%	property is queried and Drug is ob_product(Resource). The
+%	DrugBank variants are disabled.
+drug_literal(name, Literal, ob_ingredient(Resource)) :-
+    rdf(Resource, ob:ingredientName, Literal).
+%drug_literal(name, Literal, db_drug(Drug)) :-
+%    rdf(Drug, drugbank:'drugbank/genericName', Literal).
+drug_literal(brand, Literal, ob_product(Resource)) :-
+    rdf(Resource, ob:tradeName, Literal).
+%drug_literal(brand, Literal, db_brand(Drug)) :-
+%    rdf(Drug, drugbank:'drugbank/brandName', Literal).
+
+%%	drug_tokenize(+Drug, -Tokens) is det.
+%
+%	Tokens are the tokens that tokenize_atom/2 produces for Drug,
+%	minus those rejected by bad_token/1.
+drug_tokenize(Drug, Tokens) :-
+    tokenize_atom(Drug, RawTokens),
+    cleanup_tokens(RawTokens, Tokens).
+
+%%	cleanup_tokens(+Tokens0, -Tokens) is det.
+%
+%	Remove every token whose lower-case form is a bad_token/1.
+%	Tokens that are kept retain their original case.
+cleanup_tokens([], []).
+cleanup_tokens([Token|Tokens], Clean) :-
+    downcase_atom(Token, Lower),
+    (   bad_token(Lower)
+    ->  Clean = Kept
+    ;   Clean = [Token|Kept]
+    ),
+    cleanup_tokens(Tokens, Kept).
+
+%%	bad_token(?Token) is nondet.
+%
+%	True if Token is one of the listed filler words, a number, or
+%	an atom that reads as a number.
+bad_token(mg).
+bad_token(tablet).
+bad_token(unit).
+bad_token(dose).
+bad_token(inc).
+bad_token(unknown).
+bad_token(m2).
+bad_token(solution).
+bad_token(form).
+bad_token(day).
+bad_token(days).
+bad_token(Token) :-
+    (   number(Token)
+    ;   atom(Token),
+        atom_number(Token, _)
+    ),
+    !.
+
+
+
+%%	init_spell_check(-PID) is det.
+%
+%	Start an aspell process through create_aspell_process/2 and
+%	unify PID with its handle. The process is given the directory
+%	that the dict(.) file alias resolves to as --dict-dir and
+%	=drugbank= as --master.
+init_spell_check(PID) :-
+    absolute_file_name(dict(.), DictDir),
+    atom_concat('--dict-dir=', DictDir, DictOpt),
+    Options = [DictOpt, '--master=drugbank'],
+    create_aspell_process(Options, PID).