:- module(drug_normalise,
          [ drug_normalise/2
          ]).
:- use_module(library(lists)).
:- use_module(library(apply)).
:- use_module(library(debug)).
:- use_module(library(porter_stem)).
:- use_module(library(semweb/rdf_db)).
:- use_module(library(semweb/rdf_label)).
:- use_module(library(semweb/rdf_litindex)).
:- use_module(library(aspell)).

/** <module> Drug name normalisation and linking

Two loosely related facilities:

  * drug_normalise/2 rewrites a free-text drug name into a canonical
    space-separated token string.
  * normalise_drug_names/0 (not exported) tries to link every
    `aers:drugname` literal in the RDF store to an `ob:` ingredient or
    product and reports match statistics on the debug topic `drug`.
*/

                 /*******************************
                 *         NORMALISATION        *
                 *******************************/

%!  drug_normalise(+DrugName, -Normalised) is det.
%
%   Tokenise DrugName, drop duplicate tokens, rewrite tokens with
%   convert/2, remove the punctuation tokens listed by punct/1 and
%   join what is left with single spaces.
%
%   NOTE(review): duplicates are removed _before_ convert/2 runs, so a
%   repeated 'MG' can break up an 'MG / M2' sequence and abs/1 can
%   re-introduce duplicate numbers.  Kept as is because callers may
%   depend on the current output; confirm whether this is intended.

drug_normalise(DrugName, Normalised) :-
    tokenize_atom(DrugName, Tokens),
    list_to_set(Tokens, TokenSet0),
    convert(TokenSet0, TokenSet1),
    exclude(punct, TokenSet1, TokenSet),
    atomic_list_concat(TokenSet, ' ', Normalised).

%!  convert(+Tokens0, -Tokens) is det.
%
%   Replace every number by its absolute value and collapse the token
%   sequence 'MG', '/', 'M2' into the single token 'MG'.  All other
%   tokens are copied unchanged.

convert([], []).
convert([N0|T], [N|Rest]) :-
    number(N0),
    !,
    N is abs(N0),
    convert(T, Rest).
convert(['MG', '/', 'M2'|T], ['MG'|Rest]) :-
    !,
    convert(T, Rest).
convert([H|T], [H|Rest]) :-
    convert(T, Rest).

%!  punct(?Token) is nondet.
%
%   Punctuation tokens that drug_normalise/2 removes.

punct('!').
punct('.').
punct(',').
punct('-').
punct('_').
punct(')').
punct('(').
punct('*').


                 /*******************************
                 *            LINKING           *
                 *******************************/

%!  normalise_drug_names is det.
%
%   Run all linking stages over the drug names found in the RDF store
%   and print per-stage and total match statistics on the debug topic
%   `drug`.  Each stage only sees the names that earlier stages could
%   not link.  The linked pairs themselves are discarded; only the
%   counts are reported.
%
%   Side effects: enables the debug topic `drug` and starts a spell
%   checker process through init_spell_check/1.

normalise_drug_names :-
    debug(drug),
    aers_drugnames(DrugNames),
    length(DrugNames, DrugCount),
    debug(drug, 'drug count ~w', [DrugCount]),
    init_spell_check(Spellchecker),
    link_stages(Spellchecker, Stages),
    foldl(run_link_stage(DrugCount), Stages,
          DrugNames-0, _Unmatched-MatchCount),
    percentage(MatchCount, DrugCount, MatchPerc),
    debug(drug, 'total matched ~w ~1f%', [MatchCount, MatchPerc]).

%!  link_stages(+Spellchecker, -Stages) is det.
%
%   Stages is the ordered list of stage(MatchType, Label) terms
%   executed by normalise_drug_names/0.  MatchType is passed to
%   drug_link/4; Label is the text used in the statistics message.

link_stages(Checker,
            [ % match on drug name
              stage(exact(name),
                    'exact matches on name'),
              stage(subword(name),
                    'subword matches on name'),
              % match on brand name
              stage(exact(brand),
                    'exact matches on brandname'),
              stage(subword(brand),
                    'subword matches on brandname'),
              % spelling variations
              stage(exact(name, spelling(Checker)),
                    'exact matches with spelling correction on name'),
              stage(subword(name, spelling(Checker)),
                    'subword matches with spelling correction on name'),
              stage(exact(brand, spelling(Checker)),
                    'exact matches with spelling correction on brand')
            ]).

%!  run_link_stage(+DrugCount, +Stage, +State0, -State) is det.
%
%   Execute one stage.  State is a pair Names-Matched holding the names
%   that are still unlinked and the number of names linked so far.
%   DrugCount is the total number of drug names and is only used to
%   express the stage result as a percentage.

run_link_stage(DrugCount, stage(MatchType, Label),
               Names0-Matched0, Names-Matched) :-
    drug_link(Names0, MatchType, Pairs, Names),
    length(Pairs, Count),
    percentage(Count, DrugCount, Perc),
    debug(drug, '~w ~w ~1f%', [Label, Count, Perc]),
    Matched is Matched0 + Count.

%!  percentage(+Count, +Total, -Perc) is det.
%
%   Perc is Count as a percentage of Total, always a float.  An empty
%   Total yields 0.0 rather than an arithmetic evaluation error, so
%   running against a store without drug names still produces a report.

percentage(_, 0, 0.0) :-
    !.
percentage(Count, Total, Perc) :-
    Perc is 100.0 * Count / Total.

%!  aers_drugnames(-DrugNames) is det.
%
%   DrugNames is the sorted, duplicate-free list of all drug names
%   produced by drugname/1.

aers_drugnames(DrugNames) :-
    findall(D, drugname(D), Ds),
    sort(Ds, DrugNames).

%!  drugname(-Name) is nondet.
%
%   Name is the text of a literal that is the object of an
%   `aers:drugname` triple.

drugname(Name) :-
    rdf(_, aers:drugname, D),
    literal_text(D, Name).

%!  drug_link(+Names, +MatchType, -Pairs, -Rest) is det.
%
%   Split Names into the ones that drug_match/3 can link using
%   MatchType and the ones it cannot.  Pairs holds Name-Drug for the
%   first match of each linked name; Rest holds the unlinked names.
%   Both lists preserve the order of Names.

drug_link([], _, [], []).
drug_link([Name|Names], MatchType, Pairs, Rest) :-
    (   drug_match(MatchType, Name, Drug)      % commit to first match
    ->  Pairs = [Name-Drug|Pairs1],
        Rest = Rest1
    ;   Pairs = Pairs1,
        Rest = [Name|Rest1]
    ),
    drug_link(Names, MatchType, Pairs1, Rest1).

%!  drug_match(+MatchType, +Name, -Drug) is nondet.
%
%   True when Name can be linked to Drug.  MatchType is one of
%
%     * exact(Type)
%       Name as a whole matches a literal found by drug_literal/3.
%     * subword(Type)
%       One of the tokens of Name (see drug_tokenize/2) matches.
%     * exact(Type, spelling(Checker))
%       One of the suggestions aspell/3 returns for Name matches.
%     * subword(Type, spelling(Checker))
%       One of the suggestions aspell/3 returns for a token of Name
%       matches.
%
%   Type is `name` or `brand`, as accepted by drug_literal/3.

drug_match(exact(Type), Name, Drug) :-
    drug_literal(Type, literal(exact(Name), _), Drug).
drug_match(subword(Type), Name, Drug) :-
    drug_tokenize(Name, Tokens),
    token_match(Tokens, Type, Drug).
drug_match(exact(Type, spelling(Checker)), Name, Drug) :-
    aspell(Checker, Name, Suggestions),
    member(Suggestion, Suggestions),
    drug_match(exact(Type), Suggestion, Drug).
drug_match(subword(Type, spelling(Checker)), Name, Drug) :-
    drug_tokenize(Name, Tokens),
    token_match_spelling(Tokens, Type, Checker, Drug).

%!  token_match(+Tokens, +Type, -Drug) is nondet.
%
%   True when some member of Tokens matches Drug exactly.  Tokens are
%   tried from left to right.

token_match(Tokens, Type, Drug) :-
    member(Token, Tokens),
    drug_match(exact(Type), Token, Drug).

%!  token_match_spelling(+Tokens, +Type, +Checker, -Drug) is nondet.
%
%   As token_match/3, but matches the spelling suggestions that
%   Checker gives for each token instead of the token itself.

token_match_spelling(Tokens, Type, Checker, Drug) :-
    member(Token, Tokens),
    drug_match(exact(Type, spelling(Checker)), Token, Drug).

%!  drug_literal(+Type, +Literal, -Drug) is nondet.
%
%   Look up Literal in the RDF store.  For Type `name` Drug is
%   ob_ingredient(Resource) where Resource has Literal as its
%   `ob:ingredientName`; for Type `brand` Drug is ob_product(Resource)
%   where Resource has Literal as its `ob:tradeName`.

drug_literal(name, Literal, ob_ingredient(Ingredient)) :-
    rdf(Ingredient, ob:ingredientName, Literal).
%drug_literal(name, Literal, db_drug(Drug)) :-
%    rdf(Drug, drugbank:'drugbank/genericName', Literal).
drug_literal(brand, Literal, ob_product(Product)) :-
    rdf(Product, ob:tradeName, Literal).
%drug_literal(brand, Literal, db_brand(Drug)) :-
%    rdf(Drug, drugbank:'drugbank/brandName', Literal).

%!  drug_tokenize(+Drug, -Tokens) is det.
%
%   Tokens are the tokens of Drug that are not rejected by
%   bad_token/1.

drug_tokenize(Drug, Tokens) :-
    tokenize_atom(Drug, Tokens0),
    cleanup_tokens(Tokens0, Tokens).

%!  cleanup_tokens(+Tokens0, -Tokens) is det.
%
%   Remove every token whose lowercase form satisfies bad_token/1.
%   Tokens that are kept retain their original case.

cleanup_tokens([], []).
cleanup_tokens([H0|T], Rest) :-
    downcase_atom(H0, H),
    bad_token(H),
    !,
    cleanup_tokens(T, Rest).
cleanup_tokens([H|T], [H|Rest]) :-
    cleanup_tokens(T, Rest).

%!  bad_token(+Token) is semidet.
%
%   True for tokens that must not be used for subword matching: a
%   fixed list of unit and dosage words, numbers, and atoms that
%   read as a number.

bad_token(mg).
bad_token(tablet).
bad_token(unit).
bad_token(dose).
bad_token(inc).
bad_token(unknown).
bad_token(m2).
bad_token(solution).
bad_token(form).
bad_token(day).
bad_token(days).
bad_token(A) :-
    number(A),
    !.
bad_token(A) :-
    atom(A),
    atom_number(A, _),
    !.

%!  init_spell_check(-PID) is det.
%
%   Start an aspell process through create_aspell_process/2, passing
%   the directory found under the `dict` file search path as
%   `--dict-dir` and `drugbank` as the master dictionary.  PID is the
%   handle returned by create_aspell_process/2.

init_spell_check(PID) :-
    absolute_file_name(dict(.), DictDir),
    atom_concat('--dict-dir=', DictDir, DictOpt),
    create_aspell_process([DictOpt, '--master=drugbank'], PID).