:- module(drug_normalise,
	  [drug_normalise/2
	  ]).

:- use_module(library(semweb/rdf_db)).
:- use_module(library(semweb/rdf_label)).
:- use_module(library(semweb/rdf_litindex)).
:- use_module(library(aspell)).


%!	drug_normalise(+DrugName, -Normalised)
%
%	Normalised is an atom holding the distinct tokens of DrugName, in
%	order of first occurrence, separated by single spaces.  The
%	tokens come from tokenize_atom/2, are rewritten by convert/2 and
%	are stripped of every token for which punct/1 holds.
%
%	Duplicates are removed as the last step.  Removing them before
%	convert/2 runs would drop a repeated 'MG' or 'M2' token, so the
%	three-token sequence 'MG','/','M2' that convert/2 looks for
%	would no longer be contiguous; it would also allow convert/2 to
%	reintroduce duplicates, because two distinct numbers can share
%	the same absolute value.
drug_normalise(DrugName, Normalised) :-
	tokenize_atom(DrugName, Tokens),
	convert(Tokens, Converted),
	exclude(punct, Converted, Words),
	list_to_set(Words, WordSet),
	atomic_list_concat(WordSet, ' ', Normalised).

%!	convert(+Tokens, -Converted)
%
%	Rewrite a token list, left to right:
%
%	  * the three consecutive tokens 'MG', '/', 'M2' are replaced
%	    by the single token 'MG';
%	  * a number is replaced by its absolute value;
%	  * any other token is copied unchanged.
convert([], []).
convert(['MG','/','M2'|Tokens], ['MG'|Converted]) :-
	!,
	convert(Tokens, Converted).
convert([Number|Tokens], [Magnitude|Converted]) :-
	number(Number),
	!,
	Magnitude is abs(Number),
	convert(Tokens, Converted).
convert([Token|Tokens], [Token|Converted]) :-
	convert(Tokens, Converted).

%!	punct(?Token)
%
%	True if Token is one of the punctuation atoms listed below.
%	drug_normalise/2 passes this predicate to exclude/3 to drop such
%	tokens.  Note that '/' is not listed here.
punct('!').
punct('.').
punct(',').
punct('-').
punct('_').
punct(')').
punct('(').
punct('*').


		 /*******************************
		 *               C		*
		 *******************************/
:- use_module(library(semweb/rdf_label)).


%!	normalise_drug_names
%
%	Try to link every drug name returned by aers_drugnames/1 to a
%	known drug and report the results on debug topic `drug`, which
%	this predicate enables.
%
%	Matching runs as a sequence of stages.  Each stage is handed
%	only the names that all earlier stages failed to link, so a name
%	is counted at most once.  For every stage the number of linked
%	names and its percentage of the total are printed, followed by
%	the overall total.  The linked pairs themselves are only counted
%	and then discarded.
normalise_drug_names :-
	debug(drug),
	aers_drugnames(DrugNames),
	length(DrugNames, DrugCount),
	debug(drug, 'drug count ~w', [DrugCount]),

	init_spell_check(Spellchecker),

	% MatchType-Format pairs, run in this order.  Each Format takes
	% the stage count and the stage percentage as its arguments.
	Stages =
	[ % match on drug name
	  exact(name)-'exact matches on name ~w ~1f%',
	  subword(name)-'subword matches on name ~w ~1f%',
	  % match on brand name
	  exact(brand)-'exact matches on brandname ~w ~1f%',
	  subword(brand)-'subword matches on brandname ~w ~1f%',
	  % spelling variations
	  exact(name, spelling(Spellchecker))-'exact matches with spelling correction on name ~w ~1f%',
	  % This stage used to be reported as "exact matches", the same
	  % text as the stage before it.
	  subword(name, spelling(Spellchecker))-'subword matches with spelling correction on name ~w ~1f%',
	  exact(brand, spelling(Spellchecker))-'exact matches with spelling correction on brand ~w ~1f%'
	],
	link_stages(Stages, DrugNames, DrugCount, 0, MatchCount),

	match_percentage(MatchCount, DrugCount, MatchPerc),
	debug(drug, 'total matched ~w ~1f%', [MatchCount, MatchPerc]).

%	link_stages(+Stages, +Names, +Total, +Count0, -Count)
%
%	Run each MatchType-Format stage over the names left unmatched by
%	the previous one, print its count and percentage of Total, and
%	add its count to the running sum Count0, giving Count.
link_stages([], _Names, _Total, Count, Count).
link_stages([MatchType-Format|Stages], Names, Total, Count0, Count) :-
	drug_link(Names, MatchType, Pairs, Unmatched),
	length(Pairs, StageCount),
	match_percentage(StageCount, Total, StagePerc),
	debug(drug, Format, [StageCount, StagePerc]),
	Count1 is Count0 + StageCount,
	link_stages(Stages, Unmatched, Total, Count1, Count).

%	match_percentage(+Count, +Total, -Perc)
%
%	Perc is Count as a percentage of Total.  An empty collection
%	(Total is 0) yields 0.0 instead of a division-by-zero error.
match_percentage(Count, Total, Perc) :-
	(   Total =:= 0
	->  Perc = 0.0
	;   Perc is Count/Total*100
	).


%!	aers_drugnames(-DrugNames)
%
%	DrugNames is the list of all solutions of drugname/1, sorted in
%	the standard order of terms with duplicates removed.  It is the
%	empty list when drugname/1 has no solutions.
aers_drugnames(DrugNames) :-
	findall(Name, drugname(Name), AllNames),
	sort(AllNames, DrugNames).

%!	drugname(-Name)
%
%	Name is the text, as extracted by literal_text/2, of the object
%	of some triple whose predicate is aers:drugname.  Enumerates one
%	solution per matching triple on backtracking.
drugname(Name) :-
	rdf(_Subject, aers:drugname, Object),
	literal_text(Object, Name).


%!	drug_link(+Names, +MatchType, -Pairs, -Unmatched)
%
%	Split Names according to drug_match/3 for MatchType.  Pairs gets
%	a Name-Drug pair for every name that matches, where Drug is the
%	first solution of drug_match/3 (further solutions are cut away).
%	Unmatched gets the names for which drug_match/3 fails.  Both
%	output lists keep the order of Names.
drug_link([], _, [], []).
drug_link([Name|Names], MatchType, [Name-Drug|Pairs], Unmatched) :-
	drug_match(MatchType, Name, Drug),
	!,
	drug_link(Names, MatchType, Pairs, Unmatched).
drug_link([Name|Names], MatchType, Pairs, [Name|Unmatched]) :-
	drug_link(Names, MatchType, Pairs, Unmatched).

%!	drug_match(+MatchType, +Name, -Drug)
%
%	True if Name can be linked to Drug using MatchType, one of:
%
%	  * exact(Type)
%	    Name as a whole is looked up through drug_literal/3.
%	  * subword(Type)
%	    Some token of Name, as produced by drug_tokenize/2, is
%	    looked up through drug_literal/3.
%	  * exact(Type, spelling(Checker))
%	    Some suggestion that aspell/3 returns for Name is looked up
%	    through drug_literal/3.
%	  * subword(Type, spelling(Checker))
%	    Some suggestion that aspell/3 returns for some token of
%	    Name is looked up through drug_literal/3.
%
%	Type is passed on unchanged to drug_literal/3.
drug_match(exact(Type), Name, Drug) :-
	drug_literal(Type, literal(exact(Name),_), Drug).
drug_match(subword(Type), Name, Drug) :-
	drug_tokenize(Name, Tokens),
	token_match(Tokens, Type, Drug).
drug_match(exact(Type, spelling(Checker)), Name, Drug) :-
	aspell(Checker, Name, Suggestions),
	member(Candidate, Suggestions),
	drug_literal(Type, literal(exact(Candidate),_), Drug).
drug_match(subword(Type, spelling(Checker)), Name, Drug) :-
	drug_tokenize(Name, Tokens),
	token_match_spelling(Tokens, Type, Checker, Drug).

%!	token_match(+Tokens, +Type, -Drug)
%
%	True if some member of Tokens is found by drug_literal/3 for
%	Type.  Tokens are tried from left to right; on backtracking all
%	drugs for one token are enumerated before the next token is
%	tried.  Fails for the empty list.
token_match(Tokens, Type, Drug) :-
	member(Token, Tokens),
	drug_literal(Type, literal(exact(Token),_), Drug).

%!	token_match_spelling(+Tokens, +Type, +Checker, -Drug)
%
%	True if, for some member of Tokens, one of the suggestions that
%	aspell/3 returns for it using Checker is found by drug_literal/3
%	for Type.  Tokens are tried from left to right and, per token,
%	suggestions in the order aspell/3 lists them.
token_match_spelling(Tokens, Type, Checker, Drug) :-
	member(Token, Tokens),
	aspell(Checker, Token, Suggestions),
	member(Candidate, Suggestions),
	drug_literal(Type, literal(exact(Candidate),_), Drug).

%!	drug_literal(+Type, +Literal, -Drug)
%
%	Look up Literal as the object of a triple and wrap the subject
%	in a term that records where it was found:
%
%	  * Type `name`: predicate ob:ingredientName, Drug is
%	    ob_ingredient(Subject).
%	  * Type `brand`: predicate ob:tradeName, Drug is
%	    ob_product(Subject).
%
%	Callers in this file pass Literal as literal(exact(Text),_).
%	NOTE(review): the `ob` prefix is not registered in the visible
%	part of this file; presumably that happens elsewhere - confirm.
%	The commented-out clauses are disabled DrugBank lookups.
drug_literal(name, Literal, ob_ingredient(Ingredient)) :-
	rdf(Ingredient, ob:ingredientName, Literal).
%drug_literal(name, Literal, db_drug(Drug)) :-
%	rdf(Drug, drugbank:'drugbank/genericName', Literal).
drug_literal(brand, Literal, ob_product(Product)) :-
	rdf(Product, ob:tradeName, Literal).
%drug_literal(brand, Literal, db_brand(Drug)) :-
%	rdf(Drug, drugbank:'drugbank/brandName', Literal).

%!	drug_tokenize(+Drug, -Tokens)
%
%	Tokens is the result of splitting Drug with tokenize_atom/2 and
%	then removing uninformative tokens with cleanup_tokens/2.
drug_tokenize(Drug, Tokens) :-
	tokenize_atom(Drug, RawTokens),
	cleanup_tokens(RawTokens, Tokens).

%!	cleanup_tokens(+Tokens, -Clean)
%
%	Clean is Tokens without the tokens whose lowercase form
%	satisfies bad_token/1.  Tokens that are kept retain their
%	original case; lowercasing is used for the test only.
cleanup_tokens([], []).
cleanup_tokens([Token|Tokens], Clean) :-
	downcase_atom(Token, Lower),
	(   bad_token(Lower)
	->  Clean = Kept
	;   Clean = [Token|Kept]
	),
	cleanup_tokens(Tokens, Kept).

%!	bad_token(?Token)
%
%	True if Token is one of the words listed below, a number, or an
%	atom that atom_number/2 can convert to a number.
%	cleanup_tokens/2 uses this test, on the lowercase form of each
%	token, to decide which tokens to discard.
bad_token(mg).
bad_token(tablet).
bad_token(unit).
bad_token(dose).
bad_token(inc).
bad_token(unknown).
bad_token(m2).
bad_token(solution).
bad_token(form).
bad_token(day).
bad_token(days).
bad_token(Token) :-
	(   number(Token)
	;   atom(Token),
	    atom_number(Token, _)
	),
	!.


%!	init_spell_check(-PID)
%
%	Start an aspell process through create_aspell_process/2 and
%	unify PID with the handle it returns.  The process is given two
%	options: `--dict-dir=` followed by the location that
%	absolute_file_name/2 resolves for dict(.), and
%	`--master=drugbank`.
init_spell_check(PID) :-
	absolute_file_name(dict(.), DictDir),
	atom_concat('--dict-dir=', DictDir, DictDirOption),
	AspellOptions = [DictDirOption, '--master=drugbank'],
	create_aspell_process(AspellOptions, PID).