/*  This file is part of ClioPatria.

    Author:
    HTTP:	http://e-culture.multimedian.nl/
    GITWEB:	http://gollem.science.uva.nl/git/ClioPatria.git
    GIT:	git://gollem.science.uva.nl/home/git/ClioPatria.git
    GIT:	http://gollem.science.uva.nl/home/git/ClioPatria.git
    Copyright:  2007, E-Culture/MultimediaN

    ClioPatria is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 2 of the License, or
    (at your option) any later version.

    ClioPatria is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with ClioPatria.  If not, see <http://www.gnu.org/licenses/>.
*/

:- module(kwd_search,
	  [ match_uri/7,	% +Atom, +Range, +Threshold, +Map, -URI, -Score, -Path
	    match_string/7,	% +Atom, +Range, +Threshold, +Map, -URI, -Score, -Path
	    search_string/6,	% +Atom, +Range, +Threshold, +Map, -Score, -Path
	    search_graph/7,	% +From, +Range, +Score0, +Threshold, +Map, -Score, -Path
	    find_literals/3	% +Search, -ScoredLiterals, +Options
	  ]).
:- use_module(library(lists)).
:- use_module(library(assoc)).
:- use_module(library(option)).
:- use_module(library(pairs)).
:- use_module(library(debug)).
:- use_module(library(semweb/rdf_db)).
:- use_module(library(semweb/rdfs)).
:- use_module(library(semweb/rdf_litindex)).
:- use_module(library(porter_stem)).
:- use_module(library(semweb/owl)).
:- use_module(rdf_search).
:- use_module(rdf_cluster).
:- use_module(graph_search).
:- use_module(fuzzy).

:- rdf_meta
	match_string(+, r, +, +, -),
	search_string(+, t, +, +, -, -).

%%	match_string(+Atom, -Matches, +Options)
%
%	Find matches for the literal string Atom, controlled by Options.
%	Options supported are:
%
%	    * filter(+URI, :Goal)
%	    Calls Goal to filter target objects.
%
%	    * score(+Path, -Score, :Goal)
%	    Calls Goal to rate Path.  Score must be unified with a
%	    float between 0.0 and 1.0.  If Goal fails, this path is
%	    discarded.
%
%	    * score_literal(+Literal, -Score, :Goal)
%	    Score the initial literal match.
%
%	    * threshold(Float)
%	    Float in the range 0.0..1.0, stopping the search if
%	    the score of a path drops below Float.
%
%	    * threshold(Max, Min, MinHits)
%	    Perform iterative lowering of the threshold, starting at
%	    Max and stopping at Min or after at least MinHits
%	    hits have been found.
%
%	Example:
%
%	==
%	match_string(gogh, Matches,
%		     [ filter(URI, rdfs_individual_of(URI, ulan:'Person')),
%		       score([rdfs:label|_], 1, true)
%		     ])
%	==
%
%	@param Matches is a list of hit(URI, Score, Path) terms.

%match_string(Word, Matches, Options) :-
%	select(threshold(Max, Min, MinHits), Options, Options1), !,
%	iterative_threshold(4, Max, Min, Threshold),
%	match_string(Word, Matches, [threshold(Threshold)|Options1]),
%	length(Matches, HitCount),
%	HitCount >= MinHits.
%match_string(Word, Matches, Options) :-
%	find_literals(Word, ScoredLiterals, Options),
%	search_graph(ScoredLiterals, Matches, Options).

%%	iterative_threshold(+Max, +Min, +Steps, -Threshold) is nondet.
%
%	Generate thresholds, starting at Max and going down to Min in
%	Steps equal relative steps.

%iterative_threshold(Max, Min, Steps, Threshold) :-
%	Steps1 is Steps - 1,
%	Itv = log(Max) - log(Min),
%	between(0, Steps1, Step),
%	Threshold is exp(log(Max) - (Itv*Step/Steps1)).
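% For illustration: if the commented-out iterative_threshold/4 above is
% re-enabled, it enumerates thresholds in equal *relative* (geometric)
% steps from Max down to Min.  Note that the commented-out caller passes
% the step count as the first argument, while the definition expects it
% as the third.  Assuming the definition's argument order and made-up
% values Max = 1.0, Min = 0.1 and Steps = 4:
%
%	==
%	?- forall(iterative_threshold(1.0, 0.1, 4, T), (write(T), nl)).
%	==
%
% prints approximately 1.0, 0.464, 0.215 and 0.1.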
%%	match_string(+Atom, +Range, +Threshold, +Map,
%%		     -URI, -Score, -Path) is nondet.
%
%	Search for URIs related to the string Atom.
%
%	@param Atom	 Search specification.  Will be tokenized.
%	@param Range	 Target objects.  Passed to owl_satisfies/2.
%	@param Threshold Search cut-off threshold (0..1)
%	@param Map	 List of Pred-Factor.
%	@param URI	 Matching URI
%	@param Score	 Score of the match (0..1)
%	@param Path	 List of Pred-Obj with the path from the matching
%			 literals to URI.  Note that this does *not*
%			 include URI.
%	@see	search_string/6
%	@tbd	Complete this description

match_string(Atom, Range, Threshold, Map, URI, Score, Path) :-
	findall(Score1-Path1,
		search_string(Atom, all_values_from(Range), Threshold, Map,
			      Score1, Path1),
		Pairs0),
	join_probabilities(Pairs0, Pairs1),
	keysort(Pairs1, Pairs2),
	reverse(Pairs2, Pairs3),
	member(Score-URI, Pairs3),
	findall(Score2-Path2,
		member(Score2-[URI|Path2], Pairs0),
		Paths),
	keysort(Paths, Paths0),
	reverse(Paths0, [_Score-Path|_Tail]).	% TBD: should this _Score not be the result?

match_uri(Atom, Range, Threshold, Map, URI, Score, Path) :-
	assoc_map(Map, Assoc),
	findall(Score1-Path1,
		search_graph(Atom, all_values_from(Range), 1, Threshold, Assoc,
			     Score1, Path1),
		Pairs0),
	join_probabilities(Pairs0, Pairs1),
	keysort(Pairs1, Pairs2),
	reverse(Pairs2, Pairs3),
	member(Score-URI, Pairs3),
	findall(Score2-Path2,
		member(Score2-[URI|Path2], Pairs0),
		Paths),
	keysort(Paths, Paths0),
	reverse(Paths0, [_Score-Path|_Tail]).

%%	search_string(+Atom, +Range, +Threshold, +Map, -Score, -Path)
%
%	Return all resources belonging to Range that either have a
%	literal attribute matching Atom or are related to a resource
%	with such an attribute.  Range is a description for
%	owl_satisfies/2; the most typical usage is
%	=|all_values_from(Class)|=.
%
%	Using one_of([Resource]) you can produce all paths leading to a
%	specific resource.
%
%	@tbd	This is an old interface used by the old basic search that
%		is now hacked to run on top of the new search
%		infrastructure.
%
%	@param Map	List of Pred-Factor.  The current strength is
%			multiplied by Factor for Pred.  If the strength
%			becomes lower than Threshold, the search is
%			stopped.
%	@param Path	List of [Target, R1-P1, R2-P2, ... literal(X)-PN]

search_string(Atom, Range, Threshold, _Map, Score, Path) :-
	graph_search(Atom, State,
		     [ filter([owl_satisfies(Range)]),
		       threshold(Threshold),
		       prune(true)
		     ]),
	rdf_search_property(State, graph(Graph)),
	rdf_search_property(State, targets(Targets)),
	member(Score-Target, Targets),
	rdf_cluster:search_path(Target, Graph, [_Target|IPath]),
	op_pairs(IPath, Path1),
	Path = [Target|Path1],
	debug(path, 'Path = ~p', [Path]).

%%	op_pairs(+List, -Pairs) is det.
%
%	Translate between the new path descriptions and the old ones.
%	We also used to delete owl:sameAs here, but this is not the
%	proper place, as we want to highlight the use of sameAs in the
%	interface.

op_pairs([], []).
%op_pairs([P,_O|T0], T) :-
%	rdf_equal(P, owl:sameAs), !,
%	op_pairs(T0, T).
op_pairs([P,O|T0], [O-P|T]) :-
	op_pairs(T0, T).
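% For illustration, with made-up predicate and object atoms: op_pairs/2
% turns the alternating Predicate,Object tail of a search path into the
% Object-Predicate pairs used in the Path argument of search_string/6:
%
%	==
%	?- op_pairs([p1, o1, p2, o2], Pairs).
%	Pairs = [o1-p1, o2-p2].
%	==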
		 /*******************************
		 *	 LITERAL HANDLING	*
		 *******************************/

%%	find_literals(+SearchFor, -ScoredLiterals:list(Score-Literal),
%%		      +Options) is det.
%
%	Find all literals in the database that are related to SearchFor
%	by at least one common stem.  Order these literals by the
%	quality of the match.  Options:
%
%	    * threshold(+Float)
%	    If present, ignore literals that match with a score
%	    lower than Float.
%
%	@tbd	Allow for more than stem and "compound term" matches.
%	@bug	Tokenization only deals with ISO Latin-1 text.

find_literals(Search, Literals, Options) :-
	option(threshold(Threshold), Options, 0.0),
	(   rdf_tokenize_literal(Search, Tokens)
	->  true
	;   Tokens = [Search]			% HACK
	),
	all_literals(Tokens, Literals0),
	sort(Literals0, Literals1),
	length(Literals1, NL1),
	debug(find_literals, '~D matches', [NL1]),
	sort_matches(Literals1, Tokens, Threshold, Literals).

all_literals(Tokens, Literals) :-
	compound_search_tokens(Tokens, Compounds, AllTokens),
	list_to_and(AllTokens, LitCond),
	rdf_find_literals(LitCond, Ls0),
	debug(find_literals_detail, 'Matching literals ~p', [Ls0]),
	(   Compounds == []
	->  Literals = Ls0
	;   tokenize_hits(Ls0, Tokenized),
	    filter_compounds(Compounds, Tokenized, KeyedLiterals),
	    pairs_values(KeyedLiterals, Literals)
	).

%%	compound_search_tokens(+Tokens, -Compounds:list(list), -AllTokens)
%
%	Extract all tokens and the compounds that must be matched.

compound_search_tokens([], [], []).
compound_search_tokens(['"'|Rest], [Compound|RC], Words) :-
	append(Compound, ['"'|T], Rest),
	append(Compound, RW, Words), !,
	compound_search_tokens(T, RC, RW).
compound_search_tokens([H0|T], RC, [H|RW]) :-
	mkmatch(H0, H),
	compound_search_tokens(T, RC, RW).

list_to_and([], true).
list_to_and([One], One) :- !.
list_to_and([H|T], and(H, And)) :- !,
	list_to_and(T, And).

mkmatch(Number, Number) :-
	number(Number), !.
mkmatch(Atom, stem(Atom)).

%%	tokenize_hits(+Literals:list(atom), -Keyed:list(list-atom)) is det.

tokenize_hits([], []).
tokenize_hits([H|T0], [L-H|T]) :-
	(   tokenize_atom(H, L)
	->  true
	;   L = [H]				% HACK
	),
	tokenize_hits(T0, T).

filter_compounds([], Literals, Literals).
filter_compounds([C0|C], L0, L) :-
	filter_compound(L0, C0, L1),
	filter_compounds(C, L1, L).

filter_compound([], _, []).
filter_compound([H|T0], Compound, [H|T]) :-
	H = TL-_,
	append(_, Rest, TL),
	same_tokens(Compound, Rest), !,
	filter_compound(T0, Compound, T).
filter_compound([_|T0], Compound, T) :-
	filter_compound(T0, Compound, T).

same_tokens([], _).
same_tokens([H|T0], [H|T]) :- !,
	same_tokens(T0, T).
same_tokens([-|T0], T) :- !,
	same_tokens(T0, T).
same_tokens(T0, [-|T]) :- !,
	same_tokens(T0, T).
same_tokens([H0|T0], [H1|T]) :-
	porter_stem(H0, Stem),
	porter_stem(H1, Stem),
	same_tokens(T0, T).

%%	sort_matches(+Matches, +Search, +Threshold, -Set)
%
%	Sort matches by score, best first.  Scoring is done using the
%	search target in Search.  All scores below Threshold are removed
%	from the result set.
%
%	@tbd	If Fuzzy is not =off=, the score is always 1 and Set is
%		the same as Matches.  Should be cleaned.

sort_matches(Set0, Search, Threshold, Set) :-
	debug(find_literals,
	      'Step1D: Score literals and remove the ones below threshold', []),
	tag_match_score(Set0, Search, Threshold, Tagged),
	debug(find_literals, 'Step1E: Sort list of literals by score', []),
	keysort(Tagged, Set1),
	reverse(Set1, Set),
	debug(find_literals_detail, 'Sorted literals ~p', [Set]).

%%	tag_match_score(+Set:list(atom), +Search, +Threshold,
%%			-Result:list(Score-Atom))
%
%	Score literals on how well they match Search.  Matches below
%	Threshold are deleted from the result.

tag_match_score([], _, _, []).
tag_match_score([H|T0], Search, Threshold, L) :-
	match_score(Search, H, S),
	%debug(search, 'Score ~w~n', [S]),
	(   S >= Threshold
	->  L = [S-H|T]
	;   L = T
	),
	tag_match_score(T0, Search, Threshold, T).
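% For illustration, assuming the tokenizer turned the query "van gogh"
% painter into the token list below: compound_search_tokens/3 (defined
% above) extracts the quoted compound and wraps the remaining words in
% stem/1, after which list_to_and/2 builds the condition handed to
% rdf_find_literals/2:
%
%	==
%	?- compound_search_tokens(['"', van, gogh, '"', painter], Cs, Ts),
%	   list_to_and(Ts, Cond).
%	Cs = [[van, gogh]],
%	Ts = [van, gogh, stem(painter)],
%	Cond = and(van, and(gogh, stem(painter))).
%	==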
%%	match_score(+Search, +Literal, -Score) is det.
%
%	Determine the quality of the match.  There are two cases: either
%	Literal is the same or about the same as Search, or Literal is a
%	long literal and Search only provides a few of the stems or
%	words in Literal.  In the first case we use the `minimal edit'
%	algorithm from fuzzy.pl.  In the latter case the score depends
%	on:
%
%	    * The percentage of tokens from Search found in Literal
%	    * Whether they are stem or perfect matches
%	    * The longest adjacent (stem-based) sequence that appears
%
%	Values are normalised to 0...1, where 1 is a perfect match.

match_score(Search, Literal, Score) :-
	tokens(Search, SearchTokens),
	tokens(Literal, LiteralTokens),
	(   SearchTokens == LiteralTokens
	->  Score = 1,
	    debug(find_literals,
		  'Step1D1 Rate: SearchTokens ~p, LiteralTokens ~p, perfect match Score 1.',
		  [SearchTokens, LiteralTokens])
	;   literal_distance(SearchTokens, LiteralTokens, LD)
	->  Score is 3/(3+LD)
	;   add_stems(SearchTokens, SearchStems),
	    add_stems(LiteralTokens, LitStems),
	    match_score2(SearchStems, LitStems, Score)
	).

tokens(In, Tokens) :-
	atom(In), !,
	rdf_tokenize_literal(In, Tokens).
tokens(In, In) :-
	assertion(is_list(In)).

match_score2(Search, Lit, Score) :-
	same_by_stem(Search, Lit), !,
	Score = 0.95.
match_score2(Search, Lit, Score) :-
	phrase(matches(Search, 1, Lit), Matches),
	match_count(Matches, Count),
	length(Lit, Len),
	Score is min(1, Count/Len).

same_by_stem([], []).
same_by_stem([_-S|T0], [_-S|T]) :-
	same_by_stem(T0, T).

%%	matches(+For:list(W-S), +Start, +In:list(W-S))//
%
%	Produces a list of m(Match, PosIn, PosFor), where Match is 1 for
%	exact matches and 0.8 if only the stem matches.

matches([], _, _) --> [].
matches([W-S|T], I, Lit) -->
	match(Lit, I, 1, W, S),
	{ I2 is I + 1 },
	matches(T, I2, Lit).

match([], _, _, _, _) --> [].
match([LW-S|T], I, N, W, S) --> !,
	(   { W == LW }
	->  [ m(1, I, N) ]
	;   [ m(0.8, I, N) ]
	),
	{ N2 is N + 1 },
	match(T, I, N2, W, S).
match([_|T], I, N, W, S) -->
	{ N2 is N + 1 },
	match(T, I, N2, W, S).

%%	match_count(+Matches, -Score) is det.
%
%	Sum the found matches.  The idea was to give extra credit to
%	contiguous chunks, but I cannot find that in the code.  A bug?

match_count([], 0).
match_count([m(S,I,N)|T], Score) :-
	sequence(I, N, T, T1, S, Score1),
	match_count(T1, Score2),
	Score is Score1 + Score2.

sequence(I, N, T0, T, S, Score) :-
	I2 is I + 1,
	N2 is N + 1,
	select(m(S2,I2,N2), T0, T1), !,
	Score1 is S+S2,
	sequence(I2, N2, T1, T, Score1, Score).
sequence(_, _, T, T, S, S).

%%	add_stems(+Words:list(atom), -Stemmed:list(Word-Stem)) is det.
%
%	@tbd	Deal with non-iso-latin-1 text

add_stems([], []).
add_stems([H|T0], [H-Stem|T]) :-
	(   porter_stem(H, Stem)
	->  true
	;   Stem = H
	),
	add_stems(T0, T).


		 /*******************************
		 *	       SEARCH		*
		 *******************************/

%%	assoc_map(+List, -Assoc)
%
%	Translate a list of P=F terms into an assoc, turning all
%	NS:Local terms into global predicates.

assoc_map(List, Assoc) :-
	empty_assoc(Assoc0),
	assoc_map(List, Assoc0, Assoc).

assoc_map([], Assoc, Assoc).
assoc_map([P=F|T], Assoc0, Assoc) :-
	rdf_global_id(P, G),
	put_assoc(G, Assoc0, F, Assoc1),
	assoc_map(T, Assoc1, Assoc).
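% For illustration, with made-up weight factors: assoc_map/2 expands
% NS:Local keys to full URIs and passes plain atoms such as `default'
% through unchanged, producing the Map consulted by update_score/5
% below:
%
%	==
%	?- assoc_map([rdfs:label=1.0, default=0.8], Map),
%	   get_assoc('http://www.w3.org/2000/01/rdf-schema#label', Map, F).
%	F = 1.0.
%	==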
%%	search_graph(+From, +Range, +Score0, +Threshold, +Map,
%%		     -Score, -Path)
%
%	Search the RDF graph starting at From, looking for an object O
%	for which owl_satisfies(O, Range) succeeds.  Score0 is the
%	initial score.  The search stops if the score drops below
%	Threshold.  Map is an assoc from predicate to a number between 0
%	and 1.  If a predicate is not in the map, the system looks for a
%	super-predicate that is in the map.  If that fails, it looks for
%	`default'.
%
%	Score is the final score of the path and Path is a list that
%	starts with the found target.  The other elements in the list
%	are pairs of the form Object-Predicate.  So, the path
%
%	    [T, R1-P1, R2-P2]
%
%	is created from a search R2 <-P2-> R1 <-P1-> T

search_graph(From, Range, Score0, Threshold, Map, Score, Path) :-
	empty_assoc(Done),
	search_graph(From, Range, Score0, Threshold, Map, Done, Score, Path0),
	reverse(Path0, Path).

search_graph(Object, Range, Score, _, _, _, Score, Path) :-
	owl_satisfies(Range, Object),
	Path = [Object].
search_graph(Object, Range, Score0, Threshold, Map, Done, Score,
	     [Object-P|Path]) :-
	(   rdf_has(O2, P, Object)
	;   atom(Object),
	    rdf_has(Object, IP, O2),
	    rdf_has(P, owl:inverseOf, IP)
	),
	\+ get_assoc(O2, Done, _),
	update_score(Score0, P, Map, Map1, Score1),
	Score1 >= Threshold,
	put_assoc(O2, Done, true, Done1),
	search_graph(O2, Range, Score1, Threshold, Map1, Done1, Score, Path).

update_score(Score0, P, Map, Map, Score) :-
	get_assoc(P, Map, F), !,
	Score is Score0*F.
update_score(Score0, P, Map0, Map, Score) :-
	(   rdfs_subproperty_of(P, Super),
	    get_assoc(Super, Map0, F)
	->  true
	;   get_assoc(default, Map0, F)
	),
	put_assoc(P, Map0, F, Map),
	Score is Score0*F.


		 /*******************************
		 *	JOIN PROBABILITIES	*
		 *******************************/

%%	join_probabilities(+Paths:list(Prob-Path),
%%			   -URIs:list(Prob-URI)) is det.
%
%	Given a list of pairs Probability-Path, create a list
%	Probability-URI, merging the probabilities of each path that
%	leads to the same final resource.

join_probabilities(P, Merged) :-
	key_prop_by_uri(P, Keyed),
	keysort(Keyed, Sorted),
	join_by_uri(Sorted, Merged).

key_prop_by_uri([], []).
key_prop_by_uri([P-[URI|_]|T0], [URI-P|T]) :-
	key_prop_by_uri(T0, T).

join_by_uri([], []).
join_by_uri([URI-P0|T0], [P-URI|T]) :-
	same_uri(URI, T0, Ps, T1),
	joined_probability([P0|Ps], P),
	join_by_uri(T1, T).

same_uri(URI, [URI-P|T0], [P|TP], T) :- !,
	same_uri(URI, T0, TP, T).
same_uri(_, T, [], T).

joined_probability(List, Joined) :-
	sum_p_no(List, Sum),
	Joined is 1 - Sum.

sum_p_no([], 1).
sum_p_no([H|T], P) :-
	sum_p_no(T, P0),
	P is P0 * (1-H).
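% For illustration: joined_probability/2 combines the scores of
% independent paths to the same resource noisy-or style, computing
% Joined = 1 - ((1-P1) * ... * (1-Pn)), so two paths that each score
% 0.5 yield a combined score of 0.75:
%
%	==
%	?- joined_probability([0.5, 0.5], P).
%	P = 0.75.
%	==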