/*  This file is part of ClioPatria.

    Author:
    HTTP:	http://e-culture.multimedian.nl/
    GITWEB:	http://gollem.science.uva.nl/git/ClioPatria.git
    GIT:	git://gollem.science.uva.nl/home/git/ClioPatria.git
    GIT:	http://gollem.science.uva.nl/home/git/ClioPatria.git
    Copyright:  2007, E-Culture/MultimediaN

    ClioPatria is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 2 of the License, or
    (at your option) any later version.

    ClioPatria is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with ClioPatria.  If not, see <http://www.gnu.org/licenses/>.
*/

:- module(fuzzy,
	  [ stem_literals/0,
	    fuz_find_literal/3,             % +Spec, +MatchType, -Literal
	    stem_find_literal/2,	% +Spec, -Literal
	    literal_distance/3		% +L1, +L2, -D
	  ]).

:- use_module(library(semweb/rdf_litindex)).
:- use_module(library(lists)).
:- use_module(library(find_resource)).
:- use_module(library(porter_stem)).

/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
This module finds literals of the RDF database based on stemming and
being flexible to ordering of tokens.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

%%	stem_literals
%
%	Create stemming database. As we   now use library(rdf_litindex),
%	which generates and maintains the database  on the fly we simply
%	do a call that forces initialization of the database.

stem_literals :-
	rdf_find_literals(stem(foobar), _).


%%	stem_find_literal(+TextOrTokens, -Literal)
%
%	Find literal values that contain all   stems  from Text. Text is
%	either  an  atom  or  a   list    of   tokens   as  produced  by
%	rdf_tokenize_literal/2.

stem_find_literal(Text, Literal) :-
	find_literal(Text, stem, Literal).


%%	fuz_find_literal(+TextOrTokens, +MatchType, -Literal)
%
%	Find literal values that contain all Text where the
%       matching is determined by MatchType.
%	Text is either an atom or a list of tokens as produced by
%	rdf_tokenize_literal/2.
%	MatchType is one of case, stem or prefix.


fuz_find_literal(Text, MatchType, Literal) :-
	tokens(Text, Tokens),
	list_to_and(Tokens, MatchType, And),
	rdf_find_literals(And, Literals),
	member(Literal, Literals).


%%	list_to_and(+TextOrTokens, +MatchType, -AndTerm)
%
%	AndTerm is a conjunction of terms constructed from
%	MatchType and Token.
%
%	For performance reasons a conjunction of prefix items
%	is not made. Instead only the last conjunct gets a prefix.

list_to_and([], _, true).
list_to_and([One], MatchType, Match) :- !,
	mkmatch(MatchType, One, Match).
list_to_and([H|T], prefix, and(Match, And)) :- !,
        mkmatch(stem, H, Match),
        list_to_and(T, prefix, And).
list_to_and([H|T], MatchType, and(Match, And)) :-
	mkmatch(MatchType, H, Match),
	list_to_and(T, MatchType, And).


mkmatch(_, Number, Number) :-
	number(Number), !.
mkmatch(stem, Token, stem(Token)).
mkmatch(prefix, Token, prefix(Token)).
mkmatch(case, Token, case(Token)).

		 /*******************************
		 *	       DISTANCE		*
		 *******************************/

goal_expansion(forall(C0, A0), \+ (C, \+ A)) :-
	expand_goal(C0, C),
	expand_goal(A0, A).


%%	literal_distance(+Lit1, +Lit2, -Distance)
%
%	Compute the distance between two   literals. Distance values are
%	>= 0, where 0  means  perfect   match.  Handicaps  are  given to
%	inserted, deleted, moved and modified tokens.  See the above URL
%	for a description of the best edit-sequence comparing strings.

literal_distance(L1, L2, Distance) :-
	tokens(L1, TL1),
	tokens(L2, TL2),
	length(TL1, N1),
	length(TL2, N2),
	abs(N1-N2) =< min(N1, N2),	% too much difference in length
	cheapest_edit_path(TL1, TL2, EditCost, _Path),
	Distance is EditCost/max(N1,N2).

tokens(Tokens, Tokens) :-
	is_list(Tokens), !.
tokens(Spec, Tokens) :-
	atom(Spec), !,
	rdf_tokenize_literal(Spec, Tokens).
tokens(Number, [Number]) :-
	number(Number).


%%	cheapest_edit_path(+List1, +List2, -Distance, -Path)
%
%	Compute the cheapest edit path. As edit operations are weighted,
%	this is not necessarily the shorted   one,  but the algorithm is
%	basically the same.
%
%	@see http://www.ling.ohio-state.edu/~cbrew/2002/winter/684.02/string-distance.html

cheapest_edit_path(Toks1, Toks2, Distance, Path) :-
%	DelCost   = 10,			% Delete token
%	InsCost	  = 8,			% Insert token
	MatchCost = 0,			% Matched token
	CaseCost  = 1,			% Different case
	DWIMCost  = 3,			% Spelling error
	StemCost  = 5,			% Tokens have same stem
	SubstCost = 10,			% Replaced token
	MovCost   = 2,			% Token is moved

	T1 =.. [string|Toks1],
	T2 =.. [string|Toks2],
	functor(T1, _, M),
	functor(T2, _, N),
	X is M + 1,
	Y is N + 1,
	M1 is M - 1,
	N1 is N - 1,
	new_array(X, Y, Array),
	nb_set_array(Array, 0, 0, c(0, [])),
	forall(between(0, M1, I),
	       (   get_array(Array, I, 0, c(V0, P0)),
		   I1 is I+1,
		   arg(I1, T1, D),
		   del_cost(D, DC),
		   V is V0 + DC,
		   nb_link_array(Array, I1, 0, c(V, [del(D)|P0]))
	       )),
	forall(between(0, N1, J),
	       (   get_array(Array, 0, J, c(V0, P0)),
		   J1 is J+1,
		   arg(J1, T2, I),
		   ins_cost(I, IC),
		   V is V0 + IC,
		   nb_link_array(Array, 0, J1, c(V, [ins(I)|P0]))
	       )),
	forall(between(0, M1, I),
	       forall(between(0, N1, J),
		      (   I1 is I + 1,
			  J1 is J + 1,
		          arg(I1, T1, V1),
			  arg(J1, T2, V2),
			  (   V1 == V2
			  ->  Subst = MatchCost
			  ;   downcase_atom(V1, L),
			      downcase_atom(V2, L)
			  ->  Subst = CaseCost
			  ;   dwim_match(V1, V2)
			  ->  Subst = DWIMCost
			  ;   same_stem(V1, V2)
			  ->  Subst = StemCost
			  ;   Subst = SubstCost
			  ),
			  get_array(Array, I,  J, c(C1, P1)),
			  get_array(Array, I1, J, c(C2, P2)),
			  get_array(Array, I, J1, c(C3, P3)),

			  SubstC is C1 + Subst,
			  (   memberchk(del(V2), P2)
			  ->  del_cost(V2, DC2),
			      InsC is C2 - DC2 + MovCost
			  ;   ins_cost(V2, InsCost),
			      InsC is C2 + InsCost
			  ),
			  (   memberchk(ins(V1), P3)
			  ->  ins_cost(V1, IC1),
			      DelC is C3 - IC1 + MovCost
			  ;   del_cost(V1, DelCost),
			      DelC is C3 + DelCost
			  ),

			  (   SubstC < InsC
			  ->  (   SubstC < DelC
			      ->  nb_link_array(Array, I1, J1,
						c(SubstC, [subst(V1, V2)|P1]))
			      ;   nb_link_array(Array, I1, J1,
						c(DelC, [del(V1)|P3]))
			      )
			  ;   (   DelC < InsC
			      ->  nb_link_array(Array, I1, J1,
						c(DelC, [del(V1)|P3]))
			      ;   nb_link_array(Array, I1, J1,
						c(InsC, [ins(V2)|P2]))
			      )
			  )))),
%	pp_array(Array),
	get_array(Array, M, N, c(Distance, Path0)),
	reverse(Path0, Path).

ins_cost((,), 1) :- !.
ins_cost(_, 8).

del_cost((,), 1) :- !.
del_cost(_, 10).

same_stem(T1, T2) :-
	atom(T1), atom(T2), !,
	porter_stem(T1, Stem),
	porter_stem(T2, Stem).


		 /*******************************
		 *     SIMPLE ARRAY PACKAGE	*
		 *******************************/

new_array(X, Y, Array) :-
	Size is X * Y + 3,
	functor(Array, array, Size),
	arg(1, Array, 2),		% dimensions
	arg(2, Array, X),		% columns
	arg(3, Array, Y).		% rows

get_array(Array, X, Y, Val) :-
	arg(2, Array, Cols),
	Pos is X + Y*Cols + 4,
	arg(Pos, Array, Val).

nb_set_array(Array, X, Y, Val) :-
	arg(2, Array, Cols),
	Pos is X + Y*Cols + 4,
	nb_setarg(Pos, Array, Val).

nb_link_array(Array, X, Y, Val) :-
	arg(2, Array, Cols),
	Pos is X + Y*Cols + 4,
	nb_linkarg(Pos, Array, Val).

		 /*******************************
		 *	     DEBUGGING		*
		 *******************************/

end_of_file.

pp_array(Array) :-
	functor(Array, array, _),
	arg(1, Array, 2),
	arg(2, Array, Cols),
	arg(3, Array, Rows),
	MaxX is Cols-1,
	MaxY is Rows-1,
	forall(between(0, MaxY, Y),
	       (   forall(between(0, MaxX, X),
			  (	  get_array(Array, X, Y, V),
				  format('~w ', [V])
			  )),
		   nl
	       )).