/* This file was part of ClioPatria. */ :- module(lit_distance, [ literal_distance/3 % +L1, +L2, -D ]). :- use_module(library(semweb/rdf_litindex)). /******************************* * DISTANCE * *******************************/ goal_expansion(forall(C0, A0), \+ (C, \+ A)) :- expand_goal(C0, C), expand_goal(A0, A). % % Compute the distance between two literals. Distance values are % >= 0, where 0 means perfect match. Handicaps are given to % inserted, deleted, moved and modified tokens. See the above URL % for a description of the best edit-sequence comparing strings. literal_distance(L1, L2, Distance) :- tokens(L1, TL1), tokens(L2, TL2), length(TL1, N1), length(TL2, N2), abs(N1-N2) =< min(N1, N2), % too much difference in length cheapest_edit_path(TL1, TL2, EditCost, _Path), Distance is EditCost/max(N1,N2). tokens(Tokens, Tokens) :- is_list(Tokens), !. tokens(Spec, Tokens) :- atom(Spec), !, rdf_tokenize_literal(Spec, Tokens). tokens(Number, [Number]) :- number(Number). %% cheapest_edit_path(+List1, +List2, -Distance, -Path) % % Compute the cheapest edit path. As edit operations are weighted, % this is not necessarily the shorted one, but the algorithm is % basically the same. % % @see http://www.ling.ohio-state.edu/~cbrew/2002/winter/684.02/string-distance.html cheapest_edit_path(Toks1, Toks2, Distance, Path) :- % DelCost = 10, % Delete token % InsCost = 8, % Insert token MatchCost = 0, % Matched token CaseCost = 1, % Different case DWIMCost = 3, % Spelling error StemCost = 5, % Tokens have same stem SubstCost = 10, % Replaced token MovCost = 2, % Token is moved T1 =.. [string|Toks1], T2 =.. [string|Toks2], functor(T1, _, M), functor(T2, _, N), X is M + 1, Y is N + 1, M1 is M - 1, N1 is N - 1, new_array(X, Y, Array), nb_set_array(Array, 0, 0, c(0, [])), forall(between(0, M1, I), ( get_array(Array, I, 0, c(V0, P0)), I1 is I+1, arg(I1, T1, D), del_cost(D, DC), V is V0 + DC, nb_link_array(Array, I1, 0, c(V, [del(D)|P0])) )), forall(between(0, N1, J), ( get_array(Array, 0, J, c(V0, P0)), J1 is J+1, arg(J1, T2, I), ins_cost(I, IC), V is V0 + IC, nb_link_array(Array, 0, J1, c(V, [ins(I)|P0])) )), forall(between(0, M1, I), forall(between(0, N1, J), ( I1 is I + 1, J1 is J + 1, arg(I1, T1, V1), arg(J1, T2, V2), ( V1 == V2 -> Subst = MatchCost ; downcase_atom(V1, L), downcase_atom(V2, L) -> Subst = CaseCost ; dwim_match(V1, V2) -> Subst = DWIMCost ; same_stem(V1, V2) -> Subst = StemCost ; Subst = SubstCost ), get_array(Array, I, J, c(C1, P1)), get_array(Array, I1, J, c(C2, P2)), get_array(Array, I, J1, c(C3, P3)), SubstC is C1 + Subst, ( memberchk(del(V2), P2) -> del_cost(V2, DC2), InsC is C2 - DC2 + MovCost ; ins_cost(V2, InsCost), InsC is C2 + InsCost ), ( memberchk(ins(V1), P3) -> ins_cost(V1, IC1), DelC is C3 - IC1 + MovCost ; del_cost(V1, DelCost), DelC is C3 + DelCost ), ( SubstC < InsC -> ( SubstC < DelC -> nb_link_array(Array, I1, J1, c(SubstC, [subst(V1, V2)|P1])) ; nb_link_array(Array, I1, J1, c(DelC, [del(V1)|P3])) ) ; ( DelC < InsC -> nb_link_array(Array, I1, J1, c(DelC, [del(V1)|P3])) ; nb_link_array(Array, I1, J1, c(InsC, [ins(V2)|P2])) ) )))), % pp_array(Array), get_array(Array, M, N, c(Distance, Path0)), reverse(Path0, Path). ins_cost((','), 1) :- !. ins_cost(_, 8). del_cost((','), 1) :- !. del_cost(_, 10). same_stem(T1, T2) :- atom(T1), atom(T2), !, porter_stem(T1, Stem), porter_stem(T2, Stem). /******************************* * SIMPLE ARRAY PACKAGE * *******************************/ new_array(X, Y, Array) :- Size is X * Y + 3, functor(Array, array, Size), arg(1, Array, 2), % dimensions arg(2, Array, X), % columns arg(3, Array, Y). % rows get_array(Array, X, Y, Val) :- arg(2, Array, Cols), Pos is X + Y*Cols + 4, arg(Pos, Array, Val). nb_set_array(Array, X, Y, Val) :- arg(2, Array, Cols), Pos is X + Y*Cols + 4, nb_setarg(Pos, Array, Val). nb_link_array(Array, X, Y, Val) :- arg(2, Array, Cols), Pos is X + Y*Cols + 4, nb_linkarg(Pos, Array, Val). /******************************* * DEBUGGING * *******************************/ end_of_file. pp_array(Array) :- functor(Array, array, _), arg(1, Array, 2), arg(2, Array, Cols), arg(3, Array, Rows), MaxX is Cols-1, MaxY is Rows-1, forall(between(0, MaxY, Y), ( forall(between(0, MaxX, X), ( get_array(Array, X, Y, V), format('~w ', [V]) )), nl )).