/* This file is part of ClioPatria. Author: HTTP: http://e-culture.multimedian.nl/ GITWEB: http://gollem.science.uva.nl/git/ClioPatria.git GIT: git://gollem.science.uva.nl/home/git/ClioPatria.git GIT: http://gollem.science.uva.nl/home/git/ClioPatria.git Copyright: 2007, E-Culture/MultimediaN ClioPatria is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 2 of the License, or (at your option) any later version. ClioPatria is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with ClioPatria. If not, see . */ :- module(fuzzy, [ stem_literals/0, fuz_find_literal/3, % +Spec, +MatchType, -Literal stem_find_literal/2, % +Spec, -Literal literal_distance/3 % +L1, +L2, -D ]). :- use_module(library(semweb/rdf_litindex)). :- use_module(library(lists)). :- use_module(library(find_resource)). :- use_module(library(porter_stem)). /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - This module finds literals of the RDF database based on stemming and being flexible to ordering of tokens. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ %% stem_literals % % Create stemming database. As we now use library(rdf_litindex), % which generates and maintains the database on the fly we simply % do a call that forces initialization of the database. stem_literals :- rdf_find_literals(stem(foobar), _). %% stem_find_literal(+TextOrTokens, -Literal) % % Find literal values that contain all stems from Text. Text is % either an atom or a list of tokens as produced by % rdf_tokenize_literal/2. stem_find_literal(Text, Literal) :- find_literal(Text, stem, Literal). %% fuz_find_literal(+TextOrTokens, +MatchType, -Literal) % % Find literal values that contain all Text where the % matching is determined by MatchType. % Text is either an atom or a list of tokens as produced by % rdf_tokenize_literal/2. % MatchType is one of case, stem or prefix. fuz_find_literal(Text, MatchType, Literal) :- tokens(Text, Tokens), list_to_and(Tokens, MatchType, And), rdf_find_literals(And, Literals), member(Literal, Literals). %% list_to_and(+TextOrTokens, +MatchType, -AndTerm) % % AndTerm is a conjunction of terms constructed from % MatchType and Token. % % For performance reasons a conjunction of prefix items % is not made. Instead only the last conjunct gets a prefix. list_to_and([], _, true). list_to_and([One], MatchType, Match) :- !, mkmatch(MatchType, One, Match). list_to_and([H|T], prefix, and(Match, And)) :- !, mkmatch(stem, H, Match), list_to_and(T, prefix, And). list_to_and([H|T], MatchType, and(Match, And)) :- mkmatch(MatchType, H, Match), list_to_and(T, MatchType, And). mkmatch(_, Number, Number) :- number(Number), !. mkmatch(stem, Token, stem(Token)). mkmatch(prefix, Token, prefix(Token)). mkmatch(case, Token, case(Token)). /******************************* * DISTANCE * *******************************/ goal_expansion(forall(C0, A0), \+ (C, \+ A)) :- expand_goal(C0, C), expand_goal(A0, A). %% literal_distance(+Lit1, +Lit2, -Distance) % % Compute the distance between two literals. Distance values are % >= 0, where 0 means perfect match. Handicaps are given to % inserted, deleted, moved and modified tokens. See the above URL % for a description of the best edit-sequence comparing strings. literal_distance(L1, L2, Distance) :- tokens(L1, TL1), tokens(L2, TL2), length(TL1, N1), length(TL2, N2), abs(N1-N2) =< min(N1, N2), % too much difference in length cheapest_edit_path(TL1, TL2, EditCost, _Path), Distance is EditCost/max(N1,N2). tokens(Tokens, Tokens) :- is_list(Tokens), !. tokens(Spec, Tokens) :- atom(Spec), !, rdf_tokenize_literal(Spec, Tokens). tokens(Number, [Number]) :- number(Number). %% cheapest_edit_path(+List1, +List2, -Distance, -Path) % % Compute the cheapest edit path. As edit operations are weighted, % this is not necessarily the shorted one, but the algorithm is % basically the same. % % @see http://www.ling.ohio-state.edu/~cbrew/2002/winter/684.02/string-distance.html cheapest_edit_path(Toks1, Toks2, Distance, Path) :- % DelCost = 10, % Delete token % InsCost = 8, % Insert token MatchCost = 0, % Matched token CaseCost = 1, % Different case DWIMCost = 3, % Spelling error StemCost = 5, % Tokens have same stem SubstCost = 10, % Replaced token MovCost = 2, % Token is moved T1 =.. [string|Toks1], T2 =.. [string|Toks2], functor(T1, _, M), functor(T2, _, N), X is M + 1, Y is N + 1, M1 is M - 1, N1 is N - 1, new_array(X, Y, Array), nb_set_array(Array, 0, 0, c(0, [])), forall(between(0, M1, I), ( get_array(Array, I, 0, c(V0, P0)), I1 is I+1, arg(I1, T1, D), del_cost(D, DC), V is V0 + DC, nb_link_array(Array, I1, 0, c(V, [del(D)|P0])) )), forall(between(0, N1, J), ( get_array(Array, 0, J, c(V0, P0)), J1 is J+1, arg(J1, T2, I), ins_cost(I, IC), V is V0 + IC, nb_link_array(Array, 0, J1, c(V, [ins(I)|P0])) )), forall(between(0, M1, I), forall(between(0, N1, J), ( I1 is I + 1, J1 is J + 1, arg(I1, T1, V1), arg(J1, T2, V2), ( V1 == V2 -> Subst = MatchCost ; downcase_atom(V1, L), downcase_atom(V2, L) -> Subst = CaseCost ; dwim_match(V1, V2) -> Subst = DWIMCost ; same_stem(V1, V2) -> Subst = StemCost ; Subst = SubstCost ), get_array(Array, I, J, c(C1, P1)), get_array(Array, I1, J, c(C2, P2)), get_array(Array, I, J1, c(C3, P3)), SubstC is C1 + Subst, ( memberchk(del(V2), P2) -> del_cost(V2, DC2), InsC is C2 - DC2 + MovCost ; ins_cost(V2, InsCost), InsC is C2 + InsCost ), ( memberchk(ins(V1), P3) -> ins_cost(V1, IC1), DelC is C3 - IC1 + MovCost ; del_cost(V1, DelCost), DelC is C3 + DelCost ), ( SubstC < InsC -> ( SubstC < DelC -> nb_link_array(Array, I1, J1, c(SubstC, [subst(V1, V2)|P1])) ; nb_link_array(Array, I1, J1, c(DelC, [del(V1)|P3])) ) ; ( DelC < InsC -> nb_link_array(Array, I1, J1, c(DelC, [del(V1)|P3])) ; nb_link_array(Array, I1, J1, c(InsC, [ins(V2)|P2])) ) )))), % pp_array(Array), get_array(Array, M, N, c(Distance, Path0)), reverse(Path0, Path). ins_cost((,), 1) :- !. ins_cost(_, 8). del_cost((,), 1) :- !. del_cost(_, 10). same_stem(T1, T2) :- atom(T1), atom(T2), !, porter_stem(T1, Stem), porter_stem(T2, Stem). /******************************* * SIMPLE ARRAY PACKAGE * *******************************/ new_array(X, Y, Array) :- Size is X * Y + 3, functor(Array, array, Size), arg(1, Array, 2), % dimensions arg(2, Array, X), % columns arg(3, Array, Y). % rows get_array(Array, X, Y, Val) :- arg(2, Array, Cols), Pos is X + Y*Cols + 4, arg(Pos, Array, Val). nb_set_array(Array, X, Y, Val) :- arg(2, Array, Cols), Pos is X + Y*Cols + 4, nb_setarg(Pos, Array, Val). nb_link_array(Array, X, Y, Val) :- arg(2, Array, Cols), Pos is X + Y*Cols + 4, nb_linkarg(Pos, Array, Val). /******************************* * DEBUGGING * *******************************/ end_of_file. pp_array(Array) :- functor(Array, array, _), arg(1, Array, 2), arg(2, Array, Cols), arg(3, Array, Rows), MaxX is Cols-1, MaxY is Rows-1, forall(between(0, MaxY, Y), ( forall(between(0, MaxX, X), ( get_array(Array, X, Y, V), format('~w ', [V]) )), nl )).