:- module(find_resource, [ find_literal/3, % +Text, +MatchType, -Literal find_resource_by_name/3 % +Name, -Hits, +Options ]). :- use_module(library(semweb/rdf_db)). :- use_module(library(semweb/rdfs)). :- use_module(library(semweb/rdf_litindex)). :- use_module(library(semweb/rdf_label)). % force indexing :- rdf_find_literals(zzz,_). %% find_resource_by_name(+Name:atom, %% -Hit:hit(Distance:nonneg,Hit:atom,Prop:atom,Label:atom), %% +Options) is nondet. % % Find a resource based on a Name. % % Options % * Tokenize % Boolean, defaults to true. % % * Match % When Tokenize == true, Match should be one of case, stem, % prefix. Method to match the Name against the literal index. % Default is case. See rdf_find_literals/2 for details. % % When Tokenize == false, Match should be one of the % literal search methods supported by rdf/3, including prefix, % plain, and exact, but _not_ stem or case. % % * Distance % Boolean, compute literal distance if true. Default is true. The % distance is based on two figures: % * Mismatch of the label % * Handicap of the attribute % % * Attribute % List of the form Resource-Distance. Smaller % distance is better, so rdfs:label typically is 0. % If multiple attributes match, we take the best. find_resource_by_name(Name, Hits, Options) :- rdf_equal(rdfs:label, P), option(match(Match), Options, case), option(distance(Distance), Options, true), option(attributes(Attributes), Options, [P-0]), option(tokenize(Tokenize), Options, true), catch(findall(D-Label, find_literal_distance(Distance, Tokenize, Name, Match, Label, D), Pairs), no_stem(_), fail), findall(Hit, uri_with_label_in(Pairs, Attributes, Hit), Hits1), sort(Hits1, Hits2), % sort by URI remove_dup_uris(Hits2, Hits3), % take lowest on URI sort(Hits3, Hits). % sort by distance remove_dup_uris([], []). remove_dup_uris([hit(URI,D,P,L)|T0], [hit(D,URI,P,L)|T]) :- remove_same_uri(URI, T0, T1), remove_dup_uris(T1, T). remove_same_uri(URI, [hit(URI,_,_,_)|T0], T) :- !, remove_same_uri(URI, T0, T). remove_same_uri(_, L, L). uri_with_label_in(LabelPairs, Attributes, hit(URI, Distance, P, Label)) :- member(D-Label, LabelPairs), rdf(URI, P, literal(Label)), \+ rdf(URI, rdf:type, 'http://semanticweb.cs.vu.nl/prestoprime/Tag'), % hack ( member(P-F, Attributes) -> true ; member(AS-F, Attributes), rdfs_subproperty_of(P, AS) ), Distance is D+F. /******************************* * find literals * *******************************/ find_literal_distance(true, Tokenize, Literal, MatchType, Label, Distance) :- !, find_literal(Literal, Tokenize, MatchType, Label), literal_distance(Literal, Label, Distance). find_literal_distance(_, Tokenize, Literal, MatchType, Label, 0) :- find_literal(Literal, Tokenize, MatchType, Label). %% find_literal(+TextOrTokens, +MatchType, -Literal) % % Find literal values that contain all Text where the % matching is determined by MatchType. % Text is either an atom or a list of tokens as produced by % rdf_tokenize_literal/2. % MatchType is one of case, stem or prefix. find_literal(Text, MatchType, Literal) :- find_literal(Text, true, MatchType, Literal). find_literal(Text, true, MatchType, Literal) :- tokens(Text, Tokens), list_to_and(Tokens, MatchType, And), rdf_find_literals(And, Literals), member(Literal, Literals). find_literal(Text, false, MatchType, Result) :- M =.. [ MatchType, Text ], L =.. [ literal, M, Literal ], rdf(_,_,L), literal_text(Literal, Result). %% list_to_and(+TextOrTokens, +MatchType, -AndTerm) % % AndTerm is a conjunction of terms constructed from % MatchType and Token. % % For performance reasons a conjunction of prefix items % is not made. Instead only the last conjunct gets a prefix. list_to_and([], _, true). list_to_and([One], MatchType, Match) :- !, mkmatch(MatchType, One, Match). list_to_and([H|T], prefix, and(Match, And)) :- !, mkmatch(stem, H, Match), list_to_and(T, prefix, And). list_to_and([H|T], MatchType, and(Match, And)) :- mkmatch(MatchType, H, Match), list_to_and(T, MatchType, And). mkmatch(_, Number, Number) :- number(Number), !. mkmatch(stem, Token, stem(Token)). mkmatch(prefix, Token, prefix(Token)). mkmatch(case, Token, case(Token)). /******************************* * DISTANCE * *******************************/ goal_expansion(forall(C0, A0), \+ (C, \+ A)) :- expand_goal(C0, C), expand_goal(A0, A). %% literal_distance(+Lit1, +Lit2, -Distance) % % Compute the distance between two literals. Distance values are % >= 0, where 0 means perfect match. Handicaps are given to % inserted, deleted, moved and modified tokens. See the above URL % for a description of the best edit-sequence comparing strings. literal_distance(L1, L2, Distance) :- tokens(L1, TL1), tokens(L2, TL2), length(TL1, N1), length(TL2, N2), abs(N1-N2) =< min(N1, N2), % too much difference in length cheapest_edit_path(TL1, TL2, EditCost, _Path), Distance is EditCost/max(N1,N2). tokens(Tokens, Tokens) :- is_list(Tokens), !. tokens(Spec, Tokens) :- atom(Spec), !, rdf_tokenize_literal(Spec, Tokens). tokens(Number, [Number]) :- number(Number). %% cheapest_edit_path(+List1, +List2, -Distance, -Path) % % Compute the cheapest edit path. As edit operations are weighted, % this is not necessarily the shorted one, but the algorithm is % basically the same. % % @see http://www.ling.ohio-state.edu/~cbrew/2002/winter/684.02/string-distance.html cheapest_edit_path(Toks1, Toks2, Distance, Path) :- % DelCost = 10, % Delete token % InsCost = 8, % Insert token MatchCost = 0, % Matched token CaseCost = 1, % Different case DWIMCost = 3, % Spelling error StemCost = 5, % Tokens have same stem SubstCost = 10, % Replaced token MovCost = 2, % Token is moved T1 =.. [string|Toks1], T2 =.. [string|Toks2], functor(T1, _, M), functor(T2, _, N), X is M + 1, Y is N + 1, M1 is M - 1, N1 is N - 1, new_array(X, Y, Array), nb_set_array(Array, 0, 0, c(0, [])), forall(between(0, M1, I), ( get_array(Array, I, 0, c(V0, P0)), I1 is I+1, arg(I1, T1, D), del_cost(D, DC), V is V0 + DC, nb_link_array(Array, I1, 0, c(V, [del(D)|P0])) )), forall(between(0, N1, J), ( get_array(Array, 0, J, c(V0, P0)), J1 is J+1, arg(J1, T2, I), ins_cost(I, IC), V is V0 + IC, nb_link_array(Array, 0, J1, c(V, [ins(I)|P0])) )), forall(between(0, M1, I), forall(between(0, N1, J), ( I1 is I + 1, J1 is J + 1, arg(I1, T1, V1), arg(J1, T2, V2), ( V1 == V2 -> Subst = MatchCost ; downcase_atom(V1, L), downcase_atom(V2, L) -> Subst = CaseCost ; dwim_match(V1, V2) -> Subst = DWIMCost ; same_stem(V1, V2) -> Subst = StemCost ; Subst = SubstCost ), get_array(Array, I, J, c(C1, P1)), get_array(Array, I1, J, c(C2, P2)), get_array(Array, I, J1, c(C3, P3)), SubstC is C1 + Subst, ( memberchk(del(V2), P2) -> del_cost(V2, DC2), InsC is C2 - DC2 + MovCost ; ins_cost(V2, InsCost), InsC is C2 + InsCost ), ( memberchk(ins(V1), P3) -> ins_cost(V1, IC1), DelC is C3 - IC1 + MovCost ; del_cost(V1, DelCost), DelC is C3 + DelCost ), ( SubstC < InsC -> ( SubstC < DelC -> nb_link_array(Array, I1, J1, c(SubstC, [subst(V1, V2)|P1])) ; nb_link_array(Array, I1, J1, c(DelC, [del(V1)|P3])) ) ; ( DelC < InsC -> nb_link_array(Array, I1, J1, c(DelC, [del(V1)|P3])) ; nb_link_array(Array, I1, J1, c(InsC, [ins(V2)|P2])) ) )))), % pp_array(Array), get_array(Array, M, N, c(Distance, Path0)), reverse(Path0, Path). ins_cost((','), 1) :- !. ins_cost(_, 8). del_cost((','), 1) :- !. del_cost(_, 10). same_stem(T1, T2) :- atom(T1), atom(T2), !, porter_stem(T1, Stem), porter_stem(T2, Stem). /******************************* * SIMPLE ARRAY PACKAGE * *******************************/ new_array(X, Y, Array) :- Size is X * Y + 3, functor(Array, array, Size), arg(1, Array, 2), % dimensions arg(2, Array, X), % columns arg(3, Array, Y). % rows get_array(Array, X, Y, Val) :- arg(2, Array, Cols), Pos is X + Y*Cols + 4, arg(Pos, Array, Val). nb_set_array(Array, X, Y, Val) :- arg(2, Array, Cols), Pos is X + Y*Cols + 4, nb_setarg(Pos, Array, Val). nb_link_array(Array, X, Y, Val) :- arg(2, Array, Cols), Pos is X + Y*Cols + 4, nb_linkarg(Pos, Array, Val).