find_resource/commit
Provide routine find_resource_by_name/3
author | Michiel Hildebrand |
---|---|
Sat Jan 22 17:59:54 2011 +0100 | |
committer | Michiel Hildebrand |
Sat Jan 22 17:59:54 2011 +0100 | |
commit | 78f7df5b343d43dd4f110ea12f03b022bf2345fc |
tree | 9c895541cdf0d1eed63692875cef1565ac6f2e16 |
parent | fe7f3556d3a4b92e3e3c62d17db2d6e5a074b059 |
Diff style: patch stat
diff --git a/config-available/find_resource.pl b/config-available/find_resource.pl index af090a1..1719cba 100644 --- a/config-available/find_resource.pl +++ b/config-available/find_resource.pl @@ -1,5 +1,287 @@ -:- module(conf_find_resource, []). +:- module(conf_find_resource, + [ find_resource_by_name/3 % +Name, -Hits, +Options + ]). /** <module> Library to find RDF resources by a textual query */ +:- use_module(library(semweb/rdf_db)). +:- use_module(library(semweb/rdfs)). +:- use_module(library(semweb/rdf_litindex)). + +%% find_resource_by_name(+Name:atom, +Attributes:list(P-D), +%% -Hit:hit(Distance:nonneg,Hit:atom,Prop:atom,Label:atom), +%% +Options) is nondet. +% +% Find a resource based on a Name. +% +% Options +% * Match +% one of case, stem, prefix. Method to match the Name +% against the literal index. Default is case +% +% * Distance +% Boolean, compute literal distance if true. Default is true. The +% distance is based on two figures: +% * Mismatch of the label +% * Handicap of the attribute +% +% * Attribute +% List of the form Resource-Distance. Smaller +% distance is better, so rdfs:label typically is 0. +% If multiple attributes match, we take the best. + +find_resource_by_name(Name, Hits, Options) :- + rdf_equal(rdfs:label, P), + option(match(Match), Options, case), + option(distance(Distance), Options, true), + option(attributes(Attributes), Options, [P-0]), + catch(findall(D-Label, find_literal_distance(Distance, Name, Match, Label, D), Pairs), + no_stem(_), + fail), + findall(Hit, uri_with_label_in(Pairs, Attributes, Hit), Hits1), + sort(Hits1, Hits2), % sort by URI + remove_dup_uris(Hits2, Hits3), % take lowest on URI + sort(Hits3, Hits). % sort by distance + + +remove_dup_uris([], []). +remove_dup_uris([hit(URI,D,P,L)|T0], [hit(D,URI,P,L)|T]) :- + remove_same_uri(URI, T0, T1), + remove_dup_uris(T1, T). + +remove_same_uri(URI, [hit(URI,_,_,_)|T0], T) :- !, + remove_same_uri(URI, T0, T). +remove_same_uri(_, L, L). + +uri_with_label_in(LabelPairs, Attributes, hit(URI, Distance, P, Label)) :- + member(D-Label, LabelPairs), + rdf(URI, P, literal(Label)), + ( member(P-F, Attributes) + -> true + ; member(AS-F, Attributes), + rdfs_subproperty_of(P, AS) + ), + Distance is D+F. + + + /******************************* + * find literals * + *******************************/ + +find_literal_distance(true, Tokens, MatchType, Label, Distance) :- + !, + find_literal(Tokens, MatchType, Label), + literal_distance(Tokens, Label, Distance). +find_literal_distance(_, Tokens, MatchType, Label, 0) :- + find_literal(Tokens, MatchType, Label). + +%% find_literal(+TextOrTokens, +MatchType, -Literal) +% +% Find literal values that contain all Text where the +% matching is determined by MatchType. +% Text is either an atom or a list of tokens as produced by +% rdf_tokenize_literal/2. +% MatchType is one of case, stem or prefix. + + +find_literal(Text, MatchType, Literal) :- + tokens(Text, Tokens), + list_to_and(Tokens, MatchType, And), + rdf_find_literals(And, Literals), + member(Literal, Literals). + + +%% list_to_and(+TextOrTokens, +MatchType, -AndTerm) +% +% AndTerm is a conjunction of terms constructed from +% MatchType and Token. +% +% For performance reasons a conjunction of prefix items +% is not made. Instead only the last conjunct gets a prefix. + +list_to_and([], _, true). +list_to_and([One], MatchType, Match) :- !, + mkmatch(MatchType, One, Match). +list_to_and([H|T], prefix, and(Match, And)) :- !, + mkmatch(stem, H, Match), + list_to_and(T, prefix, And). +list_to_and([H|T], MatchType, and(Match, And)) :- + mkmatch(MatchType, H, Match), + list_to_and(T, MatchType, And). + + +mkmatch(_, Number, Number) :- + number(Number), !. +mkmatch(stem, Token, stem(Token)). +mkmatch(prefix, Token, prefix(Token)). +mkmatch(case, Token, case(Token)). + + /******************************* + * DISTANCE * + *******************************/ + +goal_expansion(forall(C0, A0), \+ (C, \+ A)) :- + expand_goal(C0, C), + expand_goal(A0, A). + + +%% literal_distance(+Lit1, +Lit2, -Distance) +% +% Compute the distance between two literals. Distance values are +% >= 0, where 0 means perfect match. Handicaps are given to +% inserted, deleted, moved and modified tokens. See the above URL +% for a description of the best edit-sequence comparing strings. + +literal_distance(L1, L2, Distance) :- + tokens(L1, TL1), + tokens(L2, TL2), + length(TL1, N1), + length(TL2, N2), + abs(N1-N2) =< min(N1, N2), % too much difference in length + cheapest_edit_path(TL1, TL2, EditCost, _Path), + Distance is EditCost/max(N1,N2). + +tokens(Tokens, Tokens) :- + is_list(Tokens), !. +tokens(Spec, Tokens) :- + atom(Spec), !, + rdf_tokenize_literal(Spec, Tokens). +tokens(Number, [Number]) :- + number(Number). + + +%% cheapest_edit_path(+List1, +List2, -Distance, -Path) +% +% Compute the cheapest edit path. As edit operations are weighted, +% this is not necessarily the shorted one, but the algorithm is +% basically the same. +% +% @see http://www.ling.ohio-state.edu/~cbrew/2002/winter/684.02/string-distance.html + +cheapest_edit_path(Toks1, Toks2, Distance, Path) :- +% DelCost = 10, % Delete token +% InsCost = 8, % Insert token + MatchCost = 0, % Matched token + CaseCost = 1, % Different case + DWIMCost = 3, % Spelling error + StemCost = 5, % Tokens have same stem + SubstCost = 10, % Replaced token + MovCost = 2, % Token is moved + + T1 =.. [string|Toks1], + T2 =.. [string|Toks2], + functor(T1, _, M), + functor(T2, _, N), + X is M + 1, + Y is N + 1, + M1 is M - 1, + N1 is N - 1, + new_array(X, Y, Array), + nb_set_array(Array, 0, 0, c(0, [])), + forall(between(0, M1, I), + ( get_array(Array, I, 0, c(V0, P0)), + I1 is I+1, + arg(I1, T1, D), + del_cost(D, DC), + V is V0 + DC, + nb_link_array(Array, I1, 0, c(V, [del(D)|P0])) + )), + forall(between(0, N1, J), + ( get_array(Array, 0, J, c(V0, P0)), + J1 is J+1, + arg(J1, T2, I), + ins_cost(I, IC), + V is V0 + IC, + nb_link_array(Array, 0, J1, c(V, [ins(I)|P0])) + )), + forall(between(0, M1, I), + forall(between(0, N1, J), + ( I1 is I + 1, + J1 is J + 1, + arg(I1, T1, V1), + arg(J1, T2, V2), + ( V1 == V2 + -> Subst = MatchCost + ; downcase_atom(V1, L), + downcase_atom(V2, L) + -> Subst = CaseCost + ; dwim_match(V1, V2) + -> Subst = DWIMCost + ; same_stem(V1, V2) + -> Subst = StemCost + ; Subst = SubstCost + ), + get_array(Array, I, J, c(C1, P1)), + get_array(Array, I1, J, c(C2, P2)), + get_array(Array, I, J1, c(C3, P3)), + + SubstC is C1 + Subst, + ( memberchk(del(V2), P2) + -> del_cost(V2, DC2), + InsC is C2 - DC2 + MovCost + ; ins_cost(V2, InsCost), + InsC is C2 + InsCost + ), + ( memberchk(ins(V1), P3) + -> ins_cost(V1, IC1), + DelC is C3 - IC1 + MovCost + ; del_cost(V1, DelCost), + DelC is C3 + DelCost + ), + + ( SubstC < InsC + -> ( SubstC < DelC + -> nb_link_array(Array, I1, J1, + c(SubstC, [subst(V1, V2)|P1])) + ; nb_link_array(Array, I1, J1, + c(DelC, [del(V1)|P3])) + ) + ; ( DelC < InsC + -> nb_link_array(Array, I1, J1, + c(DelC, [del(V1)|P3])) + ; nb_link_array(Array, I1, J1, + c(InsC, [ins(V2)|P2])) + ) + )))), +% pp_array(Array), + get_array(Array, M, N, c(Distance, Path0)), + reverse(Path0, Path). + +ins_cost((,), 1) :- !. +ins_cost(_, 8). + +del_cost((,), 1) :- !. +del_cost(_, 10). + +same_stem(T1, T2) :- + atom(T1), atom(T2), !, + porter_stem(T1, Stem), + porter_stem(T2, Stem). + + + /******************************* + * SIMPLE ARRAY PACKAGE * + *******************************/ + +new_array(X, Y, Array) :- + Size is X * Y + 3, + functor(Array, array, Size), + arg(1, Array, 2), % dimensions + arg(2, Array, X), % columns + arg(3, Array, Y). % rows + +get_array(Array, X, Y, Val) :- + arg(2, Array, Cols), + Pos is X + Y*Cols + 4, + arg(Pos, Array, Val). + +nb_set_array(Array, X, Y, Val) :- + arg(2, Array, Cols), + Pos is X + Y*Cols + 4, + nb_setarg(Pos, Array, Val). + +nb_link_array(Array, X, Y, Val) :- + arg(2, Array, Cols), + Pos is X + Y*Cols + 4, + nb_linkarg(Pos, Array, Val). diff --git a/lib/find_resource.pl b/lib/find_resource.pl new file mode 100644 index 0000000..f3532c5 --- /dev/null +++ b/lib/find_resource.pl @@ -0,0 +1 @@ +:- module(find_resource, []). diff --git a/rdf/cpack/find_resource.ttl b/rdf/cpack/find_resource.ttl index afe63eb..36aa0c0 100644 --- a/rdf/cpack/find_resource.ttl +++ b/rdf/cpack/find_resource.ttl @@ -14,10 +14,7 @@ <> a cpack:Library ; cpack:packageName "find_resource" ; dcterms:title "Library to find RDF resources by a textual query" ; - cpack:author [ a foaf:Person ; - foaf:name "@FOAFNAME@" ; - foaf:mbox "@FOAFMBOX@" ; - ] ; + cpack:author <http://www.few.vu.nl/~michielh/me> ; cpack:primaryRepository [ a cpack:GitRepository ; cpack:gitURL <git://semanticweb.cs.vu.nl/home/hildebra/git/ClioPatria/find_resource.git>