/*  This file is part of ClioPatria.

    Author:
    HTTP:	http://e-culture.multimedian.nl/
    GITWEB:	http://gollem.science.uva.nl/git/ClioPatria.git
    GIT:	git://gollem.science.uva.nl/home/git/ClioPatria.git
    GIT:	http://gollem.science.uva.nl/home/git/ClioPatria.git
    Copyright:  2007, E-Culture/MultimediaN

    ClioPatria is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 2 of the License, or
    (at your option) any later version.

    ClioPatria is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with ClioPatria.  If not, see <http://www.gnu.org/licenses/>.
*/

:- module(kwd_search,
	  [ match_uri/7,	% +Atom, +Range, +Threshold, +Map, -URI, -Score, -Path
	    match_string/7,	% +Atom, +Range, +Threshold, +Map, -URI, -Score, -Path
	    search_string/6,	% +Atom, +Range, +Threshold, +Map, -Score, -Path
	    search_graph/7,	% +From, +Range, +Score0, +Threshold, +Map, -Score, -Path
	    find_literals/3	% +Search, -ScoredLiterals, +Options
	  ]).
:- use_module(library(lists)).
:- use_module(library(assoc)).
:- use_module(library(option)).
:- use_module(library(pairs)).
:- use_module(library(debug)).
:- use_module(library(semweb/rdf_db)).
:- use_module(library(semweb/rdfs)).
:- use_module(library(semweb/rdf_litindex)).
:- use_module(library(porter_stem)).
:- use_module(library(semweb/owl)).
:- use_module(rdf_search).
:- use_module(rdf_cluster).
:- use_module(graph_search).
:- use_module(fuzzy).

:- rdf_meta
	match_string(+, r, +, +, -),
	search_string(+, t, +, +, -, -).

%%	match_string(+Atom, -Matches, +Options)
%
%	Find matches for the literal string Atom, controlled by Options.
%	Options supported are:
%
%	    * filter(+URI, :Goal)
%	    Calls Goal to filter target objects.
%
%	    * score(+Path, -Score, :Goal)
%	    Calls Goal to rate Path.  Score must be unified with a
%	    float between 0.0 and 1.0.  If Goal fails, this path is
%	    discarded.
%
%	    * score_literal(+Literal, -Score, :Goal)
%	    Score the initial literal match.
%
%	    * threshold(Float)
%	    Float in the range 0.0..1.0, stopping the search if
%	    the score of a path drops below Float.
%
%	    * threshold(Max, Min, MinHits)
%	    Perform iterative lowering of the threshold, starting at
%	    Max and stopping at Min or after at least MinHits
%	    hits have been found.
%
%	Example:
%
%	==
%	match_string(gogh, Matches,
%		     [ filter(URI, rdfs_individual_of(URI, ulan:'Person')),
%		       score([rdfs:label|_], 1, true)
%		     ])
%	==
%
%	@param Matches is a list of hit(URI, Score, Path) terms.

%match_string(Word, Matches, Options) :-
%	select(threshold(Max, Min, MinHits), Options, Options1), !,
%	iterative_threshold(4, Max, Min, Threshold),
%	match_string(Word, Matches, [threshold(Threshold)|Options1]),
%	length(Matches, HitCount),
%	HitCount >= MinHits.
%match_string(Word, Matches, Options) :-
%	find_literals(Word, ScoredLiterals, Options),
%	search_graph(ScoredLiterals, Matches, Options).

%%	iterative_threshold(+Max, +Min, +Steps, -Threshold) is nondet.
%
%	Generate thresholds, starting at Max and going down to Min in
%	Steps equal relative steps.

%iterative_threshold(Max, Min, Steps, Threshold) :-
%	Steps1 is Steps - 1,
%	Itv = log(Max) - log(Min),
%	between(0, Steps1, Step),
%	Threshold is exp(log(Max) - (Itv*Step/Steps1)).
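% For illustration: if the commented-out iterative_threshold/4 above is
% re-enabled, it enumerates thresholds in equal *relative* (geometric)
% steps from Max down to Min.  Note that the commented-out caller passes
% the step count as the first argument, while the definition expects it
% as the third.  Assuming the definition's argument order and made-up
% values Max = 1.0, Min = 0.1 and Steps = 4:
%
%	==
%	?- forall(iterative_threshold(1.0, 0.1, 4, T), (write(T), nl)).
%	==
%
% prints approximately 1.0, 0.464, 0.215 and 0.1.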
%%	match_string(+Atom, +Range, +Threshold, +Map,
%%		     -URI, -Score, -Path) is nondet.
%
%	Search for URIs related to the string Atom.
%
%	@param Atom	 Search specification.  Will be tokenized.
%	@param Range	 Target objects.  Passed to owl_satisfies/2.
%	@param Threshold Search cut-off threshold (0..1)
%	@param Map	 List of Pred-Factor.
%	@param URI	 Matching URI
%	@param Score	 Score of the match (0..1)
%	@param Path	 List of Pred-Obj with the path from the matching
%			 literals to URI.  Note that this does *not*
%			 include URI.
%	@see	search_string/6
%	@tbd	Complete this description

match_string(Atom, Range, Threshold, Map, URI, Score, Path) :-
	findall(Score1-Path1,
		search_string(Atom, all_values_from(Range), Threshold, Map,
			      Score1, Path1),
		Pairs0),
	join_probabilities(Pairs0, Pairs1),
	keysort(Pairs1, Pairs2),
	reverse(Pairs2, Pairs3),
	member(Score-URI, Pairs3),
	findall(Score2-Path2,
		member(Score2-[URI|Path2], Pairs0),
		Paths),
	keysort(Paths, Paths0),
	reverse(Paths0, [_Score-Path|_Tail]).	% TBD: should this _Score not be the result?

match_uri(Atom, Range, Threshold, Map, URI, Score, Path) :-
	assoc_map(Map, Assoc),
	findall(Score1-Path1,
		search_graph(Atom, all_values_from(Range), 1, Threshold, Assoc,
			     Score1, Path1),
		Pairs0),
	join_probabilities(Pairs0, Pairs1),
	keysort(Pairs1, Pairs2),
	reverse(Pairs2, Pairs3),
	member(Score-URI, Pairs3),
	findall(Score2-Path2,
		member(Score2-[URI|Path2], Pairs0),
		Paths),
	keysort(Paths, Paths0),
	reverse(Paths0, [_Score-Path|_Tail]).

%%	search_string(+Atom, +Range, +Threshold, +Map, -Score, -Path)
%
%	Return all resources belonging to Range that either have a
%	literal attribute matching Atom or are related to a resource
%	with such an attribute.  Range is a description for
%	owl_satisfies/2; the most typical usage is
%	=|all_values_from(Class)|=.
%
%	Using one_of([Resource]) you can produce all paths leading to a
%	specific resource.
%
%	@tbd	This is an old interface used by the old basic search that
%		is now hacked to run on top of the new search
%		infrastructure.
%
%	@param Map	List of Pred-Factor.  The current strength is
%			multiplied by Factor for Pred.  If the strength
%			becomes lower than Threshold, the search is
%			stopped.
%	@param Path	List of [Target, R1-P1, R2-P2, ... literal(X)-PN]

search_string(Atom, Range, Threshold, _Map, Score, Path) :-
	graph_search(Atom, State,
		     [ filter([owl_satisfies(Range)]),
		       threshold(Threshold),
		       prune(true)
		     ]),
	rdf_search_property(State, graph(Graph)),
	rdf_search_property(State, targets(Targets)),
	member(Score-Target, Targets),
	rdf_cluster:search_path(Target, Graph, [_Target|IPath]),
	op_pairs(IPath, Path1),
	Path = [Target|Path1],
	debug(path, 'Path = ~p', [Path]).

%%	op_pairs(+List, -Pairs) is det.
%
%	Translate between the new path descriptions and the old ones.
%	We also used to delete owl:sameAs here, but this is not the
%	proper place, as we want to highlight the use of sameAs in the
%	interface.

op_pairs([], []).
%op_pairs([P,_O|T0], T) :-
%	rdf_equal(P, owl:sameAs), !,
%	op_pairs(T0, T).
op_pairs([P,O|T0], [O-P|T]) :-
	op_pairs(T0, T).
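% For illustration, with made-up predicate and object atoms: op_pairs/2
% turns the alternating Predicate,Object tail of a search path into the
% Object-Predicate pairs used in the Path argument of search_string/6:
%
%	==
%	?- op_pairs([p1, o1, p2, o2], Pairs).
%	Pairs = [o1-p1, o2-p2].
%	==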
		 /*******************************
		 *	 LITERAL HANDLING	*
		 *******************************/

%%	find_literals(+SearchFor, -ScoredLiterals:list(Score-Literal),
%%		      +Options) is det.
%
%	Find all literals in the database that are related to SearchFor
%	by at least one common stem.  Order these literals by the
%	quality of the match.  Options:
%
%	    * threshold(+Float)
%	    If present, ignore literals that match with a score
%	    lower than Float.
%
%	@tbd	Allow for more than stem and "compound term" matches.
%	@bug	Tokenization only deals with ISO Latin-1 text.

find_literals(Search, Literals, Options) :-
	option(threshold(Threshold), Options, 0.0),
	(   rdf_tokenize_literal(Search, Tokens)
	->  true
	;   Tokens = [Search]			% HACK
	),
	all_literals(Tokens, Literals0),
	sort(Literals0, Literals1),
	length(Literals1, NL1),
	debug(find_literals, '~D matches', [NL1]),
	sort_matches(Literals1, Tokens, Threshold, Literals).

all_literals(Tokens, Literals) :-
	compound_search_tokens(Tokens, Compounds, AllTokens),
	list_to_and(AllTokens, LitCond),
	rdf_find_literals(LitCond, Ls0),
	debug(find_literals_detail, 'Matching literals ~p', [Ls0]),
	(   Compounds == []
	->  Literals = Ls0
	;   tokenize_hits(Ls0, Tokenized),
	    filter_compounds(Compounds, Tokenized, KeyedLiterals),
	    pairs_values(KeyedLiterals, Literals)
	).

%%	compound_search_tokens(+Tokens, -Compounds:list(list), -AllTokens)
%
%	Extract all tokens and the compounds that must be matched.

compound_search_tokens([], [], []).
compound_search_tokens(['"'|Rest], [Compound|RC], Words) :-
	append(Compound, ['"'|T], Rest),
	append(Compound, RW, Words), !,
	compound_search_tokens(T, RC, RW).
compound_search_tokens([H0|T], RC, [H|RW]) :-
	mkmatch(H0, H),
	compound_search_tokens(T, RC, RW).

list_to_and([], true).
list_to_and([One], One) :- !.
list_to_and([H|T], and(H, And)) :- !,
	list_to_and(T, And).

mkmatch(Number, Number) :-
	number(Number), !.
mkmatch(Atom, stem(Atom)).

%%	tokenize_hits(+Literals:list(atom), -Keyed:list(list-atom)) is det.

tokenize_hits([], []).
tokenize_hits([H|T0], [L-H|T]) :-
	(   tokenize_atom(H, L)
	->  true
	;   L = [H]				% HACK
	),
	tokenize_hits(T0, T).

filter_compounds([], Literals, Literals).
filter_compounds([C0|C], L0, L) :-
	filter_compound(L0, C0, L1),
	filter_compounds(C, L1, L).

filter_compound([], _, []).
filter_compound([H|T0], Compound, [H|T]) :-
	H = TL-_,
	append(_, Rest, TL),
	same_tokens(Compound, Rest), !,
	filter_compound(T0, Compound, T).
filter_compound([_|T0], Compound, T) :-
	filter_compound(T0, Compound, T).

same_tokens([], _).
same_tokens([H|T0], [H|T]) :- !,
	same_tokens(T0, T).
same_tokens([-|T0], T) :- !,
	same_tokens(T0, T).
same_tokens(T0, [-|T]) :- !,
	same_tokens(T0, T).
same_tokens([H0|T0], [H1|T]) :-
	porter_stem(H0, Stem),
	porter_stem(H1, Stem),
	same_tokens(T0, T).

%%	sort_matches(+Matches, +Search, +Threshold, -Set)
%
%	Sort matches by score, best first.  Scoring is done using the
%	search target in Search.  All scores below Threshold are removed
%	from the result set.
%
%	@tbd	If Fuzzy is not =off=, the score is always 1 and Set is
%		the same as Matches.  Should be cleaned.

sort_matches(Set0, Search, Threshold, Set) :-
	debug(find_literals,
	      'Step1D: Score literals and remove the ones below threshold', []),
	tag_match_score(Set0, Search, Threshold, Tagged),
	debug(find_literals, 'Step1E: Sort list of literals by score', []),
	keysort(Tagged, Set1),
	reverse(Set1, Set),
	debug(find_literals_detail, 'Sorted literals ~p', [Set]).

%%	tag_match_score(+Set:list(atom), +Search, +Threshold,
%%			-Result:list(Score-Atom))
%
%	Score literals on how well they match Search.  Matches below
%	Threshold are deleted from the result.

tag_match_score([], _, _, []).
tag_match_score([H|T0], Search, Threshold, L) :-
	match_score(Search, H, S),
	%debug(search, 'Score ~w~n', [S]),
	(   S >= Threshold
	->  L = [S-H|T]
	;   L = T
	),
	tag_match_score(T0, Search, Threshold, T).
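% For illustration, assuming the tokenizer turned the query "van gogh"
% painter into the token list below: compound_search_tokens/3 (defined
% above) extracts the quoted compound and wraps the remaining words in
% stem/1, after which list_to_and/2 builds the condition handed to
% rdf_find_literals/2:
%
%	==
%	?- compound_search_tokens(['"', van, gogh, '"', painter], Cs, Ts),
%	   list_to_and(Ts, Cond).
%	Cs = [[van, gogh]],
%	Ts = [van, gogh, stem(painter)],
%	Cond = and(van, and(gogh, stem(painter))).
%	==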
%%	match_score(+Search, +Literal, -Score) is det.
%
%	Determine the quality of the match.  There are two cases: either
%	Literal is the same or about the same as Search, or Literal is a
%	long literal and Search only provides a few of the stems or
%	words in Literal.  In the first case we use the `minimal edit'
%	algorithm from fuzzy.pl.  In the latter case the score depends
%	on:
%
%	    * The percentage of tokens from Search found in Literal
%	    * Whether they are stem or perfect matches
%	    * The longest adjacent (stem-based) sequence that appears
%
%	Values are normalised to 0...1, where 1 is a perfect match.

match_score(Search, Literal, Score) :-
	tokens(Search, SearchTokens),
	tokens(Literal, LiteralTokens),
	(   SearchTokens == LiteralTokens
	->  Score = 1,
	    debug(find_literals,
		  'Step1D1 Rate: SearchTokens ~p, LiteralTokens ~p, perfect match Score 1.',
		  [SearchTokens, LiteralTokens])
	;   literal_distance(SearchTokens, LiteralTokens, LD)
	->  Score is 3/(3+LD)
	;   add_stems(SearchTokens, SearchStems),
	    add_stems(LiteralTokens, LitStems),
	    match_score2(SearchStems, LitStems, Score)
	).

tokens(In, Tokens) :-
	atom(In), !,
	rdf_tokenize_literal(In, Tokens).
tokens(In, In) :-
	assertion(is_list(In)).

match_score2(Search, Lit, Score) :-
	same_by_stem(Search, Lit), !,
	Score = 0.95.
match_score2(Search, Lit, Score) :-
	phrase(matches(Search, 1, Lit), Matches),
	match_count(Matches, Count),
	length(Lit, Len),
	Score is min(1, Count/Len).

same_by_stem([], []).
same_by_stem([_-S|T0], [_-S|T]) :-
	same_by_stem(T0, T).

%%	matches(+For:list(W-S), +Start, +In:list(W-S))//
%
%	Produces a list of m(Match, PosIn, PosFor), where Match is 1 for
%	exact matches and 0.8 if only the stem matches.

matches([], _, _) --> [].
matches([W-S|T], I, Lit) -->
	match(Lit, I, 1, W, S),
	{ I2 is I + 1 },
	matches(T, I2, Lit).

match([], _, _, _, _) --> [].
match([LW-S|T], I, N, W, S) --> !,
	(   { W == LW }
	->  [ m(1, I, N) ]
	;   [ m(0.8, I, N) ]
	),
	{ N2 is N + 1 },
	match(T, I, N2, W, S).
match([_|T], I, N, W, S) -->
	{ N2 is N + 1 },
	match(T, I, N2, W, S).

%%	match_count(+Matches, -Score) is det.
%
%	Sum the found matches.  The idea was to give extra credit to
%	contiguous chunks, but I cannot find that in the code.  A bug?

match_count([], 0).
match_count([m(S,I,N)|T], Score) :-
	sequence(I, N, T, T1, S, Score1),
	match_count(T1, Score2),
	Score is Score1 + Score2.

sequence(I, N, T0, T, S, Score) :-
	I2 is I + 1,
	N2 is N + 1,
	select(m(S2,I2,N2), T0, T1), !,
	Score1 is S+S2,
	sequence(I2, N2, T1, T, Score1, Score).
sequence(_, _, T, T, S, S).

%%	add_stems(+Words:list(atom), -Stemmed:list(Word-Stem)) is det.
%
%	@tbd	Deal with non-iso-latin-1 text

add_stems([], []).
add_stems([H|T0], [H-Stem|T]) :-
	(   porter_stem(H, Stem)
	->  true
	;   Stem = H
	),
	add_stems(T0, T).


		 /*******************************
		 *	       SEARCH		*
		 *******************************/

%%	assoc_map(+List, -Assoc)
%
%	Translate a list of P=F terms into an assoc, turning all
%	NS:Local terms into global predicates.

assoc_map(List, Assoc) :-
	empty_assoc(Assoc0),
	assoc_map(List, Assoc0, Assoc).

assoc_map([], Assoc, Assoc).
assoc_map([P=F|T], Assoc0, Assoc) :-
	rdf_global_id(P, G),
	put_assoc(G, Assoc0, F, Assoc1),
	assoc_map(T, Assoc1, Assoc).
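% For illustration, with made-up weight factors: assoc_map/2 expands
% NS:Local keys to full URIs and passes plain atoms such as `default'
% through unchanged, producing the Map consulted by update_score/5
% below:
%
%	==
%	?- assoc_map([rdfs:label=1.0, default=0.8], Map),
%	   get_assoc('http://www.w3.org/2000/01/rdf-schema#label', Map, F).
%	F = 1.0.
%	==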
%%	search_graph(+From, +Range, +Score0, +Threshold, +Map,
%%		     -Score, -Path)
%
%	Search the RDF graph starting at From, looking for an object O
%	for which owl_satisfies(O, Range) succeeds.  Score0 is the
%	initial score.  The search stops if the score drops below
%	Threshold.  Map is an assoc from predicate to a number between 0
%	and 1.  If a predicate is not in the map, the system looks for a
%	super-predicate that is in the map.  If that fails, it looks for
%	`default'.
%
%	Score is the final score of the path and Path is a list that
%	starts with the found target.  The other elements in the list
%	are pairs of the form Object-Predicate.  So, the path
%
%	    [T, R1-P1, R2-P2]
%
%	is created from a search R2 <-P2-> R1 <-P1-> T

search_graph(From, Range, Score0, Threshold, Map, Score, Path) :-
	empty_assoc(Done),
	search_graph(From, Range, Score0, Threshold, Map, Done, Score, Path0),
	reverse(Path0, Path).

search_graph(Object, Range, Score, _, _, _, Score, Path) :-
	owl_satisfies(Range, Object),
	Path = [Object].
search_graph(Object, Range, Score0, Threshold, Map, Done, Score,
	     [Object-P|Path]) :-
	(   rdf_has(O2, P, Object)
	;   atom(Object),
	    rdf_has(Object, IP, O2),
	    rdf_has(P, owl:inverseOf, IP)
	),
	\+ get_assoc(O2, Done, _),
	update_score(Score0, P, Map, Map1, Score1),
	Score1 >= Threshold,
	put_assoc(O2, Done, true, Done1),
	search_graph(O2, Range, Score1, Threshold, Map1, Done1, Score, Path).

update_score(Score0, P, Map, Map, Score) :-
	get_assoc(P, Map, F), !,
	Score is Score0*F.
update_score(Score0, P, Map0, Map, Score) :-
	(   rdfs_subproperty_of(P, Super),
	    get_assoc(Super, Map0, F)
	->  true
	;   get_assoc(default, Map0, F)
	),
	put_assoc(P, Map0, F, Map),
	Score is Score0*F.


		 /*******************************
		 *	JOIN PROBABILITIES	*
		 *******************************/

%%	join_probabilities(+Paths:list(Prob-Path),
%%			   -URIs:list(Prob-URI)) is det.
%
%	Given a list of pairs Probability-Path, create a list
%	Probability-URI, merging the probabilities of each path that
%	leads to the same final resource.

join_probabilities(P, Merged) :-
	key_prop_by_uri(P, Keyed),
	keysort(Keyed, Sorted),
	join_by_uri(Sorted, Merged).

key_prop_by_uri([], []).
key_prop_by_uri([P-[URI|_]|T0], [URI-P|T]) :-
	key_prop_by_uri(T0, T).

join_by_uri([], []).
join_by_uri([URI-P0|T0], [P-URI|T]) :-
	same_uri(URI, T0, Ps, T1),
	joined_probability([P0|Ps], P),
	join_by_uri(T1, T).

same_uri(URI, [URI-P|T0], [P|TP], T) :- !,
	same_uri(URI, T0, TP, T).
same_uri(_, T, [], T).

joined_probability(List, Joined) :-
	sum_p_no(List, Sum),
	Joined is 1 - Sum.

sum_p_no([], 1).
sum_p_no([H|T], P) :-
	sum_p_no(T, P0),
	P is P0 * (1-H).
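% For illustration: joined_probability/2 combines the scores of
% independent paths to the same resource noisy-or style, computing
% Joined = 1 - ((1-P1) * ... * (1-Pn)), so two paths that each score
% 0.5 yield a combined score of 0.75:
%
%	==
%	?- joined_probability([0.5, 0.5], P).
%	P = 0.75.
%	==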