tag_matcher/commit

add support for stemming

authorMichiel Hildebrand
Sat Jan 22 20:08:55 2011 +0100
committerMichiel Hildebrand
Sat Jan 22 20:08:55 2011 +0100
commit52c060dfaeacd7d8a3cd4577e26388e53ef7b1f3
tree3fde034faaf605f72d466331d9c8ee190c9f788e
parent2dbeeb2fcc7ad73722c71cb9ae6e7591f18a66d8
Diff style: patch stat
diff --git a/api/tag_matcher.pl b/api/tag_matcher.pl
index 95065a9..58d84a2 100644
--- a/api/tag_matcher.pl
+++ b/api/tag_matcher.pl
@@ -1,5 +1,6 @@
 :- module(tag_matcher,
-	  [ find_tag_concept/3
+	  [ tag_match/5,
+	    find_tag_concept/3
 	  ]).
 
 :- use_module(library(http/http_dispatch)).
@@ -12,8 +13,10 @@
 :- use_module(library(semweb/rdf_db)).
 :- use_module(library(semweb/rdfs)).
 :- use_module(library(semweb/rdf_litindex)).
-:- use_module(library(find_resource)).
+:- use_module(library(snowball)).
+
 :- use_module(components(label)).
+:- use_module(library(find_resource)).
 
 /** <module> Tag matcher
 
@@ -44,10 +47,11 @@ http_tag_match_page(Request) :-
 			  t2(Tag2,
 			     [optional(true), description('another tag')])
 			]),
-	(   nonvar(Tag1),
-	    nonvar(Tag2)
-	->  tag_match(Tag1, Tag2, T1Data, T2Data, Type)
-	;   true
+	(   (var(Tag1); var(Tag2))
+	->  true
+	;   tag_match(Tag1, Tag2, MatchType, T1Data, T2Data)
+	->  true
+	;   MatchType = false
 	),
 	reply_html_page(cliopatria(default),
 			[title('Tag matcher'),
@@ -69,7 +73,7 @@ http_tag_match_page(Request) :-
 				    \html_match_examples
 				  ])),
 			 div(class('tag-match'),
-			     \html_tag_match(Type)),
+			     \html_tag_match(MatchType)),
 			 div(class('tag-data'),
 			     [ \html_tag_data(1, T1Data),
 			       \html_tag_data(2, T2Data)
@@ -112,26 +116,35 @@ data_cell(R) -->
 html_tag_match(Type) -->
 	{ var(Type) },
 	!.
-html_tag_match(@false) -->
+html_tag_match(false) -->
 	html(h3('no match found')).
 html_tag_match(Type) -->
 	html(h3(['match: ', Type])).
 
 html_match_examples -->
-	html([\example_link(exact, 'obama', 'obama'),
-	      ' | ',
-	      \example_link(synonym, 'auto', 'wagen'),
-	      ' | ',
-	      \example_link(specific, labrador, hond),
-	      ' | ',
-	      \example_link(generic, paal, amsterdammertje)
-	     ]).
-
-example_link(Name, Tag1, Tag2) -->
+	{ findall(e(N,T1,T2), example(N,T1,T2), Es)
+	},
+	html_example_links(Es).
+
+html_example_links([E]) -->
+	example_link(E).
+html_example_links([E|Es]) -->
+	example_link(E),
+	html(' | '),
+	html_example_links(Es).
+
+example_link(e(Name, Tag1, Tag2)) -->
 	{ http_location_by_id(http_tag_match_page, Link)
 	},
 	html(a(href(Link+'?t1='+Tag1+'&t2='+Tag2), Name)).
 
+example(exact, verhagen, verhagen).
+example(stemming, auto, autos).
+example(synonym, auto, wagen).
+example('synonym: word order', 'geert wilders', 'wilders, geert').
+example(specific, labrador, hond).
+example(specific2, 'amsterdam-oost', amsterdam).
+example(generic, paal, amsterdammertje).
 
 %%	http_tag_match_api(+Request)
 %
@@ -142,37 +155,51 @@ http_tag_match_api(Request) :-
 			[ t1(Tag1, [description('a tag')]),
 			  t2(Tag2, [description('another tag')])
 			]),
- 	tag_match(Tag1, Tag2, T1Data, T2Data, Type),
+ 	(   tag_match(Tag1, Tag2, MatchType, T1Data, T2Data)
+	->  true
+	;   MatchType = @false,
+	    T1Data = [], T2Data = []
+	),
 	reply_json(json([t1=json([tag=Tag1|T1Data]),
 			 t2=json([tag=Tag2|T2Data]),
-			 match=Type
+			 match=MatchType
 			])).
 
-tag_match(T, T, TData, _, exact) :-	!,
+%%	tag_match(+Tag1, +Tag2, -Match, -Tag1Data, -Tag2Data)
+%
+%	Succeeds if Tag1 and Tag2 match exactly or there is a relation
+%	between the concepts associcated with the Tags. TagData provides
+%	info about how the matching concepts are found.
+
+tag_match(T, T, exact, TData, _) :-	!,
 	(   find_tag_concept(T, _R, Hit)
 	->  tag_data(Hit, TData)
 	;   TData = []
  	).
-tag_match(T1, T2, TData, _, synonym) :-
-	find_tag_concept(T1, R, Hit),
-	find_tag_concept(T2, R, _Hit),
-	!,
-	tag_data(Hit, TData).
-tag_match(T1, T2, T1Data, T2Data, specific) :-
-	find_tag_concept(T1, R1, Hit1),
-	find_tag_concept(T2, R2, Hit2),
-	ancestor_of(R1, R2),
+tag_match(T1, T2, stem, TData, _) :-
+	snowball(dutch, T1, T),
+	snowball(dutch, T2, T),
 	!,
-	tag_data(Hit1, T1Data),
-	tag_data(Hit2, T2Data).
-tag_match(T1, T2, T1Data, T2Data, generic) :-
+	(   find_tag_concept(T, _R, Hit)
+	->  tag_data(Hit, TData)
+	;   TData = []
+ 	).
+tag_match(T1, T2, Type, T1Data, T2Data) :-
 	find_tag_concept(T1, R1, Hit1),
 	find_tag_concept(T2, R2, Hit2),
-	ancestor_of(R2, R1),
+	tag_concept_match(R1, R2, Type),
 	!,
 	tag_data(Hit1, T1Data),
 	tag_data(Hit2, T2Data).
-tag_match(_, _, [], [], @false).
+
+tag_concept_match(R, R, synonym) :- !.
+tag_concept_match(R1, R2, specific) :-
+ 	ancestor_of(R1, R2),
+	!.
+tag_concept_match(R1, R2, generic) :-
+ 	ancestor_of(R2, R1),
+	!.
+
 
 tag_data(hit(D,R,P,L), Data) :-
 	Data = [uri=R, match=L, matchtype=P, distance=D, type=Type],
@@ -191,13 +218,14 @@ find_tag_concept(Tag, R, Hit) :-
 		     PrefLabel-0,
 		     Label-1
 		    ],
-	Options = [match(case),
-		   distance(true),
-		   attributes(LabelList)
+	Options = [match(stem),
+		   attributes(LabelList),
+		   distance(true)
 		  ],
 	find_resource_by_name(Tag, Hits, Options),
 	member(Hit, Hits),
-	Hit = hit(_,R,_,_).
+	Hit = hit(D,R,_,_),
+	D =< 3.
 
 
 %%	tag_concept_type(+Concept, -Type)
@@ -211,9 +239,9 @@ tag_concept_type(R, Type) :-
 tag_concept_type(_, unknown).
 
 scheme_type(gtaa:'Maker', person) :- !.
-scheme_type(gtaa:'Namen', person):- !.
-scheme_type(gtaa:'Persoonsnamen', person) :- !.
-scheme_type(gtaa:'GeografischeNamen', location) :- !.
+scheme_type(gtaa:'Name', person):- !.
+scheme_type(gtaa:'Person', person) :- !.
+scheme_type(gtaa:'Location', location) :- !.
 
 %%	ancestor_of(+R, +Ancestor)
 %