amalgame/commit

REFACTORING: compound label matching no follows the new generator/partition convention

authorJacco van Ossenbruggen
Mon Sep 8 09:34:40 2014 +0200
committerJacco van Ossenbruggen
Mon Sep 8 09:34:40 2014 +0200
commit15df88d9521ad54868de5e55bde478c3a7dc359e
tree84a037dc4db85f0cd033a770f8c91c6332f7dacf
parent9047ef14a86eab9d5ccd06443570531af33876ec
Diff style: patch stat
diff --git a/config-available/ag_modules.pl b/config-available/ag_modules.pl
index 7558085..2e0fb0b 100644
--- a/config-available/ag_modules.pl
+++ b/config-available/ag_modules.pl
@@ -2,6 +2,7 @@
 
 % Candidate correspondence generator components:
 :- use_module(library(ag_modules/ancestor_generator)).
+:- use_module(library(ag_modules/compound_label_generator)).
 :- use_module(library(ag_modules/descendent_generator)).
 :- use_module(library(ag_modules/exact_label_generator)).
 :- use_module(library(ag_modules/isub_generator)).
@@ -9,6 +10,7 @@
 
 % Mapping producing partitioners:
 :- use_module(library(ag_modules/ancestor_selecter)).
+:- use_module(library(ag_modules/compound_label_selecter)).
 :- use_module(library(ag_modules/descendent_selecter)).
 :- use_module(library(ag_modules/exact_label_selecter)).
 :- use_module(library(ag_modules/isub_selecter)).
@@ -21,7 +23,6 @@
 :- use_module(library(ag_modules/propvalue_select)).
 
 % Modules that can be used as matchers
-:- use_module(library(ag_modules/compound_match)).
 :- use_module(library(ag_modules/snowball_match)).
 :- use_module(library(ag_modules/preloaded_mapping)).
 
diff --git a/lib/ag_modules/compound_label_generator.pl b/lib/ag_modules/compound_label_generator.pl
new file mode 100644
index 0000000..69672cf
--- /dev/null
+++ b/lib/ag_modules/compound_label_generator.pl
@@ -0,0 +1,50 @@
+:- module(compound_label_generator,
+	  []).
+
+:- use_module(library(lists)).
+:- use_module(library(semweb/rdf_db)).
+:- use_module(library(amalgame/vocabulary)).
+:- use_module(string_match_util).
+:- use_module(compound_label_match).
+
+:- public amalgame_module/1.
+:- public matcher/4.
+:- public parameter/4.
+
+amalgame_module(amalgame:'CompoundMatcher').
+
+parameter(sourcelabel, oneof(LabelProps), Default,
+	  '(Super)Property to get label of the source by') :-
+	rdf_equal(Default, rdfs:label),
+	label_list(LabelProps).
+parameter(targetlabel, oneof(LabelProps), Default,
+	  '(Super)Property to get the label of the target by') :-
+	rdf_equal(Default, rdfs:label),
+	label_list(LabelProps).
+parameter(source_language, oneof(['any'|L]), 'any', 'Language of source label') :-
+	strategy_languages(_,L).
+parameter(matchacross_lang, boolean, true,
+	  'Allow labels from different language to be matched').
+parameter(matchacross_type, boolean, true,
+	  'Allow labels from different types to be matched').
+parameter(case_sensitive, boolean, false,
+	  'When true the case of labels must be equal').
+parameter(match_qualified_only, boolean, false,
+          'Match only on the fully qualified label').
+
+%%	matcher(+Source, +Target, -Mappings, +Options)
+%
+%	Mappings is a sorted list of matches between instances of Source
+%	and Target.
+
+matcher(Source, Target, Mappings, Options) :-
+	findall(M, align(Source, Target, M, Options), ListOfLists),
+	append(ListOfLists, MappingsUnSorted),
+	sort(MappingsUnSorted, Mappings).
+
+align(Source, TargetScheme, MatchList, Options) :-
+	vocab_member(S, Source),
+	compound_label_match(align(S,_,[]), MatchList,
+			     [target_scheme(TargetScheme)|Options]).
+
+
diff --git a/lib/ag_modules/compound_match.pl b/lib/ag_modules/compound_label_match.pl
similarity index 52%
rename from lib/ag_modules/compound_match.pl
rename to lib/ag_modules/compound_label_match.pl
index a29663b..4406f76 100644
--- a/lib/ag_modules/compound_match.pl
+++ b/lib/ag_modules/compound_label_match.pl
@@ -1,71 +1,17 @@
-:- module(compound_match,
-	  []).
+:- module(compound_label_match,
+	  [ compound_label_match/3 ]).
 
+:- use_module(library(lists)).
+:- use_module(library(option)).
 :- use_module(library(semweb/rdf_db)).
 :- use_module(library(semweb/rdf_litindex)).
 :- use_module(library(amalgame/vocabulary)).
 :- use_module(string_match_util).
 
-:- public amalgame_module/1.
-:- public filter/3.
-:- public matcher/4.
-:- public parameter/4.
-
-amalgame_module(amalgame:'CompoundMatcher').
-
-parameter(sourcelabel, oneof(LabelProps), Default,
-	  '(Super)Property to get label of the source by') :-
-	rdf_equal(Default, rdfs:label),
-	label_list(LabelProps).
-parameter(targetlabel, oneof(LabelProps), Default,
-	  '(Super)Property to get the label of the target by') :-
-	rdf_equal(Default, rdfs:label),
-	label_list(LabelProps).
-parameter(language, oneof(['any'|L]), 'any', 'Language of source label') :-
-	strategy_languages(_,L).
-parameter(matchacross_lang, boolean, true,
-	  'Allow labels from different language to be matched').
-parameter(matchacross_type, boolean, true,
-	  'Allow labels from different types to be matched').
-parameter(case_sensitive, boolean, false,
-	  'When true the case of labels must be equal').
-
-%%	filter(+MappingsIn, -MappingsOut, +Options)
-%
-%	Filter mappings based on exact matching of labels.
-
-filter([], [], _).
-filter([align(S,T,P)|Cs], [C|Mappings], Options) :-
-	(   T = scheme(_)
-	->  match(align(S,_,P), C, Options),
-	    C=align(_,T2,_),
-	    vocab_member(T2, T)
-	;   match(align(S,T,P), C, Options)
-	),
-	!,
-	filter(Cs, Mappings, Options).
-filter([_|Cs], Mappings, Options) :-
-	filter(Cs, Mappings, Options).
-
-
-%%	matcher(+Source, +Target, -Mappings, +Options)
-%
-%	Mappings is a sorted list of matches between instances of Source
-%	and Target.
-
-matcher(Source, Target, Mappings, Options) :-
-	findall(M, align(Source, Target, M, Options), Mappings0),
-	flatten(Mappings0, MappingsFlat),
-	sort(MappingsFlat, Mappings).
-
-align(Source, TargetScheme, Match, Options) :-
-	vocab_member(S, Source),
-	match(align(S,TargetScheme,[]), Match, Options).
-
-match(align(Source, TargetScheme, Prov0), Results, Options) :-
+compound_label_match(align(Source, _Target, Prov0), Results, Options) :-
 	rdf_equal(rdfs:label, RdfsLabel),
 	option(sourcelabel(MatchProp1), Options, RdfsLabel),
-	option(language(Lang), Options, 'any'),
+	option(source_language(Lang), Options, 'any'),
 
 	(   Lang == 'any'
 	->  SourceLang = _
@@ -75,16 +21,18 @@ match(align(Source, TargetScheme, Prov0), Results, Options) :-
 	Prov = [method(compound_label_match),
 		score(Score),
 		match(Match),
-		graph([rdf(Source, SourceProp, literal(lang(SourceLang, SourceLabel)))])
+		graph([rdf(Source, SourceProp,
+			   literal(lang(SourceLang, SourceLabel)))])
 	       ],
 
-	skos_match(Source, MatchProp1, literal(lang(SourceLang, SourceLabel)), SourceProp, Options),
+	skos_match(Source, MatchProp1,
+		   literal(lang(SourceLang, SourceLabel)), SourceProp, Options),
 	rdf_tokenize_literal(SourceLabel, Tokens),
 	findall(Targets-Token-LabelAmbScore,
 		(   member(Token, Tokens),
 		    atom(Token),
 		    match_label(Source, Token, Targets,
-				[scheme(TargetScheme), sourcelang(SourceLang)|Options]),
+				[source_lang(SourceLang)|Options]),
 		    length(Targets, LabelAmbScore)
 
 		),
@@ -118,8 +66,7 @@ match_label(Source, Label, Targets, Options) :-
 	option(matchacross_lang(MatchAcross), Options, true),
 	option(matchacross_type(IgnoreType),  Options, true),
 	option(case_sensitive(CaseSensitive), Options, false),
-	option(sourcelang(SourceLang), Options),
-	option(scheme(TargetScheme), Options),
+	option(source_lang(SourceLang), Options),
 
 	(   CaseSensitive
 	->  SearchTarget=literal(lang(TargetLang, Label))
@@ -133,9 +80,12 @@ match_label(Source, Label, Targets, Options) :-
 	),
 
 	findall(Target-ProvGraph,
-		(   skos_match(Target, MatchProp, SearchTarget, TargetProp, Options),
-		    Source \== Target,  % fix me, replace by target vocab check as in exact_label example
-		    vocab_member(Target, TargetScheme),
+		(   skos_match(Target, MatchProp, SearchTarget,
+			       TargetProp, Options),
+		    (	option(target_scheme(TargetScheme), Options)
+		    ->	vocab_member(Target, TargetScheme)
+		    ;	true
+		    ),
 		    (   IgnoreType
 		    ->  true
 		    ;   matching_types(Source, Target)
@@ -147,5 +97,4 @@ match_label(Source, Label, Targets, Options) :-
 		    ProvGraph = [rdf(Target, TargetProp, TargetTerm)]
 		),
 		Targets),
-	(ground(Targets) -> true; gtrace),
 	Targets \= [].
diff --git a/lib/ag_modules/compound_label_selecter.pl b/lib/ag_modules/compound_label_selecter.pl
new file mode 100644
index 0000000..1294731
--- /dev/null
+++ b/lib/ag_modules/compound_label_selecter.pl
@@ -0,0 +1,40 @@
+:- module(compound_label_selecter,
+	  []).
+
+:- public amalgame_module/1.
+:- public selecter/5.
+:- public parameter/4.
+
+:- use_module(label_selecter).
+:- use_module(compound_label_match).
+:- use_module(string_match_util).
+
+parameter(type,
+	  oneof([source,target, all]), all,
+	 'Select all exact label matches or pick best source/target to disambiguate').
+
+parameter(sourcelabel, oneof(LabelProps), Default,
+	  '(Super)Property to get label of the source by') :-
+	rdf_equal(Default, rdfs:label),
+	label_list(LabelProps).
+parameter(targetlabel, oneof(LabelProps), Default,
+	  '(Super)Property to get the label of the target by') :-
+	rdf_equal(Default, rdfs:label),
+	label_list(LabelProps).
+parameter(source_language, oneof(['any'|L]), 'any',
+	  'Language of source label') :-
+	strategy_languages(_S,L).
+parameter(matchacross_lang, boolean, true,
+	  'Allow labels from different language to be matched').
+parameter(matchacross_type, boolean, true,
+	  'Allow labels from different types to be matched').
+parameter(case_sensitive, boolean, false,
+	  'When true the case of labels must be equal').
+parameter(match_qualified_only, boolean, false,
+	  'Match only on the fully qualified label').
+
+amalgame_module(amalgame:'CompoundLabelSelecter').
+
+selecter(In, Sel, Dis, Und, Options) :-
+	label_selecter(compound_label_match, In, SelList, Dis, Und, Options),
+	append(SelList, Sel). % HACK FIX ME
diff --git a/lib/ag_modules/exact_label_match.pl b/lib/ag_modules/exact_label_match.pl
index 0a466f3..a64a79d 100644
--- a/lib/ag_modules/exact_label_match.pl
+++ b/lib/ag_modules/exact_label_match.pl
@@ -6,6 +6,15 @@
 :- use_module(library(amalgame/vocabulary)).
 :- use_module(string_match_util).
 
+%%	exact_label_match(?In, -Out, +Options) is non det.
+%
+%	In and Out are align(S,T,P) terms.
+%	Typically, there are to modes for calling this predicate:
+%
+%       * Both S and T of In are instantiated, and the predicate is
+%	  called to find a match between S and T, or
+%	* Only S is ground and the predicate is called to find a
+%	matching T.
 exact_label_match(align(Source, Target, Prov0),
 		  align(Source, Target, [Prov|Prov0]), Options) :-
 	rdf_equal(rdfs:label, RdfsLabel),
diff --git a/rdf/tool/ag_modules.ttl b/rdf/tool/ag_modules.ttl
index 5149193..a4c716d 100644
--- a/rdf/tool/ag_modules.ttl
+++ b/rdf/tool/ag_modules.ttl
@@ -12,8 +12,8 @@ amalgame:AncestorMatcher
     rdfs:subClassOf amalgame:CandidateGenerator .
 
 amalgame:CompoundMatcher
-    rdfs:label "generate/label/compound"@en ;
-    skos:definition "A label matcher matching after compound splitting the label(s) of the source concepts"@en ;
+    rdfs:label "generate/label/compound source"@en ;
+    skos:definition "A label matcher matching after compound splitting the label(s) of the source concepts. Source labels are splitted using rdf_tokenize_literal/2. Warning: source label tokens are matched against complete target labels! "@en ;
     rdfs:subClassOf amalgame:CandidateGenerator .
 
 amalgame:DescendentMatcher
@@ -46,6 +46,11 @@ amalgame:AncestorSelecter
     skos:definition "Select mappings with the most mapped ancestors, discard others for the same source/target. If type=all, all correspondences with one or more ancestors are selected."@en ;
     rdfs:subClassOf amalgame:MappingPartitioner .
 
+amalgame:CompoundLabelSelecter
+    rdfs:label "partition/label/compound source"@en ;
+    skos:definition "Select mappings with the most matching labels, discard others for the same source/target. If type=all, all candidates with matching labels are selected.  All matching is done after compound splitting the label(s) of the source concepts. Source labels are splitted using rdf_tokenize_literal/2. Warning: source label tokens are matched against complete target labels!"@en ;
+    rdfs:subClassOf amalgame:MappingPartitioner .
+
 amalgame:DescendentSelecter
     amalgame:need_secondary_inputs true ;
     rdfs:label "partition/structure/descendent"@en ;