amalgame/commit

REFACTORING: snowball generator

authorJacco van Ossenbruggen
Mon Sep 8 09:49:37 2014 +0200
committerJacco van Ossenbruggen
Mon Sep 8 09:49:37 2014 +0200
commit06e9a7ce19c285660e8de0734093a7b5c160e5c9
treef594a3ec0c05e9ffb8f1e50cf616feb3386a32ea
parent15df88d9521ad54868de5e55bde478c3a7dc359e
Diff style: patch stat
diff --git a/config-available/ag_modules.pl b/config-available/ag_modules.pl
index 2e0fb0b..6229570 100644
--- a/config-available/ag_modules.pl
+++ b/config-available/ag_modules.pl
@@ -7,6 +7,7 @@
 :- use_module(library(ag_modules/exact_label_generator)).
 :- use_module(library(ag_modules/isub_generator)).
 :- use_module(library(ag_modules/related_generator)).
+:- use_module(library(ag_modules/snowball_label_generator)).
 
 % Mapping producing partitioners:
 :- use_module(library(ag_modules/ancestor_selecter)).
@@ -23,7 +24,6 @@
 :- use_module(library(ag_modules/propvalue_select)).
 
 % Modules that can be used as matchers
-:- use_module(library(ag_modules/snowball_match)).
 :- use_module(library(ag_modules/preloaded_mapping)).
 
 % Alignment filters/selecters
diff --git a/lib/ag_modules/snowball_label_generator.pl b/lib/ag_modules/snowball_label_generator.pl
new file mode 100644
index 0000000..d842215
--- /dev/null
+++ b/lib/ag_modules/snowball_label_generator.pl
@@ -0,0 +1,48 @@
+:- module(snowball_label_generator,
+	  []).
+
+:- use_module(library(semweb/rdf_db)).
+:- use_module(library(snowball)).
+:- use_module(library(amalgame/vocabulary)).
+:- use_module(string_match_util).
+:- use_module(snowball_label_match).
+
+:- public amalgame_module/1.
+:- public parameter/4.
+:- public matcher/4.
+
+amalgame_module(amalgame:'SnowballMatcher').
+
+parameter(sourcelabel, oneof(LabelProps), Default,
+	  '(Super)Property to get label of the source by') :-
+	rdf_equal(Default, rdfs:label),
+	label_list(LabelProps).
+parameter(targetlabel, oneof(LabelProps), Default,
+	  '(Super)Property to get the label of the target by') :-
+	rdf_equal(Default, rdfs:label),
+	label_list(LabelProps).
+parameter(source_language, oneof(['any'|L]), 'any', 'Language of source label') :-
+	strategy_languages(_,L).
+parameter(matchacross_lang, boolean, true,
+	  'Allow labels from different language to be matched').
+parameter(snowball_language, oneof(Languages), english,
+	  'Language to use for stemmer') :-
+	findall(Alg, snowball_current_algorithm(Alg), Languages).
+parameter(prefix, integer, 4,
+	  'Optimise performence by first generating candidates by matching the prefix.Input is an integer for the prefix length.').
+parameter(edit_distance, integer, 0,
+	  'When >0 allow additional differences between labels').
+
+%%	matcher(+Source, +Target, -Mappings, +Options)
+%
+%	Mappings is a list of matches between instances of Source and
+%	Target.
+
+matcher(Source, Target, Mappings, Options) :-
+	findall(A, align(Source, Target, A, Options), Mappings0),
+	sort(Mappings0, Mappings).
+
+align(Source, Target, Match, Options) :-
+	vocab_member(S, Source),
+	snowball_label_match(align(S,_,[]), Match,
+			     [target_scheme(Target)|Options]).
diff --git a/lib/ag_modules/snowball_match.pl b/lib/ag_modules/snowball_label_match.pl
similarity index 50%
rename from lib/ag_modules/snowball_match.pl
rename to lib/ag_modules/snowball_label_match.pl
index c3cdb4e..7b65998 100644
--- a/lib/ag_modules/snowball_match.pl
+++ b/lib/ag_modules/snowball_label_match.pl
@@ -1,67 +1,16 @@
-:- module(snowball_match,
-	  []).
+:- module(snowball_label_match,
+	  [ snowball_label_match/3 ]).
 
-:- use_module(library(semweb/rdf_db)).
+:- use_module(library(debug)).
+:- use_module(library(option)).
 :- use_module(library(snowball)).
+:- use_module(library(semweb/rdf_db)).
 :- use_module(library(amalgame/lit_distance)).
 :- use_module(library(amalgame/vocabulary)).
 :- use_module(string_match_util).
 
-:- public amalgame_module/1.
-:- public parameter/4.
-:- public filter/3.
-:- public matcher/4.
-
-amalgame_module(amalgame:'SnowballMatcher').
-amalgame_module(amalgame:'SnowballFilter').
-
-parameter(sourcelabel, oneof(LabelProps), Default,
-	  '(Super)Property to get label of the source by') :-
-	rdf_equal(Default, rdfs:label),
-	label_list(LabelProps).
-parameter(targetlabel, oneof(LabelProps), Default,
-	  '(Super)Property to get the label of the target by') :-
-	rdf_equal(Default, rdfs:label),
-	label_list(LabelProps).
-parameter(language, oneof(['any'|L]), 'any', 'Language of source label') :-
-	strategy_languages(_,L).
-parameter(matchacross_lang, boolean, true,
-	  'Allow labels from different language to be matched').
-parameter(snowball_language, oneof(Languages), english,
-	  'Language to use for stemmer') :-
-	findall(Alg, snowball_current_algorithm(Alg), Languages).
-parameter(prefix, integer, 4,
-	  'Optimise performence by first generating candidates by matching the prefix.Input is an integer for the prefix length.').
-parameter(edit_distance, integer, 0,
-	  'When >0 allow additional differences between labels').
-
-
-%%	filter(+MappingsIn, -MappingsOut, +Options)
-%
-%	Filter mappings based on matching stemmed labels.
-
-filter([], [], _).
-filter([C0|Cs], [C|Mappings], Options) :-
-	match(C0, C, Options),
-	!,
-	filter(Cs, Mappings, Options).
-filter([_|Cs], Mappings, Options) :-
-	filter(Cs, Mappings, Options).
-
-%%	matcher(+Source, +Target, -Mappings, +Options)
-%
-%	Mappings is a list of matches between instances of Source and
-%	Target.
-
-matcher(Source, Target, Mappings, Options) :-
-	findall(A, align(Source, Target, A, Options), Mappings0),
-	sort(Mappings0, Mappings).
-
-align(Source, Target, Match, Options) :-
-	vocab_member(S, Source),
-	match(align(S,_,[]), Match, [target_scheme(Target)|Options]).
-
-match(align(Source, Target, Prov0), align(Source, Target, [Prov|Prov0]), Options) :-
+snowball_label_match(align(Source, Target, Prov0),
+		     align(Source, Target, [Prov|Prov0]), Options) :-
 	rdf_equal(rdfs:label,DefaultP),
 	option(snowball_language(Snowball_Language), Options, english),
 	option(prefix(PrefixLength), Options, 4),
diff --git a/rdf/tool/ag_modules.ttl b/rdf/tool/ag_modules.ttl
index a4c716d..513ee84 100644
--- a/rdf/tool/ag_modules.ttl
+++ b/rdf/tool/ag_modules.ttl
@@ -38,6 +38,11 @@ amalgame:RelatedMatcher
     skos:definition "Generate new candidates by looking for 1 or more related concepts that have already been mapped."@en ;
     rdfs:subClassOf amalgame:CandidateGenerator .
 
+amalgame:SnowballMatcher
+    rdfs:label "generate/label/stemmed"@en ;
+    skos:definition "Generate new candidates based on label matching after (snowball) stemming."@en ;
+    rdfs:subClassOf amalgame:CandidateGenerator .
+
 ######## Mapping partitioner classes:
 
 amalgame:AncestorSelecter
@@ -113,11 +118,6 @@ amalgame:SelectPreLoadedSelecter
     skos:definition "Select mappings with corresponding mappings in the preloaded mapping, discard others with the same source/target."@en ;
     rdfs:subClassOf amalgame:MappingPartitioner .
 
-amalgame:SnowballMatcher
-    rdfs:label "string/label (stemmed)"@en ;
-    skos:definition "A label matcher with similarity based on (snowball) stemming."@en ;
-    rdfs:subClassOf amalgame:Matcher .
-
 amalgame:AritySelect
     rdfs:label "ambiguity/remove"@en ;
     skos:definition "Select correspondences with a unique source, target or both, discard others"@en;