amalgame/commit
REFACTORING: snowball generator
author | Jacco van Ossenbruggen |
---|---|
Mon Sep 8 09:49:37 2014 +0200 | |
committer | Jacco van Ossenbruggen |
Mon Sep 8 09:49:37 2014 +0200 | |
commit | 06e9a7ce19c285660e8de0734093a7b5c160e5c9 |
tree | f594a3ec0c05e9ffb8f1e50cf616feb3386a32ea |
parent | 15df88d9521ad54868de5e55bde478c3a7dc359e |
Diff style: patch stat
diff --git a/config-available/ag_modules.pl b/config-available/ag_modules.pl index 2e0fb0b..6229570 100644 --- a/config-available/ag_modules.pl +++ b/config-available/ag_modules.pl @@ -7,6 +7,7 @@ :- use_module(library(ag_modules/exact_label_generator)). :- use_module(library(ag_modules/isub_generator)). :- use_module(library(ag_modules/related_generator)). +:- use_module(library(ag_modules/snowball_label_generator)). % Mapping producing partitioners: :- use_module(library(ag_modules/ancestor_selecter)). @@ -23,7 +24,6 @@ :- use_module(library(ag_modules/propvalue_select)). % Modules that can be used as matchers -:- use_module(library(ag_modules/snowball_match)). :- use_module(library(ag_modules/preloaded_mapping)). % Alignment filters/selecters diff --git a/lib/ag_modules/snowball_label_generator.pl b/lib/ag_modules/snowball_label_generator.pl new file mode 100644 index 0000000..d842215 --- /dev/null +++ b/lib/ag_modules/snowball_label_generator.pl @@ -0,0 +1,48 @@ +:- module(snowball_label_generator, + []). + +:- use_module(library(semweb/rdf_db)). +:- use_module(library(snowball)). +:- use_module(library(amalgame/vocabulary)). +:- use_module(string_match_util). +:- use_module(snowball_label_match). + +:- public amalgame_module/1. +:- public parameter/4. +:- public matcher/4. + +amalgame_module(amalgame:'SnowballMatcher'). + +parameter(sourcelabel, oneof(LabelProps), Default, + '(Super)Property to get label of the source by') :- + rdf_equal(Default, rdfs:label), + label_list(LabelProps). +parameter(targetlabel, oneof(LabelProps), Default, + '(Super)Property to get the label of the target by') :- + rdf_equal(Default, rdfs:label), + label_list(LabelProps). +parameter(source_language, oneof(['any'|L]), 'any', 'Language of source label') :- + strategy_languages(_,L). +parameter(matchacross_lang, boolean, true, + 'Allow labels from different language to be matched'). +parameter(snowball_language, oneof(Languages), english, + 'Language to use for stemmer') :- + findall(Alg, snowball_current_algorithm(Alg), Languages). +parameter(prefix, integer, 4, + 'Optimise performence by first generating candidates by matching the prefix.Input is an integer for the prefix length.'). +parameter(edit_distance, integer, 0, + 'When >0 allow additional differences between labels'). + +%% matcher(+Source, +Target, -Mappings, +Options) +% +% Mappings is a list of matches between instances of Source and +% Target. + +matcher(Source, Target, Mappings, Options) :- + findall(A, align(Source, Target, A, Options), Mappings0), + sort(Mappings0, Mappings). + +align(Source, Target, Match, Options) :- + vocab_member(S, Source), + snowball_label_match(align(S,_,[]), Match, + [target_scheme(Target)|Options]). diff --git a/lib/ag_modules/snowball_match.pl b/lib/ag_modules/snowball_label_match.pl similarity index 50% rename from lib/ag_modules/snowball_match.pl rename to lib/ag_modules/snowball_label_match.pl index c3cdb4e..7b65998 100644 --- a/lib/ag_modules/snowball_match.pl +++ b/lib/ag_modules/snowball_label_match.pl @@ -1,67 +1,16 @@ -:- module(snowball_match, - []). +:- module(snowball_label_match, + [ snowball_label_match/3 ]). -:- use_module(library(semweb/rdf_db)). +:- use_module(library(debug)). +:- use_module(library(option)). :- use_module(library(snowball)). +:- use_module(library(semweb/rdf_db)). :- use_module(library(amalgame/lit_distance)). :- use_module(library(amalgame/vocabulary)). :- use_module(string_match_util). -:- public amalgame_module/1. -:- public parameter/4. -:- public filter/3. -:- public matcher/4. - -amalgame_module(amalgame:'SnowballMatcher'). -amalgame_module(amalgame:'SnowballFilter'). - -parameter(sourcelabel, oneof(LabelProps), Default, - '(Super)Property to get label of the source by') :- - rdf_equal(Default, rdfs:label), - label_list(LabelProps). -parameter(targetlabel, oneof(LabelProps), Default, - '(Super)Property to get the label of the target by') :- - rdf_equal(Default, rdfs:label), - label_list(LabelProps). -parameter(language, oneof(['any'|L]), 'any', 'Language of source label') :- - strategy_languages(_,L). -parameter(matchacross_lang, boolean, true, - 'Allow labels from different language to be matched'). -parameter(snowball_language, oneof(Languages), english, - 'Language to use for stemmer') :- - findall(Alg, snowball_current_algorithm(Alg), Languages). -parameter(prefix, integer, 4, - 'Optimise performence by first generating candidates by matching the prefix.Input is an integer for the prefix length.'). -parameter(edit_distance, integer, 0, - 'When >0 allow additional differences between labels'). - - -%% filter(+MappingsIn, -MappingsOut, +Options) -% -% Filter mappings based on matching stemmed labels. - -filter([], [], _). -filter([C0|Cs], [C|Mappings], Options) :- - match(C0, C, Options), - !, - filter(Cs, Mappings, Options). -filter([_|Cs], Mappings, Options) :- - filter(Cs, Mappings, Options). - -%% matcher(+Source, +Target, -Mappings, +Options) -% -% Mappings is a list of matches between instances of Source and -% Target. - -matcher(Source, Target, Mappings, Options) :- - findall(A, align(Source, Target, A, Options), Mappings0), - sort(Mappings0, Mappings). - -align(Source, Target, Match, Options) :- - vocab_member(S, Source), - match(align(S,_,[]), Match, [target_scheme(Target)|Options]). - -match(align(Source, Target, Prov0), align(Source, Target, [Prov|Prov0]), Options) :- +snowball_label_match(align(Source, Target, Prov0), + align(Source, Target, [Prov|Prov0]), Options) :- rdf_equal(rdfs:label,DefaultP), option(snowball_language(Snowball_Language), Options, english), option(prefix(PrefixLength), Options, 4), diff --git a/rdf/tool/ag_modules.ttl b/rdf/tool/ag_modules.ttl index a4c716d..513ee84 100644 --- a/rdf/tool/ag_modules.ttl +++ b/rdf/tool/ag_modules.ttl @@ -38,6 +38,11 @@ amalgame:RelatedMatcher skos:definition "Generate new candidates by looking for 1 or more related concepts that have already been mapped."@en ; rdfs:subClassOf amalgame:CandidateGenerator . +amalgame:SnowballMatcher + rdfs:label "generate/label/stemmed"@en ; + skos:definition "Generate new candidates based on label matching after (snowball) stemming."@en ; + rdfs:subClassOf amalgame:CandidateGenerator . + ######## Mapping partitioner classes: amalgame:AncestorSelecter @@ -113,11 +118,6 @@ amalgame:SelectPreLoadedSelecter skos:definition "Select mappings with corresponding mappings in the preloaded mapping, discard others with the same source/target."@en ; rdfs:subClassOf amalgame:MappingPartitioner . -amalgame:SnowballMatcher - rdfs:label "string/label (stemmed)"@en ; - skos:definition "A label matcher with similarity based on (snowball) stemming."@en ; - rdfs:subClassOf amalgame:Matcher . - amalgame:AritySelect rdfs:label "ambiguity/remove"@en ; skos:definition "Select correspondences with a unique source, target or both, discard others"@en;