vumix/commit

start with topN evaluation

authorMichiel Hildebrand
Mon May 14 09:12:33 2012 +0200
committerMichiel Hildebrand
Mon May 14 09:12:33 2012 +0200
commitfe3a5f0caa4ebcd0fb72b4c5b10c48c62cbdcc9d
treea7618e9d24446c54a8355eb346645b0d843a0596
parentf65eb7a6c664ade799534fa6b4227a5a6892fe67
Diff style: patch stat
diff --git a/applications/vumix_p0.pl b/applications/vumix_p0.pl
index d7c262e..7fb0721 100644
--- a/applications/vumix_p0.pl
+++ b/applications/vumix_p0.pl
@@ -29,6 +29,8 @@
 :- use_module(library(yaz_util)).
 :- use_module(library(stop_words)).
 :- use_module(library(tfidf)).
+:- use_module(library(find_resource)).
+
 
 /***************************************************
 * http handlers
@@ -560,8 +562,8 @@ html_tags([]) --> !.
 html_tags([Score0-Tag|T]) -->
 	{ Score is round(Score0)
 	},
-	html(li([Tag%,
-		% ' (',Score,')'
+	html(li([Tag,
+		 ' (',Score,')'
 		])),
 	html_tags(T).
 
@@ -743,3 +745,211 @@ rel_concept(C1,C2) :-
 rel_concept(C1,C2) :-
 	rdf_reachable(C1,skos:broader,C2).
 rel_concept(C,C).
+
+
+
+
+		 /*******************************
+		 *	    experiment		*
+		 *******************************/
+
+
+baseline_video(V) :-
+	rdf(V, rdf:type, pprime:'Baseline'),
+	once(rdf(V, dc:subject, _)).
+
+derived_concepts(Tags, Goal, Concepts) :-
+	findall(C, (member(_-Tag, Tags),
+		    call(Goal, Tag, C)
+		   ),
+		Concepts0),
+	sort(Concepts0, Concepts).
+
+derived_ranked_concepts(Tags, Goal, Ranked) :-
+	findall(C-Tag, (member(_-Tag, Tags),
+			call(Goal, Tag, C)
+		       ),
+		Pairs0),
+	keysort(Pairs0, Pairs),
+	group_pairs_by_key(Pairs, Groups),
+	pairs_sort_by_value_count(Groups, Ranked).
+
+tag_concept(exact, Tag, Concept) :-
+	rdf_has(Concept, rdfs:label, literal(exact(Tag),_)),
+	rdf(Concept, skos:inScheme, gtaa:'GTAA').
+
+tag_concept(stem, Tag, Concept) :-
+	snowball(dutch, Tag, TagStem),
+	rdf_has(Concept,rdfs:label, literal(prefix(TagStem), Lit)),
+	rdf(Concept, skos:inScheme, gtaa:'GTAA'),
+	literal_text(Lit, Label),
+	isub(TagStem, Label, true, Sim),
+	Sim > 0.8.
+
+tag_concept(sub, Tag, Concept) :-
+	length(Results, 3),
+	snowball(dutch, Tag, TagStem),
+	find_resource_by_name(TagStem, Hits, [match(prefix),distance(true)]),
+	length(Hits, N),
+	(   N =< 3
+	->  Results = Hits
+	;   append(Results, _, Hits)
+	),
+	member(hit(_D,Concept,_,_), Hits),
+	rdf(Concept,skos:inScheme, gtaa:'GTAA').
+
+
+
+tag_related_concept(StringMatch, Tag, Concept) :-
+	tag_concept(StringMatch, Tag, C1),
+	(   Concept = C1
+	;   related(C1, Concept)
+	).
+
+tag_tree_concept(StringMatch, Tag, Concept) :-
+	tag_concept(StringMatch, Tag, C1),
+	(   Concept = C1
+	;   tree(C1, Concept)
+	).
+
+tag_tree_and_related_concept(StringMatch, Tag, Concept) :-
+	tag_concept(StringMatch, Tag, C1),
+	(   C1 = Concept
+	;   related(C1, Concept)
+	;   tree(C1, Concept)
+	).
+
+tag_tree_related_concept(StringMatch, Tag, Concept) :-
+	tag_concept(StringMatch, Tag, C1),
+	(   C1 = Concept
+	;   related_tree(C1, Concept)
+	;   tree_related(C1, Concept)
+	).
+
+tag_tree_related_sibbling_concept(StringMatch, Tag, Concept) :-
+	tag_concept(StringMatch, Tag, C1),
+	(   C1 = Concept
+	;   related_tree(C1, Concept)
+	;   tree_related(C1, Concept)
+	;   sibbling(C1, Concept)
+	).
+
+related_tree(C, Concept) :-
+	related(C, C1),
+	(   C1 = Concept
+	;   tree(C1, Concept)
+	).
+
+tree_related(C, Concept) :-
+	tree(C, C1),
+	(   C1 = Concept
+	;   related(C1, Concept)
+	).
+
+tree(C1, C2) :-
+	(   rdf_reachable(C1, skos:broader, C2)
+	;   rdf_reachable(C2, skos:broader, C1)
+	).
+
+sibbling(C1, C2) :-
+	rdf(C1, skos:broader, P),
+	rdf(C2, skos:broader, P).
+
+related(C1, C2) :-
+	rdf(C1, skos:related, C2).
+
+baseline_eval([], _,  _, []).
+baseline_eval([C|Cs], Video, Scheme, [C|Rest]) :-
+	rdf(Video, dc:subject, C),
+	rdf(C, skos:inScheme, Scheme),
+	!,
+	baseline_eval(Cs, Video, Scheme, Rest).
+baseline_eval([_|Cs], Video, Scheme, Rest) :-
+	baseline_eval(Cs, Video, Scheme, Rest).
+
+eval_table(Type) :-
+	scheme_alias(Type, Scheme),
+	(   baseline_video(V),
+	    rdf(V, dc:id, literal(Id)),
+	    findall(1-T, video_tag(V, literal(T)), Tags),
+	    remove_stop_words(Tags, dutch, Tags1),
+	    findall(C, (rdf(V, dc:subject, C),
+			rdf(C, skos:inScheme, Scheme)
+		       ), Concepts),
+	    length(Tags, TagCount),
+	    length(Concepts, ConceptCount),
+	    format('~w, ~w\t ~w,   ', [Id, TagCount, ConceptCount]),
+	    concept_eval(V, Tags1, Scheme, tag_concept(exact)),
+	    concept_eval(V, Tags1, Scheme, tag_related_concept(exact)),
+	    concept_eval(V, Tags1, Scheme, tag_tree_concept(exact)),
+	    concept_eval(V, Tags1, Scheme, tag_tree_related_concept(exact)),
+	    concept_eval(V, Tags1, Scheme, tag_tree_and_related_concept(exact)),
+	    concept_eval(V, Tags1, Scheme, tag_tree_related_sibbling_concept(exact)),
+	    format('  '),
+	    concept_eval(V, Tags1, Scheme, tag_concept(stem)),
+	    concept_eval(V, Tags1, Scheme, tag_related_concept(stem)),
+	    concept_eval(V, Tags1, Scheme, tag_tree_concept(stem)),
+	    concept_eval(V, Tags1, Scheme, tag_tree_and_related_concept(stem)),
+	    concept_eval(V, Tags1, Scheme, tag_tree_related_concept(stem)),
+	    concept_eval(V, Tags1, Scheme, tag_tree_related_sibbling_concept(stem)),
+	    %format('  '),
+	    %concept_eval(V, tag_concept(sub)),
+	    %concept_eval(V, tag_related_concept(sub)),
+	    %concept_eval(V, tag_tree_concept(sub)),
+	    %concept_eval(V, tag_tree_related_concept(sub)),
+	    format('~n'),
+	    fail
+	;   true
+	).
+
+topN_eval(Id, N, Type, Goal, TopN) :-
+	scheme_alias(Type, Scheme),
+	rdf(V, dc:id, literal(Id)),
+	ranked_concepts(Id, Goal, Ranked),
+	length(Ranked, Count),
+	(   Count < N
+	->  Results0 = Ranked
+	;   length(Results0, N),
+	    append(Results0, _, Ranked)
+	),
+	pairs_values(Results0, Results),
+	baseline_eval(Results, V, Scheme, InTopN),
+	length(InTopN, TopN).
+
+ranked_concepts(Id, Goal, Derived) :-
+	rdf(V, dc:id, literal(Id)),
+	findall(1-T, video_tag(V, literal(T)), Tags),
+	remove_stop_words(Tags, dutch, Tags1),
+	derived_ranked_concepts(Tags1, Goal, Derived).
+
+scheme_alias(all, Scheme) :-
+	rdf_equal(Scheme, gtaa:'GTAA').
+scheme_alias(persons, Scheme) :-
+	rdf_equal(Scheme, gtaa:'Peroonsnamen').
+scheme_alias(places, Scheme) :-
+	rdf_equal(Scheme, gtaa:'GeografischeNamen').
+scheme_alias(subjects, Scheme) :-
+	rdf_equal(Scheme, gtaa:'OnderwerpenBenG').
+scheme_alias(names, Scheme) :-
+	rdf_equal(Scheme, gtaa:'Namen').
+
+
+concept_eval(V, Tags, Scheme, Goal) :-
+	    derived_concepts(Tags, Goal, Derived),
+	    baseline_eval(Derived,V,Scheme,Eval),
+	    length(Derived,DerivedCount),
+	    length(Eval,EvalCount),
+	    %format('~w, ', [EvalCount]).
+	    format('~w, ~w, ', [EvalCount, DerivedCount]).
+
+
+
+%extend_gtaa_hierarchy :-
+
+gtaa_wn_hierarchy(GTAA, GTAA_Parent) :-
+	rdf(GTAA, skos:exactMatch, WN),
+	rdf_reachable(WN, skos:broader, WN_Parent),
+	WN_Parent \== WN,
+	rdf(GTAA_Parent, skos:exactMatch, WN_Parent).
+	%rdf_assert(GTAA, skos:broader, GTAA_Parent, gtaa_wordnet_broader).
+
diff --git a/lib/beng_xml.pl b/lib/beng_xml.pl
index 1c98268..fd12965 100644
--- a/lib/beng_xml.pl
+++ b/lib/beng_xml.pl
@@ -8,11 +8,13 @@
 :- use_module(library(xpath)).
 :- use_module(library(csv)).
 :- use_module(library(yaz_util)).
-
+:- use_module(library(isub)).
 
 :- dynamic
 	fragment/2.
 
+%:- import_fragments.
+
 user:file_search_path(beng, '/Users/michiel/Projects/pprime/data/werk_10629/10629').
 user:file_search_path(mbh, '/Users/michiel/Projects/pprime/data/manbijthond').
 
@@ -21,10 +23,102 @@ reeks('357006').
 reeks('357007').
 
 
-assert_annotations([]).
-assert_annotations([FragmentId-Selection|T]) :-
-	load_xml_file(XMLFile,XMLTree),
-	xpath(XMLTree, //publicaties//begindatum, element(begindatum,_,[Date])).
+assert_fragment_annotations(As) :-
+	rdf_equal(gtaa:'OnderwerpenBenG', Onderwerpen),
+	rdf_equal(gtaa:'Persoonsnamen', Persons),
+	rdf_equal(gtaa:'GeografischeNamen', Places),
+	rdf_equal(gtaa:'Namen', Names),
+	rdf_transaction((member(A,As),
+			 A = a(F,_Selection,T,P,PL,N),
+			 atom_number(FA, F),
+			 rdf(Video, dc:id, literal(FA)),
+			 debug(frag_assert, '~n~nassert fragments for: ~w', [Video]),
+			 rdf_assert(Video, rdf:type, pprime:'Baseline', baseline),
+			 assert_terms(T, Video, Onderwerpen),
+			 assert_terms(P, Video, Persons),
+			 assert_terms(PL, Video, Places),
+			 assert_terms(N, Video, Names),
+			 fail
+			;true
+			)).
+
+assert_terms([], _, _).
+assert_terms([T|Ts], Video, Scheme) :-
+	findall(C, term_concept_match(T, Scheme, C), Concepts),
+	length(Concepts, N),
+	(   Concepts = [Concept]
+	->  rdf_assert(Video, dc:subject, Concept, baseline)
+	;   debug(frag_assert, '~w: ~w results', [T, N])
+	),
+	assert_terms(Ts, Video, Scheme).
+
+term_concept_match(Term, Scheme, Concept) :-
+	rdf(Concept, skos:prefLabel, literal(Term)),
+	rdf(Concept, skos:inScheme, Scheme).
+
+tag_overlap_stats([]).
+tag_overlap_stats([A|As]) :-
+	tag_overlap(A, O),
+	A = a(F,_Selection,T,P,PL,N),
+	O = a(_,_,OT,OP,OPL,ON),
+	append([T,P,PL,N], Terms),
+	append([OT,OP,OPL,ON], Matches),
+	length(Terms, TLength),
+	length(Matches, MLength),
+	format('~w\t ~w\/~w~n', [F, MLength, TLength]),
+	tag_overlap_stats(As).
+
+tag_overlaps([], []).
+tag_overlaps([A|As], [O|Os]) :-
+	tag_overlap(A, O),
+	tag_overlaps(As, Os).
+
+tag_overlap(A, O) :-
+	A = a(F, Selection, Terms, Persons, Places, Names),
+	O = a(F, Selection, TermTags, PersonTags, PlaceTags, NameTags),
+	atom_number(FragmentId, F),
+	rdf(Video, dc:id, literal(FragmentId)),
+	tag_match(Terms, Video, TermTags),
+	tag_match(Persons, Video, PersonTags),
+	tag_match(Places, Video, PlaceTags),
+	tag_match(Names, Video, NameTags).
+
+tag_match([], _, []).
+tag_match([Term|Ts], Video, [Term-Tag|Rest]) :-
+	concept_tag(Term, Video, Tag),
+	!,
+	tag_match(Ts, Video, Rest).
+tag_match([_|Ts], Video, Rest) :-
+	tag_match(Ts, Video, Rest).
+
+
+concept_tag(Term, Video, Tag) :-
+	snowball(dutch, Term, TermStem),
+	rdf(Video, pprime:hasAnnotation, E),
+	rdf(E, rdf:value, literal(Tag)),
+	snowball(dutch, Tag, TagStem),
+	TagStem = TermStem.
+	%isub(TermStem, TagStem, true, Sim),
+	%Sim > 0.9,
+	%rdf(E, rdf:value, literal(prefix(Stem), _)),
+
+
+segment_terms([], []).
+segment_terms([FragmentId-SelectionFile|Ps], [A|As]) :-
+	A = a(FragmentId, SelectionFile, Terms, Persons, Places, Names),
+	load_xml_file(SelectionFile, SelectionXML),
+	findall(T, selection_term(SelectionXML, //opnameplaatsen/opnameplaats, T), Places0),
+	findall(T, selection_term(SelectionXML, //trefwoorden/trefwoord, T), Terms),
+	findall(T, selection_term(SelectionXML, //persoonsnamen/persoonsnaam, T), Persons0),
+	findall(T, selection_term(SelectionXML, //spreker/naam, T), Persons1),
+	findall(T, selection_term(SelectionXML, //geografische_namen/geografische_naam, T), Places1),
+	findall(T, selection_term(SelectionXML, //namen/naam, T), Names),
+	append(Places0, Places1, Places),
+	append(Persons0, Persons1, Persons),
+	segment_terms(Ps, As).
+
+selection_term(XML, Path, Term) :-
+	xpath(XML, Path, element(_,_,[Term])).
 
 
 %%	overlap(-Pairs)
@@ -73,7 +167,7 @@ filter_waisda_tags([_|T], Rest) :-
 	filter_waisda_tags(T, Rest).
 
 
-%%	link_beng_waisda(+Expressies, -List(fragment-selection))
+%%	link_beng_waisda(+Expressies, -Pairs)
 %
 %	 we assume that sorting on the ids of the fragments and the
 %	 filenames of Selections provides the right order. Thus, the
@@ -117,14 +211,14 @@ beng_selections(Reeks, Selections) :-
 	expressie(Expressies, ReeksPath, Selections).
 
 expressie([], _, []).
-expressie([Dir,File|T], Reeks, [expressie(Dir,Date,Selections)|Rest]) :-
+expressie([Dir,File|T], Reeks, [expressie(AbsDir,Date,Selections)|Rest]) :-
 	atom_concat(Reeks, File, AbsFile),
 	atom_concat(Reeks, Dir, AbsDir),
 	absolute_file_name(beng(AbsFile), ExpressieXML),
 	absolute_file_name(beng(AbsDir), SelectionDir),
 	beng_date(ExpressieXML, Date),
 	directory_files(SelectionDir, [_,_|Selections0]),
-	remove_non_xml(Selections0, Selections),
+	selection_xml_files(Selections0, SelectionDir, Selections),
 	length(Selections, SelectionCount),
 	debug(beng_xml, 'expressie: ~w ~w ~w', [Date,File,SelectionCount]),
 	Selections \== [],
@@ -133,13 +227,14 @@ expressie([Dir,File|T], Reeks, [expressie(Dir,Date,Selections)|Rest]) :-
 expressie([_Dir,_File|T], Reeks, Rest) :-
 	expressie(T, Reeks, Rest).
 
-remove_non_xml([], []).
-remove_non_xml([H|T], [H|Rest]) :-
+selection_xml_files([], _, []).
+selection_xml_files([H|T], Dir, [File|Rest]) :-
 	atom_concat(_, '.xml', H),
 	!,
-	remove_non_xml(T, Rest).
-remove_non_xml([_|T], Rest) :-
-	remove_non_xml(T, Rest).
+	concat_atom([Dir, '/', H], File),
+	selection_xml_files(T, Dir, Rest).
+selection_xml_files([_|T], Dir, Rest) :-
+	selection_xml_files(T, Dir, Rest).
 
 
 %%	beng_date(+XMLFile, -Date)