vumix/commit
start with topN evaluation
author | Michiel Hildebrand |
---|---|
Mon May 14 09:12:33 2012 +0200 | |
committer | Michiel Hildebrand |
Mon May 14 09:12:33 2012 +0200 | |
commit | fe3a5f0caa4ebcd0fb72b4c5b10c48c62cbdcc9d |
tree | a7618e9d24446c54a8355eb346645b0d843a0596 |
parent | f65eb7a6c664ade799534fa6b4227a5a6892fe67 |
Diff style: patch stat
diff --git a/applications/vumix_p0.pl b/applications/vumix_p0.pl index d7c262e..7fb0721 100644 --- a/applications/vumix_p0.pl +++ b/applications/vumix_p0.pl @@ -29,6 +29,8 @@ :- use_module(library(yaz_util)). :- use_module(library(stop_words)). :- use_module(library(tfidf)). +:- use_module(library(find_resource)). + /*************************************************** * http handlers @@ -560,8 +562,8 @@ html_tags([]) --> !. html_tags([Score0-Tag|T]) --> { Score is round(Score0) }, - html(li([Tag%, - % ' (',Score,')' + html(li([Tag, + ' (',Score,')' ])), html_tags(T). @@ -743,3 +745,211 @@ rel_concept(C1,C2) :- rel_concept(C1,C2) :- rdf_reachable(C1,skos:broader,C2). rel_concept(C,C). + + + + + /******************************* + * experiment * + *******************************/ + + +baseline_video(V) :- + rdf(V, rdf:type, pprime:'Baseline'), + once(rdf(V, dc:subject, _)). + +derived_concepts(Tags, Goal, Concepts) :- + findall(C, (member(_-Tag, Tags), + call(Goal, Tag, C) + ), + Concepts0), + sort(Concepts0, Concepts). + +derived_ranked_concepts(Tags, Goal, Ranked) :- + findall(C-Tag, (member(_-Tag, Tags), + call(Goal, Tag, C) + ), + Pairs0), + keysort(Pairs0, Pairs), + group_pairs_by_key(Pairs, Groups), + pairs_sort_by_value_count(Groups, Ranked). + +tag_concept(exact, Tag, Concept) :- + rdf_has(Concept, rdfs:label, literal(exact(Tag),_)), + rdf(Concept, skos:inScheme, gtaa:'GTAA'). + +tag_concept(stem, Tag, Concept) :- + snowball(dutch, Tag, TagStem), + rdf_has(Concept,rdfs:label, literal(prefix(TagStem), Lit)), + rdf(Concept, skos:inScheme, gtaa:'GTAA'), + literal_text(Lit, Label), + isub(TagStem, Label, true, Sim), + Sim > 0.8. + +tag_concept(sub, Tag, Concept) :- + length(Results, 3), + snowball(dutch, Tag, TagStem), + find_resource_by_name(TagStem, Hits, [match(prefix),distance(true)]), + length(Hits, N), + ( N =< 3 + -> Results = Hits + ; append(Results, _, Hits) + ), + member(hit(_D,Concept,_,_), Hits), + rdf(Concept,skos:inScheme, gtaa:'GTAA'). + + + +tag_related_concept(StringMatch, Tag, Concept) :- + tag_concept(StringMatch, Tag, C1), + ( Concept = C1 + ; related(C1, Concept) + ). + +tag_tree_concept(StringMatch, Tag, Concept) :- + tag_concept(StringMatch, Tag, C1), + ( Concept = C1 + ; tree(C1, Concept) + ). + +tag_tree_and_related_concept(StringMatch, Tag, Concept) :- + tag_concept(StringMatch, Tag, C1), + ( C1 = Concept + ; related(C1, Concept) + ; tree(C1, Concept) + ). + +tag_tree_related_concept(StringMatch, Tag, Concept) :- + tag_concept(StringMatch, Tag, C1), + ( C1 = Concept + ; related_tree(C1, Concept) + ; tree_related(C1, Concept) + ). + +tag_tree_related_sibbling_concept(StringMatch, Tag, Concept) :- + tag_concept(StringMatch, Tag, C1), + ( C1 = Concept + ; related_tree(C1, Concept) + ; tree_related(C1, Concept) + ; sibbling(C1, Concept) + ). + +related_tree(C, Concept) :- + related(C, C1), + ( C1 = Concept + ; tree(C1, Concept) + ). + +tree_related(C, Concept) :- + tree(C, C1), + ( C1 = Concept + ; related(C1, Concept) + ). + +tree(C1, C2) :- + ( rdf_reachable(C1, skos:broader, C2) + ; rdf_reachable(C2, skos:broader, C1) + ). + +sibbling(C1, C2) :- + rdf(C1, skos:broader, P), + rdf(C2, skos:broader, P). + +related(C1, C2) :- + rdf(C1, skos:related, C2). + +baseline_eval([], _, _, []). +baseline_eval([C|Cs], Video, Scheme, [C|Rest]) :- + rdf(Video, dc:subject, C), + rdf(C, skos:inScheme, Scheme), + !, + baseline_eval(Cs, Video, Scheme, Rest). +baseline_eval([_|Cs], Video, Scheme, Rest) :- + baseline_eval(Cs, Video, Scheme, Rest). + +eval_table(Type) :- + scheme_alias(Type, Scheme), + ( baseline_video(V), + rdf(V, dc:id, literal(Id)), + findall(1-T, video_tag(V, literal(T)), Tags), + remove_stop_words(Tags, dutch, Tags1), + findall(C, (rdf(V, dc:subject, C), + rdf(C, skos:inScheme, Scheme) + ), Concepts), + length(Tags, TagCount), + length(Concepts, ConceptCount), + format('~w, ~w\t ~w, ', [Id, TagCount, ConceptCount]), + concept_eval(V, Tags1, Scheme, tag_concept(exact)), + concept_eval(V, Tags1, Scheme, tag_related_concept(exact)), + concept_eval(V, Tags1, Scheme, tag_tree_concept(exact)), + concept_eval(V, Tags1, Scheme, tag_tree_related_concept(exact)), + concept_eval(V, Tags1, Scheme, tag_tree_and_related_concept(exact)), + concept_eval(V, Tags1, Scheme, tag_tree_related_sibbling_concept(exact)), + format(' '), + concept_eval(V, Tags1, Scheme, tag_concept(stem)), + concept_eval(V, Tags1, Scheme, tag_related_concept(stem)), + concept_eval(V, Tags1, Scheme, tag_tree_concept(stem)), + concept_eval(V, Tags1, Scheme, tag_tree_and_related_concept(stem)), + concept_eval(V, Tags1, Scheme, tag_tree_related_concept(stem)), + concept_eval(V, Tags1, Scheme, tag_tree_related_sibbling_concept(stem)), + %format(' '), + %concept_eval(V, tag_concept(sub)), + %concept_eval(V, tag_related_concept(sub)), + %concept_eval(V, tag_tree_concept(sub)), + %concept_eval(V, tag_tree_related_concept(sub)), + format('~n'), + fail + ; true + ). + +topN_eval(Id, N, Type, Goal, TopN) :- + scheme_alias(Type, Scheme), + rdf(V, dc:id, literal(Id)), + ranked_concepts(Id, Goal, Ranked), + length(Ranked, Count), + ( Count < N + -> Results0 = Ranked + ; length(Results0, N), + append(Results0, _, Ranked) + ), + pairs_values(Results0, Results), + baseline_eval(Results, V, Scheme, InTopN), + length(InTopN, TopN). + +ranked_concepts(Id, Goal, Derived) :- + rdf(V, dc:id, literal(Id)), + findall(1-T, video_tag(V, literal(T)), Tags), + remove_stop_words(Tags, dutch, Tags1), + derived_ranked_concepts(Tags1, Goal, Derived). + +scheme_alias(all, Scheme) :- + rdf_equal(Scheme, gtaa:'GTAA'). +scheme_alias(persons, Scheme) :- + rdf_equal(Scheme, gtaa:'Peroonsnamen'). +scheme_alias(places, Scheme) :- + rdf_equal(Scheme, gtaa:'GeografischeNamen'). +scheme_alias(subjects, Scheme) :- + rdf_equal(Scheme, gtaa:'OnderwerpenBenG'). +scheme_alias(names, Scheme) :- + rdf_equal(Scheme, gtaa:'Namen'). + + +concept_eval(V, Tags, Scheme, Goal) :- + derived_concepts(Tags, Goal, Derived), + baseline_eval(Derived,V,Scheme,Eval), + length(Derived,DerivedCount), + length(Eval,EvalCount), + %format('~w, ', [EvalCount]). + format('~w, ~w, ', [EvalCount, DerivedCount]). + + + +%extend_gtaa_hierarchy :- + +gtaa_wn_hierarchy(GTAA, GTAA_Parent) :- + rdf(GTAA, skos:exactMatch, WN), + rdf_reachable(WN, skos:broader, WN_Parent), + WN_Parent \== WN, + rdf(GTAA_Parent, skos:exactMatch, WN_Parent). + %rdf_assert(GTAA, skos:broader, GTAA_Parent, gtaa_wordnet_broader). + diff --git a/lib/beng_xml.pl b/lib/beng_xml.pl index 1c98268..fd12965 100644 --- a/lib/beng_xml.pl +++ b/lib/beng_xml.pl @@ -8,11 +8,13 @@ :- use_module(library(xpath)). :- use_module(library(csv)). :- use_module(library(yaz_util)). - +:- use_module(library(isub)). :- dynamic fragment/2. +%:- import_fragments. + user:file_search_path(beng, '/Users/michiel/Projects/pprime/data/werk_10629/10629'). user:file_search_path(mbh, '/Users/michiel/Projects/pprime/data/manbijthond'). @@ -21,10 +23,102 @@ reeks('357006'). reeks('357007'). -assert_annotations([]). -assert_annotations([FragmentId-Selection|T]) :- - load_xml_file(XMLFile,XMLTree), - xpath(XMLTree, //publicaties//begindatum, element(begindatum,_,[Date])). +assert_fragment_annotations(As) :- + rdf_equal(gtaa:'OnderwerpenBenG', Onderwerpen), + rdf_equal(gtaa:'Persoonsnamen', Persons), + rdf_equal(gtaa:'GeografischeNamen', Places), + rdf_equal(gtaa:'Namen', Names), + rdf_transaction((member(A,As), + A = a(F,_Selection,T,P,PL,N), + atom_number(FA, F), + rdf(Video, dc:id, literal(FA)), + debug(frag_assert, '~n~nassert fragments for: ~w', [Video]), + rdf_assert(Video, rdf:type, pprime:'Baseline', baseline), + assert_terms(T, Video, Onderwerpen), + assert_terms(P, Video, Persons), + assert_terms(PL, Video, Places), + assert_terms(N, Video, Names), + fail + ;true + )). + +assert_terms([], _, _). +assert_terms([T|Ts], Video, Scheme) :- + findall(C, term_concept_match(T, Scheme, C), Concepts), + length(Concepts, N), + ( Concepts = [Concept] + -> rdf_assert(Video, dc:subject, Concept, baseline) + ; debug(frag_assert, '~w: ~w results', [T, N]) + ), + assert_terms(Ts, Video, Scheme). + +term_concept_match(Term, Scheme, Concept) :- + rdf(Concept, skos:prefLabel, literal(Term)), + rdf(Concept, skos:inScheme, Scheme). + +tag_overlap_stats([]). +tag_overlap_stats([A|As]) :- + tag_overlap(A, O), + A = a(F,_Selection,T,P,PL,N), + O = a(_,_,OT,OP,OPL,ON), + append([T,P,PL,N], Terms), + append([OT,OP,OPL,ON], Matches), + length(Terms, TLength), + length(Matches, MLength), + format('~w\t ~w\/~w~n', [F, MLength, TLength]), + tag_overlap_stats(As). + +tag_overlaps([], []). +tag_overlaps([A|As], [O|Os]) :- + tag_overlap(A, O), + tag_overlaps(As, Os). + +tag_overlap(A, O) :- + A = a(F, Selection, Terms, Persons, Places, Names), + O = a(F, Selection, TermTags, PersonTags, PlaceTags, NameTags), + atom_number(FragmentId, F), + rdf(Video, dc:id, literal(FragmentId)), + tag_match(Terms, Video, TermTags), + tag_match(Persons, Video, PersonTags), + tag_match(Places, Video, PlaceTags), + tag_match(Names, Video, NameTags). + +tag_match([], _, []). +tag_match([Term|Ts], Video, [Term-Tag|Rest]) :- + concept_tag(Term, Video, Tag), + !, + tag_match(Ts, Video, Rest). +tag_match([_|Ts], Video, Rest) :- + tag_match(Ts, Video, Rest). + + +concept_tag(Term, Video, Tag) :- + snowball(dutch, Term, TermStem), + rdf(Video, pprime:hasAnnotation, E), + rdf(E, rdf:value, literal(Tag)), + snowball(dutch, Tag, TagStem), + TagStem = TermStem. + %isub(TermStem, TagStem, true, Sim), + %Sim > 0.9, + %rdf(E, rdf:value, literal(prefix(Stem), _)), + + +segment_terms([], []). +segment_terms([FragmentId-SelectionFile|Ps], [A|As]) :- + A = a(FragmentId, SelectionFile, Terms, Persons, Places, Names), + load_xml_file(SelectionFile, SelectionXML), + findall(T, selection_term(SelectionXML, //opnameplaatsen/opnameplaats, T), Places0), + findall(T, selection_term(SelectionXML, //trefwoorden/trefwoord, T), Terms), + findall(T, selection_term(SelectionXML, //persoonsnamen/persoonsnaam, T), Persons0), + findall(T, selection_term(SelectionXML, //spreker/naam, T), Persons1), + findall(T, selection_term(SelectionXML, //geografische_namen/geografische_naam, T), Places1), + findall(T, selection_term(SelectionXML, //namen/naam, T), Names), + append(Places0, Places1, Places), + append(Persons0, Persons1, Persons), + segment_terms(Ps, As). + +selection_term(XML, Path, Term) :- + xpath(XML, Path, element(_,_,[Term])). %% overlap(-Pairs) @@ -73,7 +167,7 @@ filter_waisda_tags([_|T], Rest) :- filter_waisda_tags(T, Rest). -%% link_beng_waisda(+Expressies, -List(fragment-selection)) +%% link_beng_waisda(+Expressies, -Pairs) % % we assume that sorting on the ids of the fragments and the % filenames of Selections provides the right order. Thus, the @@ -117,14 +211,14 @@ beng_selections(Reeks, Selections) :- expressie(Expressies, ReeksPath, Selections). expressie([], _, []). -expressie([Dir,File|T], Reeks, [expressie(Dir,Date,Selections)|Rest]) :- +expressie([Dir,File|T], Reeks, [expressie(AbsDir,Date,Selections)|Rest]) :- atom_concat(Reeks, File, AbsFile), atom_concat(Reeks, Dir, AbsDir), absolute_file_name(beng(AbsFile), ExpressieXML), absolute_file_name(beng(AbsDir), SelectionDir), beng_date(ExpressieXML, Date), directory_files(SelectionDir, [_,_|Selections0]), - remove_non_xml(Selections0, Selections), + selection_xml_files(Selections0, SelectionDir, Selections), length(Selections, SelectionCount), debug(beng_xml, 'expressie: ~w ~w ~w', [Date,File,SelectionCount]), Selections \== [], @@ -133,13 +227,14 @@ expressie([Dir,File|T], Reeks, [expressie(Dir,Date,Selections)|Rest]) :- expressie([_Dir,_File|T], Reeks, Rest) :- expressie(T, Reeks, Rest). -remove_non_xml([], []). -remove_non_xml([H|T], [H|Rest]) :- +selection_xml_files([], _, []). +selection_xml_files([H|T], Dir, [File|Rest]) :- atom_concat(_, '.xml', H), !, - remove_non_xml(T, Rest). -remove_non_xml([_|T], Rest) :- - remove_non_xml(T, Rest). + concat_atom([Dir, '/', H], File), + selection_xml_files(T, Dir, Rest). +selection_xml_files([_|T], Dir, Rest) :- + selection_xml_files(T, Dir, Rest). %% beng_date(+XMLFile, -Date)