:- module(beng_xml, [overlap/1, beng_stats/0, import_fragments/0, assert_fragment_annotations/0, pprime_frags_by_year/1 ]). :- use_module(library(xpath)). :- use_module(library(csv)). :- use_module(library(yaz_util)). :- use_module(library(isub)). :- dynamic fragment/2. %:- import_fragments. user:file_search_path(beng, '/Users/michiel/Projects/pprime/data/werk_10629/10629'). user:file_search_path(mbh, '/Users/michiel/Projects/pprime/data/manbijthond'). reeks('10629'). reeks('357006'). reeks('357007'). assert_fragment_annotations :- import_fragments, overlap(Pairs), segment_terms(Pairs, Annotations), assert_fragment_annotations(Annotations). assert_fragment_annotations(As) :- rdf_equal(gtaa:'OnderwerpenBenG', Onderwerpen), rdf_equal(gtaa:'Persoonsnamen', Persons), rdf_equal(gtaa:'GeografischeNamen', Places), rdf_equal(gtaa:'Namen', Names), rdf_transaction((member(A,As), A = a(F,_Selection,T,P,PL,N), atom_number(FA, F), rdf(Video, dc:id, literal(FA)), debug(frag_assert, '~n~nassert fragments for: ~w', [Video]), rdf_assert(Video, rdf:type, pprime:'Baseline', baseline), assert_terms(T, Video, Onderwerpen), assert_terms(P, Video, Persons), assert_terms(PL, Video, Places), assert_terms(N, Video, Names), fail ;true )). assert_terms([], _, _). assert_terms([T|Ts], Video, Scheme) :- findall(C, term_concept_match(T, Scheme, C), Concepts), length(Concepts, N), ( Concepts = [Concept] -> rdf_assert(Video, dc:subject, Concept, baseline) ; debug(frag_assert, '~w: ~w results', [T, N]) ), assert_terms(Ts, Video, Scheme). term_concept_match(Term, Scheme, Concept) :- rdf(Concept, skos:prefLabel, literal(Term)), rdf(Concept, skos:inScheme, Scheme). tag_overlap_stats([]). tag_overlap_stats([A|As]) :- tag_overlap(A, O), A = a(F,_Selection,T,P,PL,N), O = a(_,_,OT,OP,OPL,ON), append([T,P,PL,N], Terms), append([OT,OP,OPL,ON], Matches), length(Terms, TLength), length(Matches, MLength), format('~w\t ~w\/~w~n', [F, MLength, TLength]), tag_overlap_stats(As). tag_overlaps([], []). tag_overlaps([A|As], [O|Os]) :- tag_overlap(A, O), tag_overlaps(As, Os). tag_overlap(A, O) :- A = a(F, Selection, Terms, Persons, Places, Names), O = a(F, Selection, TermTags, PersonTags, PlaceTags, NameTags), atom_number(FragmentId, F), rdf(Video, dc:id, literal(FragmentId)), tag_match(Terms, Video, TermTags), tag_match(Persons, Video, PersonTags), tag_match(Places, Video, PlaceTags), tag_match(Names, Video, NameTags). tag_match([], _, []). tag_match([Term|Ts], Video, [Term-Tag|Rest]) :- concept_tag(Term, Video, Tag), !, tag_match(Ts, Video, Rest). tag_match([_|Ts], Video, Rest) :- tag_match(Ts, Video, Rest). concept_tag(Term, Video, Tag) :- snowball(dutch, Term, TermStem), rdf(Video, pprime:hasAnnotation, E), rdf(E, rdf:value, literal(Tag)), snowball(dutch, Tag, TagStem), TagStem = TermStem. %isub(TermStem, TagStem, true, Sim), %Sim > 0.9, %rdf(E, rdf:value, literal(prefix(Stem), _)), segment_terms([], []). segment_terms([FragmentId-SelectionFile|Ps], [A|As]) :- A = a(FragmentId, SelectionFile, Terms, Persons, Places, Names), load_xml_file(SelectionFile, SelectionXML), findall(T, selection_term(SelectionXML, //opnameplaatsen/opnameplaats, T), Places0), findall(T, selection_term(SelectionXML, //trefwoorden/trefwoord, T), Terms), findall(T, selection_term(SelectionXML, //persoonsnamen/persoonsnaam, T), Persons0), findall(T, selection_term(SelectionXML, //spreker/naam, T), Persons1), findall(T, selection_term(SelectionXML, //geografische_namen/geografische_naam, T), Places1), findall(T, selection_term(SelectionXML, //namen/naam, T), Names), append(Places0, Places1, Places), append(Persons0, Persons1, Persons), segment_terms(Ps, As). selection_term(XML, Path, Term) :- xpath(XML, Path, element(_,_,[Term])). %% overlap(-Pairs) % % Pairs:fragment-selection maps the BandG annotated selections to % Waisda tagged fragments. overlap(Pairs) :- findall(R, reeks(R), Reeksen), maplist(beng_selections, Reeksen, Expressies0), append(Expressies0, Expressies), link_beng_waisda(Expressies, WaisdaFragments0), keysort(WaisdaFragments0, WaisdaFragments), filter_waisda_tags(WaisdaFragments, Pairs). beng_stats :- findall(R, reeks(R), Reeksen), maplist(beng_selections, Reeksen, Expressies0), append(Expressies0, Expressies), findall(S, (member(expressie(_,_,Ss),Expressies), member(S,Ss)), Selecties), link_beng_waisda(Expressies, WaisdaFragments0), keysort(WaisdaFragments0, WaisdaFragments), filter_waisda_tags(WaisdaFragments, WaisdaTaggedFragments), length(Expressies, ExpressieCount), length(Selecties, SelectieCount), length(WaisdaFragments, WaisdaFragmentCount), length(WaisdaTaggedFragments, WaisdaTaggedFragmentCount), format('expressies, ~w~n', [ExpressieCount]), format('selecties, ~w~n', [SelectieCount]), format('waisda fragments, ~w~n', [WaisdaFragmentCount]), format('waisda tagged fragments, ~w~n', [WaisdaTaggedFragmentCount]). filter_waisda_tags([], []). filter_waisda_tags([F-S|T], [F-S|Rest]) :- number(F), atom_number(FA, F), rdf(Video, dc:id, literal(FA)), rdf(Video, pprime:hasAnnotation, E), rdf(E, pprime:score, literal(Score)), atom_number(Score,ScoreN), ScoreN > 5, !, filter_waisda_tags(T, Rest). filter_waisda_tags([_|T], Rest) :- filter_waisda_tags(T, Rest). %% link_beng_waisda(+Expressies, -Pairs) % % we assume that sorting on the ids of the fragments and the % filenames of Selections provides the right order. Thus, the % same order link_beng_waisda([], []). link_beng_waisda([expressie(Dir,Date0,Ss)|T], Linked) :- flip_date(Date0, Date), findall(F, fragment(F, Date), Fs), sort(Fs, Fragments), sort(Ss, Selections), length(Fragments, FN), length(Selections, SN), ( Fragments = [] -> debug(beng_link, 'no fragments found: ~w ~w', [Dir, Date0]), Pairs = [] ; FN == SN -> debug(beng_link, 'linked: ~w ~w', [Dir, Date0]), pairs_keys_values(Pairs, Fragments, Selections) % do something smarter here :) %; FN is SN-1 % last item seems to be missing (ons kent ons) %-> debug(beng_link, 'removed last element: ~w ~w', [Dir, Date0]), % append(Selections1, [_], Selections), % pairs_keys_values(Pairs, Fragments, Selections1) ; debug(beng_link, 'unmatching count: ~w ~w ~w ~w', [Dir, Date0, FN, SN]), Pairs = [] ), link_beng_waisda(T, Rest), append(Pairs,Rest,Linked). flip_date(Date1, Date2) :- concat_atom([D,M,Y], '-', Date1), concat_atom([Y,M,D], '-', Date2). beng_selections(Reeks, Selections) :- %debug(beng_xml), debug(beng_xml, '~n~nreeks: ~w', [Reeks]), absolute_file_name(beng(Reeks), Dir), directory_files(Dir, [_,_|Expressies]), concat_atom(['/', Reeks, '/'], ReeksPath), expressie(Expressies, ReeksPath, Selections). expressie([], _, []). expressie([Dir,File|T], Reeks, [expressie(AbsDir,Date,Selections)|Rest]) :- atom_concat(Reeks, File, AbsFile), atom_concat(Reeks, Dir, AbsDir), absolute_file_name(beng(AbsFile), ExpressieXML), absolute_file_name(beng(AbsDir), SelectionDir), beng_date(ExpressieXML, Date), directory_files(SelectionDir, [_,_|Selections0]), selection_xml_files(Selections0, SelectionDir, Selections), length(Selections, SelectionCount), debug(beng_xml, 'expressie: ~w ~w ~w', [Date,File,SelectionCount]), Selections \== [], !, expressie(T, Reeks, Rest). expressie([_Dir,_File|T], Reeks, Rest) :- expressie(T, Reeks, Rest). selection_xml_files([], _, []). selection_xml_files([H|T], Dir, [File|Rest]) :- atom_concat(_, '.xml', H), !, concat_atom([Dir, '/', H], File), selection_xml_files(T, Dir, Rest). selection_xml_files([_|T], Dir, Rest) :- selection_xml_files(T, Dir, Rest). %% beng_date(+XMLFile, -Date) % % Date is the broadcast date. beng_date(XMLFile, Date) :- load_xml_file(XMLFile,XMLTree), xpath(XMLTree, //publicaties//begindatum, element(begindatum,_,[Date])), !. selection_count(ReeksSelections) :- debug(beng_selection), absolute_file_name(beng(.), Dir), directory_files(Dir, [_,_|Reeksen]), reeks_selections(Reeksen, ReeksSelections). reeks_selections([], []). reeks_selections([Reeks,_File|T], [Reeks-SelectionCount|Rest]) :- beng_selections(Reeks, Selections), length(Selections, SelectionCount), debug(beng_selection, '~w ~w', [Reeks, SelectionCount]), reeks_selections(T, Rest). %% import_fragments % % import_fragments :- File = 'mbh_fragments_simple.csv', absolute_file_name(mbh(File), AbsFile), csv_read_file(AbsFile, [_,_|Rows], []), assert_fragments(Rows). assert_fragments([]). assert_fragments([Row|Rows]) :- Row = row(Date, FragmentId), assert(fragment(FragmentId, Date)), assert_fragments(Rows). %% pprime_frags_by_year(-Pairs) % % pprime_frags_by_year(FragsByYear) :- findall(Id, (rdf(V,rdf:type,pprime:'Video'),rdf(V,dc:id,literal(Id))), Vs), maplist(fragment_year, Vs, Pairs), keysort(Pairs, Sorted), group_pairs_by_key(Sorted, Groups), pairs_sort_by_value_count(Groups, FragsByYear). fragment_year(Id, Year-Id) :- atom_number(Id, N), %Id = N, fragment(N, Date), concat_atom([Year,_,_], '-', Date).