vumix/commit
experiments
author | Michiel Hildebrand |
---|---|
Tue Jun 5 17:57:18 2012 +0200 | |
committer | Michiel Hildebrand |
Tue Jun 5 17:57:18 2012 +0200 | |
commit | a56becc44b7a7f10f772d47e5aab9b976e21490c |
tree | acbe75815800598e89aad32dd3df428dad707f36 |
parent | dc3b8287253e2ddebd7d7ba8c567585fcb30a5cc |
Diff style: patch stat
diff --git a/applications/vumix_p0.pl b/applications/vumix_p0.pl index 8f3e0c9..0d5a7e7 100644 --- a/applications/vumix_p0.pl +++ b/applications/vumix_p0.pl @@ -30,7 +30,7 @@ %:- use_module(library(stop_words)). :- use_module(library(tfidf)). %:- use_module(library(tag_concept)). -:- use_module(library(semrank)). +%:- use_module(library(semrank)). %:- use_module(library(real)). /*************************************************** @@ -118,7 +118,7 @@ http_vumix_p0(Request) :- ( Fields0 = [] -> Fields = ['http://semanticweb.cs.vu.nl/prestoprime/personAnnotation', 'http://semanticweb.cs.vu.nl/prestoprime/placeAnnotation', - 'http://semanticweb.cs.vu.nl/prestoprime/nameAnnotation', + %'http://semanticweb.cs.vu.nl/prestoprime/nameAnnotation', 'http://semanticweb.cs.vu.nl/prestoprime/subjectAnnotation'] ; Fields = Fields0 ), @@ -128,8 +128,8 @@ http_vumix_p0(Request) :- ; logged_on(User, anonymous) ), user_process(User, Target, _), - %tfidf_rank(Target, Concepts), - c_semantic_distance_rank(Target, Concepts), + tfidf_rank(Target, Concepts), + %c_semantic_distance_rank(Target, Concepts), html_page(Target, Fields, Concepts). tfidf_rank(Target, Concepts) :- diff --git a/config-available/vumix.pl b/config-available/vumix.pl index 7b4237c..ab61caf 100644 --- a/config-available/vumix.pl +++ b/config-available/vumix.pl @@ -8,6 +8,6 @@ % hack namespace :- rdf_register_ns(pprime, 'http://semanticweb.cs.vu.nl/prestoprime/'). -:- use_module(applications(vumix)). -:- use_module(applications(vumix_p0)). +%:- use_module(applications(vumix)). +%:- use_module(applications(vumix_p0)). diff --git a/lib/semrank.pl b/lib/semrank.pl index 0e2cc6b..fb0648f 100644 --- a/lib/semrank.pl +++ b/lib/semrank.pl @@ -50,7 +50,7 @@ tag_interpretations(Tags, Edges, Concepts) :- rdf_equal(skos:'Concept', Type), findall(i(Tag,C,W), (member(W-Tag,Tags), - W > 0.01, + %W > 0.001, reconcile(Tag, 10, Type, [], Hits), member(hit(D,C,_,_), Hits), D < 1.5 @@ -92,7 +92,7 @@ semantic_distance(A, B, _) :- semantic_distance(A, B, D) :- rdf_reachable(B, skos:broader, A, 3, N), !, - D is 0.5^N. + D is 0.05^N. semantic_distance(A, B, D) :- rdf_reachable(A, skos:broader, B, 2, N), !, @@ -101,6 +101,13 @@ semantic_distance(A, B, D) :- rdf_reachable(A, skos:related, B, 2, N), !, D is 0.01^N. +semantic_distance(A, B, D) :- + rdf(A, P, B), + \+ rdfs_subproperty_of(P, skos:broader), + \+ rdfs_subproperty_of(P, skos:narrower), + \+ rdfs_subproperty_of(P, skos:related), + !, + D is 0.01. /* semantic_distance(A, B, D) :- rdf(A, _, C), @@ -149,3 +156,49 @@ rename_vertex(C, Assoc, N, Assoc, N, New) :- rename_vertex(C, Assoc, N, Assoc1, N1, N) :- N1 is N+1, put_assoc(C, Assoc, N, Assoc1). + + + +mbh_pagerank_eval_table(V) :- + ( m_video(V), + rdf(V, dc:id, literal(Id)), + video_mbh_terms(V, Terms), + semantic_distance_rank(V, Concepts), + length(Terms, TermCount), + + ( TermCount is 0 + -> format('~w', [Id]) + ; TermCount2 is TermCount*2, + + pairs_values(Concepts, Concepts1), + topN(Concepts1, TermCount, Terms, TopN), + topN(Concepts1, TermCount2, Terms, Top2N), + + format('~w,~2f,~2f~n', [Id,TopN, Top2N]) + ), + fail + ; true + ). + +topN(Tags, N, Terms, TopN) :- + length(Terms, Term_Count), + length(Tags, Tag_Count), + ( Tag_Count > N + -> length(Top_N_Tags, N), + append(Top_N_Tags, _, Tags) + ; Top_N_Tags = Tags + ), + term_match(Terms, Top_N_Tags, Intersect), + length(Intersect, Intersect_Count), + TopN is Intersect_Count/Term_Count. + +term_match([], _, []). +term_match([Term|Ts], Concepts, [Term|Intersect]) :- + member(C, Concepts), + rdf_has(C, rdfs:label, literal(exact(Term), _)), + !, + term_match(Ts, Concepts, Intersect). +term_match([_Term|Ts], Concepts, Intersect) :- + term_match(Ts, Concepts, Intersect). + + diff --git a/lib/tag_cluster.pl b/lib/tag_cluster.pl new file mode 100644 index 0000000..e3fa8ab --- /dev/null +++ b/lib/tag_cluster.pl @@ -0,0 +1,121 @@ +:- module(tag_cluster, + [tag_cluster/3 + ]). + +:- use_module(library(semweb/rdf_db)). +:- use_module(library(semweb/rdfs)). +:- use_module(library(real)). +:- use_module(library(tfidf)). +:- use_module(api(reconcile)). + +tag_cluster(Video, Clusters, TopClusterTags) :- + tag_rank(Video, RankedTags), + pairs_values(RankedTags, Tags), + length(RankedTags, Tag_Count), + debug(semrank, 'tags: ~w', [Tag_Count]), + + tag_interpretations(RankedTags, Interpretations), + list_to_assoc(Interpretations, Assoc), + pairs_keys(Interpretations, TagsWithI0), + sort(TagsWithI0, TagsWithI), + length(TagsWithI, TagsWithI_Count), + debug(semrank, 'tags with interpretation: ~w', [TagsWithI_Count]), + + tag_distance_matrix(Tags, Tags, Assoc, Matrix), + tag_clustering(Matrix, ClusterIndex), + + pairs_keys_values(Membership0, ClusterIndex, RankedTags), + keysort(Membership0, Membership), + group_pairs_by_key(Membership, Clusters), + maplist(top_tag, Clusters, TopClusterTags0), + keysort(TopClusterTags0, TopClusterTags1), + reverse(TopClusterTags1, TopClusterTags). + +top_tag(_-[Tag|_], Tag). + +tag_clustering(Matrix, Clusters) :- + m <- Matrix, + Clusters <- 'kmeans(m, 15)$cluster'. + + +tag_distance_matrix([], _, _, []). +tag_distance_matrix([Tag|Ts], Tags, Assoc, [Row|Rows]) :- + tag_distances(Tags, Tag, Assoc, Row), + tag_distance_matrix(Ts, Tags, Assoc, Rows). + + +tag_distances([], _, _, []). +tag_distances([Tag2|Tags], Tag1, Assoc, [D|Rows]) :- + tag_distance(Tag1, Tag2, Assoc, D), + tag_distances(Tags, Tag1, Assoc, Rows). + +tag_distance(T, T, _, 1) :- !. +tag_distance(T1, T2, Assoc, Distance) :- + findall(D, + ( tag_concept(T1,Assoc,C1), + tag_concept(T2,Assoc,C2), + concept_distance(C1, C2, D) + ), + Ds), + ( Ds = [] + -> Distance = 0 + ; max_list(Ds, Distance) + ). + + +concept_distance(A, A, 1) :- !. +concept_distance(A, B, D) :- + rdf_reachable(A, skos:broader, C, 3, N1), + rdf_reachable(B, skos:broader, C, 3, N2), + D is 0.5^(N1+N2). +concept_distance(A, B, D) :- + rdf_reachable(A, skos:related, B, 2, N), + D is 0.5^N. + + +tag_concept(Tag, I, Concept) :- + get_assoc(Tag, I, Concepts), + member(Concept, Concepts). + +tag_interpretations([], []). +%tag_interpretations([W-_Tag|Tags], Pairs) :- +% W < 0.005, +% !, +% tag_interpretations(Tags, Pairs). +tag_interpretations([_W-Tag|Tags], [Tag-Concepts|Pairs]) :- + rdf_equal(skos:'Concept', Type), + findall(C, + (reconcile(Tag, 10, Type, [], Hits), + member(hit(D,C,_,_), Hits), + D < 6 + ), + Concepts), + tag_interpretations(Tags, Pairs). + + +mbh_cluster_eval_table(V) :- + ( m_video(V), + rdf(V, dc:id, literal(Id)), + video_mbh_terms(V, Terms), + length(Terms, TermCount), + tag_cluster(V, _Clusters, RankedTags), + pairs_values(RankedTags, Tags), + + ( TermCount = 0 + -> format('~w', [Id]) + ; intersect(Tags, Terms, Intersect), + length(Intersect, I_Count), + TopN is I_Count/TermCount, + + format('~w,~w~n', [Id, TopN]) + ), + fail + ; true + ). + + +intersect(Tags, Terms, Intersect) :- + maplist(downcase_atom, Tags, Tags1), + sort(Tags1, Tags2), + ord_intersect(Tags2, Terms, Intersect). + diff --git a/lib/tag_concept.pl b/lib/tag_concept.pl index 0fe3b6d..ff9e89b 100644 --- a/lib/tag_concept.pl +++ b/lib/tag_concept.pl @@ -548,7 +548,7 @@ min_tags(V) :- findall(P,(rdf(V,pprime:hasAnnotation,A),rdf(A,rdf:value,P)), Ps0), sort(Ps0, Ps), length(Ps,N), - N > 200. + N > 0. multiple_players(V) :- findall(P,(rdf(V,pprime:hasAnnotation,A),rdf(A,pprime:creator,P)), Ps0), @@ -560,7 +560,7 @@ multiple_players(V) :- -mbh_topN_eval_table :- +mbh_topN_eval_table(V) :- ( m_video(V), rdf(V, dc:id, literal(Id)), video_mbh_terms(V, Terms), diff --git a/lib/tagrank.pl b/lib/tagrank.pl new file mode 100644 index 0000000..4ec9cbe --- /dev/null +++ b/lib/tagrank.pl @@ -0,0 +1,218 @@ +:- module(tagrank, + [sem_tag_rank/2 + ]). + +:- use_module(library(semweb/rdf_db)). +:- use_module(library(semweb/rdfs)). +:- use_module(library(real)). +:- use_module(library(tfidf)). +:- use_module(api(reconcile)). + + +sem_tag_rank(Video, Ranked) :- + tag_rank(Video, RankedTags), + length(RankedTags, Tag_Count), + debug(semrank, 'tags: ~w', [Tag_Count]), + tag_interpretations(RankedTags, Interpretations), + pairs_values(RankedTags, Tags), + list_to_assoc(Interpretations, Interpretations_Assoc), + tag_distance_graph(Tags, Interpretations_Assoc, Graph), + length(Graph, Edge_Count), + debug(semrank, 'distance graph size: ~w', [Edge_Count]), + weighted_graph_rename(Graph, Mapping, Vector, Weights), + length(Vector, V_Count), + debug(semrank, 'vector: ~w', [V_Count]), + page_rank(Vector, Weights, Scores0), + Scores0 = [_EigenVector|Scores], + assoc_to_list(Mapping, Pairs0), + transpose_pairs(Pairs0, Pairs1), + keysort(Pairs1, Pairs), + pairs_values(Pairs, Resources), + pairs_keys_values(Ranked0, Scores, Resources), + keysort(Ranked0, Ranked1), + reverse(Ranked1, Ranked2), + merge_scores(RankedTags, Ranked2, Ranked). + + +merge_scores(R1, R2, R) :- + transpose_pairs(R1,R1_T), + transpose_pairs(R2,R2_T), + merge_tag_scores(R1_T,R2_T, Merged0), + transpose_pairs(Merged0, Merged), + keysort(Merged, Merged1), + reverse(Merged1, R). + + +merge_tag_scores([], T, T). +merge_tag_scores([R-S1|T1], [R-S2|T2], [R-S|T]) :- + !, + S is S1+S2, + merge_tag_scores(T1, T2, T). +merge_tag_scores([R1-S1|T1], T2, [R1-S1|T]) :- + merge_tag_scores(T1, T2, T). + + +tag_interpretations([], []). +%tag_interpretations([W-_Tag|Tags], Pairs) :- +% W < 0.005, +% !, +% tag_interpretations(Tags, Pairs). +tag_interpretations([_W-Tag|Tags], [Tag-Concepts|Pairs]) :- + rdf_equal(skos:'Concept', Type), + findall(C, + (reconcile(Tag, 10, Type, [], Hits), + member(hit(D,C,_,_), Hits), + D < 6 + ), + Concepts), + tag_interpretations(Tags, Pairs). + +page_rank(Vector, Weights, Rank) :- + <- library(igraph), + v <- Vector, + w <- Weights, + g <- graph(v), + Rank <- 'page.rank(g, directed=TRUE, weights = w)$vector'. + +tag_distance_graph(Tags, Interpretations, Graph) :- + cartesian(Tags, Tags, Cartesian), + pair_distances(Cartesian, Interpretations, Graph). + + +pair_distances([], _, []). +pair_distances([[A,A]|T], I, Graph) :- + !, + pair_distances(T, I, Graph). +pair_distances([[A,B]|T], I, [Rel|Graph]) :- + Rel = i(A,B,Distance), + findall(D, + ( get_assoc(A, I, A_Concepts), + get_assoc(B, I, B_Concepts), + member(A_C, A_Concepts), + member(B_C, B_Concepts), + semantic_distance(A_C,B_C,D) + ), + Ds0), + sort(Ds0, Ds), + reverse(Ds, [Distance|_]), + !, + pair_distances(T, I, Graph). +pair_distances([_|T], I, Graph) :- + pair_distances(T, I, Graph). + + +semantic_distance(A, B, D) :- + rdf_reachable(B, skos:broader, A, 5, N), + !, + D is 0.5^N. +semantic_distance(A, B, D) :- + rdf_reachable(A, skos:broader, B, 2, N), + !, + D is 0.01^N. +semantic_distance(A, B, D) :- + rdf_reachable(A, skos:broader, C, 2, N1), + A \== C, + rdf_reachable(B, skos:broader, C, 2, N2), + B \== C, + !, + N is max(N1, N2), + D is 0.01^N. +semantic_distance(A, B, D) :- + rdf_reachable(A, skos:related, B, 2, N), + !, + D is 0.01^N. +semantic_distance(A, B, D) :- + rdf(A, P, B), + \+ rdfs_subproperty_of(P, skos:broader), + \+ rdfs_subproperty_of(P, skos:narrower), + \+ rdfs_subproperty_of(P, skos:related), + !, + D is 0.01. +/* +semantic_distance(A, B, D) :- + rdf(A, _, C), + ( B == C + -> D = 0.5 + ; rdf_reachable(B, skos:broader, C, 3, _) + -> D = 0.25 + ; rdf_reachable(C, skos:broader, B, 3, _) + -> D = 0.5 + ). +*/ + +%% semantic_distance(+L1, +L2, -Cartesian) +% +% Cartesian is the cartesian product of L1 and L2. + +cartesian([], _L, []). +cartesian([A|N], L, M) :- + pair(A,L,M1), + cartesian(N, L, M2), + append(M1, M2, M). + +pair(_A, [], []) :- !. +pair(A, [B|L], [[A,B]|N] ) :- + pair(A, L, N). + + + +%% weighted_graph_rename(+Graph, -Assoc, -NewGraph, -Weights) +% +% Rename nodes so that they start counted by 1. + +weighted_graph_rename(Graph, Assoc, NewGraph, Weights) :- + empty_assoc(Assoc0), + rename_vertices(Graph, Assoc0, 1, NewGraph, Assoc, Weights). + +rename_vertices([], Assoc, _, [], Assoc, []). +rename_vertices([i(C1,C2,W)|T], Assoc0, N, [NewC1,NewC2|Rest], Assoc, [W|Weights]) :- + rename_vertex(C1, Assoc0, N, Assoc1, N1, NewC1), + rename_vertex(C2, Assoc1, N1, Assoc2, N2, NewC2), + rename_vertices(T, Assoc2, N2, Rest, Assoc, Weights). + +rename_vertex(C, Assoc, N, Assoc, N, New) :- + get_assoc(C, Assoc, New), + !. +rename_vertex(C, Assoc, N, Assoc1, N1, N) :- + N1 is N+1, + put_assoc(C, Assoc, N, Assoc1). + + + + +mbh_semrank_eval_table(V) :- + ( m_video(V), + rdf(V, dc:id, literal(Id)), + video_mbh_terms(V, Terms), + sem_tag_rank(V, Tags), + length(Terms, TermCount), + length(Tags, TagCount), + + ( TermCount is 0 + -> format('~w', [Id]) + ; TermCount2 is TermCount*2, + + pairs_values(Tags, Tags1), + topN(Tags1, TermCount, Terms, TopN), + topN(Tags1, TermCount2, Terms, Top2N), + topN(Tags1, TagCount, Terms, All), + + format('~w,\t~w,\t~w,\t~2f,\t~2f,\t~2f~n', [Id, TagCount, TermCount, TopN, Top2N, All]) + ), + fail + ; true + ). + +topN(Tags, N, Terms, TopN) :- + length(Terms, Term_Count), + length(Tags, Tag_Count), + ( Tag_Count > N + -> length(Top_N_Tags, N), + append(Top_N_Tags, _, Tags) + ; Top_N_Tags = Tags + ), + maplist(downcase_atom, Top_N_Tags, Top_N_Tags1), + sort(Top_N_Tags1, Top_N_Tags2), + ord_intersect(Top_N_Tags2, Terms, Intersect), + length(Intersect, Intersect_Count), + TopN is Intersect_Count/Term_Count. diff --git a/lib/tfidf.pl b/lib/tfidf.pl index f323d44..ea775ff 100644 --- a/lib/tfidf.pl +++ b/lib/tfidf.pl @@ -1,6 +1,7 @@ :- module(tfidf, [flush_tag_rank/1, tag_rank/2, + document_term/2, documents/1, tf/3, idf/3, @@ -42,9 +43,9 @@ documents(Videos) :- document_term(D, T) :- rdf(D, pprime:hasAnnotation, E), rdf(E, rdf:value, literal(T)). - %rdf(E, pprime:score, literal(SA)), - %atom_number(SA, S), - %S > 5. +% rdf(E, pprime:score, literal(SA)), +% atom_number(SA, S), +% S > 5. tf(T, D, TF) :- findall(A, rdf(D, pprime:hasAnnotation, A), As),