vumix/commit

experiments

author	Michiel Hildebrand
	Tue Jun 5 17:57:18 2012 +0200
committer	Michiel Hildebrand
	Tue Jun 5 17:57:18 2012 +0200
commit	a56becc44b7a7f10f772d47e5aab9b976e21490c
tree	acbe75815800598e89aad32dd3df428dad707f36
parent	dc3b8287253e2ddebd7d7ba8c567585fcb30a5cc
Diff style: patch stat
diff --git a/applications/vumix_p0.pl b/applications/vumix_p0.pl
index 8f3e0c9..0d5a7e7 100644
--- a/applications/vumix_p0.pl
+++ b/applications/vumix_p0.pl
@@ -30,7 +30,7 @@
 %:- use_module(library(stop_words)).
 :- use_module(library(tfidf)).
 %:- use_module(library(tag_concept)).
-:- use_module(library(semrank)).
+%:- use_module(library(semrank)).
 %:- use_module(library(real)).
 
 /***************************************************
@@ -118,7 +118,7 @@ http_vumix_p0(Request) :-
 	(   Fields0 = []
 	->  Fields = ['http://semanticweb.cs.vu.nl/prestoprime/personAnnotation',
 		      'http://semanticweb.cs.vu.nl/prestoprime/placeAnnotation',
-		      'http://semanticweb.cs.vu.nl/prestoprime/nameAnnotation',
+		      %'http://semanticweb.cs.vu.nl/prestoprime/nameAnnotation',
 		      'http://semanticweb.cs.vu.nl/prestoprime/subjectAnnotation']
 	;   Fields = Fields0
 	),
@@ -128,8 +128,8 @@ http_vumix_p0(Request) :-
 	;   logged_on(User, anonymous)
         ),
 	user_process(User, Target, _),
-	%tfidf_rank(Target, Concepts),
-	c_semantic_distance_rank(Target, Concepts),
+	tfidf_rank(Target, Concepts),
+	%c_semantic_distance_rank(Target, Concepts),
 	html_page(Target, Fields, Concepts).
 
 tfidf_rank(Target, Concepts) :-
diff --git a/config-available/vumix.pl b/config-available/vumix.pl
index 7b4237c..ab61caf 100644
--- a/config-available/vumix.pl
+++ b/config-available/vumix.pl
@@ -8,6 +8,6 @@
 % hack namespace
  :- rdf_register_ns(pprime, 'http://semanticweb.cs.vu.nl/prestoprime/').
 
-:- use_module(applications(vumix)).
-:- use_module(applications(vumix_p0)).
+%:- use_module(applications(vumix)).
+%:- use_module(applications(vumix_p0)).
 
diff --git a/lib/semrank.pl b/lib/semrank.pl
index 0e2cc6b..fb0648f 100644
--- a/lib/semrank.pl
+++ b/lib/semrank.pl
@@ -50,7 +50,7 @@ tag_interpretations(Tags, Edges, Concepts) :-
 	rdf_equal(skos:'Concept', Type),
 	findall(i(Tag,C,W),
 		(member(W-Tag,Tags),
-		 W > 0.01,
+		 %W > 0.001,
 		 reconcile(Tag, 10, Type, [], Hits),
 		 member(hit(D,C,_,_), Hits),
 		 D < 1.5
@@ -92,7 +92,7 @@ semantic_distance(A, B, _) :-
 semantic_distance(A, B, D) :-
 	rdf_reachable(B, skos:broader, A, 3, N),
 	!,
-	D is 0.5^N.
+	D is 0.05^N.
 semantic_distance(A, B, D) :-
 	rdf_reachable(A, skos:broader, B, 2, N),
 	!,
@@ -101,6 +101,13 @@ semantic_distance(A, B, D) :-
 	rdf_reachable(A, skos:related, B, 2, N),
 	!,
 	D is 0.01^N.
+semantic_distance(A, B, D) :-
+	rdf(A, P, B),
+	\+ rdfs_subproperty_of(P, skos:broader),
+	\+ rdfs_subproperty_of(P, skos:narrower),
+	\+ rdfs_subproperty_of(P, skos:related),
+	!,
+	D is 0.01.
 /*
 semantic_distance(A, B, D) :-
 	rdf(A, _, C),
@@ -149,3 +156,49 @@ rename_vertex(C, Assoc, N, Assoc, N, New) :-
 rename_vertex(C, Assoc, N, Assoc1, N1, N) :-
 	N1 is N+1,
 	put_assoc(C, Assoc, N, Assoc1).
+
+
+
+mbh_pagerank_eval_table(V) :-
+	(   m_video(V),
+	    rdf(V, dc:id, literal(Id)),
+	    video_mbh_terms(V, Terms),
+	    semantic_distance_rank(V, Concepts),
+	    length(Terms, TermCount),
+
+	    (	TermCount is 0
+	    ->	format('~w', [Id])
+	    ;	TermCount2 is TermCount*2,
+
+		pairs_values(Concepts, Concepts1),
+		topN(Concepts1, TermCount, Terms, TopN),
+		topN(Concepts1, TermCount2, Terms, Top2N),
+
+		format('~w,~2f,~2f~n', [Id,TopN, Top2N])
+	    ),
+	    fail
+	;   true
+	).
+
+topN(Tags, N, Terms, TopN) :-
+	length(Terms, Term_Count),
+	length(Tags, Tag_Count),
+	(   Tag_Count > N
+	->  length(Top_N_Tags, N),
+	    append(Top_N_Tags, _, Tags)
+	;   Top_N_Tags = Tags
+	),
+	term_match(Terms, Top_N_Tags, Intersect),
+	length(Intersect, Intersect_Count),
+	TopN is Intersect_Count/Term_Count.
+
+term_match([], _, []).
+term_match([Term|Ts], Concepts, [Term|Intersect]) :-
+	member(C, Concepts),
+	rdf_has(C, rdfs:label, literal(exact(Term), _)),
+	!,
+	term_match(Ts, Concepts, Intersect).
+term_match([_Term|Ts], Concepts, Intersect) :-
+	term_match(Ts, Concepts, Intersect).
+
+
diff --git a/lib/tag_cluster.pl b/lib/tag_cluster.pl
new file mode 100644
index 0000000..e3fa8ab
--- /dev/null
+++ b/lib/tag_cluster.pl
@@ -0,0 +1,121 @@
+:- module(tag_cluster,
+	  [tag_cluster/3
+	  ]).
+
+:- use_module(library(semweb/rdf_db)).
+:- use_module(library(semweb/rdfs)).
+:- use_module(library(real)).
+:- use_module(library(tfidf)).
+:- use_module(api(reconcile)).
+
+tag_cluster(Video, Clusters, TopClusterTags) :-
+	tag_rank(Video, RankedTags),
+	pairs_values(RankedTags, Tags),
+	length(RankedTags, Tag_Count),
+	debug(semrank, 'tags: ~w', [Tag_Count]),
+
+	tag_interpretations(RankedTags, Interpretations),
+	list_to_assoc(Interpretations, Assoc),
+	pairs_keys(Interpretations, TagsWithI0),
+	sort(TagsWithI0, TagsWithI),
+	length(TagsWithI, TagsWithI_Count),
+	debug(semrank, 'tags with interpretation: ~w', [TagsWithI_Count]),
+
+	tag_distance_matrix(Tags, Tags, Assoc, Matrix),
+	tag_clustering(Matrix, ClusterIndex),
+
+	pairs_keys_values(Membership0, ClusterIndex, RankedTags),
+	keysort(Membership0, Membership),
+	group_pairs_by_key(Membership, Clusters),
+	maplist(top_tag, Clusters, TopClusterTags0),
+	keysort(TopClusterTags0, TopClusterTags1),
+	reverse(TopClusterTags1, TopClusterTags).
+
+top_tag(_-[Tag|_], Tag).
+
+tag_clustering(Matrix, Clusters) :-
+	m <- Matrix,
+	Clusters <- 'kmeans(m, 15)$cluster'.
+
+
+tag_distance_matrix([], _, _, []).
+tag_distance_matrix([Tag|Ts], Tags, Assoc, [Row|Rows]) :-
+	tag_distances(Tags, Tag, Assoc, Row),
+	tag_distance_matrix(Ts, Tags, Assoc, Rows).
+
+
+tag_distances([], _, _, []).
+tag_distances([Tag2|Tags], Tag1, Assoc, [D|Rows]) :-
+	tag_distance(Tag1, Tag2, Assoc, D),
+	tag_distances(Tags, Tag1, Assoc, Rows).
+
+tag_distance(T, T, _, 1) :- !.
+tag_distance(T1, T2, Assoc, Distance) :-
+	findall(D,
+		(   tag_concept(T1,Assoc,C1),
+		    tag_concept(T2,Assoc,C2),
+		    concept_distance(C1, C2, D)
+		),
+		Ds),
+	(   Ds = []
+	->  Distance = 0
+	;   max_list(Ds, Distance)
+	).
+
+
+concept_distance(A, A, 1) :- !.
+concept_distance(A, B, D) :-
+	rdf_reachable(A, skos:broader, C, 3, N1),
+	rdf_reachable(B, skos:broader, C, 3, N2),
+	D is 0.5^(N1+N2).
+concept_distance(A, B, D) :-
+	rdf_reachable(A, skos:related, B, 2, N),
+	D is 0.5^N.
+
+
+tag_concept(Tag, I, Concept) :-
+	get_assoc(Tag, I, Concepts),
+	member(Concept, Concepts).
+
+tag_interpretations([], []).
+%tag_interpretations([W-_Tag|Tags], Pairs) :-
+%	W < 0.005,
+%	!,
+%	tag_interpretations(Tags, Pairs).
+tag_interpretations([_W-Tag|Tags], [Tag-Concepts|Pairs]) :-
+	rdf_equal(skos:'Concept', Type),
+	findall(C,
+		(reconcile(Tag, 10, Type, [], Hits),
+		 member(hit(D,C,_,_), Hits),
+		 D < 6
+		),
+		Concepts),
+	tag_interpretations(Tags, Pairs).
+
+
+mbh_cluster_eval_table(V) :-
+	(   m_video(V),
+	    rdf(V, dc:id, literal(Id)),
+	    video_mbh_terms(V, Terms),
+	    length(Terms, TermCount),
+	    tag_cluster(V, _Clusters, RankedTags),
+	    pairs_values(RankedTags, Tags),
+
+	    (	TermCount = 0
+	    ->	format('~w', [Id])
+	    ;	intersect(Tags, Terms, Intersect),
+		length(Intersect, I_Count),
+		TopN is I_Count/TermCount,
+
+		format('~w,~w~n', [Id, TopN])
+	    ),
+	    fail
+	;   true
+	).
+
+
+intersect(Tags, Terms, Intersect) :-
+	maplist(downcase_atom, Tags, Tags1),
+	sort(Tags1, Tags2),
+	ord_intersect(Tags2, Terms, Intersect).
+
diff --git a/lib/tag_concept.pl b/lib/tag_concept.pl
index 0fe3b6d..ff9e89b 100644
--- a/lib/tag_concept.pl
+++ b/lib/tag_concept.pl
@@ -548,7 +548,7 @@ min_tags(V) :-
 	findall(P,(rdf(V,pprime:hasAnnotation,A),rdf(A,rdf:value,P)), Ps0),
 	sort(Ps0, Ps),
 	length(Ps,N),
-	N > 200.
+	N > 0.
 
 multiple_players(V) :-
 	findall(P,(rdf(V,pprime:hasAnnotation,A),rdf(A,pprime:creator,P)), Ps0),
@@ -560,7 +560,7 @@ multiple_players(V) :-
 
 
 
-mbh_topN_eval_table :-
+mbh_topN_eval_table(V) :-
 	(   m_video(V),
 	    rdf(V, dc:id, literal(Id)),
 	    video_mbh_terms(V, Terms),
diff --git a/lib/tagrank.pl b/lib/tagrank.pl
new file mode 100644
index 0000000..4ec9cbe
--- /dev/null
+++ b/lib/tagrank.pl
@@ -0,0 +1,218 @@
+:- module(tagrank,
+	  [sem_tag_rank/2
+	  ]).
+
+:- use_module(library(semweb/rdf_db)).
+:- use_module(library(semweb/rdfs)).
+:- use_module(library(real)).
+:- use_module(library(tfidf)).
+:- use_module(api(reconcile)).
+
+
+sem_tag_rank(Video, Ranked) :-
+	tag_rank(Video, RankedTags),
+	length(RankedTags, Tag_Count),
+	debug(semrank, 'tags: ~w', [Tag_Count]),
+	tag_interpretations(RankedTags, Interpretations),
+	pairs_values(RankedTags, Tags),
+	list_to_assoc(Interpretations, Interpretations_Assoc),
+	tag_distance_graph(Tags, Interpretations_Assoc, Graph),
+	length(Graph, Edge_Count),
+	debug(semrank, 'distance graph size: ~w', [Edge_Count]),
+	weighted_graph_rename(Graph, Mapping, Vector, Weights),
+	length(Vector, V_Count),
+	debug(semrank, 'vector: ~w', [V_Count]),
+	page_rank(Vector, Weights, Scores0),
+	Scores0 = [_EigenVector|Scores],
+	assoc_to_list(Mapping, Pairs0),
+	transpose_pairs(Pairs0, Pairs1),
+	keysort(Pairs1, Pairs),
+	pairs_values(Pairs, Resources),
+	pairs_keys_values(Ranked0, Scores, Resources),
+	keysort(Ranked0, Ranked1),
+	reverse(Ranked1, Ranked2),
+	merge_scores(RankedTags, Ranked2, Ranked).
+
+
+merge_scores(R1, R2, R) :-
+	transpose_pairs(R1,R1_T),
+	transpose_pairs(R2,R2_T),
+	merge_tag_scores(R1_T,R2_T, Merged0),
+	transpose_pairs(Merged0, Merged),
+	keysort(Merged, Merged1),
+	reverse(Merged1, R).
+
+
+merge_tag_scores([], T, T).
+merge_tag_scores([R-S1|T1], [R-S2|T2], [R-S|T]) :-
+	!,
+	S is S1+S2,
+	merge_tag_scores(T1, T2, T).
+merge_tag_scores([R1-S1|T1], T2, [R1-S1|T]) :-
+	merge_tag_scores(T1, T2, T).
+
+
+tag_interpretations([], []).
+%tag_interpretations([W-_Tag|Tags], Pairs) :-
+%	W < 0.005,
+%	!,
+%	tag_interpretations(Tags, Pairs).
+tag_interpretations([_W-Tag|Tags], [Tag-Concepts|Pairs]) :-
+	rdf_equal(skos:'Concept', Type),
+	findall(C,
+		(reconcile(Tag, 10, Type, [], Hits),
+		 member(hit(D,C,_,_), Hits),
+		 D < 6
+		),
+		Concepts),
+	tag_interpretations(Tags, Pairs).
+
+page_rank(Vector, Weights, Rank) :-
+	<- library(igraph),
+	v <- Vector,
+	w <- Weights,
+	g <- graph(v),
+	Rank <- 'page.rank(g, directed=TRUE, weights = w)$vector'.
+
+tag_distance_graph(Tags, Interpretations, Graph) :-
+	 cartesian(Tags, Tags, Cartesian),
+	 pair_distances(Cartesian, Interpretations, Graph).
+
+
+pair_distances([], _, []).
+pair_distances([[A,A]|T], I, Graph) :-
+	!,
+	pair_distances(T, I, Graph).
+pair_distances([[A,B]|T], I, [Rel|Graph]) :-
+	Rel = i(A,B,Distance),
+	findall(D,
+		(   get_assoc(A, I, A_Concepts),
+		    get_assoc(B, I, B_Concepts),
+		    member(A_C, A_Concepts),
+		    member(B_C, B_Concepts),
+		    semantic_distance(A_C,B_C,D)
+		),
+		Ds0),
+	sort(Ds0, Ds),
+	reverse(Ds, [Distance|_]),
+	!,
+	pair_distances(T, I, Graph).
+pair_distances([_|T], I, Graph) :-
+	pair_distances(T, I, Graph).
+
+
+semantic_distance(A, B, D) :-
+	rdf_reachable(B, skos:broader, A, 5, N),
+	!,
+	D is 0.5^N.
+semantic_distance(A, B, D) :-
+	rdf_reachable(A, skos:broader, B, 2, N),
+	!,
+	D is 0.01^N.
+semantic_distance(A, B, D) :-
+	rdf_reachable(A, skos:broader, C, 2, N1),
+	A \== C,
+	rdf_reachable(B, skos:broader, C, 2, N2),
+	B \== C,
+	!,
+	N is max(N1, N2),
+	D is 0.01^N.
+semantic_distance(A, B, D) :-
+	rdf_reachable(A, skos:related, B, 2, N),
+	!,
+	D is 0.01^N.
+semantic_distance(A, B, D) :-
+	rdf(A, P, B),
+	\+ rdfs_subproperty_of(P, skos:broader),
+	\+ rdfs_subproperty_of(P, skos:narrower),
+	\+ rdfs_subproperty_of(P, skos:related),
+	!,
+	D is 0.01.
+/*
+semantic_distance(A, B, D) :-
+	rdf(A, _, C),
+	(   B == C
+	->  D = 0.5
+	;   rdf_reachable(B, skos:broader, C, 3, _)
+	->  D = 0.25
+	;   rdf_reachable(C, skos:broader, B, 3, _)
+	->  D = 0.5
+	).
+*/
+
+%%	semantic_distance(+L1, +L2, -Cartesian)
+%
+%	Cartesian is the cartesian product of L1 and L2.
+
+cartesian([], _L, []).
+cartesian([A|N], L, M) :-
+	pair(A,L,M1),
+	cartesian(N, L, M2),
+	append(M1, M2, M).
+
+pair(_A, [], []) :- !.
+pair(A, [B|L], [[A,B]|N] ) :-
+	pair(A, L, N).
+
+
+
+%%	weighted_graph_rename(+Graph, -Assoc, -NewGraph, -Weights)
+%
+%	Rename nodes so that they start counted by 1.
+
+weighted_graph_rename(Graph, Assoc, NewGraph, Weights) :-
+	empty_assoc(Assoc0),
+	rename_vertices(Graph, Assoc0, 1, NewGraph, Assoc, Weights).
+
+rename_vertices([], Assoc, _, [], Assoc, []).
+rename_vertices([i(C1,C2,W)|T], Assoc0, N, [NewC1,NewC2|Rest], Assoc, [W|Weights]) :-
+	rename_vertex(C1, Assoc0, N, Assoc1, N1, NewC1),
+	rename_vertex(C2, Assoc1, N1, Assoc2, N2, NewC2),
+	rename_vertices(T, Assoc2, N2, Rest, Assoc, Weights).
+
+rename_vertex(C, Assoc, N, Assoc, N, New) :-
+	get_assoc(C, Assoc, New),
+	!.
+rename_vertex(C, Assoc, N, Assoc1, N1, N) :-
+	N1 is N+1,
+	put_assoc(C, Assoc, N, Assoc1).
+
+
+
+
+mbh_semrank_eval_table(V) :-
+	(   m_video(V),
+	    rdf(V, dc:id, literal(Id)),
+	    video_mbh_terms(V, Terms),
+	    sem_tag_rank(V, Tags),
+	    length(Terms, TermCount),
+	    length(Tags, TagCount),
+
+	    (	TermCount is 0
+	    ->	format('~w', [Id])
+	    ;	TermCount2 is TermCount*2,
+
+		pairs_values(Tags, Tags1),
+		topN(Tags1, TermCount, Terms, TopN),
+		topN(Tags1, TermCount2, Terms, Top2N),
+		topN(Tags1, TagCount, Terms, All),
+
+		format('~w,\t~w,\t~w,\t~2f,\t~2f,\t~2f~n', [Id, TagCount, TermCount, TopN, Top2N, All])
+	    ),
+	    fail
+	;   true
+	).
+
+topN(Tags, N, Terms, TopN) :-
+	length(Terms, Term_Count),
+	length(Tags, Tag_Count),
+	(   Tag_Count > N
+	->  length(Top_N_Tags, N),
+	    append(Top_N_Tags, _, Tags)
+	;   Top_N_Tags = Tags
+	),
+	maplist(downcase_atom, Top_N_Tags, Top_N_Tags1),
+	sort(Top_N_Tags1, Top_N_Tags2),
+	ord_intersect(Top_N_Tags2, Terms, Intersect),
+	length(Intersect, Intersect_Count),
+	TopN is Intersect_Count/Term_Count.
diff --git a/lib/tfidf.pl b/lib/tfidf.pl
index f323d44..ea775ff 100644
--- a/lib/tfidf.pl
+++ b/lib/tfidf.pl
@@ -1,6 +1,7 @@
 :- module(tfidf,
 	  [flush_tag_rank/1,
 	   tag_rank/2,
+	   document_term/2,
 	   documents/1,
 	   tf/3,
 	   idf/3,
@@ -42,9 +43,9 @@ documents(Videos) :-
 document_term(D, T) :-
 	rdf(D, pprime:hasAnnotation, E),
 	rdf(E, rdf:value, literal(T)).
-	%rdf(E, pprime:score, literal(SA)),
-	%atom_number(SA, S),
-	%S > 5.
+%	rdf(E, pprime:score, literal(SA)),
+%	atom_number(SA, S),
+%	S > 5.
 
 tf(T, D, TF) :-
 	findall(A, rdf(D, pprime:hasAnnotation, A), As),