vumix/commit

methods to deal with collections in ptr ranking

authorMichiel Hildebrand
Fri May 25 16:32:03 2012 +0200
committerMichiel Hildebrand
Fri May 25 16:32:03 2012 +0200
commit5c7b50f8791ea0c6b5edc4d4251a2e2f1d1d2267
treedd2f7559df870df92a473b3c6aa0d35e576ed680
parentae4af67c8e2668a1f911a4ec61cd5e58f08a0881
Diff style: patch stat
diff --git a/lib/ptr.pl b/lib/ptr.pl
index 5e79151..e5cba8a 100644
--- a/lib/ptr.pl
+++ b/lib/ptr.pl
@@ -1,11 +1,13 @@
 :- module(ptr,
-	  [ptr_rank/2,
-	   ptr/3
+	  [create_collection/0,
+	   c_median/1,
+	   ptr_rank/2
 	  ]).
 
-:- use_module(library(video_annotation)).
 :- use_module(library(stop_words)).
 :- use_module(library(real)).
+:- use_module(library(tag_concept), [baseline_video/1,m_video/1]).
+%:- use_module(video_vectors).
 
 /* Probabilistic tag relevance
 
@@ -13,52 +15,49 @@ http://www2009.eprints.org/36/1/p351.pdf
 
 */
 
-ptr_rank(Video, RankedTags) :-
-	set_prolog_stack(global, limit(512000000)),
-	findall(T, matched_video_tag(Video, literal(T)), Tags0),
-	sort(Tags0, Tags1),
-	remove_stop_words(Tags1, dutch, Tags),
-	length(Tags, TagCount),
-	debug(ptr, 'video: ~w, tags: ~w', [Video,TagCount]),
-	ptr_probs(Tags, Video, TagCount, Probs),
-	pairs_keys_values(Pairs, Probs, Tags),
-	keysort(Pairs, Pairs1),
-	reverse(Pairs1, RankedTags).
+:- dynamic
+	c_vector/2,
+	median_cache/1.
 
-ptr_probs([], _, _, []).
-ptr_probs([Tag|Tags], Video, N, [Prob|Probs]) :-
-	N1 is N - 1,
-	debug(ptr_tag, '~w ~w', [N, Tag]),
-	ptr(Video, Tag, Prob),
-	debug(ptr_tag, '~f2~n', [Prob]),
-	debug(ptr_result, '~w ~f2', [Tag,Prob]),
-	ptr_probs(Tags, Video, N1, Probs).
-
-ptr(Video, Tag, Probability) :-
-	findall(V, matched_video_tag(V, literal(Tag)), Neighbour_Videos0),
-	sort(Neighbour_Videos0, Neighbour_Videos),
-	length(Neighbour_Videos, V_Count),
-	debug(ptr_tag, 'videos: ~w', [V_Count]),
-	tag_vectors([Video|Neighbour_Videos], [V_Vector|N_Vectors]),
-	median_distance([V_Vector|N_Vectors], Median),
-	debug(ptr_tag, 'median ~2f', [Median]),
-	kde(N_Vectors, V_Vector, V_Count, Median, Probability).
+:- set_prolog_stack(global, limit(1240000000)).
 
+		 /*******************************
+		 *   process collection		*
+		 *******************************/
 
-tag_vectors(Videos, Vectors) :-
-	findall(Tag, (member(V,Videos),
-		      matched_video_tag(V, literal(Tag))
-		     ),
-		Tags0),
-	sort(Tags0, Tags1),
-	remove_stop_words(Tags1, dutch, Tags),
+%%	create_collection
+%
+%	Create the collection on which we are going the run the PTR.
+
+create_collection :-
+	debug(ptr_collection),
+	retractall(c_vector(_,_)),
+	% collect videos and tags in the collection
+	findall(V, baseline_video(V), Videos),
+	collection_tags(Videos, Tags),
+	% output some stats
+	length(Videos, Video_Count),
 	length(Tags, Tag_Count),
-	debug(ptr_tag, 'vector length ~w', [Tag_Count]),
-	video_vectors(Videos, Tags, Vectors),
-	debug(ptr_tag, 'created vectors', []).
+	debug(ptr_collection, '~w videos and ~w tags', [Video_Count, Tag_Count]),
+	% create a vector for each video
+	debug(ptr_collection, 'creating video vectors', []),
+	video_vectors(Videos, Tags, VideoVectors),
+	write_collection(VideoVectors).
+
+write_collection(VideoVectors) :-
+	File = 'video_vectors.pl',
+	debug(ptr_collection, 'write vectors to ~w', [File]),
+	open(File, write, Stream),
+	(   member(Video-Vector, VideoVectors),
+	    assert(c_vector(Video, Vector)),
+	    portray_clause(Stream, c_vector(Video, Vector)),
+	    fail
+	;   true
+	),
+	close(Stream).
 
 video_vectors([], _, []).
-video_vectors([V|Vs], Tags, [Vector|Rest]) :-
+video_vectors([V|Vs], Tags, [V-Vector|Rest]) :-
 	findall(Tag, matched_video_tag(V, literal(Tag)), VTags0),
 	sort(VTags0, VTags1),
 	remove_stop_words(VTags1, dutch, VTags),
@@ -73,56 +72,124 @@ v_zip([_Tag|Ts], VTs, [0|Vs]) :-
 	v_zip(Ts, VTs, Vs).
 
 
-kde(Neighbours, X, Video_Count, Median, Prob) :-
-	x <- X,
-	maplist(k(Median), Neighbours, Ks),
+median_test :-
+	D <- [1,2],
+	x <- median(D),
+	print(x).
+
+c_median(M) :-
+	M = 12.64911,
+	%median_cache(M),
+	!.
+c_median(Vectors) :-
+	findall(V, c_vector(_, V), Vectors),
+	pair_wise_distance(Vectors, Ds),
+	d <- Ds,
+	<- print(d),
+	M <- median(d),
+	assert(median_cache(M)).
+
+pair_wise_distance([], []).
+pair_wise_distance([V|Vs], Ds) :-
+	maplist(vector_distance(V), Vs, D0),
+	pair_wise_distance(Vs, D1),
+	append(D0, D1, Ds).
+
+vector_distance(V1,V2,D) :-
+	v1 <- V1,
+	v2 <- V2,
+	D <- sqrt(sum(abs(v1-v2)^2)).
+
+
+		 /*******************************
+		 *	       ptr		*
+		 *******************************/
+
+%%	ptr_rank(+Video, -RankedTags)
+%
+%	Ranked tags is a list of score-tag pairs.
+
+ptr_rank(Video, RankedTags) :-
+	video_tags(Video, Tags),
+	length(Tags, TagCount),
+	debug(ptr, 'video: ~w, tags: ~w', [Video,TagCount]),
+	ptr_probs(Tags, Video, Probs),
+	pairs_keys_values(Pairs, Probs, Tags),
+	keysort(Pairs, Pairs1),
+	reverse(Pairs1, RankedTags).
+
+ptr_probs([], _, []).
+ptr_probs([Tag|Tags], Video, [Prob|Probs]) :-
+	ptr(Video, Tag, Prob),
+	debug(ptr, '~w ~f2', [Tag,Prob]),
+	ptr_probs(Tags, Video, Probs).
+
+ptr(Video, Tag, Probability) :-
+	findall(V, tag_of_c_video(literal(Tag), V), Neighbour_Videos0),
+	sort(Neighbour_Videos0, Neighbour_Videos),
+	length(Neighbour_Videos, V_Count0),
+	debug(ptr, 'videos: ~w', [V_Count0]),
+	V_Count is V_Count0+1,
+	c_vector(Video, V_Vector),
+	maplist(c_vector, Neighbour_Videos, N_Vectors),
+	kde(N_Vectors, V_Vector, V_Count, Probability).
+
+kde(Neighbours, X, Video_Count, Prob) :-
+	maplist(k(X), Neighbours, Ks),
 	sumlist(Ks, K),
 	Prob is (1/Video_Count) * K.
 
-k(Median, N, K) :-
+k(X, N, K) :-
+	% we could declare "x" and "m" once, as in R they are global vars in R, but I do not like global vars in prolog :(
+	c_median(Median),
+	m <- Median,
+	x <- X,
 	n <- N,
-	% there is a bug in real R that makes (x-n)^2 into x-(n^2). I use abs() to explicitly force parenthesis
-	K <- exp(-(sqrt(sum(abs((x-n))^2))^2)/Median^2).
+	K <- exp(-(sqrt(sum((x-n)^2))^2)/m^2).
 
-%	r_in( Prob <- exp(-(sum(sqrt(abs((X - N))^2))^2)/ (Length/4)^2)
-%	).
 
+		 /*******************************
+		 *  video tag utilities		*
+		 *******************************/
 
-median_distance(Vs,Median) :-
-	perm_d(Vs, Ds),
-	length(Ds, Length),
-	sum_list(Ds, Total),
-	Median is (Total/Length)+1.
+%%	collection_tags(+Videos, -Tags)
+%
+%	Tags are all matching tags in a collecition of Videos.
 
-perm_d([], []).
-perm_d([V|Vs], Ds) :-
-	maplist(v_distance(V), Vs, D0),
-	perm_d(Vs, D1),
-	append(D0, D1, Ds).
+collection_tags(Videos, Tags) :-
+	findall(Tag, (member(Video,Videos),
+		      matched_video_tag(Video, literal(Tag))
+		     ),
+		Tags0),
+	sort(Tags0, Tags1),
+	remove_stop_words(Tags1, dutch, Tags).
 
+%%	video_tags(+Video, -Tags)
+%
+%	Tags are all matching tags in Video with stop words removed.
 
-v_distance(V1,V2,D) :-
-	v1 <- V1,
-	v2 <- V2,
-	D <- sqrt(sum(abs(v1-v2)^2)).
+video_tags(Video, Tags) :-
+	findall(T, matched_video_tag(Video, literal(T)), Tags0),
+	sort(Tags0, Tags1),
+	remove_stop_words(Tags1, dutch, Tags).
 
 %%	matched_video_tag(?Video, ?Tag)
 %
 %	Tag is an annotation of Video.
 
 matched_video_tag(Video, Tag) :-
-	ground(Tag),
-	!,
-	rdf(AnnotationId, rdf:value, Tag),
 	rdf(Video, pprime:hasAnnotation, AnnotationId),
+	rdf(AnnotationId, rdf:value, Tag),
 	rdf(AnnotationId, pprime:score, literal(ScoreA)),
 	atom_number(ScoreA, Score),
 	Score > 5.
-matched_video_tag(Video, Tag) :-
-	rdf(Video, pprime:hasAnnotation, AnnotationId),
+
+
+
+tag_of_c_video(Tag, Video) :-
 	rdf(AnnotationId, rdf:value, Tag),
+	rdf(Video, pprime:hasAnnotation, AnnotationId),
+	c_vector(Video, _),
 	rdf(AnnotationId, pprime:score, literal(ScoreA)),
 	atom_number(ScoreA, Score),
 	Score > 5.
-
-
diff --git a/lib/tag_concept.pl b/lib/tag_concept.pl
index 18ec6b8..8394eb3 100644
--- a/lib/tag_concept.pl
+++ b/lib/tag_concept.pl
@@ -2,7 +2,9 @@
 	  [tag_concept/3,
 	   video_tags/2,
 	   video_concepts/3,
-	   derived_concepts/4
+	   derived_concepts/4,
+	   m_video/1,
+	   baseline_video/1
 	  ]).
 
 
@@ -26,8 +28,8 @@
 
 
 baseline_video(V) :-
-	rdf(V, rdf:type, pprime:'Baseline'),
-	once(rdf(V, dc:subject, _)).
+	rdf(V, rdf:type, pprime:'Video'),
+	once(rdf(V, dc:subject, _, mbh_tag)).
 
 derived_concepts(Tags, Goal, Scheme, Concepts) :-
 	findall(C, (member(_Score-Tag, Tags),
@@ -534,15 +536,15 @@ cooccur(Tag1,Tag2,Score) :-
 
 
 m_video(V) :-
-	%baseline_video(V),
-	rdf(V, rdf:type, pprime:'Video'),
+	baseline_video(V),
+	%rdf(V, rdf:type, pprime:'Video'),
 	multiple_players(V).
 
 multiple_players(V) :-
 	findall(P,(rdf(V,pprime:hasAnnotation,A),rdf(A,pprime:creator,P)), Ps0),
 	sort(Ps0, Ps),
 	length(Ps,N),
-	N > 3.
+	N > 1.