vumix/commit
methods to deal with collections in ptr ranking
author | Michiel Hildebrand |
---|---|
Fri May 25 16:32:03 2012 +0200 | |
committer | Michiel Hildebrand |
Fri May 25 16:32:03 2012 +0200 | |
commit | 5c7b50f8791ea0c6b5edc4d4251a2e2f1d1d2267 |
tree | dd2f7559df870df92a473b3c6aa0d35e576ed680 |
parent | ae4af67c8e2668a1f911a4ec61cd5e58f08a0881 |
Diff style: patch stat
diff --git a/lib/ptr.pl b/lib/ptr.pl index 5e79151..e5cba8a 100644 --- a/lib/ptr.pl +++ b/lib/ptr.pl @@ -1,11 +1,13 @@ :- module(ptr, - [ptr_rank/2, - ptr/3 + [create_collection/0, + c_median/1, + ptr_rank/2 ]). -:- use_module(library(video_annotation)). :- use_module(library(stop_words)). :- use_module(library(real)). +:- use_module(library(tag_concept), [baseline_video/1,m_video/1]). +%:- use_module(video_vectors). /* Probabilistic tag relevance @@ -13,52 +15,49 @@ http://www2009.eprints.org/36/1/p351.pdf */ -ptr_rank(Video, RankedTags) :- - set_prolog_stack(global, limit(512000000)), - findall(T, matched_video_tag(Video, literal(T)), Tags0), - sort(Tags0, Tags1), - remove_stop_words(Tags1, dutch, Tags), - length(Tags, TagCount), - debug(ptr, 'video: ~w, tags: ~w', [Video,TagCount]), - ptr_probs(Tags, Video, TagCount, Probs), - pairs_keys_values(Pairs, Probs, Tags), - keysort(Pairs, Pairs1), - reverse(Pairs1, RankedTags). +:- dynamic + c_vector/2, + median_cache/1. -ptr_probs([], _, _, []). -ptr_probs([Tag|Tags], Video, N, [Prob|Probs]) :- - N1 is N - 1, - debug(ptr_tag, '~w ~w', [N, Tag]), - ptr(Video, Tag, Prob), - debug(ptr_tag, '~f2~n', [Prob]), - debug(ptr_result, '~w ~f2', [Tag,Prob]), - ptr_probs(Tags, Video, N1, Probs). - -ptr(Video, Tag, Probability) :- - findall(V, matched_video_tag(V, literal(Tag)), Neighbour_Videos0), - sort(Neighbour_Videos0, Neighbour_Videos), - length(Neighbour_Videos, V_Count), - debug(ptr_tag, 'videos: ~w', [V_Count]), - tag_vectors([Video|Neighbour_Videos], [V_Vector|N_Vectors]), - median_distance([V_Vector|N_Vectors], Median), - debug(ptr_tag, 'median ~2f', [Median]), - kde(N_Vectors, V_Vector, V_Count, Median, Probability). +:- set_prolog_stack(global, limit(1240000000)). + /******************************* + * process collection * + *******************************/ -tag_vectors(Videos, Vectors) :- - findall(Tag, (member(V,Videos), - matched_video_tag(V, literal(Tag)) - ), - Tags0), - sort(Tags0, Tags1), - remove_stop_words(Tags1, dutch, Tags), +%% create_collection +% +% Create the collection on which we are going the run the PTR. + +create_collection :- + debug(ptr_collection), + retractall(c_vector(_,_)), + % collect videos and tags in the collection + findall(V, baseline_video(V), Videos), + collection_tags(Videos, Tags), + % output some stats + length(Videos, Video_Count), length(Tags, Tag_Count), - debug(ptr_tag, 'vector length ~w', [Tag_Count]), - video_vectors(Videos, Tags, Vectors), - debug(ptr_tag, 'created vectors', []). + debug(ptr_collection, '~w videos and ~w tags', [Video_Count, Tag_Count]), + % create a vector for each video + debug(ptr_collection, 'creating video vectors', []), + video_vectors(Videos, Tags, VideoVectors), + write_collection(VideoVectors). + +write_collection(VideoVectors) :- + File = 'video_vectors.pl', + debug(ptr_collection, 'write vectors to ~w', [File]), + open(File, write, Stream), + ( member(Video-Vector, VideoVectors), + assert(c_vector(Video, Vector)), + portray_clause(Stream, c_vector(Video, Vector)), + fail + ; true + ), + close(Stream). video_vectors([], _, []). -video_vectors([V|Vs], Tags, [Vector|Rest]) :- +video_vectors([V|Vs], Tags, [V-Vector|Rest]) :- findall(Tag, matched_video_tag(V, literal(Tag)), VTags0), sort(VTags0, VTags1), remove_stop_words(VTags1, dutch, VTags), @@ -73,56 +72,124 @@ v_zip([_Tag|Ts], VTs, [0|Vs]) :- v_zip(Ts, VTs, Vs). -kde(Neighbours, X, Video_Count, Median, Prob) :- - x <- X, - maplist(k(Median), Neighbours, Ks), +median_test :- + D <- [1,2], + x <- median(D), + print(x). + +c_median(M) :- + M = 12.64911, + %median_cache(M), + !. +c_median(Vectors) :- + findall(V, c_vector(_, V), Vectors), + pair_wise_distance(Vectors, Ds), + d <- Ds, + <- print(d), + M <- median(d), + assert(median_cache(M)). + +pair_wise_distance([], []). +pair_wise_distance([V|Vs], Ds) :- + maplist(vector_distance(V), Vs, D0), + pair_wise_distance(Vs, D1), + append(D0, D1, Ds). + +vector_distance(V1,V2,D) :- + v1 <- V1, + v2 <- V2, + D <- sqrt(sum(abs(v1-v2)^2)). + + + /******************************* + * ptr * + *******************************/ + +%% ptr_rank(+Video, -RankedTags) +% +% Ranked tags is a list of score-tag pairs. + +ptr_rank(Video, RankedTags) :- + video_tags(Video, Tags), + length(Tags, TagCount), + debug(ptr, 'video: ~w, tags: ~w', [Video,TagCount]), + ptr_probs(Tags, Video, Probs), + pairs_keys_values(Pairs, Probs, Tags), + keysort(Pairs, Pairs1), + reverse(Pairs1, RankedTags). + +ptr_probs([], _, []). +ptr_probs([Tag|Tags], Video, [Prob|Probs]) :- + ptr(Video, Tag, Prob), + debug(ptr, '~w ~f2', [Tag,Prob]), + ptr_probs(Tags, Video, Probs). + +ptr(Video, Tag, Probability) :- + findall(V, tag_of_c_video(literal(Tag), V), Neighbour_Videos0), + sort(Neighbour_Videos0, Neighbour_Videos), + length(Neighbour_Videos, V_Count0), + debug(ptr, 'videos: ~w', [V_Count0]), + V_Count is V_Count0+1, + c_vector(Video, V_Vector), + maplist(c_vector, Neighbour_Videos, N_Vectors), + kde(N_Vectors, V_Vector, V_Count, Probability). + +kde(Neighbours, X, Video_Count, Prob) :- + maplist(k(X), Neighbours, Ks), sumlist(Ks, K), Prob is (1/Video_Count) * K. -k(Median, N, K) :- +k(X, N, K) :- + % we could declare "x" and "m" once, as in R they are global vars in R, but I do not like global vars in prolog :( + c_median(Median), + m <- Median, + x <- X, n <- N, - % there is a bug in real R that makes (x-n)^2 into x-(n^2). I use abs() to explicitly force parenthesis - K <- exp(-(sqrt(sum(abs((x-n))^2))^2)/Median^2). + K <- exp(-(sqrt(sum((x-n)^2))^2)/m^2). -% r_in( Prob <- exp(-(sum(sqrt(abs((X - N))^2))^2)/ (Length/4)^2) -% ). + /******************************* + * video tag utilities * + *******************************/ -median_distance(Vs,Median) :- - perm_d(Vs, Ds), - length(Ds, Length), - sum_list(Ds, Total), - Median is (Total/Length)+1. +%% collection_tags(+Videos, -Tags) +% +% Tags are all matching tags in a collecition of Videos. -perm_d([], []). -perm_d([V|Vs], Ds) :- - maplist(v_distance(V), Vs, D0), - perm_d(Vs, D1), - append(D0, D1, Ds). +collection_tags(Videos, Tags) :- + findall(Tag, (member(Video,Videos), + matched_video_tag(Video, literal(Tag)) + ), + Tags0), + sort(Tags0, Tags1), + remove_stop_words(Tags1, dutch, Tags). +%% video_tags(+Video, -Tags) +% +% Tags are all matching tags in Video with stop words removed. -v_distance(V1,V2,D) :- - v1 <- V1, - v2 <- V2, - D <- sqrt(sum(abs(v1-v2)^2)). +video_tags(Video, Tags) :- + findall(T, matched_video_tag(Video, literal(T)), Tags0), + sort(Tags0, Tags1), + remove_stop_words(Tags1, dutch, Tags). %% matched_video_tag(?Video, ?Tag) % % Tag is an annotation of Video. matched_video_tag(Video, Tag) :- - ground(Tag), - !, - rdf(AnnotationId, rdf:value, Tag), rdf(Video, pprime:hasAnnotation, AnnotationId), + rdf(AnnotationId, rdf:value, Tag), rdf(AnnotationId, pprime:score, literal(ScoreA)), atom_number(ScoreA, Score), Score > 5. -matched_video_tag(Video, Tag) :- - rdf(Video, pprime:hasAnnotation, AnnotationId), + + + +tag_of_c_video(Tag, Video) :- rdf(AnnotationId, rdf:value, Tag), + rdf(Video, pprime:hasAnnotation, AnnotationId), + c_vector(Video, _), rdf(AnnotationId, pprime:score, literal(ScoreA)), atom_number(ScoreA, Score), Score > 5. - - diff --git a/lib/tag_concept.pl b/lib/tag_concept.pl index 18ec6b8..8394eb3 100644 --- a/lib/tag_concept.pl +++ b/lib/tag_concept.pl @@ -2,7 +2,9 @@ [tag_concept/3, video_tags/2, video_concepts/3, - derived_concepts/4 + derived_concepts/4, + m_video/1, + baseline_video/1 ]). @@ -26,8 +28,8 @@ baseline_video(V) :- - rdf(V, rdf:type, pprime:'Baseline'), - once(rdf(V, dc:subject, _)). + rdf(V, rdf:type, pprime:'Video'), + once(rdf(V, dc:subject, _, mbh_tag)). derived_concepts(Tags, Goal, Scheme, Concepts) :- findall(C, (member(_Score-Tag, Tags), @@ -534,15 +536,15 @@ cooccur(Tag1,Tag2,Score) :- m_video(V) :- - %baseline_video(V), - rdf(V, rdf:type, pprime:'Video'), + baseline_video(V), + %rdf(V, rdf:type, pprime:'Video'), multiple_players(V). multiple_players(V) :- findall(P,(rdf(V,pprime:hasAnnotation,A),rdf(A,pprime:creator,P)), Ps0), sort(Ps0, Ps), length(Ps,N), - N > 3. + N > 1.