vumix/commit

page rank on semantic distance graph

authorMichiel Hildebrand
Wed May 30 22:29:01 2012 +0200
committerMichiel Hildebrand
Wed May 30 22:29:01 2012 +0200
commitdc3b8287253e2ddebd7d7ba8c567585fcb30a5cc
treee8b0300d251379677c077c1ce41c29aa84c66c6a
parentbfe8583c08e18de5d6516f59ab572b8a75a6531b
Diff style: patch stat
diff --git a/applications/vumix_p0.pl b/applications/vumix_p0.pl
index 80331eb..8f3e0c9 100644
--- a/applications/vumix_p0.pl
+++ b/applications/vumix_p0.pl
@@ -24,13 +24,14 @@
 :- use_module(api(annotation)).
 :- use_module(library(user_process)).
 
-:- use_module(api(reconcile)).
-:- use_module(library(video_annotation)).
+%:- use_module(api(reconcile)).
+%:- use_module(library(video_annotation)).
 :- use_module(library(yaz_util)).
-:- use_module(library(stop_words)).
+%:- use_module(library(stop_words)).
 :- use_module(library(tfidf)).
-:- use_module(library(tag_concept)).
-
+%:- use_module(library(tag_concept)).
+:- use_module(library(semrank)).
+%:- use_module(library(real)).
 
 /***************************************************
 * http handlers
@@ -127,13 +128,16 @@ http_vumix_p0(Request) :-
 	;   logged_on(User, anonymous)
         ),
 	user_process(User, Target, _),
-	tag_rank(Target, Tags0),
-	remove_stop_words(Tags0, dutch, Tags),
+	%tfidf_rank(Target, Concepts),
+	c_semantic_distance_rank(Target, Concepts),
+	html_page(Target, Fields, Concepts).
+
+tfidf_rank(Target, Concepts) :-
+	tag_rank(Target, Tags),
 	link_tags_to_concepts(Tags, Concepts0),
 	keysort(Concepts0, Concepts1),
 	group_pairs_by_key(Concepts1, Concepts2),
-	pairs_sort_by_value_sum(Concepts2, Concepts),
-	html_page(Target, Fields, Concepts).
+	pairs_sort_by_value_sum(Concepts2, Concepts).
 
 %%	link_tags_to_concepts(+Tags, -Concepts)
 %
@@ -141,16 +145,16 @@ http_vumix_p0(Request) :-
 
 link_tags_to_concepts([], []).
 link_tags_to_concepts([Score-Tag|As], [Concept-Score|Rest]) :-
-	Score > 7,
+	%Score > 7,
 	rdf_has(Concept,rdfs:label,literal(exact(Tag),_)),
 	%reconcile(Tag, 3, _Type, [], Hits),
 	%member(hit(D,Concept,_,_), Hits),
 	%D < 3,
 	%\+ rdf(Concept,skos:inScheme,gtaa:'OnderwerpenBenG'), % hack, to remove duplication between onderwerpenBenG en onderwerpen
-	(   rdf(Concept,skos:inScheme,gtaa:'OnderwerpenBenG')
-	->  true
-	;   rdf(Concept,skos:scopeNote,_)
-	),  %hack. Concepts without a scopenote are dubious
+	%(   rdf(Concept,skos:inScheme,gtaa:'OnderwerpenBenG')
+	%->  true
+	%;   rdf(Concept,skos:scopeNote,_)
+	%),  %hack. Concepts without a scopenote are dubious
 	!,
 	link_tags_to_concepts(As, Rest).
 link_tags_to_concepts([_|As],  Rest) :-
diff --git a/lib/semrank.pl b/lib/semrank.pl
new file mode 100644
index 0000000..0e2cc6b
--- /dev/null
+++ b/lib/semrank.pl
@@ -0,0 +1,151 @@
+:- module(semrank,
+	  [r_semantic_distance_rank/2,
+	   c_semantic_distance_rank/2,
+	   semantic_distance_rank/2
+	  ]).
+
+:- use_module(library(semweb/rdf_db)).
+:- use_module(library(semweb/rdfs)).
+:- use_module(library(real)).
+:- use_module(library(tfidf)).
+:- use_module(api(reconcile)).
+
+:- dynamic
+	rank_cache/2.
+
+c_semantic_distance_rank(Video, Ranked) :-
+	rank_cache(Video, Ranked).
+c_semantic_distance_rank(Video, Ranked) :-
+	semantic_distance_rank(Video, Ranked),
+	assert(rank_cache(Video, Ranked)).
+
+r_semantic_distance_rank(Video, Ranked) :-
+	retractall(rank_cache(Video,_)),
+	semantic_distance_rank(Video, Ranked),
+	assert(rank_cache(Video, Ranked)).
+
+semantic_distance_rank(Video, Ranked) :-
+	tag_rank(Video, Tags),
+	tag_interpretations(Tags, I_Graph, Concepts),
+	length(Concepts, Concept_Count),
+	debug(semrank, 'concepts: ~w', [Concept_Count]),
+	semantic_distance_graph(Concepts, C_Graph),
+	append(I_Graph, C_Graph, Graph),
+	length(Graph, Edge_Count),
+	debug(semrank, 'distance graph size: ~w', [Edge_Count]),
+	weighted_graph_rename(Graph, Assoc, Vector, Weights),
+	length(Vector, V_Count),
+	debug(semrank, 'vector: ~w', [V_Count]),
+	page_rank(Vector, Weights, Scores0),
+	Scores0 = [_EigenVector|Scores],
+	assoc_to_list(Assoc, Pairs0),
+	transpose_pairs(Pairs0, Pairs1),
+	keysort(Pairs1, Pairs),
+	pairs_values(Pairs, Resources),
+	pairs_keys_values(Ranked0, Scores, Resources),
+	keysort(Ranked0, Ranked1),
+	reverse(Ranked1, Ranked).
+
+tag_interpretations(Tags, Edges, Concepts) :-
+	rdf_equal(skos:'Concept', Type),
+	findall(i(Tag,C,W),
+		(member(W-Tag,Tags),
+		 W > 0.01,
+		 reconcile(Tag, 10, Type, [], Hits),
+		 member(hit(D,C,_,_), Hits),
+		 D < 1.5
+		),
+		Edges),
+	findall(C, member(i(_,C,_), Edges), Cs),
+	sort(Cs, Concepts).
+
+page_rank(Vector, Weights, Rank) :-
+	<- library(igraph),
+	v <- Vector,
+	w <- Weights,
+	g <- graph(v),
+	Rank <- 'page.rank(g, weights = w)$vector'.
+
+
+semantic_distance_graph(Concepts, Graph) :-
+	 cartesian(Concepts, Concepts, Cartesian),
+	 pair_distances(Cartesian, Graph).
+
+
+pair_distances([], []).
+pair_distances([[A,A]|T], Graph) :-
+	!,
+	pair_distances(T, Graph).
+pair_distances([[A,B]|T], [Rel|Graph]) :-
+	Rel = i(A,B,Distance),
+	semantic_distance(A,B,Distance),
+	!,
+	pair_distances(T, Graph).
+pair_distances([_|T], Graph) :-
+	pair_distances(T, Graph).
+
+semantic_distance(A, B, _) :-
+	rdf_has(A, rdfs:label, L),
+	rdf_has(B, rdfs:label, L),
+	!,
+	fail. % do not add distance relations between diff senses of the same interpretation.
+semantic_distance(A, B, D) :-
+	rdf_reachable(B, skos:broader, A, 3, N),
+	!,
+	D is 0.5^N.
+semantic_distance(A, B, D) :-
+	rdf_reachable(A, skos:broader, B, 2, N),
+	!,
+	D is 0.001^N.
+semantic_distance(A, B, D) :-
+	rdf_reachable(A, skos:related, B, 2, N),
+	!,
+	D is 0.01^N.
+/*
+semantic_distance(A, B, D) :-
+	rdf(A, _, C),
+	(   B == C
+	->  D = 0.5
+	;   rdf_reachable(B, skos:broader, C, 3, _)
+	->  D = 0.25
+	;   rdf_reachable(C, skos:broader, B, 3, _)
+	->  D = 0.5
+	).
+*/
+
+%%	semantic_distance(+L1, +L2, -Cartesian)
+%
+%	Cartesian is the cartesian product of L1 and L2.
+
+cartesian([], _L, []).
+cartesian([A|N], L, M) :-
+	pair(A,L,M1),
+	cartesian(N, L, M2),
+	append(M1, M2, M).
+
+pair(_A, [], []) :- !.
+pair(A, [B|L], [[A,B]|N] ) :-
+	pair(A, L, N).
+
+
+
+%%	weighted_graph_rename(+Graph, -Assoc, -NewGraph, -Weights)
+%
+%	Rename nodes so that they start counted by 1.
+
+weighted_graph_rename(Graph, Assoc, NewGraph, Weights) :-
+	empty_assoc(Assoc0),
+	rename_vertices(Graph, Assoc0, 1, NewGraph, Assoc, Weights).
+
+rename_vertices([], Assoc, _, [], Assoc, []).
+rename_vertices([i(C1,C2,W)|T], Assoc0, N, [NewC1,NewC2|Rest], Assoc, [W|Weights]) :-
+	rename_vertex(C1, Assoc0, N, Assoc1, N1, NewC1),
+	rename_vertex(C2, Assoc1, N1, Assoc2, N2, NewC2),
+	rename_vertices(T, Assoc2, N2, Rest, Assoc, Weights).
+
+rename_vertex(C, Assoc, N, Assoc, N, New) :-
+	get_assoc(C, Assoc, New),
+	!.
+rename_vertex(C, Assoc, N, Assoc1, N1, N) :-
+	N1 is N+1,
+	put_assoc(C, Assoc, N, Assoc1).
diff --git a/lib/tag_concept.pl b/lib/tag_concept.pl
index f234417..0fe3b6d 100644
--- a/lib/tag_concept.pl
+++ b/lib/tag_concept.pl
@@ -413,7 +413,8 @@ concept_graph(IGraph, CGraph, Weights) :-
 interpretation_graph(Tags, Goal, Scheme, IGraph, Weights) :-
 	findall(i(Tag,C)-Score, (member(Score-Tag, Tags),
 				 call(Goal, Tag, C),
-				 once(rdf(C, skos:inScheme, Scheme))
+				 once(rdfs_individual_of(C, skos:'Concept'))
+				 %once(rdf(C, skos:inScheme, Scheme))
 				),
 		Pairs0),
 	sort(Pairs0, Pairs),
@@ -434,7 +435,7 @@ page_rank(Vector, Weights, Rank) :-
 	w <- Weights,
 	g <- graph(v),
 	%r_print( g ),
-	Rank <- 'page.rank(g, weights = w)$vector'.
+	Rank <- 'page.rank(g, directed = FALSE, weights = w)$vector'.
 	%r_in( 'page.rank'(g, vids = 'V'(g), directed = 'TRUE', damping = 0.2,
 		%	  weights = 'NULL', options = 'igraph.arpack.default') ),
 	%r_close.
@@ -610,3 +611,35 @@ term_rm_dot(T0,T) :-
 	;   T = T0
 	).
 
+
+
+		 /*******************************
+		 *	    mbh to GTAA		*
+		 *******************************/
+
+mbh_to_gtaa(Pairs) :-
+	video_mbh_terms(_, Terms),
+	length(Terms, Term_Count),
+	debug(mbh_gtaa, 'terms: ~w', [Term_Count]),
+	gtaa_candidates(Terms, Pairs).
+
+gtaa_candidates([], []).
+gtaa_candidates([Term|Ts], [Term-Rs|Rest]) :-
+	findall(C, (rdf_has(C, rdfs:label, literal(exact(Term),_)),
+		    once(rdfs_individual_of(C, skos:'Concept'))
+		   ),
+		Rs0),
+	sort(Rs0, Rs),
+	length(Rs, Concept_Count),
+	debug(mbh_gtaa, '~w :: ~w concepts', [Term, Concept_Count]),
+	gtaa_candidates(Ts, Rest).
+
+
+
+
+		 /*******************************
+		 *               C		*
+		 *******************************/
+
+
+
diff --git a/rdf/mbh_example.ttl b/rdf/mbh_example.ttl
index 4c38a42..6c259db 100644
--- a/rdf/mbh_example.ttl
+++ b/rdf/mbh_example.ttl
@@ -40,7 +40,8 @@ pprime:subjectAnnotation
     a :annotationField ;
     rdfs:label "Onderwerp"@nl ;
     dc:comment "voeg subject termen toe"@nl ;
-    :scheme <http://data.beeldengeluid.nl/gtaa/OnderwerpenBenG> ;
+    :scheme <http://purl.org/vocabularies/cornetto> ;
+    #:scheme <http://data.beeldengeluid.nl/gtaa/OnderwerpenBenG> ;
     :source "/api/autocomplete?q={query}&filter={\"scheme\":\"http://data.beeldengeluid.nl/gtaa/OnderwerpenBenG\"}" .
 
 pprime:onderwerpAnnotation
diff --git a/web/css/vumix.css b/web/css/vumix.css
index 5d933a1..87771c6 100644
--- a/web/css/vumix.css
+++ b/web/css/vumix.css
@@ -3,7 +3,7 @@
 	height: 100%;
 }
 #fields .bd {
-	height: 560px;
+	height: 600px;
 }
 #fields .ft {
 	border-top: 2px solid white;