vumix/commit

page rank experiments

authorMichiel Hildebrand
Mon May 21 10:11:47 2012 +0200
committerMichiel Hildebrand
Mon May 21 10:11:47 2012 +0200
commit8eac0c86b2e2c12d8d3588c19f3097ba06867cbc
tree5a4b0841203aa7b645b6fa7c4b093cc9ce3a7dee
parente07369e562569ecf237237082de7b0c17ab13cae
Diff style: patch stat
diff --git a/lib/r_exp.pl b/lib/r_exp.pl
index d0d9d0e..aa0fbaa 100644
--- a/lib/r_exp.pl
+++ b/lib/r_exp.pl
@@ -1,9 +1,10 @@
 :- module(r_exp,
 	  []).
 
-:- use_module(library(real)).
+:- use_module(library('R')).
 :- use_module(library(tag_concept)).
 :- use_module(library(video_annotation)).
+:- use_module(library(stop_words)).
 
 %v_compare(V1, V2) :-
 %	video_gtaa_vector(V1, L1),
@@ -35,9 +36,22 @@ list_to_vector(T1, [_|T2], [0|V]) :-
 	list_to_vector(T1, T2, V).
 
 rtest :-
-        y <- rnorm(50),
-        <- y,
-        x <- rnorm(y),
-	<- x11(width=5,height=3.5),
-        <- plot(x,y),
-	devoff.
+	r_open,
+	y <- rnorm(50),
+	r_print( y ),
+	x <- rnorm(y),
+	r_in( x11(width=5,height=3.5) ),
+	r_in( plot(x,y) ),
+	write( 'Press Return to continue...' ), nl,
+	read_line_to_codes( user_input, _ ),
+	r_print( 'dev.off()' ),
+	r_close.
+
+
+pr_test :-
+	r_open,
+	r_lib(igraph),
+	g <- 'random.graph.game'(20, 5/20, directed='TRUE'),
+	r_print( g ),
+	r_in( 'page.rank'(g) ),
+	r_close.
diff --git a/lib/stop_words.pl b/lib/stop_words.pl
index 130cda3..6a958e3 100644
--- a/lib/stop_words.pl
+++ b/lib/stop_words.pl
@@ -1,5 +1,21 @@
 :- module(stop_words,
-	  [stop_word/2]).
+	  [remove_stop_words/3,
+	   stop_word/2]).
+
+
+
+remove_stop_words([], _, []).
+remove_stop_words([E|Tags], Lang, Filtered) :-
+	tag(E, Tag0),
+	downcase_atom(Tag0, Tag),
+	stop_word(Lang, Tag),
+	!,
+	remove_stop_words(Tags, Lang, Filtered).
+remove_stop_words([Tag|Tags], Lang, [Tag|Filtered]) :-
+	remove_stop_words(Tags, Lang, Filtered).
+
+tag(_-Tag, Tag) :- !.
+tag(Tag, Tag).
 
 :- multifile
 	stop_word/2.
diff --git a/lib/tag_concept.pl b/lib/tag_concept.pl
index a840175..44362ba 100644
--- a/lib/tag_concept.pl
+++ b/lib/tag_concept.pl
@@ -2,8 +2,7 @@
 	  [tag_concept/3,
 	   video_tags/2,
 	   video_concepts/3,
-	   derived_concepts/4,
-	   remove_stop_words/3
+	   derived_concepts/4
 	  ]).
 
 
@@ -11,10 +10,10 @@
 :- use_module(library(semweb/rdfs)).
 :- use_module(library(semweb/rdf_label)).
 :- use_module(library(yaz_util)).
-:- use_module(library(stop_words)).
 :- use_module(library(tfidf)).
 :- use_module(library(find_resource)).
 :- use_module(library(video_annotation)).
+:- use_module(library(stop_words)).
 
 :- rdf_meta
 	derived_concepts(+, :, r, -),
@@ -59,6 +58,7 @@ tag_concept(Stem, Tag, Concept) :-
 	->  Diff = 0.9
 	),
 	snowball(dutch, Tag, TagStem),
+	%sub_atom(TagStem, 0, _, 1, Prefix),
 	rdf_has(Concept,rdfs:label, literal(prefix(TagStem), Lit)),
 	rdf(Concept, skos:inScheme, gtaa:'GTAA'),
 	literal_text(Lit, Label),
@@ -67,7 +67,7 @@ tag_concept(Stem, Tag, Concept) :-
 	;   snowball(dutch, Label, LabelStem),
 	    find_resource:literal_distance(TagStem, LabelStem, D),
 	    %isub(TagStem, Label, true, Sim),
-	    D < 5
+	    D < 4
 	    %Sim > Diff.
 	).
 
@@ -332,11 +332,154 @@ list_concepts(Id, Type, Goal) :-
 	).
 
 
-remove_stop_words([], _, []).
-remove_stop_words([_Rank-Tag0|Tags], Lang, Filtered) :-
-	downcase_atom(Tag0, Tag),
-	stop_word(Lang, Tag),
-	!,
-	remove_stop_words(Tags, Lang, Filtered).
-remove_stop_words([Tag|Tags], Lang, [Tag|Filtered]) :-
-	remove_stop_words(Tags, Lang, Filtered).
+
+		 /*******************************
+		 *               C		*
+		 *******************************/
+
+:- use_module(library('R')).
+
+
+concept_rank(Video, Ranked) :-
+	Goal = tag_concept(stem),
+	rdf_equal(Scheme, gtaa:'OnderwerpenBenG'),
+	video_concept_graph(Video, Scheme, Goal, Graph, Weights),
+	graph_rename(Graph, Assoc, Vector),
+	%show_graph(Vector),
+	page_rank(Vector, Weights, Scores0), % why do I have length(Vector)+1 scores???
+	Scores0 = [_|Scores],
+	length(Scores, ScoreCount),
+	debug(c_graph, 'scores ~w', [ScoreCount]),
+	assoc_to_list(Assoc, Pairs0),
+	transpose_pairs(Pairs0, Pairs1),
+	keysort(Pairs1, Pairs),
+	pairs_values(Pairs, Concepts),
+	length(Concepts, ConceptCount),
+	debug(c_graph, 'concepts ~w', [ConceptCount]),
+	pairs_keys_values(Ranked0, Scores, Concepts),
+	keysort(Ranked0, Ranked1),
+	reverse(Ranked1, Ranked).
+
+concept_clusters(Video, Clusters) :-
+	Goal = tag_concept(stem),
+	rdf_equal(Scheme, gtaa:'OnderwerpenBenG'),
+	video_concept_graph(Video, Scheme, Goal, Graph, _Weights),
+	graph_rename(Graph, Assoc, Vector),
+	graph_cluster(Vector, Membership0),
+	Membership0 = [_|Membership],
+	length(Membership, ClusterCount),
+	debug(c_graph, 'clusters ~w', ClusterCount),
+	assoc_to_list(Assoc, Pairs0),
+	transpose_pairs(Pairs0, Pairs1),
+	keysort(Pairs1, Pairs),
+	pairs_values(Pairs, Concepts),
+	length(Concepts, ConceptCount),
+	debug(c_graph, 'concepts ~w', [ConceptCount]),
+	pairs_keys_values(Clusters0, Membership, Concepts),
+	keysort(Clusters0, Clusters).
+
+video_concept_graph(Video, Scheme, Goal, Graph, Weights) :-
+	tag_rank(Video, RankedTagList),
+	length(RankedTagList, Tag_Count),
+	debug(c_graph, '~w tags', [Tag_Count]),
+	interpretation_graph(RankedTagList, Goal, Scheme, IGraph, Weights0),
+	concept_graph(IGraph, CGraph, Weights1),
+	append(IGraph, CGraph, Graph),
+	append(Weights0, Weights1, Weights),
+	length(Graph, Edge_Count),
+	debug(c_graph, '~w edges', [Edge_Count]).
+
+
+%concept_graph(_, [], []).
+concept_graph(IGraph, CGraph, Weights) :-
+	findall(C, member(i(_T,C), IGraph), Cs0),
+	sort(Cs0, Cs),
+	length(Cs, Concept_Count),
+	debug(c_graph, '~w interpretations', [Concept_Count]),
+	findall(i(C,C1)-Score, (member(C, Cs),
+				expand_graph(C,C1,Score)
+			       ),
+		Pairs),
+	pairs_keys_values(Pairs, CGraph, Weights).
+
+
+interpretation_graph(Tags, Goal, Scheme, IGraph, Weights) :-
+	findall(i(Tag,C)-Score, (member(Score-Tag, Tags),
+				 call(Goal, Tag, C),
+				 once(rdf(C, skos:inScheme, Scheme))
+				),
+		Pairs0),
+	sort(Pairs0, Pairs),
+	pairs_keys_values(Pairs, IGraph, Weights).
+
+graph_cluster(Vector, Clusters) :-
+	r_open,
+	r_lib(igraph),
+	v <- Vector,
+	g <- graph(v),
+	Clusters <- 'clusters(g)$membership',
+	r_close.
+
+page_rank(Vector, Weights, Rank) :-
+	r_open,
+	r_lib(igraph),
+	v <- Vector,
+	w <- Weights,
+	g <- graph(v),
+	%r_print( g ),
+	Rank <- 'page.rank(g, weights = w)$vector',
+	%r_in( 'page.rank'(g, vids = 'V'(g), directed = 'TRUE', damping = 0.2,
+		%	  weights = 'NULL', options = 'igraph.arpack.default') ),
+	r_close.
+
+show_graph(Vector) :-
+	r_open,
+	r_lib(igraph),
+	v <- Vector,
+	g <- graph(v),
+	r_print( g ),
+	r_in( tkplot( g ) ),
+	write( 'Press Return to continue...' ), nl,
+	read_line_to_codes( user_input, _ ),
+	r_print( 'dev.off()' ),
+	r_close.
+
+
+expand_graph(C, C1, 0.1) :-
+	rdf(C, skos:related, C1).
+expand_graph(C, C1, 0.1) :-
+	rdf(C, skos:broader, C1).
+expand_graph(C1, C, 0.1) :-
+	rdf_reachable(C1, skos:broader, C).
+
+
+
+%%	graph_rename(+Graph, -Assoc, -NewGraph)
+%
+%	Rename nodes so that they start counted by 1.
+
+graph_rename(Graph, Assoc, NewGraph) :-
+	empty_assoc(Assoc0),
+	rename_vertices(Graph, Assoc0, 1, NewGraph, Assoc).
+
+rename_vertices([], Assoc, _, [], Assoc).
+rename_vertices([i(C1,C2)|T], Assoc0, N, [NewC1,NewC2|Rest], Assoc) :-
+	rename_vertex(C1, Assoc0, N, Assoc1, N1, NewC1),
+	rename_vertex(C2, Assoc1, N1, Assoc2, N2, NewC2),
+	rename_vertices(T, Assoc2, N2, Rest, Assoc).
+
+rename_vertex(C, Assoc, N, Assoc, N, New) :-
+	get_assoc(C, Assoc, New),
+	!.
+rename_vertex(C, Assoc, N, Assoc1, N1, N) :-
+	N1 is N+1,
+	put_assoc(C, Assoc, N, Assoc1).
+
+
+v_count(Graph, Count) :-
+	findall(V, (member(i(V,_), Graph)
+		   ;member(i(_,V), Graph)
+		   ),
+		Vs0),
+	sort(Vs0, Vs),
+	length(Vs, Count).