vumix/commit

Fix idf score (remove duplicate findings of document)

author Michiel Hildebrand
Fri May 18 18:09:36 2012 +0200
committer Michiel Hildebrand
Fri May 18 18:09:36 2012 +0200
commit e228ed41f4ff18131555560d96b1dd82f9730ad4
tree 21f9501617d418b0aafd341f51fb7782de95cc83
parent 8898bc8c7fb378c7314ee2f71d0cb0845915c343
Diff style: patch stat
diff --git a/lib/tfidf.pl b/lib/tfidf.pl
index ed10ef5..27fb2d5 100644
--- a/lib/tfidf.pl
+++ b/lib/tfidf.pl
@@ -9,6 +9,7 @@
 
 :- use_module(library(csv)).
 :- use_module(library(semweb/rdf_db)).
+:- use_module(library(stop_words)).
 
 :- dynamic
 	tag_rank_cache/2.
@@ -22,7 +23,8 @@ tag_rank(Video, RankedTagList) :-
 tag_rank(Video, RankedTagList) :-
 	documents(Videos),
 	findall(T, document_term(Video, T), Ts0),
-	sort(Ts0, Ts),
+	sort(Ts0, Ts1),
+	remove_stop_words(Ts1, dutch, Ts),
 	maplist(tag_score(Video, Videos), Ts, Scored),
 	keysort(Scored, Sorted),
 	reverse(Sorted, RankedTagList),
@@ -54,7 +56,7 @@ idf(T, Collection, IDF) :-
 	length(Collection, CollectionSize),
 	findall(D,
 		(   member(D, Collection),
-		    document_term(T, D)
+		    once(document_term(T, D))
 		),
 		DT),
 	length(DT, DTCount),
@@ -62,9 +64,6 @@ idf(T, Collection, IDF) :-
 
 
 
-
-
-
 write_csv(File) :-
 	documents(Videos),
 	video_tags(Videos, Rows),