:- module(tfidf,
          [ flush_tag_rank/1,
            tag_rank/2,
            document_term/2,
            documents/1,
            tf/3,
            idf/3,
            link_tags_to_concepts/3
          ]).

/** <module> TF-IDF ranking of video annotation tags

Ranks the tags attached to a video (through pprime:hasAnnotation /
rdf:value triples) by term frequency times inverse document frequency,
caches the ranking per video, and can link ranked tags to concepts that
carry the tag as an rdfs:label.

NOTE(review): the `pprime` and `dc` RDF prefixes are not registered in
this file; they are assumed to be registered before it is loaded.
*/

:- use_module(library(csv)).
:- use_module(library(semweb/rdf_db)).
:- use_module(library(stop_words)).

:- dynamic
    tag_rank_cache/2.

%!  flush_tag_rank(+Video) is det.
%
%   Drop every cached ranking stored for Video. Leaving Video unbound
%   clears the whole cache, because retractall/1 unifies with every
%   tag_rank_cache/2 fact.

flush_tag_rank(Video) :-
    retractall(tag_rank_cache(Video, _)).

%!  tag_rank(+Video, -RankedTagList)
%
%   RankedTagList is a list of Score-Tag pairs for the distinct tags of
%   Video, highest score first. The result is cached in
%   tag_rank_cache/2; call flush_tag_rank/1 to force recomputation.

tag_rank(Video, RankedTagList) :-
    tag_rank_cache(Video, RankedTagList),
    !.
tag_rank(Video, RankedTagList) :-
    documents(Videos),
    findall(T, document_term(Video, T), Ts0),
    sort(Ts0, Ts1),                         % score each distinct tag once
    % presumably filters Dutch stop words; defined in library(stop_words)
    remove_stop_words(Ts1, dutch, Ts),
    maplist(tag_score(Video, Videos), Ts, Scored),
    keysort(Scored, Sorted),                % ascending by score ...
    reverse(Sorted, RankedTagList),         % ... so flip to descending
    assertz(tag_rank_cache(Video, RankedTagList)).

%!  tag_score(+D, +Collection, +Tag, -Pair)
%
%   Pair is Score-Tag, where Score is tf(Tag, D) * idf(Tag, Collection).

tag_score(D, Collection, Tag, Score-Tag) :-
    tf(Tag, D, TF),
    idf(Tag, Collection, IDF),
    Score is TF*IDF.

%!  documents(-Videos) is det.
%
%   Videos is the list of all resources with rdf:type pprime:'Video'.

documents(Videos) :-
    findall(V, rdf(V, rdf:type, pprime:'Video'), Videos).

%!  document_term(?D, ?T) is nondet.
%
%   True when document D has an annotation whose rdf:value literal is T.
%   Argument order is (Document, Term).

document_term(D, T) :-
    rdf(D, pprime:hasAnnotation, E),
    rdf(E, rdf:value, literal(T)).
    % Disabled score filter, kept for reference:
    % rdf(E, pprime:score, literal(SA)),
    % atom_number(SA, S),
    % S > 5.

%!  tf(+T, +D, -TF)
%
%   TF is the term frequency of T in D: the number of annotations of D
%   whose value is T, divided by the total number of annotations of D.
%   Annotations without an rdf:value still count towards the total.
%   A document without any annotation yields TF = 0 instead of raising
%   a division-by-zero error.

tf(T, D, TF) :-
    findall(A, rdf(D, pprime:hasAnnotation, A), As),
    findall(T, document_term(D, T), Ts),
    length(As, Total),
    length(Ts, Occ),
    (   Total =:= 0
    ->  TF = 0
    ;   TF is Occ/Total
    ).

%!  idf(+T, +Collection, -IDF)
%
%   IDF is log(N / (1 + DF)), where N is the length of Collection and DF
%   is the number of documents in Collection that carry term T at least
%   once. The "1 +" keeps the denominator non-zero for unseen terms; it
%   also means IDF is negative for a term present in every document.
%   Note: with an empty Collection the argument of log/1 is 0.

idf(T, Collection, IDF) :-
    length(Collection, CollectionSize),
    findall(D,
            (   member(D, Collection),
                % document_term/2 takes (Document, Term); once/1 makes
                % each document count at most once.
                once(document_term(D, T))
            ),
            DT),
    length(DT, DTCount),
    IDF is log(CollectionSize/(1+DTCount)).

%!  write_csv(+File)
%
%   Write one row(Id, Tags) per video to File using csv_write_file/2.
%   Fails if any video lacks a dc:id literal (see video_tags/2).

write_csv(File) :-
    documents(Videos),
    video_tags(Videos, Rows),
    csv_write_file(File, Rows).

%!  video_tags(+Videos, -Rows)
%
%   Rows holds a row(Id, TagA) term for each video, where Id is the
%   video's dc:id literal and TagA is all of its tags (duplicates
%   included) joined by single spaces into one atom.

video_tags([], []).
video_tags([Video|Vs], [row(Id,TagA)|Rs]) :-
    rdf(Video, dc:id, literal(Id)),
    findall(T, document_term(Video, T), Tags),
    atomic_list_concat(Tags, ' ', TagA),
    video_tags(Vs, Rs).

%!  link_tags_to_concepts(+ScoredTags, +Type, -ScoredConcepts)
%
%   Map each Score-Tag pair to Score-Concept, where Concept is the first
%   resource found that has Tag as an exact rdfs:label (via rdf_has/3).
%   Tags without a matching concept are dropped. Type is only passed
%   along to the recursive call; the matching that used it is disabled.

link_tags_to_concepts([], _, []).
link_tags_to_concepts([Score-Tag|As], Type, [Score-Concept|Rest]) :-
    rdf_has(Concept, rdfs:label, literal(exact(Tag), _)),
    % Disabled alternative matching, kept for reference:
    %reconcile(Value, 1, Type, [], [hit(D,Concept,_,_)]),
    %D < 10,
    !,
    link_tags_to_concepts(As, Type, Rest).
link_tags_to_concepts([_|As], Type, Rest) :-
    link_tags_to_concepts(As, Type, Rest).