cluster_search/commit
Added utils file with subset selection and saving to csv.
author | Chris Dijkshoorn |
---|---|
Fri Apr 18 17:52:21 2014 +0200 | |
committer | Chris Dijkshoorn |
Fri Apr 18 17:52:21 2014 +0200 | |
commit | b978d085777749ff57e26e430307e5b37d17ef3f |
tree | d696c69cb7cb1127782dc5809b1067fbbffd196a |
parent | 35247ebb150feedb525e43d1e33b3cda15f10996 |
Diff style: patch stat
diff --git a/config-available/cluster_search.pl b/config-available/cluster_search.pl index aa5a2d8..adbe748 100644 --- a/config-available/cluster_search.pl +++ b/config-available/cluster_search.pl @@ -9,4 +9,8 @@ :- rdf_attach_library(cluster_search(rdf)). :- rdf_load_library('cluster-search-ui-schema'). +% Load utils +:- ['cpack/cluster_search/lib/cluster_search/log_analysis']. +:- ['cpack/cluster_search/lib/cluster_search/utils']. + :- use_module(api(cluster_search)). diff --git a/lib/cluster_search/log_analysis.pl b/lib/cluster_search/log_analysis.pl new file mode 100644 index 0000000..ea296ae --- /dev/null +++ b/lib/cluster_search/log_analysis.pl @@ -0,0 +1,195 @@ +:- module(log, [analyse_csv_log/2]). +/** <module> Log analysis + +This module is for log analysis. The input is an api url or a csv file. +A command to annalyse a csv file can be given in the following manner: + +analyse_csv_log('../data/queries.csv','../out/analysis_results.csv'). + +CSV files should be formatted according to the Google Analytics output +with the following columns: + + * Query + * Total unique searches + * Results on page/search + * % Exit point + * % Query refinement + * Time after search + * Search depth + +Example row: +vermeer,955,"2,41","18,85%","11,98%",00:17:37,"5,73" + +The term after reading the csv: +query(vermeer,955,"2,41","18,85%","11,98%",00:17:37,"5,73") +query(_Q,_S,_P,_E,_R,_T,_D) +The filtered term: +query(Q, [number(Number)]) +*/ +:- use_module(library(csv)). + +%% analyse_csv_log(+InputCSVPath, -OutputCSV) +% +% Reads csv file, analysis it and outputs a csv file. Doesnt +% convert the csv values to numbers, enabling better matching. +analyse_csv_log(File, OutputFilePath) :- + csv_read_file(File, Queries, [functor(query), convert(false)]), + format('Read ~p~n',[File]), + maplist(filter_analytics, Queries, FilteredQueries), + debug(analyse_data, 'Filtered ~p',[FilteredQueries]), + query_analysis(FilteredQueries, QueryAnalysisResults), + debug(analyse_data, 'Analyzed ~p',[QueryAnalysisResults]), + meta_analysis(QueryAnalysisResults), + write_output(csv, OutputFilePath, QueryAnalysisResults). + + +filter_analytics(query(Query,Atom,_Pa,_E,_R,_T,_D), results{query:Query,number:Number}) :- + atom_number(Atom, Number). + +%% query_analysis(+Queries, QueryAnalysisResults) +% +% Constructs a list of analysis topics and the results of the +% analysis +% +% * Direct match +% * Number of direct matches +query_analysis(Queries, QueryAnalysisResults) :- + maplist(match_literal, Queries, Queries1), + maplist(match_number_literal, Queries1, Queries2), + maplist(match_literal_iconclass, Queries2, QueryAnalysisResults). + +%% match_literal(+Query, -Result) +% +% Direct match query with literals. +match_literal(Dic, Dic.put(direct_match, Match)) :- + Query = Dic.query, + ( rdf(_S,_P,literal(Query)) + -> Match = true, + debug(match, 'Matched ~p found match: ~p', [Query, Match]) + ; Match = false + ). + +%% match_literal_iconclass(+Query, -Result) +% +% Direct match query with literals. +match_literal_iconclass(Dic, Dic.put(direct_match_ic, Match)) :- + Query = Dic.query, + ( rdf(S,_P,literal(Query),'file:///Users/rasvaan/search/rdf/vocs/iconclass/iconclass.20121019.ttl') + -> Match = true, + debug(match_ic, 'Matched ~p on ~p found match: ~p', [S, Query, Match]) + ; Match = false + ). + + +match_number_literal(Dic, Dic.put(number_matches, NumberMatches)) :- + Query = Dic.query, + %TODO: are we counting the correct thing? + findall(Subject, rdf(Subject,_P,literal(Query)), MatchedSubjects), + length(MatchedSubjects, NumberMatches), + debug(match, 'Matched ~p found matches: ~p', [Query, MatchedSubjects]). + +% presence_in_vocabularies(query(Query,Searches,_P,_E,_R,_T,_D), query(Query, Searches, Literals)) :- +% find_literals(Query, Literals, []). + +% presence_in_vocabularies(query(Query,Searches,_P,_E,_R,_T,_D), query(Query, Searches, Literals)) :- +% find_literals(Query, Literals, []). + +%% count_queries(+QueryList, -TotalQueries) +% +% Analyse the query logs on a meta level. +% +% * Count total number of times the website got queried +% * Count the number of distinct queries used +meta_analysis(Queries) :- + count_queries(Queries, TotalQueries), + count_matches(Queries, TotalMatches), + count_matches_ic(Queries, TotalMatchesIC), + length(Queries, DistinctQueries), + format('Meta info + distinct queries: ~q + number of times used: ~q + number of matches: ~q + number of matches ic: ~q~n',[DistinctQueries, TotalQueries, TotalMatches, TotalMatchesIC]). + +%% count_queries(+QueryList, -TotalQueries) +% +% +count_queries([], 0). +count_queries([Dic|Dics], SumQueries) :- + count_queries(Dics, OldSum), + Number = Dic.number, + SumQueries is OldSum + Number. + +%% count_queries(+QueryList, -TotalQueries) +% +% +count_matches([], 0). +count_matches([Dic|Dics], NewMatchCount) :- + Match = Dic.direct_match, + count_matches(Dics, OldMatchCount), + add_if_match(Match, OldMatchCount, NewMatchCount). + +%% count_queries(+QueryList, -TotalQueries) +% +% +count_matches_ic([], 0). +count_matches_ic([Dic|Dics], NewMatchCount) :- + Match = Dic.direct_match_ic, + count_matches_ic(Dics, OldMatchCount), + add_if_match(Match, OldMatchCount, NewMatchCount). + +add_if_match(true, Old, New) :- + New is Old + 1. +add_if_match(false, Old, Old). + +%% write_output(+Type, +OutputFilePath, -QueryAnalysisResults) +% +% Write the results to a specific output. Type of outputs: +% +% * csv +write_output(csv, OutputFilePath, QueryAnalysisResults) :- + topics(QueryAnalysisResults, Topics), + Header =.. [header|Topics], + debug(csv_output, 'Header: ~q', [Header]), + maplist(flatten_dict(Topics), QueryAnalysisResults, ValueList), + csv_write_file(OutputFilePath, [Header|ValueList]), + format('Wrote output to ~q~n',[OutputFilePath]). + +%% topics(+AnalysisResults, -Header) +% +% Retrieve all topics from the dict. +topics([Dic|_T], Topics) :- + findall(Topic, Dic.Topic, Topics). + +%% flatten_list(+AnalysisResults, -Term) +% +% Construct terms appropriate for the csv write. +flatten_dict(Topics, Dic, Term) :- + result_values(Topics, Dic, Values), + Term =.. [result|Values]. + +result_values([],_,[]). +result_values([Topic|Topics], Dic, [Value|Values]) :- + Value = Dic.Topic, + result_values(Topics, Dic, Values). + + +/* partition(+KeywordList, +TotalQueries, +SelectionAmount, -HighList, -MediumList, -LowList), +* +*/ +partition(KeywordList, _TotalQueries, SelectionAmount, _HighList, _MediumList, _LowList) :- + partition(high, SelectionAmount, KeywordList), + debug(partition, 'Partition High: ~q', [SelectionAmount]). + +%Why do we need a cut here? +partition(high, 0, _KeywordList) :- !. +partition(high, SelectionAmount, KeywordList) :- + debug(partition, 'Test ~q',[SelectionAmount]), + NewSelectionAmount is SelectionAmount -1, + partition(high, NewSelectionAmount, KeywordList). + +%partition(_, 0, []). +%partition([Query|KeywordList], SelectionAmount, [Query|HighList]) :- +% SelectionAmountNew is SelectionAmount - 1, +% partition(KeywordList, SelectionAmountNew, HighList). + diff --git a/lib/cluster_search/utils.pl b/lib/cluster_search/utils.pl new file mode 100644 index 0000000..cd25987 --- /dev/null +++ b/lib/cluster_search/utils.pl @@ -0,0 +1,411 @@ +:- module(utils, [create_ic_csv/3, make_rma_people_cs/0]). + +:- use_module(library(semweb/rdf_db)). % RDF storage and querying +:- use_module(library(csv)). +:- use_module(library(http/http_dispatch)). % to define handlers +:- use_module(library(http/http_parameters)). % to get parameters from url +:- use_module(library(http/http_json)). % for replying json +:- use_module(library(semweb/rdf_litindex)). +:- use_module(library(http/url_cache)). % checking images in cache +:- use_module(library(http/http_open)). % checking images on server + +:- setting(default_output, any, 'json', 'Default output language.'). + +:- rdf_register_ns(rma, 'http://purl.org/collections/nl/rma/schema#'). +:- rdf_register_ns(ico, 'http://iconclass.org/'). + +:- http_handler(cliopatria(utils), utils, []). +:- rdf_meta util(+,-,t). + +%% utils(+Request) +% +% Main clause, retrieving type of request, the parameters, execute +% code and replies output. +utils(Request) :- + get_parameters(Request, Options), + option(util(Util), Options), + util(Util, Result, Options), + option(output(Output), Options), + reply(Output, Result). + +%% create_ic_csv(+ICCode, +FilePath) +% +% creates a csv file based on given ICCode: +% create_ic_csv('http://iconclass.org/25F3','../out/birds.csv', +% enriched). +% create_ic_csv('http://iconclass.org/25F3','../out/birds.csv', +% images). +create_ic_csv(ICCode, FilePath, enriched) :- + findall(Uri, iconclass_resource(ICCode, Uri), UriList), + enrich_list(UriList, EnrichedList), + write_csv(EnrichedList, FilePath), + debug(output, 'EnrichedList: ~q',[EnrichedList]). + +create_ic_csv(ICCode, FilePath, images) :- + findall(Uri, iconclass_resource(ICCode, Uri), UriList), + filter_uris(only_images, UriList, FilteredList), + enrich_list(FilteredList, EnrichedList), + write_csv(EnrichedList, FilePath), + debug(output, 'FilteredList: ~q',[EnrichedList]). + + +%% get_parameters(+Request, -Parameters) +% +% Retrieves an option object of parameters from the url. +get_parameters(Request, Options) :- + setting(default_output, DefaultOutput), + http_parameters(Request, + [output(Output, [default(DefaultOutput), oneof([json,turtle,rdfxml,html,files])]), + util(Utils, [optional(true), oneof([subset,predicates,list,images])]), + field(Field, [optional(true)]), + value(Value, [optional(true)]), + filter(Filter, [default(all), oneof([all, only_images])]), + enrichment(Enrichment, [default(normal), oneof([normal, rich])]), + operation(Operation, [optional(true), + oneof([literal_contains, + literal_exact, + ic_exact, + ic_subtree])]) + ]), + Options = [output(Output), + util(Utils), + field(Field), + value(Value), + filter(Filter), + operation(Operation), + enrichment(Enrichment)]. + +%% utils(+Utils, -Result, +Options) +% +% Do the thing you need to do: +% * subset: retrieve a json subset based on ic codes +% * predicates: get a list of all the predicates +% * images: download the images to a folder +util(subset, Result, Options) :- + option(value(Value), Options), + option(field(Field), Options), + option(operation(Operation), Options), + option(enrichment(Enrichment), Options), + option(filter(Filter), Options), + debug(enrich, 'Options: ~p', [Options]), + get_uris(Operation, Field, Value, Uris), + debug(subset, 'Uris: ~p', [Uris]), + filter_uris(Filter, Uris, FilteredUris), + length(FilteredUris, NumberFiltered), + debug(subset, 'NumberFiltered: ~p FilteredUris: ~p', [NumberFiltered, FilteredUris]), + generate_json_subset(Enrichment, FilteredUris, Result). + +util(predicates, Result, _Options) :- + findall(Predicate, (rdf_current_predicate(Predicate), once(rdf(_, Predicate, _))), PredicateList), + length(PredicateList, Length), + Result = json([nritems(Length),predicates(PredicateList)]). + +util(images, Result, Options) :- + option(value(Value), Options), + option(field(Field), Options), + debug(images, 'Download images with field: ~p Code: ~p', [Field, Value]), + findall(ImageUrl, + (iconclass_resource(Value, Uri), + has_image(Uri, true), + rdf(Uri, rma:imageURL, ImageUrl)), + Result), + debug(images, 'Found the following Urls: ~p', [Result]). + +get_uris(ic_exact, Field, Value, Uris) :- + findall(Uri, rdf(Uri, Field, Value), Uris), + debug(enrich, 'findall Field: ~p Value: ~p', [Field, Value]). + +get_uris(ic_subtree, _Field, Value, Uris) :- + findall(Uri, iconclass_resource(Value, Uri), Uris). + +get_uris(literal_contains, Field, Value, Uris) :- + rdf_find_literals(case(Value), ListOfLiterals), + get_uris_literals(ListOfLiterals, Field, Uris). + +get_uris(literal_exact, Field, Value, Uris) :- + findall(Uri, rdf(Uri, Field, literal(Value)), Uris). + +get_uris_literals([], _Field, []). +get_uris_literals([Literal|Literals], Field, Results) :- + findall(Uri, rdf(Uri, Field, literal(Literal)), Uris), + append(Uris, TempResults, Results), + get_uris_literals(Literals, Field, TempResults). + +generate_json_subset(Enrichment, UriList, Result) :- + length(UriList, Length), + enrich_list_subset(Enrichment, UriList, EnrichedList), + Result = json([nritems(Length),items(EnrichedList)]). + +%% iconclass_resource(+ICCode, -Uri) +% +% Returns a Uri with ICCode or can reach ICCode. +iconclass_resource(ICCode, Uri) :- + rdf(Uri, rma:contentClassification, IC), + rdf_reachable(IC, skos:broader, ICCode). + +%% enrich_list_subset(+UriList, -EnrichedList) +% +% loops through a list of Uris and enriches those objects +enrich_list_subset(_Enrichment, [],[]). +enrich_list_subset(Enrichment, [Uri|Tail], [EnrichedUri|EnrichedList]) :- + enrich_subset_uri(Enrichment, Uri,EnrichedUri), + enrich_list_subset(Enrichment, Tail, EnrichedList). + +%% enrich_subset(+TypeEnrichment, +Uri, -EnrichedItem) +% +% queries for more information based on: Uri, +% description(Description), maker(Makers), date(Date), +% technique(Technique), material(Material), image(HttpsImage), +% ic(ICCodes) +enrich_subset_uri(normal, Uri, json([uri(Uri), title(Title), image(HttpsImage), ic(ICCodes)])) :- + findall(IC, get_ic(Uri, IC), ICCodes), + rdf(Uri, rma:imageURL, Image), + atom_concat(http, MinHttp, Image), + atom_concat(https, MinHttp, HttpsImage), + rdf(Uri, dcterms:title, literal(Title)), + debug(enrich, 'Normal Title ~q',[Title]). + +enrich_subset_uri(rich, Uri, json([uri(Uri), title(Title), description(Description), + maker(Makers), date(Date), technique(Technique), + image(HttpsImage), material(Material), ic(ICCodes)])) :- + get_title(Uri, Title), + get_description(Uri, Description), + findall(Maker, get_maker(Uri, Maker), Makers), + get_date(Uri, Date), + get_image(Uri, HttpsImage), + get_technique(Uri, Technique), + get_material(Uri, Material), + findall(IC, get_ic(Uri, IC), ICCodes), + debug(enrich, 'Rich MAkers ~q',[Title]). + +get_title(Uri, Title) :- + (rdf(Uri, dcterms:title, literal(Title)) + -> true; + Title = 'null'). + +get_description(Uri, Description) :- + (rdf(Uri, dcterms:description, literal(lang(nl,Description))) + -> true; + Description = 'null'). + +get_maker(Uri, json([maker(Maker), role(Role)])) :- + rdf(Uri, rma:maker, Maker0), + rdf(Maker0, rdf:value, Maker1), + rdf(Maker1, rma:name, literal(Maker)), + get_maker_role(Maker0, Role). + +get_maker_role(Maker, Role) :- + (rdf(Maker, rma:makerRole, Role0), + rdf(Role0, skos:prefLabel, literal(Role)) + -> true; + Role = 'null'). + +get_image(Uri, HttpsImage) :- + (rdf(Uri, rma:imageURL, Image) + -> atom_concat(http, MinHttp, Image), + atom_concat(https, MinHttp, HttpsImage); + HttpsImage = 'null'). +get_technique(Uri, Technique) :- + (rdf(Uri, rma:technique, Technique0), + rdf(Technique0, rdf:value, Technique1), + rdf(Technique1, skos:prefLabel, literal(Technique)), + debug(locale, 'Technique: ~p', [Technique]) + -> true; + Technique = 'null'). + +get_material(Uri, Material) :- + (rdf(Uri, rma:material, Material0), + rdf(Material0, rdf:value, Material1), + rdf(Material1, skos:prefLabel, literal(Material)) + -> true; + Material = 'null'). + +get_date(Uri, Date) :- + (rdf(Uri, rma:dating, Date0), + rdf(Date0, rdfs:label, literal(Date)) + -> true; + Date = 'null'). + + %, label(Label), broader(BroaderICs), narrower(NarrowerICs) +get_ic(Uri, json([notation(Notation), label(Label), broader(BroaderICs), narrower(NarrowerICs)])) :- rdf(Uri, rma:contentClassification, ICUri), + rdf(ICUri, skos:notation, literal(Notation)), + findall(BroaderIC, get_broader_ic(ICUri, BroaderIC), BroaderICs), + findall(NarrowerIC, get_narrower_ic(ICUri, NarrowerIC), NarrowerICs), + get_label(ICUri, Label). + +get_label(Uri, Label) :- + (rdf(Uri, skos:prefLabel, literal(lang(en, Label))) + -> true; + Label = 'null'). + +get_broader_ic(Uri, SuperIC) :- + rdf(Uri, skos:broader, IC0), + rdf(IC0, skos:notation, literal(SuperIC)). + +get_narrower_ic(Uri, NarrowerIC) :- + rdf(Uri, skos:narrower, IC0), + rdf(IC0, skos:notation, literal(NarrowerIC)). + +%% filter_uris(+Filter, +UriList, -FilteredList +% +% Filters a list of uris, based on the specified filter. Filters: +% +% * only_images: checks whether images are present on server. +filter_uris(only_images, [],[]). +filter_uris(only_images, [Uri|Uris], [Uri|UrisWithImage]) :- + has_image(Uri, true), !, + debug(loop, 'Filter Uris1 LOOP', []), + filter_uris(only_images, Uris, UrisWithImage). +filter_uris(only_images, [_|Uris], UrisWithImage) :- + debug(loop, 'Filter Uris2 LOOP', []), + filter_uris(only_images, Uris, UrisWithImage). +filter_uris(all, Uris, Uris). + +%% has_image(+Uri, -Boolean) +% +% Returns true/false depending on whether an image is available. +has_image(Uri, true) :- + rdf(Uri, rma:imageURL, Image), + url_cached(Image, file(_)), + debug(has_image, 'Has image in cache: true', []), + !. + +has_image(Uri, true) :- + rdf(Uri, rma:imageURL, Image), + http_response(Image, Status), + Status == 200, + debug(has_image, 'Has image on server: true', []), + !. + +has_image(_, false):- + debug(has_image, 'Has image in cache: FALSE', []). + +http_response(URL, Status) :- + http_open(URL, In, + [method(head), + status_code(Status), + cert_verify_hook(ssl_verify) + ]), + close(In). + + +:- public ssl_verify/5. +%% ssl_verify(+SSL, +ProblemCert, +AllCerts, +FirstCert, +Error) +% +% Currently we accept all certificates. +ssl_verify(_SSL, + _ProblemCertificate, _AllCertificates, _FirstCertificate, + _Error). + +%% enrich_list(+UriList, -EnrichedList) +% +% loops through a list of Uris and enriches those objects +enrich_list([],[]). +enrich_list([Uri|Tail], [EnrichedUri|EnrichedList]) :- + enrich(Uri,EnrichedUri), + enrich_list(Tail, EnrichedList). + +%% enrich(+Uri, -EnrichedItem) +% +% queries for more information based on Uri +enrich(Uri,EnrichedUri) :- + rdf(Uri, rma:objectNumber, literal(ObjectNumber)), + rdf(Uri, dcterms:title, literal(Title)), + rdf(Uri, rma:imageURL, ImageUrl), + %atom_concat('https://www.rijksmuseum.nl/nl/zoeken?v=list&f=1&p=1&ps=10&objectnumber=', ObjectNumber, SearchString), + %debug(enrich, 'Searchstring ~q',[SearchString]), + EnrichedUri = artwork(Uri,ObjectNumber,ImageUrl,Title), + debug(enrich, 'Nice and shiny URI: ~q',[EnrichedUri]). + +%% write_csv(+ObjectList, +FilePath) +% +% Write list of objects to specified file path. +write_csv(ObjectList, FilePath) :- + debug(csv, 'ObjectList: ~q',[ObjectList]), + csv_write_file(FilePath, ObjectList). + +%% make_rma_people_cs +% +% Load triples to make the rma_people thesaurus a SKOS vocabulary +make_rma_people_cs :- +% rdf_assert('/http://purl.org/collections/nl/rma/schema#Person', rdfs:subClassOf, skos:'Concept'), dat is al ok in de bijgeladen rma-people-skos definities? + CS = 'http://purl.org/collections/nl/rma/schema#PersonScheme', + rdf_assert(CS, rdf:type, skos:'ConceptScheme', rma_people_cs), + rdf_assert(CS, rdfs:label, literal('Rijksmuseum People'), rma_people_cs), + ( rdf(C, rdf:type, 'http://purl.org/collections/nl/rma/schema#Person'), + rdf_assert(C, skos:inScheme, CS, rma_people_cs), + fail + ; true + ). + +%% reply(+Result, +Output) +% +% Returns the result according to a specified output method. +reply(json, PrologOut) :- + debug(output, 'PrologOut: ~q',[PrologOut]), + reply_json(PrologOut). + +reply(files, []). +reply(files, [Url|UrlList]) :- + Folder = '../data/images/', + concat('http://www.rijksmuseum.nl/assetimage2.jsp?id=', Name, Url), + concat(Name, '.jpg', FileName), + concat(Folder, FileName, FilePath), + fetch_url_raw(Url, FilePath,_,_), + reply(files, UrlList). + +%% fetch_url_raw(+URL:atom, +Path:atom, -MimeType:atom, -Modified) is det. +% +% Fetch data from URL and put it into the file Path. MimeType is +% unified with the MIME-type as reported by the server or +% text/plain if the server did not provide a MIME-Type. +% +% @error existence_error(url, URL) +fetch_url_raw(URL, File, MimeType, Modified) :- + debug(images, 'Downloading ~w ...', [URL]), + atom_concat(File, '.tmp', TmpFile), + ( catch(fetch_to_file(URL, TmpFile, Code, Header), E, true) + -> true + ; E = predicate_failed(http_get/3) + ), + ( var(E) + -> true + ; ( debugging(url_cache) + -> print_message(error, E) + ; true + ), + catch(delete_file(TmpFile), _, true), + ( debugging(url_cache) + -> message_to_string(E, Msg), + debug(images, 'Download failed: ~w', [Msg]) + ; true + ), + throw(E) + ), + ( Code == 200 + -> rename_file(TmpFile, File) + ; catch(delete_file(TmpFile), _, true), + throw(error(existence_error(url, URL), _)) + ), + ( memberchk(content_type(MimeType0), Header) + -> true + ; MimeType0 = 'text/plain' + ), + ignore(memberchk(last_modified(Modified), Header)), + debug(images, 'Downloaded ~w, mime-type: ~w', + [URL, MimeType0]), + MimeType = MimeType0. + +fetch_to_file(Url, File, Code, [content_type(ContentType), last_modified(LastModified)]) :- + setup_call_cleanup( + open(File, write, Out, [ type(binary) ]), + setup_call_cleanup( + http_open(Url, In, + [ header(content_type, ContentType), + header(last_modified, LastModified), + status_code(Code), + cert_verify_hook(ssl_verify) + ]), + copy_stream_data(In, Out), + close(In)), + close(Out)).