cluster_search/commit
Proper split of experiment files.
| field | value |
|---|---|
| author | Chris Dijkshoorn, Tue May 20 13:31:51 2014 +0200 |
| committer | Chris Dijkshoorn, Tue May 20 13:31:51 2014 +0200 |
| commit | c05be4c488f6298fbcca4c1dc29a4af79480bacc |
| tree | baaa480db1da396fa7cf6c2845c161e40d85b759 |
| parent | c50cf3686e5f715fce251521868bd3ba3cb53a1f |
Diff style: patch stat
diff --git a/README.md b/README.md index 362c097..e03026e 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Cluster Search Readme +# Cluster Search Readme # Cluster Search is running on Cliopatria powered by Prolog. Below the instructions for setting up cluster search on a mac or linux machine. @@ -8,22 +8,23 @@ Cluster Search is running on Cliopatria powered by Prolog. Below the instruction Install latest SWI-Prolog development release: http://www.swi-prolog.org/download/devel -Clone the Cliopatria webserver code in a suitable directory: +Clone the Cliopatria webserver code in a suitable directory: ```bash -$ git clone git://www.swi-prolog.org/home/pl/git/ClioPatria.git +$ mkdir git +$ cd git +$ git clone https://github.com/SWI-Prolog/ClioPatria.git ``` -Create a folder for the clustersearch project at the same level as the Cliopatria folder: +Create a folder for the clustersearch project: ```bash -$ ls - ClioPatria +$ cd .. $ mkdir clustersearch $ cd clustersearch ``` Now configure this directory as a ClioPatria project by running: ```bash -$ ../ClioPatria/configure +$ ../git/ClioPatria/configure ``` You can now run the web server using the command below (typing 'halt.' will stop it): diff --git a/lib/cluster_search/experiment_analysis.pl b/lib/cluster_search/experiment_analysis.pl index d140656..fa6f46d 100644 --- a/lib/cluster_search/experiment_analysis.pl +++ b/lib/cluster_search/experiment_analysis.pl @@ -1,4 +1,4 @@ -:- module(experiment_analysis, [analyse_results/1]). +:- module(experiment_analysis, [analyse_results/2]). /** <module> Run analysis on the results of the graph search experiment */ @@ -8,36 +8,28 @@ % % Analyse the obtained results. There are four result sets, All, % High, Medium and Low. 
-analyse_results(All) :- +analyse_results(All, Options) :- cluster_topics(All, ClusterTopics), append(ClusterTopics, [query], NonNummericalTopics), baseline_topics(All, NonNummericalTopics, BaselineTopics), append(ClusterTopics, BaselineTopics, AllmostAllTopics), - filter_errors(AllmostAllTopics, All, Errors, Fails, TimeOuts, Filtered), - format('Errors: ~p~n Fails: ~p~n TimeOuts: ~p~n', - [Errors, Fails, TimeOuts]), - timed_format('~s Running numerical analysis on ~p~n', [BaselineTopics]), + filter_errors(AllmostAllTopics, All, _E, _F, _T, Filtered), + timed_format('~s Running numerical analysis on ~p~n', + [BaselineTopics]), % get total queries to enable splitting - %sum(number, Filtered, SumQueryFrequency), - %split_logs(Filtered, SumQueryFrequency, High, Medium, Low), - %numerics(BaselineTopics, High, high, NumericAnalysisHigh, HighRows), - %numerics(BaselineTopics, Medium, medium, NumericAnalysisMedium, MediumRows), - %numerics(BaselineTopics, Low, low, NumericAnalysisLow, LowRows), - %numerics(BaselineTopics, Filtered, all, NumericAnalysisAll, AllRows), - %create_csv_baseline(LowRows, MediumRows, HighRows, AllRows, - % '../out/literals_match.csv'), - %output(latex_table2, Filtered), - %output(latex_table3, NumericAnalysisHigh, NumericAnalysisMedium, - % NumericAnalysisLow, NumericAnalysisAll), - %output(latex_table4, NumericAnalysisHigh, NumericAnalysisMedium, - % NumericAnalysisLow, NumericAnalysisAll), - %output(latex_table5, NumericAnalysisHigh, NumericAnalysisMedium, - % NumericAnalysisLow, NumericAnalysisAll), - %output(latex_table6, NumericAnalysisHigh, NumericAnalysisMedium, - % NumericAnalysisLow, NumericAnalysisAll), - %output(latex_table7, NumericAnalysisHigh, NumericAnalysisMedium, - % NumericAnalysisLow, NumericAnalysisAll). 
- %GRAPHSEARCH + sum(number, Filtered, SumQueryFrequency), + split_logs(Filtered, SumQueryFrequency, High, Medium, Low), + numerics(BaselineTopics, High, high, HighResults, HighRows), + numerics(BaselineTopics, Medium, medium, MediumResults, MediumRows), + numerics(BaselineTopics, Low, low, LowResults, LowRows), + numerics(BaselineTopics, Filtered, all, AllResults, AllRows), + %TODO: Check arguments + print_statistics_baseline(BaselineTopics, HighResults, MediumResults, + LowResults, AllResults, Options), + print_latex_baseline(HighResults, MediumResults, LowResults, + AllResults, Filtered, Options), + create_csv_baseline(LowRows, MediumRows, HighRows, AllRows, + '../out/literals_match.csv'), maplist(meta_analyse_clusters(ClusterTopics), _Rows, Filtered, ClusterAnalysisAll), %create_csv_graph_search(LowRows, MediumRows, HighRows, AllRows, @@ -79,16 +71,22 @@ check_cluster([_Topic|Topics], ClusterTopics) :- % Find the errors, fails and timeouts from the results list and % filter them out. filter_errors(Topics, Inputs, Errors, Fails, TimeOuts, Filtered) :- + %TODO: write errors to log. findall(Error, error_in(Topics, Inputs, Error), Errors0), list_to_set(Errors0, Errors), + length(Errors, NumberErrors), findall(Fail, fail_in(Topics, Inputs, Fail), Fails0), list_to_set(Fails0, Fails), + length(Fails, NumberFails), findall(TimeOut, time_out_in(Topics, Inputs, TimeOut), TimeOuts0), list_to_set(TimeOuts0, TimeOuts), + length(TimeOuts, NumberTimeOuts), debug(exp_filter, 'Errors: ~p~n Fails: ~p~n TimeOuts: ~p', [Errors, Fails, TimeOuts]), append([Errors, Fails, TimeOuts], EverythingBad), subtract(Inputs, EverythingBad, Filtered), + timed_format('~s Errors: ~d Fails: ~d TimeOuts: ~d~n', + [NumberErrors, NumberFails, NumberTimeOuts]), debug(exp_filter, 'Fitlered: ~p', [Filtered]). 
error_in(Topics, Inputs, Input) :- @@ -168,28 +166,15 @@ numerical_statistics(Input, Split, Row, Topic, AnalysisResult) :- nth1(1, SortedNumbers, LowestNumber), variance(Numbers, Mean, NumberOfQueries, Variance), StandardDeviation is sqrt(Variance), -% print_numerical_statistics(Topic, Split, NumberOfQueries, Sum, Mean, -% Variance, StandardDeviation, -% LowestNumber, HighestNumber), AnalysisResult = results{topic:Topic, - distinct:NumberOfQueries, - sum:Sum, - mean:Mean, - variance:Variance, - standard_deviation:StandardDeviation, - lowest_number:LowestNumber, - highest_number:HighestNumber}. - -print_numerical_statistics(Topic, Split, NumberOfQueries, Sum, Mean, Variance, - StandardDeviation, LowestNumber, HighestNumber) :- - format('~tstatistics ~p ~p~t~36|~n~n', [Topic, Split]), - format('distinct ~`.t~d ~36|~n', [NumberOfQueries]), - format('sum ~`.t~d ~36|~n', [Sum]), - format('mean ~`.t~2f ~36|~n', [Mean]), - format('variance ~`.t~2f ~36|~n', [Variance]), - format('standard deviation ~`.t~2f ~36|~n', [StandardDeviation]), - format('lowest number ~`.t~d ~36|~n', [LowestNumber]), - format('highest number ~`.t~d ~36|~n~n', [HighestNumber]). + split:Split, + distinct:NumberOfQueries, + sum:Sum, + mean:Mean, + variance:Variance, + standard_deviation:StandardDeviation, + lowest_number:LowestNumber, + highest_number:HighestNumber}. %% get_number_list(+Inputs, +Topic, -Numbers) % diff --git a/lib/cluster_search/experiment_data.pl b/lib/cluster_search/experiment_data.pl index ce5bc24..ddd5c68 100644 --- a/lib/cluster_search/experiment_data.pl +++ b/lib/cluster_search/experiment_data.pl @@ -3,8 +3,7 @@ search_logs/2, load_search_logs/2, load_graphs/1, - unload_graphs/1, - server_versus_local/3]). + unload_graphs/1]). /** <module> Load and unload data This module is used for retrieving information about data and the data itself. @@ -25,24 +24,7 @@ rdf_dirs(['rma', 'getty/aat', 'getty/ulan', 'iconclass', - 'naturalis' - ]). 
- -server_versus_local(server, Collection, Vocabularies) :- - % limit the number of cpus used - set_prolog_flag(cpu_count, 20), - % increase stack size - set_prolog_stack(global, limit(10 000 000 000)), - Collection = rma_edm, - Vocabularies = [wn, aat, ic, ulan, ioc], - set_setting(search:basic_search_target, - ''). - -server_versus_local(local, Collection, Vocabularies) :- - Collection = rma_pk, - Vocabularies = [ic_local], - set_setting(search:basic_search_target, - 'http://purl.org/collections/nl/rma/schema#Work'). + 'naturalis']). %file paths to search logs search_logs(month, '../data/test_queries_jan14.csv'). diff --git a/lib/cluster_search/experiment_graph_search.pl b/lib/cluster_search/experiment_graph_search.pl index 47cbc0b..d48df4f 100644 --- a/lib/cluster_search/experiment_graph_search.pl +++ b/lib/cluster_search/experiment_graph_search.pl @@ -4,12 +4,16 @@ This module is used for getting experimental results of using. */ -:- use_module(library(cluster_search/kwd_search)). -:- use_module(api(cluster_search)). :- use_module(library(semweb/rdf_db)). :- use_module(library(thread)). :- use_module(library(time)). :- use_module(library(settings)). +:- use_module(api(cluster_search)). +:- use_module(library(cluster_search/kwd_search)). +:- use_module(library(cluster_search/experiment_utils)). +:- use_module(library(cluster_search/experiment_data)). +:- use_module(library(cluster_search/experiment_analysis)). + % set time limit on graph search max_time_graph_search(10). @@ -25,7 +29,7 @@ red_button(ServerOrLocal) :- %load_graphs([Collection|Vocabularies]), baseline(QueryList, [Collection|Vocabularies], BaselineResults), experiment(BaselineResults, Collection, Vocabularies, ExperimentalResults), - analyse_results(ExperimentalResults). + analyse_results(ExperimentalResults, [latex(false), baseline_statistcs(true)]). 
%% server_versus_local(+ServerOrLocal, -Collection, -Vocabularies) % diff --git a/lib/cluster_search/experiment_utils.pl b/lib/cluster_search/experiment_utils.pl index 2bbe8a1..730205c 100644 --- a/lib/cluster_search/experiment_utils.pl +++ b/lib/cluster_search/experiment_utils.pl @@ -1,12 +1,18 @@ -:- module(experiment_utils, [timed_format/2, +:- module(experiment_utils, [time_stamp/1, + timed_format/2, create_csv_baseline/5, + print_statistics_baseline/6, + print_statistics/2, + print_latex_baseline/6, output/5, - output/2]). + output/2]). /** <module> Utils for experiment. -Output handling. +Output handling and other utils. */ +:- use_module(library(option)). + %% timed_format(+Format, +List) % % Show a formatted string on output with beginning with time @@ -20,10 +26,7 @@ timed_format(Format, List) :- time_stamp(String) :- get_time(Time), stamp_date_time(Time, Date, 'local'), - format_time(string(String), - %'%d %b %Y %T', - '%T', - Date, posix). + format_time(string(String),'%T', Date, posix). %% create_csv_baseline(High, Medium, Low, Filtered, +FilePath) % @@ -93,6 +96,54 @@ output(latex_table8, High, Medium, Low, Total) :- print_latex(baseline_wn20, Total, all), format('~n'). +print_statistics_baseline(_Topics, _High, _Medium, _Low, _All, Options) :- + option(baseline_statistcs(false), Options), + !. +print_statistics_baseline(Topics, High, Medium, Low, All, _Options) :- + maplist(print_statistics(High), Topics), + maplist(print_statistics(Medium), Topics), + maplist(print_statistics(Low), Topics), + maplist(print_statistics(All), Topics). + +%% print_statistics(+Dic, +Topics) +% +% Print statistics for a nummerical topic. 
+print_statistics(Dics, Topic) :- + find_dict(Topic, Dics, Dic), + Split = Dic.split, + NumberOfQueries = Dic.distinct, + Sum = Dic.sum, + Mean = Dic.mean, + Variance = Dic.variance, + StandardDeviation = Dic.standard_deviation, + LowestNumber = Dic.lowest_number, + HighestNumber = Dic.highest_number, + format('~tstatistics of ~p ~p~t~36|~n~n', [Topic, Split]), + format('distinct ~`.t~d ~36|~n', [NumberOfQueries]), + format('sum ~`.t~d ~36|~n', [Sum]), + format('mean ~`.t~2f ~36|~n', [Mean]), + format('variance ~`.t~2f ~36|~n', [Variance]), + format('standard deviation ~`.t~2f ~36|~n', [StandardDeviation]), + format('lowest number ~`.t~d ~36|~n', [LowestNumber]), + format('highest number ~`.t~d ~36|~n~n', [HighestNumber]). + +print_latex_baseline(_High, _Medium, _Low, _All, _Filtered, Options) :- + option(latex(false), Options), + !. + +print_latex_baseline(HighResults, MediumResults, LowResults, + AllResults, Filtered, _Options) :- + output(latex_table2, Filtered), + output(latex_table3, HighResults, MediumResults, + LowResults, AllResults), + output(latex_table4, HighResults, MediumResults, + LowResults, AllResults), + output(latex_table5, HighResults, MediumResults, + LowResults, AllResults), + output(latex_table6, HighResults, MediumResults, + LowResults, AllResults), + output(latex_table7, HighResults, MediumResults, + LowResults, AllResults). %% print_latex(+Lines, +Counter, +ListOfDicts) %