cluster_search/commit

Proper split of experiment files.

author Chris Dijkshoorn
Tue May 20 13:31:51 2014 +0200
committer Chris Dijkshoorn
Tue May 20 13:31:51 2014 +0200
commit c05be4c488f6298fbcca4c1dc29a4af79480bacc
tree baaa480db1da396fa7cf6c2845c161e40d85b759
parent c50cf3686e5f715fce251521868bd3ba3cb53a1f
Diff style: patch stat
diff --git a/README.md b/README.md
index 362c097..e03026e 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# Cluster Search Readme
+# Cluster Search Readme #
 
 
 Cluster Search is running on Cliopatria powered by Prolog. Below the instructions for setting up cluster search on a mac or linux machine.
@@ -8,22 +8,23 @@ Cluster Search is running on Cliopatria powered by Prolog. Below the instruction
 
 Install latest SWI-Prolog development release: http://www.swi-prolog.org/download/devel
 
-Clone the Cliopatria webserver code in a suitable directory: 
+Clone the Cliopatria webserver code in a suitable directory:
 ```bash
-$ git clone git://www.swi-prolog.org/home/pl/git/ClioPatria.git
+$ mkdir git
+$ cd git
+$ git clone https://github.com/SWI-Prolog/ClioPatria.git
 ```
 
-Create a folder for the clustersearch project at the same level as the Cliopatria folder:
+Create a folder for the clustersearch project:
 ```bash
-$ ls
-  ClioPatria
+$ cd ..
 $ mkdir clustersearch
 $ cd clustersearch
 ```
 
 Now configure this directory as a ClioPatria project by running:
 ```bash
-$ ../ClioPatria/configure
+$ ../git/ClioPatria/configure
 ```
 
 You can now run the web server using the command below (typing 'halt.' will stop it):
diff --git a/lib/cluster_search/experiment_analysis.pl b/lib/cluster_search/experiment_analysis.pl
index d140656..fa6f46d 100644
--- a/lib/cluster_search/experiment_analysis.pl
+++ b/lib/cluster_search/experiment_analysis.pl
@@ -1,4 +1,4 @@
-:- module(experiment_analysis, [analyse_results/1]).
+:- module(experiment_analysis, [analyse_results/2]).
 /** <module> Run analysis on the results of the graph search experiment
 */
 
@@ -8,36 +8,28 @@
 %
 %	Analyse the obtained results. There are four result sets, All,
 %	High, Medium and Low.
-analyse_results(All) :-
+analyse_results(All, Options) :-
     cluster_topics(All, ClusterTopics),
     append(ClusterTopics, [query], NonNummericalTopics),
     baseline_topics(All, NonNummericalTopics, BaselineTopics),
     append(ClusterTopics, BaselineTopics, AllmostAllTopics),
-    filter_errors(AllmostAllTopics, All, Errors, Fails, TimeOuts, Filtered),
-    format('Errors: ~p~n Fails: ~p~n TimeOuts: ~p~n',
-	   [Errors, Fails, TimeOuts]),
-    timed_format('~s Running numerical analysis on ~p~n', [BaselineTopics]),
+    filter_errors(AllmostAllTopics, All, _E, _F, _T, Filtered),
+    timed_format('~s Running numerical analysis on ~p~n',
+		 [BaselineTopics]),
     % get total queries to enable splitting
-    %sum(number, Filtered, SumQueryFrequency),
-    %split_logs(Filtered, SumQueryFrequency, High, Medium, Low),
-    %numerics(BaselineTopics, High, high, NumericAnalysisHigh, HighRows),
-    %numerics(BaselineTopics, Medium, medium, NumericAnalysisMedium, MediumRows),
-    %numerics(BaselineTopics, Low, low, NumericAnalysisLow, LowRows),
-    %numerics(BaselineTopics, Filtered, all, NumericAnalysisAll, AllRows),
-    %create_csv_baseline(LowRows, MediumRows, HighRows, AllRows,
-    %			'../out/literals_match.csv'),
-    %output(latex_table2, Filtered),
-    %output(latex_table3, NumericAnalysisHigh, NumericAnalysisMedium,
-    %	   NumericAnalysisLow, NumericAnalysisAll),
-    %output(latex_table4, NumericAnalysisHigh, NumericAnalysisMedium,
-    %	   NumericAnalysisLow, NumericAnalysisAll),
-    %output(latex_table5, NumericAnalysisHigh, NumericAnalysisMedium,
-    %	   NumericAnalysisLow, NumericAnalysisAll),
-    %output(latex_table6, NumericAnalysisHigh, NumericAnalysisMedium,
-    %	   NumericAnalysisLow, NumericAnalysisAll),
-    %output(latex_table7, NumericAnalysisHigh, NumericAnalysisMedium,
-    %	   NumericAnalysisLow, NumericAnalysisAll).
-    %GRAPHSEARCH
+    sum(number, Filtered, SumQueryFrequency),
+    split_logs(Filtered, SumQueryFrequency, High, Medium, Low),
+    numerics(BaselineTopics, High, high, HighResults, HighRows),
+    numerics(BaselineTopics, Medium, medium, MediumResults, MediumRows),
+    numerics(BaselineTopics, Low, low, LowResults, LowRows),
+    numerics(BaselineTopics, Filtered, all, AllResults, AllRows),
+    %TODO: Check arguments
+    print_statistics_baseline(BaselineTopics, HighResults, MediumResults,
+			      LowResults, AllResults, Options),
+    print_latex_baseline(HighResults, MediumResults, LowResults,
+			 AllResults, Filtered, Options),
+    create_csv_baseline(LowRows, MediumRows, HighRows, AllRows,
+			'../out/literals_match.csv'),
     maplist(meta_analyse_clusters(ClusterTopics), _Rows,
 		       Filtered, ClusterAnalysisAll),
     %create_csv_graph_search(LowRows, MediumRows, HighRows, AllRows,
@@ -79,16 +71,22 @@ check_cluster([_Topic|Topics], ClusterTopics) :-
 %	Find the errors, fails and timeouts from the results list and
 %	filter them out.
 filter_errors(Topics, Inputs, Errors, Fails, TimeOuts, Filtered) :-
+    %TODO: write errors to log.
     findall(Error, error_in(Topics, Inputs, Error), Errors0),
     list_to_set(Errors0, Errors),
+    length(Errors, NumberErrors),
     findall(Fail, fail_in(Topics, Inputs, Fail), Fails0),
     list_to_set(Fails0, Fails),
+    length(Fails, NumberFails),
     findall(TimeOut, time_out_in(Topics, Inputs, TimeOut), TimeOuts0),
     list_to_set(TimeOuts0, TimeOuts),
+    length(TimeOuts, NumberTimeOuts),
     debug(exp_filter, 'Errors: ~p~n Fails: ~p~n TimeOuts: ~p',
 	   [Errors, Fails, TimeOuts]),
     append([Errors, Fails, TimeOuts], EverythingBad),
     subtract(Inputs, EverythingBad, Filtered),
+    timed_format('~s Errors: ~d Fails: ~d TimeOuts: ~d~n',
+		 [NumberErrors, NumberFails, NumberTimeOuts]),
     debug(exp_filter, 'Fitlered: ~p', [Filtered]).
 
 error_in(Topics, Inputs, Input) :-
@@ -168,28 +166,15 @@ numerical_statistics(Input, Split, Row, Topic,  AnalysisResult) :-
     nth1(1, SortedNumbers, LowestNumber),
     variance(Numbers, Mean, NumberOfQueries, Variance),
     StandardDeviation is sqrt(Variance),
-%   print_numerical_statistics(Topic, Split, NumberOfQueries, Sum, Mean,
-%			       Variance, StandardDeviation,
-%			       LowestNumber, HighestNumber),
     AnalysisResult = results{topic:Topic,
-			      distinct:NumberOfQueries,
-			      sum:Sum,
-			      mean:Mean,
-			      variance:Variance,
-			      standard_deviation:StandardDeviation,
-			      lowest_number:LowestNumber,
-			      highest_number:HighestNumber}.
-
-print_numerical_statistics(Topic, Split, NumberOfQueries, Sum, Mean, Variance,
-			   StandardDeviation, LowestNumber, HighestNumber) :-
-    format('~tstatistics ~p ~p~t~36|~n~n', [Topic, Split]),
-    format('distinct ~`.t~d ~36|~n', [NumberOfQueries]),
-    format('sum ~`.t~d ~36|~n', [Sum]),
-    format('mean ~`.t~2f ~36|~n', [Mean]),
-    format('variance ~`.t~2f ~36|~n', [Variance]),
-    format('standard deviation ~`.t~2f ~36|~n', [StandardDeviation]),
-    format('lowest number ~`.t~d ~36|~n', [LowestNumber]),
-    format('highest number ~`.t~d ~36|~n~n', [HighestNumber]).
+			     split:Split,
+			     distinct:NumberOfQueries,
+			     sum:Sum,
+			     mean:Mean,
+			     variance:Variance,
+			     standard_deviation:StandardDeviation,
+			     lowest_number:LowestNumber,
+			     highest_number:HighestNumber}.
 
 %%	get_number_list(+Inputs, +Topic, -Numbers)
 %
diff --git a/lib/cluster_search/experiment_data.pl b/lib/cluster_search/experiment_data.pl
index ce5bc24..ddd5c68 100644
--- a/lib/cluster_search/experiment_data.pl
+++ b/lib/cluster_search/experiment_data.pl
@@ -3,8 +3,7 @@
 			    search_logs/2,
 			    load_search_logs/2,
 			    load_graphs/1,
-			    unload_graphs/1,
-			    server_versus_local/3]).
+			    unload_graphs/1]).
 /** <module> Load and unload data
 
 This module is used for retrieving information about data and the data itself.
@@ -25,24 +24,7 @@ rdf_dirs(['rma',
 	  'getty/aat',
 	  'getty/ulan',
 	  'iconclass',
-	  'naturalis'
-	 ]).
-
-server_versus_local(server, Collection, Vocabularies) :-
-    % limit the number of cpus used
-    set_prolog_flag(cpu_count, 20),
-    % increase stack size
-    set_prolog_stack(global, limit(10 000 000 000)),
-    Collection = rma_edm,
-    Vocabularies = [wn, aat, ic, ulan, ioc],
-    set_setting(search:basic_search_target,
-		'').
-
-server_versus_local(local, Collection, Vocabularies) :-
-    Collection = rma_pk,
-    Vocabularies = [ic_local],
-    set_setting(search:basic_search_target,
-		'http://purl.org/collections/nl/rma/schema#Work').
+	  'naturalis']).
 
 %file paths to search logs
 search_logs(month, '../data/test_queries_jan14.csv').
diff --git a/lib/cluster_search/experiment_graph_search.pl b/lib/cluster_search/experiment_graph_search.pl
index 47cbc0b..d48df4f 100644
--- a/lib/cluster_search/experiment_graph_search.pl
+++ b/lib/cluster_search/experiment_graph_search.pl
@@ -4,12 +4,16 @@
 This module is used for getting experimental results of using.
 */
 
-:- use_module(library(cluster_search/kwd_search)).
-:- use_module(api(cluster_search)).
 :- use_module(library(semweb/rdf_db)).
 :- use_module(library(thread)).
 :- use_module(library(time)).
 :- use_module(library(settings)).
+:- use_module(api(cluster_search)).
+:- use_module(library(cluster_search/kwd_search)).
+:- use_module(library(cluster_search/experiment_utils)).
+:- use_module(library(cluster_search/experiment_data)).
+:- use_module(library(cluster_search/experiment_analysis)).
+
 
 % set time limit on graph search
 max_time_graph_search(10).
@@ -25,7 +29,7 @@ red_button(ServerOrLocal) :-
     %load_graphs([Collection|Vocabularies]),
     baseline(QueryList, [Collection|Vocabularies], BaselineResults),
     experiment(BaselineResults, Collection, Vocabularies, ExperimentalResults),
-    analyse_results(ExperimentalResults).
+    analyse_results(ExperimentalResults, [latex(false), baseline_statistics(true)]).
 
 %%	server_versus_local(+ServerOrLocal, -Collection, -Vocabularies)
 %
diff --git a/lib/cluster_search/experiment_utils.pl b/lib/cluster_search/experiment_utils.pl
index 2bbe8a1..730205c 100644
--- a/lib/cluster_search/experiment_utils.pl
+++ b/lib/cluster_search/experiment_utils.pl
@@ -1,12 +1,18 @@
-:- module(experiment_utils, [timed_format/2,
+:- module(experiment_utils, [time_stamp/1,
+			     timed_format/2,
 			     create_csv_baseline/5,
+			     print_statistics_baseline/6,
+			     print_statistics/2,
+			     print_latex_baseline/6,
 			     output/5,
-			    output/2]).
+			     output/2]).
 /** <module> Utils for experiment.
 
-Output handling.
+Output handling and other utils.
 */
 
+:- use_module(library(option)).
+
 %%	timed_format(+Format, +List)
 %
 %	Show a formatted string on output with beginning with time
@@ -20,10 +26,7 @@ timed_format(Format, List) :-
 time_stamp(String) :-
     get_time(Time),
     stamp_date_time(Time, Date, 'local'),
-    format_time(string(String),
-		%'%d %b %Y %T',
-		'%T',
-		Date, posix).
+    format_time(string(String),'%T', Date, posix).
 
 %%	create_csv_baseline(High, Medium, Low, Filtered, +FilePath)
 %
@@ -93,6 +96,54 @@ output(latex_table8, High, Medium, Low, Total) :-
     print_latex(baseline_wn20, Total, all),
     format('~n').
 
+print_statistics_baseline(_Topics, _High, _Medium, _Low, _All, Options) :-
+    option(baseline_statistics(false), Options),
+    !.
+print_statistics_baseline(Topics, High, Medium, Low, All, _Options) :-
+    maplist(print_statistics(High), Topics),
+    maplist(print_statistics(Medium), Topics),
+    maplist(print_statistics(Low), Topics),
+    maplist(print_statistics(All), Topics).
+
+%%	print_statistics(+Dics, +Topic)
+%
+%	Print statistics for a numerical topic.
+print_statistics(Dics, Topic) :-
+    find_dict(Topic, Dics, Dic),
+    Split = Dic.split,
+    NumberOfQueries = Dic.distinct,
+    Sum = Dic.sum,
+    Mean = Dic.mean,
+    Variance = Dic.variance,
+    StandardDeviation = Dic.standard_deviation,
+    LowestNumber = Dic.lowest_number,
+    HighestNumber = Dic.highest_number,
+    format('~tstatistics of ~p ~p~t~36|~n~n', [Topic, Split]),
+    format('distinct ~`.t~d ~36|~n', [NumberOfQueries]),
+    format('sum ~`.t~d ~36|~n', [Sum]),
+    format('mean ~`.t~2f ~36|~n', [Mean]),
+    format('variance ~`.t~2f ~36|~n', [Variance]),
+    format('standard deviation ~`.t~2f ~36|~n', [StandardDeviation]),
+    format('lowest number ~`.t~d ~36|~n', [LowestNumber]),
+    format('highest number ~`.t~d ~36|~n~n', [HighestNumber]).
+
+print_latex_baseline(_High, _Medium, _Low, _All, _Filtered, Options) :-
+    option(latex(false), Options),
+    !.
+
+print_latex_baseline(HighResults, MediumResults, LowResults,
+		     AllResults, Filtered, _Options) :-
+    output(latex_table2, Filtered),
+    output(latex_table3, HighResults, MediumResults,
+	   LowResults, AllResults),
+    output(latex_table4, HighResults, MediumResults,
+	   LowResults, AllResults),
+    output(latex_table5, HighResults, MediumResults,
+	   LowResults, AllResults),
+    output(latex_table6, HighResults, MediumResults,
+	   LowResults, AllResults),
+    output(latex_table7, HighResults, MediumResults,
+	   LowResults, AllResults).
 
 %%	print_latex(+Lines, +Counter, +ListOfDicts)
 %