/*  This file is part of ClioPatria.

    Author:
    HTTP:	http://e-culture.multimedian.nl/
    GITWEB:	http://gollem.science.uva.nl/git/ClioPatria.git
    GIT:	git://gollem.science.uva.nl/home/git/ClioPatria.git
    GIT:	http://gollem.science.uva.nl/home/git/ClioPatria.git
    Copyright:  2007, E-Culture/MultimediaN

    ClioPatria is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 2 of the License, or
    (at your option) any later version.

    ClioPatria is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with ClioPatria.  If not, see <http://www.gnu.org/licenses/>.
*/

:- module(rdf_graph,
	  [ new_search_graph/1,		% -Graph
	    search_graph_add_node/3,	% !Graph, +Node, +Fields
	    search_graph_add_edge/4,	% !Graph, +From, +Pred, +To
	    search_graph_add_edge/5,	% !Graph, +From, +Pred, +To, +Weight
	    search_graph_set_node_type/3, % !Graph, +Node, +Type

	    search_graph_prune/2,	% !Graph, +Nodes
	    search_graph_prune/3,	% !Graph, +Node, -New
	    search_graph_drains/2,	% +Graph, -Drains
	    search_graph_sources/2,	% +Graph, -Sources

					% Agenda
	    search_graph_next_agenda/3,	% !Graph, -Node, -Score
	    search_graph_agenda/2,	% +Graph, -Pairs

					% Queries
	    search_graph_size/2,	% +Graph, -NodeCount
	    search_graph_node_score/3,	% +Graph, +Node, -Score
	    search_graph_node_type/3,	% +Graph, +Node, -Type
	    search_graph_nodes_score_list/2, % @Graph, -list(Node-Score)

					% RDF Query graph
	    search_graph_rdf_graph/2,	% +Graph, -Triples
	    search_graph_rdf/4,		% +Graph, ?S, ?P, ?O
	    search_graph_rdfs/4		% +Graph, ?S, ?P, ?O
	  ]).
:- use_module(library(record)).
:- use_module(library(rbtrees)).
:- use_module(library(error)).
:- use_module(library(lists)).
:- use_module(library(option)).
:- use_module(library(semweb/rdf_db)).

/** <module> Representing RDF search paths

---++ Introduction

This module represents a search-graph through   an RDF graph starting at
one or more locations in the graph.   This search-graph is essentially a
subgraph of the original graph.  As  it   is  intended  as a dynamically
evolving structure during reasoning, it is  represented as a pure Prolog
term and therefore subject to  backtracking   and  normal  Prolog memory
management.

---++ Representation

The graph is represented as an RB tree from URI to a record

	node_data(Next, Previous, Score, Type)

where Next and Previous are  lists   of  p([fi](Predicate),  Target). If
f(P), the link is rdf(Prev, P, Next), if  i(P), the link is rdf(Next, P,
Prev). The score of the node indicates semantic distance, ranging from 0
(very far way) to 1 (very close).   Finally, Type classifies the node as
one of =start=, =node= or =target=.

The graphs as a whole contains

	$ nodes :
	An rb_tree mapping node-id to a node_data term as described above.
	$ agenda :
	An rb_tree mapping score to a list of nodes with that score.
	$ size :
	Nunber of nodes (for statistical reasons).

---++ Issues

---+++ Shall we deal internally with owl:sameAs?

---+++ How to deal with owl property types?

	$ owl:inverseOf :
	$ owl:TransitiveProperty :
	$ owl:SymmetricProperty :

@tbd	No weight loss passing a blank node?
*/

:- record graph(nodes, agenda, size=0).
:- record node_data(next=[], previous=[], score=1, type=node).


%%	new_search_graph(-Graph)
%
%	Create a new search graph.  The initial graph is empty.

new_search_graph(Graph) :-
	rb_empty(Nodes),
	rb_empty(Agenda),
	make_graph([nodes(Nodes), agenda(Agenda)], Graph).


%%	search_graph_add_node(!Graph, +Node, +Fields)
%
%	Add a (start-)node to the graph

search_graph_add_node(Graph, Node, Fields) :-
	graph_nodes(Graph, Assoc0),
	make_node_data(Fields, NodeData),
	rb_insert(Assoc0, Node, NodeData, Assoc1),
	set_nodes_of_graph(Assoc1, Graph),
	graph_size(Graph, Size0),
	Size is Size0 + 1,
	set_size_of_graph(Size, Graph),
	node_data_score(NodeData, Score0),
	add_new_node_to_agenda(Graph, Node, Score0, Score),
	(   Score == Score0
	->  true
	;   set_score_of_node_data(Score, NodeData)
	).


%%	search_graph_add_edge(!Graph, +From, +Pred, +To) is det.
%%	search_graph_add_edge(!Graph, +From, +Pred, +To, +Options) is det.
%
%	Add a link {From->Pred->To} to the graph.  Options include
%
%	  * weight(+Weight)
%	  Weight to use for the predicate link instead of the default
%
%	  * inverse(true)
%	  <From, Pred, To> is formed from rdf(To,Pred,From)

search_graph_add_edge(Graph, From, Pred, To) :-
	search_graph_add_edge(Graph, From, Pred, To, []).
search_graph_add_edge(Graph, From, Pred, To, Options) :-
	graph_nodes(Graph, Assoc0),
	rb_lookup(From, FromData, Assoc0),
	(   option(inverse(true), Options)
	->  PTerm = i(Pred)
	;   PTerm = f(Pred)
	),
	add_next_node_data(FromData, p(PTerm, To)),
	node_data_score(FromData, Score0),
	(   option(weight(Weight), Options)
	->  Score is Score0 * Weight
	;   update_score(Graph, From, Pred, To, Score0, Score)
	),
	(   rb_lookup(To, ToData, Assoc0)
	->  add_previous_node_data(ToData, p(PTerm, From)),
	    node_data_score(ToData, Score1),
	    join_score(Score1, Score, NewScore),
	    reschedule_node(Graph, To, Score1, NewScore, FinalScore),
	    set_score_of_node_data(FinalScore, ToData)
	;   search_graph_add_node(Graph, To,
				  [ score(Score),
				    previous([p(PTerm, From)])
				  ])
	).

add_next_node_data(Data, Next) :-
	node_data_next(Data, Next0),
	set_next_of_node_data([Next|Next0], Data).

add_previous_node_data(Data, Previous) :-
	node_data_previous(Data, Previous0),
	set_previous_of_node_data([Previous|Previous0], Data).

%%	join_score(+Score1, +Score2, -Score)
%
%	Join two scores comming from independent paths.  Scores are
%	considered probabilities.

join_score(S1, S2, S) :-
	S is 1-((1-S1)*(1-S2)).

%%	update_score(+Graph, +From, +Pred, +To, +Score0, -Score)
%
%	Given that From has score ScoreO, compute the score for To based
%	on Pred. First prototype used a   predefined value per predicate
%	in  the  range  0..1.  Alternatively   we  can  use  statistical
%	measures. Some porposals:
%
%	  * How many subjects of the same type have this property with
%	    this value?
%	  * How many subjects have this object (over this relation)?
%
%	@param	Score Float in the range 0.0..1.0

update_score(_Graph, From, Pred, To, Score0, Score) :-
	by_in_links(From, Pred, To, Factor),
	Score is Score0 * Factor.

by_in_links(_From, Pred, To, Factor) :-
	rdf_estimate_complexity(_, Pred, To, Complexity),
	Factor is 1/Complexity.

%%	search_graph_set_node_type(+Graph, +Node, +Type) is det.
%
%	Set the type for Node in Graph to Type.
%
%	@error	existence_error(node, Node)

search_graph_set_node_type(Graph, Node, Type) :-
	graph_nodes(Graph, Assoc),
	(   rb_lookup(Node, Data, Assoc)
	->  set_type_of_node_data(Type, Data)
	;   existence_error(node, Node)
	).

%%	search_graph_node_type(+Graph, +Node, -Type) is det.
%
%	Type is the classification for Node in Graph.
%
%	@tbd	Generalise modes?

search_graph_node_type(Graph, Node, Type) :-
	graph_nodes(Graph, Assoc),
	(   rb_lookup(Node, Data, Assoc)
	->  node_data_type(Data, Type)
	;   existence_error(node, Node)
	).

%%	search_graph_node_score(+Graph, +Node, -Weight) is det.
%
%	Weight is the current score for Node in Graph.

search_graph_node_score(Graph, Node, Weight) :-
	graph_nodes(Graph, Assoc),
	rb_lookup(Node, Data, Assoc),
	node_data_score(Data, Weight).

%%	search_graph_size(+Graph, -Size) is det.
%
%	Size of the graph in nodes.

search_graph_size(Graph, Size) :-
	graph_size(Graph, Size).


		 /*******************************
		 *	      PRUNING		*
		 *******************************/

%%	search_graph_prune(!Graph, +Nodes) is det.
%%	search_graph_prune(!Graph, +Nodes, -NewLeaves) is det.
%
%	Delete non-target leave nodes from Graph.

search_graph_prune(Graph, Prune) :-
	search_graph_prune(Graph, Prune, _).

search_graph_prune(Graph, Prune, NewLeaves) :-
	is_list(Prune), !,
	find_ancestors(Prune, Graph, Ancestors),
	group_pairs_by_key(Ancestors, Grouped),
	delete_next_links(Grouped, Graph, NewLeaves).
search_graph_prune(Graph, Prune, NewLeaves) :-
	search_graph_prune(Graph, [Prune], NewLeaves).

%%	find_ancestors(+Prune, +Graph, -Ancestors)
%
%	@param Ancestors is a difference-list of terms From-p(Pred,To)

find_ancestors(Prune, Graph, Ancestors) :-
	graph_nodes(Graph, Nodes),
	find_ancestors(Prune, Graph, Nodes, RestNodes,
		       0, Deleted, Ancestors, []),
	graph_size(Graph, Size0),
	Size is Size0 - Deleted,
	set_size_of_graph(Size, Graph),
	set_nodes_of_graph(RestNodes, Graph).

find_ancestors([], _, Nodes, Nodes, C, C, A, A).
find_ancestors([H|T], Graph, Nodes0, Nodes, C0, C, A, AT) :-
	rb_lookup(H, Data, Nodes0),
	(   node_data_type(Data, target)
	->  find_ancestors(T, Graph, Nodes0, Nodes, C0, C, A, AT)
	;   node_data_next(Data, Next),
	    Next \== []
	->  permission_error(Graph, prune, H)
	;   node_data_previous(Data, Prev),
	    make_a_list(Prev, H, A, AT0),
	    rb_delete(Nodes0, H, Nodes1),
	    C1 is C0 + 1,
	    find_ancestors(T, Graph, Nodes1, Nodes, C1, C, AT0, AT)
	).

make_a_list([], _, A, A).
make_a_list([p(Pred, From)|T0], To, [From-p(Pred,To)|T1], T) :-
	make_a_list(T0, To, T1, T).


delete_next_links([], _, []).
delete_next_links([H-DelNext|T], Graph, NewLeaves) :-
	graph_nodes(Graph, Nodes),
	rb_lookup(H, Data, Nodes),
	node_data_next(Data, Next0),
	subtract_set(Next0, DelNext, Next),
	set_next_of_node_data(Next, Data),
	(   Next == []
	->  NewLeaves = [H|NewLeaves1],
	    delete_next_links(T, Graph, NewLeaves1)
	;   delete_next_links(T, Graph, NewLeaves)
	).

%%	subtract_set(+Set, +Subtract, -Remaining) is det.
%
%	Remaining are the elements from Set   that  are not in Subtract.
%	None of the sets is ordered. We have to apply heuristics to find
%	out the best approach.

subtract_set(Set, [One], Remaining) :- !,
	selectchk(One, Set, Remaining).
subtract_set(Set, Subtract, Remaining) :-
	sort(Set, SSet),
	sort(Subtract, SSubstract),
	ord_subtract(SSet, SSubstract, Remaining).


%%	search_graph_nodes_score_list(@Graph, -Scores:list(Node-Score)) is det.
%
%	Scores is a list with Node-Score pairs, with all nodes in
%	Graph.

search_graph_nodes_score_list(Graph, Scores) :-
	graph_nodes(Graph, Assoc),
	rb_visit(Assoc, NodeList),
	scores(NodeList, Scores).

scores([], []).
scores([Node-Data|T0], [Node-Score|T]) :-
	node_data_score(Data, Score), !,
	scores(T0, T).


%%	search_graph_drains(@Graph, -Nodes)
%
%	List of all drains (end-of-path)

search_graph_drains(Graph, Nodes) :-
	graph_nodes(Graph, Assoc),
	rb_visit(Assoc, NodeList),
	drains(NodeList, Nodes).

drains([], []).
drains([Node-Data|T0], [Node|T]) :-
	node_data_next(Data, []), !,
	drains(T0, T).
drains([_|T0], T) :-
	drains(T0, T).


%%	search_graph_sources(@Graph, -Nodes)
%
%	List of all drains (start-of-path)

search_graph_sources(Graph, Nodes) :-
	graph_nodes(Graph, Assoc),
	rb_visit(Assoc, NodeList),
	sources(NodeList, Nodes).

sources([], []).
sources([Node-Data|T0], [Node|T]) :-
	node_data_previous(Data, []), !,
	sources(T0, T).
sources([_|T0], T) :-
	sources(T0, T).


		 /*******************************
		 *	      AGENDA		*
		 *******************************/

%%	add_new_node_to_agenda(!Graph, +Node, +Score, -FinalScore) is det.
%
%	Add a new node to the agenda. We   want  a unique score for each
%	node, so we can find the  node   from  the  score. Therefore, if
%	there is already a node with this score, we use a slightly lower
%	score, etc.
%
%	@tbd: use rb_update/5.

add_new_node_to_agenda(Graph, Node, Score, FinalScore) :-
	must_be(between(0.0,1.0), Score),
	graph_agenda(Graph, RBTree0),
	into_agenda(RBTree0, Node, Score, RBTree, FinalScore),
	set_agenda_of_graph(RBTree, Graph).

into_agenda(RBTree0, Node, Score, RBTree, FinalScore) :-
	rb_insert_new(RBTree0, Score, Node, RBTree), !,
	FinalScore = Score.
into_agenda(RBTree0, Node, Score, RBTree, FinalScore) :-
	rb_previous(RBTree0, Score, PrevScore, _), !,
	(   Score - PrevScore =< 0.001
	->  between(1, infinite, Step),
	    FinalScore is PrevScore - 0.000000001*Step
	;   between(1, infinite, Step),
	    FinalScore is Score - 0.001*Step
	),
	rb_insert_new(RBTree0, FinalScore, Node, RBTree), !.
into_agenda(RBTree0, Node, Score, RBTree, FinalScore) :-
	FinalScore is Score - 0.001,
	rb_insert_new(RBTree0, FinalScore, Node, RBTree).

%%	reschedule_node(!Graph, +Node, +OldScore, -NewScore)
%
%	Change the score of Node in  the   agenda  of  Graph. Leaves the
%	agenda untouched if Node is not in   it as this implies the node
%	is already expanded.

reschedule_node(Graph, Node, OldScore, NewScore, FinalScore) :-
	graph_agenda(Graph, RBTree0),
	(   rb_delete(OldScore, Node, RBTree0)
	->  into_agenda(RBTree0, Node, NewScore, RBTree, FinalScore),
	    set_agenda_of_graph(RBTree, Graph)
	;   FinalScore = NewScore	% already expanded
	).

%%	search_graph_next_agenda(!Graph, -Node, -Score) is semidet.
%
%	Fetches the best node from the agenda and deletes it

search_graph_next_agenda(Graph, Node, Score) :-
	graph_agenda(Graph, RBTree0),
	rb_del_max(RBTree0, Score, Node, RBTree),
	set_agenda_of_graph(RBTree, Graph).


%%	search_graph_agenda(+Graph, -Pairs)
%
%	Get agenda from the graph, showing   not-expanded nodes as pairs
%	Score-Node.  E.g.
%
%	==
%	Agenda = [1-a, 0.5-b, 0.25-c]
%	==

search_graph_agenda(Graph, Pairs) :-
	graph_agenda(Graph, RBTree),
	rb_visit(RBTree, Pairs0),
	reverse(Pairs0, Pairs).


		 /*******************************
		 *	       RDF		*
		 *******************************/

%%	search_graph_rdf_graph(+Graph, -RDF:list(rdf(S,P,O))) is det.
%
%	Create a list of rdf(S,P,O) triples from Graph.

search_graph_rdf_graph(Graph, RDF) :-
	graph_nodes(Graph, Nodes),
	rb_visit(Nodes, Pairs),
	phrase(graph_to_rdf(Pairs), RDF).

graph_to_rdf([]) -->
	[].
graph_to_rdf([R-Data|T]) -->
	{ node_data_next(Data, Next)
	},
	node_to_rdf(Next, R),
	graph_to_rdf(T).

node_to_rdf([], _) -->
	[].
node_to_rdf([p(P, O)|T], S) -->
	link_to_rdf(P, S, O),
	node_to_rdf(T, S).

link_to_rdf(f(P), S, O) -->
	[ rdf(S, P, O) ].
link_to_rdf(i(P), S, O) -->
	[ rdf(O, P, S) ].

%%	search_graph_rdf(+Graph, ?S, ?P, ?O) is nondet.
%
%	rdf(S,P,O) is an RDF statement in Graph. Provides indexed access
%	if either S or O are given.

:- rdf_meta
	search_graph_rdf(+, r, r, o),
	search_graph_rdfs(+, r, r, o).

search_graph_rdf(Graph, S, P, O) :-
	ground(S), ground(O), !,
	graph_nodes(Graph, Nodes),
	rb_lookup(S, SData, Nodes),
	rb_lookup(O, OData, Nodes),
	(   node_data_next(SData, SN),
	    dmember(P, p(f(P),O), SN)
	;   node_data_next(OData, PN),
	    dmember(P, p(i(P),S), PN)
	).
search_graph_rdf(Graph, S, P, O) :-
	(   search_graph_rdf_(Graph, S, f(P), O)
	;   search_graph_rdf_(Graph, O, i(P), S)
	).

search_graph_rdf_(Graph, S, P, O) :-
	graph_nodes(Graph, Nodes),
	(   ground(S)
	->  rb_lookup(S, SData, Nodes),
	    node_data_next(SData, List),
	    member(p(P, O), List)
	;   ground(O)
	->  rb_lookup(O, OData, Nodes),
	    node_data_previous(OData, List),
	    member(p(P, S), List)
	;   rb_in(S, SData, Nodes),
	    node_data_next(SData, List),
	    member(p(P, O), List)
	).

dmember(Ground, Element, List) :-
	ground(Ground), !,
	memberchk(Element, List).
dmember(_Ground, Element, List) :-
	member(Element, List), !.

%%	search_graph_rdfs(+Graph, ?S, ?P, ?O) is nondet.
%
%	rdf(S,P0,O)  is  an  RDF  statement  in    Graph  and  P0  is  a
%	subPropertyOf P. Provides indexed access if   either  S or O are
%	given.
%
%	@see This is the same as rdf_has/3 from library rdf_db.pl

search_graph_rdfs(Graph, S, P, O) :-
	search_graph_rdf(Graph, S, P0, O),
	rdf_reachable(P0, rdfs:subPropertyOf, P).