aers_rewrite/commit

cleanup the records, mainly removing blank nodes

authorCWI eculture 2
Tue Jan 22 10:31:31 2013 +0100
committerCWI eculture 2
Tue Jan 22 10:31:31 2013 +0100
commitb934224aa2ca0870b761f09a9fcfbae59012e335
treedba5e57658e41cc9661bfbdf78fb1fb11a14b196
parent22427e98f0c9b27843cadf9d27e1fbaa71b0037f
Diff style: patch stat
diff --git a/config-available/aers_rewrite.pl b/config-available/aers_rewrite.pl
index 0732b0a..6511182 100644
--- a/config-available/aers_rewrite.pl
+++ b/config-available/aers_rewrite.pl
@@ -6,5 +6,6 @@
 :- use_module(library(semweb/rdf_db)).
 
 :- rdf_register_ns(aers, 'http://aers.data2semantics.org/vocab/').
+:- rdf_register_ns(aers_r, 'http://aers.data2semantics.org/report/').
 
 :- use_module(library(aers_rewrite)).
diff --git a/lib/aers_rewrite.pl b/lib/aers_rewrite.pl
index 1eaede5..e5aaa0d 100644
--- a/lib/aers_rewrite.pl
+++ b/lib/aers_rewrite.pl
@@ -1,11 +1,14 @@
 :- module(aers_rewrite,
-	  [test/0,
+	  [convert/0,
+	   convert/2,
+	   test/0,
 	   load/0,
-	   load_year/1,
-	   rewrite/0
+	   load_year/2,
+	   rewrite_graph/1
 	  ]).
 
 :- use_module(library(semweb/rdf_db)).
+:- use_module(library(semweb/rdf_turtle_write)).
 :- use_module(library(xmlrdf/xmlrdf)).
 :- use_module(library(xmlrdf/rdf_rewrite)).
 :- use_module(library(xmlrdf/rdf_convert_util)).
@@ -13,61 +16,218 @@
 :- rdf_register_ns(aers, 'http://aers.data2semantics.org/vocab/').
 
 user:file_search_path(data, '../fda_data').
+user:file_search_path(aers_rdf, '../rdf/aers').
+
+:- set_prolog_stack(global, limit(2*10**9)).
+
+year('04').
+year('05').
+year('06').
+year('07').
+year('08').
+year('09').
+
+:- debug(convert).
+:- debug(rdf_rewrite).
+
+convert :-
+	(   year(Year),
+	    atom_concat('aers_20', Year, Graph),
+	    convert(Year, Graph),
+	    fail
+	;   true
+	).
+
+convert(Year, Graph) :-
+	debug(convert, 'convert ~w to ~w', [Year, Graph]),
+	atom_concat(Graph, '.ttl', FileName),
+	absolute_file_name(aers_rdf(FileName), File),
+	load_year(Year, Graph),
+	debug(convert, 'rewrite ~w', [Graph]),
+	rewrite_graph(Graph),
+	debug(convert, 'save ~w', [Graph]),
+	rdf_save_turtle(File, [graph(Graph)]),
+	rdf_retractall(_,_,_,Graph).
 
 test :-
 	absolute_file_name(data('Adr_test.sgm'), TestFile),
-	load(TestFile).
+	load(test, TestFile).
 
 load :-
 	absolute_file_name(data(sgml), Dir,
 			   [ file_type(directory)
 			   ]),
+	Graph = data,
 	atom_concat(Dir, '/*.SGM', Pattern),
 	expand_file_name(Pattern, Files),
-	maplist(load, Files).
+	maplist(load(Graph), Files).
 
-load_year(Year) :-
+load_year(Year, Graph) :-
 	absolute_file_name(data(sgml), Dir,
 			   [ file_type(directory)
 			   ]),
 	concat_atom([Dir, '/ADR',Year,'*.SGM'], Pattern),
 	expand_file_name(Pattern, Files),
-	maplist(load, Files).
+	maplist(load(Graph), Files).
 
-load(File) :-
+load(Graph, File) :-
 	rdf_current_ns(aers, Prefix),
 	load_xml_as_rdf(File,
 			[ dialect(xml),
 			  unit(safetyreport),
-			  prefix(Prefix)
+			  prefix(Prefix),
+			  graph(Graph)
 			]).
 
-rewrite :-
-	rdf_rewrite(data).
+rewrite_graph(Graph) :-
+	rdf_rewrite(Graph).
 
+% report
+%
 % URIs of the reports are based on the safetyreportid
 
 assign_uris @@
-{S, aers:safetyreportid, literal(Id)}\
-{ S } <=>
-	literal_to_id(['report', Id], aers, URI),
-	{ URI }.
+{S, aers:safetyreportid, Id}\
+{S}
+<=>
+literal_to_id(Id, aers_r, URI),
+{URI}.
 
-% Reactions are direct properties of the report and not the patient
+% reaction
+%
+% Reactions are direct properties of the report
 
 reaction @@
-{Patient, aers:reaction, Reaction},
 {Report, aers:patient, Patient}\
-{Patient, aers:reaction, Reaction}
+{Patient, aers:reaction, BNode},
+{BNode, aers:reactionmeddrapt, Reaction},
+{BNode, _, _}
 <=>
 { Report, aers:reaction, Reaction }.
 
-% Drugs are direct properties of the report and not the patient
+% drug
+%
+% Drugs are direct properties of the report
+% characterization codes are URIs
 
 drug @@
-{Patient, aers:drug, Drug},
 {Report, aers:patient, Patient}\
-{Patient, aers:drug, Drug}
+{Patient, aers:drug, Drug},
+{Drug, aers:drugcharacterization, literal(C)}
+<=>
+drug_characterization_uri(C, URI),
+{ Report, aers:drug, Drug },
+{ Drug, aers:drugcharacterization, URI}.
+
+% Patient
+%
+% deathDate is a direct property of the patient
+% gender has a URI
+
+patient_death @@
+{Patient, aers:patientdeath, BNode},
+{BNode, aers:patientdeathdate, Date},
+{BNode, _, _}
+<=>
+{Patient, aers:patientdeathdate, Date}.
+
+patient_gender @@
+{Patient, aers:patientsex, literal(C)}
+<=>
+gender_uri(C, URI),
+{Patient, aers:patientsex, URI}.
+
+outcome @@
+{Report, P, literal('1')}
+<=>
+outcome_uri(P, Outcome),
+{Report, aers:outcome, Outcome}.
+
+outcome @@
+{_, aers:serious, literal('1')}
 <=>
-{ Report, aers:drug, Drug }.
+true.
+
+% sender
+%
+% Sender is a direct property
+
+sender @@
+{Record, aers:sender, BNode},
+{BNode, aers:senderorganization, Organization},
+{BNode, _, _}
+<=>
+{Record, aers:sender, Organization}.
+
+% Source
+%
+% qualification code and code are direct properties of
+% the record
+% qualification code has a URI
+
+source @@
+{Report, aers:primarysource, Source},
+{Source, aers:qualification, literal(Q)},
+{Source, aers:reportercountry, Country} ?,
+{Source, _, _}
+<=>
+source_qualification_uri(Q, Qualification),
+{Report, aers:sourceQualification, Qualification},
+{Report, aers:sourceCountry, Country}.
+
+% meaningless triples
+%
+% date format is always 102, and thus not needed
+% we don't need empty literals
+
+format_properties @@
+{_, P, _}
+<=>
+format_property(P).
+
+empty_literals @@
+{_,_,literal('')}
+<=>
+true.
+
+
+format_property(P) :- rdf_equal(P, aers:drugenddateformat).
+format_property(P) :- rdf_equal(P, aers:drugstartdateformat).
+format_property(P) :- rdf_equal(P, aers:patientdeathdateformat).
+format_property(P) :- rdf_equal(P, aers:receiptdateformat).
+format_property(P) :- rdf_equal(P, aers:receivedateformat).
+format_property(P) :- rdf_equal(P, aers:transmissiondateformat).
+%format_property(P) :- rdf_equal(aers:patientonsetageunit, P).
+
+source_qualification_uri('1', C) :- !, rdf_equal(C, aers:'source/physician').
+source_qualification_uri('2', C) :- !, rdf_equal(C, aers:'source/pharmacist').
+source_qualification_uri('3', C) :- !, rdf_equal(C, aers:'source/healthProfessional').
+source_qualification_uri('4', C) :- !, rdf_equal(C, aers:'source/lawyer').
+source_qualification_uri('5', C) :- rdf_equal(C, aers:'source/nonHealthProfessional').
+
+gender_uri('1', C) :- !, rdf_equal(C, aers:'gender/male').
+gender_uri('2', C) :- rdf_equal(C, aers:'gender/female').
+
+drug_characterization_uri('1', C) :- !, rdf_equal(C, aers:'drug/role/suspect').
+drug_characterization_uri('2', C) :- !, rdf_equal(C, aers:'drug/role/concomitant').
+drug_characterization_uri('3', C) :- rdf_equal(C, aers:'drug/role/interacting').
+
+outcome_uri(P, C) :-
+	rdf_equal(P, aers:seriousnessdeath),!,
+	rdf_equal(C, aers:'outcome/death').
+outcome_uri(P, C) :-
+	rdf_equal(P, aers:seriousnesslifethreatening),!,
+	rdf_equal(C, aers:'outcome/lifethreatening').
+outcome_uri(P, C) :-
+	rdf_equal(P, aers:seriousnesshospitalization),!,
+	rdf_equal(C, aers:'outcome/hospitalization').
+outcome_uri(P, C) :-
+	rdf_equal(P, aers:seriousnessdisabling),!,
+	rdf_equal(C, aers:'outcome/disabling').
+outcome_uri(P, C) :-
+	rdf_equal(P, aers:seriousnesscongenitalanomaly),!,
+	rdf_equal(C, aers:'outcome/congenitalanomaly').
+outcome_uri(P, C) :-
+	rdf_equal(P, aers:seriousnessother),
+	rdf_equal(C, aers:'outcome/other').