35
36:- module(rdf_file_type,
37 [ rdf_guess_data_format/2, 38 rdf_guess_format_and_load/2 39 ]). 40:- use_module(library(semweb/rdf_db)). 41:- use_module(library(memfile)). 42:- use_module(library(sgml)). 43:- use_module(library(lists)). 44:- use_module(library(apply)). 45:- use_module(library(option)). 46:- if(exists_source(library(archive))). 47:- use_module(library(archive)). 48:- endif. 49
53
54
63
%!  rdf_guess_format_and_load(+Stream, +Options)
%
%   Load RDF data from Stream.  This clause covers the case where the
%   caller already supplied a format(_) option: no guessing is done
%   and Options is handed to rdf_load/2 unchanged.  The cut commits
%   to this clause, so the remaining clauses are not tried when a
%   format is given.
rdf_guess_format_and_load(In, LoadOptions) :-
    option(format(_), LoadOptions),
    !,
    rdf_load(stream(In), LoadOptions).
:- if(current_predicate(archive_data_stream/3)).
% Archive-aware variant, compiled only when archive_data_stream/3 is
% available.  Stream is opened with archive_open/3 using format(all)
% and format(raw); every data stream enumerated by
% archive_data_stream/3 is processed by load_archive_member/3 and
% closed afterwards, whether processing succeeded, failed or raised.
% The archive handle itself is released by archive_close/1.
rdf_guess_format_and_load(Stream, Options) :-
    setup_call_cleanup(
        archive_open(Stream, Archive, [format(all),format(raw)]),
        forall(archive_data_stream(Archive, DataStream,
                                   [meta_data(MetaData)]),
               call_cleanup(
                   load_archive_member(DataStream, MetaData, Options),
                   close(DataStream))),
        archive_close(Archive)).

%   load_archive_member(+DataStream, +MetaData, +Options)
%
%   Process a single data stream from the archive.  The base URI for
%   the member is derived with member_base_uri/3 (falling back to
%   'http://example.org/' if no base_uri option results) and recorded
%   as the file name of DataStream.  Members whose file base name is
%   recognised by non_rdf_file/1 are skipped; otherwise the format is
%   guessed and the member is loaded.  If guessing fails, the member
%   is silently skipped.
load_archive_member(DataStream, MetaData, Options) :-
    member_base_uri(MetaData, Options, MemberOptions),
    option(base_uri(Base), MemberOptions, 'http://example.org/'),
    set_stream(DataStream, file_name(Base)),
    (   file_base_name(Base, FileName),
        non_rdf_file(FileName)
    ->  true
    ;   rdf_guess_data_format(DataStream, Format)
    ->  rdf_load(stream(DataStream), [format(Format)|MemberOptions])
    ;   true
    ).
87
%   member_base_uri(+MetaData, +Options0, -Options)
%
%   Compute the load options for one archive member.  If MetaData
%   has exactly one element, Options is Options0 unchanged.
%   Otherwise the `name` key of every MetaData dict except the last
%   is appended, separated by `/`, to the base_uri option taken from
%   Options0 (default 'http://archive.org'), and the resulting URI
%   replaces the base_uri option.
member_base_uri(MetaData, Options0, Options) :-
    (   MetaData = [_],
        Options = Options0
    ->  true
    ;   append(Path, [_], MetaData),
        maplist(get_dict(name), Path, Segments),
        select_option(base_uri(Root), Options0, Rest,
                      'http://archive.org'),
        atomic_list_concat([Root|Segments], /, MemberURI),
        Options = [base_uri(MemberURI)|Rest]
    ).
:- else.
% Variant used when archive_data_stream/3 is not available: guess
% the serialization of the stream and load it with the guessed
% format prepended to the caller's options.
rdf_guess_format_and_load(In, LoadOptions) :-
    rdf_guess_data_format(In, GuessedFormat),
    rdf_load(stream(In), [format(GuessedFormat)|LoadOptions]).
:- endif.
%!  non_rdf_file(+File) is semidet.
%
%   True if File (a file base name, without directory) should be
%   skipped because it is known not to contain RDF.  This is decided
%   by the file extension (see non_rdf_ext/1) or, failing that, by
%   the name without extension (see non_rdf_base/1).
%
%   Both comparisons are case-insensitive: the extension and the
%   base are downcased before lookup, so `paper.PDF` and `README`
%   are recognised just like `paper.pdf` and `readme`.
non_rdf_file(File) :-
    file_name_extension(Base, Ext, File),
    (   downcase_atom(Ext, LowerExt),
        non_rdf_ext(LowerExt)
    ->  true
    ;   downcase_atom(Base, LowerBase),
        non_rdf_base(LowerBase)
    ).
108
%   non_rdf_ext(?Ext)
%
%   File name extensions (without the dot) of files that are not
%   attempted as RDF.
non_rdf_ext(pdf).
non_rdf_ext(txt).
non_rdf_ext(md).
non_rdf_ext(doc).
113
%   non_rdf_base(?Base)
%
%   Lower-case file names (extension removed) of files that are not
%   attempted as RDF.
non_rdf_base(readme).
non_rdf_base(todo).
116
125
%!  rdf_guess_data_format(+Stream, ?Format)
%
%   Determine the RDF serialization of the data on Stream.  The
%   alternatives are tried in this order and the first that applies
%   wins:
%
%     1. Format is already instantiated: it is left as is.
%     2. xml_doctype/2 succeeds on Stream: Format is `xml`.
%     3. Stream has a file_name property whose extension is known
%        to rdf_db:rdf_file_type/2: Format is the associated type.
%     4. Otherwise Format is `turtle`.
rdf_guess_data_format(Stream, Format) :-
    (   nonvar(Format)
    ->  true
    ;   xml_doctype(Stream, _)
    ->  Format = xml
    ;   stream_property(Stream, file_name(File)),
        file_name_extension(_, Ext, File),
        rdf_db:rdf_file_type(Ext, Format)
    ->  true
    ;   Format = turtle
    ).
138
139
153
%!  xml_doctype(+Stream, -DocType) is semidet.
%
%   True when the start of the input on Stream looks like XML whose
%   first element carries a namespace declaration; DocType is the
%   name of that element as reported by xml_doctype_2/2.
%
%   When peek_string/3 is available, the first 4096 characters are
%   peeked and parsed from a temporary in-memory stream, which is
%   closed afterwards.  Otherwise Stream itself is handed to
%   xml_doctype_2/2.
:- if(current_predicate(peek_string/3)).
xml_doctype(Stream, DocType) :-
    peek_string(Stream, 4096, Prefix),
    setup_call_cleanup(
        open_string_stream(Prefix, PrefixStream),
        xml_doctype_2(PrefixStream, DocType),
        close(PrefixStream)).
:- else.
xml_doctype(Stream, DocType) :-
    xml_doctype_2(Stream, DocType).
:- endif.
%   xml_doctype_2(+Stream, -DocType) is semidet.
%
%   Run the SGML parser on Stream with on_begin/3 and on_cdata/2 as
%   callbacks.  The parse is expected to be aborted by an exception:
%   this predicate succeeds only if the exception is tag(DocType).
%   If the parse completes without raising, or raises anything else,
%   the predicate fails.  All exceptions are caught here and none is
%   propagated to the caller.  The parser is set up by make_parser/3
%   and always released by cleanup_parser/3.
xml_doctype_2(Stream, DocType) :-
    ParseOptions = [ source(Stream),
                     max_errors(1),
                     syntax_errors(quiet),
                     call(begin, on_begin),
                     call(cdata, on_cdata)
                   ],
    catch(setup_call_cleanup(
              make_parser(Stream, Parser, State),
              sgml_parse(Parser, ParseOptions),
              cleanup_parser(Stream, Parser, State)),
          Ball,
          true),
    nonvar(Ball),               % no exception means no verdict: fail
    Ball = tag(DocType).
179
%   make_parser(+Stream, -Parser, -State)
%
%   Create an SGML parser in the `xmlns` dialect.  State is
%   state(Pos), where Pos is the position of Stream before parsing;
%   it is recorded first so that cleanup_parser/3 can restore it.
make_parser(Stream, Parser, State) :-
    State = state(Position),
    stream_property(Stream, position(Position)),
    new_sgml_parser(Parser, []),
    set_sgml_parser(Parser, dialect(xmlns)).
184
%   cleanup_parser(+Stream, +Parser, +State)
%
%   Release Parser and move Stream back to the position stored in
%   State, which has the form state(Pos).
cleanup_parser(Stream, Parser, State) :-
    State = state(Position),
    free_sgml_parser(Parser),
    set_stream_position(Stream, Position).
188
%   on_begin(+Tag, +Attributes, +Parser)
%
%   Parser callback for the start of an element.  If Attributes
%   contains an attribute of the form xmlns:_=_, the parse is aborted
%   by throwing tag(Tag).  Otherwise the callback fails.
on_begin(Element, Attrs, _Parser) :-
    NamespaceDecl = (xmlns:_ = _),
    memberchk(NamespaceDecl, Attrs),
    throw(tag(Element)).
192
%   on_cdata(+CDATA, +Parser)
%
%   Parser callback for character data.  Always aborts the parse by
%   throwing error(cdata).
on_cdata(_Text, _Parser) :-
    Abort = error(cdata),
    throw(Abort).
195
196
197open_string_stream(String, Stream) :-
198 new_memory_file(MF),
199 setup_call_cleanup(
200 open_memory_file(MF, write, Out),
201 format(Out, '~s', [String]),
202 close(Out)),
203 open_memory_file(MF, read, Stream,
204 [ free_on_close(true)
205 ])