View source with raw comments or as raw
    1/*  Part of ClioPatria SeRQL and SPARQL server
    2
    3    Author:        Jan Wielemaker
    4    E-mail:        J.Wielemaker@vu.nl
    5    WWW:           http://www.swi-prolog.org
    6    Copyright (c)  2011-2018, University of Amsterdam,
    7                              VU University Amsterdam
    8    All rights reserved.
    9
   10    Redistribution and use in source and binary forms, with or without
   11    modification, are permitted provided that the following conditions
   12    are met:
   13
   14    1. Redistributions of source code must retain the above copyright
   15       notice, this list of conditions and the following disclaimer.
   16
   17    2. Redistributions in binary form must reproduce the above copyright
   18       notice, this list of conditions and the following disclaimer in
   19       the documentation and/or other materials provided with the
   20       distribution.
   21
   22    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   23    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   24    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   25    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
   26    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   27    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   28    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   29    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   30    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   31    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
   32    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   33    POSSIBILITY OF SUCH DAMAGE.
   34*/
   35
   36:- module(rdf_file_type,
   37          [ rdf_guess_data_format/2,    % +Stream, ?Format
   38            rdf_guess_format_and_load/2 % +Stream, +Options
   39          ]).   40:- use_module(library(semweb/rdf_db)).   41:- use_module(library(memfile)).   42:- use_module(library(sgml)).   43:- use_module(library(lists)).   44:- use_module(library(apply)).   45:- use_module(library(option)).   46:- if(exists_source(library(archive))).   47:- use_module(library(archive)).   48:- endif.

Load RDF data from unknown file-type

*/

 rdf_guess_format_and_load(+Stream, +Options) is det
Guess the RDF format in Stream and load it. Stream must be a repositional stream. Options are passed to rdf_load/2. In addition, it processed the following options:
filename(filename)
Name of the uploaded file.
   64rdf_guess_format_and_load(Stream, Options) :-
   65    option(format(_), Options),
   66    !,
   67    rdf_load(stream(Stream), Options).
   68:- if(current_predicate(archive_data_stream/3)).   69rdf_guess_format_and_load(Stream, Options) :-
   70    setup_call_cleanup(
   71        archive_open(Stream, Archive, [format(all),format(raw)]),
   72        forall(archive_data_stream(Archive, DataStream, [meta_data(MetaData)]),
   73               call_cleanup(
   74                   ( member_base_uri(MetaData, Options, Options2),
   75                     option(base_uri(Base), Options2, 'http://example.org/'),
   76                     set_stream(DataStream, file_name(Base)),
   77                     (   file_base_name(Base, FileName),
   78                         non_rdf_file(FileName)
   79                     ->  true
   80                     ;   rdf_guess_data_format(DataStream, Format)
   81                     ->  rdf_load(stream(DataStream), [format(Format)|Options2])
   82                     ;   true
   83                     )
   84                   ),
   85                   close(DataStream))),
   86        archive_close(Archive)).
   87
   88member_base_uri([_], Options, Options) :- !.
   89member_base_uri(MetaData, Options0, Options) :-
   90    append(MetaPath, [_], MetaData),
   91    maplist(get_dict(name), MetaPath, MetaSegments),
   92    select_option(base_uri(Base0), Options0, Options1, 'http://archive.org'),
   93    atomic_list_concat([Base0|MetaSegments], /, Base),
   94    Options = [base_uri(Base)|Options1].
   95:- else.   96rdf_guess_format_and_load(Stream, Options) :-
   97    rdf_guess_data_format(Stream, Format),
   98    rdf_load(stream(Stream), [format(Format)|Options]).
   99:- endif.  100
  101non_rdf_file(File) :-
  102    file_name_extension(Base, Ext, File),
  103    (   non_rdf_ext(Ext)
  104    ->  true
  105    ;   downcase_atom(Base, Lower),
  106        non_rdf_base(Lower)
  107    ).
  108
  109non_rdf_ext(pdf).
  110non_rdf_ext(txt).
  111non_rdf_ext(md).
  112non_rdf_ext(doc).
  113
  114non_rdf_base(readme).
  115non_rdf_base(todo).
 rdf_guess_data_format(+Stream, ?Format)
Guess the format of an RDF file from the actual content. Currently, this seeks for a valid XML document upto the rdf:RDF element before concluding that the file is RDF/XML. Otherwise it assumes that the document is Turtle.
To be done
- Recognise Turtle variations from content
  126rdf_guess_data_format(_, Format) :-
  127    nonvar(Format),
  128    !.
  129rdf_guess_data_format(Stream, xml) :-
  130    xml_doctype(Stream, _),
  131    !.
  132rdf_guess_data_format(Stream, Format) :-
  133    stream_property(Stream, file_name(File)),
  134    file_name_extension(_, Ext, File),
  135    rdf_db:rdf_file_type(Ext, Format),
  136    !.
  137rdf_guess_data_format(_, turtle).
 xml_doctype(+Stream, -DocType) is semidet
Parse a stream and get the name of the first XML element and demand that this element defines XML namespaces. Fails if the document is illegal XML before the first element.

Note that it is not possible to define valid RDF/XML without namespaces, while it is not possible to define a valid absolute Turtle URI (using <URI>) with a valid xmlns declaration.

Arguments:
Stream- denotes the input. If peek_string/3 is provided (SWI-Prolog version 7), it is not necessary that the stream can be repositioned. Older versions require a repositionable stream.
  154:- if(current_predicate(peek_string/3)).  155xml_doctype(Stream, DocType) :-
  156    peek_string(Stream, 4096, Start),
  157    setup_call_cleanup(
  158        open_string_stream(Start, In),
  159        xml_doctype_2(In, DocType),
  160        close(In)).
  161:- else.  162xml_doctype(Stream, DocType) :-
  163    xml_doctype_2(Stream, DocType).
  164:- endif.  165
  166xml_doctype_2(Stream, DocType) :-
  167    catch(setup_call_cleanup(make_parser(Stream, Parser, State),
  168                             sgml_parse(Parser,
  169                                        [ source(Stream),
  170                                          max_errors(1),
  171                                          syntax_errors(quiet),
  172                                          call(begin, on_begin),
  173                                          call(cdata, on_cdata)
  174                                        ]),
  175                             cleanup_parser(Stream, Parser, State)),
  176          E, true),
  177    nonvar(E),
  178    E = tag(DocType).
  179
  180make_parser(Stream, Parser, state(Pos)) :-
  181    stream_property(Stream, position(Pos)),
  182    new_sgml_parser(Parser, []),
  183    set_sgml_parser(Parser, dialect(xmlns)).
  184
  185cleanup_parser(Stream, Parser, state(Pos)) :-
  186    free_sgml_parser(Parser),
  187    set_stream_position(Stream, Pos).
  188
  189on_begin(Tag, Attributes, _Parser) :-
  190    memberchk(xmlns:_=_, Attributes),
  191    throw(tag(Tag)).
  192
  193on_cdata(_CDATA, _Parser) :-
  194    throw(error(cdata)).
  195
  196
  197open_string_stream(String, Stream) :-
  198    new_memory_file(MF),
  199    setup_call_cleanup(
  200        open_memory_file(MF, write, Out),
  201        format(Out, '~s', [String]),
  202        close(Out)),
  203    open_memory_file(MF, read, Stream,
  204                     [ free_on_close(true)
  205                     ])