View source with formatted comments or as raw
    1/*  Part of ClioPatria SeRQL and SPARQL server
    2
    3    Author:        Jan Wielemaker
    4    E-mail:        J.Wielemaker@vu.nl
    5    WWW:           http://www.swi-prolog.org
    6    Copyright (c)  2011-2018, University of Amsterdam,
    7                              VU University Amsterdam
    8    All rights reserved.
    9
   10    Redistribution and use in source and binary forms, with or without
   11    modification, are permitted provided that the following conditions
   12    are met:
   13
   14    1. Redistributions of source code must retain the above copyright
   15       notice, this list of conditions and the following disclaimer.
   16
   17    2. Redistributions in binary form must reproduce the above copyright
   18       notice, this list of conditions and the following disclaimer in
   19       the documentation and/or other materials provided with the
   20       distribution.
   21
   22    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   23    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   24    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   25    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
   26    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   27    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   28    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   29    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   30    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   31    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
   32    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   33    POSSIBILITY OF SUCH DAMAGE.
   34*/
   35
   36:- module(rdf_file_type,
   37          [ rdf_guess_data_format/2,    % +Stream, ?Format
   38            rdf_guess_format_and_load/2 % +Stream, +Options
   39          ]).   40:- use_module(library(semweb/rdf_db)).   41:- use_module(library(memfile)).   42:- use_module(library(sgml)).   43:- use_module(library(lists)).   44:- use_module(library(apply)).   45:- use_module(library(option)).   46:- if(exists_source(library(archive))).   47:- use_module(library(archive)).   48:- endif.   49
   50/** <module> Load RDF data from unknown file-type
   51
   52*/
   53
   54
   55%!  rdf_guess_format_and_load(+Stream, +Options) is det.
   56%
   57%   Guess the RDF format in Stream  and   load  it. Stream must be a
   58%   _repositional_ stream. Options are  passed   to  rdf_load/2.  In
   59%   addition, it processed the following options:
   60%
   61%     - filename(filename)
   62%     Name of the uploaded file.
   63
   64rdf_guess_format_and_load(Stream, Options) :-
   65    option(format(_), Options),
   66    !,
   67    rdf_load(stream(Stream), Options).
   68:- if(current_predicate(archive_data_stream/3)).   69rdf_guess_format_and_load(Stream, Options) :-
   70    setup_call_cleanup(
   71        archive_open(Stream, Archive, [format(all),format(raw)]),
   72        forall(archive_data_stream(Archive, DataStream, [meta_data(MetaData)]),
   73               call_cleanup(
   74                   ( member_base_uri(MetaData, Options, Options2),
   75                     option(base_uri(Base), Options2, 'http://example.org/'),
   76                     set_stream(DataStream, file_name(Base)),
   77                     (   file_base_name(Base, FileName),
   78                         non_rdf_file(FileName)
   79                     ->  true
   80                     ;   rdf_guess_data_format(DataStream, Format)
   81                     ->  rdf_load(stream(DataStream), [format(Format)|Options2])
   82                     ;   true
   83                     )
   84                   ),
   85                   close(DataStream))),
   86        archive_close(Archive)).
   87
   88member_base_uri([_], Options, Options) :- !.
   89member_base_uri(MetaData, Options0, Options) :-
   90    append(MetaPath, [_], MetaData),
   91    maplist(get_dict(name), MetaPath, MetaSegments),
   92    select_option(base_uri(Base0), Options0, Options1, 'http://archive.org'),
   93    atomic_list_concat([Base0|MetaSegments], /, Base),
   94    Options = [base_uri(Base)|Options1].
   95:- else.   96rdf_guess_format_and_load(Stream, Options) :-
   97    rdf_guess_data_format(Stream, Format),
   98    rdf_load(stream(Stream), [format(Format)|Options]).
   99:- endif.  100
  101non_rdf_file(File) :-
  102    file_name_extension(Base, Ext, File),
  103    (   non_rdf_ext(Ext)
  104    ->  true
  105    ;   downcase_atom(Base, Lower),
  106        non_rdf_base(Lower)
  107    ).
  108
  109non_rdf_ext(pdf).
  110non_rdf_ext(txt).
  111non_rdf_ext(md).
  112non_rdf_ext(doc).
  113
  114non_rdf_base(readme).
  115non_rdf_base(todo).
  116
  117%!  rdf_guess_data_format(+Stream, ?Format)
  118%
  119%   Guess the format  of  an  RDF   file  from  the  actual content.
  120%   Currently, this seeks for a valid  XML document upto the rdf:RDF
  121%   element before concluding that the file is RDF/XML. Otherwise it
  122%   assumes that the document is Turtle.
  123%
  124%   @tbd    Recognise Turtle variations from content
  125
  126rdf_guess_data_format(_, Format) :-
  127    nonvar(Format),
  128    !.
  129rdf_guess_data_format(Stream, xml) :-
  130    xml_doctype(Stream, _),
  131    !.
  132rdf_guess_data_format(Stream, Format) :-
  133    stream_property(Stream, file_name(File)),
  134    file_name_extension(_, Ext, File),
  135    rdf_db:rdf_file_type(Ext, Format),
  136    !.
  137rdf_guess_data_format(_, turtle).
  138
  139
  140%!  xml_doctype(+Stream, -DocType) is semidet.
  141%
  142%   Parse a stream and get the name   of the first XML element *and*
  143%   demand that this element defines XML   namespaces.  Fails if the
  144%   document is illegal XML before the first element.
  145%
  146%   Note that it is not  possible   to  define valid RDF/XML without
  147%   namespaces, while it is not possible  to define a valid absolute
  148%   Turtle URI (using <URI>) with a valid xmlns declaration.
  149%
  150%   @arg Stream denotes the input. If peek_string/3 is provided
  151%   (SWI-Prolog version 7), it is not necessary that the stream can
  152%   be repositioned. Older versions require a repositionable stream.
  153
  154:- if(current_predicate(peek_string/3)).  155xml_doctype(Stream, DocType) :-
  156    peek_string(Stream, 4096, Start),
  157    setup_call_cleanup(
  158        open_string_stream(Start, In),
  159        xml_doctype_2(In, DocType),
  160        close(In)).
  161:- else.  162xml_doctype(Stream, DocType) :-
  163    xml_doctype_2(Stream, DocType).
  164:- endif.  165
  166xml_doctype_2(Stream, DocType) :-
  167    catch(setup_call_cleanup(make_parser(Stream, Parser, State),
  168                             sgml_parse(Parser,
  169                                        [ source(Stream),
  170                                          max_errors(1),
  171                                          syntax_errors(quiet),
  172                                          call(begin, on_begin),
  173                                          call(cdata, on_cdata)
  174                                        ]),
  175                             cleanup_parser(Stream, Parser, State)),
  176          E, true),
  177    nonvar(E),
  178    E = tag(DocType).
  179
  180make_parser(Stream, Parser, state(Pos)) :-
  181    stream_property(Stream, position(Pos)),
  182    new_sgml_parser(Parser, []),
  183    set_sgml_parser(Parser, dialect(xmlns)).
  184
  185cleanup_parser(Stream, Parser, state(Pos)) :-
  186    free_sgml_parser(Parser),
  187    set_stream_position(Stream, Pos).
  188
  189on_begin(Tag, Attributes, _Parser) :-
  190    memberchk(xmlns:_=_, Attributes),
  191    throw(tag(Tag)).
  192
  193on_cdata(_CDATA, _Parser) :-
  194    throw(error(cdata)).
  195
  196
  197open_string_stream(String, Stream) :-
  198    new_memory_file(MF),
  199    setup_call_cleanup(
  200        open_memory_file(MF, write, Out),
  201        format(Out, '~s', [String]),
  202        close(Out)),
  203    open_memory_file(MF, read, Stream,
  204                     [ free_on_close(true)
  205                     ])