View source with formatted comments or as raw
    1/*  Part of SWI-Prolog
    2
    3    Author:        Jan Wielemaker
    4    E-mail:        J.Wielemaker@vu.nl
    5    WWW:           http://www.swi-prolog.org
    6    Copyright (c)  2006-2015, University of Amsterdam
    7                              VU University Amsterdam
    8    All rights reserved.
    9
   10    Redistribution and use in source and binary forms, with or without
   11    modification, are permitted provided that the following conditions
   12    are met:
   13
   14    1. Redistributions of source code must retain the above copyright
   15       notice, this list of conditions and the following disclaimer.
   16
   17    2. Redistributions in binary form must reproduce the above copyright
   18       notice, this list of conditions and the following disclaimer in
   19       the documentation and/or other materials provided with the
   20       distribution.
   21
   22    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   23    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   24    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   25    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
   26    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   27    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   28    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   29    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   30    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   31    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
   32    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   33    POSSIBILITY OF SUCH DAMAGE.
   34*/
   35
   36:- module(rdf_http_plugin, []).   37:- use_module(library(semweb/rdf_db), []). % we define hooks for this
   38
   39:- autoload(library(date),[parse_time/2]).   40:- autoload(library(error),[domain_error/2]).   41:- autoload(library(lists),[append/2]).   42:- autoload(library(option),[option/3]).   43:- autoload(library(http/http_header),[http_timestamp/2]).   44:- autoload(library(http/http_open),[http_open/3]).   45
   46/** <module> RDF HTTP Plugin
   47
   48This module allows loading data into   the semantic web library directly
   49from an HTTP server. The following example  loads the RDF core data into
   50the RDF database.
   51
   52    ==
   53    :- use_module(library(semweb/rdf_db)).
   54    :- use_module(library(semweb/rdf_http_plugin)).
   55
   56        ...,
   57        rdf_load('http://www.w3.org/1999/02/22-rdf-syntax-ns')
   58    ==
   59*/
   60
   61:- multifile
   62    rdf_db:rdf_open_hook/8,
   63    rdf_db:url_protocol/1,
   64    rdf_db:rdf_storage_encoding/2,
   65    rdf_db:rdf_file_type/2,
   66    rdf_content_type/3.   67
   68rdf_db:url_protocol(http).
   69rdf_db:url_protocol(https).
   70
   71
   72% define `rdf_format` as a type.
   73:- multifile error:has_type/2.   74error:has_type(rdf_format, Term):-
   75    error:has_type(oneof([nquads,ntriples,rdfa,trig,turtle,xml]), Term).
   76
   77%!  rdf_extra_headers(-RequestHeaders:list(compound), +Options:list) is det.
   78%
   79%   Send extra headers with the request. Note that, although we also
   80%   process RDF embedded in HTML, we do  not explicitely ask for it.
   81%   Doing so causes some   (e.g., http://w3.org/2004/02/skos/core to
   82%   reply with the HTML description rather than the RDF).
   83%
   84%   When given, option format(+atom) is used in order to prioritize
   85%   the corresponding RDF content types.
   86
   87rdf_extra_headers([ cert_verify_hook(ssl_verify),
   88                    request_header('Accept'=AcceptValue)
   89                  ], Options) :-
   90    option(format(Format), Options, _VAR),
   91    rdf_accept_header_value(Format, AcceptValue).
   92
   93
   94%!  rdf_db:rdf_open_hook(+Scheme, +URL, +HaveModified,
   95%!                       -Stream, -Cleanup, -Modified, -Format,
   96%!                       +Options) is semidet.
   97%
   98%   Load hook implementation for HTTP(S) URLs.
   99%
  100%   @arg HaveModified is bound to a timestamp (number) if we already
  101%        have a copy and that copy was modified at HaveModified.
  102%   @arg Modified is bound to =unknown=, =not_modified= or a
  103%        timestamp.
  104
  105rdf_db:rdf_open_hook(https, SourceURL, HaveModified, Stream, Cleanup,
  106                     Modified, Format, Options) :-
  107    rdf_db:rdf_open_hook(http, SourceURL, HaveModified, Stream, Cleanup,
  108                         Modified, Format, Options).
  109rdf_db:rdf_open_hook(http, SourceURL, HaveModified, Stream, Cleanup,
  110                     Modified, Format, Options) :-
  111    modified_since_header(HaveModified, Header),
  112    TypeHdr = [ header(content_type, ContentType),
  113                header(last_modified, ModifiedText)
  114              ],
  115    rdf_extra_headers(Extra, Options),
  116    append([Extra, TypeHdr, Header, Options], OpenOptions),
  117    catch(http_open(SourceURL, Stream0,
  118                    [ status_code(Code)
  119                    | OpenOptions
  120                    ]), E, true),
  121    (   Code == 200
  122    ->  (   open_envelope(ContentType, SourceURL,
  123                          Stream0, Stream, Format)
  124        ->  Cleanup = close(Stream),
  125            (   nonvar(ModifiedText),
  126                parse_time(ModifiedText, ModifiedStamp)
  127            ->  Modified = last_modified(ModifiedStamp)
  128            ;   Modified = unknown
  129            )
  130        ;   close(Stream0),
  131            domain_error(content_type, ContentType)
  132        )
  133    ;   Code == 304
  134    ->  Modified = not_modified,
  135        Cleanup = true
  136    ;   var(E)
  137    ->  throw(error(existence_error(url, SourceURL),
  138                    context(_, status(Code,_))))
  139    ;   throw(E)
  140    ).
  141
  142:- public ssl_verify/5.  143
  144%!  ssl_verify(+SSL, +ProblemCert, +AllCerts, +FirstCert, +Error)
  145%
  146%   Currently we accept  all  certificates.
  147
  148ssl_verify(_SSL,
  149           _ProblemCertificate, _AllCertificates, _FirstCertificate,
  150           _Error).
  151
  152%!  modified_since_header(+LastModified, -ExtraHeaders) is det.
  153%
  154%   Add an =|If-modified-since|= if we have a version with the given
  155%   time-stamp.
  156
  157modified_since_header(HaveModified, []) :-
  158    var(HaveModified),
  159    !.
  160modified_since_header(HaveModified,
  161                      [ request_header('If-modified-since' =
  162                                       Modified)
  163                      ]) :-
  164    http_timestamp(HaveModified, Modified).
  165
  166%!  open_envelope(+ContentType, +SourceURL, +Stream0, -Stream,
  167%!                ?Format) is semidet.
  168%
  169%   Open possible envelope formats.
  170
  171open_envelope(ContentType, SourceURL, Stream0, Stream, Format) :-
  172    (   ContentType == 'application/x-gzip'
  173    ;   ContentType == 'application/octet-stream',
  174        file_name_extension(_, gz, SourceURL)
  175    ),
  176    !,
  177    rdf_db:rdf_storage_encoding(_, gzip),
  178    !,
  179    (   var(Format)
  180    ->  file_name_extension(BaseURL, _GzExt, SourceURL),
  181        file_name_extension(_, Ext, BaseURL),
  182        rdf_db:rdf_file_type(Ext, Format)
  183    ;   true
  184    ),
  185    stream_pair(Stream0, Read, _),
  186    rdf_zlib_plugin:zopen(Read, Stream, []).
  187open_envelope(_, _, Stream, Stream, Format) :-
  188    nonvar(Format),
  189    !.
  190open_envelope(ContentType, SourceURL, Stream, Stream, Format) :-
  191    major_content_type(ContentType, Major),
  192    (   rdf_content_type(Major, _, Format)
  193    ->  true
  194    ;   Major == 'text/plain'       % server is not properly configured
  195    ->  file_name_extension(_, Ext, SourceURL),
  196        rdf_db:rdf_file_type(Ext, Format)
  197    ).
  198
  199major_content_type(ContentType, Major) :-
  200    sub_atom(ContentType, Pre, _, _, (;)),
  201    !,
  202    sub_atom(ContentType, 0, Pre, _, Major).
  203major_content_type(Major, Major).
  204
  205
  206%% rdf_accept_header_value(?Format:rdf_format, -AcceptValue:atom) is det.
  207
  208rdf_accept_header_value(Format, AcceptValue) :-
  209    findall(AcceptValue, accept_value(Format, AcceptValue), AcceptValues),
  210    atomic_list_concat(['*/*;q=0.001'|AcceptValues], ',', AcceptValue).
  211
  212accept_value(Format, AcceptValue) :-
  213    rdf_content_type(MediaType, QValue0, Format0),
  214    (   Format == Format0
  215    ->  QValue = 1.0
  216    ;   QValue = QValue0
  217    ),
  218    format(atom(AcceptValue), '~a;q=~3f', [MediaType,QValue]).
  219
  220
  221%!  rdf_content_type(?MediaType:atom, ?QualityValue:between(0.0,1.0),
  222%!                   ?Format:rdf_format) is nondet.
  223%
  224%   Quality values are intended to be   used  in accordance with RFC
  225%   2616. Quality values  are  determined   based  on  the following
  226%   criteria:
  227%
  228%       | **Label** | **Criterion**             | **Value** |
  229%       | A         | Supported RDF parser      | 0.43      |
  230%       | B         | RDF-specific content type | 0.33      |
  231%       | C         | Official content type     | 0.23      |
  232%
  233%   For example, `text/turtle` has quality value 0.99 because it is
  234%   an official content type that is RDF-specific and that has a parser
  235%   in Semweb.
  236%
  237%   This intentionally allows the user to add another content type with
  238%   a higher Q-value (i.e., >0.99).
  239%
  240%   Deduce the RDF encoding from the   mime-type.  This predicate is
  241%   defined as multifile such that the user can associate additional
  242%   content types to RDF formats.
  243%
  244%   @bug The turtle parser only parses a subset of n3.
  245%        (The N3 format is treated as if it were Turtle.)
  246%   @see Discussion http://richard.cyganiak.de/blog/2008/03/what-is-your-rdf-browsers-accept-header/
  247%   @see N-Quadruples http://www.w3.org/ns/formats/N-Quads
  248%   @see N-Triples http://www.w3.org/ns/formats/N-Triples
  249%   @see N3 http://www.w3.org/ns/formats/N3
  250%   @see RDFa http://www.w3.org/ns/formats/RDFa
  251%   @see TriG http://www.w3.org/ns/formats/TriG
  252%   @see Turtle http://www.w3.org/ns/formats/Turtle
  253%   @see XML/RDF http://www.w3.org/ns/formats/RDF_XML
  254
  255rdf_content_type('application/n-quads',    0.99, nquads  ). %ABC
  256rdf_content_type('application/n-triples',  0.99, ntriples). %ABC
  257rdf_content_type('application/rdf',        0.76, xml     ). %AB
  258rdf_content_type('application/rdf+turtle', 0.76, turtle  ). %AB
  259rdf_content_type('application/rdf+xml',    0.76, xml     ). %AB
  260rdf_content_type('application/rss+xml',    0.66, xml     ). %AC
  261rdf_content_type('application/trig',       0.99, trig    ). %ABC
  262rdf_content_type('application/turtle',     0.76, turtle  ). %AB
  263rdf_content_type('application/x-trig',     0.76, trig    ). %AB
  264rdf_content_type('application/x-turtle',   0.76, turtle  ). %AB
  265rdf_content_type('application/xhtml+xml',  0.66, rdfa    ). %AC
  266rdf_content_type('application/xml',        0.66, xml     ). %AC
  267rdf_content_type('text/html',              0.66, rdfa    ). %AC
  268rdf_content_type('text/n3',                0.56, turtle  ). %BC (N3)
  269rdf_content_type('text/rdf',               0.76, xml     ). %AB
  270rdf_content_type('text/rdf+n3',            0.33, turtle  ). %B (N3)
  271rdf_content_type('text/rdf+xml',           0.76, xml     ). %AB
  272rdf_content_type('text/turtle',            0.99, turtle  ). %ABC
  273rdf_content_type('text/xml',               0.66, xml     ). %AC
  274rdf_content_type('application/x-gzip',     0.23, gzip    ). %C