View source with raw comments or as raw
    1/*  Part of SWI-Prolog
    2
    3    Author:        Jan Wielemaker
    4    E-mail:        J.Wielemaker@vu.nl
    5    WWW:           http://www.swi-prolog.org
    6    Copyright (c)  2009-2023, VU University Amsterdam
    7			      SWI-Prolog Solutions b.v.
    8    All rights reserved.
    9
   10    Redistribution and use in source and binary forms, with or without
   11    modification, are permitted provided that the following conditions
   12    are met:
   13
   14    1. Redistributions of source code must retain the above copyright
   15       notice, this list of conditions and the following disclaimer.
   16
   17    2. Redistributions in binary form must reproduce the above copyright
   18       notice, this list of conditions and the following disclaimer in
   19       the documentation and/or other materials provided with the
   20       distribution.
   21
   22    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   23    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   24    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   25    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
   26    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   27    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   28    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   29    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   30    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   31    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
   32    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   33    POSSIBILITY OF SUCH DAMAGE.
   34*/
   35
   36:- module(uri,
   37          [ uri_components/2,           % ?URI, ?Components
   38            uri_data/3,                 % ?Field, +Components, ?Data
   39            uri_data/4,                 % +Field, +Components, -Data, -New
   40	    uri_edit/3,			% +Actions,+URI0,-URI
   41
   42            uri_normalized/2,           % +URI, -NormalizedURI
   43            iri_normalized/2,           % +IRI, -NormalizedIRI
   44            uri_normalized_iri/2,       % +URI, -NormalizedIRI
   45            uri_normalized/3,           % +URI, +Base, -NormalizedURI
   46            iri_normalized/3,           % +IRI, +Base, -NormalizedIRI
   47            uri_normalized_iri/3,       % +URI, +Base, -NormalizedIRI
   48            uri_resolve/3,              % +URI, +Base, -AbsURI
   49            uri_is_global/1,            % +URI
   50            uri_query_components/2,     % ?QueryString, ?NameValueList
   51            uri_authority_components/2, % ?Authority, ?Components
   52            uri_authority_data/3,       % ?Field, ?Components, ?Data
   53					% Encoding
   54            uri_encoded/3,              % +Component, ?Value, ?Encoded
   55            uri_file_name/2,            % ?URI, ?Path
   56            uri_iri/2                   % ?URI, ?IRI
   57	  ]).   58:- autoload(library(error), [domain_error/2]).   59:- use_foreign_library(foreign(uri)).

Process URIs

This library provides high-performance C-based primitives for manipulating URIs. We opted for a C-based implementation because of its much better performance on raw character manipulation. Notably, URI handling primitives are used in time-critical parts of RDF processing. This implementation is based on RFC-3986:

http://labs.apache.org/webarch/uri/rfc/rfc3986.html

The URI processing in this library is rather liberal. That is, we break URIs according to the rules, but we do not validate that the components are valid. Also, percent-decoding for IRIs is liberal. It first tries UTF-8; then ISO-Latin-1 and finally accepts %-characters verbatim.

Earlier experience has shown that strict enforcement of the URI syntax results in many errors that are accepted by many other web-document processing tools. */

 uri_components(+URI, -Components) is det
uri_components(-URI, +Components) is det
Break a URI into its 5 basic components according to the RFC-3986 regular expression:
^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
 12            3  4          5       6  7        8 9
Arguments:
Components- is a term uri_components(Scheme, Authority, Path, Search, Fragment). If a URI is parsed, i.e., using mode (+,-), components that are not found are left uninstantiated (variable). See uri_data/3 for accessing this structure.
 uri_data(?Field, +Components, ?Data) is semidet
Provide access to the uri_components structure. Defined field-names are: scheme, authority, path, search and fragment.
  %!  uri_data(?Field, +Components, ?Data) is semidet.
  %
  %   Data is the value stored under Field in the uri_components/5
  %   term Components. Field is one of `scheme`, `authority`, `path`,
  %   `search` or `fragment`; leaving Field unbound enumerates the
  %   fields in that order. An unbound Components is unified with a
  %   fresh uri_components/5 term, which allows building such a term
  %   field by field.

  uri_data(Field, Components, Data) :-
      uri_component_position(Field, Position),
      Components = uri_components(_, _, _, _, _),
      arg(Position, Components, Data).

  %   uri_component_position(?Field, ?Position)
  %
  %   Argument position of each named field in uri_components/5.

  uri_component_position(scheme,    1).
  uri_component_position(authority, 2).
  uri_component_position(path,      3).
  uri_component_position(search,    4).
  uri_component_position(fragment,  5).
 uri_data(+Field, +Components, +Data, -NewComponents) is semidet
NewComponents is the same as Components with Field set to Data.
  %!  uri_data(+Field, +Components, +Data, -NewComponents) is semidet.
  %
  %   NewComponents is a copy of the uri_components/5 term Components
  %   in which the argument named by Field has been replaced by Data.
  %   All other arguments are shared with Components. Fails if Field
  %   is not one of `scheme`, `authority`, `path`, `search` or
  %   `fragment`.

  uri_data(scheme,
           uri_components(_,      Auth, Path, Search, Frag), Scheme,
           uri_components(Scheme, Auth, Path, Search, Frag)).
  uri_data(authority,
           uri_components(Scheme, _,    Path, Search, Frag), Auth,
           uri_components(Scheme, Auth, Path, Search, Frag)).
  uri_data(path,
           uri_components(Scheme, Auth, _,    Search, Frag), Path,
           uri_components(Scheme, Auth, Path, Search, Frag)).
  uri_data(search,
           uri_components(Scheme, Auth, Path, _,      Frag), Search,
           uri_components(Scheme, Auth, Path, Search, Frag)).
  uri_data(fragment,
           uri_components(Scheme, Auth, Path, Search, _),    Frag,
           uri_components(Scheme, Auth, Path, Search, Frag)).
 uri_normalized(+URI, -NormalizedURI) is det
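NormalizedURI is the normalized form of URI. Normalization is syntactic and involves the following steps (RFC 3986, section 6.2.2):
- 6.2.2.1. Case Normalization
- 6.2.2.2. Percent-Encoding Normalization
- 6.2.2.3. Path Segment Normalization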
 iri_normalized(+IRI, -NormalizedIRI) is det
NormalizedIRI is the normalized form of IRI. Normalization is syntactic and involves the following steps (RFC 3986, section 6.2.2):
- 6.2.2.1. Case Normalization
- 6.2.2.3. Path Segment Normalization
See also
- This is similar to uri_normalized/2, but does not do normalization of %-escapes.
 uri_normalized_iri(+URI, -NormalizedIRI) is det
As uri_normalized/2, but percent-encoding is translated into IRI Unicode characters. The translation is liberal: valid UTF-8 sequences of %-encoded bytes are mapped to the Unicode character. Other %XX-sequences are mapped to the corresponding ISO-Latin-1 character and sole % characters are left untouched.
See also
- uri_iri/2.
 uri_is_global(+URI) is semidet
True if URI has a scheme. The semantics is the same as the code below, but the implementation is more efficient as it does not need to parse the other components, nor needs to bind the scheme. The condition to demand a scheme of more than one character is added to avoid confusion with DOS path names.
uri_is_global(URI) :-
        uri_components(URI, Components),
        uri_data(scheme, Components, Scheme),
        nonvar(Scheme),
        atom_length(Scheme, Len),
        Len > 1.
 uri_resolve(+URI, +Base, -GlobalURI) is det
Resolve a possibly local URI relative to Base. This implements http://labs.apache.org/webarch/uri/rfc/rfc3986.html#relative-transform
 uri_normalized(+URI, +Base, -NormalizedGlobalURI) is det
NormalizedGlobalURI is the normalized global version of URI. Behaves as if defined by:
uri_normalized(URI, Base, NormalizedGlobalURI) :-
        uri_resolve(URI, Base, GlobalURI),
        uri_normalized(GlobalURI, NormalizedGlobalURI).
 iri_normalized(+IRI, +Base, -NormalizedGlobalIRI) is det
NormalizedGlobalIRI is the normalized global version of IRI. This is similar to uri_normalized/3, but does not do %-escape normalization.
 uri_normalized_iri(+URI, +Base, -NormalizedGlobalIRI) is det
NormalizedGlobalIRI is the normalized global IRI of URI. Behaves as if defined by:
uri_normalized_iri(URI, Base, NormalizedGlobalIRI) :-
        uri_resolve(URI, Base, GlobalURI),
        uri_normalized_iri(GlobalURI, NormalizedGlobalIRI).
 uri_query_components(+String, -Query) is det
uri_query_components(-String, +Query) is det
Perform encoding and decoding of a URI query string. Query is a list of fully decoded (Unicode) Name=Value pairs. In mode (-,+), query elements of the forms Name(Value) and Name-Value are also accepted to enhance interoperability with the option and pairs libraries. E.g.
?- uri_query_components(QS, [a=b, c('d+w'), n-'VU Amsterdam']).
QS = 'a=b&c=d%2Bw&n=VU%20Amsterdam'.

?- uri_query_components('a=b&c=d%2Bw&n=VU%20Amsterdam', Q).
Q = [a=b, c='d+w', n='VU Amsterdam'].
 uri_authority_components(+Authority, -Components) is det
uri_authority_components(-Authority, +Components) is det
Break down the authority component of a URI. The fields of the structure Components can be accessed using uri_authority_data/3. This predicate deals with IPv6 addresses written as [ip], returning the ip as host, without the enclosing []. When constructing an authority string and the host contains :, the host is embraced in []. If [] is not used correctly, the behavior should be considered poorly defined. If there is no balancing `]` or the host part does not end with `]`, these characters are considered normal characters and part of the (invalid) host name.
 uri_authority_data(+Field, ?Components, ?Data) is semidet
Provide access to the uri_authority structure. Defined field-names are: user, password, host and port.
  %!  uri_authority_data(+Field, ?Components, ?Data) is semidet.
  %
  %   Data is the value stored under Field in the uri_authority/4
  %   term Components. Field is one of `user`, `password`, `host` or
  %   `port`. An unbound Components is unified with a fresh
  %   uri_authority/4 term.

  uri_authority_data(Field, Components, Data) :-
      uri_authority_position(Field, Position),
      Components = uri_authority(_, _, _, _),
      arg(Position, Components, Data).

  %   uri_authority_position(?Field, ?Position)
  %
  %   Argument position of each named field in uri_authority/4.

  uri_authority_position(user,     1).
  uri_authority_position(password, 2).
  uri_authority_position(host,     3).
  uri_authority_position(port,     4).
 uri_encoded(+Component, +Value, -Encoded) is det
uri_encoded(+Component, -Value, +Encoded) is det
Encoded is the URI encoding for Value. When encoding (Value->Encoded), Component specifies the URI component where the value is used. It is one of query_value, fragment, path or segment. Besides alphanumerical characters, the following characters are passed verbatim (the set is split in logical groups according to RFC3986).
query_value, fragment
"-._~" | "!$'()*,;" | "@" | "/?"
path
"-._~" | "!$&'()*,;=" | "@" | "/"
segment
"-._~" | "!$&'()*,;=" | "@"
 uri_iri(+URI, -IRI) is det
uri_iri(-URI, +IRI) is det
Convert between a URI, encoded in US-ASCII and an IRI. An IRI is a fully expanded Unicode string. Unicode strings are first encoded into UTF-8, after which %-encoding takes place.
Errors
- syntax_error(Culprit) in mode (+,-) if URI is not a legally percent-encoded UTF-8 string.
 uri_file_name(+URI, -FileName) is semidet
uri_file_name(-URI, +FileName) is det
Convert between a URI and a local file_name. This protocol is covered by RFC 1738. Please note that file-URIs use absolute paths. The mode (-, +) translates a possible relative path into an absolute one.
  %!  uri_file_name(+URI, -FileName) is semidet.
  %!  uri_file_name(-URI, +FileName) is det.
  %
  %   Convert between a `file:` URI and a local file name.
  %
  %   With URI instantiated, succeed only if its scheme is `file` and
  %   its authority is absent, empty or `localhost`; FileName is then
  %   the percent-decoded path, after delete_leading_slash/2 has had a
  %   chance to strip the slash in front of a Windows drive letter.
  %
  %   With only FileName instantiated, it is made absolute, forced to
  %   start with a `/`, percent-encoded as a path and combined with
  %   scheme `file` and an empty authority.
  %
  %   Fails if neither argument is instantiated.

  uri_file_name(URI, FileName) :-
      (   nonvar(URI)
      ->  uri_components(URI, Parts),
          uri_data(scheme, Parts, Scheme),
          Scheme == file,
          uri_data(authority, Parts, Authority),
          (   Authority = ''        % also accepts a missing authority
          ->  true
          ;   Authority = localhost
          ),
          uri_data(path, Parts, EncodedPath),
          uri_encoded(path, DecodedPath, EncodedPath),
          delete_leading_slash(DecodedPath, FileName)
      ;   nonvar(FileName)
      ->  absolute_file_name(FileName, AbsPath),
          ensure_leading_slash(AbsPath, SlashPath),
          uri_encoded(path, SlashPath, EncodedPath),
          uri_data(scheme, Parts, file),
          uri_data(authority, Parts, ''),
          uri_data(path, Parts, EncodedPath),
          uri_components(URI, Parts)
      ).
 ensure_leading_slash(+WinPath, -Path)
 delete_leading_slash(+Path, -WinPath)
Deal with the fact that absolute paths in Windows start with a drive letter rather than a /. For URIs we need a path that starts with a /.
  %   ensure_leading_slash(+Path, -SlashPath)
  %
  %   SlashPath is Path if Path already starts with a `/`, and Path
  %   prefixed with `/` otherwise. Used to turn a file name that does
  %   not start with a slash (e.g. one starting with a drive letter)
  %   into a path that can be used in a URI.

  ensure_leading_slash(Path, SlashPath) :-
      sub_atom(Path, 0, _, _, /),
      !,
      SlashPath = Path.
  ensure_leading_slash(Path, SlashPath) :-
      atom_concat(/, Path, SlashPath).
  319
  %   delete_leading_slash(+Path, -WinPath)
  %
  %   Inverse of adding a leading slash to a file name for use in a
  %   URI. On Windows only, if removing the leading `/` from Path
  %   yields an absolute file name, WinPath is Path without that
  %   slash. In all other cases WinPath is Path itself.

  :- if(current_prolog_flag(windows, true)).
  delete_leading_slash(SlashPath, DrivePath) :-
      atom_concat(/, DrivePath, SlashPath),
      is_absolute_file_name(DrivePath),
      !.
  :- endif.
  delete_leading_slash(Path, Same) :-
      Same = Path.
  327
  328
  329		 /*******************************
  330		 *          MODIFYING           *
  331		 *******************************/
 uri_edit(+Actions, +URI0, -URI) is det
Modify a URI according to Actions. Actions is either a single action or a (nested) list of actions. Defined primitive actions are:
scheme(+Scheme)
Set the Scheme of the URI (typically http, https, etc.)
user(+User)
Add/set the user of the authority component.
password(+Password)
Add/set the password of the authority component.
host(+Host)
Add/set the host (or ip address) of the authority component.
port(+Port)
Add/set the port of the authority component.
path(+Path)
Set/extend the path component. If Path is not absolute it is taken relative to the path of URI0.
search(+KeyValues)
Extend the Key=Value pairs of the current search (query) component. New values replace existing values. If KeyValues is written as =(KeyValues) the current search component is ignored. KeyValues is a list, whose elements are one of Key=Value, Key-Value or `Key(Value)`.
fragment(+Fragment)
Set the Fragment of the uri.

Components can be removed by using a variable as value, except for path, which can be reset using path(/), and query, which can be dropped using query(=([])).

Arguments:
URI0- is either a valid uri or a variable to start fresh.
  %!  uri_edit(+Actions, +URI0, -URI) is det.
  %
  %   URI is URI0 after applying Actions. The starting URI (or `/`
  %   when URI0 is unbound) is split into its components, the
  %   components are rewritten by edit_components/3 and the result is
  %   assembled into URI again.

  uri_edit(Actions, URI0, URI) :-
      (   nonvar(URI0)
      ->  Start = URI0
      ;   Start = '/'               % no input URI: start fresh
      ),
      uri_components(Start, Parts0),
      edit_components(Actions, Parts0, Parts),
      uri_components(URI, Parts).
  375
  %   edit_components(+Actions, +Components0, -Components)
  %
  %   Apply Actions to the uri_components/5 term Components0. Actions
  %   is a single action or a (nested) list of actions that are
  %   applied from left to right. The rules are single sided
  %   unification (=>) rules, so the first rule whose head (and guard)
  %   matches is selected and committed to; their order matters:
  %   search(=(Pairs)) must be tried before search(Pairs), and the
  %   final rule reports anything that is not a known action.
  %
  %   @error domain_error(uri_edit, Action) if Action is not known.

  edit_components([], In, Out) =>
      Out = In.
  edit_components([Action|Actions], In, Out) =>
      edit_components(Action, In, Mid),
      edit_components(Actions, Mid, Out).
  edit_components(scheme(Scheme), In, Out) =>
      uri_data(scheme, In, Scheme, Out).
  edit_components(path(Path), In, Out) =>
      uri_data(path, In, OldPath),
      (   nonvar(OldPath),
          OldPath \== ''
      ->  BasePath = OldPath
      ;   BasePath = '/'            % no usable path: resolve against root
      ),
      uri_normalized(Path, BasePath, NewPath),
      uri_data(path, In, NewPath, Out).
  edit_components(fragment(Fragment), In, Out) =>
      uri_data(fragment, In, Fragment, Out).
  edit_components(Action, In, Out),
    authority_field(Action) =>
      uri_data(authority, In, OldAuth),
      (   nonvar(OldAuth)
      ->  uri_authority_components(OldAuth, AuthParts0)
      ;   true                      % AuthParts0 is bound by edit_auth_components/3
      ),
      edit_auth_components(Action, AuthParts0, AuthParts),
      uri_authority_components(NewAuth, AuthParts),
      uri_data(authority, In, NewAuth, Out).
  edit_components(query(Pairs), In, Out) =>
      edit_components(search(Pairs), In, Out).
  edit_components(search(=(Pairs)), In, Out) =>
      uri_query_components(QueryString, Pairs),
      uri_data(search, In, QueryString, Out).
  edit_components(search(Pairs), In, Out) =>
      uri_data(search, In, OldString),
      (   nonvar(OldString)
      ->  uri_query_components(OldString, OldPairs)
      ;   OldPairs = []
      ),
      join_search(OldPairs, Pairs, AllPairs),
      uri_query_components(NewString, AllPairs),
      uri_data(search, In, NewString, Out).
  edit_components(Unknown, _, _) =>
      domain_error(uri_edit, Unknown).
  421
  %   authority_field(?Action)
  %
  %   True if Action is a uri_edit/3 action that modifies a field of
  %   the authority component of the URI.

  authority_field(user(_User)).
  authority_field(password(_Password)).
  authority_field(host(_Host)).
  authority_field(port(_Port)).
  426
  %   edit_auth_components(+Action, ?Authority0, -Authority)
  %
  %   Authority is the uri_authority/4 term Authority0 in which the
  %   field named by Action (one of user(U), password(P), host(H) or
  %   port(N)) has been replaced by the argument of Action. An unbound
  %   Authority0 is unified with a uri_authority/4 term whose
  %   arguments are shared with Authority, except for the replaced
  %   field.

  edit_auth_components(user(NewUser),
                       uri_authority(_,       Pw, Host, Port),
                       uri_authority(NewUser, Pw, Host, Port)).
  edit_auth_components(password(NewPw),
                       uri_authority(User, _,     Host, Port),
                       uri_authority(User, NewPw, Host, Port)).
  edit_auth_components(host(NewHost),
                       uri_authority(User, Pw, _,       Port),
                       uri_authority(User, Pw, NewHost, Port)).
  edit_auth_components(port(NewPort),
                       uri_authority(User, Pw, Host, _),
                       uri_authority(User, Pw, Host, NewPort)).
  439
  %   join_search(+Old, +New, -Joined)
  %
  %   Joined holds the elements of Old that are not overridden by
  %   New, in their original order, followed by New itself. An element
  %   Name=_ of Old is overridden if New contains an element for the
  %   same name written as Name=_, Name(_) or Name-_.

  join_search([], New, New).
  join_search([Pair|Rest], New, Joined) :-
      (   search_overridden(Pair, New)
      ->  Joined = Tail             % replaced by New: drop the old value
      ;   Joined = [Pair|Tail]
      ),
      join_search(Rest, New, Tail).

  %   search_overridden(+Pair, +New)
  %
  %   True if Pair is Name=_ and New has an element for Name in one of
  %   the three accepted notations.

  search_overridden(Name=_, New) :-
      (   memberchk(Name=_, New)
      ->  true
      ;   functor(Term, Name, 1),
          memberchk(Term, New)
      ->  true
      ;   memberchk(Name-_, New)
      ).
  453
  454
  455                 /*******************************
  456                 *            SANDBOX           *
  457                 *******************************/
  458
  %   sandbox:safe_primitive(?Goal)
  %
  %   Declare predicates of this library as safe for library(sandbox).
  %   Each clause names one predicate by its module-qualified most
  %   general goal. Every clause, including the last one, must end in
  %   a full stop: a missing terminator on the final clause is a
  %   syntax error at end of file, so that declaration is never
  %   loaded.

  :- multifile sandbox:safe_primitive/1.

  sandbox:safe_primitive(uri:uri_components(_,_)).
  sandbox:safe_primitive(uri:uri_normalized(_,_)).
  sandbox:safe_primitive(uri:iri_normalized(_,_)).
  sandbox:safe_primitive(uri:uri_normalized_iri(_,_)).
  sandbox:safe_primitive(uri:uri_normalized(_,_,_)).
  sandbox:safe_primitive(uri:iri_normalized(_,_,_)).
  sandbox:safe_primitive(uri:uri_normalized_iri(_,_,_)).
  sandbox:safe_primitive(uri:uri_resolve(_,_,_)).
  sandbox:safe_primitive(uri:uri_is_global(_)).
  sandbox:safe_primitive(uri:uri_query_components(_,_)).
  sandbox:safe_primitive(uri:uri_authority_components(_,_)).
  sandbox:safe_primitive(uri:uri_encoded(_,_,_)).
  sandbox:safe_primitive(uri:uri_iri(_,_)).