35
   36:- module(url_cache,
   37          [ url_cache/3,                   38            url_cache_file/4,              39            url_cache_delete/1,            40            url_cached/2,                  41            url_cached/3,                  42            url_cache_reset_server_status/0,
   43            url_cache_reset_server_status/1    44          ]).   45:- use_module(library(http/http_open)).   46:- if(exists_source(library(http/http_ssl_plugin))).   47:- use_module(library(http/http_ssl_plugin)).   48:- endif.   49:- use_module(library(http/mimetype)).   50:- use_module(library(url)).   51:- use_module(library(debug)).   52:- use_module(library(error)).   53:- use_module(library(settings)).   54:- use_module(library(base64)).   55:- use_module(library(utf8)).   56:- use_module(library(lists)).   57:- use_module(library(sha)).   58
   59:- setting(cache:url_cache_directory, atom, 'cache/url',
   60           'Directory to cache fetched remote URLs').
   85url_cache(URL, Path, MimeType) :-
   86    url_cache_dir(Dir),
   87    url_cache_file(URL, Dir, url, Path),
   88    atom_concat(Path, '.meta', TypeFile),
   89    (   exists_file(Path),
   90        exists_file(TypeFile),
   91        read_meta_file(TypeFile, mime_type(MimeType0))
   92    ->  MimeType = MimeType0
   93    ;   fetch_url(URL, Path, MimeType, Modified),
   94        get_time(NowF),
   95        Now is round(NowF),
   96        open(TypeFile, write, Out,
   97             [ encoding(utf8),
   98               lock(write)
   99             ]),
  100        format(Out,
  101               'mime_type(~q).~n\c
  102                    url(~q).~n\c
  103                    fetched(~q).~n',
  104               [MimeType, URL, Now]),
  105        (   nonvar(Modified)
  106        ->  format(Out, 'last_modified(~q).~n', [Modified])
  107        ;   true
  108        ),
  109        close(Out)
  110    ).
  111
  112read_meta_file(MimeFile, Term) :-
  113    setup_call_cleanup(open(MimeFile, read, In,
  114                            [ encoding(utf8),
  115                              lock(read)
  116                            ]),
  117                       ndet_read(In, Term),
  118                       close(In)).
  119
  120ndet_read(Stream, Term) :-
  121    repeat,
  122    read(Stream, Term0),
  123    (   Term0 == end_of_file
  124    ->  !, fail
  125    ;   Term = Term0
  126    ).
  136url_cache_delete(URL) :-
  137    url_cache_dir(Dir),
  138    url_cache_file(URL, Dir, url, Path),
  139    atom_concat(Path, '.meta', TypeFile),
  140    catch(delete_file(TypeFile), E0, true),
  141    catch(delete_file(Path), E1, true),
  142    error_ok(E0),
  143    error_ok(E1).
  144
  145error_ok(E) :-
  146    subsumes_term(error(existence_error(file, _), _), E),
  147    !.
  148error_ok(E) :-
  149    throw(E).
  155url_cache_dir(Dir) :-
  156    setting(cache:url_cache_directory, Dir),
  157    make_directory_path(Dir).
  163make_directory_path(Dir) :-
  164    make_directory_path_2(Dir),
  165    !.
  166make_directory_path(Dir) :-
  167    permission_error(create, directory, Dir).
  168
  169make_directory_path_2(Dir) :-
  170    exists_directory(Dir),
  171    !.
  172make_directory_path_2(Dir) :-
  173    Dir \== (/),
  174    !,
  175    file_directory_name(Dir, Parent),
  176    make_directory_path_2(Parent),
  177    make_directory(Dir).
  183fetch_url(URL, File, MimeType, Modified) :-
  184    parse_url_ex(URL, Parts),
  185    server(Parts, Server),
  186    (   allow(Server)
  187    ->  true
  188    ;   throw(error(existence_error(url, URL),
  189                    context(url_cache/3, 'Too many errors from server')))
  190    ),
  191    get_time(Now),
  192    (   catch(fetch_url_raw(URL, File,
  193                            MimeType, Modified), E, true)
  194    ->  (   var(E)
  195        ->  register_stats(Server, Now, true)
  196        ;   register_stats(Server, Now, error(E)),
  197            throw(E)
  198        )
  199    ;   register_stats(Server, Now, false)
  200    ).
  201
  202server(Parts, Server) :-
  203    memberchk(host(Host), Parts),
  204    !,
  205    (   memberchk(port(Port), Parts)
  206    ->  Server = Host:Port
  207    ;   Server = Host
  208    ).
  209server(_,_) :-
  210    assertion(false).
  211
  225
  226:- dynamic
  227    server_status/3.                  228
  229allow(Server) :-
  230    server_status(Server, Status),
  231    debug(url_cache, 'Status ~q: ~w', [Server, Status]),
  232    Status > 0.
  233
  234server_status(Server, Status) :-
  235    get_time(Now),
  236    with_mutex(url_cache_status,
  237               server_status(Server, S0, T0)),
  238    !,
  239    Status is min(100, S0 + round(Now-T0)//60).
  240server_status(_, 100).
  241
  242register_stats(Server, Start, Result) :-
  243    get_time(Now),
  244    Time is Now - Start,
  245    (   server_status(Server, S0, T0)
  246    ->  true
  247    ;   S0 = 100,
  248        T0 = Now
  249    ),
  250    Since is Start - T0,
  251    update_status(Result, Time, Since, S0, S1),
  252    with_mutex(url_cache_status,
  253               (   retractall(server_status(Server, _, _)),
  254                   assert(server_status(Server, S1, Start)))).
  255
  256update_status(true, Time, Since, S0, S) :-
  257    !,
  258    S is min(100, S0 + round(20-4*sqrt(Time)) + round(Since)//60).
  259update_status(_, Time, _Since, S0, S) :-
  260    !,
  261    S is max(-100, S0 - (10 + round(Time))).
  269url_cache_reset_server_status :-
  270    with_mutex(url_cache_status,
  271               retractall(server_status(_,_,_))).
  272url_cache_reset_server_status(Server) :-
  273    must_be(atom, Server),
  274    with_mutex(url_cache_status,
  275               retractall(server_status(Server,_,_))).
  286fetch_url_raw(URL, File, MimeType, Modified) :-
  287    debug(url_cache, 'Downloading ~w ...', [URL]),
  288    atom_concat(File, '.tmp', TmpFile),
  289    (   catch(fetch_to_file(URL, TmpFile, Code, Header), E, true)
  290    ->  true
  291    ;   E = predicate_failed(http_get/3)
  292    ),
  293    (   var(E)
  294    ->  true
  295    ;   (   debugging(url_cache)
  296        ->  print_message(error, E)
  297        ;   true
  298        ),
  299        catch(delete_file(TmpFile), _, true),
  300        (   debugging(url_cache)
  301        ->  message_to_string(E, Msg),
  302            debug(url_cache, 'Download failed: ~w', [Msg])
  303        ;   true
  304        ),
  305        throw(E)
  306    ),
  307    (   Code == 200
  308    ->  rename_file(TmpFile, File)
  309    ;   catch(delete_file(TmpFile), _, true),
  310        throw(error(existence_error(url, URL), _))
  311    ),
  312    (   memberchk(content_type(MimeType0), Header)
  313    ->  true
  314    ;   MimeType0 = 'text/plain'
  315    ),
  316    ignore(memberchk(last_modified(Modified), Header)),
  317    debug(url_cache, 'Downloaded ~w, mime-type: ~w',
  318          [URL, MimeType0]),
  319    MimeType = MimeType0.
  320
  321fetch_to_file(URL, File, Code,
  322              [ content_type(ContentType),
  323                last_modified(LastModified)
  324              ]) :-
  325    setup_call_cleanup(
  326        open(File, write, Out, [ type(binary) ]),
  327        setup_call_cleanup(
  328            http_open(URL, In,
  329                      [ header(content_type, ContentType),
  330                        header(last_modified, LastModified),
  331                        status_code(Code),
  332                        cert_verify_hook(ssl_verify)
  333                      ]),
  334            copy_stream_data(In, Out),
  335            close(In)),
  336        close(Out)).
  337
  338:- public ssl_verify/5.
  344ssl_verify(_SSL,
  345           _ProblemCertificate, _AllCertificates, _FirstCertificate,
  346           _Error).
  347
  348parse_url_ex(URL, Parts) :-
  349    is_list(URL),
  350    !,
  351    Parts = URL.
  352parse_url_ex(URL, Parts) :-
  353    parse_url(URL, Parts),
  354    !.
  355parse_url_ex(URL, _) :-
  356    domain_error(url, URL).
  364url_cache_file(URL, Dir, Ext, Path) :-
  365    url_to_file(URL, Ext, File),
  366    sub_atom(File, 0, 2, _, L1),
  367    ensure_dir(Dir, L1, Dir1),
  368    sub_atom(File, 2, 2, _, L2),
  369    ensure_dir(Dir1, L2, Dir2),
  370    sub_atom(File, 4, _, 0, LocalFile),
  371    atomic_list_concat([Dir2, /, LocalFile], Path).
  372
  373ensure_dir(D0, Sub, Dir) :-
  374    atomic_list_concat([D0, /, Sub], Dir),
  375    (   exists_directory(Dir)
  376    ->  true
  377    ;   make_directory(Dir)
  378    ).
  387url_to_file(URL, Ext, File) :-
  388    sha_hash(URL, Hash, []),
  389    phrase(hex_digits(Hash), Codes),
  390    string_to_list(String, Codes),
  391    file_name_extension(String, Ext, File).
  392
  393hex_digits([]) -->
  394    "".
  395hex_digits([H|T]) -->
  396    byte(H),
  397    hex_digits(T).
  398
  399byte(Byte) -->
  400    { High is (Byte>>4) /\ 0xf,
  401      Low is (Byte /\ 0xf),
  402      code_type(H, xdigit(High)),
  403      code_type(L, xdigit(Low))
  404    },
  405    [H,L].
  406
  407
  408                 
  428url_cached(URL, Property) :-
  429    url_cache_dir(Dir),
  430    url_cached(Dir, URL, Property).
  431
  432url_cached(Dir, URL, Property) :-
  433    nonvar(URL),
  434    !,
  435    url_cache_file(URL, Dir, url, Path),
  436    atom_concat(Path, '.meta', MetaFile),
  437    exists_file(MetaFile),
  438    cache_file_property(Property, MetaFile).
  439url_cached(Dir, URL, Property) :-
  440    nonvar(Property),
  441    Property = file(File),
  442    atom(File),
  443    atom_concat(Dir, Rest, File),
  444    \+ sub_atom(Rest, _, _, _, '../'),
  445    file_name_extension(Base, url, File),
  446    file_name_extension(Base, meta, MetaFile),
  447    exists_file(MetaFile),
  448    once(read_meta_file(MetaFile, url(URL))).
  449url_cached(Dir, URL, Property) :-
  450    atom_concat(Dir, '/??', TopPat),
  451    expand_file_name(TopPat, TopDirs),
  452    member(TopDir, TopDirs),
  453    atom_concat(TopDir, '/??', DirPat),
  454    expand_file_name(DirPat, FileDirs),
  455    member(FileDir, FileDirs),
  456    atom_concat(FileDir, '/*.meta', FilePat),
  457    expand_file_name(FilePat, MetaFiles),
  458    member(MetaFile, MetaFiles),
  459    once(read_meta_file(MetaFile, url(URL))),
  460    check_cache_file(MetaFile, URL),
  461    cache_file_property(Property, MetaFile).
  462
  463check_cache_file(MetaFile, URL) :-
  464    file_name_extension(File, meta, MetaFile),
  465    (   exists_file(File)
  466    ->  true
  467    ;   print_message(warning, url_cache(no_file(File, MetaFile, URL))),
  468        delete_file(MetaFile),
  469        fail
  470    ).
  471
  472cache_file_property(Property, MetaFile) :-
  473    var(Property),
  474    !,
  475    cache_file_property_ndet(Property, MetaFile).
  476cache_file_property(Property, MetaFile) :-
  477    cache_file_property_ndet(Property, MetaFile),
  478    !.
  479
  480
  481cache_file_property_ndet(file(File), MetaFile) :-
  482    file_name_extension(File, meta, MetaFile).
  483cache_file_property_ndet(P, MetaFile) :-
  484    read_meta_file(MetaFile, P),
  485    P \= url(_).
  486
  487                   490
  491:- multifile
  492    prolog:message//1.  493
  494prolog:message(url_cache(no_file(File, _MetaFile, URL))) -->
  495    [ 'URL Cache: file ~q does not exist (URL=~q)'-[File, URL] ]
 
Cache the content of external URLs in local files
This library provides a cache for data stored in extenal URLs. The content of each URL is kept in a file and described by a meta-file that remembers the mime-type, the original URL, when it was fetched and -if provided by the server- the last-modified stamp.