36
37:- module(sgml,
38 [ load_html/3, 39 load_xml/3, 40 load_sgml/3, 41
42 load_sgml_file/2, 43 load_xml_file/2, 44 load_html_file/2, 45
46 load_structure/3, 47
48 load_dtd/2, 49 load_dtd/3, 50 dtd/2, 51 dtd_property/2, 52
53 new_dtd/2, 54 free_dtd/1, 55 open_dtd/3, 56
57 new_sgml_parser/2, 58 free_sgml_parser/1, 59 set_sgml_parser/2, 60 get_sgml_parser/2, 61 sgml_parse/2, 62
63 sgml_register_catalog_file/2, 64
65 xml_quote_attribute/3, 66 xml_quote_cdata/3, 67 xml_quote_attribute/2, 68 xml_quote_cdata/2, 69 xml_name/1, 70 xml_name/2, 71
72 xsd_number_string/2, 73 xsd_time_string/3, 74
75 xml_basechar/1, 76 xml_ideographic/1, 77 xml_combining_char/1, 78 xml_digit/1, 79 xml_extender/1, 80
81 iri_xml_namespace/2, 82 iri_xml_namespace/3, 83 xml_is_dom/1 84 ]). 85:- autoload(library(error),[instantiation_error/1]). 86:- autoload(library(iostream),[open_any/5,close_any/1]). 87:- autoload(library(lists),[member/2,selectchk/3]). 88:- autoload(library(option),[select_option/3,merge_options/3]). 89
90:- meta_predicate
91 load_structure(+, -, :),
92 load_html(+, -, :),
93 load_xml(+, -, :),
94 load_sgml(+, -, :). 95
96:- predicate_options(load_structure/3, 3,
97 [ charpos(integer),
98 cdata(oneof([atom,string])),
99 defaults(boolean),
100 dialect(oneof([html,html4,html5,sgml,xhtml,xhtml5,xml,xmlns])),
101 doctype(atom),
102 dtd(any),
103 encoding(oneof(['iso-8859-1', 'utf-8', 'us-ascii'])),
104 entity(atom,atom),
105 keep_prefix(boolean),
106 file(atom),
107 line(integer),
108 offset(integer),
109 number(oneof([token,integer])),
110 qualify_attributes(boolean),
111 shorttag(boolean),
112 case_sensitive_attributes(boolean),
113 case_preserving_attributes(boolean),
114 system_entities(boolean),
115 max_memory(integer),
116 space(oneof([sgml,preserve,default,remove,strict])),
117 xmlns(atom),
118 xmlns(atom,atom),
119 pass_to(sgml_parse/2, 2)
120 ]). 121:- predicate_options(load_html/3, 3,
122 [ pass_to(load_structure/3, 3)
123 ]). 124:- predicate_options(load_xml/3, 3,
125 [ pass_to(load_structure/3, 3)
126 ]). 127:- predicate_options(load_sgml/3, 3,
128 [ pass_to(load_structure/3, 3)
129 ]). 130:- predicate_options(load_dtd/3, 3,
131 [ dialect(oneof([sgml,xml,xmlns])),
132 pass_to(open/4, 4)
133 ]). 134:- predicate_options(sgml_parse/2, 2,
135 [ call(oneof([begin,end,cdata,pi,decl,error,xmlns,urlns]),
136 callable),
137 cdata(oneof([atom,string])),
138 content_length(integer),
139 document(-any),
140 max_errors(integer),
141 parse(oneof([file,element,content,declaration,input])),
142 source(any),
143 syntax_errors(oneof([quiet,print,style])),
144 xml_no_ns(oneof([error,quiet]))
145 ]). 146:- predicate_options(new_sgml_parser/2, 2,
147 [ dtd(any)
148 ]). 149
150
177
178:- multifile user:file_search_path/2. 179:- dynamic user:file_search_path/2. 180
181user:file_search_path(dtd, '.').
182user:file_search_path(dtd, swi('library/DTD')).
183
184sgml_register_catalog_file(File, Location) :-
185 prolog_to_os_filename(File, OsFile),
186 '_sgml_register_catalog_file'(OsFile, Location).
187
188:- use_foreign_library(foreign(sgml2pl)). 189
190register_catalog(Base) :-
191 absolute_file_name(dtd(Base),
192 [ extensions([soc]),
193 access(read),
194 file_errors(fail)
195 ],
196 SocFile),
197 sgml_register_catalog_file(SocFile, end).
198
199:- initialization
200 ignore(register_catalog('HTML4')). 201
202
203 206
213
214:- thread_local
215 current_dtd/2. 216:- volatile
217 current_dtd/2. 218:- thread_local
219 registered_cleanup/0. 220:- volatile
221 registered_cleanup/0. 222
223:- multifile
224 dtd_alias/2. 225
226:- create_prolog_flag(html_dialect, html5, [type(atom)]). 227
228dtd_alias(html4, 'HTML4').
229dtd_alias(html5, 'HTML5').
230dtd_alias(html, DTD) :-
231 current_prolog_flag(html_dialect, Dialect),
232 dtd_alias(Dialect, DTD).
233
243
244dtd(Type, DTD) :-
245 current_dtd(Type, DTD),
246 !.
247dtd(Type, DTD) :-
248 new_dtd(Type, DTD),
249 ( dtd_alias(Type, Base)
250 -> true
251 ; Base = Type
252 ),
253 absolute_file_name(dtd(Base),
254 [ extensions([dtd]),
255 access(read)
256 ], DtdFile),
257 load_dtd(DTD, DtdFile),
258 register_cleanup,
259 asserta(current_dtd(Type, DTD)).
260
273
274load_dtd(DTD, DtdFile) :-
275 load_dtd(DTD, DtdFile, []).
276load_dtd(DTD, DtdFile, Options) :-
277 sgml_open_options(sgml:Options, OpenOptions, sgml:DTDOptions),
278 setup_call_cleanup(
279 open_dtd(DTD, DTDOptions, DtdOut),
280 setup_call_cleanup(
281 open(DtdFile, read, DtdIn, OpenOptions),
282 copy_stream_data(DtdIn, DtdOut),
283 close(DtdIn)),
284 close(DtdOut)).
285
290
291:- public
292 destroy_dtds/0. 293
294destroy_dtds :-
295 ( current_dtd(_Type, DTD),
296 free_dtd(DTD),
297 fail
298 ; true
299 ).
300
304
305register_cleanup :-
306 registered_cleanup,
307 !.
308register_cleanup :-
309 ( thread_self(main)
310 -> at_halt(destroy_dtds)
311 ; current_prolog_flag(threads, true)
312 -> prolog_listen(this_thread_exit, destroy_dtds)
313 ; true
314 ),
315 assert(registered_cleanup).
316
317
318 321
322prop(doctype(_), _).
323prop(elements(_), _).
324prop(entities(_), _).
325prop(notations(_), _).
326prop(entity(E, _), DTD) :-
327 ( nonvar(E)
328 -> true
329 ; '$dtd_property'(DTD, entities(EL)),
330 member(E, EL)
331 ).
332prop(element(E, _, _), DTD) :-
333 ( nonvar(E)
334 -> true
335 ; '$dtd_property'(DTD, elements(EL)),
336 member(E, EL)
337 ).
338prop(attributes(E, _), DTD) :-
339 ( nonvar(E)
340 -> true
341 ; '$dtd_property'(DTD, elements(EL)),
342 member(E, EL)
343 ).
344prop(attribute(E, A, _, _), DTD) :-
345 ( nonvar(E)
346 -> true
347 ; '$dtd_property'(DTD, elements(EL)),
348 member(E, EL)
349 ),
350 ( nonvar(A)
351 -> true
352 ; '$dtd_property'(DTD, attributes(E, AL)),
353 member(A, AL)
354 ).
355prop(notation(N, _), DTD) :-
356 ( nonvar(N)
357 -> true
358 ; '$dtd_property'(DTD, notations(NL)),
359 member(N, NL)
360 ).
361
362dtd_property(DTD, Prop) :-
363 prop(Prop, DTD),
364 '$dtd_property'(DTD, Prop).
365
366
367 370
392
393load_structure(Spec, DOM, Options) :-
394 sgml_open_options(Options, OpenOptions, SGMLOptions),
395 setup_call_cleanup(
396 open_any(Spec, read, In, Close, OpenOptions),
397 load_structure_from_stream(In, DOM, SGMLOptions),
398 close_any(Close)).
399
400sgml_open_options(Options, OpenOptions, SGMLOptions) :-
401 Options = M:Plain,
402 ( select_option(encoding(Encoding), Plain, NoEnc)
403 -> ( sgml_encoding(Encoding)
404 -> merge_options(NoEnc, [type(binary)], OpenOptions),
405 SGMLOptions = Options
406 ; OpenOptions = Plain,
407 SGMLOptions = M:NoEnc
408 )
409 ; merge_options(Plain, [type(binary)], OpenOptions),
410 SGMLOptions = Options
411 ).
412
413sgml_encoding(Enc) :-
414 downcase_atom(Enc, Enc1),
415 sgml_encoding_l(Enc1).
416
417sgml_encoding_l('iso-8859-1').
418sgml_encoding_l('us-ascii').
419sgml_encoding_l('utf-8').
420sgml_encoding_l('utf8').
421sgml_encoding_l('iso_latin_1').
422sgml_encoding_l('ascii').
423
424load_structure_from_stream(In, Term, M:Options) :-
425 ( select_option(dtd(DTD), Options, Options1)
426 -> ExplicitDTD = true
427 ; ExplicitDTD = false,
428 Options1 = Options
429 ),
430 move_front(Options1, dialect(_), Options2), 431 setup_call_cleanup(
432 new_sgml_parser(Parser,
433 [ dtd(DTD)
434 ]),
435 parse(Parser, M:Options2, TermRead, In),
436 free_sgml_parser(Parser)),
437 ( ExplicitDTD == true
438 -> ( DTD = dtd(_, DocType),
439 dtd_property(DTD, doctype(DocType))
440 -> true
441 ; true
442 )
443 ; free_dtd(DTD)
444 ),
445 Term = TermRead.
446
447move_front(Options0, Opt, Options) :-
448 selectchk(Opt, Options0, Options1),
449 !,
450 Options = [Opt|Options1].
451move_front(Options, _, Options).
452
453
454parse(Parser, M:Options, Document, In) :-
455 set_parser_options(Options, Parser, In, Options1),
456 parser_meta_options(Options1, M, Options2),
457 set_input_location(Parser, In),
458 sgml_parse(Parser,
459 [ document(Document),
460 source(In)
461 | Options2
462 ]).
463
464set_parser_options([], _, _, []).
465set_parser_options([H|T], Parser, In, Rest) :-
466 ( set_parser_option(H, Parser, In)
467 -> set_parser_options(T, Parser, In, Rest)
468 ; Rest = [H|R2],
469 set_parser_options(T, Parser, In, R2)
470 ).
471
472set_parser_option(Var, _Parser, _In) :-
473 var(Var),
474 !,
475 instantiation_error(Var).
476set_parser_option(Option, Parser, _) :-
477 def_entity(Option, Parser),
478 !.
479set_parser_option(offset(Offset), _Parser, In) :-
480 !,
481 seek(In, Offset, bof, _).
482set_parser_option(Option, Parser, _In) :-
483 parser_option(Option),
484 !,
485 set_sgml_parser(Parser, Option).
486set_parser_option(Name=Value, Parser, In) :-
487 Option =.. [Name,Value],
488 set_parser_option(Option, Parser, In).
489
490
491parser_option(dialect(_)).
492parser_option(shorttag(_)).
493parser_option(case_sensitive_attributes(_)).
494parser_option(case_preserving_attributes(_)).
495parser_option(system_entities(_)).
496parser_option(max_memory(_)).
497parser_option(file(_)).
498parser_option(line(_)).
499parser_option(space(_)).
500parser_option(number(_)).
501parser_option(defaults(_)).
502parser_option(doctype(_)).
503parser_option(qualify_attributes(_)).
504parser_option(encoding(_)).
505parser_option(keep_prefix(_)).
506
507
508def_entity(entity(Name, Value), Parser) :-
509 get_sgml_parser(Parser, dtd(DTD)),
510 xml_quote_attribute(Value, QValue),
511 setup_call_cleanup(open_dtd(DTD, [], Stream),
512 format(Stream, '<!ENTITY ~w "~w">~n',
513 [Name, QValue]),
514 close(Stream)).
515def_entity(xmlns(URI), Parser) :-
516 set_sgml_parser(Parser, xmlns(URI)).
517def_entity(xmlns(NS, URI), Parser) :-
518 set_sgml_parser(Parser, xmlns(NS, URI)).
519
523
524parser_meta_options([], _, []).
525parser_meta_options([call(When, Closure)|T0], M, [call(When, M:Closure)|T]) :-
526 !,
527 parser_meta_options(T0, M, T).
528parser_meta_options([H|T0], M, [H|T]) :-
529 parser_meta_options(T0, M, T).
530
531
535
536set_input_location(Parser, _In) :-
537 get_sgml_parser(Parser, file(_)),
538 !.
539set_input_location(Parser, In) :-
540 stream_property(In, file_name(File)),
541 !,
542 set_sgml_parser(Parser, file(File)),
543 stream_property(In, position(Pos)),
544 set_sgml_parser(Parser, position(Pos)).
545set_input_location(_, _).
546
547 550
557
558load_sgml_file(File, Term) :-
559 load_sgml(File, Term, []).
560
567
568load_xml_file(File, Term) :-
569 load_xml(File, Term, []).
570
577
578load_html_file(File, DOM) :-
579 load_html(File, DOM, []).
580
607
608load_html(File, Term, M:Options) :-
609 current_prolog_flag(html_dialect, Dialect),
610 dtd(Dialect, DTD),
611 merge_options(Options,
612 [ dtd(DTD),
613 dialect(Dialect),
614 max_errors(-1),
615 syntax_errors(quiet)
616 ], Options1),
617 load_structure(File, Term, M:Options1).
618
626
627load_xml(Input, DOM, M:Options) :-
628 merge_options(Options,
629 [ dialect(xml)
630 ], Options1),
631 load_structure(Input, DOM, M:Options1).
632
640
641load_sgml(Input, DOM, M:Options) :-
642 merge_options(Options,
643 [ dialect(sgml)
644 ], Options1),
645 load_structure(Input, DOM, M:Options1).
646
647
648
649 652
660
661xml_quote_attribute(In, Quoted) :-
662 xml_quote_attribute(In, Quoted, ascii).
663
664xml_quote_cdata(In, Quoted) :-
665 xml_quote_cdata(In, Quoted, ascii).
666
670
671xml_name(In) :-
672 xml_name(In, ascii).
673
674
675 678
690
691
692 695
700
701xml_is_dom(0) :- !, fail. 702xml_is_dom(List) :-
703 is_list(List),
704 !,
705 xml_is_content_list(List).
706xml_is_dom(Term) :-
707 xml_is_element(Term).
708
709xml_is_content_list([]).
710xml_is_content_list([H|T]) :-
711 xml_is_content(H),
712 xml_is_content_list(T).
713
714xml_is_content(0) :- !, fail.
715xml_is_content(pi(Pi)) :-
716 !,
717 atom(Pi).
718xml_is_content(CDATA) :-
719 atom(CDATA),
720 !.
721xml_is_content(CDATA) :-
722 string(CDATA),
723 !.
724xml_is_content(Term) :-
725 xml_is_element(Term).
726
727xml_is_element(element(Name, Attributes, Content)) :-
728 dom_name(Name),
729 dom_attributes(Attributes),
730 xml_is_content_list(Content).
731
732dom_name(NS:Local) :-
733 atom(NS),
734 atom(Local),
735 !.
736dom_name(Local) :-
737 atom(Local).
738
739dom_attributes(0) :- !, fail.
740dom_attributes([]).
741dom_attributes([H|T]) :-
742 dom_attribute(H),
743 dom_attributes(T).
744
745dom_attribute(Name=Value) :-
746 dom_name(Name),
747 atomic(Value).
748
749
750 753:- multifile
754 prolog:message/3. 755
757
758prolog:message(sgml(Parser, File, Line, Message)) -->
759 { get_sgml_parser(Parser, dialect(Dialect))
760 },
761 [ 'SGML2PL(~w): ~w:~w: ~w'-[Dialect, File, Line, Message] ].
762
763
764 767
768:- multifile
769 prolog:called_by/2. 770
771prolog:called_by(sgml_parse(_, Options), Called) :-
772 findall(Meta, meta_call_term(_, Meta, Options), Called).
773
774meta_call_term(T, G+N, Options) :-
775 T = call(Event, G),
776 pmember(T, Options),
777 call_params(Event, Term),
778 functor(Term, _, N).
779
780pmember(X, List) :- 781 nonvar(List),
782 List = [H|T],
783 ( X = H
784 ; pmember(X, T)
785 ).
786
787call_params(begin, begin(tag,attributes,parser)).
788call_params(end, end(tag,parser)).
789call_params(cdata, cdata(cdata,parser)).
790call_params(pi, pi(cdata,parser)).
791call_params(decl, decl(cdata,parser)).
792call_params(error, error(severity,message,parser)).
793call_params(xmlns, xmlns(namespace,url,parser)).
794call_params(urlns, urlns(url,url,parser)).
795
796 799
800:- multifile
801 sandbox:safe_primitive/1,
802 sandbox:safe_meta_predicate/1. 803
804sandbox:safe_meta_predicate(sgml:load_structure/3).
805sandbox:safe_primitive(sgml:dtd(Dialect, _)) :-
806 dtd_alias(Dialect, _).
807sandbox:safe_primitive(sgml:xml_quote_attribute(_,_,_)).
808sandbox:safe_primitive(sgml:xml_quote_cdata(_,_,_)).
809sandbox:safe_primitive(sgml:xml_name(_,_)).
810sandbox:safe_primitive(sgml:xml_basechar(_)).
811sandbox:safe_primitive(sgml:xml_ideographic(_)).
812sandbox:safe_primitive(sgml:xml_combining_char(_)).
813sandbox:safe_primitive(sgml:xml_digit(_)).
814sandbox:safe_primitive(sgml:xml_extender(_)).
815sandbox:safe_primitive(sgml:iri_xml_namespace(_,_,_)).
816sandbox:safe_primitive(sgml:xsd_number_string(_,_)).
817sandbox:safe_primitive(sgml:xsd_time_string(_,_,_))