diff --git a/doc/src/sgml/datatype.sgml b/doc/src/sgml/datatype.sgml index 7bc35ee172d..5cd7ec9157a 100644 --- a/doc/src/sgml/datatype.sgml +++ b/doc/src/sgml/datatype.sgml @@ -4136,9 +4136,11 @@ a0ee-bc99-9c0b-4ef8-bb6d-6bb9-bd38-0a11 The xml type can store well-formed documents, as defined by the XML standard, as well - as content fragments, which are defined by the - production XMLDecl? content in the XML - standard. Roughly, this means that content fragments can have + as content fragments, which are defined by reference + to the more permissive + document node + of the XQuery and XPath data model. + Roughly, this means that content fragments can have more than one top-level element or character node. The expression xmlvalue IS DOCUMENT can be used to evaluate whether a particular xml @@ -4213,16 +4215,6 @@ SET xmloption TO { DOCUMENT | CONTENT }; data are allowed. - - - With the default XML option setting, you cannot directly cast - character strings to type xml if they contain a - document type declaration, because the definition of XML content - fragment does not accept them. If you need to do that, either - use XMLPARSE or change the XML option. - - - diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c index 0309fdde9af..79698997def 100644 --- a/src/backend/utils/adt/xml.c +++ b/src/backend/utils/adt/xml.c @@ -139,6 +139,7 @@ static int parse_xml_decl(const xmlChar *str, size_t *lenp, xmlChar **version, xmlChar **encoding, int *standalone); static bool print_xml_decl(StringInfo buf, const xmlChar *version, pg_enc encoding, int standalone); +static bool xml_doctype_in_content(const xmlChar *str); static xmlDocPtr xml_parse(text *data, XmlOptionType xmloption_arg, bool preserve_whitespace, int encoding); static text *xml_xmlnodetoxmltype(xmlNodePtr cur, PgXmlErrorContext *xmlerrcxt); @@ -1154,8 +1155,15 @@ parse_xml_decl(const xmlChar *str, size_t *lenp, if (xmlStrncmp(p, (xmlChar *) " */ - utf8len = strlen((const char *) (p + 5)); + /* + * If next char is a name char, it's a PI like + * rather than an XMLDecl, so we have done what we came to do and found no + * XMLDecl. + * + * We need an input length value for xmlGetUTF8Char, but there's no need + * to count the whole document size, so use strnlen not strlen. + */ + utf8len = strnlen((const char *) (p + 5), MAX_MULTIBYTE_CHAR_LEN); utf8char = xmlGetUTF8Char(p + 5, &utf8len); if (PG_XMLISNAMECHAR(utf8char)) goto finished; @@ -1326,6 +1334,88 @@ print_xml_decl(StringInfo buf, const xmlChar *version, return false; } +/* + * Test whether an input that is to be parsed as CONTENT contains a DTD. + * + * The SQL/XML:2003 definition of CONTENT ("XMLDecl? content") is not + * satisfied by a document with a DTD, which is a bit of a wart, as it means + * the CONTENT type is not a proper superset of DOCUMENT. SQL/XML:2006 and + * later fix that, by redefining content with reference to the "more + * permissive" Document Node of the XQuery/XPath Data Model, such that any + * DOCUMENT value is indeed also a CONTENT value. That definition is more + * useful, as CONTENT becomes usable for parsing input of unknown form (think + * pg_restore). + * + * As used below in parse_xml when parsing for CONTENT, libxml does not give + * us the 2006+ behavior, but only the 2003; it will choke if the input has + * a DTD. But we can provide the 2006+ definition of CONTENT easily enough, + * by detecting this case first and simply doing the parse as DOCUMENT. + * + * A DTD can be found arbitrarily far in, but that would be a contrived case; + * it will ordinarily start within a few dozen characters. The only things + * that can precede it are an XMLDecl (here, the caller will have called + * parse_xml_decl already), whitespace, comments, and processing instructions. + * This function need only return true if it sees a valid sequence of such + * things leading to must follow */ + p = xmlStrstr(p + 2, (xmlChar *) "--"); + if (!p || p[2] != '>') + return false; + /* advance over comment, and keep scanning */ + p += 3; + continue; + } + + /* otherwise, if it's not a PI , fail */ + if (*p != '?') + return false; + p++; + + /* find end of PI (the string ?> is forbidden within a PI) */ + e = xmlStrstr(p, (xmlChar *) "?>"); + if (!e) + return false; + + /* we don't check PIs carefully, but do reject "xml" target */ + if (e - p >= 3 && xmlStrncasecmp(p, (xmlChar *) "xml", 3) == 0) + return false; + + /* advance over PI, keep scanning */ + p = e + 2; + } +} + /* * Convert a C string to XML internal representation @@ -1361,6 +1451,12 @@ xml_parse(text *data, XmlOptionType xmloption_arg, bool preserve_whitespace, /* Use a TRY block to ensure we clean up correctly */ PG_TRY(); { + bool parse_as_document = false; + int res_code; + size_t count = 0; + xmlChar *version = NULL; + int standalone = 0; + xmlInitParser(); ctxt = xmlNewParserCtxt(); @@ -1368,7 +1464,25 @@ xml_parse(text *data, XmlOptionType xmloption_arg, bool preserve_whitespace, xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY, "could not allocate parser context"); + /* Decide whether to parse as document or content */ if (xmloption_arg == XMLOPTION_DOCUMENT) + parse_as_document = true; + else + { + /* Parse and skip over the XML declaration, if any */ + res_code = parse_xml_decl(utf8string, + &count, &version, NULL, &standalone); + if (res_code != 0) + xml_ereport_by_code(ERROR, ERRCODE_INVALID_XML_CONTENT, + "invalid XML content: invalid XML declaration", + res_code); + + /* Is there a DOCTYPE element? */ + if (xml_doctype_in_content(utf8string + count)) + parse_as_document = true; + } + + if (parse_as_document) { /* * Note, that here we try to apply DTD defaults @@ -1383,23 +1497,18 @@ xml_parse(text *data, XmlOptionType xmloption_arg, bool preserve_whitespace, XML_PARSE_NOENT | XML_PARSE_DTDATTR | (preserve_whitespace ? 0 : XML_PARSE_NOBLANKS)); if (doc == NULL || xmlerrcxt->err_occurred) - xml_ereport(xmlerrcxt, ERROR, ERRCODE_INVALID_XML_DOCUMENT, - "invalid XML document"); + { + /* Use original option to decide which error code to throw */ + if (xmloption_arg == XMLOPTION_DOCUMENT) + xml_ereport(xmlerrcxt, ERROR, ERRCODE_INVALID_XML_DOCUMENT, + "invalid XML document"); + else + xml_ereport(xmlerrcxt, ERROR, ERRCODE_INVALID_XML_CONTENT, + "invalid XML content"); + } } else { - int res_code; - size_t count; - xmlChar *version; - int standalone; - - res_code = parse_xml_decl(utf8string, - &count, &version, NULL, &standalone); - if (res_code != 0) - xml_ereport_by_code(ERROR, ERRCODE_INVALID_XML_CONTENT, - "invalid XML content: invalid XML declaration", - res_code); - doc = xmlNewDoc(version); Assert(doc->encoding == NULL); doc->encoding = xmlStrdup((const xmlChar *) "UTF-8"); diff --git a/src/test/regress/expected/xml.out b/src/test/regress/expected/xml.out index 9541b444abb..29d40baf792 100644 --- a/src/test/regress/expected/xml.out +++ b/src/test/regress/expected/xml.out @@ -532,6 +532,13 @@ LINE 1: EXECUTE foo ('bad'); DETAIL: line 1: Start tag expected, '<' not found bad ^ +SELECT xml ''; +ERROR: invalid XML document +LINE 1: SELECT xml ''; + ^ +DETAIL: line 1: Extra content at the end of the document + + ^ SET XML OPTION CONTENT; EXECUTE foo (''); xmlconcat @@ -545,6 +552,45 @@ EXECUTE foo ('good'); good (1 row) +SELECT xml ' '; + xml +-------------------------------------------------------------------- + +(1 row) + +SELECT xml ' '; + xml +------------------------------ + +(1 row) + +SELECT xml ''; + xml +------------------ + +(1 row) + +SELECT xml ' oops '; +ERROR: invalid XML content +LINE 1: SELECT xml ' oops '; + ^ +DETAIL: line 1: StartTag: invalid element name + oops + ^ +SELECT xml ' '; +ERROR: invalid XML content +LINE 1: SELECT xml ' '; + ^ +DETAIL: line 1: StartTag: invalid element name + + ^ +SELECT xml ''; +ERROR: invalid XML content +LINE 1: SELECT xml ''; + ^ +DETAIL: line 1: Extra content at the end of the document + + ^ -- Test backwards parsing CREATE VIEW xmlview1 AS SELECT xmlcomment('test'); CREATE VIEW xmlview2 AS SELECT xmlconcat('hello', 'you'); diff --git a/src/test/regress/expected/xml_1.out b/src/test/regress/expected/xml_1.out index a9b2b3b711b..edd1c731570 100644 --- a/src/test/regress/expected/xml_1.out +++ b/src/test/regress/expected/xml_1.out @@ -429,11 +429,53 @@ EXECUTE foo (''); ERROR: prepared statement "foo" does not exist EXECUTE foo ('bad'); ERROR: prepared statement "foo" does not exist +SELECT xml ''; +ERROR: unsupported XML feature +LINE 1: SELECT xml ''; + ^ +DETAIL: This functionality requires the server to be built with libxml support. +HINT: You need to rebuild PostgreSQL using --with-libxml. SET XML OPTION CONTENT; EXECUTE foo (''); ERROR: prepared statement "foo" does not exist EXECUTE foo ('good'); ERROR: prepared statement "foo" does not exist +SELECT xml ' '; +ERROR: unsupported XML feature +LINE 1: SELECT xml ' '; +ERROR: unsupported XML feature +LINE 1: SELECT xml ' '; +ERROR: unsupported XML feature +LINE 1: SELECT xml ''; + ^ +DETAIL: This functionality requires the server to be built with libxml support. +HINT: You need to rebuild PostgreSQL using --with-libxml. +SELECT xml ' oops '; +ERROR: unsupported XML feature +LINE 1: SELECT xml ' oops '; + ^ +DETAIL: This functionality requires the server to be built with libxml support. +HINT: You need to rebuild PostgreSQL using --with-libxml. +SELECT xml ' '; +ERROR: unsupported XML feature +LINE 1: SELECT xml ' '; + ^ +DETAIL: This functionality requires the server to be built with libxml support. +HINT: You need to rebuild PostgreSQL using --with-libxml. +SELECT xml ''; +ERROR: unsupported XML feature +LINE 1: SELECT xml ''; + ^ +DETAIL: This functionality requires the server to be built with libxml support. +HINT: You need to rebuild PostgreSQL using --with-libxml. -- Test backwards parsing CREATE VIEW xmlview1 AS SELECT xmlcomment('test'); CREATE VIEW xmlview2 AS SELECT xmlconcat('hello', 'you'); diff --git a/src/test/regress/expected/xml_2.out b/src/test/regress/expected/xml_2.out index 393861ee59a..6f18130f6a7 100644 --- a/src/test/regress/expected/xml_2.out +++ b/src/test/regress/expected/xml_2.out @@ -512,6 +512,13 @@ LINE 1: EXECUTE foo ('bad'); DETAIL: line 1: Start tag expected, '<' not found bad ^ +SELECT xml ''; +ERROR: invalid XML document +LINE 1: SELECT xml ''; + ^ +DETAIL: line 1: Extra content at the end of the document + + ^ SET XML OPTION CONTENT; EXECUTE foo (''); xmlconcat @@ -525,6 +532,45 @@ EXECUTE foo ('good'); good (1 row) +SELECT xml ' '; + xml +-------------------------------------------------------------------- + +(1 row) + +SELECT xml ' '; + xml +------------------------------ + +(1 row) + +SELECT xml ''; + xml +------------------ + +(1 row) + +SELECT xml ' oops '; +ERROR: invalid XML content +LINE 1: SELECT xml ' oops '; + ^ +DETAIL: line 1: StartTag: invalid element name + oops + ^ +SELECT xml ' '; +ERROR: invalid XML content +LINE 1: SELECT xml ' '; + ^ +DETAIL: line 1: StartTag: invalid element name + + ^ +SELECT xml ''; +ERROR: invalid XML content +LINE 1: SELECT xml ''; + ^ +DETAIL: line 1: Extra content at the end of the document + + ^ -- Test backwards parsing CREATE VIEW xmlview1 AS SELECT xmlcomment('test'); CREATE VIEW xmlview2 AS SELECT xmlconcat('hello', 'you'); diff --git a/src/test/regress/sql/xml.sql b/src/test/regress/sql/xml.sql index 07b2dd77bed..ca1f0422b36 100644 --- a/src/test/regress/sql/xml.sql +++ b/src/test/regress/sql/xml.sql @@ -149,10 +149,17 @@ PREPARE foo (xml) AS SELECT xmlconcat('', $1); SET XML OPTION DOCUMENT; EXECUTE foo (''); EXECUTE foo ('bad'); +SELECT xml ''; SET XML OPTION CONTENT; EXECUTE foo (''); EXECUTE foo ('good'); +SELECT xml ' '; +SELECT xml ' '; +SELECT xml ''; +SELECT xml ' oops '; +SELECT xml ' '; +SELECT xml ''; -- Test backwards parsing