From 2f3655c9c3c6ab8c1fdab17e7afe4080ad7ffc1b Mon Sep 17 00:00:00 2001 From: Nick Wellnhofer Date: Tue, 20 May 2025 19:40:06 +0200 Subject: [PATCH] parser: Pop PEs that start markup declarations explicitly We currently only handle "Validity constraint: Proper Declaration/PE Nesting", but we must detect "Well-formedness constraint: PE Between Declarations" separately: > The replacement text of a parameter entity reference in a DeclSep must > match the production extSubsetDecl. PEs in DeclSeps are PEs that start with a full markup declaration (or another PE). These are handled in xmParse{Internal|External}Subset. We set a flag on these PEs and don't close them implicitly in xmlSkipBlankCharsPE. This will make unterminated declarations in such PEs cause a parser error. The PEs are closed explicitly in xmParse{Internal|External}Subset, the only location where they are allowed to end. --- include/private/parser.h | 1 + parser.c | 166 +++++++++++++++++++--------- python/tests/reader2.py | 9 +- result/errors/759573-2.xml.ent | 4 +- result/errors/759573-2.xml.err | 4 +- result/errors/759573-2.xml.str | 4 +- result/errors10/781361.xml.err | 15 ++- result/valid/cond_sect2.xml.err | 9 +- result/valid/cond_sect2.xml.err.rdr | 9 +- 9 files changed, 148 insertions(+), 73 deletions(-) diff --git a/include/private/parser.h b/include/private/parser.h index 9ad9ac6e..fd3a0f1a 100644 --- a/include/private/parser.h +++ b/include/private/parser.h @@ -33,6 +33,7 @@ #define XML_INPUT_USES_ENC_DECL (1u << 4) #define XML_INPUT_ENCODING_ERROR (1u << 5) #define XML_INPUT_PROGRESSIVE (1u << 6) +#define XML_INPUT_MARKUP_DECL (1u << 7) #define PARSER_STOPPED(ctxt) ((ctxt)->disableSAX > 1) diff --git a/parser.c b/parser.c index 36cfec89..1c6bd16c 100644 --- a/parser.c +++ b/parser.c @@ -204,6 +204,9 @@ xmlCtxtParseEntity(xmlParserCtxtPtr ctxt, xmlEntityPtr ent); static int xmlLoadEntityContent(xmlParserCtxtPtr ctxt, xmlEntityPtr entity); +static void +xmlParsePERefInternal(xmlParserCtxt *ctxt, int markupDecl); + /************************************************************************ * * * Some factorized error routines * @@ -2440,7 +2443,7 @@ xmlSkipBlankCharsPE(xmlParserCtxtPtr ctxt) { * even consume the whole entity and pop it. We might * even pop multiple PEs in this loop. */ - xmlParsePEReference(ctxt); + xmlParsePERefInternal(ctxt, 0); inParam = PARSER_IN_PE(ctxt); expandParam = PARSER_EXTERNAL(ctxt); @@ -2448,6 +2451,14 @@ xmlSkipBlankCharsPE(xmlParserCtxtPtr ctxt) { if (inParam == 0) break; + /* + * Don't pop parameter entities that start a markup + * declaration to detect Well-formedness constraint: + * PE Between Declarations. + */ + if (ctxt->input->flags & XML_INPUT_MARKUP_DECL) + break; + xmlPopPE(ctxt); inParam = PARSER_IN_PE(ctxt); @@ -2747,7 +2758,7 @@ xmlParseStringCharRef(xmlParserCtxtPtr ctxt, const xmlChar **str) { */ void xmlParserHandlePEReference(xmlParserCtxt *ctxt) { - xmlParsePEReference(ctxt); + xmlParsePERefInternal(ctxt, 0); } /** @@ -6648,20 +6659,31 @@ xmlParseConditionalSections(xmlParserCtxtPtr ctxt) { int *inputIds = NULL; size_t inputIdsSize = 0; size_t depth = 0; + int isFreshPE = 0; + int oldInputNr = ctxt->inputNr; + int declInputNr = ctxt->inputNr; - while (PARSER_STOPPED(ctxt) == 0) { - if ((RAW == '<') && (NXT(1) == '!') && (NXT(2) == '[')) { + while (!PARSER_STOPPED(ctxt)) { + if (ctxt->input->cur >= ctxt->input->end) { + if (ctxt->inputNr <= oldInputNr) { + xmlFatalErr(ctxt, XML_ERR_EXT_SUBSET_NOT_FINISHED, NULL); + goto error; + } + + xmlPopPE(ctxt); + declInputNr = ctxt->inputNr; + } else if ((RAW == '<') && (NXT(1) == '!') && (NXT(2) == '[')) { int id = ctxt->input->id; - SKIP(3); SKIP_BLANKS_PE; + isFreshPE = 0; + if (CMP7(CUR_PTR, 'I', 'N', 'C', 'L', 'U', 'D', 'E')) { SKIP(7); SKIP_BLANKS_PE; if (RAW != '[') { xmlFatalErr(ctxt, XML_ERR_CONDSEC_INVALID, NULL); - xmlHaltParser(ctxt); goto error; } if (ctxt->input->id != id) { @@ -6700,7 +6722,6 @@ xmlParseConditionalSections(xmlParserCtxtPtr ctxt) { SKIP_BLANKS_PE; if (RAW != '[') { xmlFatalErr(ctxt, XML_ERR_CONDSEC_INVALID, NULL); - xmlHaltParser(ctxt); goto error; } if (ctxt->input->id != id) { @@ -6741,11 +6762,17 @@ xmlParseConditionalSections(xmlParserCtxtPtr ctxt) { } } else { xmlFatalErr(ctxt, XML_ERR_CONDSEC_INVALID_KEYWORD, NULL); - xmlHaltParser(ctxt); goto error; } } else if ((depth > 0) && (RAW == ']') && (NXT(1) == ']') && (NXT(2) == '>')) { + if (isFreshPE) { + xmlFatalErrMsg(ctxt, XML_ERR_CONDSEC_INVALID, + "Parameter entity must match " + "extSubsetDecl\n"); + goto error; + } + depth--; if (ctxt->input->id != inputIds[depth]) { xmlFatalErrMsg(ctxt, XML_ERR_ENTITY_BOUNDARY, @@ -6754,17 +6781,23 @@ xmlParseConditionalSections(xmlParserCtxtPtr ctxt) { } SKIP(3); } else if ((RAW == '<') && ((NXT(1) == '!') || (NXT(1) == '?'))) { + isFreshPE = 0; xmlParseMarkupDecl(ctxt); + } else if (RAW == '%') { + xmlParsePERefInternal(ctxt, 1); + if (ctxt->inputNr > declInputNr) { + isFreshPE = 1; + declInputNr = ctxt->inputNr; + } } else { xmlFatalErr(ctxt, XML_ERR_EXT_SUBSET_NOT_FINISHED, NULL); - xmlHaltParser(ctxt); goto error; } if (depth == 0) break; - SKIP_BLANKS_PE; + SKIP_BLANKS; SHRINK; GROW; } @@ -6937,7 +6970,7 @@ xmlParseExternalSubset(xmlParserCtxt *ctxt, const xmlChar *publicId, } ctxt->myDoc->properties = XML_DOC_INTERNAL; } - if ((ctxt->myDoc != NULL) && (ctxt->myDoc->intSubset == NULL) && + if ((ctxt->myDoc->intSubset == NULL) && (xmlCreateIntSubset(ctxt->myDoc, NULL, publicId, systemId) == NULL)) { xmlErrMemory(ctxt); } @@ -6945,27 +6978,32 @@ xmlParseExternalSubset(xmlParserCtxt *ctxt, const xmlChar *publicId, ctxt->inSubset = 2; oldInputNr = ctxt->inputNr; - SKIP_BLANKS_PE; - while (((RAW != 0) || (ctxt->inputNr > oldInputNr)) && - (!PARSER_STOPPED(ctxt))) { - GROW; - if ((RAW == '<') && (NXT(1) == '!') && (NXT(2) == '[')) { + SKIP_BLANKS; + while (!PARSER_STOPPED(ctxt)) { + if (ctxt->input->cur >= ctxt->input->end) { + if (ctxt->inputNr <= oldInputNr) { + xmlParserCheckEOF(ctxt, XML_ERR_EXT_SUBSET_NOT_FINISHED); + break; + } + + xmlPopPE(ctxt); + } else if ((RAW == '<') && (NXT(1) == '!') && (NXT(2) == '[')) { xmlParseConditionalSections(ctxt); } else if ((RAW == '<') && ((NXT(1) == '!') || (NXT(1) == '?'))) { xmlParseMarkupDecl(ctxt); + } else if (RAW == '%') { + xmlParsePERefInternal(ctxt, 1); } else { xmlFatalErr(ctxt, XML_ERR_EXT_SUBSET_NOT_FINISHED, NULL); - xmlHaltParser(ctxt); - return; + + while (ctxt->inputNr > oldInputNr) + xmlPopPE(ctxt); + break; } - SKIP_BLANKS_PE; + SKIP_BLANKS; SHRINK; + GROW; } - - while (ctxt->inputNr > oldInputNr) - xmlPopPE(ctxt); - - xmlParserCheckEOF(ctxt, XML_ERR_EXT_SUBSET_NOT_FINISHED); } /** @@ -7455,8 +7493,6 @@ xmlParseStringEntityRef(xmlParserCtxtPtr ctxt, const xmlChar ** str) { /** * Parse a parameter entity reference. Always consumes '%'. * - * @deprecated Internal function, don't use. - * * The entity content is handled directly by pushing it's content as * a new input stream. * @@ -7482,10 +7518,10 @@ xmlParseStringEntityRef(xmlParserCtxtPtr ctxt, const xmlChar ** str) { * NOTE: misleading but this is handled. * * @param ctxt an XML parser context + * @param markupDecl whether the PERef starts a markup declaration */ -void -xmlParsePEReference(xmlParserCtxt *ctxt) -{ +static void +xmlParsePERefInternal(xmlParserCtxt *ctxt, int markupDecl) { const xmlChar *name; xmlEntityPtr entity = NULL; xmlParserInputPtr input; @@ -7548,6 +7584,9 @@ xmlParsePEReference(xmlParserCtxt *ctxt) entity->flags |= XML_ENT_EXPANDING; + if (markupDecl) + input->flags |= XML_INPUT_MARKUP_DECL; + GROW; if (entity->etype == XML_EXTERNAL_PARAMETER_ENTITY) { @@ -7562,6 +7601,18 @@ xmlParsePEReference(xmlParserCtxt *ctxt) } } +/** + * Parse a parameter entity reference. + * + * @deprecated Internal function, don't use. + * + * @param ctxt an XML parser context + */ +void +xmlParsePEReference(xmlParserCtxt *ctxt) { + xmlParsePERefInternal(ctxt, 0); +} + /** * Load the content of an entity. * @@ -7880,44 +7931,49 @@ xmlParseInternalSubset(xmlParserCtxtPtr ctxt) { * Subsequence (markupdecl | PEReference | S)* */ SKIP_BLANKS; - while (((RAW != ']') || (ctxt->inputNr > oldInputNr)) && - (PARSER_STOPPED(ctxt) == 0)) { - - /* - * Conditional sections are allowed from external entities included - * by PE References in the internal subset. - */ - if ((PARSER_EXTERNAL(ctxt)) && - (RAW == '<') && (NXT(1) == '!') && (NXT(2) == '[')) { + while (1) { + if (PARSER_STOPPED(ctxt)) { + return; + } else if (ctxt->input->cur >= ctxt->input->end) { + if (ctxt->inputNr <= oldInputNr) { + xmlFatalErr(ctxt, XML_ERR_INT_SUBSET_NOT_FINISHED, NULL); + return; + } + xmlPopPE(ctxt); + } else if ((RAW == ']') && (ctxt->inputNr <= oldInputNr)) { + NEXT; + SKIP_BLANKS; + break; + } else if ((PARSER_EXTERNAL(ctxt)) && + (RAW == '<') && (NXT(1) == '!') && (NXT(2) == '[')) { + /* + * Conditional sections are allowed in external entities + * included by PE References in the internal subset. + */ xmlParseConditionalSections(ctxt); } else if ((RAW == '<') && ((NXT(1) == '!') || (NXT(1) == '?'))) { - xmlParseMarkupDecl(ctxt); + xmlParseMarkupDecl(ctxt); } else if (RAW == '%') { - xmlParsePEReference(ctxt); + xmlParsePERefInternal(ctxt, 1); } else { - xmlFatalErr(ctxt, XML_ERR_INT_SUBSET_NOT_FINISHED, NULL); - break; + xmlFatalErr(ctxt, XML_ERR_INT_SUBSET_NOT_FINISHED, NULL); + + while (ctxt->inputNr > oldInputNr) + xmlPopPE(ctxt); + return; } - SKIP_BLANKS_PE; + SKIP_BLANKS; SHRINK; GROW; - } - - while (ctxt->inputNr > oldInputNr) - xmlPopPE(ctxt); - - if (RAW == ']') { - NEXT; - SKIP_BLANKS; - } + } } /* * We should be at the end of the DOCTYPE declaration. */ - if ((ctxt->wellFormed) && (RAW != '>')) { - xmlFatalErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED, NULL); - return; + if (RAW != '>') { + xmlFatalErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED, NULL); + return; } NEXT; } diff --git a/python/tests/reader2.py b/python/tests/reader2.py index 59141a88..b5f493ff 100755 --- a/python/tests/reader2.py +++ b/python/tests/reader2.py @@ -47,14 +47,17 @@ value ^ """.format(dir_prefix), 'cond_sect2': -"""{0}/dtds/cond_sect2.dtd:15: parser error : All markup of the conditional section is not in the same entity +"""{0}/dtds/cond_sect2.dtd:15: parser error : Parameter entity must match extSubsetDecl %ent; ^ Entity: line 1: ]]> ^ -{0}/dtds/cond_sect2.dtd:17: parser error : Content error in the external subset - +{0}/dtds/cond_sect2.dtd:15: parser error : Content error in the external subset + %ent; + ^ +Entity: line 1: +]]> ^ """.format(dir_prefix), 'rss': diff --git a/result/errors/759573-2.xml.ent b/result/errors/759573-2.xml.ent index ea2726e1..e7418a73 100644 --- a/result/errors/759573-2.xml.ent +++ b/result/errors/759573-2.xml.ent @@ -19,12 +19,12 @@ Entity: line 1: ./test/errors/759573-2.xml:6: parser error : Content error in the internal subset %xx; ^ -Entity: line 2: +Entity: line 1: ^ -test/valid/dtds/cond_sect2.dtd:17: parser error : Content error in the external subset - +test/valid/dtds/cond_sect2.dtd:15: parser error : Content error in the external subset + %ent; + ^ +Entity: line 1: +]]> ^ diff --git a/result/valid/cond_sect2.xml.err.rdr b/result/valid/cond_sect2.xml.err.rdr index fd3cb75d..28d3b2dd 100644 --- a/result/valid/cond_sect2.xml.err.rdr +++ b/result/valid/cond_sect2.xml.err.rdr @@ -1,10 +1,13 @@ -test/valid/dtds/cond_sect2.dtd:15: parser error : All markup of the conditional section is not in the same entity +test/valid/dtds/cond_sect2.dtd:15: parser error : Parameter entity must match extSubsetDecl %ent; ^ Entity: line 1: ]]> ^ -test/valid/dtds/cond_sect2.dtd:17: parser error : Content error in the external subset - +test/valid/dtds/cond_sect2.dtd:15: parser error : Content error in the external subset + %ent; + ^ +Entity: line 1: +]]> ^ ./test/valid/cond_sect2.xml : failed to parse