From 6edf1a645e76e967759f01b6d86d8eaf7b2bd497 Mon Sep 17 00:00:00 2001 From: Nick Wellnhofer Date: Mon, 9 Sep 2024 02:09:20 +0200 Subject: [PATCH] html: Fix DOCTYPE parsing --- HTMLparser.c | 385 +++++++++++++++++++++------------------------------ 1 file changed, 155 insertions(+), 230 deletions(-) diff --git a/HTMLparser.c b/HTMLparser.c index 9d638289..f1e4bf67 100644 --- a/HTMLparser.c +++ b/HTMLparser.c @@ -80,23 +80,6 @@ htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error, str1, str2, NULL, 0, msg, str1, str2); } -/** - * htmlParseErrInt: - * @ctxt: an HTML parser context - * @error: the error number - * @msg: the error message - * @val: integer info - * - * Handle a fatal parser error, i.e. violating Well-Formedness constraints - */ -static void LIBXML_ATTR_FORMAT(3,0) -htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error, - const char *msg, int val) -{ - xmlCtxtErr(ctxt, NULL, XML_FROM_HTML, error, XML_ERR_ERROR, - NULL, NULL, NULL, val, msg, val); -} - /************************************************************************ * * * Parser stacks related functions and macros * @@ -2996,125 +2979,6 @@ htmlParseAttValue(htmlParserCtxtPtr ctxt) { return(ret); } -/** - * htmlParseSystemLiteral: - * @ctxt: an HTML parser context - * - * parse an HTML Literal - * - * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") - * - * Returns the SystemLiteral parsed or NULL - */ - -static xmlChar * -htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) { - size_t len = 0, startPosition = 0; - int err = 0; - int quote; - xmlChar *ret = NULL; - - if ((CUR != '"') && (CUR != '\'')) { - htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED, - "SystemLiteral \" or ' expected\n", NULL, NULL); - return(NULL); - } - quote = CUR; - NEXT; - - if (CUR_PTR < BASE_PTR) - return(ret); - startPosition = CUR_PTR - BASE_PTR; - - while ((PARSER_STOPPED(ctxt) == 0) && - (CUR != 0) && (CUR != quote) && (CUR != '>')) { - /* TODO: Handle UTF-8 */ - if (!IS_CHAR_CH(CUR)) { - 
htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, - "Invalid char in SystemLiteral 0x%X\n", CUR); - err = 1; - } - NEXT; - len++; - } - if (CUR != quote) { - htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, - "Unfinished SystemLiteral\n", NULL, NULL); - } else { - if (err == 0) { - ret = xmlStrndup((BASE_PTR+startPosition), len); - if (ret == NULL) { - htmlErrMemory(ctxt); - return(NULL); - } - } - NEXT; - } - - return(ret); -} - -/** - * htmlParsePubidLiteral: - * @ctxt: an HTML parser context - * - * parse an HTML public literal - * - * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" - * - * Returns the PubidLiteral parsed or NULL. - */ - -static xmlChar * -htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) { - size_t len = 0, startPosition = 0; - int err = 0; - int quote; - xmlChar *ret = NULL; - - if ((CUR != '"') && (CUR != '\'')) { - htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED, - "PubidLiteral \" or ' expected\n", NULL, NULL); - return(NULL); - } - quote = CUR; - NEXT; - - /* - * Name ::= (Letter | '_') (NameChar)* - */ - if (CUR_PTR < BASE_PTR) - return(ret); - startPosition = CUR_PTR - BASE_PTR; - - while ((PARSER_STOPPED(ctxt) == 0) && - (CUR != 0) && (CUR != quote) && (CUR != '>')) { - if (!IS_PUBIDCHAR_CH(CUR)) { - htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, - "Invalid char in PubidLiteral 0x%X\n", CUR); - err = 1; - } - len++; - NEXT; - } - - if (CUR != quote) { - htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, - "Unfinished PubidLiteral\n", NULL, NULL); - } else { - if (err == 0) { - ret = xmlStrndup((BASE_PTR + startPosition), len); - if (ret == NULL) { - htmlErrMemory(ctxt); - return(NULL); - } - } - NEXT; - } - - return(ret); -} - static void htmlCharDataSAXCallback(htmlParserCtxtPtr ctxt, const xmlChar *buf, int size, int mode) { @@ -3281,64 +3145,6 @@ htmlParseCharData(htmlParserCtxtPtr ctxt, int terminate) { return(res); } -/** - * htmlParseExternalID: - * @ctxt: an HTML parser context - * @publicID: a xmlChar** receiving 
PubidLiteral - * - * Parse an External ID or a Public ID - * - * [75] ExternalID ::= 'SYSTEM' S SystemLiteral - * | 'PUBLIC' S PubidLiteral S SystemLiteral - * - * [83] PublicID ::= 'PUBLIC' S PubidLiteral - * - * Returns the function returns SystemLiteral and in the second - * case publicID receives PubidLiteral, is strict is off - * it is possible to return NULL and have publicID set. - */ - -static xmlChar * -htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) { - xmlChar *URI = NULL; - - if ((UPPER == 'S') && (UPP(1) == 'Y') && - (UPP(2) == 'S') && (UPP(3) == 'T') && - (UPP(4) == 'E') && (UPP(5) == 'M')) { - SKIP(6); - if (!IS_BLANK_CH(CUR)) { - htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED, - "Space required after 'SYSTEM'\n", NULL, NULL); - } - SKIP_BLANKS; - URI = htmlParseSystemLiteral(ctxt); - if (URI == NULL) { - htmlParseErr(ctxt, XML_ERR_URI_REQUIRED, - "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL); - } - } else if ((UPPER == 'P') && (UPP(1) == 'U') && - (UPP(2) == 'B') && (UPP(3) == 'L') && - (UPP(4) == 'I') && (UPP(5) == 'C')) { - SKIP(6); - if (!IS_BLANK_CH(CUR)) { - htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED, - "Space required after 'PUBLIC'\n", NULL, NULL); - } - SKIP_BLANKS; - *publicID = htmlParsePubidLiteral(ctxt); - if (*publicID == NULL) { - htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED, - "htmlParseExternalID: PUBLIC, no Public Identifier\n", - NULL, NULL); - } - SKIP_BLANKS; - if ((CUR == '"') || (CUR == '\'')) { - URI = htmlParseSystemLiteral(ctxt); - } - } - return(URI); -} - /** * htmlParseComment: * @ctxt: an HTML parser context @@ -3515,21 +3321,92 @@ htmlParseCharRef(htmlParserCtxtPtr ctxt) { } +/** + * htmlParseDoctypeLiteral: + * @ctxt: an HTML parser context + * + * Parse a DOCTYPE SYSTEM or PUBLIC literal. + * + * Returns the literal or NULL in case of error. 
+ */ + +static xmlChar * +htmlParseDoctypeLiteral(htmlParserCtxtPtr ctxt) { + xmlChar *buf = NULL; + int len; + int size = HTML_PARSER_BUFFER_SIZE; + int quote, cur, l; + int maxLength = (ctxt->options & XML_PARSE_HUGE) ? + XML_MAX_TEXT_LENGTH : + XML_MAX_NAME_LENGTH; + + if ((CUR != '"') && (CUR != '\'')) + return(NULL); + quote = CUR; + NEXT; + + buf = xmlMalloc(size); + if (buf == NULL) { + htmlErrMemory(ctxt); + return(NULL); + } + len = 0; + + while (ctxt->input->cur < ctxt->input->end) { + cur = CUR_CHAR(l); + + if (cur == '>') + break; + + if (cur == quote) { + SKIP(1); + break; + } + + if (len + 5 >= size) { + xmlChar *tmp; + + size *= 2; + tmp = (xmlChar *) xmlRealloc(buf, size); + if (tmp == NULL) { + xmlFree(buf); + htmlErrMemory(ctxt); + return(NULL); + } + buf = tmp; + } + + COPY_BUF(buf,len,cur); + if (len > maxLength) { + htmlParseErr(ctxt, XML_ERR_RESOURCE_LIMIT, + "identifier too long", NULL, NULL); + xmlFree(buf); + return(NULL); + } + + NEXTL(l); + } + + buf[len] = 0; + return(buf); +} + /** * htmlParseDocTypeDecl: * @ctxt: an HTML parser context * - * parse a DOCTYPE declaration - * - * [28] doctypedecl ::= '' + * Parse a DOCTYPE declaration. */ static void htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) { - const xmlChar *name; - xmlChar *ExternalID = NULL; + xmlChar *name = NULL; + xmlChar *publicId = NULL; xmlChar *URI = NULL; + int nameCap, nameSize; + int maxLength = (ctxt->options & XML_PARSE_HUGE) ? + XML_MAX_TEXT_LENGTH : + XML_MAX_NAME_LENGTH; /* * We know that 'input->cur < ctxt->input->end) { + int l; + int c = CUR_CHAR(l); + + if (c == '>') + break; + + if (nameSize + 5 > nameCap) { + size_t newCap = nameCap ? 
nameCap * 2 : 32; + xmlChar *tmp = xmlRealloc(name, newCap); + + if (tmp == NULL) { + htmlErrMemory(ctxt); + xmlFree(name); + return; + } + + name = tmp; + nameCap = newCap; + } + + if (c < 0x80) { + if (IS_WS_HTML(c)) + break; + + if ((ctxt->options & HTML_PARSE_HTML5) && + (c >= 'A') && (c <= 'Z')) + c += 32; + + name[nameSize++] = c; + } else { + COPY_BUF(name, nameSize, c); + } + + if (nameSize > maxLength) { + htmlParseErr(ctxt, XML_ERR_RESOURCE_LIMIT, + "identifier too long", NULL, NULL); + goto bogus; + } + + NEXTL(l); } + + if (name != NULL) + name[nameSize] = 0; + /* * Check that upper(name) == "HTML" !!!!!!!!!!!!! */ @@ -3554,37 +3470,46 @@ htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) { SKIP_BLANKS; /* - * Check for SystemID and ExternalID + * Check for SystemID and publicId */ - URI = htmlParseExternalID(ctxt, &ExternalID); - SKIP_BLANKS; - - /* - * We should be at the end of the DOCTYPE declaration. - */ - if (CUR != '>') { - htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED, - "DOCTYPE improperly terminated\n", NULL, NULL); - /* Ignore bogus content */ - while ((CUR != 0) && (CUR != '>') && - (PARSER_STOPPED(ctxt) == 0)) - NEXT; + if ((UPPER == 'P') && (UPP(1) == 'U') && + (UPP(2) == 'B') && (UPP(3) == 'L') && + (UPP(4) == 'I') && (UPP(5) == 'C')) { + SKIP(6); + SKIP_BLANKS; + publicId = htmlParseDoctypeLiteral(ctxt); + if (publicId == NULL) + goto bogus; + SKIP_BLANKS; + URI = htmlParseDoctypeLiteral(ctxt); + } else if ((UPPER == 'S') && (UPP(1) == 'Y') && + (UPP(2) == 'S') && (UPP(3) == 'T') && + (UPP(4) == 'E') && (UPP(5) == 'M')) { + SKIP(6); + SKIP_BLANKS; + URI = htmlParseDoctypeLiteral(ctxt); + } + +bogus: + /* Ignore bogus content */ + while (ctxt->input->cur < ctxt->input->end) { + int c = CUR; + + NEXT; + if (c == '>') + break; } - if (CUR == '>') - SKIP(1); /* * Create or update the document accordingly to the DOCTYPE */ if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) && (!ctxt->disableSAX)) - 
ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI); + ctxt->sax->internalSubset(ctxt->userData, name, publicId, URI); - /* - * Cleanup, since we don't use all those identifiers - */ - if (URI != NULL) xmlFree(URI); - if (ExternalID != NULL) xmlFree(ExternalID); + xmlFree(name); + xmlFree(URI); + xmlFree(publicId); } /**