From 6bb2ea8e706b48151c2d672d2263c1ca7b19f199 Mon Sep 17 00:00:00 2001 From: Nick Wellnhofer Date: Sat, 1 Feb 2025 14:58:06 +0100 Subject: [PATCH] html: Adjust xmlDetectEncoding for HTML Don't check for UTF-32 or EBCDIC. We now perform BOM sniffing and the first step of the HTML5 prescan algorithm (detect UTF-16 XML declarations). The rest of the algorithm still has to be implemented. --- HTMLparser.c | 17 ++++++++++++++--- parserInternals.c | 23 ++++++++++++++++++++--- 2 files changed, 34 insertions(+), 6 deletions(-) diff --git a/HTMLparser.c b/HTMLparser.c index 00b64c13..00c30edb 100644 --- a/HTMLparser.c +++ b/HTMLparser.c @@ -4346,8 +4346,14 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) { xmlDetectEncoding(ctxt); /* - * This is wrong but matches long-standing behavior. In most cases, - * a document starting with an XML declaration will specify UTF-8. + * TODO: Implement HTML5 prescan algorithm + */ + + /* + * This is wrong but matches long-standing behavior. In most + * cases, a document starting with an XML declaration will + * specify UTF-8. The HTML5 prescan algorithm handles + * XML declarations in a better way. */ if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) && (xmlStrncmp(ctxt->input->cur, BAD_CAST "input->flags & XML_INPUT_HAS_ENCODING) == 0) && (xmlStrncmp(ctxt->input->cur, BAD_CAST "html) && + (in[1] == 0x00) && (in[2] == 0x00) && (in[3] == 0x3C)) { enc = XML_CHAR_ENCODING_UCS4BE; autoFlag = XML_INPUT_AUTO_OTHER; } else if ((in[1] == 0x3C) && (in[2] == 0x00) && (in[3] == 0x3F)) { + /* + * TODO: The HTML5 spec requires to check that the + * next codepoint is an 'x'. + */ enc = XML_CHAR_ENCODING_UTF16BE; autoFlag = XML_INPUT_AUTO_UTF16BE; } @@ -1467,10 +1478,15 @@ xmlDetectEncoding(xmlParserCtxtPtr ctxt) { case 0x3C: if (in[1] == 0x00) { - if ((in[2] == 0x00) && (in[3] == 0x00)) { + if ((!ctxt->html) && + (in[2] == 0x00) && (in[3] == 0x00)) { enc = XML_CHAR_ENCODING_UCS4LE; autoFlag = XML_INPUT_AUTO_OTHER; } else if ((in[2] == 0x3F) && (in[3] == 0x00)) { + /* + * TODO: The HTML5 spec requires to check that the + * next codepoint is an 'x'. + */ enc = XML_CHAR_ENCODING_UTF16LE; autoFlag = XML_INPUT_AUTO_UTF16LE; } @@ -1478,7 +1494,8 @@ xmlDetectEncoding(xmlParserCtxtPtr ctxt) { break; case 0x4C: - if ((in[1] == 0x6F) && (in[2] == 0xA7) && (in[3] == 0x94)) { + if ((!ctxt->html) && + (in[1] == 0x6F) && (in[2] == 0xA7) && (in[3] == 0x94)) { enc = XML_CHAR_ENCODING_EBCDIC; autoFlag = XML_INPUT_AUTO_OTHER; }