From 6bb2ea8e706b48151c2d672d2263c1ca7b19f199 Mon Sep 17 00:00:00 2001
From: Nick Wellnhofer <wellnhofer@aevum.de>
Date: Sat, 1 Feb 2025 14:58:06 +0100
Subject: [PATCH] html: Adjust xmlDetectEncoding for HTML

Don't check for UTF-32 or EBCDIC.

We now perform BOM sniffing and the first step of the HTML5 prescan
algorithm (detect UTF-16 XML declarations). The rest of the algorithm
still has to be implemented.
---
 HTMLparser.c      | 17 ++++++++++++++---
 parserInternals.c | 23 ++++++++++++++++++++---
 2 files changed, 34 insertions(+), 6 deletions(-)

diff --git a/HTMLparser.c b/HTMLparser.c
index 00b64c13..00c30edb 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -4346,8 +4346,14 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) {
     xmlDetectEncoding(ctxt);
 
     /*
-     * This is wrong but matches long-standing behavior. In most cases,
-     * a document starting with an XML declaration will specify UTF-8.
+     * TODO: Implement HTML5 prescan algorithm
+     */
+
+    /*
+     * This is wrong but matches long-standing behavior. In most
+     * cases, a document starting with an XML declaration will
+     * specify UTF-8. The HTML5 prescan algorithm handles
+     * XML declarations in a better way.
      */
     if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
         (xmlStrncmp(ctxt->input->cur, BAD_CAST "<?xm", 4) == 0))
@@ -4943,10 +4949,15 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 
                 xmlDetectEncoding(ctxt);
 
+                /*
+                 * TODO: Implement HTML5 prescan algorithm
+                 */
+
                 /*
                  * This is wrong but matches long-standing behavior. In most
                  * cases, a document starting with an XML declaration will
-                 * specify UTF-8.
+                 * specify UTF-8. The HTML5 prescan algorithm handles
+                 * XML declarations in a better way.
                  */
                 if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
                     (xmlStrncmp(ctxt->input->cur, BAD_CAST "<?xm", 4) == 0)) {
diff --git a/parserInternals.c b/parserInternals.c
index ee62a48d..effd566c 100644
--- a/parserInternals.c
+++ b/parserInternals.c
@@ -1454,12 +1454,23 @@ xmlDetectEncoding(xmlParserCtxtPtr ctxt) {
     enc = XML_CHAR_ENCODING_NONE;
     bomSize = 0;
 
+    /*
+     * BOM sniffing and detection of initial bytes of an XML
+     * declaration.
+     *
+     * The HTML5 spec doesn't cover UTF-32 (UCS-4) or EBCDIC.
+     */
     switch (in[0]) {
         case 0x00:
-            if ((in[1] == 0x00) && (in[2] == 0x00) && (in[3] == 0x3C)) {
+            if ((!ctxt->html) &&
+                (in[1] == 0x00) && (in[2] == 0x00) && (in[3] == 0x3C)) {
                 enc = XML_CHAR_ENCODING_UCS4BE;
                 autoFlag = XML_INPUT_AUTO_OTHER;
             } else if ((in[1] == 0x3C) && (in[2] == 0x00) && (in[3] == 0x3F)) {
+                /*
+                 * TODO: The HTML5 spec requires to check that the
+                 * next codepoint is an 'x'.
+                 */
                 enc = XML_CHAR_ENCODING_UTF16BE;
                 autoFlag = XML_INPUT_AUTO_UTF16BE;
             }
@@ -1467,10 +1478,15 @@ xmlDetectEncoding(xmlParserCtxtPtr ctxt) {
 
         case 0x3C:
             if (in[1] == 0x00) {
-                if ((in[2] == 0x00) && (in[3] == 0x00)) {
+                if ((!ctxt->html) &&
+                    (in[2] == 0x00) && (in[3] == 0x00)) {
                     enc = XML_CHAR_ENCODING_UCS4LE;
                     autoFlag = XML_INPUT_AUTO_OTHER;
                 } else if ((in[2] == 0x3F) && (in[3] == 0x00)) {
+                    /*
+                     * TODO: The HTML5 spec requires to check that the
+                     * next codepoint is an 'x'.
+                     */
                     enc = XML_CHAR_ENCODING_UTF16LE;
                     autoFlag = XML_INPUT_AUTO_UTF16LE;
                 }
@@ -1478,7 +1494,8 @@ xmlDetectEncoding(xmlParserCtxtPtr ctxt) {
             break;
 
         case 0x4C:
-	    if ((in[1] == 0x6F) && (in[2] == 0xA7) && (in[3] == 0x94)) {
+	    if ((!ctxt->html) &&
+                (in[1] == 0x6F) && (in[2] == 0xA7) && (in[3] == 0x94)) {
 	        enc = XML_CHAR_ENCODING_EBCDIC;
                 autoFlag = XML_INPUT_AUTO_OTHER;
             }