html: Ignore U+0000 in body text

Align with HTML5. Fixes #908.
Nick Wellnhofer
2025-05-07 14:32:42 +02:00
parent a1e83b2401
commit f3a080bc48
6 changed files with 29 additions and 9 deletions

@@ -3226,8 +3226,25 @@ htmlParseCharData(htmlParserCtxtPtr ctxt, int partial) {
         case '\0':
             skip = 1;
-            repl = BAD_CAST "\xEF\xBF\xBD";
-            replSize = 3;
+            if (mode == 0) {
+                /*
+                 * The HTML5 spec says that the tokenizer should
+                 * pass on U+0000 unmodified in normal data mode.
+                 * These characters should then be ignored in body
+                 * and other text, but should be replaced with
+                 * U+FFFD in foreign content.
+                 *
+                 * At least for now, we always strip U+0000 when
+                 * tokenizing.
+                 */
+                repl = BAD_CAST "";
+                replSize = 0;
+            } else {
+                repl = BAD_CAST "\xEF\xBF\xBD";
+                replSize = 3;
+            }
             goto next_chunk;
         case '\n':
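
The behavioral effect of the hunk above can be checked against libxml2's public API. The following is a minimal sketch, not part of this commit: it feeds a buffer containing an embedded NUL to htmlReadMemory (passing the byte count explicitly so the NUL is kept) and prints the resulting text content. Assuming a libxml2 build that includes this change, the U+0000 in body text should simply be dropped rather than replaced with U+FFFD, so the program should print "ab".

#include <stdio.h>
#include <libxml/HTMLparser.h>
#include <libxml/tree.h>

int main(void) {
    /* Body text with an embedded U+0000; sizeof(buf) - 1 keeps the
       embedded NUL but drops the terminating one. */
    const char buf[] = "<p>a\0b</p>";
    htmlDocPtr doc = htmlReadMemory(buf, (int) sizeof(buf) - 1, NULL, "UTF-8",
                                    HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);

    if (doc != NULL) {
        xmlChar *text = xmlNodeGetContent(xmlDocGetRootElement(doc));

        if (text != NULL) {
            /* Expected with this change: "ab". Before it, the NUL was
               replaced with U+FFFD, giving "a\xEF\xBF\xBDb". */
            printf("%s\n", (const char *) text);
            xmlFree(text);
        }
        xmlFreeDoc(doc);
    }
    return 0;
}

Compile with, for example: gcc check_nul.c $(xml2-config --cflags --libs). The file name is arbitrary.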

@@ -139,8 +139,6 @@ Comment
 Character
 foo < bar
 34
-Character
-�
 35
 Comment
 -x

@@ -508,7 +508,7 @@ Character
 <
 171
 Character
-<�
+<
 172
 Character
 <

@@ -186,8 +186,6 @@ xYz
 57
 Comment
 doc
-Character
-�
 58
 Comment
 doc�

@@ -1,4 +1,4 @@
 0
 Character
-�
+

@@ -43,6 +43,9 @@ for filename in sorted(glob.glob('../html5lib-tests/tokenizer/*.test')):
         output = ''
         for token in test['output']:
+            if token[1] == '\0':
+                continue
             output += token[0] + '\n'
             if token[0] == 'DOCTYPE':
@@ -61,7 +64,11 @@ for filename in sorted(glob.glob('../html5lib-tests/tokenizer/*.test')):
         output = re.sub(r'\\u([A-Fa-f0-9]{4})',
                         lambda m: chr(int(m[1], 16)),
                         output)
-        output = re.sub(r'\x00', '\uFFFD', output)
+        # The HTML5 spec splits handling of U+0000 across
+        # tokenizer and tree builder. We already ignore
+        # U+0000 in body text when tokenizing.
+        output = re.sub(r'\x00', '', output)
 
         for state in test.get('initialStates', ['Data state']):
             state_no = state_map.get(state)