diff --git a/HTMLparser.c b/HTMLparser.c
index c252235d..0107b12d 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -3226,8 +3226,25 @@ htmlParseCharData(htmlParserCtxtPtr ctxt, int partial) {
case '\0':
skip = 1;
- repl = BAD_CAST "\xEF\xBF\xBD";
- replSize = 3;
+
+ if (mode == 0) {
+ /*
+ * The HTML5 spec says that the tokenizer should
+ * pass on U+0000 unmodified in normal data mode.
+ * These characters should then be ignored in body
+ * and other text, but should be replaced with
+ * U+FFFD in foreign content.
+ *
+ * At least for now, we always strip U+0000 in data
+ * mode when tokenizing; other modes emit U+FFFD.
+ */
+ repl = BAD_CAST "";
+ replSize = 0;
+ } else {
+ repl = BAD_CAST "\xEF\xBF\xBD";
+ replSize = 3;
+ }
+
goto next_chunk;
case '\n':
diff --git a/result/html-tokenizer/test2.test b/result/html-tokenizer/test2.test
index 66918106..bfd3c875 100644
--- a/result/html-tokenizer/test2.test
+++ b/result/html-tokenizer/test2.test
@@ -139,8 +139,6 @@ Comment
Character
foo < bar
34
-Character
-�
35
Comment
-x
diff --git a/result/html-tokenizer/test3.test b/result/html-tokenizer/test3.test
index ff4b1c2c..53e9bc41 100644
--- a/result/html-tokenizer/test3.test
+++ b/result/html-tokenizer/test3.test
@@ -508,7 +508,7 @@ Character
<
171
Character
-<�
+<
172
Character
<
diff --git a/result/html-tokenizer/test4.test b/result/html-tokenizer/test4.test
index 4de3e6e2..ab725497 100644
--- a/result/html-tokenizer/test4.test
+++ b/result/html-tokenizer/test4.test
@@ -186,8 +186,6 @@ xYz
57
Comment
doc
-Character
-�
58
Comment
doc�
diff --git a/result/html-tokenizer/unicodeCharsProblematic.test b/result/html-tokenizer/unicodeCharsProblematic.test
index b92e0121..d6799c69 100644
--- a/result/html-tokenizer/unicodeCharsProblematic.test
+++ b/result/html-tokenizer/unicodeCharsProblematic.test
@@ -1,4 +1,4 @@
0
Character
-�
+
diff --git a/tools/genHtml5LibTests.py b/tools/genHtml5LibTests.py
index e0cfa562..999aa3d8 100644
--- a/tools/genHtml5LibTests.py
+++ b/tools/genHtml5LibTests.py
@@ -43,6 +43,9 @@ for filename in sorted(glob.glob('../html5lib-tests/tokenizer/*.test')):
output = ''
for token in test['output']:
+ if token[1] == '\0':
+ continue
+
output += token[0] + '\n'
if token[0] == 'DOCTYPE':
@@ -61,7 +64,11 @@ for filename in sorted(glob.glob('../html5lib-tests/tokenizer/*.test')):
output = re.sub(r'\\u([A-Fa-f0-9]{4})',
lambda m: chr(int(m[1], 16)),
output)
- output = re.sub(r'\x00', '\uFFFD', output)
+
+ # The HTML5 spec splits handling of U+0000 across
+ # tokenizer and tree builder. We already ignore
+ # U+0000 in body text when tokenizing.
+ output = re.sub(r'\x00', '', output)
for state in test.get('initialStates', ['Data state']):
state_no = state_map.get(state)