diff --git a/HTMLparser.c b/HTMLparser.c index c252235d..0107b12d 100644 --- a/HTMLparser.c +++ b/HTMLparser.c @@ -3226,8 +3226,25 @@ htmlParseCharData(htmlParserCtxtPtr ctxt, int partial) { case '\0': skip = 1; - repl = BAD_CAST "\xEF\xBF\xBD"; - replSize = 3; + + if (mode == 0) { + /* + * The HTML5 spec says that the tokenizer should + * pass on U+0000 unmodified in normal data mode. + * These characters should then be ignored in body + * and other text, but should be replaced with + * U+FFFD in foreign content. + * + * At least for now, we always strip U+0000 when + * tokenizing. + */ + repl = BAD_CAST ""; + replSize = 0; + } else { + repl = BAD_CAST "\xEF\xBF\xBD"; + replSize = 3; + } + goto next_chunk; case '\n': diff --git a/result/html-tokenizer/test2.test b/result/html-tokenizer/test2.test index 66918106..bfd3c875 100644 --- a/result/html-tokenizer/test2.test +++ b/result/html-tokenizer/test2.test @@ -139,8 +139,6 @@ Comment Character foo < bar 34 -Character -� 35 Comment -x diff --git a/result/html-tokenizer/test3.test b/result/html-tokenizer/test3.test index ff4b1c2c..53e9bc41 100644 --- a/result/html-tokenizer/test3.test +++ b/result/html-tokenizer/test3.test @@ -508,7 +508,7 @@ Character < 171 Character -<� +< 172 Character < diff --git a/result/html-tokenizer/test4.test b/result/html-tokenizer/test4.test index 4de3e6e2..ab725497 100644 --- a/result/html-tokenizer/test4.test +++ b/result/html-tokenizer/test4.test @@ -186,8 +186,6 @@ xYz 57 Comment doc -Character -� 58 Comment doc� diff --git a/result/html-tokenizer/unicodeCharsProblematic.test b/result/html-tokenizer/unicodeCharsProblematic.test index b92e0121..d6799c69 100644 --- a/result/html-tokenizer/unicodeCharsProblematic.test +++ b/result/html-tokenizer/unicodeCharsProblematic.test @@ -1,4 +1,4 @@ 0 Character -� + diff --git a/tools/genHtml5LibTests.py b/tools/genHtml5LibTests.py index e0cfa562..999aa3d8 100644 --- a/tools/genHtml5LibTests.py +++ b/tools/genHtml5LibTests.py @@ -43,6 +43,9 @@ for filename in sorted(glob.glob('../html5lib-tests/tokenizer/*.test')): output = '' for token in test['output']: + if token[1] == '\0': + continue + output += token[0] + '\n' if token[0] == 'DOCTYPE': @@ -61,7 +64,11 @@ for filename in sorted(glob.glob('../html5lib-tests/tokenizer/*.test')): output = re.sub(r'\\u([A-Fa-f0-9]{4})', lambda m: chr(int(m[1], 16)), output) - output = re.sub(r'\x00', '\uFFFD', output) + + # The HTML5 spec splits handling of U+0000 across + # tokenizer and tree builder. We already ignore + # U+0000 in body text when tokenizing. + output = re.sub(r'\x00', '', output) for state in test.get('initialStates', ['Data state']): state_no = state_map.get(state)