html: Ignore U+0000 in body text

Align with HTML5. Fixes #908.
Nick Wellnhofer
2025-05-07 14:32:42 +02:00
parent a1e83b2401
commit f3a080bc48
6 changed files with 29 additions and 9 deletions

@@ -3226,8 +3226,25 @@ htmlParseCharData(htmlParserCtxtPtr ctxt, int partial) {
         case '\0':
             skip = 1;
-            repl = BAD_CAST "\xEF\xBF\xBD";
-            replSize = 3;
+            if (mode == 0) {
+                /*
+                 * The HTML5 spec says that the tokenizer should
+                 * pass on U+0000 unmodified in normal data mode.
+                 * These characters should then be ignored in body
+                 * and other text, but should be replaced with
+                 * U+FFFD in foreign content.
+                 *
+                 * At least for now, we always strip U+0000 when
+                 * tokenizing.
+                 */
+                repl = BAD_CAST "";
+                replSize = 0;
+            } else {
+                repl = BAD_CAST "\xEF\xBF\xBD";
+                replSize = 3;
+            }
             goto next_chunk;
         case '\n':
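
The behavioral effect of the hunk above can be checked against libxml2's public API. The following is a minimal sketch, not part of this commit: it feeds a buffer containing an embedded NUL to htmlReadMemory (passing the byte count explicitly so the NUL is kept) and prints the resulting text content. Assuming a libxml2 build that includes this change, the U+0000 in body text should simply be dropped rather than replaced with U+FFFD, so the program should print "ab".

#include <stdio.h>
#include <libxml/HTMLparser.h>
#include <libxml/tree.h>

int main(void) {
    /* Body text with an embedded U+0000; sizeof(buf) - 1 keeps the
       embedded NUL but drops the terminating one. */
    const char buf[] = "<p>a\0b</p>";
    htmlDocPtr doc = htmlReadMemory(buf, (int) sizeof(buf) - 1, NULL, "UTF-8",
                                    HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);

    if (doc != NULL) {
        xmlChar *text = xmlNodeGetContent(xmlDocGetRootElement(doc));

        if (text != NULL) {
            /* Expected with this change: "ab". Before it, the NUL was
               replaced with U+FFFD, giving "a\xEF\xBF\xBDb". */
            printf("%s\n", (const char *) text);
            xmlFree(text);
        }
        xmlFreeDoc(doc);
    }
    return 0;
}

Compile with, for example: gcc check_nul.c $(xml2-config --cflags --libs). The file name is arbitrary.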

@@ -139,8 +139,6 @@ Comment
 Character
 foo < bar
 34
-Character
-�
 35
 Comment
 -x

@@ -508,7 +508,7 @@ Character
 <
 171
 Character
-<�
+<
 172
 Character
 <

@@ -186,8 +186,6 @@ xYz
 57
 Comment
 doc
-Character
-�
 58
 Comment
 doc�

@@ -1,4 +1,4 @@
 0
 Character
-�
+

@@ -43,6 +43,9 @@ for filename in sorted(glob.glob('../html5lib-tests/tokenizer/*.test')):
         output = ''
         for token in test['output']:
+            if token[1] == '\0':
+                continue
             output += token[0] + '\n'
             if token[0] == 'DOCTYPE':
@@ -61,7 +64,11 @@ for filename in sorted(glob.glob('../html5lib-tests/tokenizer/*.test')):
         output = re.sub(r'\\u([A-Fa-f0-9]{4})',
                         lambda m: chr(int(m[1], 16)),
                         output)
-        output = re.sub(r'\x00', '\uFFFD', output)
+        # The HTML5 spec splits handling of U+0000 across
+        # tokenizer and tree builder. We already ignore
+        # U+0000 in body text when tokenizing.
+        output = re.sub(r'\x00', '', output)
 
         for state in test.get('initialStates', ['Data state']):
             state_no = state_map.get(state)