mirror of https://gitlab.gnome.org/GNOME/libxml2.git
HTMLparser.c
@@ -3226,8 +3226,25 @@ htmlParseCharData(htmlParserCtxtPtr ctxt, int partial) {
         case '\0':
             skip = 1;
-            repl = BAD_CAST "\xEF\xBF\xBD";
-            replSize = 3;
+
+            if (mode == 0) {
+                /*
+                 * The HTML5 spec says that the tokenizer should
+                 * pass on U+0000 unmodified in normal data mode.
+                 * These characters should then be ignored in body
+                 * and other text, but should be replaced with
+                 * U+FFFD in foreign content.
+                 *
+                 * At least for now, we always strip U+0000 when
+                 * tokenizing.
+                 */
+                repl = BAD_CAST "";
+                replSize = 0;
+            } else {
+                repl = BAD_CAST "\xEF\xBF\xBD";
+                replSize = 3;
+            }
+
             goto next_chunk;
 
         case '\n':
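The strip-versus-replace policy above is easy to demonstrate in isolation. The following standalone program is a minimal sketch of the same behavior; handle_nul and its foreign flag are illustrative names for this example, not libxml2 API.

#include <stdio.h>
#include <string.h>

/*
 * Sketch of the U+0000 policy: in normal data mode the character is
 * stripped, in foreign content it is replaced with U+FFFD (encoded
 * as EF BF BD in UTF-8).
 */
static size_t
handle_nul(const char *in, size_t len, int foreign, char *out) {
    size_t outLen = 0;
    size_t i;

    for (i = 0; i < len; i++) {
        if (in[i] == '\0') {
            if (foreign) {
                memcpy(out + outLen, "\xEF\xBF\xBD", 3);
                outLen += 3;
            }
            /* data mode: drop the character entirely */
        } else {
            out[outLen++] = in[i];
        }
    }
    return outLen;
}

int main(void) {
    const char input[] = "a\0b";       /* three bytes with an embedded NUL */
    char buf[16];
    size_t n;

    n = handle_nul(input, 3, 0, buf);  /* data mode: prints "ab" */
    printf("data mode:    %.*s\n", (int) n, buf);

    n = handle_nul(input, 3, 1, buf);  /* foreign content: prints "a", U+FFFD, "b" */
    printf("foreign mode: %.*s\n", (int) n, buf);
    return 0;
}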
@@ -139,8 +139,6 @@ Comment
 Character
 foo < bar
 34
-Character
-�
 35
 Comment
 -x
@@ -508,7 +508,7 @@ Character
 <
 171
 Character
-<�
+<
 172
 Character
 <
@@ -186,8 +186,6 @@ xYz
 57
 Comment
 doc
-Character
-�
 58
 Comment
 doc�
@@ -1,4 +1,4 @@
 0
 Character
-
+�
@@ -43,6 +43,9 @@ for filename in sorted(glob.glob('../html5lib-tests/tokenizer/*.test')):
 
     output = ''
     for token in test['output']:
+        if token[1] == '\0':
+            continue
+
         output += token[0] + '\n'
 
         if token[0] == 'DOCTYPE':
@@ -61,7 +64,11 @@ for filename in sorted(glob.glob('../html5lib-tests/tokenizer/*.test')):
         output = re.sub(r'\\u([A-Fa-f0-9]{4})',
                         lambda m: chr(int(m[1], 16)),
                         output)
-        output = re.sub(r'\x00', '\uFFFD', output)
+
+        # The HTML5 spec splits handling of U+0000 across
+        # tokenizer and tree builder. We already ignore
+        # U+0000 in body text when tokenizing.
+        output = re.sub(r'\x00', '', output)
 
         for state in test.get('initialStates', ['Data state']):
             state_no = state_map.get(state)
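The user-visible effect of stripping U+0000 in body text can also be checked through libxml2's public API. The program below is a sketch, not part of this commit; it assumes a build with the HTML parser enabled and links via xml2-config --cflags --libs.

#include <stdio.h>
#include <libxml/HTMLparser.h>
#include <libxml/tree.h>

int main(void) {
    /* ten bytes of input with an embedded NUL between 'a' and 'b' */
    const char html[] = "<p>a\0b</p>";
    htmlDocPtr doc;
    xmlChar *text;

    doc = htmlReadMemory(html, (int) sizeof(html) - 1, NULL, NULL,
                         HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
    if (doc == NULL)
        return 1;

    text = xmlNodeGetContent(xmlDocGetRootElement(doc));
    /* With U+0000 stripped during tokenization, this should print "ab". */
    printf("%s\n", (const char *) text);

    xmlFree(text);
    xmlFreeDoc(doc);
    return 0;
}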