Mirror of https://gitlab.gnome.org/GNOME/libxml2.git, synced 2025-10-24 13:33:01 +03:00
HTMLparser.c
@@ -3226,8 +3226,25 @@ htmlParseCharData(htmlParserCtxtPtr ctxt, int partial) {
             case '\0':
                 skip = 1;
-                repl = BAD_CAST "\xEF\xBF\xBD";
-                replSize = 3;
+                if (mode == 0) {
+                    /*
+                     * The HTML5 spec says that the tokenizer should
+                     * pass on U+0000 unmodified in normal data mode.
+                     * These characters should then be ignored in body
+                     * and other text, but should be replaced with
+                     * U+FFFD in foreign content.
+                     *
+                     * At least for now, we always strip U+0000 when
+                     * tokenizing.
+                     */
+                    repl = BAD_CAST "";
+                    replSize = 0;
+                } else {
+                    repl = BAD_CAST "\xEF\xBF\xBD";
+                    replSize = 3;
+                }
                 goto next_chunk;

             case '\n':
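In terms of observable behaviour, the new branch means a U+0000 in ordinary character data is now dropped instead of becoming U+FFFD, while the non-zero tokenizer modes keep the old replacement. Below is a minimal illustrative sketch of that substitution rule as standalone Python rather than libxml2 code; the data_mode flag is a hypothetical stand-in for the parser's internal mode == 0 case.

def map_nul(chardata, data_mode=True):
    # mode == 0: strip U+0000 entirely. The spec would pass it through
    # and let the tree builder drop it in body text; libxml2 strips it
    # up front for now. Other modes keep substituting U+FFFD.
    repl = "" if data_mode else "\uFFFD"
    return chardata.replace("\u0000", repl)

assert map_nul("a\u0000b") == "ab"
assert map_nul("a\u0000b", data_mode=False) == "a\uFFFDb"

The test-expectation hunks below follow from exactly this change: previously expected U+FFFD characters disappear from body text.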
@@ -139,8 +139,6 @@ Comment
 Character
 foo < bar
 34
-Character
-�
 35
 Comment
 -x
@@ -508,7 +508,7 @@ Character
 <
 171
 Character
-<�
+<
 172
 Character
 <
@@ -186,8 +186,6 @@ xYz
 57
 Comment
 doc
-Character
-�
 58
 Comment
 doc�
@@ -1,4 +1,4 @@
 0
 Character
-�
+
@@ -43,6 +43,9 @@ for filename in sorted(glob.glob('../html5lib-tests/tokenizer/*.test')):

         output = ''
         for token in test['output']:
+            if token[1] == '\0':
+                continue
+
             output += token[0] + '\n'

             if token[0] == 'DOCTYPE':
@@ -61,7 +64,11 @@ for filename in sorted(glob.glob('../html5lib-tests/tokenizer/*.test')):
         output = re.sub(r'\\u([A-Fa-f0-9]{4})',
                         lambda m: chr(int(m[1], 16)),
                         output)
-        output = re.sub(r'\x00', '\uFFFD', output)
+
+        # The HTML5 spec splits handling of U+0000 across
+        # tokenizer and tree builder. We already ignore
+        # U+0000 in body text when tokenizing.
+        output = re.sub(r'\x00', '', output)

         for state in test.get('initialStates', ['Data state']):
             state_no = state_map.get(state)
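For the conversion script, the net effect is that a Character token whose data is exactly U+0000 vanishes from the expected output, and any NUL that survives into the output string is deleted rather than mapped to U+FFFD. A rough self-contained sketch with a made-up token list in the html5lib [type, data] shape; the real script's per-token handling beyond the lines shown above is not reproduced here.

import re

tokens = [
    ["Character", "foo < bar"],
    ["Character", "\u0000"],   # skipped entirely by the new check
    ["Comment", "-x"],
]

output = ''
for token in tokens:
    if token[1] == '\0':       # '\0' and '\u0000' are the same string
        continue
    output += token[0] + '\n'
    output += token[1] + '\n'  # simplified; the real script varies by token type

# NULs embedded in longer data are now deleted, not replaced with U+FFFD.
output = re.sub(r'\x00', '', output)
print(output)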