1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-10-24 13:33:01 +03:00

Make htmlCurrentChar always translate U+0000

The general assumption is that htmlCurrentChar only returns 0 if the
end of the input buffer is reached. The UTF-8 path already logged an
error if a zero byte U+0000 was found and returned a space character
instead. Make the ASCII code path do the same.

htmlParseTryOrFinish skips zero bytes at the beginning of a buffer, so
even if 0 was returned from htmlCurrentChar, the push parser would make
progress. But rescanning the input could cause performance problems.

Previously, the pull parser would abort parsing when it hit a zero byte in
ASCII mode. It now handles zero bytes in ASCII mode the same way as the push
parser or as in UTF-8 mode.

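It would be better to return the replacement character U+FFFD instead,
but some of the client code assumes that the UTF-8 length of input and
output matches. U+FFFD takes three bytes in UTF-8, whereas the zero byte
it would replace takes only one, so a space is returned to keep the
lengths equal.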
This commit is contained in:
Nick Wellnhofer
2020-07-15 14:38:55 +02:00
parent dfd4e33048
commit e050062ca9

View File

@@ -436,6 +436,12 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
*/
if ((int) *ctxt->input->cur < 0x80) {
*len = 1;
if ((*ctxt->input->cur == 0) &&
(ctxt->input->cur < ctxt->input->end)) {
htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
"Char 0x%X out of allowed range\n", 0);
return(' ');
}
return((int) *ctxt->input->cur);
}
@@ -5437,6 +5443,12 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
}
if (avail < 1)
goto done;
/*
* This is done to make progress and avoid an infinite loop
* if a parsing attempt was aborted by hitting a NUL byte. After
* changing htmlCurrentChar, this probably isn't necessary anymore.
* We should consider removing this check.
*/
cur = in->cur[0];
if (cur == 0) {
SKIP(1);