1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-07-29 11:41:22 +03:00

Fix slow parsing of HTML with encoding errors

Under certain circumstances, the HTML parser would try to guess and
switch input encodings multiple times, leading to slow processing of
documents with encoding errors. The repeated scanning of the input
buffer when guessing encodings could even lead to quadratic behavior.

The code in htmlCurrentChar probably assumed that if there's an encoding
handler, it is guaranteed to produce valid UTF-8. This holds true in
general, but if the detected encoding was "UTF-8", the UTF8ToUTF8
encoding handler simply invoked memcpy without checking for invalid
UTF-8. This still must be fixed, preferably by not using this handler
at all.

Also leave a note that switching encodings twice seems impossible to
implement correctly. Add a check when handling UTF-8 encoding errors
in htmlCurrentChar to avoid this situation, even if encoders produce
invalid UTF-8.

Found by OSS-Fuzz.
This commit is contained in:
Nick Wellnhofer
2021-02-20 20:30:43 +01:00
parent 02bee4c414
commit dcb80b92da
3 changed files with 26 additions and 2 deletions

View File

@ -457,6 +457,11 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
ctxt->input->encoding = guess;
handler = xmlFindCharEncodingHandler((const char *) guess);
if (handler != NULL) {
/*
* Don't use UTF-8 encoder which isn't required and
* can produce invalid UTF-8.
*/
if (!xmlStrEqual(BAD_CAST handler->name, BAD_CAST "UTF-8"))
xmlSwitchToEncoding(ctxt, handler);
} else {
htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
@ -570,7 +575,16 @@ encoding_error:
BAD_CAST buffer, NULL);
}
ctxt->charset = XML_CHAR_ENCODING_8859_1;
/*
* Don't switch encodings twice. Note that if there's an encoder, we
* shouldn't receive invalid UTF-8 anyway.
*
* Note that if ctxt->input->buf == NULL, switching encodings is
* impossible, see Gitlab issue #34.
*/
if ((ctxt->input->buf != NULL) &&
(ctxt->input->buf->encoder == NULL))
xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
*len = 1;
return((int) *ctxt->input->cur);
}

View File

@ -373,6 +373,11 @@ UTF8ToUTF8(unsigned char* out, int *outlen,
if (len < 0)
return(-1);
/*
* FIXME: Conversion functions must assure valid UTF-8, so we have
* to check for UTF-8 validity. Preferably, this converter shouldn't
* be used at all.
*/
memcpy(out, inb, len);
*outlen = len;

View File

@ -1153,6 +1153,11 @@ xmlSwitchInputEncodingInt(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
* Note: this is a bit dangerous, but that's what it
* takes to use nearly compatible signature for different
* encodings.
*
* FIXME: Encoders might buffer partial byte sequences, so
* this probably can't work. We should return an error and
* make sure that callers never try to switch the encoding
* twice.
*/
xmlCharEncCloseFunc(input->buf->encoder);
input->buf->encoder = handler;