mirror of
https://gitlab.gnome.org/GNOME/libxml2.git
synced 2025-07-29 11:41:22 +03:00
Fix slow parsing of HTML with encoding errors
Under certain circumstances, the HTML parser would try to guess and switch input encodings multiple times, leading to slow processing of documents with encoding errors. The repeated scanning of the input buffer when guessing encodings could even lead to quadratic behavior. The code in htmlCurrentChar probably assumed that if there's an encoding handler, it is guaranteed to produce valid UTF-8. This holds true in general, but if the detected encoding was "UTF-8", the UTF8ToUTF8 encoding handler simply invoked memcpy without checking for invalid UTF-8. This underlying problem still must be fixed, preferably by not using this handler at all. Also leave a note that switching encodings twice seems impossible to implement correctly. Add a check when handling UTF-8 encoding errors in htmlCurrentChar to avoid this situation, even if encoders produce invalid UTF-8. Found by OSS-Fuzz.
This commit is contained in:
18
HTMLparser.c
18
HTMLparser.c
@ -457,7 +457,12 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
|
||||
ctxt->input->encoding = guess;
|
||||
handler = xmlFindCharEncodingHandler((const char *) guess);
|
||||
if (handler != NULL) {
|
||||
xmlSwitchToEncoding(ctxt, handler);
|
||||
/*
|
||||
* Don't use UTF-8 encoder which isn't required and
|
||||
* can produce invalid UTF-8.
|
||||
*/
|
||||
if (!xmlStrEqual(BAD_CAST handler->name, BAD_CAST "UTF-8"))
|
||||
xmlSwitchToEncoding(ctxt, handler);
|
||||
} else {
|
||||
htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
|
||||
"Unsupported encoding %s", guess, NULL);
|
||||
@ -570,7 +575,16 @@ encoding_error:
|
||||
BAD_CAST buffer, NULL);
|
||||
}
|
||||
|
||||
ctxt->charset = XML_CHAR_ENCODING_8859_1;
|
||||
/*
|
||||
* Don't switch encodings twice. Note that if there's an encoder, we
|
||||
* shouldn't receive invalid UTF-8 anyway.
|
||||
*
|
||||
* Note that if ctxt->input->buf == NULL, switching encodings is
|
||||
* impossible, see Gitlab issue #34.
|
||||
*/
|
||||
if ((ctxt->input->buf != NULL) &&
|
||||
(ctxt->input->buf->encoder == NULL))
|
||||
xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
|
||||
*len = 1;
|
||||
return((int) *ctxt->input->cur);
|
||||
}
|
||||
|
Reference in New Issue
Block a user