1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-07-08 23:22:04 +03:00

html: Handle incomplete UTF-8 when push-parsing

For now, incomplete UTF-8 is always an error in push mode.

Eventually, we could pass chunked data to the character handler when
push-parsing. In that case, we'd have to handle incomplete sequences at chunk boundaries.
This commit is contained in:
Nick Wellnhofer
2025-01-31 22:08:13 +01:00
parent 6bb2ea8e70
commit e48fb5e4f2

View File

@ -379,7 +379,8 @@ htmlMaskMatch(htmlAsciiMask mask, unsigned c) {
}
static int
htmlValidateUtf8(xmlParserCtxtPtr ctxt, const xmlChar *str, size_t len) {
htmlValidateUtf8(xmlParserCtxtPtr ctxt, const xmlChar *str, size_t len,
int partial) {
unsigned c = str[0];
int size;
@ -424,7 +425,8 @@ htmlValidateUtf8(xmlParserCtxtPtr ctxt, const xmlChar *str, size_t len) {
return(size);
incomplete:
return(0);
if (partial)
return(0);
invalid:
/* Only report the first error */
@ -2424,7 +2426,7 @@ htmlParseHTMLName(htmlParserCtxtPtr ctxt, int attr) {
buf[nbchar++] = c;
}
} else {
size = htmlValidateUtf8(ctxt, in, avail);
size = htmlValidateUtf8(ctxt, in, avail, /* partial */ 0);
if (size > 0) {
if (nbchar + size <= HTML_PARSER_BUFFER_SIZE) {
@ -2811,7 +2813,7 @@ htmlParseData(htmlParserCtxtPtr ctxt, htmlAsciiMask mask,
goto restart;
}
size = htmlValidateUtf8(ctxt, in, avail);
size = htmlValidateUtf8(ctxt, in, avail, /* partial */ 0);
if (size <= 0) {
skip = 1;
@ -3260,7 +3262,11 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) {
goto restart;
}
size = htmlValidateUtf8(ctxt, in, avail);
/*
* We should handle partial data to allow the push
* parser to pass incomplete chunks.
*/
size = htmlValidateUtf8(ctxt, in, avail, /* partial */ 0);
if (size <= 0) {
skip = 1;