1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-07-29 11:41:22 +03:00

html: Handle incomplete UTF-8 when push-parsing

For now, incomplete UTF-8 is always an error in push mode.

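Eventually, we could pass chunked data to the character handler when
push-parsing. At that point, a UTF-8 sequence cut off at the end of a chunk
would have to be accepted as partial input and completed by the next chunk,
instead of being reported as an error.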
This commit is contained in:
Nick Wellnhofer
2025-01-31 22:08:13 +01:00
parent 6bb2ea8e70
commit e48fb5e4f2

View File

@ -379,7 +379,8 @@ htmlMaskMatch(htmlAsciiMask mask, unsigned c) {
} }
static int static int
htmlValidateUtf8(xmlParserCtxtPtr ctxt, const xmlChar *str, size_t len) { htmlValidateUtf8(xmlParserCtxtPtr ctxt, const xmlChar *str, size_t len,
int partial) {
unsigned c = str[0]; unsigned c = str[0];
int size; int size;
@ -424,7 +425,8 @@ htmlValidateUtf8(xmlParserCtxtPtr ctxt, const xmlChar *str, size_t len) {
return(size); return(size);
incomplete: incomplete:
return(0); if (partial)
return(0);
invalid: invalid:
/* Only report the first error */ /* Only report the first error */
@ -2424,7 +2426,7 @@ htmlParseHTMLName(htmlParserCtxtPtr ctxt, int attr) {
buf[nbchar++] = c; buf[nbchar++] = c;
} }
} else { } else {
size = htmlValidateUtf8(ctxt, in, avail); size = htmlValidateUtf8(ctxt, in, avail, /* partial */ 0);
if (size > 0) { if (size > 0) {
if (nbchar + size <= HTML_PARSER_BUFFER_SIZE) { if (nbchar + size <= HTML_PARSER_BUFFER_SIZE) {
@ -2811,7 +2813,7 @@ htmlParseData(htmlParserCtxtPtr ctxt, htmlAsciiMask mask,
goto restart; goto restart;
} }
size = htmlValidateUtf8(ctxt, in, avail); size = htmlValidateUtf8(ctxt, in, avail, /* partial */ 0);
if (size <= 0) { if (size <= 0) {
skip = 1; skip = 1;
@ -3260,7 +3262,11 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) {
goto restart; goto restart;
} }
size = htmlValidateUtf8(ctxt, in, avail); /*
* We should handle partial data to allow the push
* parser to pass incomplete chunks.
*/
size = htmlValidateUtf8(ctxt, in, avail, /* partial */ 0);
if (size <= 0) { if (size <= 0) {
skip = 1; skip = 1;