From e48fb5e4f2d6174a50e3edea26655943f080a7cf Mon Sep 17 00:00:00 2001 From: Nick Wellnhofer Date: Fri, 31 Jan 2025 22:08:13 +0100 Subject: [PATCH] html: Handle incomplete UTF-8 when push-parsing For now, incomplete UTF-8 is always an error in push mode. Eventually, we could pass chunked data to the character handler when push-parsing. Then we'd have to handle incomplete sequences. --- HTMLparser.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/HTMLparser.c b/HTMLparser.c index 00c30edb..71aa2859 100644 --- a/HTMLparser.c +++ b/HTMLparser.c @@ -379,7 +379,8 @@ htmlMaskMatch(htmlAsciiMask mask, unsigned c) { } static int -htmlValidateUtf8(xmlParserCtxtPtr ctxt, const xmlChar *str, size_t len) { +htmlValidateUtf8(xmlParserCtxtPtr ctxt, const xmlChar *str, size_t len, + int partial) { unsigned c = str[0]; int size; @@ -424,7 +425,8 @@ htmlValidateUtf8(xmlParserCtxtPtr ctxt, const xmlChar *str, size_t len) { return(size); incomplete: - return(0); + if (partial) + return(0); invalid: /* Only report the first error */ @@ -2424,7 +2426,7 @@ htmlParseHTMLName(htmlParserCtxtPtr ctxt, int attr) { buf[nbchar++] = c; } } else { - size = htmlValidateUtf8(ctxt, in, avail); + size = htmlValidateUtf8(ctxt, in, avail, /* partial */ 0); if (size > 0) { if (nbchar + size <= HTML_PARSER_BUFFER_SIZE) { @@ -2811,7 +2813,7 @@ htmlParseData(htmlParserCtxtPtr ctxt, htmlAsciiMask mask, goto restart; } - size = htmlValidateUtf8(ctxt, in, avail); + size = htmlValidateUtf8(ctxt, in, avail, /* partial */ 0); if (size <= 0) { skip = 1; @@ -3260,7 +3262,11 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) { goto restart; } - size = htmlValidateUtf8(ctxt, in, avail); + /* + * We should handle partial data to allow the push + * parser to pass incomplete chunks. + */ + size = htmlValidateUtf8(ctxt, in, avail, /* partial */ 0); if (size <= 0) { skip = 1;