1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-10-21 14:53:44 +03:00

encoding: Detect truncated multi-byte sequences with ICU

Unlike iconv or the internal converters, ICU consumes truncated
multi-byte sequences at the end of an input buffer. We currently check
for a non-empty raw input buffer to detect truncated sequences, so this
detection fails with ICU, which leaves no raw input behind.

It might be possible to inspect the pivot buffer pointers, but it seems
cleaner to implement a `flush` flag for some encoding and I/O functions.
After flushing, we can check for U_TRUNCATED_CHAR_FOUND with ICU, or
detect remaining input with other converters.

Also fix detection of truncated sequences for HTML, XML content and
DTDs with iconv.
This commit is contained in:
Nick Wellnhofer
2025-03-10 02:18:51 +01:00
parent 76c6ddfef9
commit 69b83bb68e
14 changed files with 287 additions and 133 deletions

View File

@@ -4385,6 +4385,11 @@ htmlCtxtParseContentInternal(htmlParserCtxtPtr ctxt, xmlParserInputPtr input) {
htmlParseContent(ctxt);
/*
* Only check for truncated multi-byte sequences
*/
xmlParserCheckEOF(ctxt, XML_ERR_INTERNAL_ERROR);
/* TODO: Use xmlCtxtIsCatastrophicError */
if (ctxt->errNo != XML_ERR_NO_MEMORY) {
xmlNodePtr cur;
@@ -4509,11 +4514,9 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) {
htmlParseContent(ctxt);
/*
* autoclose
* Only check for truncated multi-byte sequences
*/
if (CUR == 0)
htmlAutoCloseOnEnd(ctxt);
xmlParserCheckEOF(ctxt, XML_ERR_INTERNAL_ERROR);
/*
* SAX: end of the document processing.
@@ -5237,12 +5240,15 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
int
htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
int terminate) {
if ((ctxt == NULL) || (ctxt->input == NULL))
if ((ctxt == NULL) ||
(ctxt->input == NULL) || (ctxt->input->buf == NULL) ||
(size < 0) ||
((size > 0) && (chunk == NULL)))
return(XML_ERR_ARGUMENT);
if (PARSER_STOPPED(ctxt) != 0)
return(ctxt->errNo);
if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
(ctxt->input->buf != NULL)) {
if (size > 0) {
size_t pos = ctxt->input->cur - ctxt->input->base;
int res;
@@ -5261,6 +5267,11 @@ htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
if ((terminate) && (ctxt->instate != XML_PARSER_EOF)) {
htmlAutoCloseOnEnd(ctxt);
/*
* Only check for truncated multi-byte sequences
*/
xmlParserCheckEOF(ctxt, XML_ERR_INTERNAL_ERROR);
if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
ctxt->sax->endDocument(ctxt->userData);