From 34c9108f153596d87f64db6d26b3c7f8fccaa3f2 Mon Sep 17 00:00:00 2001 From: Nick Wellnhofer Date: Sun, 7 Jul 2024 18:38:31 +0200 Subject: [PATCH] encoding: Add sizeOut argument to xmlCharEncInput When push parsing, we want to convert as much of the input as possible. When pull parsing memory buffers, we want to convert data chunk by chunk to save memory. --- encoding.c | 127 ++++++++++++++++++++++++++---------------- include/private/enc.h | 2 +- parser.c | 41 +++++++------- parserInternals.c | 8 ++- xmlIO.c | 28 ++++++++-- 5 files changed, 126 insertions(+), 80 deletions(-) diff --git a/encoding.c b/encoding.c index cbe057cc0..3d336fb13 100644 --- a/encoding.c +++ b/encoding.c @@ -1537,75 +1537,104 @@ xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out, /** * xmlCharEncInput: * @input: a parser input buffer + * @sizeOut: pointer to output size + * + * @sizeOut should be set to the maximum output size (or SIZE_MAX). + * After return, it is set to the number of bytes written. * * Generic front-end for the encoding handler on parser input * - * Returns the number of bytes written or an XML_ENC_ERR code. + * Returns an XML_ENC_ERR code. */ int -xmlCharEncInput(xmlParserInputBufferPtr input) +xmlCharEncInput(xmlParserInputBufferPtr input, size_t *sizeOut) { + xmlBufPtr out, in; + const xmlChar *dataIn; + size_t availIn; + size_t maxOut; + size_t totalIn, totalOut; int ret; - size_t avail; - size_t toconv; - int c_in; - int c_out; - xmlBufPtr in; - xmlBufPtr out; - const xmlChar *inData; - size_t inTotal = 0; - if ((input == NULL) || (input->encoder == NULL) || - (input->buffer == NULL) || (input->raw == NULL)) - return(XML_ENC_ERR_INTERNAL); out = input->buffer; in = input->raw; - toconv = xmlBufUse(in); - if (toconv == 0) - return (0); - inData = xmlBufContent(in); - inTotal = 0; + maxOut = *sizeOut; + totalOut = 0; - do { - c_in = toconv > INT_MAX / 2 ? INT_MAX / 2 : toconv; + *sizeOut = 0; - avail = xmlBufAvail(out); - if (avail > INT_MAX) - avail = INT_MAX; - if (avail < 4096) { + availIn = xmlBufUse(in); + if (availIn == 0) + return(0); + dataIn = xmlBufContent(in); + totalIn = 0; + + while (1) { + size_t availOut; + int completeOut, completeIn; + int c_out, c_in; + + availOut = xmlBufAvail(out); + if (availOut > INT_MAX / 2) + availOut = INT_MAX / 2; + + if (availOut < maxOut) { + c_out = availOut; + completeOut = 0; + } else { + c_out = maxOut; + completeOut = 1; + } + + if (availIn > INT_MAX / 2) { + c_in = INT_MAX / 2; + completeIn = 0; + } else { + c_in = availIn; + completeIn = 1; + } + + ret = xmlEncInputChunk(input->encoder, xmlBufEnd(out), &c_out, + dataIn, &c_in); + + totalIn += c_in; + dataIn += c_in; + availIn -= c_in; + + totalOut += c_out; + maxOut -= c_out; + xmlBufAddLen(out, c_out); + + if ((ret != XML_ENC_ERR_SUCCESS) && (ret != XML_ENC_ERR_SPACE)) { + input->error = xmlEncConvertError(ret); + return(ret); + } + + if ((completeOut) && (completeIn)) + break; + if ((completeOut) && (ret == XML_ENC_ERR_SPACE)) + break; + if ((completeIn) && (ret == XML_ENC_ERR_SUCCESS)) + break; + + if (ret == XML_ENC_ERR_SPACE) { if (xmlBufGrow(out, 4096) < 0) { input->error = XML_ERR_NO_MEMORY; return(XML_ENC_ERR_MEMORY); } - avail = xmlBufAvail(out); } - - c_in = toconv; - c_out = avail; - ret = xmlEncInputChunk(input->encoder, xmlBufEnd(out), &c_out, - inData, &c_in); - inTotal += c_in; - inData += c_in; - toconv -= c_in; - xmlBufAddLen(out, c_out); - } while (ret == XML_ENC_ERR_SPACE); - - xmlBufShrink(in, inTotal); - - if (input->rawconsumed > ULONG_MAX - (unsigned long)c_in) - input->rawconsumed = ULONG_MAX; - else - input->rawconsumed += c_in; - - if (((ret != 0) && (c_out == 0)) || - (ret == XML_ENC_ERR_MEMORY)) { - if (input->error == 0) - input->error = xmlEncConvertError(ret); - return(ret); } - return (c_out); + xmlBufShrink(in, totalIn); + + if (input->rawconsumed > ULONG_MAX - (unsigned long) totalIn) + input->rawconsumed = ULONG_MAX; + else + input->rawconsumed += totalIn; + + *sizeOut = totalOut; + return(XML_ERR_OK); } /** diff --git a/include/private/enc.h b/include/private/enc.h index cd549145e..864025f8e 100644 --- a/include/private/enc.h +++ b/include/private/enc.h @@ -11,7 +11,7 @@ XML_HIDDEN int xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out, int *outlen, const unsigned char *in, int *inlen); XML_HIDDEN int -xmlCharEncInput(xmlParserInputBufferPtr input); +xmlCharEncInput(xmlParserInputBufferPtr input, size_t *sizeOut); XML_HIDDEN int xmlCharEncOutput(xmlOutputBufferPtr output, int init); diff --git a/parser.c b/parser.c index e52392ce5..b35e804ec 100644 --- a/parser.c +++ b/parser.c @@ -11561,14 +11561,18 @@ xmlParseChunk(xmlParserCtxtPtr ctxt, const char *chunk, int size, int terminate) { size_t curBase; size_t maxLength; + size_t pos; int end_in_lf = 0; + int res; if ((ctxt == NULL) || (size < 0)) return(XML_ERR_ARGUMENT); + if ((chunk == NULL) && (size > 0)) + return(XML_ERR_ARGUMENT); + if ((ctxt->input == NULL) || (ctxt->input->buf == NULL)) + return(XML_ERR_ARGUMENT); if (ctxt->disableSAX != 0) return(ctxt->errNo); - if (ctxt->input == NULL) - return(XML_ERR_INTERNAL_ERROR); ctxt->input->flags |= XML_INPUT_PROGRESSIVE; if (ctxt->instate == XML_PARSER_START) @@ -11579,18 +11583,17 @@ xmlParseChunk(xmlParserCtxtPtr ctxt, const char *chunk, int size, size--; } - if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) && - (ctxt->input->buf != NULL)) { - size_t pos = ctxt->input->cur - ctxt->input->base; - int res; - - res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk); - xmlBufUpdateInput(ctxt->input->buf->buffer, ctxt->input, pos); - if (res < 0) { - xmlCtxtErrIO(ctxt, ctxt->input->buf->error, NULL); - xmlHaltParser(ctxt); - return(ctxt->errNo); - } + /* + * Also push an empty chunk to make sure that the raw buffer + * will be flushed if there is an encoder. + */ + pos = ctxt->input->cur - ctxt->input->base; + res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk); + xmlBufUpdateInput(ctxt->input->buf->buffer, ctxt->input, pos); + if (res < 0) { + xmlCtxtErrIO(ctxt, ctxt->input->buf->error, NULL); + xmlHaltParser(ctxt); + return(ctxt->errNo); } xmlParseTryOrFinish(ctxt, terminate); @@ -11608,11 +11611,8 @@ xmlParseChunk(xmlParserCtxtPtr ctxt, const char *chunk, int size, if ((ctxt->errNo != XML_ERR_OK) && (ctxt->disableSAX == 1)) return(ctxt->errNo); - if ((end_in_lf == 1) && (ctxt->input != NULL) && - (ctxt->input->buf != NULL)) { - size_t pos = ctxt->input->cur - ctxt->input->base; - int res; - + if (end_in_lf == 1) { + pos = ctxt->input->cur - ctxt->input->base; res = xmlParserInputBufferPush(ctxt->input->buf, 1, "\r"); xmlBufUpdateInput(ctxt->input->buf->buffer, ctxt->input, pos); if (res < 0) { @@ -11639,8 +11639,7 @@ xmlParseChunk(xmlParserCtxtPtr ctxt, const char *chunk, int size, xmlFatalErrMsg(ctxt, XML_ERR_DOCUMENT_EMPTY, "Start tag expected, '<' not found\n"); } - } else if ((ctxt->input->buf != NULL) && - (ctxt->input->buf->encoder != NULL) && + } else if ((ctxt->input->buf->encoder != NULL) && (ctxt->input->buf->error == 0) && (!xmlBufIsEmpty(ctxt->input->buf->raw))) { xmlFatalErrMsg(ctxt, XML_ERR_INVALID_CHAR, diff --git a/parserInternals.c b/parserInternals.c index 1590ce621..8022cd249 100644 --- a/parserInternals.c +++ b/parserInternals.c @@ -1273,7 +1273,6 @@ xmlInputSetEncodingHandler(xmlParserInputPtr input, xmlCharEncodingHandlerPtr handler) { xmlParserInputBufferPtr in; xmlBufPtr buf; - int nbchars; int code = XML_ERR_OK; if ((input == NULL) || (input->buf == NULL)) { @@ -1326,6 +1325,8 @@ xmlInputSetEncodingHandler(xmlParserInputPtr input, */ if (input->end > input->base) { size_t processed; + size_t nbchars; + int res; /* * Shrink the current input buffer. @@ -1336,8 +1337,9 @@ xmlInputSetEncodingHandler(xmlParserInputPtr input, input->consumed += processed; in->rawconsumed = processed; - nbchars = xmlCharEncInput(in); - if (nbchars < 0) + nbchars = 4000 /* MINLEN */; + res = xmlCharEncInput(in, &nbchars); + if (res < 0) code = in->error; } diff --git a/xmlIO.c b/xmlIO.c index d2b70432a..8df36a690 100644 --- a/xmlIO.c +++ b/xmlIO.c @@ -48,6 +48,10 @@ #include "private/error.h" #include "private/io.h" +#ifndef SIZE_MAX + #define SIZE_MAX ((size_t) -1) +#endif + /* #define VERBOSE_FAILURE */ #define MINLEN 4000 @@ -2105,7 +2109,7 @@ xmlOutputBufferCreateFilenameDefault(xmlOutputBufferCreateFilenameFunc func) int xmlParserInputBufferPush(xmlParserInputBufferPtr in, int len, const char *buf) { - int nbchars = 0; + size_t nbchars = 0; int ret; if (len < 0) return(0); @@ -2130,9 +2134,11 @@ xmlParserInputBufferPush(xmlParserInputBufferPtr in, /* * convert as much as possible to the parser reading buffer. */ - nbchars = xmlCharEncInput(in); - if (nbchars < 0) - return(-1); + nbchars = SIZE_MAX; + if (xmlCharEncInput(in, &nbchars) < 0) + return(-1); + if (nbchars > INT_MAX) + nbchars = INT_MAX; } else { nbchars = len; ret = xmlBufAdd(in->buffer, (xmlChar *) buf, nbchars); @@ -2229,9 +2235,19 @@ xmlParserInputBufferGrow(xmlParserInputBufferPtr in, int len) { } if (in->encoder != NULL) { - res = xmlCharEncInput(in); - if (res < 0) + size_t sizeOut; + + /* + * Don't convert whole buffer when reading from memory. + */ + if (in->readcallback == NULL) + sizeOut = len; + else + sizeOut = SIZE_MAX; + + if (xmlCharEncInput(in, &sizeOut) < 0) return(-1); + res = sizeOut; } return(res); }