1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-10-24 13:33:01 +03:00

parser: Stream data when reading from memory

Don't create a copy of the whole input buffer. Read the data chunk by
chunk to save memory.

Historically, it was probably envisioned to read data from memory
without additional copying. This doesn't work reliably with the current
design of the XML parser which requires a terminating null byte at the
end of input buffers. This led to xmlReadMemory interfaces, which
expect pointer and size arguments, being changed to make a
zero-terminated copy of the input buffer. Interfaces based on
xmlReadDoc, which actually expect a zero-terminated string and
would make zero-copy operation work, were then simplified to rely on
xmlReadMemory, resulting in an unnecessary copy.

To avoid temporarily copying (possibly gigabytes of) memory, we now
stream in-memory input just like content read from files in a
chunk-by-chunk fashion (using a somewhat outdated INPUT_CHUNK size of
250 bytes). As a side effect, we also avoid another copy of the whole
input when handling non-UTF-8 data which was made possible by some
earlier commits.

Interfaces expecting zero-terminated strings now make use of strnlen
which unfortunately isn't part of the standard C library and has only
been mandated since POSIX 2008.
This commit is contained in:
Nick Wellnhofer
2023-08-08 15:21:28 +02:00
parent 5aff27ae78
commit 834b8123ef
6 changed files with 211 additions and 49 deletions

View File

@@ -32,6 +32,7 @@
#include "private/enc.h"
#include "private/error.h"
#include "private/html.h"
#include "private/io.h"
#include "private/parser.h"
#include "private/tree.h"
@@ -5169,7 +5170,7 @@ htmlCreateMemoryParserCtxt(const char *buffer, int size) {
/**
* htmlCreateDocParserCtxt:
* @cur: a pointer to an array of xmlChar
* @str: a pointer to an array of xmlChar
* @encoding: a free form C string describing the HTML document encoding, or NULL
*
* Create a parser context for an HTML document.
@@ -5179,17 +5180,37 @@ htmlCreateMemoryParserCtxt(const char *buffer, int size) {
* Returns the new parser context or NULL
*/
static htmlParserCtxtPtr
htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
int len;
htmlParserCtxtPtr ctxt;
htmlCreateDocParserCtxt(const xmlChar *str, const char *encoding) {
xmlParserCtxtPtr ctxt;
xmlParserInputPtr input;
xmlParserInputBufferPtr buf;
if (cur == NULL)
if (str == NULL)
return(NULL);
len = xmlStrlen(cur);
ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
ctxt = htmlNewParserCtxt();
if (ctxt == NULL)
return(NULL);
buf = xmlParserInputBufferCreateString(str);
if (buf == NULL) {
xmlFreeParserCtxt(ctxt);
return(NULL);
}
input = xmlNewInputStream(ctxt);
if (input == NULL) {
xmlFreeParserInputBuffer(buf);
xmlFreeParserCtxt(ctxt);
return(NULL);
}
input->filename = NULL;
input->buf = buf;
xmlBufResetInput(buf->buffer, input);
inputPush(ctxt, input);
if (encoding != NULL) {
xmlCharEncoding enc;
xmlCharEncodingHandlerPtr handler;
@@ -5219,6 +5240,7 @@ htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
}
}
}
return(ctxt);
}
@@ -6932,13 +6954,33 @@ htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
* Returns the resulting document tree
*/
htmlDocPtr
htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar *str,
const char *URL, const char *encoding, int options)
{
if (cur == NULL)
xmlParserInputBufferPtr input;
xmlParserInputPtr stream;
if (ctxt == NULL)
return (NULL);
return (htmlCtxtReadMemory(ctxt, (const char *) cur, xmlStrlen(cur), URL,
encoding, options));
if (str == NULL)
return (NULL);
xmlInitParser();
htmlCtxtReset(ctxt);
input = xmlParserInputBufferCreateString(str);
if (input == NULL) {
return(NULL);
}
stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
if (stream == NULL) {
xmlFreeParserInputBuffer(input);
return(NULL);
}
inputPush(ctxt, stream);
return (htmlDoRead(ctxt, URL, encoding, options, 1));
}
/**