mirror of
https://gitlab.gnome.org/GNOME/libxml2.git
synced 2025-07-28 00:21:53 +03:00
572129 speed up parsing of large HTML text nodes
* HTMLparser.c: use a different lookup function htmlParseLookupChars() to avoid the quadratic behaviour
This commit is contained in:
committed by
Daniel Veillard
parent
b468f7444c
commit
56a03035bf
79
HTMLparser.c
79
HTMLparser.c
@ -4725,6 +4725,80 @@ htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
|
|||||||
return(-1);
|
return(-1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* htmlParseLookupChars:
|
||||||
|
* @ctxt: an HTML parser context
|
||||||
|
* @stop: Array of chars, which stop the lookup.
|
||||||
|
* @stopLen: Length of stop-Array
|
||||||
|
*
|
||||||
|
* Try to find if any char of the stop-Array is available in the input
|
||||||
|
* stream.
|
||||||
|
* This function has a side effect of (possibly) incrementing ctxt->checkIndex
|
||||||
|
* to avoid rescanning sequences of bytes, it DOES change the state of the
|
||||||
|
* parser, do not use liberally.
|
||||||
|
*
|
||||||
|
* Returns the index to the current parsing point if a stopChar
|
||||||
|
* is available, -1 otherwise.
|
||||||
|
*/
|
||||||
|
static int
|
||||||
|
htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
|
||||||
|
int stopLen)
|
||||||
|
{
|
||||||
|
int base, len;
|
||||||
|
htmlParserInputPtr in;
|
||||||
|
const xmlChar *buf;
|
||||||
|
int incomment = 0;
|
||||||
|
int i;
|
||||||
|
|
||||||
|
in = ctxt->input;
|
||||||
|
if (in == NULL)
|
||||||
|
return (-1);
|
||||||
|
|
||||||
|
base = in->cur - in->base;
|
||||||
|
if (base < 0)
|
||||||
|
return (-1);
|
||||||
|
|
||||||
|
if (ctxt->checkIndex > base)
|
||||||
|
base = ctxt->checkIndex;
|
||||||
|
|
||||||
|
if (in->buf == NULL) {
|
||||||
|
buf = in->base;
|
||||||
|
len = in->length;
|
||||||
|
} else {
|
||||||
|
buf = in->buf->buffer->content;
|
||||||
|
len = in->buf->buffer->use;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (; base < len; base++) {
|
||||||
|
if (!incomment && (base + 4 < len)) {
|
||||||
|
if ((buf[base] == '<') && (buf[base + 1] == '!') &&
|
||||||
|
(buf[base + 2] == '-') && (buf[base + 3] == '-')) {
|
||||||
|
incomment = 1;
|
||||||
|
/* do not increment past <! - some people use <!--> */
|
||||||
|
base += 2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (incomment) {
|
||||||
|
if (base + 3 > len)
|
||||||
|
return (-1);
|
||||||
|
if ((buf[base] == '-') && (buf[base + 1] == '-') &&
|
||||||
|
(buf[base + 2] == '>')) {
|
||||||
|
incomment = 0;
|
||||||
|
base += 2;
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
for (i = 0; i < stopLen; ++i) {
|
||||||
|
if (buf[base] == stop[i]) {
|
||||||
|
ctxt->checkIndex = 0;
|
||||||
|
return (base - (in->cur - in->base));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ctxt->checkIndex = base;
|
||||||
|
return (-1);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* htmlParseTryOrFinish:
|
* htmlParseTryOrFinish:
|
||||||
* @ctxt: an HTML parser context
|
* @ctxt: an HTML parser context
|
||||||
@ -5254,7 +5328,8 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
|||||||
break;
|
break;
|
||||||
} else if (cur == '&') {
|
} else if (cur == '&') {
|
||||||
if ((!terminate) &&
|
if ((!terminate) &&
|
||||||
(htmlParseLookupSequence(ctxt, ';', 0, 0, 0, 1) < 0))
|
(htmlParseLookupChars(ctxt,
|
||||||
|
BAD_CAST "; >/", 4) < 0))
|
||||||
goto done;
|
goto done;
|
||||||
#ifdef DEBUG_PUSH
|
#ifdef DEBUG_PUSH
|
||||||
xmlGenericError(xmlGenericErrorContext,
|
xmlGenericError(xmlGenericErrorContext,
|
||||||
@ -5270,7 +5345,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
|||||||
* data detection.
|
* data detection.
|
||||||
*/
|
*/
|
||||||
if ((!terminate) &&
|
if ((!terminate) &&
|
||||||
(htmlParseLookupSequence(ctxt, '<', 0, 0, 0, 1) < 0))
|
(htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0))
|
||||||
goto done;
|
goto done;
|
||||||
ctxt->checkIndex = 0;
|
ctxt->checkIndex = 0;
|
||||||
#ifdef DEBUG_PUSH
|
#ifdef DEBUG_PUSH
|
||||||
|
Reference in New Issue
Block a user