1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-07-16 07:21:58 +03:00

572129 speed up parsing of large HTML text nodes

* HTMLparser.c: use a different lookup function htmlParseLookupChars()
  to avoid the quadratic behaviour
This commit is contained in:
Markus Kull
2009-08-24 19:00:23 +02:00
committed by Daniel Veillard
parent b468f7444c
commit 56a03035bf

View File

@ -4725,6 +4725,80 @@ htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
return(-1);
}
/**
 * htmlParseLookupChars:
 * @ctxt:  an HTML parser context
 * @stop:  array of bytes, any of which terminates the lookup
 * @stopLen:  number of entries in @stop
 *
 * Scan the buffered input, starting at the current parsing point or at
 * ctxt->checkIndex when that lies further ahead, for the first byte that
 * matches one of the @stop entries. Bytes located between "<!--" and
 * "-->" are skipped and never reported as a match.
 *
 * Side effects on the parser state: when the scan reaches the end of the
 * buffered data without a match, the position reached is saved in
 * ctxt->checkIndex so that a later call does not rescan the same bytes;
 * when a match is found, ctxt->checkIndex is reset to 0. If the data runs
 * out while still inside a comment, ctxt->checkIndex is left untouched.
 * Because it modifies the context, do not call this function liberally.
 *
 * Returns the offset of the matching byte relative to the current
 * parsing point, or -1 if no stop byte is available yet.
 */
static int
htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
                     int stopLen)
{
    htmlParserInputPtr input = ctxt->input;
    const xmlChar *data;
    int pos;
    int end;
    int insideComment = 0;
    int k;

    if (input == NULL)
        return (-1);

    /* Start at the current parsing point ... */
    pos = input->cur - input->base;
    if (pos < 0)
        return (-1);
    /* ... unless an earlier call already scanned further ahead. */
    if (ctxt->checkIndex > pos)
        pos = ctxt->checkIndex;

    /* Pick the raw bytes and their count from whichever store is in use. */
    if (input->buf != NULL) {
        data = input->buf->buffer->content;
        end = input->buf->buffer->use;
    } else {
        data = input->base;
        end = input->length;
    }

    while (pos < end) {
        if ((insideComment == 0) && (pos + 4 < end) &&
            (data[pos] == '<') && (data[pos + 1] == '!') &&
            (data[pos + 2] == '-') && (data[pos + 3] == '-')) {
            insideComment = 1;
            /*
             * Step over "<!" only: the two dashes must remain visible to
             * the terminator test below so that "<!-->" is accepted as a
             * complete comment.
             */
            pos += 2;
        }

        if (insideComment == 0) {
            /* Regular content: compare the byte with every stop entry. */
            for (k = 0; k < stopLen; k++) {
                if (stop[k] == data[pos]) {
                    ctxt->checkIndex = 0;
                    return (pos - (input->cur - input->base));
                }
            }
        } else {
            /* Fewer than three bytes left: "-->" cannot be present yet. */
            if (pos + 3 > end)
                return (-1);
            if ((data[pos] == '-') && (data[pos + 1] == '-') &&
                (data[pos + 2] == '>')) {
                insideComment = 0;
                /* Together with the increment below this skips "-->". */
                pos += 2;
            }
        }
        pos++;
    }

    /* Nothing found: remember how far the scan went for the next call. */
    ctxt->checkIndex = pos;
    return (-1);
}
/**
* htmlParseTryOrFinish:
* @ctxt: an HTML parser context
@ -5254,7 +5328,8 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
break;
} else if (cur == '&') {
if ((!terminate) &&
(htmlParseLookupSequence(ctxt, ';', 0, 0, 0, 1) < 0))
(htmlParseLookupChars(ctxt,
BAD_CAST "; >/", 4) < 0))
goto done;
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext,
@ -5270,7 +5345,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
* data detection.
*/
if ((!terminate) &&
(htmlParseLookupSequence(ctxt, '<', 0, 0, 0, 1) < 0))
(htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0))
goto done;
ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH