1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-10-26 00:37:43 +03:00

Fix HTML push parser lookahead

The parsing rules used when looking for terminating chars or sequences
in the push parser differed from the actual parsing code. This could
cause the lookahead to overshoot and data to be rescanned,
potentially leading to quadratic runtime.

Comments must never be handled during lookahead. Attribute values must
only be skipped for start tags and doctype declarations, not for end
tags, comments, processing instructions (PIs), and script content.
This commit is contained in:
Nick Wellnhofer
2020-07-12 21:43:44 +02:00
parent e050062ca9
commit 8e219b154e

View File

@@ -5136,7 +5136,7 @@ htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
* @first: the first char to lookup * @first: the first char to lookup
* @next: the next char to lookup or zero * @next: the next char to lookup or zero
* @third: the next char to lookup or zero * @third: the next char to lookup or zero
* @comment: flag to force checking inside comments * @ignoreattrval: skip over attribute values
* *
* Try to find if a sequence (first, next, third) or just (first next) or * Try to find if a sequence (first, next, third) or just (first next) or
* (first) is available in the input stream. * (first) is available in the input stream.
@@ -5150,13 +5150,11 @@ htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
*/ */
static int static int
htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first, htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
xmlChar next, xmlChar third, int iscomment, xmlChar next, xmlChar third, int ignoreattrval)
int ignoreattrval)
{ {
int base, len; int base, len;
htmlParserInputPtr in; htmlParserInputPtr in;
const xmlChar *buf; const xmlChar *buf;
int incomment = 0;
int invalue = 0; int invalue = 0;
char valdellim = 0x0; char valdellim = 0x0;
@@ -5171,8 +5169,7 @@ htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
if (ctxt->checkIndex > base) { if (ctxt->checkIndex > base) {
base = ctxt->checkIndex; base = ctxt->checkIndex;
/* Abuse hasPErefs member to restore current state. */ /* Abuse hasPErefs member to restore current state. */
incomment = ctxt->hasPErefs & 1 ? 1 : 0; invalue = ctxt->hasPErefs & 1 ? 1 : 0;
invalue = ctxt->hasPErefs & 2 ? 1 : 0;
} }
if (in->buf == NULL) { if (in->buf == NULL) {
@@ -5189,14 +5186,6 @@ htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
else if (next) else if (next)
len--; len--;
for (; base < len; base++) { for (; base < len; base++) {
if ((!incomment) && (base + 4 < len) && (!iscomment)) {
if ((buf[base] == '<') && (buf[base + 1] == '!') &&
(buf[base + 2] == '-') && (buf[base + 3] == '-')) {
incomment = 1;
/* do not increment past <! - some people use <!--> */
base += 2;
}
}
if (ignoreattrval) { if (ignoreattrval) {
if (buf[base] == '"' || buf[base] == '\'') { if (buf[base] == '"' || buf[base] == '\'') {
if (invalue) { if (invalue) {
@@ -5213,16 +5202,6 @@ htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
continue; continue;
} }
} }
if (incomment) {
if (base + 3 > len)
break;
if ((buf[base] == '-') && (buf[base + 1] == '-') &&
(buf[base + 2] == '>')) {
incomment = 0;
base += 2;
}
continue;
}
if (buf[base] == first) { if (buf[base] == first) {
if (third != 0) { if (third != 0) {
if ((buf[base + 1] != next) || (buf[base + 2] != third)) if ((buf[base + 1] != next) || (buf[base + 2] != third))
@@ -5251,11 +5230,10 @@ htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
} }
ctxt->checkIndex = base; ctxt->checkIndex = base;
/* Abuse hasPErefs member to track current state. */ /* Abuse hasPErefs member to track current state. */
ctxt->hasPErefs = 0;
if (incomment)
ctxt->hasPErefs |= 1;
if (invalue) if (invalue)
ctxt->hasPErefs |= 2; ctxt->hasPErefs |= 1;
else
ctxt->hasPErefs &= ~1;
#ifdef DEBUG_PUSH #ifdef DEBUG_PUSH
if (next == 0) if (next == 0)
xmlGenericError(xmlGenericErrorContext, xmlGenericError(xmlGenericErrorContext,
@@ -5293,7 +5271,6 @@ htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
int base, len; int base, len;
htmlParserInputPtr in; htmlParserInputPtr in;
const xmlChar *buf; const xmlChar *buf;
int incomment = 0;
int i; int i;
in = ctxt->input; in = ctxt->input;
@@ -5304,11 +5281,8 @@ htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
if (base < 0) if (base < 0)
return (-1); return (-1);
if (ctxt->checkIndex > base) { if (ctxt->checkIndex > base)
base = ctxt->checkIndex; base = ctxt->checkIndex;
/* Abuse hasPErefs member to restore current state. */
incomment = ctxt->hasPErefs & 1 ? 1 : 0;
}
if (in->buf == NULL) { if (in->buf == NULL) {
buf = in->base; buf = in->base;
@@ -5319,24 +5293,6 @@ htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
} }
for (; base < len; base++) { for (; base < len; base++) {
if (!incomment && (base + 4 < len)) {
if ((buf[base] == '<') && (buf[base + 1] == '!') &&
(buf[base + 2] == '-') && (buf[base + 3] == '-')) {
incomment = 1;
/* do not increment past <! - some people use <!--> */
base += 2;
}
}
if (incomment) {
if (base + 3 > len)
break;
if ((buf[base] == '-') && (buf[base + 1] == '-') &&
(buf[base + 2] == '>')) {
incomment = 0;
base += 2;
}
continue;
}
for (i = 0; i < stopLen; ++i) { for (i = 0; i < stopLen; ++i) {
if (buf[base] == stop[i]) { if (buf[base] == stop[i]) {
ctxt->checkIndex = 0; ctxt->checkIndex = 0;
@@ -5345,8 +5301,6 @@ htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
} }
} }
ctxt->checkIndex = base; ctxt->checkIndex = base;
/* Abuse hasPErefs member to track current state. */
ctxt->hasPErefs = incomment;
return (-1); return (-1);
} }
@@ -5489,7 +5443,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
(UPP(6) == 'Y') && (UPP(7) == 'P') && (UPP(6) == 'Y') && (UPP(7) == 'P') &&
(UPP(8) == 'E')) { (UPP(8) == 'E')) {
if ((!terminate) && if ((!terminate) &&
(htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
goto done; goto done;
#ifdef DEBUG_PUSH #ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext, xmlGenericError(xmlGenericErrorContext,
@@ -5536,7 +5490,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
if ((cur == '<') && (next == '!') && if ((cur == '<') && (next == '!') &&
(in->cur[2] == '-') && (in->cur[3] == '-')) { (in->cur[2] == '-') && (in->cur[3] == '-')) {
if ((!terminate) && if ((!terminate) &&
(htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0)) (htmlParseLookupSequence(ctxt, '-', '-', '>', 0) < 0))
goto done; goto done;
#ifdef DEBUG_PUSH #ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext, xmlGenericError(xmlGenericErrorContext,
@@ -5546,7 +5500,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
ctxt->instate = XML_PARSER_MISC; ctxt->instate = XML_PARSER_MISC;
} else if ((cur == '<') && (next == '?')) { } else if ((cur == '<') && (next == '?')) {
if ((!terminate) && if ((!terminate) &&
(htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
goto done; goto done;
#ifdef DEBUG_PUSH #ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext, xmlGenericError(xmlGenericErrorContext,
@@ -5560,7 +5514,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
(UPP(6) == 'Y') && (UPP(7) == 'P') && (UPP(6) == 'Y') && (UPP(7) == 'P') &&
(UPP(8) == 'E')) { (UPP(8) == 'E')) {
if ((!terminate) && if ((!terminate) &&
(htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
goto done; goto done;
#ifdef DEBUG_PUSH #ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext, xmlGenericError(xmlGenericErrorContext,
@@ -5597,7 +5551,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
if ((cur == '<') && (next == '!') && if ((cur == '<') && (next == '!') &&
(in->cur[2] == '-') && (in->cur[3] == '-')) { (in->cur[2] == '-') && (in->cur[3] == '-')) {
if ((!terminate) && if ((!terminate) &&
(htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0)) (htmlParseLookupSequence(ctxt, '-', '-', '>', 0) < 0))
goto done; goto done;
#ifdef DEBUG_PUSH #ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext, xmlGenericError(xmlGenericErrorContext,
@@ -5607,7 +5561,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
ctxt->instate = XML_PARSER_PROLOG; ctxt->instate = XML_PARSER_PROLOG;
} else if ((cur == '<') && (next == '?')) { } else if ((cur == '<') && (next == '?')) {
if ((!terminate) && if ((!terminate) &&
(htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
goto done; goto done;
#ifdef DEBUG_PUSH #ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext, xmlGenericError(xmlGenericErrorContext,
@@ -5645,7 +5599,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
if ((cur == '<') && (next == '!') && if ((cur == '<') && (next == '!') &&
(in->cur[2] == '-') && (in->cur[3] == '-')) { (in->cur[2] == '-') && (in->cur[3] == '-')) {
if ((!terminate) && if ((!terminate) &&
(htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0)) (htmlParseLookupSequence(ctxt, '-', '-', '>', 0) < 0))
goto done; goto done;
#ifdef DEBUG_PUSH #ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext, xmlGenericError(xmlGenericErrorContext,
@@ -5655,7 +5609,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
ctxt->instate = XML_PARSER_EPILOG; ctxt->instate = XML_PARSER_EPILOG;
} else if ((cur == '<') && (next == '?')) { } else if ((cur == '<') && (next == '?')) {
if ((!terminate) && if ((!terminate) &&
(htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
goto done; goto done;
#ifdef DEBUG_PUSH #ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext, xmlGenericError(xmlGenericErrorContext,
@@ -5719,7 +5673,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
break; break;
} }
if ((!terminate) && if ((!terminate) &&
(htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
goto done; goto done;
/* Capture start position */ /* Capture start position */
@@ -5866,7 +5820,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
int idx; int idx;
xmlChar val; xmlChar val;
idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 0); idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
if (idx < 0) if (idx < 0)
goto done; goto done;
val = in->cur[idx + 2]; val = in->cur[idx + 2];
@@ -5893,7 +5847,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
(UPP(6) == 'Y') && (UPP(7) == 'P') && (UPP(6) == 'Y') && (UPP(7) == 'P') &&
(UPP(8) == 'E')) { (UPP(8) == 'E')) {
if ((!terminate) && if ((!terminate) &&
(htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
goto done; goto done;
htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
"Misplaced DOCTYPE declaration\n", "Misplaced DOCTYPE declaration\n",
@@ -5903,7 +5857,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
(in->cur[2] == '-') && (in->cur[3] == '-')) { (in->cur[2] == '-') && (in->cur[3] == '-')) {
if ((!terminate) && if ((!terminate) &&
(htmlParseLookupSequence( (htmlParseLookupSequence(
ctxt, '-', '-', '>', 1, 1) < 0)) ctxt, '-', '-', '>', 0) < 0))
goto done; goto done;
#ifdef DEBUG_PUSH #ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext, xmlGenericError(xmlGenericErrorContext,
@@ -5913,7 +5867,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
ctxt->instate = XML_PARSER_CONTENT; ctxt->instate = XML_PARSER_CONTENT;
} else if ((cur == '<') && (next == '?')) { } else if ((cur == '<') && (next == '?')) {
if ((!terminate) && if ((!terminate) &&
(htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
goto done; goto done;
#ifdef DEBUG_PUSH #ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext, xmlGenericError(xmlGenericErrorContext,
@@ -5984,7 +5938,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
if (avail < 2) if (avail < 2)
goto done; goto done;
if ((!terminate) && if ((!terminate) &&
(htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
goto done; goto done;
htmlParseEndTag(ctxt); htmlParseEndTag(ctxt);
if (ctxt->nameNr == 0) { if (ctxt->nameNr == 0) {