diff --git a/HTMLparser.c b/HTMLparser.c
index 41ab4aa5..2877f4b7 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -5220,6 +5220,58 @@ htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
     return (-1);
 }
 
+/**
+ * htmlParseLookupCommentEnd:
+ * @ctxt: an HTML parser context
+ *
+ * Try to find the end of a comment in the input stream. The search accepts
+ * "-->" as well as the WHATWG incorrectly-closed form "--!>".
+ * (See https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment)
+ *
+ * htmlParseLookupSequence() is asked to locate "--" only; the bytes that
+ * follow are inspected here. As a side effect ctxt->checkIndex may be moved
+ * forward so that bytes already examined are not rescanned. This DOES change
+ * the state of the parser, do not use liberally.
+ *
+ * If the input ends before the bytes after a "--" can be inspected,
+ * ctxt->checkIndex is left on that "--" and -1 is returned, so that an end
+ * tag split across two chunks is found once more data is available.
+ *
+ * Returns the index of the closing "--" relative to the current parsing point
+ * if the full sequence is available, a negative value otherwise.
+ */
+static int
+htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
+{
+    int cur = CUR_PTR - BASE_PTR;
+    int avail = ctxt->input->end - CUR_PTR;
+    int mark;
+    int need;
+
+    while (1) {
+        mark = htmlParseLookupSequence(ctxt, '-', '-', 0, 0);
+        if (mark < 0)
+            return mark;
+
+        /* "--" needs one more byte ('>'), "--!" needs two ("!>"). */
+        need = ((mark + 2 < avail) && (NXT(mark+2) == '!')) ? 2 : 1;
+        if (mark + 2 + need > avail) {
+            /*
+             * The input ends before this "--" can be classified. Resume
+             * from the same "--" on the next call instead of skipping it.
+             */
+            ctxt->checkIndex = cur + mark;
+            return -1;
+        }
+        if (NXT(mark+1+need) == '>')
+            return mark;
+
+        /* Not a comment end: continue the search after the first '-'. */
+        ctxt->checkIndex = cur + mark + 1;
+    }
+}
+
+
 /**
  * htmlParseTryOrFinish:
  * @ctxt: an HTML parser context
@@ -5405,8 +5457,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
                 cur = in->cur[0];
                 if ((cur == '<') && (next == '!') &&
                     (in->cur[2] == '-') && (in->cur[3] == '-')) {
-                    if ((!terminate) &&
-                        (htmlParseLookupSequence(ctxt, '-', '-', '>', 0) < 0))
+                    if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
                         goto done;
 #ifdef DEBUG_PUSH
                     xmlGenericError(xmlGenericErrorContext,
@@ -5466,8 +5517,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
                 next = in->cur[1];
                 if ((cur == '<') && (next == '!') &&
                     (in->cur[2] == '-') && (in->cur[3] == '-')) {
-                    if ((!terminate) &&
-                        (htmlParseLookupSequence(ctxt, '-', '-', '>', 0) < 0))
+                    if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
                         goto done;
 #ifdef DEBUG_PUSH
                     xmlGenericError(xmlGenericErrorContext,
@@ -5514,8 +5564,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
                 next = in->cur[1];
                 if ((cur == '<') && (next == '!') &&
                     (in->cur[2] == '-') && (in->cur[3] == '-')) {
-                    if ((!terminate) &&
-                        (htmlParseLookupSequence(ctxt, '-', '-', '>', 0) < 0))
+                    if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
                         goto done;
 #ifdef DEBUG_PUSH
                     xmlGenericError(xmlGenericErrorContext,
@@ -5769,9 +5818,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
                     htmlParseDocTypeDecl(ctxt);
                 } else if ((cur == '<') && (next == '!') &&
                            (in->cur[2] == '-') && (in->cur[3] == '-')) {
-                    if ((!terminate) &&
-                        (htmlParseLookupSequence(
-                            ctxt, '-', '-', '>', 0) < 0))
+                    if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
                         goto done;
 #ifdef DEBUG_PUSH
                     xmlGenericError(xmlGenericErrorContext,
diff --git a/result/HTML/758606.html.err b/result/HTML/758606.html.err
index e3e61265..bcb253eb 100644
--- a/result/HTML/758606.html.err
+++ b/result/HTML/758606.html.err
@@ -1,6 +1,6 @@
 ./test/HTML/758606.html:1: HTML parser error : Invalid char in comment 0xC