/*
 * Patch summary (review notes; this leading text is not part of any hunk):
 *
 * HTMLparser.c
 *   - Adds static htmlSkipBogusComment(ctxt): reports
 *     XML_HTML_INCORRECTLY_OPENED_COMMENT ("Incorrectly opened comment"),
 *     then advances with NEXT until it has consumed a '>' or CUR yields 0.
 *   - htmlParseContent and htmlParseContentInternal: the two separate
 *     "<!DOCTYPE" and "<!--" branches are folded into one "<!" branch that
 *     tests for DOCTYPE first (htmlParseDocTypeDecl), then for "--"
 *     (htmlParseComment), and otherwise calls htmlSkipBogusComment.
 *   - htmlParseTryOrFinish: same restructuring; in the new fallback branch,
 *     when terminate is 0 and htmlParseLookupSequence(ctxt, '>', 0, 0, 0)
 *     returns a negative value, control jumps to `done` before anything
 *     is skipped.
 * include/libxml/xmlerror.h
 *   - Adds XML_HTML_INCORRECTLY_OPENED_COMMENT right after
 *     XML_HTML_UNKNOWN_TAG (annotated as 802).
 * test/HTML/issue380.html, result/HTML/issue380.html{,.err,.sax}
 *   - New regression input and expected outputs; the expected .err and
 *     .sax outputs each contain two "Incorrectly opened comment" errors.
 *
 * NOTE(review): in this copy the line breaks inside the hunks appear to have
 * been collapsed into spaces, and the fixture hunks show only "..." where
 * markup would be expected. Presumably that content was lost in transit;
 * verify against the upstream change before trying to apply this patch.
 */
diff --git a/HTMLparser.c b/HTMLparser.c index 46744900..abc4e905 100644 --- a/HTMLparser.c +++ b/HTMLparser.c @@ -2545,6 +2545,21 @@ htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) { static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt); +static void +htmlSkipBogusComment(htmlParserCtxtPtr ctxt) { + int c; + + htmlParseErr(ctxt, XML_HTML_INCORRECTLY_OPENED_COMMENT, + "Incorrectly opened comment\n", NULL, NULL); + + do { + c = CUR; + if (c == 0) + break; + NEXT; + } while (c != '>'); +} + /** * htmlParseHTMLName: * @ctxt: an HTML parser context @@ -4380,26 +4395,28 @@ htmlParseContent(htmlParserCtxtPtr ctxt) { htmlParseScript(ctxt); } - /* - * Sometimes DOCTYPE arrives in the middle of the document - */ - else if ((CUR == '<') && (NXT(1) == '!') && - (UPP(2) == 'D') && (UPP(3) == 'O') && - (UPP(4) == 'C') && (UPP(5) == 'T') && - (UPP(6) == 'Y') && (UPP(7) == 'P') && - (UPP(8) == 'E')) { - htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, - "Misplaced DOCTYPE declaration\n", - BAD_CAST "DOCTYPE" , NULL); - htmlParseDocTypeDecl(ctxt); - } - - /* - * First case : a comment - */ - else if ((CUR == '<') && (NXT(1) == '!') && - (NXT(2) == '-') && (NXT(3) == '-')) { - htmlParseComment(ctxt); + else if ((CUR == '<') && (NXT(1) == '!')) { + /* + * Sometimes DOCTYPE arrives in the middle of the document + */ + if ((UPP(2) == 'D') && (UPP(3) == 'O') && + (UPP(4) == 'C') && (UPP(5) == 'T') && + (UPP(6) == 'Y') && (UPP(7) == 'P') && + (UPP(8) == 'E')) { + htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, + "Misplaced DOCTYPE declaration\n", + BAD_CAST "DOCTYPE" , NULL); + htmlParseDocTypeDecl(ctxt); + } + /* + * First case : a comment + */ + else if ((NXT(2) == '-') && (NXT(3) == '-')) { + htmlParseComment(ctxt); + } + else { + htmlSkipBogusComment(ctxt); + } } /* @@ -4785,26 +4802,28 @@ htmlParseContentInternal(htmlParserCtxtPtr ctxt) { htmlParseScript(ctxt); } - /* - * Sometimes DOCTYPE arrives in the middle of the document - */ - else if ((CUR == '<') && (NXT(1) 
== '!') && - (UPP(2) == 'D') && (UPP(3) == 'O') && - (UPP(4) == 'C') && (UPP(5) == 'T') && - (UPP(6) == 'Y') && (UPP(7) == 'P') && - (UPP(8) == 'E')) { - htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, - "Misplaced DOCTYPE declaration\n", - BAD_CAST "DOCTYPE" , NULL); - htmlParseDocTypeDecl(ctxt); - } - - /* - * First case : a comment - */ - else if ((CUR == '<') && (NXT(1) == '!') && - (NXT(2) == '-') && (NXT(3) == '-')) { - htmlParseComment(ctxt); + else if ((CUR == '<') && (NXT(1) == '!')) { + /* + * Sometimes DOCTYPE arrives in the middle of the document + */ + if ((UPP(2) == 'D') && (UPP(3) == 'O') && + (UPP(4) == 'C') && (UPP(5) == 'T') && + (UPP(6) == 'Y') && (UPP(7) == 'P') && + (UPP(8) == 'E')) { + htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, + "Misplaced DOCTYPE declaration\n", + BAD_CAST "DOCTYPE" , NULL); + htmlParseDocTypeDecl(ctxt); + } + /* + * First case : a comment + */ + else if ((NXT(2) == '-') && (NXT(3) == '-')) { + htmlParseComment(ctxt); + } + else { + htmlSkipBogusComment(ctxt); + } } /* @@ -5949,31 +5968,37 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { #endif break; } - } else if ((cur == '<') && (next == '!') && - (UPP(2) == 'D') && (UPP(3) == 'O') && - (UPP(4) == 'C') && (UPP(5) == 'T') && - (UPP(6) == 'Y') && (UPP(7) == 'P') && - (UPP(8) == 'E')) { + } else if ((cur == '<') && (next == '!')) { /* * Sometimes DOCTYPE arrives in the middle of the document */ - if ((!terminate) && - (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0)) - goto done; - htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, - "Misplaced DOCTYPE declaration\n", - BAD_CAST "DOCTYPE" , NULL); - htmlParseDocTypeDecl(ctxt); - } else if ((cur == '<') && (next == '!') && - (in->cur[2] == '-') && (in->cur[3] == '-')) { - if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0)) - goto done; + if ((UPP(2) == 'D') && (UPP(3) == 'O') && + (UPP(4) == 'C') && (UPP(5) == 'T') && + (UPP(6) == 'Y') && (UPP(7) == 'P') && + (UPP(8) == 'E')) { + if ((!terminate) && + 
(htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0)) + goto done; + htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, + "Misplaced DOCTYPE declaration\n", + BAD_CAST "DOCTYPE" , NULL); + htmlParseDocTypeDecl(ctxt); + } else if ((in->cur[2] == '-') && (in->cur[3] == '-')) { + if ((!terminate) && + (htmlParseLookupCommentEnd(ctxt) < 0)) + goto done; #ifdef DEBUG_PUSH - xmlGenericError(xmlGenericErrorContext, - "HPP: Parsing Comment\n"); + xmlGenericError(xmlGenericErrorContext, + "HPP: Parsing Comment\n"); #endif - htmlParseComment(ctxt); - ctxt->instate = XML_PARSER_CONTENT; + htmlParseComment(ctxt); + ctxt->instate = XML_PARSER_CONTENT; + } else { + if ((!terminate) && + (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0)) + goto done; + htmlSkipBogusComment(ctxt); + } } else if ((cur == '<') && (next == '?')) { if ((!terminate) && (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0)) diff --git a/include/libxml/xmlerror.h b/include/libxml/xmlerror.h index 7b68e401..ee95be9c 100644 --- a/include/libxml/xmlerror.h +++ b/include/libxml/xmlerror.h @@ -260,6 +260,7 @@ typedef enum { XML_DTD_DUP_TOKEN, /* 541 */ XML_HTML_STRUCURE_ERROR = 800, XML_HTML_UNKNOWN_TAG, /* 801 */ + XML_HTML_INCORRECTLY_OPENED_COMMENT, /* 802 */ XML_RNGP_ANYNAME_ATTR_ANCESTOR = 1000, XML_RNGP_ATTR_CONFLICT, /* 1001 */ XML_RNGP_ATTRIBUTE_CHILDREN, /* 1002 */ diff --git a/result/HTML/issue380.html b/result/HTML/issue380.html new file mode 100644 index 00000000..1fcf4965 --- /dev/null +++ b/result/HTML/issue380.html @@ -0,0 +1,6 @@ + + + + ... + + diff --git a/result/HTML/issue380.html.err b/result/HTML/issue380.html.err new file mode 100644 index 00000000..efbb8bdf --- /dev/null +++ b/result/HTML/issue380.html.err @@ -0,0 +1,6 @@ +./test/HTML/issue380.html:3: HTML parser error : Incorrectly opened comment + ... + ^ +./test/HTML/issue380.html:3: HTML parser error : Incorrectly opened comment + ... 
+ ^ diff --git a/result/HTML/issue380.html.sax b/result/HTML/issue380.html.sax new file mode 100644 index 00000000..5df2b506 --- /dev/null +++ b/result/HTML/issue380.html.sax @@ -0,0 +1,20 @@ +SAX.setDocumentLocator() +SAX.startDocument() +SAX.startElement(html) +SAX.characters( + , 3) +SAX.startElement(body) +SAX.characters( + , 5) +SAX.error: Incorrectly opened comment +SAX.characters(..., 3) +SAX.error: Incorrectly opened comment +SAX.characters( + , 3) +SAX.endElement(body) +SAX.characters( +, 1) +SAX.endElement(html) +SAX.characters( +, 1) +SAX.endDocument() diff --git a/test/HTML/issue380.html b/test/HTML/issue380.html new file mode 100644 index 00000000..46c07f26 --- /dev/null +++ b/test/HTML/issue380.html @@ -0,0 +1,5 @@ + + + ... + +