diff --git a/HTMLparser.c b/HTMLparser.c index 05170691..b9812985 100644 --- a/HTMLparser.c +++ b/HTMLparser.c @@ -2802,47 +2802,39 @@ htmlParseAttValue(htmlParserCtxtPtr ctxt) { static xmlChar * htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) { size_t len = 0, startPosition = 0; + int err = 0; + int quote; xmlChar *ret = NULL; - if (CUR == '"') { - NEXT; - - if (CUR_PTR < BASE_PTR) - return(ret); - startPosition = CUR_PTR - BASE_PTR; - - while ((IS_CHAR_CH(CUR)) && (CUR != '"')) { - NEXT; - len++; - } - if (!IS_CHAR_CH(CUR)) { - htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, - "Unfinished SystemLiteral\n", NULL, NULL); - } else { - ret = xmlStrndup((BASE_PTR+startPosition), len); - NEXT; - } - } else if (CUR == '\'') { - NEXT; - - if (CUR_PTR < BASE_PTR) - return(ret); - startPosition = CUR_PTR - BASE_PTR; - - while ((IS_CHAR_CH(CUR)) && (CUR != '\'')) { - NEXT; - len++; - } - if (!IS_CHAR_CH(CUR)) { - htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, - "Unfinished SystemLiteral\n", NULL, NULL); - } else { - ret = xmlStrndup((BASE_PTR+startPosition), len); - NEXT; - } - } else { + if ((CUR != '"') && (CUR != '\'')) { htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED, - " or ' expected\n", NULL, NULL); + "SystemLiteral \" or ' expected\n", NULL, NULL); + return(NULL); + } + quote = CUR; + NEXT; + + if (CUR_PTR < BASE_PTR) + return(ret); + startPosition = CUR_PTR - BASE_PTR; + + while ((CUR != 0) && (CUR != quote)) { + /* TODO: Handle UTF-8 */ + if (!IS_CHAR_CH(CUR)) { + htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, + "Invalid char in SystemLiteral 0x%X\n", CUR); + err = 1; + } + NEXT; + len++; + } + if (CUR != quote) { + htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, + "Unfinished SystemLiteral\n", NULL, NULL); + } else { + NEXT; + if (err == 0) + ret = xmlStrndup((BASE_PTR+startPosition), len); } return(ret); @@ -2862,51 +2854,42 @@ htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) { static xmlChar * htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) { size_t len = 0, startPosition = 0; + int err = 0; + int quote; xmlChar *ret = NULL; + + if ((CUR != '"') && (CUR != '\'')) { + htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED, + "PubidLiteral \" or ' expected\n", NULL, NULL); + return(NULL); + } + quote = CUR; + NEXT; + /* * Name ::= (Letter | '_') (NameChar)* */ - if (CUR == '"') { - NEXT; + if (CUR_PTR < BASE_PTR) + return(ret); + startPosition = CUR_PTR - BASE_PTR; - if (CUR_PTR < BASE_PTR) - return(ret); - startPosition = CUR_PTR - BASE_PTR; - - while (IS_PUBIDCHAR_CH(CUR)) { - len++; - NEXT; + while ((CUR != 0) && (CUR != quote)) { + if (!IS_PUBIDCHAR_CH(CUR)) { + htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, + "Invalid char in PubidLiteral 0x%X\n", CUR); + err = 1; } - - if (CUR != '"') { - htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, - "Unfinished PubidLiteral\n", NULL, NULL); - } else { - ret = xmlStrndup((BASE_PTR + startPosition), len); - NEXT; - } - } else if (CUR == '\'') { + len++; NEXT; + } - if (CUR_PTR < BASE_PTR) - return(ret); - startPosition = CUR_PTR - BASE_PTR; - - while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\'')){ - len++; - NEXT; - } - - if (CUR != '\'') { - htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, - "Unfinished PubidLiteral\n", NULL, NULL); - } else { - ret = xmlStrndup((BASE_PTR + startPosition), len); - NEXT; - } + if (CUR != '"') { + htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, + "Unfinished PubidLiteral\n", NULL, NULL); } else { - htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED, - "PubidLiteral \" or ' expected\n", NULL, NULL); + NEXT; + if (err == 0) + ret = xmlStrndup((BASE_PTR + startPosition), len); } return(ret); @@ -2972,7 +2955,7 @@ htmlParseScript(htmlParserCtxtPtr ctxt) { } } } - if (IS_CHAR_CH(cur)) { + if (IS_CHAR(cur)) { COPY_BUF(l,buf,nbchar,cur); } else { htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, @@ -3242,7 +3225,7 @@ htmlParsePI(htmlParserCtxtPtr ctxt) { } SKIP_BLANKS; cur = CUR_CHAR(l); - while (IS_CHAR(cur) && (cur != '>')) { + while ((cur != 0) && (cur != '>')) { if (len + 5 >= size) { xmlChar *tmp; @@ -3261,7 +3244,13 @@ htmlParsePI(htmlParserCtxtPtr ctxt) { GROW; count = 0; } - COPY_BUF(l,buf,len,cur); + if (IS_CHAR(cur)) { + COPY_BUF(l,buf,len,cur); + } else { + htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, + "Invalid char in processing instruction " + "0x%X\n", cur); + } NEXTL(l); cur = CUR_CHAR(l); if (cur == 0) { @@ -3331,15 +3320,15 @@ htmlParseComment(htmlParserCtxtPtr ctxt) { len = 0; buf[len] = 0; q = CUR_CHAR(ql); - if (!IS_CHAR(q)) + if (q == 0) goto unfinished; NEXTL(ql); r = CUR_CHAR(rl); - if (!IS_CHAR(r)) + if (r == 0) goto unfinished; NEXTL(rl); cur = CUR_CHAR(l); - while (IS_CHAR(cur) && + while ((cur != 0) && ((cur != '>') || (r != '-') || (q != '-'))) { if (len + 5 >= size) { @@ -3355,7 +3344,12 @@ htmlParseComment(htmlParserCtxtPtr ctxt) { } buf = tmp; } - COPY_BUF(ql,buf,len,q); + if (IS_CHAR(q)) { + COPY_BUF(ql,buf,len,q); + } else { + htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, + "Invalid char in comment 0x%X\n", q); + } q = r; ql = rl; r = cur; @@ -3369,7 +3363,7 @@ htmlParseComment(htmlParserCtxtPtr ctxt) { } } buf[len] = 0; - if (IS_CHAR(cur)) { + if (cur == '>') { NEXT; if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) && (!ctxt->disableSAX)) @@ -3516,9 +3510,12 @@ htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) { if (CUR != '>') { htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED, "DOCTYPE improperly terminated\n", NULL, NULL); - /* We shouldn't try to resynchronize ... */ + /* Ignore bogus content */ + while ((CUR != 0) && (CUR != '>')) + NEXT; } - NEXT; + if (CUR == '>') + NEXT; /* * Create or update the document accordingly to the DOCTYPE @@ -3996,19 +3993,14 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt) * We should definitely be at the ending "S? '>'" part */ SKIP_BLANKS; - if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) { + if (CUR != '>') { htmlParseErr(ctxt, XML_ERR_GT_REQUIRED, "End tag : expected '>'\n", NULL, NULL); - if (ctxt->recovery) { - /* - * We're not at the ending > !! - * Error, unless in recover mode where we search forwards - * until we find a > - */ - while (CUR != '\0' && CUR != '>') NEXT; - NEXT; - } - } else + /* Skip to next '>' */ + while ((CUR != 0) && (CUR != '>')) + NEXT; + } + if (CUR == '>') NEXT; /* @@ -4198,7 +4190,7 @@ htmlParseContent(htmlParserCtxtPtr ctxt) { "htmlParseStartTag: invalid element name\n", NULL, NULL); /* Dump the bogus tag like browsers do */ - while ((IS_CHAR_CH(CUR)) && (CUR != '>')) + while ((CUR != 0) && (CUR != '>')) NEXT; if (currentNode != NULL) @@ -4413,7 +4405,7 @@ htmlParseElement(htmlParserCtxtPtr ctxt) { */ currentNode = xmlStrdup(ctxt->name); depth = ctxt->nameNr; - while (IS_CHAR_CH(CUR)) { + while (CUR != 0) { oldptr = ctxt->input->cur; htmlParseContent(ctxt); if (oldptr==ctxt->input->cur) break; @@ -4430,7 +4422,7 @@ htmlParseElement(htmlParserCtxtPtr ctxt) { node_info.node = ctxt->node; xmlParserAddNodeInfo(ctxt, &node_info); } - if (!IS_CHAR_CH(CUR)) { + if (CUR == 0) { htmlAutoCloseOnEnd(ctxt); } @@ -4451,7 +4443,7 @@ htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) { xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo); htmlNodeInfoPop(ctxt); } - if (!IS_CHAR_CH(CUR)) { + if (CUR == 0) { htmlAutoCloseOnEnd(ctxt); } } @@ -4600,7 +4592,7 @@ htmlParseContentInternal(htmlParserCtxtPtr ctxt) { "htmlParseStartTag: invalid element name\n", NULL, NULL); /* Dump the bogus tag like browsers do */ - while ((IS_CHAR_CH(CUR)) && (CUR != '>')) + while ((CUR == 0) && (CUR != '>')) NEXT; htmlParserFinishElementParsing(ctxt); diff --git a/fuzz/html.dict b/fuzz/html.dict index 9f58ed1e..801b7bb5 100644 --- a/fuzz/html.dict +++ b/fuzz/html.dict @@ -96,6 +96,10 @@ attr_style=" style=\"\"" comment="" +doctype="" +doctype_system="" +doctype_public="" + pi="" ref_lt="<" diff --git a/result/HTML/758606.html b/result/HTML/758606.html index 4f21f628..3974ca90 100644 --- a/result/HTML/758606.html +++ b/result/HTML/758606.html @@ -1,2 +1,2 @@ - + diff --git a/result/HTML/758606.html.err b/result/HTML/758606.html.err index 060433a8..e3e61265 100644 --- a/result/HTML/758606.html.err +++ b/result/HTML/758606.html.err @@ -1,16 +1,7 @@ -./test/HTML/758606.html:1: HTML parser error : Comment not terminated - diff --git a/result/HTML/wired.html.err b/result/HTML/wired.html.err index 70db11b0..116bbd2f 100644 --- a/result/HTML/wired.html.err +++ b/result/HTML/wired.html.err @@ -242,6 +242,9 @@ com&BANNER=Sprint" style="text-decoration:none">Sprint ^ ./test/HTML/wired.html:414: HTML parser error : Opening and ending tag mismatch: td and font + + ^ +./test/HTML/wired.html:414: HTML parser error : Opening and ending tag mismatch: td and font ^ ./test/HTML/wired.html:432: HTML parser error : htmlParseEntityRef: expecting ';' diff --git a/result/HTML/wired.html.sax b/result/HTML/wired.html.sax index d5b16297..bb787656 100644 --- a/result/HTML/wired.html.sax +++ b/result/HTML/wired.html.sax @@ -1962,7 +1962,6 @@ SAX.endElement(a) SAX.endElement(i) SAX.error: End tag : expected '>' SAX.endElement(font) -SAX.endElement(font) SAX.startElement(br) SAX.endElement(br) SAX.startElement(br) @@ -2023,6 +2022,8 @@ SAX.error: Opening and ending tag mismatch: td and font SAX.endElement(font) SAX.error: Opening and ending tag mismatch: td and font SAX.endElement(font) +SAX.error: Opening and ending tag mismatch: td and font +SAX.endElement(font) SAX.endElement(td) SAX.characters( , 1)