From e179f3ec0ef3238ca2e23693cdbc271c7480998f Mon Sep 17 00:00:00 2001 From: Nick Wellnhofer Date: Wed, 11 Sep 2024 17:29:59 +0200 Subject: [PATCH] html: Stop reporting syntax errors It doesn't make much sense to keep the old syntax error handling which doesn't conform to HTML5. Handling HTML5 parser errors is rather involved and not essential for parsers. --- HTMLparser.c | 167 +++++++++++------------------------ python/tests/pushSAXhtml.py | 3 +- result/HTML/names.html.err | 3 - result/HTML/names.html.sax | 1 - result/HTML/utf8bug.html.err | 3 - result/HTML/utf8bug.html.sax | 1 - result/HTML/wired.html.err | 3 - result/HTML/wired.html.sax | 1 - 8 files changed, 53 insertions(+), 129 deletions(-) delete mode 100644 result/HTML/names.html.err delete mode 100644 result/HTML/utf8bug.html.err diff --git a/HTMLparser.c b/HTMLparser.c index bca44958..c111af5d 100644 --- a/HTMLparser.c +++ b/HTMLparser.c @@ -2958,28 +2958,18 @@ htmlParseAttValue(htmlParserCtxtPtr ctxt) { if (CUR == '"') { SKIP(1); ret = htmlParseHTMLAttribute(ctxt, '"'); - if (CUR != '"') { - htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED, - "AttValue: \" expected\n", NULL, NULL); - } else + if (CUR == '"') SKIP(1); } else if (CUR == '\'') { SKIP(1); ret = htmlParseHTMLAttribute(ctxt, '\''); - if (CUR != '\'') { - htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED, - "AttValue: ' expected\n", NULL, NULL); - } else + if (CUR == '\'') SKIP(1); } else { /* * That's an HTMLism, the attribute value may not be quoted */ ret = htmlParseHTMLAttribute(ctxt, 0); - if (ret == NULL) { - htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE, - "AttValue: no value found\n", NULL, NULL); - } } return(ret); } @@ -3561,11 +3551,8 @@ htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) { *value = NULL; name = htmlParseHTMLName(ctxt, 1); - if (name == NULL) { - htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, - "error parsing attribute name\n", NULL, NULL); + if (name == NULL) return(NULL); - } /* * read the value @@ -3702,55 +3689,53 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) { GROW; name = htmlParseHTMLName(ctxt, 0); - if (name == NULL) { - htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, - "htmlParseStartTag: invalid element name\n", - NULL, NULL); + if (name == NULL) return -1; - } if (xmlStrEqual(name, BAD_CAST"meta")) meta = 1; - /* - * Check for auto-closure of HTML elements. - */ - htmlAutoClose(ctxt, name); + if ((ctxt->options & HTML_PARSE_HTML5) == 0) { + /* + * Check for auto-closure of HTML elements. + */ + htmlAutoClose(ctxt, name); - /* - * Check for implied HTML elements. - */ - htmlCheckImplied(ctxt, name); + /* + * Check for implied HTML elements. + */ + htmlCheckImplied(ctxt, name); - /* - * Avoid html at any level > 0, head at any level != 1 - * or any attempt to recurse body - */ - if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) { - htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, - "htmlParseStartTag: misplaced tag\n", - name, NULL); - discardtag = 1; - ctxt->depth++; - } - if ((ctxt->nameNr != 1) && - (xmlStrEqual(name, BAD_CAST"head"))) { - htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, - "htmlParseStartTag: misplaced tag\n", - name, NULL); - discardtag = 1; - ctxt->depth++; - } - if (xmlStrEqual(name, BAD_CAST"body")) { - int indx; - for (indx = 0;indx < ctxt->nameNr;indx++) { - if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) { - htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, - "htmlParseStartTag: misplaced tag\n", - name, NULL); - discardtag = 1; - ctxt->depth++; - } - } + /* + * Avoid html at any level > 0, head at any level != 1 + * or any attempt to recurse body + */ + if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) { + htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, + "htmlParseStartTag: misplaced tag\n", + name, NULL); + discardtag = 1; + ctxt->depth++; + } + if ((ctxt->nameNr != 1) && + (xmlStrEqual(name, BAD_CAST"head"))) { + htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, + "htmlParseStartTag: misplaced tag\n", + name, NULL); + discardtag = 1; + ctxt->depth++; + } + if (xmlStrEqual(name, BAD_CAST"body")) { + int indx; + for (indx = 0;indx < ctxt->nameNr;indx++) { + if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) { + htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, + "htmlParseStartTag: misplaced tag\n", + name, NULL); + discardtag = 1; + ctxt->depth++; + } + } + } } /* @@ -3778,8 +3763,6 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) { */ for (i = 0; i < nbatts;i += 2) { if (xmlStrEqual(atts[i], attname)) { - htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED, - "Attribute %s redefined\n", attname, NULL); if (attvalue != NULL) xmlFree(attvalue); goto failed; @@ -3894,8 +3877,6 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt) int i, ret; if ((CUR != '<') || (NXT(1) != '/')) { - htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED, - "htmlParseEndTag: 'endCheckState = info->dataMode; - } if (ctxt->record_info) htmlNodeInfoPush(ctxt, &node_info); @@ -4201,22 +4178,9 @@ htmlParseElementInternal(htmlParserCtxtPtr ctxt) { return(0); } - if (CUR == '>') { - SKIP(1); - } else { - htmlParseErr(ctxt, XML_ERR_GT_REQUIRED, - "Couldn't find end of Start Tag %s\n", name, NULL); - - /* - * end of parsing of this node. - */ - if (xmlStrEqual(name, ctxt->name)) { - htmlParserFinishElementParsing(ctxt); - nodePop(ctxt); - htmlnamePop(ctxt); - } - return(0); - } + if (CUR != '>') + return(0); + SKIP(1); /* * Check for an Empty Element from DTD definition @@ -4358,10 +4322,6 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) { * Wipe out everything which is before the first '<' */ SKIP_BLANKS; - if (ctxt->input->cur >= ctxt->input->end) { - htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY, - "Document is empty\n", NULL, NULL); - } if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX)) ctxt->sax->startDocument(ctxt->userData); @@ -5018,12 +4978,8 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { * Lookup the info for that element. */ info = htmlTagLookup(name); - if (info == NULL) { - htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG, - "Tag %s invalid\n", name, NULL); - } else { + if (info != NULL) ctxt->endCheckState = info->dataMode; - } /* * Check for an Empty Element labeled the XML/SGML way @@ -5041,28 +4997,9 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { break; } - if (CUR == '>') { - SKIP(1); - } else { - htmlParseErr(ctxt, XML_ERR_GT_REQUIRED, - "Couldn't find end of Start Tag %s\n", - name, NULL); - - /* - * end of parsing of this node. - */ - if (xmlStrEqual(name, ctxt->name)) { - htmlParserFinishElementParsing(ctxt); - nodePop(ctxt); - htmlnamePop(ctxt); - } - - if (ctxt->record_info) - htmlNodeInfoPush(ctxt, &node_info); - - ctxt->instate = XML_PARSER_CONTENT; - break; - } + if (CUR != '>') + break; + SKIP(1); /* * Check for an Empty Element from DTD definition diff --git a/python/tests/pushSAXhtml.py b/python/tests/pushSAXhtml.py index c32cd3e7..d9c3cc84 100755 --- a/python/tests/pushSAXhtml.py +++ b/python/tests/pushSAXhtml.py @@ -50,8 +50,7 @@ chunk = "ar" ctxt.htmlParseChunk(chunk, len(chunk), 1) ctxt=None -reference = """startDocument:startElement html None:startElement body None:startElement foo {'url': 'tst'}:error: Tag foo invalid -:characters: bar:endElement foo:endElement body:endElement html:endDocument:""" +reference = """startDocument:startElement html None:startElement body None:startElement foo {'url': 'tst'}:characters: bar:endElement foo:endElement body:endElement html:endDocument:""" if log != reference: print("Error got: %s" % log) print("Exprected: %s" % reference) diff --git a/result/HTML/names.html.err b/result/HTML/names.html.err deleted file mode 100644 index 4d91a5d2..00000000 --- a/result/HTML/names.html.err +++ /dev/null @@ -1,3 +0,0 @@ -./test/HTML/names.html:3: HTML parser error : Tag o:p invalid - - ^ diff --git a/result/HTML/names.html.sax b/result/HTML/names.html.sax index 12a107f8..7810c498 100644 --- a/result/HTML/names.html.sax +++ b/result/HTML/names.html.sax @@ -7,7 +7,6 @@ SAX.startElement(body) SAX.characters( , 3) SAX.startElement(o:p) -SAX.error: Tag o:p invalid SAX.endElement(o:p) SAX.characters( , 1) diff --git a/result/HTML/utf8bug.html.err b/result/HTML/utf8bug.html.err deleted file mode 100644 index 55f30ae8..00000000 --- a/result/HTML/utf8bug.html.err +++ /dev/null @@ -1,3 +0,0 @@ -./test/HTML/utf8bug.html:121: HTML parser error : Tag s1 invalid -ز همکاران است. روی آن کلیک کند.