From a3bfca59bf9a5b46dee8081d4c8a48740d6388f6 Mon Sep 17 00:00:00 2001
From: Daniel Veillard
Date: Thu, 12 Apr 2001 15:42:58 +0000
Subject: [PATCH] parsing real HTML is a nightmare. - HTMLparser.c
result/HTML/*: revamped the way the HTML parser handles end of tags or end
of input Daniel
---
ChangeLog | 5 +
HTMLparser.c | 92 +++++++--------
result/HTML/autoclose3.html.err | 3 -
result/HTML/autoclose3.html.sax | 1 -
result/HTML/doc3.htm | 27 +++--
result/HTML/doc3.htm.err | 76 +++++++-----
result/HTML/doc3.htm.sax | 78 +++++++------
result/HTML/entities.html.err | 4 +-
result/HTML/test3.html | 4 +-
result/HTML/test3.html.err | 6 +-
result/HTML/test3.html.sax | 20 ++--
result/HTML/wired.html | 83 +++++++-------
result/HTML/wired.html.err | 197 ++++++++++++++++++--------------
result/HTML/wired.html.sax | 117 ++++++++++---------
14 files changed, 393 insertions(+), 320 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index 9a947ab9..9ad5c1ff 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+Thu Apr 12 17:41:09 CEST 2001 Daniel Veillard
+
+ * HTMLparser.c result/HTML/*: revamped the way the HTML
+ parser handles end of tags or end of input
+
Thu Apr 12 10:50:34 CEST 2001 Daniel Veillard
* tree.[ch] : added xmlDocCopyNode for gdome2 support
diff --git a/HTMLparser.c b/HTMLparser.c
index 39447e3a..4b3bac86 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -445,7 +445,7 @@ htmlElemDesc html40ElementTable[] = {
{ "th", 0, 1, 0, 0, 0, 0, "table header cell" },
{ "thead", 0, 1, 0, 0, 0, 0, "table header " },
{ "title", 0, 0, 0, 0, 0, 0, "document title " },
-{ "tr", 0, 1, 0, 0, 0, 0, "table row " },
+{ "tr", 0, 0, 0, 0, 0, 0, "table row " },
{ "tt", 0, 0, 0, 0, 0, 0, "teletype or monospaced text style" },
{ "u", 0, 0, 0, 0, 1, 1, "underlined text style" },
{ "ul", 0, 0, 0, 0, 0, 0, "unordered list " },
@@ -661,6 +661,7 @@ htmlCheckAutoClose(const xmlChar *newtag, const xmlChar *oldtag) {
* htmlAutoCloseOnClose:
* @ctxt: an HTML parser context
* @newtag: The new tag name
+ * NOTE(review): there is no 'force' parameter; the signature is (ctxt, newtag)
*
* The HTmL DtD allows an ending tag to implicitely close other tags.
*/
@@ -688,11 +689,7 @@ htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
#endif
} else {
- if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
- ctxt->sax->error(ctxt->userData,
- "Opening and ending tag mismatch: %s and %s\n",
- newtag, ctxt->name);
- ctxt->wellFormed = 0;
+ return;
}
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
ctxt->sax->endElement(ctxt->userData, ctxt->name);
@@ -706,6 +703,39 @@ htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
}
}
+/**
+ * htmlAutoCloseOnEnd:
+ * @ctxt: an HTML parser context
+ *
+ * Close all tags left open at the end of the stream: one SAX endElement call (if the handler is set) per name-stack entry, passing ctxt->name, then the entry is popped and freed. Returns nothing.
+ */
+static void
+htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) {
+ xmlChar *oldname;
+ int i;
+
+ if (ctxt->nameNr == 0)
+ return; /* name stack already empty: nothing to close */
+#ifdef DEBUG
+ xmlGenericError(xmlGenericErrorContext,"Close of stack: %d elements\n", ctxt->nameNr);
+#endif
+
+ for (i = (ctxt->nameNr - 1);i >= 0;i--) { /* one pass per entry, from index nameNr - 1 down to 0 */
+#ifdef DEBUG
+ xmlGenericError(xmlGenericErrorContext,"%d : %s\n", i, ctxt->nameTab[i]);
+#endif
+ if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
+ ctxt->sax->endElement(ctxt->userData, ctxt->name);
+ oldname = htmlnamePop(ctxt); /* presumably also updates ctxt->name to the new top; htmlnamePop is defined outside this hunk -- confirm */
+ if (oldname != NULL) {
+#ifdef DEBUG
+ xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnEnd: popped %s\n", oldname);
+#endif
+ xmlFree(oldname); /* the popped name is released here */
+ }
+ }
+}
+
/**
* htmlAutoClose:
* @ctxt: an HTML parser context
@@ -737,9 +767,8 @@ htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
}
}
if (newtag == NULL) {
- htmlAutoCloseOnClose(ctxt, BAD_CAST"head");
- htmlAutoCloseOnClose(ctxt, BAD_CAST"body");
- htmlAutoCloseOnClose(ctxt, BAD_CAST"html");
+ htmlAutoCloseOnEnd(ctxt);
+ return;
}
while ((newtag == NULL) && (ctxt->name != NULL) &&
((xmlStrEqual(ctxt->name, BAD_CAST"head")) ||
@@ -3266,10 +3295,8 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {
* Fourth : end of the resource
*/
else if (CUR == 0) {
- int level = ctxt->nodeNr;
- htmlAutoClose(ctxt, NULL);
- if (level == ctxt->nodeNr)
- break;
+ htmlAutoCloseOnEnd(ctxt);
+ break;
}
/*
@@ -3439,29 +3466,6 @@ htmlParseElement(htmlParserCtxtPtr ctxt) {
if (ctxt->nameNr < depth) break;
}
- if (!IS_CHAR(CUR)) {
- /************
- if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
- ctxt->sax->error(ctxt->userData,
- "Premature end of data in tag %s\n", currentNode);
- ctxt->wellFormed = 0;
- *************/
-
- /*
- * end of parsing of this node.
- */
- nodePop(ctxt);
- oldname = htmlnamePop(ctxt);
-#ifdef DEBUG
- xmlGenericError(xmlGenericErrorContext,"Premature end of tag %s : popping out %s\n", name, oldname);
-#endif
- if (oldname != NULL)
- xmlFree(oldname);
- if (currentNode != NULL)
- xmlFree(currentNode);
- return;
- }
-
/*
* Capture end position and add node
*/
@@ -3472,6 +3476,10 @@ htmlParseElement(htmlParserCtxtPtr ctxt) {
node_info.node = ctxt->node;
xmlParserAddNodeInfo(ctxt, &node_info);
}
+ if (!IS_CHAR(CUR)) {
+ htmlAutoCloseOnEnd(ctxt);
+ }
+
if (currentNode != NULL)
xmlFree(currentNode);
}
@@ -3556,7 +3564,7 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) {
* autoclose
*/
if (CUR == 0)
- htmlAutoClose(ctxt, NULL);
+ htmlAutoCloseOnEnd(ctxt);
/*
@@ -3899,7 +3907,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
else
avail = in->buf->buffer->use - (in->cur - in->base);
if ((avail == 0) && (terminate)) {
- htmlAutoClose(ctxt, NULL);
+ htmlAutoCloseOnEnd(ctxt);
if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
/*
* SAX: end of the document processing.
@@ -4077,9 +4085,6 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
goto done;
} else {
ctxt->errNo = XML_ERR_DOCUMENT_END;
- if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
- ctxt->sax->error(ctxt->userData,
- "Extra content at the end of the document\n");
ctxt->wellFormed = 0;
ctxt->instate = XML_PARSER_EOF;
#ifdef DEBUG_PUSH
@@ -4491,7 +4496,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
}
done:
if ((avail == 0) && (terminate)) {
- htmlAutoClose(ctxt, NULL);
+ htmlAutoCloseOnEnd(ctxt);
if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
/*
* SAX: end of the document processing.
@@ -4555,9 +4560,6 @@ htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
(ctxt->instate != XML_PARSER_EPILOG) &&
(ctxt->instate != XML_PARSER_MISC)) {
ctxt->errNo = XML_ERR_DOCUMENT_END;
- if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
- ctxt->sax->error(ctxt->userData,
- "Extra content at the end of the document\n");
ctxt->wellFormed = 0;
}
if (ctxt->instate != XML_PARSER_EOF) {
diff --git a/result/HTML/autoclose3.html.err b/result/HTML/autoclose3.html.err
index 09b9e332..e69de29b 100644
--- a/result/HTML/autoclose3.html.err
+++ b/result/HTML/autoclose3.html.err
@@ -1,3 +0,0 @@
-./test/HTML/autoclose3.html:4: error: Opening and ending tag mismatch: body and ul
-
-^
diff --git a/result/HTML/autoclose3.html.sax b/result/HTML/autoclose3.html.sax
index e8ed3303..25e06ead 100644
--- a/result/HTML/autoclose3.html.sax
+++ b/result/HTML/autoclose3.html.sax
@@ -13,7 +13,6 @@ SAX.startElement(li)
SAX.characters(item 2
, 7)
SAX.endElement(li)
-SAX.error: Opening and ending tag mismatch: body and ul
SAX.endElement(ul)
SAX.endElement(body)
SAX.endElement(html)
diff --git a/result/HTML/doc3.htm b/result/HTML/doc3.htm
index c5a4f669..0738497c 100644
--- a/result/HTML/doc3.htm
+++ b/result/HTML/doc3.htm
@@ -803,15 +803,15 @@ eval("page" + id + " = window.open(URL, '" + id + "', 'toolbars=0, scrollbars=0,
-
-
+
@@ -875,9 +877,7 @@ eval("page" + id + " = window.open(URL, '" + id + "', 'toolbars=0, scrollbars=0,
BP6.COM
Special
Code:BP6-hd
-
-
-
+
|
@@ -889,7 +889,14 @@ eval("page" + id + " = window.open(URL, '" + id + "', 'toolbars=0, scrollbars=0,
-
+
+
+
+
+
+
+
+