1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-07-01 09:41:47 +03:00

parsing real HTML is a nightmare.

- HTMLparser.c result/HTML/*: revamped the way the HTML
  parser handles end of tags or end of input
Daniel
This commit is contained in:
Daniel Veillard
2001-04-12 15:42:58 +00:00
parent 82daa81a8b
commit a3bfca59bf
14 changed files with 393 additions and 320 deletions

View File

@ -445,7 +445,7 @@ htmlElemDesc html40ElementTable[] = {
{ "th", 0, 1, 0, 0, 0, 0, "table header cell" },
{ "thead", 0, 1, 0, 0, 0, 0, "table header " },
{ "title", 0, 0, 0, 0, 0, 0, "document title " },
{ "tr", 0, 1, 0, 0, 0, 0, "table row " },
{ "tr", 0, 0, 0, 0, 0, 0, "table row " },
{ "tt", 0, 0, 0, 0, 0, 0, "teletype or monospaced text style" },
{ "u", 0, 0, 0, 0, 1, 1, "underlined text style" },
{ "ul", 0, 0, 0, 0, 0, 0, "unordered list " },
@ -661,6 +661,7 @@ htmlCheckAutoClose(const xmlChar *newtag, const xmlChar *oldtag) {
* htmlAutoCloseOnClose:
* @ctxt: an HTML parser context
* @newtag: The new tag name
* @force: force the tag closure
*
* The HTML DTD allows an ending tag to implicitly close other tags.
*/
@ -688,11 +689,7 @@ htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnClose: %s closes %s\n", newtag, ctxt->name);
#endif
} else {
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData,
"Opening and ending tag mismatch: %s and %s\n",
newtag, ctxt->name);
ctxt->wellFormed = 0;
return;
}
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
ctxt->sax->endElement(ctxt->userData, ctxt->name);
@ -706,6 +703,39 @@ htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
}
}
/**
 * htmlAutoCloseOnEnd:
 * @ctxt:  an HTML parser context
 *
 * Drain the stack of element names when the end of the stream is reached.
 * For each entry counted in ctxt->nameNr on entry, the SAX endElement
 * callback (when one is installed) is invoked with the current ctxt->name,
 * then the name is popped with htmlnamePop() and released with xmlFree().
 * Nothing is done when the stack is already empty.
 */
static void
htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) {
    xmlChar *popped;
    int depth;

    if (ctxt->nameNr == 0)
        return;

#ifdef DEBUG
    xmlGenericError(xmlGenericErrorContext,"Close of stack: %d elements\n", ctxt->nameNr);
#endif

    /*
     * The number of passes is fixed by the stack depth seen on entry;
     * depth walks from the last slot down to slot 0.
     */
    depth = ctxt->nameNr - 1;
    while (depth >= 0) {
#ifdef DEBUG
        xmlGenericError(xmlGenericErrorContext,"%d : %s\n", depth, ctxt->nameTab[depth]);
#endif
        /* Re-check the handler on every pass, exactly as before */
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) {
            ctxt->sax->endElement(ctxt->userData, ctxt->name);
        }

        /* The popped name is handed back to us, so it is ours to free */
        popped = htmlnamePop(ctxt);
        if (popped != NULL) {
#ifdef DEBUG
            xmlGenericError(xmlGenericErrorContext,"htmlAutoCloseOnEnd: popped %s\n", popped);
#endif
            xmlFree(popped);
        }

        depth--;
    }
}
/**
* htmlAutoClose:
* @ctxt: an HTML parser context
@ -737,9 +767,8 @@ htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
}
}
if (newtag == NULL) {
htmlAutoCloseOnClose(ctxt, BAD_CAST"head");
htmlAutoCloseOnClose(ctxt, BAD_CAST"body");
htmlAutoCloseOnClose(ctxt, BAD_CAST"html");
htmlAutoCloseOnEnd(ctxt);
return;
}
while ((newtag == NULL) && (ctxt->name != NULL) &&
((xmlStrEqual(ctxt->name, BAD_CAST"head")) ||
@ -3266,10 +3295,8 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {
* Fourth : end of the resource
*/
else if (CUR == 0) {
int level = ctxt->nodeNr;
htmlAutoClose(ctxt, NULL);
if (level == ctxt->nodeNr)
break;
htmlAutoCloseOnEnd(ctxt);
break;
}
/*
@ -3439,29 +3466,6 @@ htmlParseElement(htmlParserCtxtPtr ctxt) {
if (ctxt->nameNr < depth) break;
}
if (!IS_CHAR(CUR)) {
/************
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData,
"Premature end of data in tag %s\n", currentNode);
ctxt->wellFormed = 0;
*************/
/*
* end of parsing of this node.
*/
nodePop(ctxt);
oldname = htmlnamePop(ctxt);
#ifdef DEBUG
xmlGenericError(xmlGenericErrorContext,"Premature end of tag %s : popping out %s\n", name, oldname);
#endif
if (oldname != NULL)
xmlFree(oldname);
if (currentNode != NULL)
xmlFree(currentNode);
return;
}
/*
* Capture end position and add node
*/
@ -3472,6 +3476,10 @@ htmlParseElement(htmlParserCtxtPtr ctxt) {
node_info.node = ctxt->node;
xmlParserAddNodeInfo(ctxt, &node_info);
}
if (!IS_CHAR(CUR)) {
htmlAutoCloseOnEnd(ctxt);
}
if (currentNode != NULL)
xmlFree(currentNode);
}
@ -3556,7 +3564,7 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) {
* autoclose
*/
if (CUR == 0)
htmlAutoClose(ctxt, NULL);
htmlAutoCloseOnEnd(ctxt);
/*
@ -3899,7 +3907,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
else
avail = in->buf->buffer->use - (in->cur - in->base);
if ((avail == 0) && (terminate)) {
htmlAutoClose(ctxt, NULL);
htmlAutoCloseOnEnd(ctxt);
if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
/*
* SAX: end of the document processing.
@ -4077,9 +4085,6 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
goto done;
} else {
ctxt->errNo = XML_ERR_DOCUMENT_END;
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData,
"Extra content at the end of the document\n");
ctxt->wellFormed = 0;
ctxt->instate = XML_PARSER_EOF;
#ifdef DEBUG_PUSH
@ -4491,7 +4496,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
}
done:
if ((avail == 0) && (terminate)) {
htmlAutoClose(ctxt, NULL);
htmlAutoCloseOnEnd(ctxt);
if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
/*
* SAX: end of the document processing.
@ -4555,9 +4560,6 @@ htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
(ctxt->instate != XML_PARSER_EPILOG) &&
(ctxt->instate != XML_PARSER_MISC)) {
ctxt->errNo = XML_ERR_DOCUMENT_END;
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData,
"Extra content at the end of the document\n");
ctxt->wellFormed = 0;
}
if (ctxt->instate != XML_PARSER_EOF) {