From b8f25c9118ecfe07a40a0d534f3e1598e43959a3 Mon Sep 17 00:00:00 2001 From: Daniel Veillard Date: Sat, 19 Aug 2000 19:52:36 +0000 Subject: [PATCH] work done on auto-opening of <p>

tags and cleanup of SAX output, Daniel. --- ChangeLog | 5 ++ HTMLparser.c | 155 ++++++++++++++++++++++++++++++++--- HTMLtree.h | 2 + SAX.c | 4 +- include/libxml/HTMLtree.h | 2 + result/HTML/Down.html | 2 + result/HTML/Down.html.sax | 19 ++++- result/HTML/doc2.htm | 2 +- result/HTML/doc2.htm.err | 4 +- result/HTML/doc2.htm.sax | 15 +++- result/HTML/doc3.htm | 2 +- result/HTML/doc3.htm.err | 4 +- result/HTML/doc3.htm.sax | 15 +++- result/HTML/fp40.htm.sax | 17 +++- result/HTML/liclose.html.sax | 15 +++- result/HTML/reg1.html.sax | 17 +++- result/HTML/reg2.html.sax | 17 +++- result/HTML/reg3.html.sax | 19 ++++- result/HTML/reg4.html.sax | 22 ++++- result/HTML/test2.html.sax | 15 +++- result/HTML/test3.html.sax | 31 ++++++- result/HTML/wired.html.sax | 14 ++++ tree.c | 15 +++- 23 files changed, 373 insertions(+), 40 deletions(-) diff --git a/ChangeLog b/ChangeLog index ff671b5e..f3e161ba 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +Sat Aug 19 21:02:08 CEST 2000 Daniel Veillard + + * HTMLparser.c SAX.c tree.c HTMLtree.h result/HTML/*: work + done on auto-opening of

tags and cleanup of SAX output + Sat Aug 19 18:45:40 CEST 2000 Daniel Veillard * libxml.4 xmllint.1 Makefile.am libxml.spec.in: added man pages diff --git a/HTMLparser.c b/HTMLparser.c index 9f7da5ce..75edb108 100644 --- a/HTMLparser.c +++ b/HTMLparser.c @@ -552,6 +552,20 @@ char *htmlStartClose[] = { NULL }; +/* + * The list of HTML elements which are supposed not to have + * CDATA content and where a p element will be implied + * + * TODO: extend that list by reading the HTML SGML DtD on + * implied paragraph + */ +static char *htmlNoContentElements[] = { + "html", + "head", + "body", + NULL +}; + static char** htmlStartCloseIndex[100]; static int htmlStartCloseIndexinitialized = 0; @@ -845,6 +859,49 @@ htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) { } } +/** + * htmlCheckParagraph + * @ctxt: an HTML parser context + * + * Check whether a p element need to be implied before inserting + * characters in the current element. + * + * Returns 1 if a paragraph has been inserted, 0 if not and -1 + * in case of error. 
+ */ + +int +htmlCheckParagraph(htmlParserCtxtPtr ctxt) { + const xmlChar *tag; + int i; + + if (ctxt == NULL) + return(-1); + tag = ctxt->name; + if (tag == NULL) { + htmlAutoClose(ctxt, BAD_CAST"p"); + htmlCheckImplied(ctxt, BAD_CAST"p"); + htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p")); + if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) + ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL); + return(1); + } + for (i = 0; htmlNoContentElements[i] != NULL; i++) { + if (!xmlStrcmp(tag, BAD_CAST htmlNoContentElements[i])) { +#ifdef DEBUG + fprintf(stderr,"Implied element paragraph\n"); +#endif + htmlAutoClose(ctxt, BAD_CAST"p"); + htmlCheckImplied(ctxt, BAD_CAST"p"); + htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p")); + if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) + ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL); + return(1); + } + } + return(0); +} + /************************************************************************ * * * The list of HTML predefined entities * @@ -1253,7 +1310,8 @@ UTF8ToHtml(unsigned char* out, int *outlen, sizeof(html40EntitiesTable[0]));i++) { if (html40EntitiesTable[i].value == c) { #ifdef DEBUG - fprintf(stderr,"Found entity %s\n", name); + fprintf(stderr,"Found entity %s\n", + html40EntitiesTable[i].name); #endif goto found_ent; } @@ -1496,20 +1554,21 @@ htmlHandleEntity(htmlParserCtxtPtr ctxt, xmlEntityPtr entity) { /* * Just handle the content as a set of chars. 
*/ + htmlCheckParagraph(ctxt); if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) ctxt->sax->characters(ctxt->userData, entity->content, len); } /** - * htmlNewDoc: + * htmlNewDocNoDtD: * @URI: URI for the dtd, or NULL * @ExternalID: the external ID of the DTD, or NULL * - * Returns a new document + * Returns a new document, do not intialize the DTD if not provided */ htmlDocPtr -htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) { +htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) { xmlDocPtr cur; /* @@ -1525,12 +1584,8 @@ htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) { cur->type = XML_HTML_DOCUMENT_NODE; cur->version = NULL; cur->intSubset = NULL; - if ((ExternalID == NULL) && - (URI == NULL)) - xmlCreateIntSubset(cur, BAD_CAST "HTML", - BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN", - BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd"); - else + if ((ExternalID != NULL) || + (URI != NULL)) xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI); cur->doc = cur; cur->name = NULL; @@ -1548,6 +1603,23 @@ htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) { return(cur); } +/** + * htmlNewDoc: + * @URI: URI for the dtd, or NULL + * @ExternalID: the external ID of the DTD, or NULL + * + * Returns a new document + */ +htmlDocPtr +htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) { + if ((URI == NULL) && (ExternalID == NULL)) + return(htmlNewDocNoDtD( + BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN", + BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd")); + + return(htmlNewDocNoDtD(URI, ExternalID)); +} + /************************************************************************ * * @@ -2062,6 +2134,7 @@ htmlParseCharData(htmlParserCtxtPtr ctxt, int cdata) { ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar); } else { + htmlCheckParagraph(ctxt); if (ctxt->sax->characters != NULL) ctxt->sax->characters(ctxt->userData, buf, nbchar); } @@ -2080,6 +2153,7 @@ htmlParseCharData(htmlParserCtxtPtr ctxt, 
int cdata) { if (ctxt->sax->ignorableWhitespace != NULL) ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar); } else { + htmlCheckParagraph(ctxt); if (ctxt->sax->characters != NULL) ctxt->sax->characters(ctxt->userData, buf, nbchar); } @@ -2861,16 +2935,19 @@ htmlParseReference(htmlParserCtxtPtr ctxt) { } out[i] = 0; + htmlCheckParagraph(ctxt); if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) ctxt->sax->characters(ctxt->userData, out, i); } else { ent = htmlParseEntityRef(ctxt, &name); if (name == NULL) { + htmlCheckParagraph(ctxt); if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1); return; } if ((ent == NULL) || (ent->value <= 0)) { + htmlCheckParagraph(ctxt); if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) { ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1); ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name)); @@ -2895,6 +2972,7 @@ htmlParseReference(htmlParserCtxtPtr ctxt) { } out[i] = 0; + htmlCheckParagraph(ctxt); if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) ctxt->sax->characters(ctxt->userData, out, i); } @@ -2941,6 +3019,21 @@ htmlParseContent(htmlParserCtxtPtr ctxt) { return; } + /* + * Sometimes DOCTYPE arrives in the middle of the document + */ + if ((CUR == '<') && (NXT(1) == '!') && + (UPP(2) == 'D') && (UPP(3) == 'O') && + (UPP(4) == 'C') && (UPP(5) == 'T') && + (UPP(6) == 'Y') && (UPP(7) == 'P') && + (UPP(8) == 'E')) { + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "Misplaced DOCTYPE declaration\n"); + ctxt->wellFormed = 0; + htmlParseDocTypeDecl(ctxt); + } + /* * First case : a comment */ @@ -3185,6 +3278,8 @@ htmlParseElement(htmlParserCtxtPtr ctxt) { int htmlParseDocument(htmlParserCtxtPtr ctxt) { + xmlDtdPtr dtd; + htmlDefaultSAXHandlerInit(); ctxt->html = 1; @@ -3258,6 +3353,15 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) { */ if ((ctxt->sax) && (ctxt->sax->endDocument != 
NULL)) ctxt->sax->endDocument(ctxt->userData); + + if (ctxt->myDoc != NULL) { + dtd = xmlGetIntSubset(ctxt->myDoc); + if (dtd == NULL) + ctxt->myDoc->intSubset = + xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML", + BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN", + BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd"); + } if (! ctxt->wellFormed) return(-1); return(0); } @@ -3848,6 +3952,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { xmlChar chr[2] = { 0 , 0 } ; chr[0] = (xmlChar) ctxt->token; + htmlCheckParagraph(ctxt); if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) ctxt->sax->characters(ctxt->userData, chr, 1); ctxt->token = 0; @@ -3862,6 +3967,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { ctxt->sax->ignorableWhitespace( ctxt->userData, &cur, 1); } else { + htmlCheckParagraph(ctxt); if (ctxt->sax->characters != NULL) ctxt->sax->characters( ctxt->userData, &cur, 1); @@ -3878,7 +3984,23 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { cur = in->cur[0]; next = in->cur[1]; cons = ctxt->nbChars; - if ((cur == '<') && (next == '!') && + /* + * Sometimes DOCTYPE arrives in the middle of the document + */ + if ((cur == '<') && (next == '!') && + (UPP(2) == 'D') && (UPP(3) == 'O') && + (UPP(4) == 'C') && (UPP(5) == 'T') && + (UPP(6) == 'Y') && (UPP(7) == 'P') && + (UPP(8) == 'E')) { + if ((!terminate) && + (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0)) + goto done; + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "Misplaced DOCTYPE declaration\n"); + ctxt->wellFormed = 0; + htmlParseDocTypeDecl(ctxt); + } else if ((cur == '<') && (next == '!') && (in->cur[2] == '-') && (in->cur[3] == '-')) { if ((!terminate) && (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0)) @@ -4040,6 +4162,17 @@ done: ctxt->sax->endDocument(ctxt->userData); } } + if ((ctxt->myDoc != NULL) && + ((terminate) || (ctxt->instate == XML_PARSER_EOF) || + (ctxt->instate == 
XML_PARSER_EPILOG))) { + xmlDtdPtr dtd; + dtd = xmlGetIntSubset(ctxt->myDoc); + if (dtd == NULL) + ctxt->myDoc->intSubset = + xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML", + BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN", + BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd"); + } #ifdef DEBUG_PUSH fprintf(stderr, "HPP: done %d\n", ret); #endif diff --git a/HTMLtree.h b/HTMLtree.h index feff3a47..17043b78 100644 --- a/HTMLtree.h +++ b/HTMLtree.h @@ -25,6 +25,8 @@ extern "C" { htmlDocPtr htmlNewDoc (const xmlChar *URI, const xmlChar *ExternalID); +htmlDocPtr htmlNewDocNoDtD (const xmlChar *URI, + const xmlChar *ExternalID); const xmlChar * htmlGetMetaEncoding (htmlDocPtr doc); int htmlSetMetaEncoding (htmlDocPtr doc, const xmlChar *encoding); diff --git a/SAX.c b/SAX.c index ecc5331b..fb5e741c 100644 --- a/SAX.c +++ b/SAX.c @@ -169,6 +169,8 @@ internalSubset(void *ctx, const xmlChar *name, return; dtd = xmlGetIntSubset(ctxt->myDoc); if (dtd != NULL) { + if (ctxt->html) + return; xmlUnlinkNode((xmlNodePtr) dtd); xmlFreeDtd(dtd); ctxt->myDoc->intSubset = NULL; @@ -605,7 +607,7 @@ startDocument(void *ctx) if (ctxt->html) { if (ctxt->myDoc == NULL) #ifdef LIBXML_HTML_ENABLED - ctxt->myDoc = htmlNewDoc(NULL, NULL); + ctxt->myDoc = htmlNewDocNoDtD(NULL, NULL); #else fprintf(stderr, "libxml2 built without HTML support\n"); #endif diff --git a/include/libxml/HTMLtree.h b/include/libxml/HTMLtree.h index feff3a47..17043b78 100644 --- a/include/libxml/HTMLtree.h +++ b/include/libxml/HTMLtree.h @@ -25,6 +25,8 @@ extern "C" { htmlDocPtr htmlNewDoc (const xmlChar *URI, const xmlChar *ExternalID); +htmlDocPtr htmlNewDocNoDtD (const xmlChar *URI, + const xmlChar *ExternalID); const xmlChar * htmlGetMetaEncoding (htmlDocPtr doc); int htmlSetMetaEncoding (htmlDocPtr doc, const xmlChar *encoding); diff --git a/result/HTML/Down.html b/result/HTML/Down.html index 2eb3e8bd..7a004e56 100644 --- a/result/HTML/Down.html +++ b/result/HTML/Down.html @@ -3,8 +3,10 @@ This service is 
temporary down

Sorry, this service is temporary down

+

We are doing our best to get it back on-line, +

The W3C system administrators

diff --git a/result/HTML/Down.html.sax b/result/HTML/Down.html.sax index ce3052eb..6b23930a 100644 --- a/result/HTML/Down.html.sax +++ b/result/HTML/Down.html.sax @@ -1,36 +1,53 @@ SAX.setDocumentLocator() SAX.startDocument() SAX.startElement(html) +SAX.startElement(body) +SAX.startElement(p) SAX.characters( , 1) +SAX.endElement(p) SAX.startElement(head) +SAX.endElement(head) +SAX.startElement(p) SAX.characters( , 3) +SAX.endElement(p) SAX.startElement(title) SAX.characters(This service is temporary down, 30) SAX.endElement(title) +SAX.startElement(p) SAX.characters( , 1) -SAX.endElement(head) +SAX.error: Unexpected end tag : head SAX.characters( , 2) +SAX.endElement(p) SAX.startElement(body, bgcolor='#FFFFFF') +SAX.startElement(p) SAX.characters( , 1) +SAX.endElement(p) SAX.startElement(h1, align='center') SAX.characters(Sorry, this service is tempora, 37) SAX.endElement(h1) +SAX.startElement(p) SAX.characters( We are doing our best to get , 48) +SAX.endElement(p) SAX.startElement(p) SAX.characters(The W3C system administrators, 29) SAX.endElement(p) +SAX.startElement(p) SAX.characters( , 1) +SAX.endElement(p) SAX.endElement(body) +SAX.startElement(p) SAX.characters( , 1) +SAX.endElement(p) +SAX.endElement(body) SAX.endElement(html) SAX.ignorableWhitespace( , 1) diff --git a/result/HTML/doc2.htm b/result/HTML/doc2.htm index 31db4968..07fcc3e2 100644 --- a/result/HTML/doc2.htm +++ b/result/HTML/doc2.htm @@ -8,7 +8,7 @@ function NS_NewOpen(url,nam,atr){return(new NS_NullWindow());} window.open=NS_NewOpen; -!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"> + diff --git a/result/HTML/doc2.htm.err b/result/HTML/doc2.htm.err index 9e37bf47..bf46ffad 100644 --- a/result/HTML/doc2.htm.err +++ b/result/HTML/doc2.htm.err @@ -1,3 +1,3 @@ -./test/HTML/doc2.htm:10: error: htmlParseStartTag: invalid element name +./test/HTML/doc2.htm:10: error: Misplaced DOCTYPE declaration -!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2//EN"> +