diff --git a/ChangeLog b/ChangeLog index ff457c99..11a0ec6c 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,10 @@ +Sun Oct 15 01:34:37 CEST 2000 Daniel Veillard + + * HTMLparser.c HTMLtree.[ch] SAX.c testHTML.c tree.c: fixed HTML + support for SCRIPT and STYLE with help from Bjorn Reese + * test/HTML/* result/HTML/*: added simple testcase and updated + the existing ones. + Fri Oct 13 18:24:31 CEST 2000 Daniel Veillard * xpath.c xpointer.c: XPointer reorder of ranges start/end and diff --git a/HTMLparser.c b/HTMLparser.c index 40a15a46..617b903f 100644 --- a/HTMLparser.c +++ b/HTMLparser.c @@ -559,7 +559,6 @@ static char *htmlNoContentElements[] = { NULL }; - static char** htmlStartCloseIndex[100]; static int htmlStartCloseIndexinitialized = 0; @@ -1863,7 +1862,7 @@ htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) { /* * allocate a translation buffer. */ - buffer_size = HTML_PARSER_BIG_BUFFER_SIZE; + buffer_size = HTML_PARSER_BUFFER_SIZE; buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar)); if (buffer == NULL) { perror("htmlParseHTMLAttribute: malloc failed"); @@ -2209,6 +2208,71 @@ htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) { return(ret); } +/** + * htmlParseScript: + * @ctxt: an HTML parser context + * + * parse the content of an HTML SCRIPT or STYLE element + * http://www.w3.org/TR/html4/sgml/dtd.html#Script + * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet + * http://www.w3.org/TR/html4/types.html#type-script + * http://www.w3.org/TR/html4/types.html#h-6.15 + * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1 + * + * Script data ( %Script; in the DTD) can be the content of the SCRIPT + * element and the value of intrinsic event attributes. User agents must + * not evaluate script data as HTML markup but instead must pass it on as + * data to a script engine. 
+ * NOTES: + * - The content is passed like CDATA + * - the attributes for style and scripting "onXXX" are also described + * as CDATA but SGML allows entities references in attributes so their + * processing is identical as other attributes + */ +void +htmlParseScript(htmlParserCtxtPtr ctxt) { + xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1]; + int nbchar = 0; + xmlChar cur; + + SHRINK; + cur = CUR; + while (IS_CHAR(cur)) { + if ((cur == '<') && (NXT(1) == '/')) { + /* + * One should break here, the specification is clear: + * Authors should therefore escape "</" within the content. + * Escape mechanisms are specific to each scripting or + * style sheet language + */ + if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) || + ((NXT(2) >= 'a') && (NXT(2) <= 'z'))) + break; /* while */ + } + buf[nbchar++] = cur; + if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) { + if ((ctxt->sax != NULL) && (!ctxt->disableSAX) && (ctxt->sax->cdataBlock != NULL)) { + /* + * Insert as CDATA, which is the same as HTML_PRESERVE_NODE + */ + ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar); + } + nbchar = 0; + } + NEXT; + cur = CUR; + } + if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) { + if (ctxt->sax->cdataBlock!= NULL) { + /* + * Insert as CDATA, which is the same as HTML_PRESERVE_NODE + */ + ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar); + } + } +} + + /** * htmlParseCharData: * @ctxt: an HTML parser context @@ -3112,68 +3176,75 @@ htmlParseContent(htmlParserCtxtPtr ctxt) { return; } - /* - * Sometimes DOCTYPE arrives in the middle of the document - */ - if ((CUR == '<') && (NXT(1) == '!') && - (UPP(2) == 'D') && (UPP(3) == 'O') && - (UPP(4) == 'C') && (UPP(5) == 'T') && - (UPP(6) == 'Y') && (UPP(7) == 'P') && - (UPP(8) == 'E')) { - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "Misplaced DOCTYPE declaration\n"); - ctxt->wellFormed = 0; - htmlParseDocTypeDecl(ctxt); - } - - /* - * First case : a comment - */ - if ((CUR == '<') && (NXT(1) == '!') && - (NXT(2) == '-') && (NXT(3) == '-')) { - htmlParseComment(ctxt); - } - - /* - * Second case : a sub-element. 
- */ - else if (CUR == '<') { - htmlParseElement(ctxt); - } - - /* - * Third case : a reference. If if has not been resolved, - * parsing returns it's Name, create the node - */ - else if (CUR == '&') { - htmlParseReference(ctxt); - } - - /* - * Fourth : end of the resource - */ - else if (CUR == 0) { - htmlAutoClose(ctxt, NULL); - } - - /* - * Last case, text. Note that References are handled directly. - */ - else { - htmlParseCharData(ctxt, 0); - } - - if (cons == ctxt->nbChars) { - if (ctxt->node != NULL) { + if ((xmlStrEqual(currentNode, BAD_CAST"script")) || + (xmlStrEqual(currentNode, BAD_CAST"style"))) { + /* + * Handle SCRIPT/STYLE separately + */ + htmlParseScript(ctxt); + } else { + /* + * Sometimes DOCTYPE arrives in the middle of the document + */ + if ((CUR == '<') && (NXT(1) == '!') && + (UPP(2) == 'D') && (UPP(3) == 'O') && + (UPP(4) == 'C') && (UPP(5) == 'T') && + (UPP(6) == 'Y') && (UPP(7) == 'P') && + (UPP(8) == 'E')) { if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, - "detected an error in element content\n"); + "Misplaced DOCTYPE declaration\n"); ctxt->wellFormed = 0; + htmlParseDocTypeDecl(ctxt); } - break; - } + /* + * First case : a comment + */ + if ((CUR == '<') && (NXT(1) == '!') && + (NXT(2) == '-') && (NXT(3) == '-')) { + htmlParseComment(ctxt); + } + + /* + * Second case : a sub-element. + */ + else if (CUR == '<') { + htmlParseElement(ctxt); + } + + /* + * Third case : a reference. If if has not been resolved, + * parsing returns it's Name, create the node + */ + else if (CUR == '&') { + htmlParseReference(ctxt); + } + + /* + * Fourth : end of the resource + */ + else if (CUR == 0) { + htmlAutoClose(ctxt, NULL); + } + + /* + * Last case, text. Note that References are handled directly. 
+ */ + else { + htmlParseCharData(ctxt, 0); + } + + if (cons == ctxt->nbChars) { + if (ctxt->node != NULL) { + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "detected an error in element content\n"); + ctxt->wellFormed = 0; + } + break; + } + } GROW; } if (currentNode != NULL) xmlFree(currentNode); @@ -3739,6 +3810,8 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { fprintf(stderr, "HPP: try EPILOG\n");break; case XML_PARSER_PI: fprintf(stderr, "HPP: try PI\n");break; + case XML_PARSER_SYSTEM_LITERAL: + fprintf(stderr, "HPP: try SYSTEM_LITERAL\n");break; } #endif @@ -4105,75 +4178,94 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { cur = in->cur[0]; next = in->cur[1]; cons = ctxt->nbChars; - /* - * Sometimes DOCTYPE arrives in the middle of the document - */ - if ((cur == '<') && (next == '!') && - (UPP(2) == 'D') && (UPP(3) == 'O') && - (UPP(4) == 'C') && (UPP(5) == 'T') && - (UPP(6) == 'Y') && (UPP(7) == 'P') && - (UPP(8) == 'E')) { - if ((!terminate) && - (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0)) - goto done; - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "Misplaced DOCTYPE declaration\n"); - ctxt->wellFormed = 0; - htmlParseDocTypeDecl(ctxt); - } else if ((cur == '<') && (next == '!') && - (in->cur[2] == '-') && (in->cur[3] == '-')) { - if ((!terminate) && - (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0)) - goto done; -#ifdef DEBUG_PUSH - fprintf(stderr, "HPP: Parsing Comment\n"); -#endif - htmlParseComment(ctxt); - ctxt->instate = XML_PARSER_CONTENT; - } else if ((cur == '<') && (next == '!') && (avail < 4)) { - goto done; - } else if ((cur == '<') && (next == '/')) { - ctxt->instate = XML_PARSER_END_TAG; - ctxt->checkIndex = 0; -#ifdef DEBUG_PUSH - fprintf(stderr, "HPP: entering END_TAG\n"); -#endif - break; - } else if (cur == '<') { - ctxt->instate = XML_PARSER_START_TAG; - ctxt->checkIndex = 0; -#ifdef DEBUG_PUSH - 
fprintf(stderr, "HPP: entering START_TAG\n"); -#endif - break; - } else if (cur == '&') { - if ((!terminate) && - (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0)) - goto done; -#ifdef DEBUG_PUSH - fprintf(stderr, "HPP: Parsing Reference\n"); -#endif - /* TODO: check generation of subtrees if noent !!! */ - htmlParseReference(ctxt); - } else { - /* TODO Avoid the extra copy, handle directly !!!!!! */ + if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) || + (xmlStrEqual(ctxt->name, BAD_CAST"style"))) { /* - * Goal of the following test is : - * - minimize calls to the SAX 'character' callback - * when they are mergeable + * Handle SCRIPT/STYLE separately */ - if ((ctxt->inputNr == 1) && - (avail < HTML_PARSER_BIG_BUFFER_SIZE)) { - if ((!terminate) && - (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0)) - goto done; - } - ctxt->checkIndex = 0; + if ((!terminate) && + (htmlParseLookupSequence(ctxt, '<', '/', 0) < 0)) + goto done; + htmlParseScript(ctxt); + if ((cur == '<') && (next == '/')) { + ctxt->instate = XML_PARSER_END_TAG; + ctxt->checkIndex = 0; #ifdef DEBUG_PUSH - fprintf(stderr, "HPP: Parsing char data\n"); + fprintf(stderr, "HPP: entering END_TAG\n"); #endif - htmlParseCharData(ctxt, 0); + break; + } + } else { + /* + * Sometimes DOCTYPE arrives in the middle of the document + */ + if ((cur == '<') && (next == '!') && + (UPP(2) == 'D') && (UPP(3) == 'O') && + (UPP(4) == 'C') && (UPP(5) == 'T') && + (UPP(6) == 'Y') && (UPP(7) == 'P') && + (UPP(8) == 'E')) { + if ((!terminate) && + (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0)) + goto done; + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "Misplaced DOCTYPE declaration\n"); + ctxt->wellFormed = 0; + htmlParseDocTypeDecl(ctxt); + } else if ((cur == '<') && (next == '!') && + (in->cur[2] == '-') && (in->cur[3] == '-')) { + if ((!terminate) && + (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0)) + goto done; +#ifdef DEBUG_PUSH + fprintf(stderr, "HPP: Parsing Comment\n"); 
+#endif + htmlParseComment(ctxt); + ctxt->instate = XML_PARSER_CONTENT; + } else if ((cur == '<') && (next == '!') && (avail < 4)) { + goto done; + } else if ((cur == '<') && (next == '/')) { + ctxt->instate = XML_PARSER_END_TAG; + ctxt->checkIndex = 0; +#ifdef DEBUG_PUSH + fprintf(stderr, "HPP: entering END_TAG\n"); +#endif + break; + } else if (cur == '<') { + ctxt->instate = XML_PARSER_START_TAG; + ctxt->checkIndex = 0; +#ifdef DEBUG_PUSH + fprintf(stderr, "HPP: entering START_TAG\n"); +#endif + break; + } else if (cur == '&') { + if ((!terminate) && + (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0)) + goto done; +#ifdef DEBUG_PUSH + fprintf(stderr, "HPP: Parsing Reference\n"); +#endif + /* TODO: check generation of subtrees if noent !!! */ + htmlParseReference(ctxt); + } else { + /* TODO Avoid the extra copy, handle directly !!!!!! */ + /* + * Goal of the following test is : + * - minimize calls to the SAX 'character' callback + * when they are mergeable + */ + if ((ctxt->inputNr == 1) && + (avail < HTML_PARSER_BIG_BUFFER_SIZE)) { + if ((!terminate) && + (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0)) + goto done; + } + ctxt->checkIndex = 0; +#ifdef DEBUG_PUSH + fprintf(stderr, "HPP: Parsing char data\n"); +#endif + htmlParseCharData(ctxt, 0); + } } if (cons == ctxt->nbChars) { if (ctxt->node != NULL) { diff --git a/HTMLtree.c b/HTMLtree.c index 9ce68971..c8e8a646 100644 --- a/HTMLtree.c +++ b/HTMLtree.c @@ -818,6 +818,16 @@ htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur, const xmlOutputBufferWriteString(buf, ";"); return; } + if (cur->type == HTML_PRESERVE_NODE) { + if (cur->content != NULL) { +#ifndef XML_USE_BUFFER_CONTENT + xmlOutputBufferWriteString(buf, (const char *)cur->content); +#else + xmlOutputBufferWriteString(buf, xmlBufferContent(cur->content)); +#endif + } + return; + } /* * Get specific HTmL info for taht node. 
diff --git a/HTMLtree.h b/HTMLtree.h index 17043b78..2b5331c6 100644 --- a/HTMLtree.h +++ b/HTMLtree.h @@ -22,6 +22,7 @@ extern "C" { #define HTML_TEXT_NODE XML_TEXT_NODE #define HTML_ENTITY_REF_NODE XML_ENTITY_REF_NODE #define HTML_COMMENT_NODE XML_COMMENT_NODE +#define HTML_PRESERVE_NODE XML_CDATA_SECTION_NODE htmlDocPtr htmlNewDoc (const xmlChar *URI, const xmlChar *ExternalID); diff --git a/SAX.c b/SAX.c index c53a10f8..25a0191e 100644 --- a/SAX.c +++ b/SAX.c @@ -1600,7 +1600,7 @@ xmlSAXHandler htmlDefaultSAXHandler = { xmlParserError, xmlParserError, getParameterEntity, - NULL, + cdataBlock, NULL, }; @@ -1632,7 +1632,7 @@ htmlDefaultSAXHandlerInit(void) htmlDefaultSAXHandler.endElement = endElement; htmlDefaultSAXHandler.reference = NULL; htmlDefaultSAXHandler.characters = characters; - htmlDefaultSAXHandler.cdataBlock = NULL; + htmlDefaultSAXHandler.cdataBlock = cdataBlock; htmlDefaultSAXHandler.ignorableWhitespace = ignorableWhitespace; htmlDefaultSAXHandler.processingInstruction = NULL; htmlDefaultSAXHandler.comment = comment; diff --git a/include/libxml/HTMLtree.h b/include/libxml/HTMLtree.h index 17043b78..2b5331c6 100644 --- a/include/libxml/HTMLtree.h +++ b/include/libxml/HTMLtree.h @@ -22,6 +22,7 @@ extern "C" { #define HTML_TEXT_NODE XML_TEXT_NODE #define HTML_ENTITY_REF_NODE XML_ENTITY_REF_NODE #define HTML_COMMENT_NODE XML_COMMENT_NODE +#define HTML_PRESERVE_NODE XML_CDATA_SECTION_NODE htmlDocPtr htmlNewDoc (const xmlChar *URI, const xmlChar *ExternalID); diff --git a/result/HTML/doc2.htm.err b/result/HTML/doc2.htm.err index bf46ffad..d098b471 100644 --- a/result/HTML/doc2.htm.err +++ b/result/HTML/doc2.htm.err @@ -1,3 +1,3 @@ -./test/HTML/doc2.htm:10: error: Misplaced DOCTYPE declaration +./test/HTML/doc2.htm:5: error: Misplaced DOCTYPE declaration + // --> + diff --git a/result/HTML/doc3.htm.err b/result/HTML/doc3.htm.err index 51266e19..5a57449d 100644 --- a/result/HTML/doc3.htm.err +++ b/result/HTML/doc3.htm.err @@ -1,93 +1,105 @@ 
-./test/HTML/doc3.htm:10: error: Misplaced DOCTYPE declaration +./test/HTML/doc3.htm:5: error: Misplaced DOCTYPE declaration






Code:BP6-hd

Code:BP6-hd

 

^ -./test/HTML/doc3.htm:839: error: Unexpected end tag : center +./test/HTML/doc3.htm:834: error: Unexpected end tag : center width="100%"> 

^ -./test/HTML/doc3.htm:839: error: Unexpected end tag : tr +./test/HTML/doc3.htm:834: error: Unexpected end tag : tr width="100%"> 

^ -./test/HTML/doc3.htm:839: error: Unexpected end tag : tbody +./test/HTML/doc3.htm:834: error: Unexpected end tag : tbody width="100%"> 

^ -./test/HTML/doc3.htm:839: error: Unexpected end tag : table +./test/HTML/doc3.htm:834: error: Unexpected end tag : table width="100%"> 

^ -./test/HTML/doc3.htm:840: error: Unexpected end tag : td +./test/HTML/doc3.htm:835: error: Unexpected end tag : td
  ^ diff --git a/result/HTML/doc3.htm.sax b/result/HTML/doc3.htm.sax index ff8f186a..0cd7df16 100644 --- a/result/HTML/doc3.htm.sax +++ b/result/HTML/doc3.htm.sax @@ -11,7 +11,7 @@ SAX.endElement(title) SAX.ignorableWhitespace( , 2) SAX.startElement(script, language='javascript') -SAX.characters( +SAX.cdata( NS_ActualOpen=wind, 199) SAX.endElement(script) SAX.ignorableWhitespace( @@ -31,38 +31,15 @@ SAX.endElement(meta) SAX.ignorableWhitespace( , 2) SAX.startElement(style, type='text/css') -SAX.characters(A.nav { +SAX.cdata(A.nav { COLOR: #003399; TEXT, 115) SAX.endElement(style) SAX.ignorableWhitespace( , 4) SAX.startElement(script, language='JavaScript') -SAX.characters( -, 1) -SAX.comment( Idea by: Nic Wolfe (Nic@TimelapseProductions.com) ) -SAX.characters( -, 1) -SAX.comment( Web URL: http://fineline.xs.mw ) -SAX.characters( - -, 2) -SAX.comment( This script and many more are available free online at ) -SAX.characters( -, 1) -SAX.comment( The JavaScript Source!! http://javascript.internet.com ) -SAX.characters( - -, 2) -SAX.comment( Begin -function popUp(URL) { -day = new Date(); -id = day.getTime(); -eval("page" + id + " = window.open(URL, '" + id + "', 'toolbars=0, scrollbars=0, location=0, statusbars=0, menubars=0, resizable=0, width=145, height=250');"); -} -// End ) -SAX.characters( -, 1) +SAX.cdata( +<!-- Idea by: Nic Wolfe (, 476) SAX.endElement(script) SAX.ignorableWhitespace( @@ -2717,38 +2694,20 @@ SAX.comment( BEGIN GoTo.com Search Box ) SAX.characters( , 14) SAX.startElement(script, language='javascript', type='text/javascript') -SAX.characters( - , 9) -SAX.comment( - if ((parseInt(navigator.appVersion) >= 3) - && (navigator.appName != "Netscape")) { - document.write(""); - } else if ((parseInt(navigator.appVersion) > 3) - && (navigator.appName == "Netscape")) { - document.write(" + + + diff --git a/result/HTML/script.html.err b/result/HTML/script.html.err new file mode 100644 index 00000000..e69de29b diff --git a/result/HTML/script.html.sax 
b/result/HTML/script.html.sax new file mode 100644 index 00000000..3a470619 --- /dev/null +++ b/result/HTML/script.html.sax @@ -0,0 +1,32 @@ +SAX.setDocumentLocator() +SAX.startDocument() +SAX.startElement(html) +SAX.ignorableWhitespace( +, 1) +SAX.startElement(head) +SAX.startElement(title) +SAX.characters(Script tests, 12) +SAX.endElement(title) +SAX.endElement(head) +SAX.ignorableWhitespace( +, 1) +SAX.startElement(body) +SAX.ignorableWhitespace( +, 1) +SAX.startElement(script, language='javascript') +SAX.cdata( + if (window.open<max) ;, 28) +SAX.endElement(script) +SAX.ignorableWhitespace( +, 1) +SAX.startElement(input, onclick='if(window.open<max);') +SAX.endElement(input) +SAX.ignorableWhitespace( +, 1) +SAX.endElement(body) +SAX.ignorableWhitespace( +, 1) +SAX.endElement(html) +SAX.ignorableWhitespace( +, 1) +SAX.endDocument() diff --git a/test/HTML/script.html b/test/HTML/script.html new file mode 100644 index 00000000..197b002a --- /dev/null +++ b/test/HTML/script.html @@ -0,0 +1,9 @@ + +Script tests + + + + + diff --git a/testHTML.c b/testHTML.c index 1aff9670..541620e3 100644 --- a/testHTML.c +++ b/testHTML.c @@ -420,6 +420,27 @@ charactersDebug(void *ctx, const xmlChar *ch, int len) fprintf(stdout, "SAX.characters(%s, %d)\n", output, len); } +/** + * cdataDebug: + * @ctxt: An XML parser context + * @ch: a xmlChar string + * @len: the number of xmlChar + * + * receiving some cdata chars from the parser. + * Question: how much at a time ??? 
+ */ +void +cdataDebug(void *ctx, const xmlChar *ch, int len) +{ + unsigned char output[40]; + int inlen = len, outlen = 30; + + htmlEncodeEntities(output, &outlen, ch, &inlen, 0); + output[outlen] = 0; + + fprintf(stdout, "SAX.cdata(%s, %d)\n", output, len); +} + /** * referenceDebug: * @ctxt: An XML parser context @@ -572,6 +593,8 @@ xmlSAXHandler debugSAXHandlerStruct = { errorDebug, fatalErrorDebug, getParameterEntityDebug, + cdataDebug, + NULL }; xmlSAXHandlerPtr debugSAXHandler = &debugSAXHandlerStruct; diff --git a/tree.c b/tree.c index ee3584b6..ae841c8c 100644 --- a/tree.c +++ b/tree.c @@ -495,12 +495,8 @@ xmlDocPtr xmlNewDoc(const xmlChar *version) { xmlDocPtr cur; - if (version == NULL) { -#ifdef DEBUG_TREE - fprintf(stderr, "xmlNewDoc : version == NULL\n"); -#endif - return(NULL); - } + if (version == NULL) + version = (const xmlChar *) "1.0"; /* * Allocate a new document and fill the fields.