Non-recursive HTML parser: node info stack plus internal element/content
parsing routines.

Review notes for this revision of the patch:
 - htmlNodeInfoPush: the result of xmlRealloc goes through a temporary and
   nodeInfoMax is only updated on success, so a failed allocation no longer
   leaks the table or leaves nodeInfoTab NULL with nodeInfoNr > 0.
 - htmlNodeInfoPop: removed the unreachable "nodeInfoNr < 0" test.
 - htmlParserFinishElementParsing: does nothing with the node info when
   ctxt->nodeInfo is NULL (empty stack) instead of dereferencing it.
 - htmlParseElementInternal: removed the unused locals depth and oldptr.
 - htmlInitParserCtxt: ctxt->nodeInfo is reset together with the stack.
 - Comment fixes; hunk offsets and counts recomputed accordingly.
 - Leading whitespace of context lines was reconstructed; apply with
   --ignore-whitespace if the context does not match exactly. The
   post-image blob id on the HTMLparser.c index line predates these changes.

diff --git a/HTMLparser.c b/HTMLparser.c
index 2b569ca9..a3fbb7f2 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -209,6 +209,67 @@ htmlnamePop(htmlParserCtxtPtr ctxt)
     return (ret);
 }
 
+/**
+ * htmlNodeInfoPush:
+ * @ctxt:  an HTML parser context
+ * @value:  the node info
+ *
+ * Pushes a copy of @value on top of the node info stack, growing the
+ * stack when it is full, and makes ctxt->nodeInfo point at the new top.
+ * If growing fails, the existing stack is left untouched.
+ *
+ * Returns 0 in case of error, the index in the stack otherwise. Since 0 is
+ * also the index of the first entry, the return value alone cannot be used
+ * to detect a failure.
+ */
+static int
+htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
+{
+    if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
+        htmlParserNodeInfo *tmp;
+        int newMax;
+
+        /* the first allocation holds 10 entries, later ones double it */
+        newMax = (ctxt->nodeInfoMax == 0) ? 10 : ctxt->nodeInfoMax * 2;
+        /* keep the old table reachable in case xmlRealloc fails */
+        tmp = (htmlParserNodeInfo *)
+              xmlRealloc(ctxt->nodeInfoTab,
+                         newMax * sizeof(ctxt->nodeInfoTab[0]));
+        if (tmp == NULL) {
+            htmlErrMemory(ctxt, NULL);
+            return (0);
+        }
+        ctxt->nodeInfoTab = tmp;
+        ctxt->nodeInfoMax = newMax;
+    }
+    ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
+    ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
+    return (ctxt->nodeInfoNr++);
+}
+
+/**
+ * htmlNodeInfoPop:
+ * @ctxt:  an HTML parser context
+ *
+ * Pops the top entry from the node info stack and makes ctxt->nodeInfo
+ * point at the new top, or NULL when the stack becomes empty.
+ *
+ * Returns NULL if the stack is empty, the pointer to the popped NodeInfo
+ * otherwise
+ */
+static htmlParserNodeInfo *
+htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
+{
+    if (ctxt->nodeInfoNr <= 0)
+        return (NULL);
+    ctxt->nodeInfoNr--;
+    if (ctxt->nodeInfoNr > 0)
+        ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
+    else
+        ctxt->nodeInfo = NULL;
+    return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
+}
+
 /*
  * Macros for accessing the content. Those should be used only by the parser,
  * and not exported.
@@ -3927,6 +3988,7 @@ htmlParseReference(htmlParserCtxtPtr ctxt) {
  * @ctxt:  an HTML parser context
  *
  * Parse a content: comment, sub-element, reference or text.
+ * Kept for compatibility with old code
  */
 
 static void
@@ -4074,24 +4136,12 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {
     if (currentNode != NULL) xmlFree(currentNode);
 }
 
-/**
- * htmlParseContent:
- * @ctxt:  an HTML parser context
- *
- * Parse a content: comment, sub-element, reference or text.
- */
-
-void
-__htmlParseContent(void *ctxt) {
-    if (ctxt != NULL)
-        htmlParseContent((htmlParserCtxtPtr) ctxt);
-}
-
 /**
  * htmlParseElement:
  * @ctxt:  an HTML parser context
  *
  * parse an HTML element, this is highly recursive
+ * this is kept for compatibility with previous code versions
  *
  * [39] element ::= EmptyElemTag | STag content ETag
  *
@@ -4219,6 +4269,314 @@ htmlParseElement(htmlParserCtxtPtr ctxt) {
         xmlFree(currentNode);
 }
 
+/**
+ * htmlParserFinishElementParsing:
+ * @ctxt:  an HTML parser context
+ *
+ * Finish the parsing of the current element: when node info recording is
+ * enabled, store the end position in the top node info entry, register it
+ * and pop it. Then call htmlAutoCloseOnEnd() when the current character
+ * is not a valid one.
+ */
+static void
+htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
+    /*
+     * Capture end position and add node. ctxt->nodeInfo is NULL when the
+     * node info stack is empty, in which case there is nothing to record.
+     */
+    if ((ctxt->node != NULL) && (ctxt->record_info) &&
+        (ctxt->nodeInfo != NULL)) {
+        ctxt->nodeInfo->end_pos = ctxt->input->consumed +
+                                  (CUR_PTR - ctxt->input->base);
+        ctxt->nodeInfo->end_line = ctxt->input->line;
+        ctxt->nodeInfo->node = ctxt->node;
+        xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
+        htmlNodeInfoPop(ctxt);
+    }
+    if (!IS_CHAR_CH(CUR)) {
+        htmlAutoCloseOnEnd(ctxt);
+    }
+}
+
+/**
+ * htmlParseElementInternal:
+ * @ctxt:  an HTML parser context
+ *
+ * parse an HTML element, new version, non recursive: only the start tag
+ * is parsed here, the content and the end tag are handled by the loop in
+ * htmlParseContentInternal()
+ *
+ * [39] element ::= EmptyElemTag | STag content ETag
+ *
+ * [41] Attribute ::= Name Eq AttValue
+ */
+
+static void
+htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
+    const xmlChar *name;
+    const htmlElemDesc * info;
+    htmlParserNodeInfo node_info;
+    int failed;
+
+    if ((ctxt == NULL) || (ctxt->input == NULL)) {
+        htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
+                     "htmlParseElementInternal: context error\n", NULL, NULL);
+        return;
+    }
+
+    if (ctxt->instate == XML_PARSER_EOF)
+        return;
+
+    /* Capture start position */
+    if (ctxt->record_info) {
+        node_info.begin_pos = ctxt->input->consumed +
+                              (CUR_PTR - ctxt->input->base);
+        node_info.begin_line = ctxt->input->line;
+    }
+
+    failed = htmlParseStartTag(ctxt);
+    name = ctxt->name;
+    if ((failed == -1) || (name == NULL)) {
+        if (CUR == '>')
+            NEXT;
+        return;
+    }
+
+    /*
+     * Lookup the info for that element.
+     */
+    info = htmlTagLookup(name);
+    if (info == NULL) {
+        htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
+                     "Tag %s invalid\n", name, NULL);
+    }
+
+    /*
+     * Check for an Empty Element labeled the XML/SGML way
+     */
+    if ((CUR == '/') && (NXT(1) == '>')) {
+        SKIP(2);
+        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
+            ctxt->sax->endElement(ctxt->userData, name);
+        htmlnamePop(ctxt);
+        return;
+    }
+
+    if (CUR == '>') {
+        NEXT;
+    } else {
+        htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
+                     "Couldn't find end of Start Tag %s\n", name, NULL);
+
+        /*
+         * end of parsing of this node.
+         */
+        if (xmlStrEqual(name, ctxt->name)) {
+            nodePop(ctxt);
+            htmlnamePop(ctxt);
+        }
+
+        if (ctxt->record_info)
+            htmlNodeInfoPush(ctxt, &node_info);
+        htmlParserFinishElementParsing(ctxt);
+        return;
+    }
+
+    /*
+     * Check for an Empty Element from DTD definition
+     */
+    if ((info != NULL) && (info->empty)) {
+        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
+            ctxt->sax->endElement(ctxt->userData, name);
+        htmlnamePop(ctxt);
+        return;
+    }
+
+    if (ctxt->record_info)
+        htmlNodeInfoPush(ctxt, &node_info);
+}
+
+/**
+ * htmlParseContentInternal:
+ * @ctxt:  an HTML parser context
+ *
+ * Parse a content: comment, sub-element, reference or text.
+ * New version for non recursive htmlParseElementInternal
+ */
+
+static void
+htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
+    xmlChar *currentNode;
+    int depth;
+    const xmlChar *name;
+
+    currentNode = xmlStrdup(ctxt->name);
+    depth = ctxt->nameNr;
+    while (1) {
+        long cons = ctxt->nbChars;
+
+        GROW;
+
+        if (ctxt->instate == XML_PARSER_EOF)
+            break;
+
+        /*
+         * Our tag or one of its parents or children is ending.
+         */
+        if ((CUR == '<') && (NXT(1) == '/')) {
+            if (htmlParseEndTag(ctxt) &&
+                ((currentNode != NULL) || (ctxt->nameNr == 0))) {
+                if (currentNode != NULL)
+                    xmlFree(currentNode);
+
+                currentNode = xmlStrdup(ctxt->name);
+                depth = ctxt->nameNr;
+            }
+            continue; /* while */
+        }
+
+        else if ((CUR == '<') &&
+                 ((IS_ASCII_LETTER(NXT(1))) ||
+                  (NXT(1) == '_') || (NXT(1) == ':'))) {
+            name = htmlParseHTMLName_nonInvasive(ctxt);
+            if (name == NULL) {
+                htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
+                             "htmlParseStartTag: invalid element name\n",
+                             NULL, NULL);
+                /* Dump the bogus tag like browsers do */
+                while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
+                    NEXT;
+
+                htmlParserFinishElementParsing(ctxt);
+                if (currentNode != NULL)
+                    xmlFree(currentNode);
+
+                currentNode = xmlStrdup(ctxt->name);
+                depth = ctxt->nameNr;
+                continue;
+            }
+
+            if (ctxt->name != NULL) {
+                if (htmlCheckAutoClose(name, ctxt->name) == 1) {
+                    htmlAutoClose(ctxt, name);
+                    continue;
+                }
+            }
+        }
+
+        /*
+         * Has this node been popped out during parsing of
+         * the next element
+         */
+        if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
+            (!xmlStrEqual(currentNode, ctxt->name)))
+        {
+            htmlParserFinishElementParsing(ctxt);
+            if (currentNode != NULL) xmlFree(currentNode);
+
+            currentNode = xmlStrdup(ctxt->name);
+            depth = ctxt->nameNr;
+            continue;
+        }
+
+        if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
+            (xmlStrEqual(currentNode, BAD_CAST"style")))) {
+            /*
+             * Handle SCRIPT/STYLE separately
+             */
+            htmlParseScript(ctxt);
+        } else {
+            /*
+             * Sometimes DOCTYPE arrives in the middle of the document
+             */
+            if ((CUR == '<') && (NXT(1) == '!') &&
+                (UPP(2) == 'D') && (UPP(3) == 'O') &&
+                (UPP(4) == 'C') && (UPP(5) == 'T') &&
+                (UPP(6) == 'Y') && (UPP(7) == 'P') &&
+                (UPP(8) == 'E')) {
+                htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
+                             "Misplaced DOCTYPE declaration\n",
+                             BAD_CAST "DOCTYPE" , NULL);
+                htmlParseDocTypeDecl(ctxt);
+            }
+
+            /*
+             * First case : a comment
+             */
+            if ((CUR == '<') && (NXT(1) == '!') &&
+                (NXT(2) == '-') && (NXT(3) == '-')) {
+                htmlParseComment(ctxt);
+            }
+
+            /*
+             * Second case : a Processing Instruction.
+             */
+            else if ((CUR == '<') && (NXT(1) == '?')) {
+                htmlParsePI(ctxt);
+            }
+
+            /*
+             * Third case : a sub-element.
+             */
+            else if (CUR == '<') {
+                htmlParseElementInternal(ctxt);
+                if (currentNode != NULL) xmlFree(currentNode);
+
+                currentNode = xmlStrdup(ctxt->name);
+                depth = ctxt->nameNr;
+            }
+
+            /*
+             * Fourth case : a reference. If it has not been resolved,
+             *    parsing returns its Name, create the node
+             */
+            else if (CUR == '&') {
+                htmlParseReference(ctxt);
+            }
+
+            /*
+             * Fifth case : end of the resource
+             */
+            else if (CUR == 0) {
+                htmlAutoCloseOnEnd(ctxt);
+                break;
+            }
+
+            /*
+             * Last case, text. Note that References are handled directly.
+             */
+            else {
+                htmlParseCharData(ctxt);
+            }
+
+            if (cons == ctxt->nbChars) {
+                if (ctxt->node != NULL) {
+                    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
+                                 "detected an error in element content\n",
+                                 NULL, NULL);
+                }
+                break;
+            }
+        }
+        GROW;
+    }
+    if (currentNode != NULL) xmlFree(currentNode);
+}
+
+/**
+ * __htmlParseContent:
+ * @ctxt:  an HTML parser context
+ *
+ * Parse a content: comment, sub-element, reference or text.
+ * This is the entry point when called from parser.c
+ */
+
+void
+__htmlParseContent(void *ctxt) {
+    if (ctxt != NULL)
+        htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
+}
+
 /**
  * htmlParseDocument:
  * @ctxt:  an HTML parser context
@@ -4323,7 +4681,7 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) {
     /*
      * Time to start parsing the tree itself
      */
-    htmlParseContent(ctxt);
+    htmlParseContentInternal(ctxt);
 
     /*
      * autoclose
@@ -4440,6 +4798,11 @@ htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
     ctxt->nameMax = 10;
     ctxt->name = NULL;
 
+    ctxt->nodeInfo = NULL;
+    ctxt->nodeInfoTab = NULL;
+    ctxt->nodeInfoNr = 0;
+    ctxt->nodeInfoMax = 0;
+
     if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
     else {
         ctxt->sax = sax;
diff --git a/include/libxml/parser.h b/include/libxml/parser.h
index 148ee038..dd79c426 100644
--- a/include/libxml/parser.h
+++ b/include/libxml/parser.h
@@ -302,6 +302,12 @@ struct _xmlParserCtxt {
     xmlParserMode     parseMode;    /* the parser mode */
     unsigned long    nbentities;    /* number of entities references */
     unsigned long  sizeentities;    /* size of parsed entities */
+
+    /* for use by HTML non-recursive parser */
+    xmlParserNodeInfo *nodeInfo;      /* Current NodeInfo */
+    int                nodeInfoNr;    /* Depth of the parsing stack */
+    int                nodeInfoMax;   /* Max depth of the parsing stack */
+    xmlParserNodeInfo *nodeInfoTab;   /* array of nodeInfos */
 };
 
 /**
diff --git a/parserInternals.c b/parserInternals.c
index ff204350..2404ddfc 100644
--- a/parserInternals.c
+++ b/parserInternals.c
@@ -1782,6 +1782,7 @@ xmlFreeParserCtxt(xmlParserCtxtPtr ctxt)
     if (ctxt->spaceTab != NULL) xmlFree(ctxt->spaceTab);
     if (ctxt->nameTab != NULL) xmlFree((xmlChar * *)ctxt->nameTab);
     if (ctxt->nodeTab != NULL) xmlFree(ctxt->nodeTab);
+    if (ctxt->nodeInfoTab != NULL) xmlFree(ctxt->nodeInfoTab);
     if (ctxt->inputTab != NULL) xmlFree(ctxt->inputTab);
     if (ctxt->version != NULL) xmlFree((char *) ctxt->version);
     if (ctxt->encoding != NULL) xmlFree((char *) ctxt->encoding);