From 86d6b9b051c0c21e7ac0038b02d0eed22439e187 Mon Sep 17 00:00:00 2001 From: Nick Wellnhofer Date: Sat, 7 Sep 2024 04:18:06 +0200 Subject: [PATCH] html: Deduplicate some code --- HTMLparser.c | 509 ++++++++++++++------------------------------------- 1 file changed, 134 insertions(+), 375 deletions(-) diff --git a/HTMLparser.c b/HTMLparser.c index 04ab503e..436dc700 100644 --- a/HTMLparser.c +++ b/HTMLparser.c @@ -42,6 +42,9 @@ static int htmlOmittedDefaultValue = 1; static void htmlParseComment(htmlParserCtxtPtr ctxt); +static int +htmlParseElementInternal(htmlParserCtxtPtr ctxt); + /************************************************************************ * * * Some factorized error routines * @@ -4264,111 +4267,154 @@ htmlParseReference(htmlParserCtxtPtr ctxt) { * @ctxt: an HTML parser context * * Parse a content: comment, sub-element, reference or text. - * Kept for compatibility with old code + * New version for non recursive htmlParseElementInternal */ static void htmlParseContent(htmlParserCtxtPtr ctxt) { - while (!PARSER_STOPPED(ctxt)) { + while (PARSER_STOPPED(ctxt) == 0) { int mode; GROW; - - /* - * Handle character data states first - */ mode = ctxt->endCheckState; - if ((mode != 0) && (CUR != 0)) { - if ((CUR == '&') && (mode == DATA_RCDATA)) { - htmlParseReference(ctxt); - } - else { - htmlParseCharData(ctxt, /* terminate */ 1); - } - goto done; - } - /* - * Our tag or one of it's parent or children is ending. - */ - if ((CUR == '<') && (NXT(1) == '/')) { - htmlParseEndTag(ctxt); - continue; /* while */ - } - - if ((CUR == '<') && (NXT(1) == '!')) { - /* - * Sometimes DOCTYPE arrives in the middle of the document - */ - if ((UPP(2) == 'D') && (UPP(3) == 'O') && - (UPP(4) == 'C') && (UPP(5) == 'T') && - (UPP(6) == 'Y') && (UPP(7) == 'P') && - (UPP(8) == 'E')) { - htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, - "Misplaced DOCTYPE declaration\n", - BAD_CAST "DOCTYPE" , NULL); - htmlParseDocTypeDecl(ctxt); + if ((mode == 0) && (CUR == '<')) { + if (NXT(1) == '/') { + htmlParseEndTag(ctxt); + } else if (NXT(1) == '!') { + /* + * Sometimes DOCTYPE arrives in the middle of the document + */ + if ((UPP(2) == 'D') && (UPP(3) == 'O') && + (UPP(4) == 'C') && (UPP(5) == 'T') && + (UPP(6) == 'Y') && (UPP(7) == 'P') && + (UPP(8) == 'E')) { + htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, + "Misplaced DOCTYPE declaration\n", + BAD_CAST "DOCTYPE" , NULL); + htmlParseDocTypeDecl(ctxt); + } else if ((NXT(2) == '-') && (NXT(3) == '-')) { + htmlParseComment(ctxt); + } else { + htmlSkipBogusComment(ctxt); + } + } else if (NXT(1) == '?') { + htmlParsePI(ctxt); + } else if (IS_ASCII_LETTER(NXT(1))) { + htmlParseElementInternal(ctxt); + } else { + if ((ctxt->sax != NULL) && (!ctxt->disableSAX) && + (ctxt->sax->characters != NULL)) + ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1); + SKIP(1); } - /* - * First case : a comment - */ - else if ((NXT(2) == '-') && (NXT(3) == '-')) { - htmlParseComment(ctxt); - } - else { - htmlSkipBogusComment(ctxt); - } - } - - /* - * Second case : a Processing Instruction. - */ - else if ((CUR == '<') && (NXT(1) == '?')) { - htmlParsePI(ctxt); - } - - /* - * Third case : a sub-element. - */ - else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) { - htmlParseElement(ctxt); - } - else if (CUR == '<') { - if ((ctxt->sax != NULL) && (!ctxt->disableSAX) && - (ctxt->sax->characters != NULL)) - ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1); - SKIP(1); - } - - /* - * Fourth case : a reference. If if has not been resolved, - * parsing returns it's Name, create the node - */ - else if (CUR == '&') { + } else if ((CUR == '&') && ((mode == 0) || (mode == DATA_RCDATA))) { htmlParseReference(ctxt); - } - - /* - * Fifth case : end of the resource - */ - else if (CUR == 0) { + } else if (CUR == 0) { htmlAutoCloseOnEnd(ctxt); break; - } - - /* - * Last case, text. Note that References are handled directly. - */ - else { + } else { htmlParseCharData(ctxt, /* terminate */ 1); } -done: SHRINK; GROW; } } +/** + * htmlParseElementInternal: + * @ctxt: an HTML parser context + * + * parse an HTML element, new version, non recursive + * + * [39] element ::= EmptyElemTag | STag content ETag + * + * [41] Attribute ::= Name Eq AttValue + */ + +static int +htmlParseElementInternal(htmlParserCtxtPtr ctxt) { + const xmlChar *name; + const htmlElemDesc * info; + htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 }; + int failed; + + if ((ctxt == NULL) || (ctxt->input == NULL)) + return(0); + + /* Capture start position */ + if (ctxt->record_info) { + node_info.begin_pos = ctxt->input->consumed + + (CUR_PTR - ctxt->input->base); + node_info.begin_line = ctxt->input->line; + } + + failed = htmlParseStartTag(ctxt); + name = ctxt->name; + if ((failed == -1) || (name == NULL)) { + if (CUR == '>') + SKIP(1); + return(0); + } + + /* + * Lookup the info for that element. + */ + info = htmlTagLookup(name); + if (info == NULL) { + htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG, + "Tag %s invalid\n", name, NULL); + } else { + ctxt->endCheckState = info->dataMode; + } + + if (ctxt->record_info) + htmlNodeInfoPush(ctxt, &node_info); + + /* + * Check for an Empty Element labeled the XML/SGML way + */ + if ((CUR == '/') && (NXT(1) == '>')) { + SKIP(2); + htmlParserFinishElementParsing(ctxt); + if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) + ctxt->sax->endElement(ctxt->userData, name); + htmlnamePop(ctxt); + return(0); + } + + if (CUR == '>') { + SKIP(1); + } else { + htmlParseErr(ctxt, XML_ERR_GT_REQUIRED, + "Couldn't find end of Start Tag %s\n", name, NULL); + + /* + * end of parsing of this node. + */ + if (xmlStrEqual(name, ctxt->name)) { + htmlParserFinishElementParsing(ctxt); + nodePop(ctxt); + htmlnamePop(ctxt); + } + return(0); + } + + /* + * Check for an Empty Element from DTD definition + */ + if ((info != NULL) && (info->empty)) { + htmlParserFinishElementParsing(ctxt); + if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) + ctxt->sax->endElement(ctxt->userData, name); + htmlnamePop(ctxt); + return(0); + } + + return(1); +} + /** * htmlParseElement: * @ctxt: an HTML parser context @@ -4385,91 +4431,18 @@ done: void htmlParseElement(htmlParserCtxtPtr ctxt) { - const xmlChar *name; - xmlChar *currentNode = NULL; - const htmlElemDesc * info; - htmlParserNodeInfo node_info; - int failed; - int depth; const xmlChar *oldptr; + int depth; if ((ctxt == NULL) || (ctxt->input == NULL)) return; - /* Capture start position */ - if (ctxt->record_info) { - node_info.begin_pos = ctxt->input->consumed + - (CUR_PTR - ctxt->input->base); - node_info.begin_line = ctxt->input->line; - } - - failed = htmlParseStartTag(ctxt); - name = ctxt->name; - if ((failed == -1) || (name == NULL)) { - if (CUR == '>') - SKIP(1); + if (htmlParseElementInternal(ctxt) == 0) return; - } - - /* - * Lookup the info for that element. - */ - info = htmlTagLookup(name); - if (info == NULL) { - htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG, - "Tag %s invalid\n", name, NULL); - } else { - ctxt->endCheckState = info->dataMode; - } - - if (ctxt->record_info) - htmlNodeInfoPush(ctxt, &node_info); - - /* - * Check for an Empty Element labeled the XML/SGML way - */ - if ((CUR == '/') && (NXT(1) == '>')) { - SKIP(2); - htmlParserFinishElementParsing(ctxt); - if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) - ctxt->sax->endElement(ctxt->userData, name); - htmlnamePop(ctxt); - return; - } - - if (CUR == '>') { - SKIP(1); - } else { - htmlParseErr(ctxt, XML_ERR_GT_REQUIRED, - "Couldn't find end of Start Tag %s\n", name, NULL); - - /* - * end of parsing of this node. - */ - if (xmlStrEqual(name, ctxt->name)) { - htmlParserFinishElementParsing(ctxt); - nodePop(ctxt); - htmlnamePop(ctxt); - } - - return; - } - - /* - * Check for an Empty Element from DTD definition - */ - if ((info != NULL) && (info->empty)) { - htmlParserFinishElementParsing(ctxt); - if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) - ctxt->sax->endElement(ctxt->userData, name); - htmlnamePop(ctxt); - return; - } /* * Parse the content of the element: */ - currentNode = xmlStrdup(ctxt->name); depth = ctxt->nameNr; while (CUR != 0) { oldptr = ctxt->input->cur; @@ -4478,223 +4451,9 @@ htmlParseElement(htmlParserCtxtPtr ctxt) { if (ctxt->nameNr < depth) break; } - /* - * Capture end position and add node - */ - if ( currentNode != NULL && ctxt->record_info ) { - node_info.end_pos = ctxt->input->consumed + - (CUR_PTR - ctxt->input->base); - node_info.end_line = ctxt->input->line; - node_info.node = ctxt->node; - xmlParserAddNodeInfo(ctxt, &node_info); - } if (CUR == 0) { htmlAutoCloseOnEnd(ctxt); } - - if (currentNode != NULL) - xmlFree(currentNode); -} - -/** - * htmlParseElementInternal: - * @ctxt: an HTML parser context - * - * parse an HTML element, new version, non recursive - * - * [39] element ::= EmptyElemTag | STag content ETag - * - * [41] Attribute ::= Name Eq AttValue - */ - -static void -htmlParseElementInternal(htmlParserCtxtPtr ctxt) { - const xmlChar *name; - const htmlElemDesc * info; - htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 }; - int failed; - - if ((ctxt == NULL) || (ctxt->input == NULL)) - return; - - /* Capture start position */ - if (ctxt->record_info) { - node_info.begin_pos = ctxt->input->consumed + - (CUR_PTR - ctxt->input->base); - node_info.begin_line = ctxt->input->line; - } - - failed = htmlParseStartTag(ctxt); - name = ctxt->name; - if ((failed == -1) || (name == NULL)) { - if (CUR == '>') - SKIP(1); - return; - } - - /* - * Lookup the info for that element. - */ - info = htmlTagLookup(name); - if (info == NULL) { - htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG, - "Tag %s invalid\n", name, NULL); - } else { - ctxt->endCheckState = info->dataMode; - } - - if (ctxt->record_info) - htmlNodeInfoPush(ctxt, &node_info); - - /* - * Check for an Empty Element labeled the XML/SGML way - */ - if ((CUR == '/') && (NXT(1) == '>')) { - SKIP(2); - htmlParserFinishElementParsing(ctxt); - if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) - ctxt->sax->endElement(ctxt->userData, name); - htmlnamePop(ctxt); - return; - } - - if (CUR == '>') { - SKIP(1); - } else { - htmlParseErr(ctxt, XML_ERR_GT_REQUIRED, - "Couldn't find end of Start Tag %s\n", name, NULL); - - /* - * end of parsing of this node. - */ - if (xmlStrEqual(name, ctxt->name)) { - htmlParserFinishElementParsing(ctxt); - nodePop(ctxt); - htmlnamePop(ctxt); - } - return; - } - - /* - * Check for an Empty Element from DTD definition - */ - if ((info != NULL) && (info->empty)) { - htmlParserFinishElementParsing(ctxt); - if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) - ctxt->sax->endElement(ctxt->userData, name); - htmlnamePop(ctxt); - return; - } -} - -/** - * htmlParseContentInternal: - * @ctxt: an HTML parser context - * - * Parse a content: comment, sub-element, reference or text. - * New version for non recursive htmlParseElementInternal - */ - -static void -htmlParseContentInternal(htmlParserCtxtPtr ctxt) { - while (PARSER_STOPPED(ctxt) == 0) { - int mode; - - GROW; - - /* - * Handle character data states first - */ - mode = ctxt->endCheckState; - if ((mode != 0) && (CUR != 0)) { - if ((CUR == '&') && (mode == DATA_RCDATA)) { - htmlParseReference(ctxt); - } - else { - htmlParseCharData(ctxt, /* terminate */ 1); - } - goto done; - } - - /* - * Our tag or one of it's parent or children is ending. - */ - if ((CUR == '<') && (NXT(1) == '/')) { - htmlParseEndTag(ctxt); - continue; /* while */ - } - - if ((CUR == '<') && (NXT(1) == '!')) { - /* - * Sometimes DOCTYPE arrives in the middle of the document - */ - if ((UPP(2) == 'D') && (UPP(3) == 'O') && - (UPP(4) == 'C') && (UPP(5) == 'T') && - (UPP(6) == 'Y') && (UPP(7) == 'P') && - (UPP(8) == 'E')) { - htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, - "Misplaced DOCTYPE declaration\n", - BAD_CAST "DOCTYPE" , NULL); - htmlParseDocTypeDecl(ctxt); - } - /* - * First case : a comment - */ - else if ((NXT(2) == '-') && (NXT(3) == '-')) { - htmlParseComment(ctxt); - } - else { - htmlSkipBogusComment(ctxt); - } - } - - /* - * Second case : a Processing Instruction. - */ - else if ((CUR == '<') && (NXT(1) == '?')) { - htmlParsePI(ctxt); - } - - /* - * Third case : a sub-element. - */ - else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) { - htmlParseElementInternal(ctxt); - } - else if (CUR == '<') { - if ((ctxt->sax != NULL) && (!ctxt->disableSAX) && - (ctxt->sax->characters != NULL)) - ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1); - SKIP(1); - } - - /* - * Fourth case : a reference. If if has not been resolved, - * parsing returns it's Name, create the node - */ - else if (CUR == '&') { - htmlParseReference(ctxt); - } - - /* - * Fifth case : end of the resource - */ - else if (CUR == 0) { - htmlAutoCloseOnEnd(ctxt); - break; - } - - /* - * Last case, text. Note that References are handled directly. - */ - else { - htmlParseCharData(ctxt, /* terminate */ 1); - } - -done: - SHRINK; - GROW; - } } xmlNodePtr @@ -4717,7 +4476,7 @@ htmlCtxtParseContentInternal(htmlParserCtxtPtr ctxt, xmlParserInputPtr input) { htmlnamePush(ctxt, rootName); nodePush(ctxt, root); - htmlParseContentInternal(ctxt); + htmlParseContent(ctxt); /* TODO: Use xmlCtxtIsCatastrophicError */ if (ctxt->errNo != XML_ERR_NO_MEMORY) { @@ -4828,7 +4587,7 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) { /* * Time to start parsing the tree itself */ - htmlParseContentInternal(ctxt); + htmlParseContent(ctxt); /* * autoclose