mirror of
https://gitlab.gnome.org/GNOME/libxml2.git
synced 2025-07-19 05:43:19 +03:00
Switch the HTML parser to be non-recursive
* HTMLparser.c: new htmlParseElementInternal non recursive, with htmlParseContentInternal and new function to handle node info and element end. * include/libxml/parser.h: add new stack for element info in parser context * parserInternals.c: free element info stack
This commit is contained in:
committed by
Daniel Veillard
parent
ef9c636ac1
commit
615904f582
371
HTMLparser.c
371
HTMLparser.c
@ -209,6 +209,59 @@ htmlnamePop(htmlParserCtxtPtr ctxt)
|
||||
return (ret);
|
||||
}
|
||||
|
||||
/**
|
||||
* htmlNodeInfoPush:
|
||||
* @ctxt: an HTML parser context
|
||||
* @value: the node info
|
||||
*
|
||||
* Pushes a new element name on top of the node info stack
|
||||
*
|
||||
* Returns 0 in case of error, the index in the stack otherwise
|
||||
*/
|
||||
static int
|
||||
htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
|
||||
{
|
||||
if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
|
||||
if (ctxt->nodeInfoMax == 0)
|
||||
ctxt->nodeInfoMax = 5;
|
||||
ctxt->nodeInfoMax *= 2;
|
||||
ctxt->nodeInfoTab = (htmlParserNodeInfo *)
|
||||
xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
|
||||
ctxt->nodeInfoMax *
|
||||
sizeof(ctxt->nodeInfoTab[0]));
|
||||
if (ctxt->nodeInfoTab == NULL) {
|
||||
htmlErrMemory(ctxt, NULL);
|
||||
return (0);
|
||||
}
|
||||
}
|
||||
ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
|
||||
ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
|
||||
return (ctxt->nodeInfoNr++);
|
||||
}
|
||||
|
||||
/**
|
||||
* htmlNodeInfoPop:
|
||||
* @ctxt: an HTML parser context
|
||||
*
|
||||
* Pops the top element name from the node info stack
|
||||
*
|
||||
* Returns 0 in case of error, the pointer to NodeInfo otherwise
|
||||
*/
|
||||
static htmlParserNodeInfo *
|
||||
htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
|
||||
{
|
||||
if (ctxt->nodeInfoNr <= 0)
|
||||
return (NULL);
|
||||
ctxt->nodeInfoNr--;
|
||||
if (ctxt->nodeInfoNr < 0)
|
||||
return (NULL);
|
||||
if (ctxt->nodeInfoNr > 0)
|
||||
ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
|
||||
else
|
||||
ctxt->nodeInfo = NULL;
|
||||
return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
|
||||
}
|
||||
|
||||
/*
|
||||
* Macros for accessing the content. Those should be used only by the parser,
|
||||
* and not exported.
|
||||
@ -3927,6 +3980,7 @@ htmlParseReference(htmlParserCtxtPtr ctxt) {
|
||||
* @ctxt: an HTML parser context
|
||||
*
|
||||
* Parse a content: comment, sub-element, reference or text.
|
||||
* Kept for compatibility with old code
|
||||
*/
|
||||
|
||||
static void
|
||||
@ -4074,24 +4128,12 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {
|
||||
if (currentNode != NULL) xmlFree(currentNode);
|
||||
}
|
||||
|
||||
/**
|
||||
* htmlParseContent:
|
||||
* @ctxt: an HTML parser context
|
||||
*
|
||||
* Parse a content: comment, sub-element, reference or text.
|
||||
*/
|
||||
|
||||
void
|
||||
__htmlParseContent(void *ctxt) {
|
||||
if (ctxt != NULL)
|
||||
htmlParseContent((htmlParserCtxtPtr) ctxt);
|
||||
}
|
||||
|
||||
/**
|
||||
* htmlParseElement:
|
||||
* @ctxt: an HTML parser context
|
||||
*
|
||||
* parse an HTML element, this is highly recursive
|
||||
* this is kept for compatibility with previous code versions
|
||||
*
|
||||
* [39] element ::= EmptyElemTag | STag content ETag
|
||||
*
|
||||
@ -4219,6 +4261,303 @@ htmlParseElement(htmlParserCtxtPtr ctxt) {
|
||||
xmlFree(currentNode);
|
||||
}
|
||||
|
||||
static void
|
||||
htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
|
||||
/*
|
||||
* Capture end position and add node
|
||||
*/
|
||||
if ( ctxt->node != NULL && ctxt->record_info ) {
|
||||
ctxt->nodeInfo->end_pos = ctxt->input->consumed +
|
||||
(CUR_PTR - ctxt->input->base);
|
||||
ctxt->nodeInfo->end_line = ctxt->input->line;
|
||||
ctxt->nodeInfo->node = ctxt->node;
|
||||
xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
|
||||
htmlNodeInfoPop(ctxt);
|
||||
}
|
||||
if (!IS_CHAR_CH(CUR)) {
|
||||
htmlAutoCloseOnEnd(ctxt);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* htmlParseElementInternal:
|
||||
* @ctxt: an HTML parser context
|
||||
*
|
||||
* parse an HTML element, new version, non recursive
|
||||
*
|
||||
* [39] element ::= EmptyElemTag | STag content ETag
|
||||
*
|
||||
* [41] Attribute ::= Name Eq AttValue
|
||||
*/
|
||||
|
||||
static void
|
||||
htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
|
||||
const xmlChar *name;
|
||||
const htmlElemDesc * info;
|
||||
htmlParserNodeInfo node_info;
|
||||
int failed;
|
||||
int depth;
|
||||
const xmlChar *oldptr;
|
||||
|
||||
if ((ctxt == NULL) || (ctxt->input == NULL)) {
|
||||
htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
|
||||
"htmlParseElementInternal: context error\n", NULL, NULL);
|
||||
return;
|
||||
}
|
||||
|
||||
if (ctxt->instate == XML_PARSER_EOF)
|
||||
return;
|
||||
|
||||
/* Capture start position */
|
||||
if (ctxt->record_info) {
|
||||
node_info.begin_pos = ctxt->input->consumed +
|
||||
(CUR_PTR - ctxt->input->base);
|
||||
node_info.begin_line = ctxt->input->line;
|
||||
}
|
||||
|
||||
failed = htmlParseStartTag(ctxt);
|
||||
name = ctxt->name;
|
||||
if ((failed == -1) || (name == NULL)) {
|
||||
if (CUR == '>')
|
||||
NEXT;
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* Lookup the info for that element.
|
||||
*/
|
||||
info = htmlTagLookup(name);
|
||||
if (info == NULL) {
|
||||
htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
|
||||
"Tag %s invalid\n", name, NULL);
|
||||
}
|
||||
|
||||
/*
|
||||
* Check for an Empty Element labeled the XML/SGML way
|
||||
*/
|
||||
if ((CUR == '/') && (NXT(1) == '>')) {
|
||||
SKIP(2);
|
||||
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
|
||||
ctxt->sax->endElement(ctxt->userData, name);
|
||||
htmlnamePop(ctxt);
|
||||
return;
|
||||
}
|
||||
|
||||
if (CUR == '>') {
|
||||
NEXT;
|
||||
} else {
|
||||
htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
|
||||
"Couldn't find end of Start Tag %s\n", name, NULL);
|
||||
|
||||
/*
|
||||
* end of parsing of this node.
|
||||
*/
|
||||
if (xmlStrEqual(name, ctxt->name)) {
|
||||
nodePop(ctxt);
|
||||
htmlnamePop(ctxt);
|
||||
}
|
||||
|
||||
if (ctxt->record_info)
|
||||
htmlNodeInfoPush(ctxt, &node_info);
|
||||
htmlParserFinishElementParsing(ctxt);
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* Check for an Empty Element from DTD definition
|
||||
*/
|
||||
if ((info != NULL) && (info->empty)) {
|
||||
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
|
||||
ctxt->sax->endElement(ctxt->userData, name);
|
||||
htmlnamePop(ctxt);
|
||||
return;
|
||||
}
|
||||
|
||||
if (ctxt->record_info)
|
||||
htmlNodeInfoPush(ctxt, &node_info);
|
||||
}
|
||||
|
||||
/**
|
||||
* htmlParseContentInternal:
|
||||
* @ctxt: an HTML parser context
|
||||
*
|
||||
* Parse a content: comment, sub-element, reference or text.
|
||||
* New version for non recursive htmlParseElementInternal
|
||||
*/
|
||||
|
||||
static void
|
||||
htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
|
||||
xmlChar *currentNode;
|
||||
int depth;
|
||||
const xmlChar *name;
|
||||
|
||||
currentNode = xmlStrdup(ctxt->name);
|
||||
depth = ctxt->nameNr;
|
||||
while (1) {
|
||||
long cons = ctxt->nbChars;
|
||||
|
||||
GROW;
|
||||
|
||||
if (ctxt->instate == XML_PARSER_EOF)
|
||||
break;
|
||||
|
||||
/*
|
||||
* Our tag or one of it's parent or children is ending.
|
||||
*/
|
||||
if ((CUR == '<') && (NXT(1) == '/')) {
|
||||
if (htmlParseEndTag(ctxt) &&
|
||||
((currentNode != NULL) || (ctxt->nameNr == 0))) {
|
||||
if (currentNode != NULL)
|
||||
xmlFree(currentNode);
|
||||
|
||||
currentNode = xmlStrdup(ctxt->name);
|
||||
depth = ctxt->nameNr;
|
||||
}
|
||||
continue; /* while */
|
||||
}
|
||||
|
||||
else if ((CUR == '<') &&
|
||||
((IS_ASCII_LETTER(NXT(1))) ||
|
||||
(NXT(1) == '_') || (NXT(1) == ':'))) {
|
||||
name = htmlParseHTMLName_nonInvasive(ctxt);
|
||||
if (name == NULL) {
|
||||
htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
|
||||
"htmlParseStartTag: invalid element name\n",
|
||||
NULL, NULL);
|
||||
/* Dump the bogus tag like browsers do */
|
||||
while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
|
||||
NEXT;
|
||||
|
||||
htmlParserFinishElementParsing(ctxt);
|
||||
if (currentNode != NULL)
|
||||
xmlFree(currentNode);
|
||||
|
||||
currentNode = xmlStrdup(ctxt->name);
|
||||
depth = ctxt->nameNr;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (ctxt->name != NULL) {
|
||||
if (htmlCheckAutoClose(name, ctxt->name) == 1) {
|
||||
htmlAutoClose(ctxt, name);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Has this node been popped out during parsing of
|
||||
* the next element
|
||||
*/
|
||||
if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
|
||||
(!xmlStrEqual(currentNode, ctxt->name)))
|
||||
{
|
||||
htmlParserFinishElementParsing(ctxt);
|
||||
if (currentNode != NULL) xmlFree(currentNode);
|
||||
|
||||
currentNode = xmlStrdup(ctxt->name);
|
||||
depth = ctxt->nameNr;
|
||||
continue;
|
||||
}
|
||||
|
||||
if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
|
||||
(xmlStrEqual(currentNode, BAD_CAST"style")))) {
|
||||
/*
|
||||
* Handle SCRIPT/STYLE separately
|
||||
*/
|
||||
htmlParseScript(ctxt);
|
||||
} else {
|
||||
/*
|
||||
* Sometimes DOCTYPE arrives in the middle of the document
|
||||
*/
|
||||
if ((CUR == '<') && (NXT(1) == '!') &&
|
||||
(UPP(2) == 'D') && (UPP(3) == 'O') &&
|
||||
(UPP(4) == 'C') && (UPP(5) == 'T') &&
|
||||
(UPP(6) == 'Y') && (UPP(7) == 'P') &&
|
||||
(UPP(8) == 'E')) {
|
||||
htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
|
||||
"Misplaced DOCTYPE declaration\n",
|
||||
BAD_CAST "DOCTYPE" , NULL);
|
||||
htmlParseDocTypeDecl(ctxt);
|
||||
}
|
||||
|
||||
/*
|
||||
* First case : a comment
|
||||
*/
|
||||
if ((CUR == '<') && (NXT(1) == '!') &&
|
||||
(NXT(2) == '-') && (NXT(3) == '-')) {
|
||||
htmlParseComment(ctxt);
|
||||
}
|
||||
|
||||
/*
|
||||
* Second case : a Processing Instruction.
|
||||
*/
|
||||
else if ((CUR == '<') && (NXT(1) == '?')) {
|
||||
htmlParsePI(ctxt);
|
||||
}
|
||||
|
||||
/*
|
||||
* Third case : a sub-element.
|
||||
*/
|
||||
else if (CUR == '<') {
|
||||
htmlParseElementInternal(ctxt);
|
||||
if (currentNode != NULL) xmlFree(currentNode);
|
||||
|
||||
currentNode = xmlStrdup(ctxt->name);
|
||||
depth = ctxt->nameNr;
|
||||
}
|
||||
|
||||
/*
|
||||
* Fourth case : a reference. If if has not been resolved,
|
||||
* parsing returns it's Name, create the node
|
||||
*/
|
||||
else if (CUR == '&') {
|
||||
htmlParseReference(ctxt);
|
||||
}
|
||||
|
||||
/*
|
||||
* Fifth case : end of the resource
|
||||
*/
|
||||
else if (CUR == 0) {
|
||||
htmlAutoCloseOnEnd(ctxt);
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* Last case, text. Note that References are handled directly.
|
||||
*/
|
||||
else {
|
||||
htmlParseCharData(ctxt);
|
||||
}
|
||||
|
||||
if (cons == ctxt->nbChars) {
|
||||
if (ctxt->node != NULL) {
|
||||
htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
|
||||
"detected an error in element content\n",
|
||||
NULL, NULL);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
GROW;
|
||||
}
|
||||
if (currentNode != NULL) xmlFree(currentNode);
|
||||
}
|
||||
|
||||
/**
|
||||
* htmlParseContent:
|
||||
* @ctxt: an HTML parser context
|
||||
*
|
||||
* Parse a content: comment, sub-element, reference or text.
|
||||
* This is the entry point when called from parser.c
|
||||
*/
|
||||
|
||||
void
|
||||
__htmlParseContent(void *ctxt) {
|
||||
if (ctxt != NULL)
|
||||
htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
|
||||
}
|
||||
|
||||
/**
|
||||
* htmlParseDocument:
|
||||
* @ctxt: an HTML parser context
|
||||
@ -4323,7 +4662,7 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) {
|
||||
/*
|
||||
* Time to start parsing the tree itself
|
||||
*/
|
||||
htmlParseContent(ctxt);
|
||||
htmlParseContentInternal(ctxt);
|
||||
|
||||
/*
|
||||
* autoclose
|
||||
@ -4440,6 +4779,10 @@ htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
|
||||
ctxt->nameMax = 10;
|
||||
ctxt->name = NULL;
|
||||
|
||||
ctxt->nodeInfoTab = NULL;
|
||||
ctxt->nodeInfoNr = 0;
|
||||
ctxt->nodeInfoMax = 0;
|
||||
|
||||
if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
|
||||
else {
|
||||
ctxt->sax = sax;
|
||||
|
Reference in New Issue
Block a user