mirror of
https://gitlab.gnome.org/GNOME/libxml2.git
synced 2025-07-28 00:21:53 +03:00
Switch the HTML parser to be non-recursive
* HTMLparser.c: new non-recursive htmlParseElementInternal, with htmlParseContentInternal and new functions to handle node info and element end. * include/libxml/parser.h: add new stack for element info in parser context. * parserInternals.c: free element info stack
This commit is contained in:
committed by
Daniel Veillard
parent
ef9c636ac1
commit
615904f582
371
HTMLparser.c
371
HTMLparser.c
@ -209,6 +209,59 @@ htmlnamePop(htmlParserCtxtPtr ctxt)
|
|||||||
return (ret);
|
return (ret);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* htmlNodeInfoPush:
|
||||||
|
* @ctxt: an HTML parser context
|
||||||
|
* @value: the node info
|
||||||
|
*
|
||||||
|
* Pushes a new element name on top of the node info stack
|
||||||
|
*
|
||||||
|
* Returns 0 in case of error, the index in the stack otherwise
|
||||||
|
*/
|
||||||
|
static int
|
||||||
|
htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
|
||||||
|
{
|
||||||
|
if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
|
||||||
|
if (ctxt->nodeInfoMax == 0)
|
||||||
|
ctxt->nodeInfoMax = 5;
|
||||||
|
ctxt->nodeInfoMax *= 2;
|
||||||
|
ctxt->nodeInfoTab = (htmlParserNodeInfo *)
|
||||||
|
xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
|
||||||
|
ctxt->nodeInfoMax *
|
||||||
|
sizeof(ctxt->nodeInfoTab[0]));
|
||||||
|
if (ctxt->nodeInfoTab == NULL) {
|
||||||
|
htmlErrMemory(ctxt, NULL);
|
||||||
|
return (0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
|
||||||
|
ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
|
||||||
|
return (ctxt->nodeInfoNr++);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* htmlNodeInfoPop:
|
||||||
|
* @ctxt: an HTML parser context
|
||||||
|
*
|
||||||
|
* Pops the top element name from the node info stack
|
||||||
|
*
|
||||||
|
* Returns 0 in case of error, the pointer to NodeInfo otherwise
|
||||||
|
*/
|
||||||
|
static htmlParserNodeInfo *
|
||||||
|
htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
|
||||||
|
{
|
||||||
|
if (ctxt->nodeInfoNr <= 0)
|
||||||
|
return (NULL);
|
||||||
|
ctxt->nodeInfoNr--;
|
||||||
|
if (ctxt->nodeInfoNr < 0)
|
||||||
|
return (NULL);
|
||||||
|
if (ctxt->nodeInfoNr > 0)
|
||||||
|
ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
|
||||||
|
else
|
||||||
|
ctxt->nodeInfo = NULL;
|
||||||
|
return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Macros for accessing the content. Those should be used only by the parser,
|
* Macros for accessing the content. Those should be used only by the parser,
|
||||||
* and not exported.
|
* and not exported.
|
||||||
@ -3927,6 +3980,7 @@ htmlParseReference(htmlParserCtxtPtr ctxt) {
|
|||||||
* @ctxt: an HTML parser context
|
* @ctxt: an HTML parser context
|
||||||
*
|
*
|
||||||
* Parse a content: comment, sub-element, reference or text.
|
* Parse a content: comment, sub-element, reference or text.
|
||||||
|
* Kept for compatibility with old code
|
||||||
*/
|
*/
|
||||||
|
|
||||||
static void
|
static void
|
||||||
@ -4074,24 +4128,12 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {
|
|||||||
if (currentNode != NULL) xmlFree(currentNode);
|
if (currentNode != NULL) xmlFree(currentNode);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* htmlParseContent:
|
|
||||||
* @ctxt: an HTML parser context
|
|
||||||
*
|
|
||||||
* Parse a content: comment, sub-element, reference or text.
|
|
||||||
*/
|
|
||||||
|
|
||||||
void
|
|
||||||
__htmlParseContent(void *ctxt) {
|
|
||||||
if (ctxt != NULL)
|
|
||||||
htmlParseContent((htmlParserCtxtPtr) ctxt);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* htmlParseElement:
|
* htmlParseElement:
|
||||||
* @ctxt: an HTML parser context
|
* @ctxt: an HTML parser context
|
||||||
*
|
*
|
||||||
* parse an HTML element, this is highly recursive
|
* parse an HTML element, this is highly recursive
|
||||||
|
* this is kept for compatibility with previous code versions
|
||||||
*
|
*
|
||||||
* [39] element ::= EmptyElemTag | STag content ETag
|
* [39] element ::= EmptyElemTag | STag content ETag
|
||||||
*
|
*
|
||||||
@ -4219,6 +4261,303 @@ htmlParseElement(htmlParserCtxtPtr ctxt) {
|
|||||||
xmlFree(currentNode);
|
xmlFree(currentNode);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
|
||||||
|
/*
|
||||||
|
* Capture end position and add node
|
||||||
|
*/
|
||||||
|
if ( ctxt->node != NULL && ctxt->record_info ) {
|
||||||
|
ctxt->nodeInfo->end_pos = ctxt->input->consumed +
|
||||||
|
(CUR_PTR - ctxt->input->base);
|
||||||
|
ctxt->nodeInfo->end_line = ctxt->input->line;
|
||||||
|
ctxt->nodeInfo->node = ctxt->node;
|
||||||
|
xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
|
||||||
|
htmlNodeInfoPop(ctxt);
|
||||||
|
}
|
||||||
|
if (!IS_CHAR_CH(CUR)) {
|
||||||
|
htmlAutoCloseOnEnd(ctxt);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* htmlParseElementInternal:
|
||||||
|
* @ctxt: an HTML parser context
|
||||||
|
*
|
||||||
|
* parse an HTML element, new version, non recursive
|
||||||
|
*
|
||||||
|
* [39] element ::= EmptyElemTag | STag content ETag
|
||||||
|
*
|
||||||
|
* [41] Attribute ::= Name Eq AttValue
|
||||||
|
*/
|
||||||
|
|
||||||
|
static void
|
||||||
|
htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
|
||||||
|
const xmlChar *name;
|
||||||
|
const htmlElemDesc * info;
|
||||||
|
htmlParserNodeInfo node_info;
|
||||||
|
int failed;
|
||||||
|
int depth;
|
||||||
|
const xmlChar *oldptr;
|
||||||
|
|
||||||
|
if ((ctxt == NULL) || (ctxt->input == NULL)) {
|
||||||
|
htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
|
||||||
|
"htmlParseElementInternal: context error\n", NULL, NULL);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ctxt->instate == XML_PARSER_EOF)
|
||||||
|
return;
|
||||||
|
|
||||||
|
/* Capture start position */
|
||||||
|
if (ctxt->record_info) {
|
||||||
|
node_info.begin_pos = ctxt->input->consumed +
|
||||||
|
(CUR_PTR - ctxt->input->base);
|
||||||
|
node_info.begin_line = ctxt->input->line;
|
||||||
|
}
|
||||||
|
|
||||||
|
failed = htmlParseStartTag(ctxt);
|
||||||
|
name = ctxt->name;
|
||||||
|
if ((failed == -1) || (name == NULL)) {
|
||||||
|
if (CUR == '>')
|
||||||
|
NEXT;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Lookup the info for that element.
|
||||||
|
*/
|
||||||
|
info = htmlTagLookup(name);
|
||||||
|
if (info == NULL) {
|
||||||
|
htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
|
||||||
|
"Tag %s invalid\n", name, NULL);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Check for an Empty Element labeled the XML/SGML way
|
||||||
|
*/
|
||||||
|
if ((CUR == '/') && (NXT(1) == '>')) {
|
||||||
|
SKIP(2);
|
||||||
|
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
|
||||||
|
ctxt->sax->endElement(ctxt->userData, name);
|
||||||
|
htmlnamePop(ctxt);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (CUR == '>') {
|
||||||
|
NEXT;
|
||||||
|
} else {
|
||||||
|
htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
|
||||||
|
"Couldn't find end of Start Tag %s\n", name, NULL);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* end of parsing of this node.
|
||||||
|
*/
|
||||||
|
if (xmlStrEqual(name, ctxt->name)) {
|
||||||
|
nodePop(ctxt);
|
||||||
|
htmlnamePop(ctxt);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ctxt->record_info)
|
||||||
|
htmlNodeInfoPush(ctxt, &node_info);
|
||||||
|
htmlParserFinishElementParsing(ctxt);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Check for an Empty Element from DTD definition
|
||||||
|
*/
|
||||||
|
if ((info != NULL) && (info->empty)) {
|
||||||
|
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
|
||||||
|
ctxt->sax->endElement(ctxt->userData, name);
|
||||||
|
htmlnamePop(ctxt);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ctxt->record_info)
|
||||||
|
htmlNodeInfoPush(ctxt, &node_info);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* htmlParseContentInternal:
|
||||||
|
* @ctxt: an HTML parser context
|
||||||
|
*
|
||||||
|
* Parse a content: comment, sub-element, reference or text.
|
||||||
|
* New version for non recursive htmlParseElementInternal
|
||||||
|
*/
|
||||||
|
|
||||||
|
static void
|
||||||
|
htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
|
||||||
|
xmlChar *currentNode;
|
||||||
|
int depth;
|
||||||
|
const xmlChar *name;
|
||||||
|
|
||||||
|
currentNode = xmlStrdup(ctxt->name);
|
||||||
|
depth = ctxt->nameNr;
|
||||||
|
while (1) {
|
||||||
|
long cons = ctxt->nbChars;
|
||||||
|
|
||||||
|
GROW;
|
||||||
|
|
||||||
|
if (ctxt->instate == XML_PARSER_EOF)
|
||||||
|
break;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Our tag or one of it's parent or children is ending.
|
||||||
|
*/
|
||||||
|
if ((CUR == '<') && (NXT(1) == '/')) {
|
||||||
|
if (htmlParseEndTag(ctxt) &&
|
||||||
|
((currentNode != NULL) || (ctxt->nameNr == 0))) {
|
||||||
|
if (currentNode != NULL)
|
||||||
|
xmlFree(currentNode);
|
||||||
|
|
||||||
|
currentNode = xmlStrdup(ctxt->name);
|
||||||
|
depth = ctxt->nameNr;
|
||||||
|
}
|
||||||
|
continue; /* while */
|
||||||
|
}
|
||||||
|
|
||||||
|
else if ((CUR == '<') &&
|
||||||
|
((IS_ASCII_LETTER(NXT(1))) ||
|
||||||
|
(NXT(1) == '_') || (NXT(1) == ':'))) {
|
||||||
|
name = htmlParseHTMLName_nonInvasive(ctxt);
|
||||||
|
if (name == NULL) {
|
||||||
|
htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
|
||||||
|
"htmlParseStartTag: invalid element name\n",
|
||||||
|
NULL, NULL);
|
||||||
|
/* Dump the bogus tag like browsers do */
|
||||||
|
while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
|
||||||
|
NEXT;
|
||||||
|
|
||||||
|
htmlParserFinishElementParsing(ctxt);
|
||||||
|
if (currentNode != NULL)
|
||||||
|
xmlFree(currentNode);
|
||||||
|
|
||||||
|
currentNode = xmlStrdup(ctxt->name);
|
||||||
|
depth = ctxt->nameNr;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ctxt->name != NULL) {
|
||||||
|
if (htmlCheckAutoClose(name, ctxt->name) == 1) {
|
||||||
|
htmlAutoClose(ctxt, name);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Has this node been popped out during parsing of
|
||||||
|
* the next element
|
||||||
|
*/
|
||||||
|
if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
|
||||||
|
(!xmlStrEqual(currentNode, ctxt->name)))
|
||||||
|
{
|
||||||
|
htmlParserFinishElementParsing(ctxt);
|
||||||
|
if (currentNode != NULL) xmlFree(currentNode);
|
||||||
|
|
||||||
|
currentNode = xmlStrdup(ctxt->name);
|
||||||
|
depth = ctxt->nameNr;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
|
||||||
|
(xmlStrEqual(currentNode, BAD_CAST"style")))) {
|
||||||
|
/*
|
||||||
|
* Handle SCRIPT/STYLE separately
|
||||||
|
*/
|
||||||
|
htmlParseScript(ctxt);
|
||||||
|
} else {
|
||||||
|
/*
|
||||||
|
* Sometimes DOCTYPE arrives in the middle of the document
|
||||||
|
*/
|
||||||
|
if ((CUR == '<') && (NXT(1) == '!') &&
|
||||||
|
(UPP(2) == 'D') && (UPP(3) == 'O') &&
|
||||||
|
(UPP(4) == 'C') && (UPP(5) == 'T') &&
|
||||||
|
(UPP(6) == 'Y') && (UPP(7) == 'P') &&
|
||||||
|
(UPP(8) == 'E')) {
|
||||||
|
htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
|
||||||
|
"Misplaced DOCTYPE declaration\n",
|
||||||
|
BAD_CAST "DOCTYPE" , NULL);
|
||||||
|
htmlParseDocTypeDecl(ctxt);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* First case : a comment
|
||||||
|
*/
|
||||||
|
if ((CUR == '<') && (NXT(1) == '!') &&
|
||||||
|
(NXT(2) == '-') && (NXT(3) == '-')) {
|
||||||
|
htmlParseComment(ctxt);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Second case : a Processing Instruction.
|
||||||
|
*/
|
||||||
|
else if ((CUR == '<') && (NXT(1) == '?')) {
|
||||||
|
htmlParsePI(ctxt);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Third case : a sub-element.
|
||||||
|
*/
|
||||||
|
else if (CUR == '<') {
|
||||||
|
htmlParseElementInternal(ctxt);
|
||||||
|
if (currentNode != NULL) xmlFree(currentNode);
|
||||||
|
|
||||||
|
currentNode = xmlStrdup(ctxt->name);
|
||||||
|
depth = ctxt->nameNr;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Fourth case : a reference. If if has not been resolved,
|
||||||
|
* parsing returns it's Name, create the node
|
||||||
|
*/
|
||||||
|
else if (CUR == '&') {
|
||||||
|
htmlParseReference(ctxt);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Fifth case : end of the resource
|
||||||
|
*/
|
||||||
|
else if (CUR == 0) {
|
||||||
|
htmlAutoCloseOnEnd(ctxt);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Last case, text. Note that References are handled directly.
|
||||||
|
*/
|
||||||
|
else {
|
||||||
|
htmlParseCharData(ctxt);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (cons == ctxt->nbChars) {
|
||||||
|
if (ctxt->node != NULL) {
|
||||||
|
htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
|
||||||
|
"detected an error in element content\n",
|
||||||
|
NULL, NULL);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
GROW;
|
||||||
|
}
|
||||||
|
if (currentNode != NULL) xmlFree(currentNode);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* htmlParseContent:
|
||||||
|
* @ctxt: an HTML parser context
|
||||||
|
*
|
||||||
|
* Parse a content: comment, sub-element, reference or text.
|
||||||
|
* This is the entry point when called from parser.c
|
||||||
|
*/
|
||||||
|
|
||||||
|
void
|
||||||
|
__htmlParseContent(void *ctxt) {
|
||||||
|
if (ctxt != NULL)
|
||||||
|
htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* htmlParseDocument:
|
* htmlParseDocument:
|
||||||
* @ctxt: an HTML parser context
|
* @ctxt: an HTML parser context
|
||||||
@ -4323,7 +4662,7 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) {
|
|||||||
/*
|
/*
|
||||||
* Time to start parsing the tree itself
|
* Time to start parsing the tree itself
|
||||||
*/
|
*/
|
||||||
htmlParseContent(ctxt);
|
htmlParseContentInternal(ctxt);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* autoclose
|
* autoclose
|
||||||
@ -4440,6 +4779,10 @@ htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
|
|||||||
ctxt->nameMax = 10;
|
ctxt->nameMax = 10;
|
||||||
ctxt->name = NULL;
|
ctxt->name = NULL;
|
||||||
|
|
||||||
|
ctxt->nodeInfoTab = NULL;
|
||||||
|
ctxt->nodeInfoNr = 0;
|
||||||
|
ctxt->nodeInfoMax = 0;
|
||||||
|
|
||||||
if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
|
if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
|
||||||
else {
|
else {
|
||||||
ctxt->sax = sax;
|
ctxt->sax = sax;
|
||||||
|
@ -302,6 +302,12 @@ struct _xmlParserCtxt {
|
|||||||
xmlParserMode parseMode; /* the parser mode */
|
xmlParserMode parseMode; /* the parser mode */
|
||||||
unsigned long nbentities; /* number of entities references */
|
unsigned long nbentities; /* number of entities references */
|
||||||
unsigned long sizeentities; /* size of parsed entities */
|
unsigned long sizeentities; /* size of parsed entities */
|
||||||
|
|
||||||
|
/* for use by HTML non-recursive parser */
|
||||||
|
xmlParserNodeInfo *nodeInfo; /* Current NodeInfo */
|
||||||
|
int nodeInfoNr; /* Depth of the parsing stack */
|
||||||
|
int nodeInfoMax; /* Max depth of the parsing stack */
|
||||||
|
xmlParserNodeInfo *nodeInfoTab; /* array of nodeInfos */
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -1782,6 +1782,7 @@ xmlFreeParserCtxt(xmlParserCtxtPtr ctxt)
|
|||||||
if (ctxt->spaceTab != NULL) xmlFree(ctxt->spaceTab);
|
if (ctxt->spaceTab != NULL) xmlFree(ctxt->spaceTab);
|
||||||
if (ctxt->nameTab != NULL) xmlFree((xmlChar * *)ctxt->nameTab);
|
if (ctxt->nameTab != NULL) xmlFree((xmlChar * *)ctxt->nameTab);
|
||||||
if (ctxt->nodeTab != NULL) xmlFree(ctxt->nodeTab);
|
if (ctxt->nodeTab != NULL) xmlFree(ctxt->nodeTab);
|
||||||
|
if (ctxt->nodeInfoTab != NULL) xmlFree(ctxt->nodeInfoTab);
|
||||||
if (ctxt->inputTab != NULL) xmlFree(ctxt->inputTab);
|
if (ctxt->inputTab != NULL) xmlFree(ctxt->inputTab);
|
||||||
if (ctxt->version != NULL) xmlFree((char *) ctxt->version);
|
if (ctxt->version != NULL) xmlFree((char *) ctxt->version);
|
||||||
if (ctxt->encoding != NULL) xmlFree((char *) ctxt->encoding);
|
if (ctxt->encoding != NULL) xmlFree((char *) ctxt->encoding);
|
||||||
|
Reference in New Issue
Block a user