1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-07-19 05:43:19 +03:00

Switch the HTML parser to be non-recursive

* HTMLparser.c: new htmlParseElementInternal non recursive, with
  htmlParseContentInternal and new function to handle node info
  and element end.
* include/libxml/parser.h: add new stack for element info in parser
  context
* parserInternals.c: free element info stack
This commit is contained in:
Eugene Pimenov
2010-03-15 15:16:02 +01:00
committed by Daniel Veillard
parent ef9c636ac1
commit 615904f582
3 changed files with 364 additions and 14 deletions

View File

@ -209,6 +209,59 @@ htmlnamePop(htmlParserCtxtPtr ctxt)
return (ret);
}
/**
 * htmlNodeInfoPush:
 * @ctxt:  an HTML parser context
 * @value:  the node info
 *
 * Pushes a copy of @value on top of the node info stack, growing the
 * stack when it is full, and makes ctxt->nodeInfo point to the new top.
 * If the stack cannot be grown, the existing stack is left untouched.
 *
 * Returns 0 in case of error, the index in the stack otherwise. Note
 * that the first successful push also yields index 0.
 */
static int
htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
{
    if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
        htmlParserNodeInfo *newTab;
        int newMax;

        newMax = (ctxt->nodeInfoMax == 0) ? 5 : ctxt->nodeInfoMax;
        newMax *= 2;
        /*
         * Reallocate through a temporary: on failure the old table must
         * stay reachable and nodeInfoMax must keep matching the real
         * allocation, otherwise a later push would write through a NULL
         * or undersized table.
         */
        newTab = (htmlParserNodeInfo *)
                 xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
                            newMax * sizeof(ctxt->nodeInfoTab[0]));
        if (newTab == NULL) {
            htmlErrMemory(ctxt, NULL);
            return (0);
        }
        ctxt->nodeInfoTab = newTab;
        ctxt->nodeInfoMax = newMax;
    }
    ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
    /* the table may have moved, so always refresh the "current" pointer */
    ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
    return (ctxt->nodeInfoNr++);
}
/**
 * htmlNodeInfoPop:
 * @ctxt:  an HTML parser context
 *
 * Pops the top entry from the node info stack and makes ctxt->nodeInfo
 * point to the new top, or NULL when the stack becomes empty.
 *
 * Returns NULL if the stack was already empty, otherwise a pointer to the
 * popped entry. The pointer refers to the slot inside ctxt->nodeInfoTab,
 * which is reused by the next push.
 */
static htmlParserNodeInfo *
htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
{
    if (ctxt->nodeInfoNr <= 0)
        return (NULL);
    /* nodeInfoNr was >= 1 here, so it cannot go negative */
    ctxt->nodeInfoNr--;
    if (ctxt->nodeInfoNr > 0)
        ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
    else
        ctxt->nodeInfo = NULL;
    return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
}
/*
* Macros for accessing the content. Those should be used only by the parser,
* and not exported.
@ -3927,6 +3980,7 @@ htmlParseReference(htmlParserCtxtPtr ctxt) {
* @ctxt: an HTML parser context
*
* Parse a content: comment, sub-element, reference or text.
* Kept for compatibility with old code
*/
static void
@ -4074,24 +4128,12 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {
if (currentNode != NULL) xmlFree(currentNode);
}
/**
* htmlParseContent:
* @ctxt: an HTML parser context
*
* Parse a content: comment, sub-element, reference or text.
*/
void
__htmlParseContent(void *ctxt) {
if (ctxt != NULL)
htmlParseContent((htmlParserCtxtPtr) ctxt);
}
/**
* htmlParseElement:
* @ctxt: an HTML parser context
*
* parse an HTML element, this is highly recursive
* this is kept for compatibility with previous code versions
*
* [39] element ::= EmptyElemTag | STag content ETag
*
@ -4219,6 +4261,303 @@ htmlParseElement(htmlParserCtxtPtr ctxt) {
xmlFree(currentNode);
}
/**
 * htmlParserFinishElementParsing:
 * @ctxt:  an HTML parser context
 *
 * Completes the bookkeeping for the element that just ended: when node
 * info recording is enabled, fills in the end position, end line and node
 * of the current node info entry, registers it with xmlParserAddNodeInfo()
 * and pops it from the node info stack. If the current character is not a
 * valid char, htmlAutoCloseOnEnd() is called.
 */
static void
htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
    /*
     * Capture end position and add node.
     * ctxt->nodeInfo is NULL whenever the node info stack is empty
     * (htmlNodeInfoPop resets it), and this function can be reached
     * without a matching push, so check it before writing through it.
     */
    if ((ctxt->node != NULL) && (ctxt->record_info) &&
        (ctxt->nodeInfo != NULL)) {
        ctxt->nodeInfo->end_pos = ctxt->input->consumed +
                                  (CUR_PTR - ctxt->input->base);
        ctxt->nodeInfo->end_line = ctxt->input->line;
        ctxt->nodeInfo->node = ctxt->node;
        xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
        htmlNodeInfoPop(ctxt);
    }
    if (!IS_CHAR_CH(CUR)) {
        htmlAutoCloseOnEnd(ctxt);
    }
}
/**
 * htmlParseElementInternal:
 * @ctxt:  an HTML parser context
 *
 * parse an HTML element, new version, non recursive.
 * Only the start tag is handled here: the function parses it, deals with
 * empty elements, and otherwise pushes the element start position on the
 * node info stack (when ctxt->record_info is set) before returning. It
 * does not parse the element content or its end tag itself.
 *
 * [39] element ::= EmptyElemTag | STag content ETag
 *
 * [41] Attribute ::= Name Eq AttValue
 */
static void
htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
    const xmlChar *name;
    const htmlElemDesc * info;
    /*
     * Zeroed so that the fields which are only filled in when the element
     * ends are never copied onto the stack with indeterminate values.
     */
    htmlParserNodeInfo node_info = { 0 };
    int failed;

    if ((ctxt == NULL) || (ctxt->input == NULL)) {
        htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                     "htmlParseElementInternal: context error\n", NULL, NULL);
        return;
    }

    if (ctxt->instate == XML_PARSER_EOF)
        return;

    /* Capture start position */
    if (ctxt->record_info) {
        node_info.begin_pos = ctxt->input->consumed +
                              (CUR_PTR - ctxt->input->base);
        node_info.begin_line = ctxt->input->line;
    }

    failed = htmlParseStartTag(ctxt);
    name = ctxt->name;
    if ((failed == -1) || (name == NULL)) {
        /* skip the closing '>' of the tag that could not be parsed */
        if (CUR == '>')
            NEXT;
        return;
    }

    /*
     * Lookup the info for that element.
     */
    info = htmlTagLookup(name);
    if (info == NULL) {
        htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
                     "Tag %s invalid\n", name, NULL);
    }

    /*
     * Check for an Empty Element labeled the XML/SGML way
     */
    if ((CUR == '/') && (NXT(1) == '>')) {
        SKIP(2);
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
            ctxt->sax->endElement(ctxt->userData, name);
        htmlnamePop(ctxt);
        return;
    }

    if (CUR == '>') {
        NEXT;
    } else {
        htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
                     "Couldn't find end of Start Tag %s\n", name, NULL);

        /*
         * end of parsing of this node.
         */
        if (xmlStrEqual(name, ctxt->name)) {
            nodePop(ctxt);
            htmlnamePop(ctxt);
        }

        /*
         * Push the start position so that the finish helper has an entry
         * to complete and pop.
         */
        if (ctxt->record_info)
            htmlNodeInfoPush(ctxt, &node_info);
        htmlParserFinishElementParsing(ctxt);
        return;
    }

    /*
     * Check for an Empty Element from DTD definition
     */
    if ((info != NULL) && (info->empty)) {
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
            ctxt->sax->endElement(ctxt->userData, name);
        htmlnamePop(ctxt);
        return;
    }

    /* the element stays open: remember where it started */
    if (ctxt->record_info)
        htmlNodeInfoPush(ctxt, &node_info);
}
/**
* htmlParseContentInternal:
* @ctxt: an HTML parser context
*
* Parse a content: comment, sub-element, reference or text.
* New version for non recursive htmlParseElementInternal.
*
* The function runs a single loop instead of recursing. It keeps a private
* copy of the name on top of the name stack (currentNode) and the name
* stack depth (depth) taken at the same time, and refreshes both each time
* an element is opened or closed. The loop ends when the parser state is
* XML_PARSER_EOF, when the current character is 0, or when an iteration
* of the generic branch did not change ctxt->nbChars.
*/
static void
htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
xmlChar *currentNode;
int depth;
const xmlChar *name;
/* snapshot of the element currently being filled */
currentNode = xmlStrdup(ctxt->name);
depth = ctxt->nameNr;
while (1) {
/* remembered to detect an iteration that made no progress */
long cons = ctxt->nbChars;
GROW;
if (ctxt->instate == XML_PARSER_EOF)
break;
/*
* Our tag or one of its parents or children is ending.
* The snapshot is refreshed only when htmlParseEndTag() returns
* non-zero and either a snapshot exists or the name stack is empty.
*/
if ((CUR == '<') && (NXT(1) == '/')) {
if (htmlParseEndTag(ctxt) &&
((currentNode != NULL) || (ctxt->nameNr == 0))) {
if (currentNode != NULL)
xmlFree(currentNode);
currentNode = xmlStrdup(ctxt->name);
depth = ctxt->nameNr;
}
continue; /* while */
}
/*
* A start tag is coming: look at its name first to decide whether
* the current element has to be auto-closed before it is parsed.
* NOTE(review): htmlParseHTMLName_nonInvasive presumably reads the
* name without consuming input - confirm against its definition.
*/
else if ((CUR == '<') &&
((IS_ASCII_LETTER(NXT(1))) ||
(NXT(1) == '_') || (NXT(1) == ':'))) {
name = htmlParseHTMLName_nonInvasive(ctxt);
if (name == NULL) {
htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
"htmlParseStartTag: invalid element name\n",
NULL, NULL);
/* Dump the bogus tag like browsers do */
while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
NEXT;
htmlParserFinishElementParsing(ctxt);
if (currentNode != NULL)
xmlFree(currentNode);
currentNode = xmlStrdup(ctxt->name);
depth = ctxt->nameNr;
continue;
}
if (ctxt->name != NULL) {
/* close the open element(s) first, then re-examine the input */
if (htmlCheckAutoClose(name, ctxt->name) == 1) {
htmlAutoClose(ctxt, name);
continue;
}
}
}
/*
* Has this node been popped out during parsing of
* the next element? This is the case when the name stack is not
* empty, is not deeper than the recorded depth, and its top name
* differs from the snapshot.
*/
if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
(!xmlStrEqual(currentNode, ctxt->name)))
{
htmlParserFinishElementParsing(ctxt);
if (currentNode != NULL) xmlFree(currentNode);
currentNode = xmlStrdup(ctxt->name);
depth = ctxt->nameNr;
continue;
}
if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
(xmlStrEqual(currentNode, BAD_CAST"style")))) {
/*
* Handle SCRIPT/STYLE separately
*/
htmlParseScript(ctxt);
} else {
/*
* Sometimes DOCTYPE arrives in the middle of the document
*/
if ((CUR == '<') && (NXT(1) == '!') &&
(UPP(2) == 'D') && (UPP(3) == 'O') &&
(UPP(4) == 'C') && (UPP(5) == 'T') &&
(UPP(6) == 'Y') && (UPP(7) == 'P') &&
(UPP(8) == 'E')) {
htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
"Misplaced DOCTYPE declaration\n",
BAD_CAST "DOCTYPE" , NULL);
htmlParseDocTypeDecl(ctxt);
}
/*
* First case : a comment
*/
if ((CUR == '<') && (NXT(1) == '!') &&
(NXT(2) == '-') && (NXT(3) == '-')) {
htmlParseComment(ctxt);
}
/*
* Second case : a Processing Instruction.
*/
else if ((CUR == '<') && (NXT(1) == '?')) {
htmlParsePI(ctxt);
}
/*
* Third case : a sub-element. Only its start tag is parsed here;
* the snapshot is refreshed so the following iterations work on
* the element that is now on top of the name stack.
*/
else if (CUR == '<') {
htmlParseElementInternal(ctxt);
if (currentNode != NULL) xmlFree(currentNode);
currentNode = xmlStrdup(ctxt->name);
depth = ctxt->nameNr;
}
/*
* Fourth case : a reference. If it has not been resolved,
* parsing returns its Name, create the node
*/
else if (CUR == '&') {
htmlParseReference(ctxt);
}
/*
* Fifth case : end of the resource
*/
else if (CUR == 0) {
htmlAutoCloseOnEnd(ctxt);
break;
}
/*
* Last case, text. Note that References are handled directly.
*/
else {
htmlParseCharData(ctxt);
}
/*
* nbChars did not change during this iteration: stop to avoid
* looping forever, reporting an error if a node is still open.
*/
if (cons == ctxt->nbChars) {
if (ctxt->node != NULL) {
htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
"detected an error in element content\n",
NULL, NULL);
}
break;
}
}
GROW;
}
if (currentNode != NULL) xmlFree(currentNode);
}
/**
* htmlParseContent:
* @ctxt: an HTML parser context
*
* Parse a content: comment, sub-element, reference or text.
* This is the entry point when called from parser.c
*/
void
__htmlParseContent(void *ctxt) {
if (ctxt != NULL)
htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
}
/**
* htmlParseDocument:
* @ctxt: an HTML parser context
@ -4323,7 +4662,7 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) {
/*
* Time to start parsing the tree itself
*/
htmlParseContent(ctxt);
htmlParseContentInternal(ctxt);
/*
* autoclose
@ -4440,6 +4779,10 @@ htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
ctxt->nameMax = 10;
ctxt->name = NULL;
ctxt->nodeInfoTab = NULL;
ctxt->nodeInfoNr = 0;
ctxt->nodeInfoMax = 0;
if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
else {
ctxt->sax = sax;