mirror of
https://gitlab.gnome.org/GNOME/libxml2.git
synced 2025-07-29 11:41:22 +03:00
Fixed problems with HTML parsing, Daniel.
This commit is contained in:
@ -1,3 +1,8 @@
|
|||||||
|
Fri Oct 8 16:35:37 CEST 1999 Daniel Veillard <Daniel.Veillard@w3.org>
|
||||||
|
|
||||||
|
* HTMLparser.c parser.h : Fixed problems with HTML parsing
|
||||||
|
reported by Kristian Hogsberg Kristensen <hogsberg@daimi.au.dk>
|
||||||
|
|
||||||
Fri Oct 8 11:37:11 CEST 1999 Daniel Veillard <Daniel.Veillard@w3.org>
|
Fri Oct 8 11:37:11 CEST 1999 Daniel Veillard <Daniel.Veillard@w3.org>
|
||||||
|
|
||||||
* tree.c : Raph patch for initialization of CORBA fields
|
* tree.c : Raph patch for initialization of CORBA fields
|
||||||
|
88
HTMLparser.c
88
HTMLparser.c
@ -87,6 +87,7 @@ type html##name##Pop(htmlParserCtxtPtr ctxt) { \
|
|||||||
} \
|
} \
|
||||||
|
|
||||||
PUSH_AND_POP(xmlNodePtr, node)
|
PUSH_AND_POP(xmlNodePtr, node)
|
||||||
|
PUSH_AND_POP(xmlChar*, name)
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Macros for accessing the content. Those should be used only by the parser,
|
* Macros for accessing the content. Those should be used only by the parser,
|
||||||
@ -300,7 +301,7 @@ char *htmlStartClose[] = {
|
|||||||
"TITLE", "P", NULL,
|
"TITLE", "P", NULL,
|
||||||
"BODY", "HEAD", "STYLE", "LINK", "TITLE", "P", NULL,
|
"BODY", "HEAD", "STYLE", "LINK", "TITLE", "P", NULL,
|
||||||
"LI", "P", "H1", "H2", "H3", "H4", "H5", "H6", "DL", "ADDRESS",
|
"LI", "P", "H1", "H2", "H3", "H4", "H5", "H6", "DL", "ADDRESS",
|
||||||
"PRE", "LISTING", "XMP", "HEAD", NULL,
|
"PRE", "LISTING", "XMP", "HEAD", "LI", NULL,
|
||||||
"HR", "P", "HEAD", NULL,
|
"HR", "P", "HEAD", NULL,
|
||||||
"H1", "P", "HEAD", NULL,
|
"H1", "P", "HEAD", NULL,
|
||||||
"H2", "P", "HEAD", NULL,
|
"H2", "P", "HEAD", NULL,
|
||||||
@ -443,14 +444,18 @@ htmlCheckAutoClose(const xmlChar *new, const xmlChar *old) {
|
|||||||
*/
|
*/
|
||||||
void
|
void
|
||||||
htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *new) {
|
htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *new) {
|
||||||
|
xmlChar *oldname;
|
||||||
|
|
||||||
while ((ctxt->node != NULL) &&
|
while ((ctxt->name != NULL) &&
|
||||||
(htmlCheckAutoClose(new, ctxt->node->name))) {
|
(htmlCheckAutoClose(new, ctxt->name))) {
|
||||||
#ifdef DEBUG
|
#ifdef DEBUG
|
||||||
printf("htmlAutoClose: %s closes %s\n", new, ctxt->node->name);
|
printf("htmlAutoClose: %s closes %s\n", new, ctxt->name);
|
||||||
#endif
|
#endif
|
||||||
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
|
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
|
||||||
ctxt->sax->endElement(ctxt->userData, ctxt->node->name);
|
ctxt->sax->endElement(ctxt->userData, ctxt->name);
|
||||||
|
oldname = ctxt->name;
|
||||||
|
htmlnamePop(ctxt);
|
||||||
|
xmlFree(oldname);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -464,16 +469,20 @@ htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *new) {
|
|||||||
void
|
void
|
||||||
htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *new) {
|
htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *new) {
|
||||||
htmlElemDescPtr info;
|
htmlElemDescPtr info;
|
||||||
|
xmlChar *oldname;
|
||||||
|
|
||||||
while ((ctxt->node != NULL) &&
|
while ((ctxt->name != NULL) &&
|
||||||
(xmlStrcmp(new, ctxt->node->name))) {
|
(xmlStrcmp(new, ctxt->name))) {
|
||||||
info = htmlTagLookup(ctxt->node->name);
|
info = htmlTagLookup(ctxt->name);
|
||||||
if ((info == NULL) || (info->endTag == 1)) {
|
if ((info == NULL) || (info->endTag == 1)) {
|
||||||
#ifdef DEBUG
|
#ifdef DEBUG
|
||||||
printf("htmlAutoCloseOnClose: %s closes %s\n", new, ctxt->node->name);
|
printf("htmlAutoCloseOnClose: %s closes %s\n", new, ctxt->name);
|
||||||
#endif
|
#endif
|
||||||
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
|
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
|
||||||
ctxt->sax->endElement(ctxt->userData, ctxt->node->name);
|
ctxt->sax->endElement(ctxt->userData, ctxt->name);
|
||||||
|
oldname = ctxt->name;
|
||||||
|
htmlnamePop(ctxt);
|
||||||
|
xmlFree(oldname);
|
||||||
} else
|
} else
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -2000,6 +2009,7 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
|
|||||||
/*
|
/*
|
||||||
* SAX: Start of Element !
|
* SAX: Start of Element !
|
||||||
*/
|
*/
|
||||||
|
htmlnamePush(ctxt, xmlStrdup(name));
|
||||||
if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
|
if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
|
||||||
ctxt->sax->startElement(ctxt->userData, name, atts);
|
ctxt->sax->startElement(ctxt->userData, name, atts);
|
||||||
|
|
||||||
@ -2027,6 +2037,7 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
|
|||||||
void
|
void
|
||||||
htmlParseEndTag(htmlParserCtxtPtr ctxt, const xmlChar *tagname) {
|
htmlParseEndTag(htmlParserCtxtPtr ctxt, const xmlChar *tagname) {
|
||||||
xmlChar *name;
|
xmlChar *name;
|
||||||
|
xmlChar *oldname;
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
if ((CUR != '<') || (NXT(1) != '/')) {
|
if ((CUR != '<') || (NXT(1) != '/')) {
|
||||||
@ -2054,9 +2065,9 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt, const xmlChar *tagname) {
|
|||||||
* Check that we are not closing an already closed tag,
|
* Check that we are not closing an already closed tag,
|
||||||
* <p><b>...</p></b> is a really common error !
|
* <p><b>...</p></b> is a really common error !
|
||||||
*/
|
*/
|
||||||
for (i = ctxt->nodeNr - 1;i >= 0;i--) {
|
for (i = ctxt->nameNr - 1;i >= 0;i--) {
|
||||||
if ((ctxt->nodeTab[i] != NULL) &&
|
if ((ctxt->nameTab[i] != NULL) &&
|
||||||
(!xmlStrcmp(tagname, ctxt->nodeTab[i]->name)))
|
(!xmlStrcmp(tagname, ctxt->nameTab[i])))
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (i < 0) {
|
if (i < 0) {
|
||||||
@ -2080,12 +2091,12 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt, const xmlChar *tagname) {
|
|||||||
* of the stack.
|
* of the stack.
|
||||||
*/
|
*/
|
||||||
if (xmlStrcmp(name, tagname)) {
|
if (xmlStrcmp(name, tagname)) {
|
||||||
if ((ctxt->node != NULL) &&
|
if ((ctxt->name != NULL) &&
|
||||||
(xmlStrcmp(ctxt->node->name, name))) {
|
(xmlStrcmp(ctxt->name, name))) {
|
||||||
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
|
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
|
||||||
ctxt->sax->error(ctxt->userData,
|
ctxt->sax->error(ctxt->userData,
|
||||||
"Opening and ending tag mismatch: %s and %s\n",
|
"Opening and ending tag mismatch: %s and %s\n",
|
||||||
name, ctxt->node->name);
|
name, ctxt->name);
|
||||||
ctxt->wellFormed = 0;
|
ctxt->wellFormed = 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -2095,6 +2106,9 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt, const xmlChar *tagname) {
|
|||||||
*/
|
*/
|
||||||
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
|
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
|
||||||
ctxt->sax->endElement(ctxt->userData, name);
|
ctxt->sax->endElement(ctxt->userData, name);
|
||||||
|
oldname = ctxt->name;
|
||||||
|
htmlnamePop(ctxt);
|
||||||
|
xmlFree(oldname);
|
||||||
|
|
||||||
if (name != NULL)
|
if (name != NULL)
|
||||||
xmlFree(name);
|
xmlFree(name);
|
||||||
@ -2157,9 +2171,9 @@ htmlParseReference(htmlParserCtxtPtr ctxt) {
|
|||||||
|
|
||||||
void
|
void
|
||||||
htmlParseContent(htmlParserCtxtPtr ctxt, const xmlChar *name) {
|
htmlParseContent(htmlParserCtxtPtr ctxt, const xmlChar *name) {
|
||||||
htmlNodePtr currentNode;
|
xmlChar *currentNode;
|
||||||
|
|
||||||
currentNode = ctxt->node;
|
currentNode = ctxt->name;
|
||||||
while ((CUR != '<') || (NXT(1) != '/')) {
|
while ((CUR != '<') || (NXT(1) != '/')) {
|
||||||
const xmlChar *test = CUR_PTR;
|
const xmlChar *test = CUR_PTR;
|
||||||
|
|
||||||
@ -2167,7 +2181,7 @@ htmlParseContent(htmlParserCtxtPtr ctxt, const xmlChar *name) {
|
|||||||
* Has this node been popped out during parsing of
|
* Has this node been popped out during parsing of
|
||||||
* the next element
|
* the next element
|
||||||
*/
|
*/
|
||||||
if (currentNode != ctxt->node) return;
|
if (currentNode != ctxt->name) return;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* First case : a comment
|
* First case : a comment
|
||||||
@ -2230,7 +2244,8 @@ void
|
|||||||
htmlParseElement(htmlParserCtxtPtr ctxt) {
|
htmlParseElement(htmlParserCtxtPtr ctxt) {
|
||||||
const xmlChar *openTag = CUR_PTR;
|
const xmlChar *openTag = CUR_PTR;
|
||||||
xmlChar *name;
|
xmlChar *name;
|
||||||
htmlNodePtr currentNode;
|
xmlChar *oldname;
|
||||||
|
xmlChar *currentNode;
|
||||||
htmlElemDescPtr info;
|
htmlElemDescPtr info;
|
||||||
htmlParserNodeInfo node_info;
|
htmlParserNodeInfo node_info;
|
||||||
|
|
||||||
@ -2245,7 +2260,6 @@ htmlParseElement(htmlParserCtxtPtr ctxt) {
|
|||||||
if (name == NULL) {
|
if (name == NULL) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
currentNode = ctxt->node;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Lookup the info for that element.
|
* Lookup the info for that element.
|
||||||
@ -2271,6 +2285,9 @@ htmlParseElement(htmlParserCtxtPtr ctxt) {
|
|||||||
SKIP(2);
|
SKIP(2);
|
||||||
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
|
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
|
||||||
ctxt->sax->endElement(ctxt->userData, name);
|
ctxt->sax->endElement(ctxt->userData, name);
|
||||||
|
oldname = ctxt->name;
|
||||||
|
htmlnamePop(ctxt);
|
||||||
|
xmlFree(oldname);
|
||||||
xmlFree(name);
|
xmlFree(name);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -2288,6 +2305,9 @@ htmlParseElement(htmlParserCtxtPtr ctxt) {
|
|||||||
*/
|
*/
|
||||||
nodePop(ctxt);
|
nodePop(ctxt);
|
||||||
xmlFree(name);
|
xmlFree(name);
|
||||||
|
oldname = ctxt->name;
|
||||||
|
htmlnamePop(ctxt);
|
||||||
|
xmlFree(oldname);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Capture end position and add node
|
* Capture end position and add node
|
||||||
@ -2296,7 +2316,7 @@ htmlParseElement(htmlParserCtxtPtr ctxt) {
|
|||||||
node_info.end_pos = ctxt->input->consumed +
|
node_info.end_pos = ctxt->input->consumed +
|
||||||
(CUR_PTR - ctxt->input->base);
|
(CUR_PTR - ctxt->input->base);
|
||||||
node_info.end_line = ctxt->input->line;
|
node_info.end_line = ctxt->input->line;
|
||||||
node_info.node = currentNode;
|
node_info.node = ctxt->node;
|
||||||
xmlParserAddNodeInfo(ctxt, &node_info);
|
xmlParserAddNodeInfo(ctxt, &node_info);
|
||||||
}
|
}
|
||||||
return;
|
return;
|
||||||
@ -2309,20 +2329,23 @@ htmlParseElement(htmlParserCtxtPtr ctxt) {
|
|||||||
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
|
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
|
||||||
ctxt->sax->endElement(ctxt->userData, name);
|
ctxt->sax->endElement(ctxt->userData, name);
|
||||||
xmlFree(name);
|
xmlFree(name);
|
||||||
|
oldname = ctxt->name;
|
||||||
|
htmlnamePop(ctxt);
|
||||||
|
xmlFree(oldname);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Parse the content of the element:
|
* Parse the content of the element:
|
||||||
*/
|
*/
|
||||||
currentNode = ctxt->node;
|
currentNode = ctxt->name;
|
||||||
htmlParseContent(ctxt, name);
|
htmlParseContent(ctxt, name);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* check whether the element get popped due to auto closure
|
* check whether the element get popped due to auto closure
|
||||||
* on start tag
|
* on start tag
|
||||||
*/
|
*/
|
||||||
if (currentNode != ctxt->node) {
|
if (currentNode != ctxt->name) {
|
||||||
xmlFree(name);
|
xmlFree(name);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -2338,6 +2361,9 @@ htmlParseElement(htmlParserCtxtPtr ctxt) {
|
|||||||
*/
|
*/
|
||||||
nodePop(ctxt);
|
nodePop(ctxt);
|
||||||
xmlFree(name);
|
xmlFree(name);
|
||||||
|
oldname = ctxt->name;
|
||||||
|
htmlnamePop(ctxt);
|
||||||
|
xmlFree(oldname);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2350,7 +2376,7 @@ htmlParseElement(htmlParserCtxtPtr ctxt) {
|
|||||||
node_info.end_pos = ctxt->input->consumed +
|
node_info.end_pos = ctxt->input->consumed +
|
||||||
(CUR_PTR - ctxt->input->base);
|
(CUR_PTR - ctxt->input->base);
|
||||||
node_info.end_line = ctxt->input->line;
|
node_info.end_line = ctxt->input->line;
|
||||||
node_info.node = currentNode;
|
node_info.node = ctxt->node;
|
||||||
xmlParserAddNodeInfo(ctxt, &node_info);
|
xmlParserAddNodeInfo(ctxt, &node_info);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -2469,6 +2495,12 @@ htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
|
|||||||
ctxt->nodeMax = 10;
|
ctxt->nodeMax = 10;
|
||||||
ctxt->node = NULL;
|
ctxt->node = NULL;
|
||||||
|
|
||||||
|
/* Allocate the Name stack */
|
||||||
|
ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
|
||||||
|
ctxt->nameNr = 0;
|
||||||
|
ctxt->nameMax = 10;
|
||||||
|
ctxt->name = NULL;
|
||||||
|
|
||||||
if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
|
if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
|
||||||
else {
|
else {
|
||||||
ctxt->sax = sax;
|
ctxt->sax = sax;
|
||||||
@ -2495,6 +2527,7 @@ void
|
|||||||
htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
|
htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
|
||||||
{
|
{
|
||||||
htmlParserInputPtr input;
|
htmlParserInputPtr input;
|
||||||
|
xmlChar *oldname;
|
||||||
|
|
||||||
if (ctxt == NULL) return;
|
if (ctxt == NULL) return;
|
||||||
|
|
||||||
@ -2503,6 +2536,11 @@ htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (ctxt->nodeTab != NULL) xmlFree(ctxt->nodeTab);
|
if (ctxt->nodeTab != NULL) xmlFree(ctxt->nodeTab);
|
||||||
|
while ((oldname = ctxt->name) != NULL) {
|
||||||
|
htmlnamePop(ctxt);
|
||||||
|
xmlFree(oldname);
|
||||||
|
}
|
||||||
|
if (ctxt->nameTab != NULL) xmlFree(ctxt->nameTab);
|
||||||
if (ctxt->inputTab != NULL) xmlFree(ctxt->inputTab);
|
if (ctxt->inputTab != NULL) xmlFree(ctxt->inputTab);
|
||||||
if (ctxt->version != NULL) xmlFree((char *) ctxt->version);
|
if (ctxt->version != NULL) xmlFree((char *) ctxt->version);
|
||||||
if ((ctxt->sax != NULL) && (ctxt->sax != &htmlDefaultSAXHandler))
|
if ((ctxt->sax != NULL) && (ctxt->sax != &htmlDefaultSAXHandler))
|
||||||
|
@ -141,6 +141,13 @@ typedef struct _xmlParserCtxt {
|
|||||||
int token; /* next char look-ahead */
|
int token; /* next char look-ahead */
|
||||||
|
|
||||||
char *directory; /* the data directory */
|
char *directory; /* the data directory */
|
||||||
|
|
||||||
|
/* Node name stack only used for HTML parsing */
|
||||||
|
xmlChar *name; /* Current parsed Node */
|
||||||
|
int nameNr; /* Depth of the parsing stack */
|
||||||
|
int nameMax; /* Max depth of the parsing stack */
|
||||||
|
xmlChar * *nameTab; /* array of nodes */
|
||||||
|
|
||||||
} _xmlParserCtxt;
|
} _xmlParserCtxt;
|
||||||
typedef _xmlParserCtxt xmlParserCtxt;
|
typedef _xmlParserCtxt xmlParserCtxt;
|
||||||
typedef xmlParserCtxt *xmlParserCtxtPtr;
|
typedef xmlParserCtxt *xmlParserCtxtPtr;
|
||||||
|
7
parser.h
7
parser.h
@ -141,6 +141,13 @@ typedef struct _xmlParserCtxt {
|
|||||||
int token; /* next char look-ahead */
|
int token; /* next char look-ahead */
|
||||||
|
|
||||||
char *directory; /* the data directory */
|
char *directory; /* the data directory */
|
||||||
|
|
||||||
|
/* Node name stack only used for HTML parsing */
|
||||||
|
xmlChar *name; /* Current parsed Node */
|
||||||
|
int nameNr; /* Depth of the parsing stack */
|
||||||
|
int nameMax; /* Max depth of the parsing stack */
|
||||||
|
xmlChar * *nameTab; /* array of nodes */
|
||||||
|
|
||||||
} _xmlParserCtxt;
|
} _xmlParserCtxt;
|
||||||
typedef _xmlParserCtxt xmlParserCtxt;
|
typedef _xmlParserCtxt xmlParserCtxt;
|
||||||
typedef xmlParserCtxt *xmlParserCtxtPtr;
|
typedef xmlParserCtxt *xmlParserCtxtPtr;
|
||||||
|
Reference in New Issue
Block a user