1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-07-29 11:41:22 +03:00

Fixed problems with HTML parsing, Daniel.

This commit is contained in:
Daniel Veillard
1999-10-08 14:37:09 +00:00
parent 00fdf370d3
commit 2673d3c856
4 changed files with 82 additions and 25 deletions

View File

@ -1,3 +1,8 @@
Fri Oct 8 16:35:37 CEST 1999 Daniel Veillard <Daniel.Veillard@w3.org>
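* HTMLparser.c parser.h : Fixed problems with HTML parsing
reported by Kristian Hogsberg Kristensen <hogsberg@daimi.au.dk>:
the HTML parser now tracks open elements on its own name stack
(name, nameNr, nameMax, nameTab) instead of reading ctxt->node->name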
Fri Oct 8 11:37:11 CEST 1999 Daniel Veillard <Daniel.Veillard@w3.org> Fri Oct 8 11:37:11 CEST 1999 Daniel Veillard <Daniel.Veillard@w3.org>
* tree.c : Raph patch for initialization of CORBA fields * tree.c : Raph patch for initialization of CORBA fields

View File

@ -87,6 +87,7 @@ type html##name##Pop(htmlParserCtxtPtr ctxt) { \
} \ } \
PUSH_AND_POP(xmlNodePtr, node) PUSH_AND_POP(xmlNodePtr, node)
PUSH_AND_POP(xmlChar*, name)
/* /*
* Macros for accessing the content. Those should be used only by the parser, * Macros for accessing the content. Those should be used only by the parser,
@ -300,7 +301,7 @@ char *htmlStartClose[] = {
"TITLE", "P", NULL, "TITLE", "P", NULL,
"BODY", "HEAD", "STYLE", "LINK", "TITLE", "P", NULL, "BODY", "HEAD", "STYLE", "LINK", "TITLE", "P", NULL,
"LI", "P", "H1", "H2", "H3", "H4", "H5", "H6", "DL", "ADDRESS", "LI", "P", "H1", "H2", "H3", "H4", "H5", "H6", "DL", "ADDRESS",
"PRE", "LISTING", "XMP", "HEAD", NULL, "PRE", "LISTING", "XMP", "HEAD", "LI", NULL,
"HR", "P", "HEAD", NULL, "HR", "P", "HEAD", NULL,
"H1", "P", "HEAD", NULL, "H1", "P", "HEAD", NULL,
"H2", "P", "HEAD", NULL, "H2", "P", "HEAD", NULL,
@ -443,14 +444,18 @@ htmlCheckAutoClose(const xmlChar *new, const xmlChar *old) {
*/ */
void void
htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *new) { htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *new) {
xmlChar *oldname;
while ((ctxt->node != NULL) && while ((ctxt->name != NULL) &&
(htmlCheckAutoClose(new, ctxt->node->name))) { (htmlCheckAutoClose(new, ctxt->name))) {
#ifdef DEBUG #ifdef DEBUG
printf("htmlAutoClose: %s closes %s\n", new, ctxt->node->name); printf("htmlAutoClose: %s closes %s\n", new, ctxt->name);
#endif #endif
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
ctxt->sax->endElement(ctxt->userData, ctxt->node->name); ctxt->sax->endElement(ctxt->userData, ctxt->name);
oldname = ctxt->name;
htmlnamePop(ctxt);
xmlFree(oldname);
} }
} }
@ -464,16 +469,20 @@ htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *new) {
void void
htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *new) { htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *new) {
htmlElemDescPtr info; htmlElemDescPtr info;
xmlChar *oldname;
while ((ctxt->node != NULL) && while ((ctxt->name != NULL) &&
(xmlStrcmp(new, ctxt->node->name))) { (xmlStrcmp(new, ctxt->name))) {
info = htmlTagLookup(ctxt->node->name); info = htmlTagLookup(ctxt->name);
if ((info == NULL) || (info->endTag == 1)) { if ((info == NULL) || (info->endTag == 1)) {
#ifdef DEBUG #ifdef DEBUG
printf("htmlAutoCloseOnClose: %s closes %s\n", new, ctxt->node->name); printf("htmlAutoCloseOnClose: %s closes %s\n", new, ctxt->name);
#endif #endif
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
ctxt->sax->endElement(ctxt->userData, ctxt->node->name); ctxt->sax->endElement(ctxt->userData, ctxt->name);
oldname = ctxt->name;
htmlnamePop(ctxt);
xmlFree(oldname);
} else } else
break; break;
} }
@ -2000,6 +2009,7 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
/* /*
* SAX: Start of Element ! * SAX: Start of Element !
*/ */
htmlnamePush(ctxt, xmlStrdup(name));
if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
ctxt->sax->startElement(ctxt->userData, name, atts); ctxt->sax->startElement(ctxt->userData, name, atts);
@ -2027,6 +2037,7 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
void void
htmlParseEndTag(htmlParserCtxtPtr ctxt, const xmlChar *tagname) { htmlParseEndTag(htmlParserCtxtPtr ctxt, const xmlChar *tagname) {
xmlChar *name; xmlChar *name;
xmlChar *oldname;
int i; int i;
if ((CUR != '<') || (NXT(1) != '/')) { if ((CUR != '<') || (NXT(1) != '/')) {
@ -2054,9 +2065,9 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt, const xmlChar *tagname) {
* Check that we are not closing an already closed tag, * Check that we are not closing an already closed tag,
* <p><b>...</p></b> is a really common error ! * <p><b>...</p></b> is a really common error !
*/ */
for (i = ctxt->nodeNr - 1;i >= 0;i--) { for (i = ctxt->nameNr - 1;i >= 0;i--) {
if ((ctxt->nodeTab[i] != NULL) && if ((ctxt->nameTab[i] != NULL) &&
(!xmlStrcmp(tagname, ctxt->nodeTab[i]->name))) (!xmlStrcmp(tagname, ctxt->nameTab[i])))
break; break;
} }
if (i < 0) { if (i < 0) {
@ -2080,12 +2091,12 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt, const xmlChar *tagname) {
* of the stack. * of the stack.
*/ */
if (xmlStrcmp(name, tagname)) { if (xmlStrcmp(name, tagname)) {
if ((ctxt->node != NULL) && if ((ctxt->name != NULL) &&
(xmlStrcmp(ctxt->node->name, name))) { (xmlStrcmp(ctxt->name, name))) {
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData, ctxt->sax->error(ctxt->userData,
"Opening and ending tag mismatch: %s and %s\n", "Opening and ending tag mismatch: %s and %s\n",
name, ctxt->node->name); name, ctxt->name);
ctxt->wellFormed = 0; ctxt->wellFormed = 0;
} }
} }
@ -2095,6 +2106,9 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt, const xmlChar *tagname) {
*/ */
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
ctxt->sax->endElement(ctxt->userData, name); ctxt->sax->endElement(ctxt->userData, name);
oldname = ctxt->name;
htmlnamePop(ctxt);
xmlFree(oldname);
if (name != NULL) if (name != NULL)
xmlFree(name); xmlFree(name);
@ -2157,9 +2171,9 @@ htmlParseReference(htmlParserCtxtPtr ctxt) {
void void
htmlParseContent(htmlParserCtxtPtr ctxt, const xmlChar *name) { htmlParseContent(htmlParserCtxtPtr ctxt, const xmlChar *name) {
htmlNodePtr currentNode; xmlChar *currentNode;
currentNode = ctxt->node; currentNode = ctxt->name;
while ((CUR != '<') || (NXT(1) != '/')) { while ((CUR != '<') || (NXT(1) != '/')) {
const xmlChar *test = CUR_PTR; const xmlChar *test = CUR_PTR;
@ -2167,7 +2181,7 @@ htmlParseContent(htmlParserCtxtPtr ctxt, const xmlChar *name) {
* Has this node been popped out during parsing of * Has this node been popped out during parsing of
* the next element * the next element
*/ */
if (currentNode != ctxt->node) return; if (currentNode != ctxt->name) return;
/* /*
* First case : a comment * First case : a comment
@ -2230,7 +2244,8 @@ void
htmlParseElement(htmlParserCtxtPtr ctxt) { htmlParseElement(htmlParserCtxtPtr ctxt) {
const xmlChar *openTag = CUR_PTR; const xmlChar *openTag = CUR_PTR;
xmlChar *name; xmlChar *name;
htmlNodePtr currentNode; xmlChar *oldname;
xmlChar *currentNode;
htmlElemDescPtr info; htmlElemDescPtr info;
htmlParserNodeInfo node_info; htmlParserNodeInfo node_info;
@ -2245,7 +2260,6 @@ htmlParseElement(htmlParserCtxtPtr ctxt) {
if (name == NULL) { if (name == NULL) {
return; return;
} }
currentNode = ctxt->node;
/* /*
* Lookup the info for that element. * Lookup the info for that element.
@ -2271,6 +2285,9 @@ htmlParseElement(htmlParserCtxtPtr ctxt) {
SKIP(2); SKIP(2);
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
ctxt->sax->endElement(ctxt->userData, name); ctxt->sax->endElement(ctxt->userData, name);
oldname = ctxt->name;
htmlnamePop(ctxt);
xmlFree(oldname);
xmlFree(name); xmlFree(name);
return; return;
} }
@ -2288,6 +2305,9 @@ htmlParseElement(htmlParserCtxtPtr ctxt) {
*/ */
nodePop(ctxt); nodePop(ctxt);
xmlFree(name); xmlFree(name);
oldname = ctxt->name;
htmlnamePop(ctxt);
xmlFree(oldname);
/* /*
* Capture end position and add node * Capture end position and add node
@ -2296,7 +2316,7 @@ htmlParseElement(htmlParserCtxtPtr ctxt) {
node_info.end_pos = ctxt->input->consumed + node_info.end_pos = ctxt->input->consumed +
(CUR_PTR - ctxt->input->base); (CUR_PTR - ctxt->input->base);
node_info.end_line = ctxt->input->line; node_info.end_line = ctxt->input->line;
node_info.node = currentNode; node_info.node = ctxt->node;
xmlParserAddNodeInfo(ctxt, &node_info); xmlParserAddNodeInfo(ctxt, &node_info);
} }
return; return;
@ -2309,20 +2329,23 @@ htmlParseElement(htmlParserCtxtPtr ctxt) {
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
ctxt->sax->endElement(ctxt->userData, name); ctxt->sax->endElement(ctxt->userData, name);
xmlFree(name); xmlFree(name);
oldname = ctxt->name;
htmlnamePop(ctxt);
xmlFree(oldname);
return; return;
} }
/* /*
* Parse the content of the element: * Parse the content of the element:
*/ */
currentNode = ctxt->node; currentNode = ctxt->name;
htmlParseContent(ctxt, name); htmlParseContent(ctxt, name);
/* /*
* check whether the element get popped due to auto closure * check whether the element get popped due to auto closure
* on start tag * on start tag
*/ */
if (currentNode != ctxt->node) { if (currentNode != ctxt->name) {
xmlFree(name); xmlFree(name);
return; return;
} }
@ -2338,6 +2361,9 @@ htmlParseElement(htmlParserCtxtPtr ctxt) {
*/ */
nodePop(ctxt); nodePop(ctxt);
xmlFree(name); xmlFree(name);
oldname = ctxt->name;
htmlnamePop(ctxt);
xmlFree(oldname);
return; return;
} }
@ -2350,7 +2376,7 @@ htmlParseElement(htmlParserCtxtPtr ctxt) {
node_info.end_pos = ctxt->input->consumed + node_info.end_pos = ctxt->input->consumed +
(CUR_PTR - ctxt->input->base); (CUR_PTR - ctxt->input->base);
node_info.end_line = ctxt->input->line; node_info.end_line = ctxt->input->line;
node_info.node = currentNode; node_info.node = ctxt->node;
xmlParserAddNodeInfo(ctxt, &node_info); xmlParserAddNodeInfo(ctxt, &node_info);
} }
} }
@ -2469,6 +2495,12 @@ htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
ctxt->nodeMax = 10; ctxt->nodeMax = 10;
ctxt->node = NULL; ctxt->node = NULL;
/* Allocate the Name stack */
ctxt->nameTab = (xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
ctxt->nameNr = 0;
ctxt->nameMax = 10;
ctxt->name = NULL;
if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler; if (sax == NULL) ctxt->sax = &htmlDefaultSAXHandler;
else { else {
ctxt->sax = sax; ctxt->sax = sax;
@ -2495,6 +2527,7 @@ void
htmlFreeParserCtxt(htmlParserCtxtPtr ctxt) htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
{ {
htmlParserInputPtr input; htmlParserInputPtr input;
xmlChar *oldname;
if (ctxt == NULL) return; if (ctxt == NULL) return;
@ -2503,6 +2536,11 @@ htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
} }
if (ctxt->nodeTab != NULL) xmlFree(ctxt->nodeTab); if (ctxt->nodeTab != NULL) xmlFree(ctxt->nodeTab);
while ((oldname = ctxt->name) != NULL) {
htmlnamePop(ctxt);
xmlFree(oldname);
}
if (ctxt->nameTab != NULL) xmlFree(ctxt->nameTab);
if (ctxt->inputTab != NULL) xmlFree(ctxt->inputTab); if (ctxt->inputTab != NULL) xmlFree(ctxt->inputTab);
if (ctxt->version != NULL) xmlFree((char *) ctxt->version); if (ctxt->version != NULL) xmlFree((char *) ctxt->version);
if ((ctxt->sax != NULL) && (ctxt->sax != &htmlDefaultSAXHandler)) if ((ctxt->sax != NULL) && (ctxt->sax != &htmlDefaultSAXHandler))

View File

@ -141,6 +141,13 @@ typedef struct _xmlParserCtxt {
int token; /* next char look-ahead */ int token; /* next char look-ahead */
char *directory; /* the data directory */ char *directory; /* the data directory */
/* Node name stack only used for HTML parsing */
xmlChar *name; /* Name of the node currently being parsed */
int nameNr; /* Depth of the parsing stack */
int nameMax; /* Max depth of the parsing stack */
xmlChar * *nameTab; /* array of node names */
} _xmlParserCtxt; } _xmlParserCtxt;
typedef _xmlParserCtxt xmlParserCtxt; typedef _xmlParserCtxt xmlParserCtxt;
typedef xmlParserCtxt *xmlParserCtxtPtr; typedef xmlParserCtxt *xmlParserCtxtPtr;

View File

@ -141,6 +141,13 @@ typedef struct _xmlParserCtxt {
int token; /* next char look-ahead */ int token; /* next char look-ahead */
char *directory; /* the data directory */ char *directory; /* the data directory */
/* Node name stack only used for HTML parsing */
xmlChar *name; /* Name of the node currently being parsed */
int nameNr; /* Depth of the parsing stack */
int nameMax; /* Max depth of the parsing stack */
xmlChar * *nameTab; /* array of node names */
} _xmlParserCtxt; } _xmlParserCtxt;
typedef _xmlParserCtxt xmlParserCtxt; typedef _xmlParserCtxt xmlParserCtxt;
typedef xmlParserCtxt *xmlParserCtxtPtr; typedef xmlParserCtxt *xmlParserCtxtPtr;