mirror of
https://gitlab.gnome.org/GNOME/libxml2.git
synced 2025-07-28 00:21:53 +03:00
more cleanup of the HTML parser to force it to not bypass SAX, Daniel.
Ready for 2.1.1 it seems
This commit is contained in:
@ -1,3 +1,8 @@
|
|||||||
|
Fri Jun 30 20:29:08 MEST 2000
|
||||||
|
|
||||||
|
* HTMLparser.c HTMLtree.c SAX.c valid.c tree.h : more cleanup
|
||||||
|
of the HTML parser to force it to not bypass SAX
|
||||||
|
|
||||||
Fri Jun 30 11:19:59 CEST 2000 Daniel Veillard <Daniel.Veillard@w3.org>
|
Fri Jun 30 11:19:59 CEST 2000 Daniel Veillard <Daniel.Veillard@w3.org>
|
||||||
|
|
||||||
* win32config.h.in: updated
|
* win32config.h.in: updated
|
||||||
|
35
HTMLparser.c
35
HTMLparser.c
@ -618,7 +618,7 @@ htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar *new) {
|
|||||||
*/
|
*/
|
||||||
void
|
void
|
||||||
htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *new) {
|
htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *new) {
|
||||||
if (!strcmp(new, "html"))
|
if (!xmlStrcmp(new, BAD_CAST"html"))
|
||||||
return;
|
return;
|
||||||
if (ctxt->nameNr <= 0) {
|
if (ctxt->nameNr <= 0) {
|
||||||
#ifdef DEBUG
|
#ifdef DEBUG
|
||||||
@ -628,12 +628,15 @@ htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *new) {
|
|||||||
if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
|
if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
|
||||||
ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
|
ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
|
||||||
}
|
}
|
||||||
if ((!strcmp(new, "body")) || (!strcmp(new, "head")))
|
if ((!xmlStrcmp(new, BAD_CAST"body")) || (!xmlStrcmp(new, BAD_CAST"head")))
|
||||||
return;
|
return;
|
||||||
if (ctxt->nameNr <= 1) {
|
if (ctxt->nameNr <= 1) {
|
||||||
if ((!strcmp(new, "script")) || (!strcmp(new, "style")) ||
|
if ((!xmlStrcmp(new, BAD_CAST"script")) ||
|
||||||
(!strcmp(new, "meta")) || (!strcmp(new, "link")) ||
|
(!xmlStrcmp(new, BAD_CAST"style")) ||
|
||||||
(!strcmp(new, "title")) || (!strcmp(new, "base"))) {
|
(!xmlStrcmp(new, BAD_CAST"meta")) ||
|
||||||
|
(!xmlStrcmp(new, BAD_CAST"link")) ||
|
||||||
|
(!xmlStrcmp(new, BAD_CAST"title")) ||
|
||||||
|
(!xmlStrcmp(new, BAD_CAST"base"))) {
|
||||||
/*
|
/*
|
||||||
* dropped OBJECT ... if you put it first BODY will be
|
* dropped OBJECT ... if you put it first BODY will be
|
||||||
* assumed !
|
* assumed !
|
||||||
@ -2152,17 +2155,15 @@ htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
|
|||||||
ctxt->sax->error(ctxt->userData, "DOCTYPE unproperly terminated\n");
|
ctxt->sax->error(ctxt->userData, "DOCTYPE unproperly terminated\n");
|
||||||
ctxt->wellFormed = 0;
|
ctxt->wellFormed = 0;
|
||||||
/* We shouldn't try to resynchronize ... */
|
/* We shouldn't try to resynchronize ... */
|
||||||
} else {
|
|
||||||
}
|
}
|
||||||
NEXT;
|
NEXT;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Create the document accordingly to the DOCTYPE
|
* Create or update the document accordingly to the DOCTYPE
|
||||||
*/
|
*/
|
||||||
if (ctxt->myDoc != NULL)
|
if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
|
||||||
xmlFreeDoc(ctxt->myDoc);
|
(!ctxt->disableSAX))
|
||||||
|
ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
|
||||||
ctxt->myDoc = htmlNewDoc(URI, ExternalID);
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Cleanup, since we don't use all those identifiers
|
* Cleanup, since we don't use all those identifiers
|
||||||
@ -2845,13 +2846,6 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) {
|
|||||||
}
|
}
|
||||||
SKIP_BLANKS;
|
SKIP_BLANKS;
|
||||||
|
|
||||||
/*
|
|
||||||
* Create the document if not done already.
|
|
||||||
*/
|
|
||||||
if (ctxt->myDoc == NULL) {
|
|
||||||
ctxt->myDoc = htmlNewDoc(NULL, NULL);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Time to start parsing the tree itself
|
* Time to start parsing the tree itself
|
||||||
*/
|
*/
|
||||||
@ -3171,6 +3165,10 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
|||||||
if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
|
if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
|
||||||
ctxt->sax->setDocumentLocator(ctxt->userData,
|
ctxt->sax->setDocumentLocator(ctxt->userData,
|
||||||
&xmlDefaultSAXLocator);
|
&xmlDefaultSAXLocator);
|
||||||
|
if ((ctxt->sax) && (ctxt->sax->startDocument) &&
|
||||||
|
(!ctxt->disableSAX))
|
||||||
|
ctxt->sax->startDocument(ctxt->userData);
|
||||||
|
|
||||||
cur = in->cur[0];
|
cur = in->cur[0];
|
||||||
next = in->cur[1];
|
next = in->cur[1];
|
||||||
if ((cur == '<') && (next == '!') &&
|
if ((cur == '<') && (next == '!') &&
|
||||||
@ -3190,7 +3188,6 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
|||||||
fprintf(stderr, "HPP: entering PROLOG\n");
|
fprintf(stderr, "HPP: entering PROLOG\n");
|
||||||
#endif
|
#endif
|
||||||
} else {
|
} else {
|
||||||
ctxt->myDoc = htmlNewDoc(NULL, NULL);
|
|
||||||
ctxt->instate = XML_PARSER_MISC;
|
ctxt->instate = XML_PARSER_MISC;
|
||||||
}
|
}
|
||||||
#ifdef DEBUG_PUSH
|
#ifdef DEBUG_PUSH
|
||||||
|
@ -158,6 +158,8 @@ htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
|
|||||||
/*
|
/*
|
||||||
* Special cases.
|
* Special cases.
|
||||||
*/
|
*/
|
||||||
|
if (cur->type == XML_DTD_NODE)
|
||||||
|
return;
|
||||||
if (cur->type == XML_HTML_DOCUMENT_NODE) {
|
if (cur->type == XML_HTML_DOCUMENT_NODE) {
|
||||||
htmlDocContentDump(buf, (xmlDocPtr) cur);
|
htmlDocContentDump(buf, (xmlDocPtr) cur);
|
||||||
return;
|
return;
|
||||||
|
18
SAX.c
18
SAX.c
@ -25,6 +25,7 @@
|
|||||||
#include <libxml/xmlIO.h>
|
#include <libxml/xmlIO.h>
|
||||||
#include <libxml/SAX.h>
|
#include <libxml/SAX.h>
|
||||||
#include <libxml/uri.h>
|
#include <libxml/uri.h>
|
||||||
|
#include <libxml/HTMLtree.h>
|
||||||
|
|
||||||
/* #define DEBUG_SAX */
|
/* #define DEBUG_SAX */
|
||||||
/* #define DEBUG_SAX_TREE */
|
/* #define DEBUG_SAX_TREE */
|
||||||
@ -157,11 +158,22 @@ internalSubset(void *ctx, const xmlChar *name,
|
|||||||
const xmlChar *ExternalID, const xmlChar *SystemID)
|
const xmlChar *ExternalID, const xmlChar *SystemID)
|
||||||
{
|
{
|
||||||
xmlParserCtxtPtr ctxt = (xmlParserCtxtPtr) ctx;
|
xmlParserCtxtPtr ctxt = (xmlParserCtxtPtr) ctx;
|
||||||
|
xmlDtdPtr dtd;
|
||||||
#ifdef DEBUG_SAX
|
#ifdef DEBUG_SAX
|
||||||
fprintf(stderr, "SAX.internalSubset(%s, %s, %s)\n",
|
fprintf(stderr, "SAX.internalSubset(%s, %s, %s)\n",
|
||||||
name, ExternalID, SystemID);
|
name, ExternalID, SystemID);
|
||||||
#endif
|
#endif
|
||||||
xmlCreateIntSubset(ctxt->myDoc, name, ExternalID, SystemID);
|
|
||||||
|
if (ctxt->myDoc == NULL)
|
||||||
|
return;
|
||||||
|
dtd = xmlGetIntSubset(ctxt->myDoc);
|
||||||
|
if (dtd != NULL) {
|
||||||
|
xmlUnlinkNode((xmlNodePtr) dtd);
|
||||||
|
xmlFreeDtd(dtd);
|
||||||
|
ctxt->myDoc->intSubset = NULL;
|
||||||
|
}
|
||||||
|
ctxt->myDoc->intSubset =
|
||||||
|
xmlCreateIntSubset(ctxt->myDoc, name, ExternalID, SystemID);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -1485,7 +1497,7 @@ xmlDefaultSAXHandlerInit(void)
|
|||||||
* Default handler for HTML, builds the DOM tree
|
* Default handler for HTML, builds the DOM tree
|
||||||
*/
|
*/
|
||||||
xmlSAXHandler htmlDefaultSAXHandler = {
|
xmlSAXHandler htmlDefaultSAXHandler = {
|
||||||
NULL,
|
internalSubset,
|
||||||
NULL,
|
NULL,
|
||||||
NULL,
|
NULL,
|
||||||
NULL,
|
NULL,
|
||||||
@ -1522,7 +1534,7 @@ xmlSAXHandler htmlDefaultSAXHandler = {
|
|||||||
void
|
void
|
||||||
htmlDefaultSAXHandlerInit(void)
|
htmlDefaultSAXHandlerInit(void)
|
||||||
{
|
{
|
||||||
htmlDefaultSAXHandler.internalSubset = NULL;
|
htmlDefaultSAXHandler.internalSubset = internalSubset;
|
||||||
htmlDefaultSAXHandler.externalSubset = NULL;
|
htmlDefaultSAXHandler.externalSubset = NULL;
|
||||||
htmlDefaultSAXHandler.isStandalone = NULL;
|
htmlDefaultSAXHandler.isStandalone = NULL;
|
||||||
htmlDefaultSAXHandler.hasInternalSubset = NULL;
|
htmlDefaultSAXHandler.hasInternalSubset = NULL;
|
||||||
|
@ -48,7 +48,7 @@ mail</a>:</p>
|
|||||||
Use <strong>xmlDocGetRootElement(doc)</strong> to get the root element of
|
Use <strong>xmlDocGetRootElement(doc)</strong> to get the root element of
|
||||||
a document. Alternatively if you are sure to not reference Dtds nor have
|
a document. Alternatively if you are sure to not reference Dtds nor have
|
||||||
PIs or comments before or after the root element s/->root/->children/g
|
PIs or comments before or after the root element s/->root/->children/g
|
||||||
will probably do it. </li>
|
will probably do it.</li>
|
||||||
<li>The white space issue, this one is more complex, unless special case of
|
<li>The white space issue, this one is more complex, unless special case of
|
||||||
validating parsing, the line breaks and spaces usually used for indenting
|
validating parsing, the line breaks and spaces usually used for indenting
|
||||||
and formatting the document content become significant. So they are
|
and formatting the document content become significant. So they are
|
||||||
@ -90,7 +90,7 @@ They offers the following:</p>
|
|||||||
<strong>#include<libxml/...></strong> in both cases.</li>
|
<strong>#include<libxml/...></strong> in both cases.</li>
|
||||||
<li>similar identifiers defined via macros for the child and root fields:
|
<li>similar identifiers defined via macros for the child and root fields:
|
||||||
respectively <strong>xmlChildrenNode</strong> and
|
respectively <strong>xmlChildrenNode</strong> and
|
||||||
<strong>xmlRootNode</strong> </li>
|
<strong>xmlRootNode</strong></li>
|
||||||
<li>a new macro <strong>LIBXML_TEST_VERSION</strong> which should be
|
<li>a new macro <strong>LIBXML_TEST_VERSION</strong> which should be
|
||||||
inserted once in the client code</li>
|
inserted once in the client code</li>
|
||||||
</ol>
|
</ol>
|
||||||
@ -118,7 +118,7 @@ following:</p>
|
|||||||
<strong>LIBXML_TEST_VERSION</strong> is a fine place).</li>
|
<strong>LIBXML_TEST_VERSION</strong> is a fine place).</li>
|
||||||
</ol>
|
</ol>
|
||||||
|
|
||||||
<p>Following those 3 steps should work. It worked for some of my own code.</p>
|
<p>Following those steps should work. It worked for some of my own code.</p>
|
||||||
|
|
||||||
<p>Let me put some emphasis on the fact that there are far more changes from
|
<p>Let me put some emphasis on the fact that there are far more changes from
|
||||||
libxml 1.x to 2.x than the ones you may have to patch for. The overall code
|
libxml 1.x to 2.x than the ones you may have to patch for. The overall code
|
||||||
@ -128,6 +128,6 @@ upgrade, it may cost a lot on the long term ...</p>
|
|||||||
|
|
||||||
<p><a href="mailto:Daniel.Veillard@w3.org">Daniel Veillard</a></p>
|
<p><a href="mailto:Daniel.Veillard@w3.org">Daniel Veillard</a></p>
|
||||||
|
|
||||||
<p>$Id: upgrade.html,v 1.5 2000/05/06 08:11:18 veillard Exp $</p>
|
<p>$Id: upgrade.html,v 1.6 2000/06/29 00:43:26 veillard Exp $</p>
|
||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
||||||
|
@ -414,6 +414,7 @@ xmlDtdPtr xmlNewDtd (xmlDocPtr doc,
|
|||||||
const xmlChar *name,
|
const xmlChar *name,
|
||||||
const xmlChar *ExternalID,
|
const xmlChar *ExternalID,
|
||||||
const xmlChar *SystemID);
|
const xmlChar *SystemID);
|
||||||
|
xmlDtdPtr xmlGetIntSubset (xmlDocPtr doc);
|
||||||
void xmlFreeDtd (xmlDtdPtr cur);
|
void xmlFreeDtd (xmlDtdPtr cur);
|
||||||
xmlNsPtr xmlNewGlobalNs (xmlDocPtr doc,
|
xmlNsPtr xmlNewGlobalNs (xmlDocPtr doc,
|
||||||
const xmlChar *href,
|
const xmlChar *href,
|
||||||
|
1
tree.h
1
tree.h
@ -414,6 +414,7 @@ xmlDtdPtr xmlNewDtd (xmlDocPtr doc,
|
|||||||
const xmlChar *name,
|
const xmlChar *name,
|
||||||
const xmlChar *ExternalID,
|
const xmlChar *ExternalID,
|
||||||
const xmlChar *SystemID);
|
const xmlChar *SystemID);
|
||||||
|
xmlDtdPtr xmlGetIntSubset (xmlDocPtr doc);
|
||||||
void xmlFreeDtd (xmlDtdPtr cur);
|
void xmlFreeDtd (xmlDtdPtr cur);
|
||||||
xmlNsPtr xmlNewGlobalNs (xmlDocPtr doc,
|
xmlNsPtr xmlNewGlobalNs (xmlDocPtr doc,
|
||||||
const xmlChar *href,
|
const xmlChar *href,
|
||||||
|
3
valid.c
3
valid.c
@ -2031,6 +2031,9 @@ xmlIsRef(xmlDocPtr doc, xmlNodePtr elem, xmlAttrPtr attr) {
|
|||||||
((attr->name[1] == 'D') || (attr->name[1] == 'd')) &&
|
((attr->name[1] == 'D') || (attr->name[1] == 'd')) &&
|
||||||
(attr->name[2] == 0)) return(1);
|
(attr->name[2] == 0)) return(1);
|
||||||
*******************/
|
*******************/
|
||||||
|
} else if (doc->type == XML_HTML_DOCUMENT_NODE) {
|
||||||
|
/* TODO @@@ */
|
||||||
|
return(0);
|
||||||
} else {
|
} else {
|
||||||
xmlAttributePtr attrDecl;
|
xmlAttributePtr attrDecl;
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user