mirror of
https://gitlab.gnome.org/GNOME/libxml2.git
synced 2025-07-28 00:21:53 +03:00
work done on auto-opening of <p> tags and cleanup of SAX output, Daniel.
This commit is contained in:
155
HTMLparser.c
155
HTMLparser.c
@ -552,6 +552,20 @@ char *htmlStartClose[] = {
|
||||
NULL
|
||||
};
|
||||
|
||||
/*
|
||||
* The list of HTML elements which are supposed not to have
|
||||
* CDATA content and where a p element will be implied
|
||||
*
|
||||
* TODO: extend that list by reading the HTML SGML DtD on
|
||||
* implied paragraph
|
||||
*/
|
||||
static char *htmlNoContentElements[] = {
|
||||
"html",
|
||||
"head",
|
||||
"body",
|
||||
NULL
|
||||
};
|
||||
|
||||
|
||||
static char** htmlStartCloseIndex[100];
|
||||
static int htmlStartCloseIndexinitialized = 0;
|
||||
@ -845,6 +859,49 @@ htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* htmlCheckParagraph
|
||||
* @ctxt: an HTML parser context
|
||||
*
|
||||
* Check whether a p element need to be implied before inserting
|
||||
* characters in the current element.
|
||||
*
|
||||
* Returns 1 if a paragraph has been inserted, 0 if not and -1
|
||||
* in case of error.
|
||||
*/
|
||||
|
||||
int
|
||||
htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
|
||||
const xmlChar *tag;
|
||||
int i;
|
||||
|
||||
if (ctxt == NULL)
|
||||
return(-1);
|
||||
tag = ctxt->name;
|
||||
if (tag == NULL) {
|
||||
htmlAutoClose(ctxt, BAD_CAST"p");
|
||||
htmlCheckImplied(ctxt, BAD_CAST"p");
|
||||
htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
|
||||
if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
|
||||
ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
|
||||
return(1);
|
||||
}
|
||||
for (i = 0; htmlNoContentElements[i] != NULL; i++) {
|
||||
if (!xmlStrcmp(tag, BAD_CAST htmlNoContentElements[i])) {
|
||||
#ifdef DEBUG
|
||||
fprintf(stderr,"Implied element paragraph\n");
|
||||
#endif
|
||||
htmlAutoClose(ctxt, BAD_CAST"p");
|
||||
htmlCheckImplied(ctxt, BAD_CAST"p");
|
||||
htmlnamePush(ctxt, xmlStrdup(BAD_CAST"p"));
|
||||
if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
|
||||
ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
|
||||
return(1);
|
||||
}
|
||||
}
|
||||
return(0);
|
||||
}
|
||||
|
||||
/************************************************************************
|
||||
* *
|
||||
* The list of HTML predefined entities *
|
||||
@ -1253,7 +1310,8 @@ UTF8ToHtml(unsigned char* out, int *outlen,
|
||||
sizeof(html40EntitiesTable[0]));i++) {
|
||||
if (html40EntitiesTable[i].value == c) {
|
||||
#ifdef DEBUG
|
||||
fprintf(stderr,"Found entity %s\n", name);
|
||||
fprintf(stderr,"Found entity %s\n",
|
||||
html40EntitiesTable[i].name);
|
||||
#endif
|
||||
goto found_ent;
|
||||
}
|
||||
@ -1496,20 +1554,21 @@ htmlHandleEntity(htmlParserCtxtPtr ctxt, xmlEntityPtr entity) {
|
||||
/*
|
||||
* Just handle the content as a set of chars.
|
||||
*/
|
||||
htmlCheckParagraph(ctxt);
|
||||
if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
|
||||
ctxt->sax->characters(ctxt->userData, entity->content, len);
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* htmlNewDoc:
|
||||
* htmlNewDocNoDtD:
|
||||
* @URI: URI for the dtd, or NULL
|
||||
* @ExternalID: the external ID of the DTD, or NULL
|
||||
*
|
||||
* Returns a new document
|
||||
* Returns a new document, do not intialize the DTD if not provided
|
||||
*/
|
||||
htmlDocPtr
|
||||
htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
|
||||
htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
|
||||
xmlDocPtr cur;
|
||||
|
||||
/*
|
||||
@ -1525,12 +1584,8 @@ htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
|
||||
cur->type = XML_HTML_DOCUMENT_NODE;
|
||||
cur->version = NULL;
|
||||
cur->intSubset = NULL;
|
||||
if ((ExternalID == NULL) &&
|
||||
(URI == NULL))
|
||||
xmlCreateIntSubset(cur, BAD_CAST "HTML",
|
||||
BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
|
||||
BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
|
||||
else
|
||||
if ((ExternalID != NULL) ||
|
||||
(URI != NULL))
|
||||
xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI);
|
||||
cur->doc = cur;
|
||||
cur->name = NULL;
|
||||
@ -1548,6 +1603,23 @@ htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
|
||||
return(cur);
|
||||
}
|
||||
|
||||
/**
|
||||
* htmlNewDoc:
|
||||
* @URI: URI for the dtd, or NULL
|
||||
* @ExternalID: the external ID of the DTD, or NULL
|
||||
*
|
||||
* Returns a new document
|
||||
*/
|
||||
htmlDocPtr
|
||||
htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
|
||||
if ((URI == NULL) && (ExternalID == NULL))
|
||||
return(htmlNewDocNoDtD(
|
||||
BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
|
||||
BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd"));
|
||||
|
||||
return(htmlNewDocNoDtD(URI, ExternalID));
|
||||
}
|
||||
|
||||
|
||||
/************************************************************************
|
||||
* *
|
||||
@ -2062,6 +2134,7 @@ htmlParseCharData(htmlParserCtxtPtr ctxt, int cdata) {
|
||||
ctxt->sax->ignorableWhitespace(ctxt->userData,
|
||||
buf, nbchar);
|
||||
} else {
|
||||
htmlCheckParagraph(ctxt);
|
||||
if (ctxt->sax->characters != NULL)
|
||||
ctxt->sax->characters(ctxt->userData, buf, nbchar);
|
||||
}
|
||||
@ -2080,6 +2153,7 @@ htmlParseCharData(htmlParserCtxtPtr ctxt, int cdata) {
|
||||
if (ctxt->sax->ignorableWhitespace != NULL)
|
||||
ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
|
||||
} else {
|
||||
htmlCheckParagraph(ctxt);
|
||||
if (ctxt->sax->characters != NULL)
|
||||
ctxt->sax->characters(ctxt->userData, buf, nbchar);
|
||||
}
|
||||
@ -2861,16 +2935,19 @@ htmlParseReference(htmlParserCtxtPtr ctxt) {
|
||||
}
|
||||
out[i] = 0;
|
||||
|
||||
htmlCheckParagraph(ctxt);
|
||||
if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
|
||||
ctxt->sax->characters(ctxt->userData, out, i);
|
||||
} else {
|
||||
ent = htmlParseEntityRef(ctxt, &name);
|
||||
if (name == NULL) {
|
||||
htmlCheckParagraph(ctxt);
|
||||
if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
|
||||
ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
|
||||
return;
|
||||
}
|
||||
if ((ent == NULL) || (ent->value <= 0)) {
|
||||
htmlCheckParagraph(ctxt);
|
||||
if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
|
||||
ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
|
||||
ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
|
||||
@ -2895,6 +2972,7 @@ htmlParseReference(htmlParserCtxtPtr ctxt) {
|
||||
}
|
||||
out[i] = 0;
|
||||
|
||||
htmlCheckParagraph(ctxt);
|
||||
if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
|
||||
ctxt->sax->characters(ctxt->userData, out, i);
|
||||
}
|
||||
@ -2941,6 +3019,21 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* Sometimes DOCTYPE arrives in the middle of the document
|
||||
*/
|
||||
if ((CUR == '<') && (NXT(1) == '!') &&
|
||||
(UPP(2) == 'D') && (UPP(3) == 'O') &&
|
||||
(UPP(4) == 'C') && (UPP(5) == 'T') &&
|
||||
(UPP(6) == 'Y') && (UPP(7) == 'P') &&
|
||||
(UPP(8) == 'E')) {
|
||||
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
|
||||
ctxt->sax->error(ctxt->userData,
|
||||
"Misplaced DOCTYPE declaration\n");
|
||||
ctxt->wellFormed = 0;
|
||||
htmlParseDocTypeDecl(ctxt);
|
||||
}
|
||||
|
||||
/*
|
||||
* First case : a comment
|
||||
*/
|
||||
@ -3185,6 +3278,8 @@ htmlParseElement(htmlParserCtxtPtr ctxt) {
|
||||
|
||||
int
|
||||
htmlParseDocument(htmlParserCtxtPtr ctxt) {
|
||||
xmlDtdPtr dtd;
|
||||
|
||||
htmlDefaultSAXHandlerInit();
|
||||
ctxt->html = 1;
|
||||
|
||||
@ -3258,6 +3353,15 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) {
|
||||
*/
|
||||
if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
|
||||
ctxt->sax->endDocument(ctxt->userData);
|
||||
|
||||
if (ctxt->myDoc != NULL) {
|
||||
dtd = xmlGetIntSubset(ctxt->myDoc);
|
||||
if (dtd == NULL)
|
||||
ctxt->myDoc->intSubset =
|
||||
xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
|
||||
BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
|
||||
BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
|
||||
}
|
||||
if (! ctxt->wellFormed) return(-1);
|
||||
return(0);
|
||||
}
|
||||
@ -3848,6 +3952,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
||||
xmlChar chr[2] = { 0 , 0 } ;
|
||||
|
||||
chr[0] = (xmlChar) ctxt->token;
|
||||
htmlCheckParagraph(ctxt);
|
||||
if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
|
||||
ctxt->sax->characters(ctxt->userData, chr, 1);
|
||||
ctxt->token = 0;
|
||||
@ -3862,6 +3967,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
||||
ctxt->sax->ignorableWhitespace(
|
||||
ctxt->userData, &cur, 1);
|
||||
} else {
|
||||
htmlCheckParagraph(ctxt);
|
||||
if (ctxt->sax->characters != NULL)
|
||||
ctxt->sax->characters(
|
||||
ctxt->userData, &cur, 1);
|
||||
@ -3878,7 +3984,23 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
||||
cur = in->cur[0];
|
||||
next = in->cur[1];
|
||||
cons = ctxt->nbChars;
|
||||
if ((cur == '<') && (next == '!') &&
|
||||
/*
|
||||
* Sometimes DOCTYPE arrives in the middle of the document
|
||||
*/
|
||||
if ((cur == '<') && (next == '!') &&
|
||||
(UPP(2) == 'D') && (UPP(3) == 'O') &&
|
||||
(UPP(4) == 'C') && (UPP(5) == 'T') &&
|
||||
(UPP(6) == 'Y') && (UPP(7) == 'P') &&
|
||||
(UPP(8) == 'E')) {
|
||||
if ((!terminate) &&
|
||||
(htmlParseLookupSequence(ctxt, '>', 0, 0) < 0))
|
||||
goto done;
|
||||
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
|
||||
ctxt->sax->error(ctxt->userData,
|
||||
"Misplaced DOCTYPE declaration\n");
|
||||
ctxt->wellFormed = 0;
|
||||
htmlParseDocTypeDecl(ctxt);
|
||||
} else if ((cur == '<') && (next == '!') &&
|
||||
(in->cur[2] == '-') && (in->cur[3] == '-')) {
|
||||
if ((!terminate) &&
|
||||
(htmlParseLookupSequence(ctxt, '-', '-', '>') < 0))
|
||||
@ -4040,6 +4162,17 @@ done:
|
||||
ctxt->sax->endDocument(ctxt->userData);
|
||||
}
|
||||
}
|
||||
if ((ctxt->myDoc != NULL) &&
|
||||
((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
|
||||
(ctxt->instate == XML_PARSER_EPILOG))) {
|
||||
xmlDtdPtr dtd;
|
||||
dtd = xmlGetIntSubset(ctxt->myDoc);
|
||||
if (dtd == NULL)
|
||||
ctxt->myDoc->intSubset =
|
||||
xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "HTML",
|
||||
BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
|
||||
BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
|
||||
}
|
||||
#ifdef DEBUG_PUSH
|
||||
fprintf(stderr, "HPP: done %d\n", ret);
|
||||
#endif
|
||||
|
Reference in New Issue
Block a user