1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-07-29 11:41:22 +03:00

html: Start to fix EOF and U+0000 handling

This commit is contained in:
Nick Wellnhofer
2024-09-08 22:20:20 +02:00
parent e062a4a9b3
commit 4eeac30944
5 changed files with 93 additions and 274 deletions

View File

@ -388,10 +388,8 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
if (c < 0x80) { if (c < 0x80) {
if (c == 0) { if (c == 0) {
if (ctxt->input->cur < ctxt->input->end) { if (ctxt->input->cur < ctxt->input->end) {
htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
"Char 0x%X out of allowed range\n", 0);
*len = 1; *len = 1;
return(' '); return(0xFFFD);
} else { } else {
*len = 0; *len = 0;
return(0); return(0);
@ -3166,19 +3164,16 @@ static int
htmlParseCharData(htmlParserCtxtPtr ctxt, int terminate) { htmlParseCharData(htmlParserCtxtPtr ctxt, int terminate) {
xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6]; xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
int nbchar = 0; int nbchar = 0;
int stop = 0;
int complete = 0; int complete = 0;
int res = 0; int res = 0;
int cur, l, mode; int cur, l, mode;
mode = ctxt->endCheckState; mode = ctxt->endCheckState;
if ((mode == 0) || (mode == DATA_RCDATA))
stop = '&';
cur = CUR_CHAR(l); while ((!PARSER_STOPPED(ctxt)) &&
while ((cur != stop) && (ctxt->input->cur < ctxt->input->end)) {
(cur != 0) && cur = CUR_CHAR(l);
(!PARSER_STOPPED(ctxt))) {
/* /*
* Check for end of text data * Check for end of text data
*/ */
@ -3262,6 +3257,9 @@ htmlParseCharData(htmlParserCtxtPtr ctxt, int terminate) {
res = 1; res = 1;
break; break;
} }
} else if ((cur == '&') &&
((mode == 0) || (mode == DATA_RCDATA))) {
break;
} }
COPY_BUF(buf,nbchar,cur); COPY_BUF(buf,nbchar,cur);
@ -3273,7 +3271,6 @@ htmlParseCharData(htmlParserCtxtPtr ctxt, int terminate) {
nbchar = 0; nbchar = 0;
SHRINK; SHRINK;
} }
cur = CUR_CHAR(l);
} }
if (nbchar != 0) { if (nbchar != 0) {
buf[nbchar] = 0; buf[nbchar] = 0;
@ -4130,7 +4127,8 @@ htmlParseReference(htmlParserCtxtPtr ctxt) {
static void static void
htmlParseContent(htmlParserCtxtPtr ctxt) { htmlParseContent(htmlParserCtxtPtr ctxt) {
while (PARSER_STOPPED(ctxt) == 0) { while ((PARSER_STOPPED(ctxt) == 0) &&
(ctxt->input->cur < ctxt->input->end)) {
int mode; int mode;
GROW; GROW;
@ -4147,9 +4145,6 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {
(UPP(4) == 'C') && (UPP(5) == 'T') && (UPP(4) == 'C') && (UPP(5) == 'T') &&
(UPP(6) == 'Y') && (UPP(7) == 'P') && (UPP(6) == 'Y') && (UPP(7) == 'P') &&
(UPP(8) == 'E')) { (UPP(8) == 'E')) {
htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
"Misplaced DOCTYPE declaration\n",
BAD_CAST "DOCTYPE" , NULL);
htmlParseDocTypeDecl(ctxt); htmlParseDocTypeDecl(ctxt);
} else if ((NXT(2) == '-') && (NXT(3) == '-')) { } else if ((NXT(2) == '-') && (NXT(3) == '-')) {
SKIP(4); SKIP(4);
@ -4172,9 +4167,6 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {
} }
} else if ((CUR == '&') && ((mode == 0) || (mode == DATA_RCDATA))) { } else if ((CUR == '&') && ((mode == 0) || (mode == DATA_RCDATA))) {
htmlParseReference(ctxt); htmlParseReference(ctxt);
} else if (CUR == 0) {
htmlAutoCloseOnEnd(ctxt);
break;
} else { } else {
htmlParseCharData(ctxt, /* terminate */ 1); htmlParseCharData(ctxt, /* terminate */ 1);
} }
@ -4182,6 +4174,9 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {
SHRINK; SHRINK;
GROW; GROW;
} }
if (ctxt->input->cur >= ctxt->input->end)
htmlAutoCloseOnEnd(ctxt);
} }
/** /**
@ -4405,7 +4400,7 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) {
* Wipe out everything which is before the first '<' * Wipe out everything which is before the first '<'
*/ */
SKIP_BLANKS; SKIP_BLANKS;
if (CUR == 0) { if (ctxt->input->cur >= ctxt->input->end) {
htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY, htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
"Document is empty\n", NULL, NULL); "Document is empty\n", NULL, NULL);
} }
@ -4966,7 +4961,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
int ret = 0; int ret = 0;
htmlParserInputPtr in; htmlParserInputPtr in;
ptrdiff_t avail = 0; ptrdiff_t avail = 0;
xmlChar cur, next; int cur;
htmlParserNodeInfo node_info; htmlParserNodeInfo node_info;
@ -4988,17 +4983,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
} }
if (avail < 1) if (avail < 1)
goto done; goto done;
/*
* This is done to make progress and avoid an infinite loop
* if a parsing attempt was aborted by hitting a NUL byte. After
* changing htmlCurrentChar, this probably isn't necessary anymore.
* We should consider removing this check.
*/
cur = in->cur[0]; cur = in->cur[0];
if (cur == 0) {
SKIP(1);
continue;
}
switch (ctxt->instate) { switch (ctxt->instate) {
case XML_PARSER_EOF: case XML_PARSER_EOF:
@ -5017,14 +5002,6 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_UTF8); xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_UTF8);
} }
/*
* Very first chars read from the document flow.
*/
cur = in->cur[0];
if (IS_BLANK_CH(cur)) {
SKIP_BLANKS;
avail = in->end - in->cur;
}
if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) { if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) {
ctxt->sax->setDocumentLocator(ctxt->userData, ctxt->sax->setDocumentLocator(ctxt->userData,
(xmlSAXLocator *) &xmlDefaultSAXLocator); (xmlSAXLocator *) &xmlDefaultSAXLocator);
@ -5033,161 +5010,22 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
(!ctxt->disableSAX)) (!ctxt->disableSAX))
ctxt->sax->startDocument(ctxt->userData); ctxt->sax->startDocument(ctxt->userData);
cur = in->cur[0]; /* Allow callback to modify state */
next = in->cur[1]; if (ctxt->instate == XML_PARSER_START)
if ((cur == '<') && (next == '!') && ctxt->instate = XML_PARSER_MISC;
(UPP(2) == 'D') && (UPP(3) == 'O') &&
(UPP(4) == 'C') && (UPP(5) == 'T') &&
(UPP(6) == 'Y') && (UPP(7) == 'P') &&
(UPP(8) == 'E')) {
if ((!terminate) &&
(htmlParseLookupString(ctxt, 9, ">", 1, 0) < 0))
goto done;
htmlParseDocTypeDecl(ctxt);
ctxt->instate = XML_PARSER_PROLOG;
} else {
ctxt->instate = XML_PARSER_MISC;
}
break;
case XML_PARSER_MISC:
SKIP_BLANKS;
avail = in->end - in->cur;
/*
* no chars in buffer
*/
if (avail < 1)
goto done;
/*
* not enough chars in buffer
*/
if (avail < 2) {
if (!terminate)
goto done;
else
next = ' ';
} else {
next = in->cur[1];
}
cur = in->cur[0];
if ((cur == '<') && (next == '!') &&
(in->cur[2] == '-') && (in->cur[3] == '-')) {
if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
goto done;
SKIP(4);
htmlParseComment(ctxt, /* bogus */ 0);
ctxt->instate = XML_PARSER_MISC;
} else if ((cur == '<') && (next == '?')) {
if ((!terminate) &&
(htmlParseLookupString(ctxt, 2, ">", 1, 0) < 0))
goto done;
SKIP(1);
htmlParseComment(ctxt, /* bogus */ 1);
ctxt->instate = XML_PARSER_MISC;
} else if ((cur == '<') && (next == '!') &&
(UPP(2) == 'D') && (UPP(3) == 'O') &&
(UPP(4) == 'C') && (UPP(5) == 'T') &&
(UPP(6) == 'Y') && (UPP(7) == 'P') &&
(UPP(8) == 'E')) {
if ((!terminate) &&
(htmlParseLookupString(ctxt, 9, ">", 1, 0) < 0))
goto done;
htmlParseDocTypeDecl(ctxt);
ctxt->instate = XML_PARSER_PROLOG;
} else if ((cur == '<') && (next == '!') &&
(avail < 9)) {
goto done;
} else {
ctxt->instate = XML_PARSER_CONTENT;
}
break;
case XML_PARSER_PROLOG:
SKIP_BLANKS;
avail = in->end - in->cur;
if (avail < 2)
goto done;
cur = in->cur[0];
next = in->cur[1];
if ((cur == '<') && (next == '!') &&
(in->cur[2] == '-') && (in->cur[3] == '-')) {
if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
goto done;
SKIP(4);
htmlParseComment(ctxt, /* bogus */ 0);
ctxt->instate = XML_PARSER_PROLOG;
} else if ((cur == '<') && (next == '?')) {
if ((!terminate) &&
(htmlParseLookupString(ctxt, 2, ">", 1, 0) < 0))
goto done;
SKIP(1);
htmlParseComment(ctxt, /* bogus */ 1);
ctxt->instate = XML_PARSER_PROLOG;
} else if ((cur == '<') && (next == '!') &&
(avail < 4)) {
goto done;
} else {
ctxt->instate = XML_PARSER_CONTENT;
}
break;
case XML_PARSER_EPILOG:
avail = in->end - in->cur;
if (avail < 1)
goto done;
cur = in->cur[0];
if (IS_BLANK_CH(cur)) {
htmlParseCharData(ctxt, terminate);
goto done;
}
if (avail < 2)
goto done;
next = in->cur[1];
if ((cur == '<') && (next == '!') &&
(in->cur[2] == '-') && (in->cur[3] == '-')) {
if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
goto done;
SKIP(4);
htmlParseComment(ctxt, /* bogus */ 0);
ctxt->instate = XML_PARSER_EPILOG;
} else if ((cur == '<') && (next == '?')) {
if ((!terminate) &&
(htmlParseLookupString(ctxt, 2, ">", 1, 0) < 0))
goto done;
SKIP(1);
htmlParseComment(ctxt, /* bogus */ 1);
ctxt->instate = XML_PARSER_EPILOG;
} else if ((cur == '<') && (next == '!') &&
(avail < 4)) {
goto done;
} else {
ctxt->errNo = XML_ERR_DOCUMENT_END;
ctxt->wellFormed = 0;
ctxt->instate = XML_PARSER_EOF;
if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
ctxt->sax->endDocument(ctxt->userData);
goto done;
}
break; break;
case XML_PARSER_START_TAG: { case XML_PARSER_START_TAG: {
const xmlChar *name; const xmlChar *name;
int failed; int failed, next;
const htmlElemDesc * info; const htmlElemDesc * info;
/*
* no chars in buffer
*/
if (avail < 1)
goto done;
/* /*
* not enough chars in buffer * not enough chars in buffer
*/ */
if (avail < 2) { if (avail < 2)
if (!terminate) goto done;
goto done;
else
next = ' ';
} else {
next = in->cur[1];
}
cur = in->cur[0]; cur = in->cur[0];
next = in->cur[1];
if (cur != '<') { if (cur != '<') {
ctxt->instate = XML_PARSER_CONTENT; ctxt->instate = XML_PARSER_CONTENT;
break; break;
@ -5287,44 +5125,21 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
ctxt->instate = XML_PARSER_CONTENT; ctxt->instate = XML_PARSER_CONTENT;
break; break;
} }
case XML_PARSER_CONTENT: { case XML_PARSER_MISC:
xmlChar chr[2] = { 0, 0 }; case XML_PARSER_PROLOG:
case XML_PARSER_CONTENT:
case XML_PARSER_EPILOG: {
int mode; int mode;
/* if ((ctxt->instate == XML_PARSER_MISC) ||
* Handle preparsed entities and charRef (ctxt->instate == XML_PARSER_PROLOG)) {
*/ SKIP_BLANKS;
if ((avail == 1) && (terminate)) { avail = in->end - in->cur;
cur = in->cur[0]; }
if ((cur != '<') && (cur != '&')) {
if (ctxt->sax != NULL) { if (avail < 1)
chr[0] = cur;
if (IS_BLANK_CH(cur)) {
if (ctxt->keepBlanks) {
if (ctxt->sax->characters != NULL)
ctxt->sax->characters(
ctxt->userData, chr, 1);
} else {
if (ctxt->sax->ignorableWhitespace != NULL)
ctxt->sax->ignorableWhitespace(
ctxt->userData, chr, 1);
}
} else {
htmlCheckParagraph(ctxt);
if (ctxt->sax->characters != NULL)
ctxt->sax->characters(
ctxt->userData, chr, 1);
}
}
ctxt->checkIndex = 0;
in->cur++;
break;
}
}
if (avail < 2)
goto done; goto done;
cur = in->cur[0]; cur = in->cur[0];
next = in->cur[1];
mode = ctxt->endCheckState; mode = ctxt->endCheckState;
if (mode != 0) { if (mode != 0) {
@ -5353,62 +5168,74 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
} }
break; break;
} else if ((cur == '<') && (next == '!')) { } else if (cur == '<') {
if (avail < 4) int next;
goto done;
/* if (avail < 2) {
* Sometimes DOCTYPE arrives in the middle of the document if (!terminate)
*/
if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
(UPP(4) == 'C') && (UPP(5) == 'T') &&
(UPP(6) == 'Y') && (UPP(7) == 'P') &&
(UPP(8) == 'E')) {
if ((!terminate) &&
(htmlParseLookupString(ctxt, 9, ">", 1, 0) < 0))
goto done; goto done;
htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, next = ' ';
"Misplaced DOCTYPE declaration\n",
BAD_CAST "DOCTYPE" , NULL);
htmlParseDocTypeDecl(ctxt);
} else if ((in->cur[2] == '-') && (in->cur[3] == '-')) {
if ((!terminate) &&
(htmlParseLookupCommentEnd(ctxt) < 0))
goto done;
SKIP(4);
htmlParseComment(ctxt, /* bogus */ 0);
ctxt->instate = XML_PARSER_CONTENT;
} else { } else {
next = in->cur[1];
}
if (next == '!') {
if ((!terminate) && (avail < 4))
goto done;
if ((in->cur[2] == '-') && (in->cur[3] == '-')) {
if ((!terminate) &&
(htmlParseLookupCommentEnd(ctxt) < 0))
goto done;
SKIP(4);
htmlParseComment(ctxt, /* bogus */ 0);
break;
}
if ((!terminate) && (avail < 9))
goto done;
if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
(UPP(4) == 'C') && (UPP(5) == 'T') &&
(UPP(6) == 'Y') && (UPP(7) == 'P') &&
(UPP(8) == 'E')) {
if ((!terminate) &&
(htmlParseLookupString(ctxt, 9, ">", 1,
0) < 0))
goto done;
htmlParseDocTypeDecl(ctxt);
if (ctxt->instate == XML_PARSER_MISC)
ctxt->instate = XML_PARSER_PROLOG;
} else {
if ((!terminate) &&
(htmlParseLookupString(ctxt, 2, ">", 1, 0) < 0))
goto done;
SKIP(2);
htmlParseComment(ctxt, /* bogus */ 1);
}
} else if (next == '?') {
if ((!terminate) && if ((!terminate) &&
(htmlParseLookupString(ctxt, 2, ">", 1, 0) < 0)) (htmlParseLookupString(ctxt, 2, ">", 1, 0) < 0))
goto done; goto done;
SKIP(2); SKIP(1);
htmlParseComment(ctxt, /* bogus */ 1); htmlParseComment(ctxt, /* bogus */ 1);
} else if (next == '/') {
ctxt->instate = XML_PARSER_END_TAG;
ctxt->checkIndex = 0;
break;
} else if (IS_ASCII_LETTER(next)) {
if ((!terminate) && (next == 0))
goto done;
ctxt->instate = XML_PARSER_START_TAG;
ctxt->checkIndex = 0;
break;
} else {
ctxt->instate = XML_PARSER_CONTENT; ctxt->instate = XML_PARSER_CONTENT;
htmlCheckParagraph(ctxt);
if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
(ctxt->sax->characters != NULL))
ctxt->sax->characters(ctxt->userData,
BAD_CAST "<", 1);
SKIP(1);
} }
} else if ((cur == '<') && (next == '?')) {
if ((!terminate) &&
(htmlParseLookupString(ctxt, 2, ">", 1, 0) < 0))
goto done;
SKIP(1);
htmlParseComment(ctxt, /* bogus */ 1);
ctxt->instate = XML_PARSER_CONTENT;
} else if ((cur == '<') && (next == '/')) {
ctxt->instate = XML_PARSER_END_TAG;
ctxt->checkIndex = 0;
break;
} else if ((cur == '<') && IS_ASCII_LETTER(next)) {
if ((!terminate) && (next == 0))
goto done;
ctxt->instate = XML_PARSER_START_TAG;
ctxt->checkIndex = 0;
break;
} else if (cur == '<') {
htmlCheckParagraph(ctxt);
if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
(ctxt->sax->characters != NULL))
ctxt->sax->characters(ctxt->userData,
BAD_CAST "<", 1);
SKIP(1);
} else { } else {
/* /*
* check that the text sequence is complete * check that the text sequence is complete

View File

@ -1,3 +0,0 @@
./test/HTML/doc2.htm:10: HTML parser error : Misplaced DOCTYPE declaration
<!-- END Naviscope Javascript --><!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Tra
^

View File

@ -17,7 +17,6 @@ SAX.endElement(script)
SAX.characters( SAX.characters(
, 1) , 1)
SAX.comment( END Naviscope Javascript ) SAX.comment( END Naviscope Javascript )
SAX.error: Misplaced DOCTYPE declaration
SAX.internalSubset(HTML, -//W3C//DTD HTML 4.0 Transitional//EN, ) SAX.internalSubset(HTML, -//W3C//DTD HTML 4.0 Transitional//EN, )
SAX.comment( saved from url=(0027)http://www.agents-tech.com/ ) SAX.comment( saved from url=(0027)http://www.agents-tech.com/ )
SAX.characters( SAX.characters(

View File

@ -1,6 +1,3 @@
./test/HTML/doc3.htm:3: HTML parser error : Misplaced DOCTYPE declaration
<!-- END Naviscope Javascript --><!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2//EN
^
./test/HTML/doc3.htm:81: HTML parser error : Unexpected end tag : p ./test/HTML/doc3.htm:81: HTML parser error : Unexpected end tag : p
</P></TD></TR></TBODY></TABLE></CENTER></TD></TR></TBODY></TABLE></CENTER></P> </P></TD></TR></TBODY></TABLE></CENTER></TD></TR></TBODY></TABLE></CENTER></P>
^ ^

View File

@ -17,7 +17,6 @@ SAX.endElement(script)
SAX.characters( SAX.characters(
, 1) , 1)
SAX.comment( END Naviscope Javascript ) SAX.comment( END Naviscope Javascript )
SAX.error: Misplaced DOCTYPE declaration
SAX.internalSubset(HTML, -//W3C//DTD HTML 3.2//EN, ) SAX.internalSubset(HTML, -//W3C//DTD HTML 3.2//EN, )
SAX.comment(last modified on Tuesday, February 22, 2000 11:47 PM ) SAX.comment(last modified on Tuesday, February 22, 2000 11:47 PM )
SAX.characters( SAX.characters(