From 14f752c2b725131a24c1296109a0406fc870279c Mon Sep 17 00:00:00 2001
From: Daniel Veillard
Date: Sat, 9 Aug 2003 11:44:50 +0000
Subject: [PATCH] fixed a nasty bug #119387, bad heuristic from the progressive HTML parser

* HTMLparser.c: fixed a nasty bug #119387, bad heuristic from
  the progressive HTML parser front-end on large character data
  island leading to an erroneous end of data detection by the
  parser. Some cleanup too to get closer from the XML progressive
  parser.
Daniel
---
 ChangeLog | 8 ++++++++
 HTMLparser.c | 39 +++++++++++++++++++++++++--------------
 2 files changed, 33 insertions(+), 14 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 61822b86..6dc7954c 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,11 @@
+Sat Aug 9 13:41:21 CEST 2003 Daniel Veillard
+
+        * HTMLparser.c: fixed a nasty bug #119387, bad heuristic from
+          the progressive HTML parser front-end on large character data
+          island leading to an erroneous end of data detection by the
+          parser. Some cleanup too to get closer from the XML progressive
+          parser.
+
 Sat Aug 9 00:42:47 HKT 2003 William Brack
 
         * win32/configure.js: Added in support for the ISO8859X
diff --git a/HTMLparser.c b/HTMLparser.c
index 0aa00411..2168bbdf 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -4950,19 +4950,15 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
                         /* TODO: check generation of subtrees if noent !!! */
                         htmlParseReference(ctxt);
                     } else {
-                        /* TODO Avoid the extra copy, handle directly !!!!!! */
-                        /*
-                         * Goal of the following test is:
-                         * - minimize calls to the SAX 'character' callback
-                         * when they are mergeable
+                        /*
+                         * check that the text sequence is complete
+                         * before handing out the data to the parser
+                         * to avoid problems with erroneous end of
+                         * data detection.
                          */
-                        if ((ctxt->inputNr == 1) &&
-                            (avail < HTML_PARSER_BIG_BUFFER_SIZE)) {
-                            if ((!terminate) &&
-                                (htmlParseLookupSequence(
-                                        ctxt, '<', 0, 0, 0) < 0))
-                                goto done;
-                        }
+                        if ((!terminate) &&
+                            (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
+                            goto done;
                         ctxt->checkIndex = 0;
 #ifdef DEBUG_PUSH
                         xmlGenericError(xmlGenericErrorContext,
@@ -5160,12 +5156,27 @@ htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
         xmlGenericError(xmlGenericErrorContext,
                 "HPP: pushed %d\n", size);
 #endif
+#if 0
         if ((terminate) || (ctxt->input->buf->buffer->use > 80))
             htmlParseTryOrFinish(ctxt, terminate);
+#endif
     } else if (ctxt->instate != XML_PARSER_EOF) {
-        xmlParserInputBufferPush(ctxt->input->buf, 0, "");
-        htmlParseTryOrFinish(ctxt, terminate);
+        if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
+            xmlParserInputBufferPtr in = ctxt->input->buf;
+            if ((in->encoder != NULL) && (in->buffer != NULL) &&
+                    (in->raw != NULL)) {
+                int nbchars;
+
+                nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
+                if (nbchars < 0) {
+                    xmlGenericError(xmlGenericErrorContext,
+                                    "htmlParseChunk: encoder error\n");
+                    return(XML_ERR_INVALID_ENCODING);
+                }
+            }
+        }
     }
+    htmlParseTryOrFinish(ctxt, terminate);
     if (terminate) {
         if ((ctxt->instate != XML_PARSER_EOF) &&
             (ctxt->instate != XML_PARSER_EPILOG) &&