diff --git a/HTMLparser.c b/HTMLparser.c index 93b6661b..d33913b3 100644 --- a/HTMLparser.c +++ b/HTMLparser.c @@ -5334,30 +5334,17 @@ htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first, int base, len; htmlParserInputPtr in; const xmlChar *buf; - int invalue = 0; - char valdellim = 0x0; + int quote; in = ctxt->input; if (in == NULL) return (-1); - base = in->cur - in->base; - if (base < 0) - return (-1); + base = ctxt->checkIndex; + quote = ctxt->endCheckState; - if (ctxt->checkIndex > base) { - base = ctxt->checkIndex; - /* Abuse hasPErefs member to restore current state. */ - invalue = ctxt->hasPErefs & 1 ? 1 : 0; - } - - if (in->buf == NULL) { - buf = in->base; - len = in->length; - } else { - buf = xmlBufContent(in->buf->buffer); - len = xmlBufUse(in->buf->buffer); - } + buf = in->cur; + len = in->end - in->cur; /* take into account the sequence length */ if (third) @@ -5366,18 +5353,13 @@ htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first, len--; for (; base < len; base++) { if (ignoreattrval) { + if (quote) { + if (buf[base] == quote) + quote = 0; + continue; + } if (buf[base] == '"' || buf[base] == '\'') { - if (invalue) { - if (buf[base] == valdellim) { - invalue = 0; - continue; - } - } else { - valdellim = buf[base]; - invalue = 1; - continue; - } - } else if (invalue) { + quote = buf[base]; continue; } } @@ -5390,29 +5372,12 @@ htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first, continue; } ctxt->checkIndex = 0; -#ifdef DEBUG_PUSH - if (next == 0) - xmlGenericError(xmlGenericErrorContext, - "HPP: lookup '%c' found at %d\n", - first, base); - else if (third == 0) - xmlGenericError(xmlGenericErrorContext, - "HPP: lookup '%c%c' found at %d\n", - first, next, base); - else - xmlGenericError(xmlGenericErrorContext, - "HPP: lookup '%c%c%c' found at %d\n", - first, next, third, base); -#endif - return (base - (in->cur - in->base)); + ctxt->endCheckState = 0; + return (base); } } ctxt->checkIndex = base; - /* Abuse hasPErefs 
member to track current state. */ - if (invalue) - ctxt->hasPErefs |= 1; - else - ctxt->hasPErefs &= ~1; + ctxt->endCheckState = quote; #ifdef DEBUG_PUSH if (next == 0) xmlGenericError(xmlGenericErrorContext, @@ -5446,7 +5411,6 @@ static int htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt) { int mark = 0; - int cur = CUR_PTR - BASE_PTR; while (mark >= 0) { mark = htmlParseLookupSequence(ctxt, '-', '-', 0, 0); @@ -5455,7 +5419,7 @@ htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt) ((NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) { return mark; } - ctxt->checkIndex = cur + mark + 1; + ctxt->checkIndex = mark + 1; } return mark; } @@ -6806,6 +6770,7 @@ htmlCtxtReset(htmlParserCtxtPtr ctxt) ctxt->vctxt.warning = xmlParserValidityWarning; ctxt->record_info = 0; ctxt->checkIndex = 0; + ctxt->endCheckState = 0; ctxt->inSubset = 0; ctxt->errNo = XML_ERR_OK; ctxt->depth = 0; diff --git a/SAX2.c b/SAX2.c index 44c6b8c3..f6321c68 100644 --- a/SAX2.c +++ b/SAX2.c @@ -387,6 +387,7 @@ xmlSAX2ExternalSubset(void *ctx, const xmlChar *name, xmlCharEncoding enc; int oldcharset; const xmlChar *oldencoding; + int oldprogressive; /* * Ask the Entity resolver to load the damn thing @@ -409,7 +410,9 @@ xmlSAX2ExternalSubset(void *ctx, const xmlChar *name, oldinputTab = ctxt->inputTab; oldcharset = ctxt->charset; oldencoding = ctxt->encoding; + oldprogressive = ctxt->progressive; ctxt->encoding = NULL; + ctxt->progressive = 0; ctxt->inputTab = (xmlParserInputPtr *) xmlMalloc(5 * sizeof(xmlParserInputPtr)); @@ -422,6 +425,7 @@ xmlSAX2ExternalSubset(void *ctx, const xmlChar *name, ctxt->inputTab = oldinputTab; ctxt->charset = oldcharset; ctxt->encoding = oldencoding; + ctxt->progressive = oldprogressive; return; } ctxt->inputNr = 0; @@ -472,6 +476,7 @@ xmlSAX2ExternalSubset(void *ctx, const xmlChar *name, (!xmlDictOwns(ctxt->dict, ctxt->encoding)))) xmlFree((xmlChar *) ctxt->encoding); ctxt->encoding = oldencoding; + ctxt->progressive = oldprogressive; /* ctxt->wellFormed = oldwellFormed; 
*/ } } diff --git a/include/libxml/parser.h b/include/libxml/parser.h index ca5c93c9..3c86d228 100644 --- a/include/libxml/parser.h +++ b/include/libxml/parser.h @@ -311,6 +311,8 @@ struct _xmlParserCtxt { int input_id; /* we need to label inputs */ unsigned long sizeentcopy; /* volume of entity copy */ + + int endCheckState; /* quote state for push parser */ }; /** diff --git a/parser.c b/parser.c index 3aeb8ad9..903ccf08 100644 --- a/parser.c +++ b/parser.c @@ -11074,142 +11074,231 @@ xmlParseExtParsedEnt(xmlParserCtxtPtr ctxt) { ************************************************************************/ /** - * xmlParseLookupSequence: + * xmlParseLookupChar: * @ctxt: an XML parser context - * @first: the first char to lookup - * @next: the next char to lookup or zero - * @third: the next char to lookup or zero + * @c: character * - * Try to find if a sequence (first, next, third) or just (first next) or - * (first) is available in the input stream. - * This function has a side effect of (possibly) incrementing ctxt->checkIndex - * to avoid rescanning sequences of bytes, it DOES change the state of the - * parser, do not use liberally. - * - * Returns the index to the current parsing point if the full sequence - * is available, -1 otherwise. + * Check whether the input buffer contains a character. 
*/ static int -xmlParseLookupSequence(xmlParserCtxtPtr ctxt, xmlChar first, - xmlChar next, xmlChar third) { - int base, len; - xmlParserInputPtr in; - const xmlChar *buf; +xmlParseLookupChar(xmlParserCtxtPtr ctxt, int c) { + const xmlChar *cur; - in = ctxt->input; - if (in == NULL) return(-1); - base = in->cur - in->base; - if (base < 0) return(-1); - if (ctxt->checkIndex > base) - base = ctxt->checkIndex; - if (in->buf == NULL) { - buf = in->base; - len = in->length; + if (ctxt->checkIndex == 0) { + cur = ctxt->input->cur + 1; } else { - buf = xmlBufContent(in->buf->buffer); - len = xmlBufUse(in->buf->buffer); + cur = ctxt->input->cur + ctxt->checkIndex; } - /* take into account the sequence length */ - if (third) len -= 2; - else if (next) len --; - for (;base < len;base++) { - if (buf[base] == first) { - if (third != 0) { - if ((buf[base + 1] != next) || - (buf[base + 2] != third)) continue; - } else if (next != 0) { - if (buf[base + 1] != next) continue; - } - ctxt->checkIndex = 0; -#ifdef DEBUG_PUSH - if (next == 0) - xmlGenericError(xmlGenericErrorContext, - "PP: lookup '%c' found at %d\n", - first, base); - else if (third == 0) - xmlGenericError(xmlGenericErrorContext, - "PP: lookup '%c%c' found at %d\n", - first, next, base); - else - xmlGenericError(xmlGenericErrorContext, - "PP: lookup '%c%c%c' found at %d\n", - first, next, third, base); -#endif - return(base - (in->cur - in->base)); - } + + if (memchr(cur, c, ctxt->input->end - cur) == NULL) { + ctxt->checkIndex = ctxt->input->end - ctxt->input->cur; + return(0); + } else { + ctxt->checkIndex = 0; + return(1); } - ctxt->checkIndex = base; -#ifdef DEBUG_PUSH - if (next == 0) - xmlGenericError(xmlGenericErrorContext, - "PP: lookup '%c' failed\n", first); - else if (third == 0) - xmlGenericError(xmlGenericErrorContext, - "PP: lookup '%c%c' failed\n", first, next); - else - xmlGenericError(xmlGenericErrorContext, - "PP: lookup '%c%c%c' failed\n", first, next, third); -#endif - return(-1); } /** - * 
xmlParseGetLasts: + * xmlParseLookupString: * @ctxt: an XML parser context - * @lastlt: pointer to store the last '<' from the input - * @lastgt: pointer to store the last '>' from the input + * @startDelta: delta to apply at the start + * @str: string + * @strLen: length of string * - * Lookup the last < and > in the current chunk + * Check whether the input buffer contains a string. */ -static void -xmlParseGetLasts(xmlParserCtxtPtr ctxt, const xmlChar **lastlt, - const xmlChar **lastgt) { - const xmlChar *tmp; +static const xmlChar * +xmlParseLookupString(xmlParserCtxtPtr ctxt, size_t startDelta, + const char *str, size_t strLen) { + const xmlChar *cur, *term; - if ((ctxt == NULL) || (lastlt == NULL) || (lastgt == NULL)) { - xmlGenericError(xmlGenericErrorContext, - "Internal error: xmlParseGetLasts\n"); - return; - } - if ((ctxt->progressive != 0) && (ctxt->inputNr == 1)) { - tmp = ctxt->input->end; - tmp--; - while ((tmp >= ctxt->input->base) && (*tmp != '<')) tmp--; - if (tmp < ctxt->input->base) { - *lastlt = NULL; - *lastgt = NULL; - } else { - *lastlt = tmp; - tmp++; - while ((tmp < ctxt->input->end) && (*tmp != '>')) { - if (*tmp == '\'') { - tmp++; - while ((tmp < ctxt->input->end) && (*tmp != '\'')) tmp++; - if (tmp < ctxt->input->end) tmp++; - } else if (*tmp == '"') { - tmp++; - while ((tmp < ctxt->input->end) && (*tmp != '"')) tmp++; - if (tmp < ctxt->input->end) tmp++; - } else - tmp++; - } - if (tmp < ctxt->input->end) - *lastgt = tmp; - else { - tmp = *lastlt; - tmp--; - while ((tmp >= ctxt->input->base) && (*tmp != '>')) tmp--; - if (tmp >= ctxt->input->base) - *lastgt = tmp; - else - *lastgt = NULL; - } - } + if (ctxt->checkIndex == 0) { + cur = ctxt->input->cur + startDelta; } else { - *lastlt = NULL; - *lastgt = NULL; + cur = ctxt->input->cur + ctxt->checkIndex; } + + term = BAD_CAST strstr((const char *) cur, str); + if (term == NULL) { + const xmlChar *end = ctxt->input->end; + + /* Rescan (strLen - 1) characters. 
*/ + if ((size_t) (end - cur) < strLen) + end = cur; + else + end -= strLen - 1; + ctxt->checkIndex = end - ctxt->input->cur; + } else { + ctxt->checkIndex = 0; + } + + return(term); } + +/** + * xmlParseLookupCharData: + * @ctxt: an XML parser context + * + * Check whether the input buffer contains terminated char data. + */ +static int +xmlParseLookupCharData(xmlParserCtxtPtr ctxt) { + const xmlChar *cur = ctxt->input->cur + ctxt->checkIndex; + const xmlChar *end = ctxt->input->end; + + while (cur < end) { + if ((*cur == '<') || (*cur == '&')) { + ctxt->checkIndex = 0; + return(1); + } + cur++; + } + + ctxt->checkIndex = cur - ctxt->input->cur; + return(0); +} + +/** + * xmlParseLookupGt: + * @ctxt: an XML parser context + * + * Check whether there's enough data in the input buffer to finish parsing + * a start tag. This has to take quotes into account. + */ +static int +xmlParseLookupGt(xmlParserCtxtPtr ctxt) { + const xmlChar *cur; + const xmlChar *end = ctxt->input->end; + int state = ctxt->endCheckState; + + if (ctxt->checkIndex == 0) + cur = ctxt->input->cur + 1; + else + cur = ctxt->input->cur + ctxt->checkIndex; + + while (cur < end) { + if (state) { + if (*cur == state) + state = 0; + } else if (*cur == '\'' || *cur == '"') { + state = *cur; + } else if (*cur == '>') { + ctxt->checkIndex = 0; + ctxt->endCheckState = 0; + return(1); + } + cur++; + } + + ctxt->checkIndex = cur - ctxt->input->cur; + ctxt->endCheckState = state; + return(0); +} + +/** + * xmlParseLookupInternalSubset: + * @ctxt: an XML parser context + * + * Check whether there's enough data in the input buffer to finish parsing + * the internal subset. + */ +static int +xmlParseLookupInternalSubset(xmlParserCtxtPtr ctxt) { + /* + * Sorry, but progressive parsing of the internal subset is not + * supported. We first check that the full content of the internal + * subset is available and parsing is launched only at that point. + * Internal subset ends with "']' S? 
'>'" in an unescaped section and + * not in a ']]>' sequence which are conditional sections. + */ + const xmlChar *cur, *start; + const xmlChar *end = ctxt->input->end; + int state = ctxt->endCheckState; + + if (ctxt->checkIndex == 0) { + cur = ctxt->input->cur + 1; + } else { + cur = ctxt->input->cur + ctxt->checkIndex; + } + start = cur; + + while (cur < end) { + if (state == '-') { + if ((*cur == '-') && + (cur[1] == '-') && + (cur[2] == '>')) { + state = 0; + cur += 3; + start = cur; + continue; + } + } + else if (state == ']') { + if (*cur == '>') { + ctxt->checkIndex = 0; + ctxt->endCheckState = 0; + return(1); + } + if (IS_BLANK_CH(*cur)) { + state = ' '; + } else if (*cur != ']') { + state = 0; + start = cur; + continue; + } + } + else if (state == ' ') { + if (*cur == '>') { + ctxt->checkIndex = 0; + ctxt->endCheckState = 0; + return(1); + } + if (!IS_BLANK_CH(*cur)) { + state = 0; + start = cur; + continue; + } + } + else if (state != 0) { + if (*cur == state) { + state = 0; + start = cur + 1; + } + } + else if (*cur == '<') { + if ((cur[1] == '!') && + (cur[2] == '-') && + (cur[3] == '-')) { + state = '-'; + cur += 4; + /* Don't treat <!--> as comment */ + start = cur; + continue; + } + } + else if ((*cur == '"') || (*cur == '\'') || (*cur == ']')) { + state = *cur; + } + + cur++; + } + + /* + * Rescan the three last characters to detect "<!--" and "-->" + * split across chunks.
+ */ + if ((state == 0) || (state == '-')) { + if (cur - start < 3) + cur = start; + else + cur -= 3; + } + ctxt->checkIndex = cur - ctxt->input->cur; + ctxt->endCheckState = state; + return(0); +} + /** * xmlCheckCdataPush: * @cur: pointer to the block of characters @@ -11292,7 +11381,6 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) { int ret = 0; int avail, tlen; xmlChar cur, next; - const xmlChar *lastlt, *lastgt; if (ctxt->input == NULL) return(0); @@ -11353,9 +11441,7 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) { if ((ctxt->input != NULL) && (ctxt->input->cur - ctxt->input->base > 4096)) { xmlParserInputShrink(ctxt->input); - ctxt->checkIndex = 0; } - xmlParseGetLasts(ctxt, &lastlt, &lastgt); while (ctxt->instate != XML_PARSER_EOF) { if ((ctxt->errNo != XML_ERR_OK) && (ctxt->disableSAX == 1)) @@ -11442,10 +11528,10 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) { } if ((cur == '<') && (next == '?')) { /* PI or XML decl */ - if (avail < 5) return(ret); + if (avail < 5) goto done; if ((!terminate) && - (xmlParseLookupSequence(ctxt, '?', '>', 0) < 0)) - return(ret); + (!xmlParseLookupString(ctxt, 2, "?>", 2))) + goto done; if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator); @@ -11526,15 +11612,8 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) { ctxt->sax->endDocument(ctxt->userData); goto done; } - if (!terminate) { - if (ctxt->progressive) { - /* > can be found unescaped in attribute values */ - if ((lastgt == NULL) || (ctxt->input->cur >= lastgt)) - goto done; - } else if (xmlParseLookupSequence(ctxt, '>', 0, 0) < 0) { - goto done; - } - } + if ((!terminate) && (!xmlParseLookupGt(ctxt))) + goto done; if (ctxt->spaceNr == 0) spacePush(ctxt, -1); else if (*ctxt->space == -2) @@ -11599,7 +11678,6 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) { } else { ctxt->instate = XML_PARSER_CONTENT; } - ctxt->progressive = 1; break; } 
if (RAW == '>') { @@ -11614,7 +11692,6 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) { nameNsPush(ctxt, name, prefix, URI, line, ctxt->nsNr - nsNr); ctxt->instate = XML_PARSER_CONTENT; - ctxt->progressive = 1; break; } case XML_PARSER_CONTENT: { @@ -11628,33 +11705,21 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) { break; } else if ((cur == '<') && (next == '?')) { if ((!terminate) && - (xmlParseLookupSequence(ctxt, '?', '>', 0) < 0)) { - ctxt->progressive = XML_PARSER_PI; + (!xmlParseLookupString(ctxt, 2, "?>", 2))) goto done; - } xmlParsePI(ctxt); ctxt->instate = XML_PARSER_CONTENT; - ctxt->progressive = 1; } else if ((cur == '<') && (next != '!')) { ctxt->instate = XML_PARSER_START_TAG; break; } else if ((cur == '<') && (next == '!') && (ctxt->input->cur[2] == '-') && (ctxt->input->cur[3] == '-')) { - int term; - - if (avail < 4) - goto done; - ctxt->input->cur += 4; - term = xmlParseLookupSequence(ctxt, '-', '-', '>'); - ctxt->input->cur -= 4; - if ((!terminate) && (term < 0)) { - ctxt->progressive = XML_PARSER_COMMENT; + if ((!terminate) && + (!xmlParseLookupString(ctxt, 4, "-->", 3))) goto done; - } xmlParseComment(ctxt); ctxt->instate = XML_PARSER_CONTENT; - ctxt->progressive = 1; } else if ((cur == '<') && (ctxt->input->cur[1] == '!') && (ctxt->input->cur[2] == '[') && (ctxt->input->cur[3] == 'C') && @@ -11674,8 +11739,7 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) { "detected an error in element content\n"); SKIP(1); } else if (cur == '&') { - if ((!terminate) && - (xmlParseLookupSequence(ctxt, ';', 0, 0) < 0)) + if ((!terminate) && (!xmlParseLookupChar(ctxt, ';'))) goto done; xmlParseReference(ctxt); } else { @@ -11693,18 +11757,10 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) { */ if ((ctxt->inputNr == 1) && (avail < XML_PARSER_BIG_BUFFER_SIZE)) { - if (!terminate) { - if (ctxt->progressive) { - if ((lastlt == NULL) || - (ctxt->input->cur > lastlt)) - goto done; - } else if 
(xmlParseLookupSequence(ctxt, - '<', 0, 0) < 0) { - goto done; - } - } + if ((!terminate) && (!xmlParseLookupCharData(ctxt))) + goto done; } - ctxt->checkIndex = 0; + ctxt->checkIndex = 0; xmlParseCharData(ctxt, 0); } break; @@ -11712,15 +11768,8 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) { case XML_PARSER_END_TAG: if (avail < 2) goto done; - if (!terminate) { - if (ctxt->progressive) { - /* > can be found unescaped in attribute values */ - if ((lastgt == NULL) || (ctxt->input->cur >= lastgt)) - goto done; - } else if (xmlParseLookupSequence(ctxt, '>', 0, 0) < 0) { - goto done; - } - } + if ((!terminate) && (!xmlParseLookupChar(ctxt, '>'))) + goto done; if (ctxt->sax2) { xmlParseEndTag2(ctxt, &ctxt->pushTab[ctxt->nameNr - 1]); nameNsPop(ctxt); @@ -11742,35 +11791,35 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) { * The Push mode need to have the SAX callback for * cdataBlock merge back contiguous callbacks. */ - int base; + const xmlChar *term; - base = xmlParseLookupSequence(ctxt, ']', ']', '>'); - if (base < 0) { - if (avail >= XML_PARSER_BIG_BUFFER_SIZE + 2) { - int tmp; + term = xmlParseLookupString(ctxt, 0, "]]>", 3); + if (term == NULL) { + int tmp; - tmp = xmlCheckCdataPush(ctxt->input->cur, - XML_PARSER_BIG_BUFFER_SIZE, 0); - if (tmp < 0) { - tmp = -tmp; - ctxt->input->cur += tmp; - goto encoding_error; - } - if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) { - if (ctxt->sax->cdataBlock != NULL) - ctxt->sax->cdataBlock(ctxt->userData, - ctxt->input->cur, tmp); - else if (ctxt->sax->characters != NULL) - ctxt->sax->characters(ctxt->userData, - ctxt->input->cur, tmp); - } - if (ctxt->instate == XML_PARSER_EOF) - goto done; - SKIPL(tmp); - ctxt->checkIndex = 0; - } - goto done; + if (avail < XML_PARSER_BIG_BUFFER_SIZE + 2) + goto done; + ctxt->checkIndex = 0; + tmp = xmlCheckCdataPush(ctxt->input->cur, + XML_PARSER_BIG_BUFFER_SIZE, 0); + if (tmp < 0) { + tmp = -tmp; + ctxt->input->cur += tmp; + goto encoding_error; + } + if 
((ctxt->sax != NULL) && (!ctxt->disableSAX)) { + if (ctxt->sax->cdataBlock != NULL) + ctxt->sax->cdataBlock(ctxt->userData, + ctxt->input->cur, tmp); + else if (ctxt->sax->characters != NULL) + ctxt->sax->characters(ctxt->userData, + ctxt->input->cur, tmp); + } + if (ctxt->instate == XML_PARSER_EOF) + goto done; + SKIPL(tmp); } else { + int base = term - CUR_PTR; int tmp; tmp = xmlCheckCdataPush(ctxt->input->cur, base, 1); @@ -11804,7 +11853,6 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) { if (ctxt->instate == XML_PARSER_EOF) goto done; SKIPL(base + 3); - ctxt->checkIndex = 0; ctxt->instate = XML_PARSER_CONTENT; #ifdef DEBUG_PUSH xmlGenericError(xmlGenericErrorContext, @@ -11827,10 +11875,8 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) { next = ctxt->input->cur[1]; if ((cur == '<') && (next == '?')) { if ((!terminate) && - (xmlParseLookupSequence(ctxt, '?', '>', 0) < 0)) { - ctxt->progressive = XML_PARSER_PI; + (!xmlParseLookupString(ctxt, 2, "?>", 2))) goto done; - } #ifdef DEBUG_PUSH xmlGenericError(xmlGenericErrorContext, "PP: Parsing PI\n"); @@ -11839,16 +11885,12 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) { if (ctxt->instate == XML_PARSER_EOF) goto done; ctxt->instate = XML_PARSER_MISC; - ctxt->progressive = 1; - ctxt->checkIndex = 0; } else if ((cur == '<') && (next == '!') && (ctxt->input->cur[2] == '-') && (ctxt->input->cur[3] == '-')) { if ((!terminate) && - (xmlParseLookupSequence(ctxt, '-', '-', '>') < 0)) { - ctxt->progressive = XML_PARSER_COMMENT; + (!xmlParseLookupString(ctxt, 4, "-->", 3))) goto done; - } #ifdef DEBUG_PUSH xmlGenericError(xmlGenericErrorContext, "PP: Parsing Comment\n"); @@ -11857,8 +11899,6 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) { if (ctxt->instate == XML_PARSER_EOF) goto done; ctxt->instate = XML_PARSER_MISC; - ctxt->progressive = 1; - ctxt->checkIndex = 0; } else if ((cur == '<') && (next == '!') && (ctxt->input->cur[2] == 'D') && (ctxt->input->cur[3] == 'O') 
&& @@ -11867,18 +11907,13 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) { (ctxt->input->cur[6] == 'Y') && (ctxt->input->cur[7] == 'P') && (ctxt->input->cur[8] == 'E')) { - if ((!terminate) && - (xmlParseLookupSequence(ctxt, '>', 0, 0) < 0)) { - ctxt->progressive = XML_PARSER_DTD; - goto done; - } + if ((!terminate) && (!xmlParseLookupGt(ctxt))) + goto done; #ifdef DEBUG_PUSH xmlGenericError(xmlGenericErrorContext, "PP: Parsing internal subset\n"); #endif ctxt->inSubset = 1; - ctxt->progressive = 0; - ctxt->checkIndex = 0; xmlParseDocTypeDecl(ctxt); if (ctxt->instate == XML_PARSER_EOF) goto done; @@ -11911,8 +11946,6 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) { goto done; } else { ctxt->instate = XML_PARSER_START_TAG; - ctxt->progressive = XML_PARSER_START_TAG; - xmlParseGetLasts(ctxt, &lastlt, &lastgt); #ifdef DEBUG_PUSH xmlGenericError(xmlGenericErrorContext, "PP: entering START_TAG\n"); @@ -11932,10 +11965,8 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) { next = ctxt->input->cur[1]; if ((cur == '<') && (next == '?')) { if ((!terminate) && - (xmlParseLookupSequence(ctxt, '?', '>', 0) < 0)) { - ctxt->progressive = XML_PARSER_PI; + (!xmlParseLookupString(ctxt, 2, "?>", 2))) goto done; - } #ifdef DEBUG_PUSH xmlGenericError(xmlGenericErrorContext, "PP: Parsing PI\n"); @@ -11944,14 +11975,11 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) { if (ctxt->instate == XML_PARSER_EOF) goto done; ctxt->instate = XML_PARSER_PROLOG; - ctxt->progressive = 1; } else if ((cur == '<') && (next == '!') && (ctxt->input->cur[2] == '-') && (ctxt->input->cur[3] == '-')) { if ((!terminate) && - (xmlParseLookupSequence(ctxt, '-', '-', '>') < 0)) { - ctxt->progressive = XML_PARSER_COMMENT; + (!xmlParseLookupString(ctxt, 4, "-->", 3))) goto done; - } #ifdef DEBUG_PUSH xmlGenericError(xmlGenericErrorContext, "PP: Parsing Comment\n"); @@ -11960,15 +11988,11 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) { if 
(ctxt->instate == XML_PARSER_EOF) goto done; ctxt->instate = XML_PARSER_PROLOG; - ctxt->progressive = 1; } else if ((cur == '<') && (next == '!') && (avail < 4)) { goto done; } else { ctxt->instate = XML_PARSER_START_TAG; - if (ctxt->progressive == 0) - ctxt->progressive = XML_PARSER_START_TAG; - xmlParseGetLasts(ctxt, &lastlt, &lastgt); #ifdef DEBUG_PUSH xmlGenericError(xmlGenericErrorContext, "PP: entering START_TAG\n"); @@ -11988,10 +12012,8 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) { next = ctxt->input->cur[1]; if ((cur == '<') && (next == '?')) { if ((!terminate) && - (xmlParseLookupSequence(ctxt, '?', '>', 0) < 0)) { - ctxt->progressive = XML_PARSER_PI; + (!xmlParseLookupString(ctxt, 2, "?>", 2))) goto done; - } #ifdef DEBUG_PUSH xmlGenericError(xmlGenericErrorContext, "PP: Parsing PI\n"); @@ -12000,14 +12022,11 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) { if (ctxt->instate == XML_PARSER_EOF) goto done; ctxt->instate = XML_PARSER_EPILOG; - ctxt->progressive = 1; } else if ((cur == '<') && (next == '!') && (ctxt->input->cur[2] == '-') && (ctxt->input->cur[3] == '-')) { if ((!terminate) && - (xmlParseLookupSequence(ctxt, '-', '-', '>') < 0)) { - ctxt->progressive = XML_PARSER_COMMENT; + (!xmlParseLookupString(ctxt, 4, "-->", 3))) goto done; - } #ifdef DEBUG_PUSH xmlGenericError(xmlGenericErrorContext, "PP: Parsing Comment\n"); @@ -12016,7 +12035,6 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) { if (ctxt->instate == XML_PARSER_EOF) goto done; ctxt->instate = XML_PARSER_EPILOG; - ctxt->progressive = 1; } else if ((cur == '<') && (next == '!') && (avail < 4)) { goto done; @@ -12033,117 +12051,8 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) { } break; case XML_PARSER_DTD: { - /* - * Sorry but progressive parsing of the internal subset - * is not expected to be supported. We first check that - * the full content of the internal subset is available and - * the parsing is launched only at that point. 
- * Internal subset ends up with "']' S? '>'" in an unescaped - * section and not in a ']]>' sequence which are conditional - * sections (whoever argued to keep that crap in XML deserve - * a place in hell !). - */ - int base, i; - xmlChar *buf; - xmlChar quote = 0; - size_t use; - - base = ctxt->input->cur - ctxt->input->base; - if (base < 0) return(0); - if (ctxt->checkIndex > base) - base = ctxt->checkIndex; - buf = xmlBufContent(ctxt->input->buf->buffer); - use = xmlBufUse(ctxt->input->buf->buffer); - for (;(unsigned int) base < use; base++) { - if (quote != 0) { - if (buf[base] == quote) - quote = 0; - continue; - } - if ((quote == 0) && (buf[base] == '<')) { - int found = 0; - /* special handling of comments */ - if (((unsigned int) base + 4 < use) && - (buf[base + 1] == '!') && - (buf[base + 2] == '-') && - (buf[base + 3] == '-')) { - for (;(unsigned int) base + 3 < use; base++) { - if ((buf[base] == '-') && - (buf[base + 1] == '-') && - (buf[base + 2] == '>')) { - found = 1; - base += 2; - break; - } - } - if (!found) { -#if 0 - fprintf(stderr, "unfinished comment\n"); -#endif - break; /* for */ - } - continue; - } - } - if (buf[base] == '"') { - quote = '"'; - continue; - } - if (buf[base] == '\'') { - quote = '\''; - continue; - } - if (buf[base] == ']') { -#if 0 - fprintf(stderr, "%c%c%c%c: ", buf[base], - buf[base + 1], buf[base + 2], buf[base + 3]); -#endif - if ((unsigned int) base +1 >= use) - break; - if (buf[base + 1] == ']') { - /* conditional crap, skip both ']' ! 
*/ - base++; - continue; - } - for (i = 1; (unsigned int) base + i < use; i++) { - if (buf[base + i] == '>') { -#if 0 - fprintf(stderr, "found\n"); -#endif - goto found_end_int_subset; - } - if (!IS_BLANK_CH(buf[base + i])) { -#if 0 - fprintf(stderr, "not found\n"); -#endif - goto not_end_of_int_subset; - } - } -#if 0 - fprintf(stderr, "end of stream\n"); -#endif - break; - - } -not_end_of_int_subset: - continue; /* for */ - } - /* - * We didn't found the end of the Internal subset - */ - if (quote == 0) - ctxt->checkIndex = base; - else - ctxt->checkIndex = 0; -#ifdef DEBUG_PUSH - if (next == 0) - xmlGenericError(xmlGenericErrorContext, - "PP: lookup of int subset end filed\n"); -#endif - goto done; - -found_end_int_subset: - ctxt->checkIndex = 0; + if ((!terminate) && (!xmlParseLookupInternalSubset(ctxt))) + goto done; xmlParseInternalSubset(ctxt); if (ctxt->instate == XML_PARSER_EOF) goto done; @@ -12157,7 +12066,6 @@ found_end_int_subset: if (ctxt->instate == XML_PARSER_EOF) goto done; ctxt->instate = XML_PARSER_PROLOG; - ctxt->checkIndex = 0; #ifdef DEBUG_PUSH xmlGenericError(xmlGenericErrorContext, "PP: entering PROLOG\n"); @@ -12257,55 +12165,6 @@ encoding_error: return(0); } -/** - * xmlParseCheckTransition: - * @ctxt: an XML parser context - * @chunk: a char array - * @size: the size in byte of the chunk - * - * Check depending on the current parser state if the chunk given must be - * processed immediately or one need more data to advance on parsing. 
- * - * Returns -1 in case of error, 0 if the push is not needed and 1 if needed - */ -static int -xmlParseCheckTransition(xmlParserCtxtPtr ctxt, const char *chunk, int size) { - if ((ctxt == NULL) || (chunk == NULL) || (size < 0)) - return(-1); - if (ctxt->instate == XML_PARSER_START_TAG) { - if (memchr(chunk, '>', size) != NULL) - return(1); - return(0); - } - if (ctxt->progressive == XML_PARSER_COMMENT) { - if (memchr(chunk, '>', size) != NULL) - return(1); - return(0); - } - if (ctxt->instate == XML_PARSER_CDATA_SECTION) { - if (memchr(chunk, '>', size) != NULL) - return(1); - return(0); - } - if (ctxt->progressive == XML_PARSER_PI) { - if (memchr(chunk, '>', size) != NULL) - return(1); - return(0); - } - if (ctxt->instate == XML_PARSER_END_TAG) { - if (memchr(chunk, '>', size) != NULL) - return(1); - return(0); - } - if ((ctxt->progressive == XML_PARSER_DTD) || - (ctxt->instate == XML_PARSER_DTD)) { - if (memchr(chunk, '>', size) != NULL) - return(1); - return(0); - } - return(1); -} - /** * xmlParseChunk: * @ctxt: an XML parser context @@ -12322,8 +12181,6 @@ xmlParseChunk(xmlParserCtxtPtr ctxt, const char *chunk, int size, int terminate) { int end_in_lf = 0; int remain = 0; - size_t old_avail = 0; - size_t avail = 0; if (ctxt == NULL) return(XML_ERR_INTERNAL_ERROR); @@ -12331,6 +12188,10 @@ xmlParseChunk(xmlParserCtxtPtr ctxt, const char *chunk, int size, return(ctxt->errNo); if (ctxt->instate == XML_PARSER_EOF) return(-1); + if (ctxt->input == NULL) + return(-1); + + ctxt->progressive = 1; if (ctxt->instate == XML_PARSER_START) xmlDetectSAX2(ctxt); if ((size > 0) && (chunk != NULL) && (!terminate) && @@ -12347,7 +12208,6 @@ xmldecl_done: size_t cur = ctxt->input->cur - ctxt->input->base; int res; - old_avail = xmlBufUse(ctxt->input->buf->buffer); /* * Specific handling if we autodetected an encoding, we should not * push more than the first line ... 
which depend on the encoding @@ -12415,23 +12275,11 @@ xmldecl_done: } } } + if (remain != 0) { xmlParseTryOrFinish(ctxt, 0); } else { - if ((ctxt->input != NULL) && (ctxt->input->buf != NULL)) - avail = xmlBufUse(ctxt->input->buf->buffer); - /* - * Depending on the current state it may not be such - * a good idea to try parsing if there is nothing in the chunk - * which would be worth doing a parser state transition and we - * need to wait for more data - */ - if ((terminate) || (avail > XML_MAX_TEXT_LENGTH) || - (old_avail == 0) || (avail == 0) || - (xmlParseCheckTransition(ctxt, - (const char *)&ctxt->input->base[old_avail], - avail - old_avail))) - xmlParseTryOrFinish(ctxt, terminate); + xmlParseTryOrFinish(ctxt, terminate); } if (ctxt->instate == XML_PARSER_EOF) return(ctxt->errNo); @@ -14895,6 +14743,7 @@ xmlCtxtReset(xmlParserCtxtPtr ctxt) #endif ctxt->record_info = 0; ctxt->checkIndex = 0; + ctxt->endCheckState = 0; ctxt->inSubset = 0; ctxt->errNo = XML_ERR_OK; ctxt->depth = 0; diff --git a/result/errors/754946.xml.str b/result/errors/754946.xml.str index 49395b61..7aaf045b 100644 --- a/result/errors/754946.xml.str +++ b/result/errors/754946.xml.str @@ -1,4 +1,15 @@ -./test/errors/754946.xml:1: parser error : Extra content at the end of the document - + ^ +Entity: line 1: parser error : xmlParseEntityDecl: no name +%zz; + ^ +Entity: line 1: + + ^ +Entity: line 1: parser error : ParsePI: PI xDOCTYPEm space expected +%zz; + ^ +Entity: line 1: + + ^ +./test/errors/759573-2.xml:6: parser error : internal error: xmlParseInternalSubset: error detected in Markup declaration + +%xx;ÿggKENSMYNT#MENTDŴzz;'> + ^ +Entity: line 2: + + ^ +./test/errors/759573-2.xml:6: parser error : DOCTYPE improperly terminated +%xx;ÿggKENSMYNT#MENTDŴzz;'> + ^ ./test/errors/759573-2.xml : failed to parse diff --git a/result/errors/759573.xml.str b/result/errors/759573.xml.str index 1b6addb7..432f66b2 100644 --- a/result/errors/759573.xml.str +++ b/result/errors/759573.xml.str @@ -1,4 
+1,30 @@ -./test/errors/759573.xml:1: parser error : Extra content at the end of the document -%xx; + ^ +Entity: line 1: +%%xx; + ^ +Entity: line 1: +%%xx; + ^ +./test/errors/759573.xml:1: parser error : DOCTYPE improperly terminated +T t (A)>%xx; + ^ ./test/errors/759573.xml : failed to parse