From e50f3b5d54496e3afb411a9c3012c697b75eb34e Mon Sep 17 00:00:00 2001 From: Daniel Veillard Date: Wed, 20 Mar 2002 19:24:21 +0000 Subject: [PATCH] I wanted to see the real speed at the SAX interface after a little too * testSAX.c: I wanted to see the real speed at the SAX interface after a little too many Ximianer started complaining about the parser speed. added a --quiet option: paphio:~/XML -> ls -l db100000.xml -rw-rw-r-- 1 veillard www 20182040 Mar 20 10:30 db100000.xml paphio:~/XML -> time ./testSAX --quiet db100000.xml 3200006 callbacks generated real 0m1.270s Which means 16MBytes/s and 3Mcallback/s Daniel --- ChangeLog | 13 ++++ HTMLparser.c | 13 ++-- error.c | 106 +++++++++------------------- include/libxml/tree.h | 3 +- parser.c | 114 +++++++++++++++++------------- parserInternals.c | 118 +++++++++++++++++++++++--------- testSAX.c | 97 +++++++++++++++++++++++++- tree.c | 32 ++++++++- xmlIO.c | 156 +++++++++++++++++++++++++++++++++++++++--- 9 files changed, 479 insertions(+), 173 deletions(-) diff --git a/ChangeLog b/ChangeLog index 93dac5f7..5f6be58d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,16 @@ +Wed Mar 20 20:20:57 CET 2002 Daniel Veillard + + * testSAX.c: I wanted to see the real speed at the SAX interface + after a little too many Ximianer started complaining about the + parser speed. + added a --quiet option: + paphio:~/XML -> ls -l db100000.xml + -rw-rw-r-- 1 veillard www 20182040 Mar 20 10:30 db100000.xml + paphio:~/XML -> time ./testSAX --quiet db100000.xml + 3200006 callbacks generated + real 0m1.270s + Which means 16MBytes/s and 3Mcallback/s + Tue Mar 19 19:33:57 CET 2002 Daniel Veillard * xpath.c: valgrind spotted another error that time when running diff --git a/HTMLparser.c b/HTMLparser.c index 4c819d1b..f5da2983 100644 --- a/HTMLparser.c +++ b/HTMLparser.c @@ -147,11 +147,12 @@ PUSH_AND_POP(static, xmlChar*, name) /* Inported from XML */ -/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */ -#define CUR ((int) (*ctxt->input->cur)) +#define CUR ((ctxt->input->cur < ctxt->input->end) ? (*ctxt->input->cur) : 0) #define NEXT xmlNextChar(ctxt),ctxt->nbChars++ +#define AVAIL (ctxt->input->end - ctxt->input->cur) -#define RAW (ctxt->token ? -1 : (*ctxt->input->cur)) +#define RAW (ctxt->token ? -1 : \ + (ctxt->input->cur < ctxt->input->end) ? (*ctxt->input->cur) : 0) #define NXT(val) ctxt->input->cur[(val)] #define CUR_PTR ctxt->input->cur @@ -3047,8 +3048,8 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) { else { /* Dump the bogus attribute string up to the next blank or * the end of the tag. */ - while ((IS_CHAR(CUR)) && !(IS_BLANK(CUR)) && (CUR != '>') - && ((CUR != '/') || (NXT(1) != '>'))) + while ((AVAIL > 0) && (IS_CHAR(CUR)) && !(IS_BLANK(CUR)) && + (CUR != '>') && ((CUR != '/') || (NXT(1) != '>'))) NEXT; } @@ -4670,6 +4671,8 @@ htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size, xmlParserInputBufferPush(ctxt->input->buf, size, chunk); ctxt->input->base = ctxt->input->buf->buffer->content + base; ctxt->input->cur = ctxt->input->base + cur; + ctxt->input->end = ctxt->input->buf->buffer->content + + ctxt->input->buf->buffer->use; #ifdef DEBUG_PUSH xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size); #endif diff --git a/error.c b/error.c index 14d91f31..6a4a69f7 100644 --- a/error.c +++ b/error.c @@ -146,103 +146,63 @@ xmlParserPrintFileInfo(xmlParserInputPtr input) { */ void -xmlParserPrintFileContext(xmlParserInputPtr input) { - const xmlChar *cur, *base; +xmlParserPrintFileContext(xmlParserInputPtr input) +{ + const xmlChar *cur, *base, *end; int n; - xmlChar content[81]; + xmlChar content[81]; xmlChar *ctnt; - if (input == NULL) return; + if (input == NULL) + return; cur = input->cur; base = input->base; + end = input->end; /* skip backwards over any end-of-lines */ - while ((cur > base) && ((*cur == '\n') || (*cur == '\r'))) { - cur--; + while ((cur > base) && (cur < end) + && ((*cur == '\n') || (*cur == '\r'))) { + cur--; } n = 0; /* search backwards for beginning-of-line maximum 80 characters */ - while ((n++ < 80) && (cur > base) && (*cur != '\n') && (*cur != '\r')) + while ((n++ < 80) && (cur > base) && (cur < end) && (*cur != '\n') + && (*cur != '\r')) cur--; - if ((*cur == '\n') || (*cur == '\r')) cur++; - /* search forward for end-of-line maximum 80 characters */ + if ((cur > base) && (cur < end) && ((*cur == '\n') || (*cur == '\r'))) + cur++; + /* search forward for end-of-line maximum 80 characters */ n = 0; ctnt = content; - while ((*cur != 0) && (*cur != '\n') && (*cur != '\r') && (n < 79)) { - *ctnt++ = *cur++; - n++; + while ((cur < end) && (*cur != 0) && (*cur != '\n') && (*cur != '\r') + && (n < 79)) { + *ctnt++ = *cur++; + n++; } *ctnt = 0; - xmlGenericError(xmlGenericErrorContext,"%s\n", content); + xmlGenericError(xmlGenericErrorContext, "%s\n", content); /* create blank line with problem pointer */ cur = input->cur; - while ((cur > base) && ((*cur == '\n') || (*cur == '\r'))) { - cur--; - } + while ((cur > base) && (cur < end) + && ((*cur == '\n') || (*cur == '\r'))) { + cur--; + } n = 0; ctnt = content; - while ((n++ < 79) && (cur > base) && (*cur != '\n') && (*cur != '\r')) { - *ctnt++ = ' '; - cur--; + while ((n++ < 79) && (cur > base) && (cur < end) && (*cur != '\n') + && (*cur != '\r')) { + *ctnt++ = ' '; + cur--; } if (ctnt > content) { - *(--ctnt) = '^'; - *(++ctnt) = 0; + *(--ctnt) = '^'; + *(++ctnt) = 0; } else { - *ctnt = '^'; - *(++ctnt) = 0; + *ctnt = '^'; + *(++ctnt) = 0; } - xmlGenericError(xmlGenericErrorContext,"%s\n", content); + xmlGenericError(xmlGenericErrorContext, "%s\n", content); } -#if 0 -/** - * xmlGetVarStr: - * @msg: the message format - * @args: a va_list argument list - * - * SGS contribution - * Get an arbitrary-sized string for an error argument - * The caller must free() the returned string - */ -static char * -xmlGetVarStr(const char * msg, va_list args) { - int size; - int length; - int chars, left; - char *str, *larger; - va_list ap; - - str = (char *) xmlMalloc(150); - if (str == NULL) - return(NULL); - - size = 150; - length = 0; - - while (1) { - left = size - length; - /* Try to print in the allocated space. */ - va_start(msg, ap); - chars = vsnprintf(str + length, left, msg, ap); - va_end(ap); - /* If that worked, we're done. */ - if ((chars > -1) && (chars < left )) - break; - /* Else try again with more space. */ - if (chars > -1) /* glibc 2.1 */ - size += chars + 1; /* precisely what is needed */ - else /* glibc 2.0 */ - size += 100; - if ((larger = (char *) xmlRealloc(str, size)) == NULL) { - xmlFree(str); - return(NULL); - } - str = larger; - } - return(str); -} -#endif - /** * xmlParserError: * @ctx: an XML parser context diff --git a/include/libxml/tree.h b/include/libxml/tree.h index 940426dd..448a5992 100644 --- a/include/libxml/tree.h +++ b/include/libxml/tree.h @@ -402,7 +402,8 @@ struct _xmlRef { typedef enum { XML_BUFFER_ALLOC_DOUBLEIT, - XML_BUFFER_ALLOC_EXACT + XML_BUFFER_ALLOC_EXACT, + XML_BUFFER_ALLOC_UNMUTABLE } xmlBufferAllocationScheme; /** diff --git a/parser.c b/parser.c index a14cdbda..13e9d509 100644 --- a/parser.c +++ b/parser.c @@ -268,29 +268,32 @@ static int spacePop(xmlParserCtxtPtr ctxt) { * GROW, SHRINK handling of input buffers */ -#define RAW (ctxt->token ? -1 : (*ctxt->input->cur)) -#define CUR (ctxt->token ? ctxt->token : (*ctxt->input->cur)) +#define AVAIL (ctxt->input->end - ctxt->input->cur) +#define RAW (ctxt->token ? -1 : \ + (ctxt->input->cur < ctxt->input->end) ? (*ctxt->input->cur) : 0) +#define CUR (ctxt->token ? ctxt->token : \ + (ctxt->input->cur < ctxt->input->end) ? (*ctxt->input->cur) : 0) #define NXT(val) ctxt->input->cur[(val)] #define CUR_PTR ctxt->input->cur #define SKIP(val) do { \ ctxt->nbChars += (val),ctxt->input->cur += (val); \ if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \ - if ((*ctxt->input->cur == 0) && \ + if ((ctxt->input->cur >= ctxt->input->end) && \ (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) \ xmlPopInput(ctxt); \ } while (0) #define SHRINK if (ctxt->input->cur - ctxt->input->base > INPUT_CHUNK) {\ xmlParserInputShrink(ctxt->input); \ - if ((*ctxt->input->cur == 0) && \ + if ((ctxt->input->cur >= ctxt->input->end) && \ (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) \ xmlPopInput(ctxt); \ } #define GROW if (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK) { \ xmlParserInputGrow(ctxt->input, INPUT_CHUNK); \ - if ((*ctxt->input->cur == 0) && \ + if ((ctxt->input->cur >= ctxt->input->end) && \ (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) \ xmlPopInput(ctxt); \ } @@ -302,7 +305,7 @@ static int spacePop(xmlParserCtxtPtr ctxt) { #define NEXT1 { \ ctxt->input->cur++; \ ctxt->nbChars++; \ - if (*ctxt->input->cur == 0) \ + if (ctxt->input->cur >= ctxt->input->end) \ xmlParserInputGrow(ctxt->input, INPUT_CHUNK); \ } @@ -409,7 +412,7 @@ xmlPopInput(xmlParserCtxtPtr ctxt) { xmlGenericError(xmlGenericErrorContext, "Popping input %d\n", ctxt->inputNr); xmlFreeInputStream(inputPop(ctxt)); - if ((*ctxt->input->cur == 0) && + if ((ctxt->input->cur >= ctxt->input->end) && (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) return(xmlPopInput(ctxt)); return(CUR); @@ -467,7 +470,8 @@ xmlParseCharRef(xmlParserCtxtPtr ctxt) { /* * Using RAW/CUR/NEXT is okay since we are working on ASCII range here */ - if ((RAW == '&') && (NXT(1) == '#') && + GROW; + if ((AVAIL >= 5) && (RAW == '&') && (NXT(1) == '#') && (NXT(2) == 'x')) { SKIP(3); GROW; @@ -500,7 +504,7 @@ xmlParseCharRef(xmlParserCtxtPtr ctxt) { ctxt->nbChars ++; ctxt->input->cur++; } - } else if ((RAW == '&') && (NXT(1) == '#')) { + } else if ((AVAIL >= 4) && (RAW == '&') && (NXT(1) == '#')) { SKIP(2); GROW; while (RAW != ';') { /* loop blocked by count */ @@ -836,16 +840,19 @@ xmlParserHandlePEReference(xmlParserCtxtPtr ctxt) { * plug some encoding conversion routines. */ GROW - start[0] = RAW; - start[1] = NXT(1); - start[2] = NXT(2); - start[3] = NXT(3); - enc = xmlDetectCharEncoding(start, 4); - if (enc != XML_CHAR_ENCODING_NONE) { - xmlSwitchEncoding(ctxt, enc); + if (AVAIL > 4) { + start[0] = RAW; + start[1] = NXT(1); + start[2] = NXT(2); + start[3] = NXT(3); + enc = xmlDetectCharEncoding(start, 4); + if (enc != XML_CHAR_ENCODING_NONE) { + xmlSwitchEncoding(ctxt, enc); + } } if ((entity->etype == XML_EXTERNAL_PARAMETER_ENTITY) && + (AVAIL >= 6) && (RAW == '<') && (NXT(1) == '?') && (NXT(2) == 'x') && (NXT(3) == 'm') && (NXT(4) == 'l') && (IS_BLANK(NXT(5)))) { @@ -1526,7 +1533,7 @@ static int areBlanks(xmlParserCtxtPtr ctxt, const xmlChar *str, int len) { /* * Otherwise, heuristic :-\ */ - if (RAW != '<') return(0); + if ((AVAIL < 2) || (RAW != '<')) return(0); if ((ctxt->node->children == NULL) && (RAW == '<') && (NXT(1) == '/')) return(0); @@ -2555,7 +2562,7 @@ void xmlParseCharDataComplex(xmlParserCtxtPtr ctxt, int cdata); void xmlParseCharData(xmlParserCtxtPtr ctxt, int cdata) { - const xmlChar *in; + const xmlChar *in, *end; int nbchar = 0; int line = ctxt->input->line; int col = ctxt->input->col; @@ -2568,21 +2575,24 @@ xmlParseCharData(xmlParserCtxtPtr ctxt, int cdata) { */ if ((ctxt->token == 0) && (!cdata)) { in = ctxt->input->cur; + end = ctxt->input->end; do { get_more: - while (((*in >= 0x20) && (*in != '<') && (*in != ']') && - (*in != '&') && (*in <= 0x7F)) || (*in == 0x09)) + while ((in < end) && + (((*in >= 0x20) && (*in != '<') && (*in != ']') && + (*in != '&') && (*in <= 0x7F)) || (*in == 0x09))) in++; - if (*in == 0xA) { + if (in >= end) { + end = ctxt->input->end; + } else if (*in == 0xA) { ctxt->input->line++; in++; - while (*in == 0xA) { + while ((in < end) && (*in == 0xA)) { ctxt->input->line++; in++; } goto get_more; - } - if (*in == ']') { + } else if (*in == ']') { if ((in[1] == ']') && (in[2] == '>')) { ctxt->errNo = XML_ERR_MISPLACED_CDATA_END; if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) @@ -2621,26 +2631,29 @@ get_more: } } ctxt->input->cur = in; - if (*in == 0xD) { - in++; - if (*in == 0xA) { - ctxt->input->cur = in; + if (in < ctxt->input->end) { + if (*in == 0xD) { in++; - ctxt->input->line++; - continue; /* while */ + if (*in == 0xA) { + ctxt->input->cur = in; + in++; + ctxt->input->line++; + continue; /* while */ + } + in--; + } + if (*in == '<') { + return; + } + if (*in == '&') { + return; } - in--; - } - if (*in == '<') { - return; - } - if (*in == '&') { - return; } SHRINK; GROW; in = ctxt->input->cur; - } while ((*in >= 0x20) && (*in <= 0x7F)); + end = ctxt->input->end; + } while ((in < end) && (*in >= 0x20) && (*in <= 0x7F)); nbchar = 0; } ctxt->input->line = line; @@ -6746,7 +6759,7 @@ xmlParseCDSect(xmlParserCtxtPtr ctxt) { void xmlParseContent(xmlParserCtxtPtr ctxt) { GROW; - while (((RAW != 0) || (ctxt->token != 0)) && + while ((AVAIL > 0) && ((RAW != 0) || (ctxt->token != 0)) && ((RAW != '<') || (NXT(1) != '/'))) { const xmlChar *test = CUR_PTR; int cons = ctxt->input->consumed; @@ -6813,7 +6826,7 @@ xmlParseContent(xmlParserCtxtPtr ctxt) { /* * Pop-up of finished entities. */ - while ((RAW == 0) && (ctxt->inputNr > 1)) + while (((AVAIL == 0) || (RAW == 0)) && (ctxt->inputNr > 1)) xmlPopInput(ctxt); SHRINK; @@ -7527,13 +7540,16 @@ xmlParseXMLDecl(xmlParserCtxtPtr ctxt) { void xmlParseMisc(xmlParserCtxtPtr ctxt) { - while (((RAW == '<') && (NXT(1) == '?')) || - ((RAW == '<') && (NXT(1) == '!') && - (NXT(2) == '-') && (NXT(3) == '-')) || - IS_BLANK(CUR)) { - if ((RAW == '<') && (NXT(1) == '?')) { + if (AVAIL < 4) { + GROW; + } + while (((AVAIL >= 2) && ((RAW == '<') && (NXT(1) == '?'))) || + ((AVAIL >= 4) && ((RAW == '<') && (NXT(1) == '!') && + (NXT(2) == '-') && (NXT(3) == '-'))) || + ((AVAIL > 0) && (IS_BLANK(CUR)))) { + if ((AVAIL >= 2) && (RAW == '<') && (NXT(1) == '?')) { xmlParsePI(ctxt); - } else if (IS_BLANK(CUR)) { + } else if ((AVAIL > 0) && (IS_BLANK(CUR))) { NEXT; } else xmlParseComment(ctxt); @@ -7683,7 +7699,7 @@ xmlParseDocument(xmlParserCtxtPtr ctxt) { */ xmlParseMisc(ctxt); - if (RAW != 0) { + if ((AVAIL > 0) && (RAW != 0)) { ctxt->errNo = XML_ERR_DOCUMENT_END; if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, @@ -9525,14 +9541,14 @@ xmlParseExternalEntityPrivate(xmlDocPtr doc, xmlParserCtxtPtr oldctxt, xmlParseContent(ctxt); - if ((RAW == '<') && (NXT(1) == '/')) { + if ((AVAIL > 2) && (RAW == '<') && (NXT(1) == '/')) { ctxt->errNo = XML_ERR_NOT_WELL_BALANCED; if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "chunk is not well balanced\n"); ctxt->wellFormed = 0; ctxt->disableSAX = 1; - } else if (RAW != 0) { + } else if ((AVAIL > 0) && (RAW != 0)) { ctxt->errNo = XML_ERR_EXTRA_CONTENT; if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, diff --git a/parserInternals.c b/parserInternals.c index 68ac5382..ca84b19a 100644 --- a/parserInternals.c +++ b/parserInternals.c @@ -993,6 +993,7 @@ xmlParserInputGrow(xmlParserInputPtr in, int len) { if (in->base == NULL) return(-1); if (in->cur == NULL) return(-1); if (in->buf->buffer == NULL) return(-1); + if (in->buf->buffer->alloc == XML_BUFFER_ALLOC_UNMUTABLE) return(-1); CHECK_BUFFER(in); @@ -1048,6 +1049,7 @@ xmlParserInputShrink(xmlParserInputPtr in) { if (in->base == NULL) return; if (in->cur == NULL) return; if (in->buf->buffer == NULL) return; + if (in->buf->buffer->alloc == XML_BUFFER_ALLOC_UNMUTABLE) return; CHECK_BUFFER(in); @@ -1101,9 +1103,13 @@ xmlParserInputShrink(xmlParserInputPtr in) { void xmlNextChar(xmlParserCtxtPtr ctxt) { + int avail; + if (ctxt->instate == XML_PARSER_EOF) return; + avail = ctxt->input->end - ctxt->input->cur; + /* * 2.11 End-of-Line Handling * the literal two-character sequence "#xD#xA" or a standalone @@ -1112,7 +1118,8 @@ xmlNextChar(xmlParserCtxtPtr ctxt) { */ if (ctxt->token != 0) ctxt->token = 0; else if (ctxt->charset == XML_CHAR_ENCODING_UTF8) { - if ((*ctxt->input->cur == 0) && + if (((ctxt->input->cur >= ctxt->input->end) || + (*ctxt->input->cur == 0)) && (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0) && (ctxt->instate != XML_PARSER_COMMENT)) { /* @@ -1126,6 +1133,14 @@ xmlNextChar(xmlParserCtxtPtr ctxt) { if (*(ctxt->input->cur) == '\n') { ctxt->input->line++; ctxt->input->col = 1; } else ctxt->input->col++; + + if (avail < 4) { + xmlParserInputGrow(ctxt->input, INPUT_CHUNK); + avail = ctxt->input->end - ctxt->input->cur; + } + if (avail < 1) + return; + if (ctxt->charset == XML_CHAR_ENCODING_UTF8) { /* * We are supposed to handle UTF8, check it's valid @@ -1143,21 +1158,15 @@ xmlNextChar(xmlParserCtxtPtr ctxt) { c = *cur; if (c & 0x80) { - if (cur[1] == 0) - xmlParserInputGrow(ctxt->input, INPUT_CHUNK); - if ((cur[1] & 0xc0) != 0x80) + if ((avail < 2) || (cur[1] & 0xc0) != 0x80) goto encoding_error; if ((c & 0xe0) == 0xe0) { unsigned int val; - if (cur[2] == 0) - xmlParserInputGrow(ctxt->input, INPUT_CHUNK); - if ((cur[2] & 0xc0) != 0x80) + if ((avail < 3) || ((cur[2] & 0xc0) != 0x80)) goto encoding_error; if ((c & 0xf0) == 0xf0) { - if (cur[3] == 0) - xmlParserInputGrow(ctxt->input, INPUT_CHUNK); - if (((c & 0xf8) != 0xf0) || + if ((avail < 4) || ((c & 0xf8) != 0xf0) || ((cur[3] & 0xc0) != 0x80)) goto encoding_error; /* 4-byte code */ @@ -1199,18 +1208,18 @@ xmlNextChar(xmlParserCtxtPtr ctxt) { ctxt->input->cur++; } ctxt->nbChars++; - if (*ctxt->input->cur == 0) - xmlParserInputGrow(ctxt->input, INPUT_CHUNK); } } else { ctxt->input->cur++; ctxt->nbChars++; - if (*ctxt->input->cur == 0) + if (avail < 1) { xmlParserInputGrow(ctxt->input, INPUT_CHUNK); + avail = ctxt->input->end - ctxt->input->cur; + } } - if ((*ctxt->input->cur == '%') && (!ctxt->html)) + if ((avail > 1) && (*ctxt->input->cur == '%') && (!ctxt->html)) xmlParserHandlePEReference(ctxt); - if ((*ctxt->input->cur == 0) && + if ((avail < 1) && (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) xmlPopInput(ctxt); return; @@ -1223,17 +1232,34 @@ encoding_error: * encoding !) */ if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) { + xmlChar bytes[4]; + if (avail > 3) + bytes[3] = ctxt->input->cur[3]; + else + bytes[3] = 0; + if (avail > 2) + bytes[2] = ctxt->input->cur[2]; + else + bytes[2] = 0; + if (avail > 1) + bytes[1] = ctxt->input->cur[1]; + else + bytes[1] = 0; + if (avail > 0) + bytes[0] = ctxt->input->cur[0]; + else + bytes[0] = 0; ctxt->sax->error(ctxt->userData, "Input is not proper UTF-8, indicate encoding !\n"); ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n", - ctxt->input->cur[0], ctxt->input->cur[1], - ctxt->input->cur[2], ctxt->input->cur[3]); + bytes[0], bytes[1],bytes[2],bytes[3]); } ctxt->wellFormed = 0; ctxt->errNo = XML_ERR_INVALID_ENCODING; ctxt->charset = XML_CHAR_ENCODING_8859_1; - ctxt->input->cur++; + if (avail > 1) + ctxt->input->cur++; return; } @@ -1257,6 +1283,8 @@ encoding_error: int xmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) { + int avail; + if (ctxt->instate == XML_PARSER_EOF) return(0); @@ -1264,9 +1292,18 @@ xmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) { *len = 0; return(ctxt->token); } - if ((*ctxt->input->cur >= 0x20) && (*ctxt->input->cur <= 0x7F)) { - *len = 1; - return((int) *ctxt->input->cur); + avail = ctxt->input->end - ctxt->input->cur; + if (avail < 4) { + xmlParserInputGrow(ctxt->input, INPUT_CHUNK); + avail = ctxt->input->end - ctxt->input->cur; + } + if (avail < 1) + return(0); + + if ((avail > 1) && + (*ctxt->input->cur >= 0x20) && (*ctxt->input->cur <= 0x7F)) { + *len = 1; + return((int) *ctxt->input->cur); } if (ctxt->charset == XML_CHAR_ENCODING_UTF8) { /* @@ -1286,20 +1323,15 @@ xmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) { c = *cur; if (c & 0x80) { - if (cur[1] == 0) - xmlParserInputGrow(ctxt->input, INPUT_CHUNK); - if ((cur[1] & 0xc0) != 0x80) + if ((avail < 2) || (cur[1] & 0xc0) != 0x80) goto encoding_error; if ((c & 0xe0) == 0xe0) { - - if (cur[2] == 0) - xmlParserInputGrow(ctxt->input, INPUT_CHUNK); - if ((cur[2] & 0xc0) != 0x80) + if ((avail < 3) || ((cur[2] & 0xc0) != 0x80)) goto encoding_error; if ((c & 0xf0) == 0xf0) { if (cur[3] == 0) xmlParserInputGrow(ctxt->input, INPUT_CHUNK); - if (((c & 0xf8) != 0xf0) || + if ((avail < 4) || ((c & 0xf8) != 0xf0) || ((cur[3] & 0xc0) != 0x80)) goto encoding_error; /* 4-byte code */ @@ -1335,7 +1367,7 @@ xmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) { /* 1-byte code */ *len = 1; if (*ctxt->input->cur == 0xD) { - if (ctxt->input->cur[1] == 0xA) { + if ((avail > 1) && (ctxt->input->cur[1] == 0xA)) { ctxt->nbChars++; ctxt->input->cur++; } @@ -1351,7 +1383,7 @@ xmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) { */ *len = 1; if (*ctxt->input->cur == 0xD) { - if (ctxt->input->cur[1] == 0xA) { + if ((avail > 1) && (ctxt->input->cur[1] == 0xA)) { ctxt->nbChars++; ctxt->input->cur++; } @@ -1367,16 +1399,36 @@ encoding_error: * encoding !) */ if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) { + xmlChar bytes[4]; + if (avail > 3) + bytes[3] = ctxt->input->cur[3]; + else + bytes[3] = 0; + if (avail > 2) + bytes[2] = ctxt->input->cur[2]; + else + bytes[2] = 0; + if (avail > 1) + bytes[1] = ctxt->input->cur[1]; + else + bytes[1] = 0; + if (avail > 0) + bytes[0] = ctxt->input->cur[0]; + else + bytes[0] = 0; ctxt->sax->error(ctxt->userData, "Input is not proper UTF-8, indicate encoding !\n"); ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n", - ctxt->input->cur[0], ctxt->input->cur[1], - ctxt->input->cur[2], ctxt->input->cur[3]); + bytes[0], bytes[1],bytes[2],bytes[3]); } ctxt->wellFormed = 0; ctxt->errNo = XML_ERR_INVALID_ENCODING; ctxt->charset = XML_CHAR_ENCODING_8859_1; + if (avail < 1) { + *len = 0; + return(0); + } *len = 1; return((int) *ctxt->input->cur); } diff --git a/testSAX.c b/testSAX.c index 559177c7..cf284678 100644 --- a/testSAX.c +++ b/testSAX.c @@ -45,6 +45,8 @@ static int recovery = 0; static int push = 0; static int speed = 0; static int noent = 0; +static int quiet = 0; +static int callbacks = 0; xmlSAXHandler emptySAXHandlerStruct = { NULL, /* internalSubset */ @@ -97,6 +99,9 @@ extern xmlSAXHandlerPtr debugSAXHandler; static int isStandaloneDebug(void *ctx ATTRIBUTE_UNUSED) { + callbacks++; + if (quiet) + return(0); fprintf(stdout, "SAX.isStandalone()\n"); return(0); } @@ -112,6 +117,9 @@ isStandaloneDebug(void *ctx ATTRIBUTE_UNUSED) static int hasInternalSubsetDebug(void *ctx ATTRIBUTE_UNUSED) { + callbacks++; + if (quiet) + return(0); fprintf(stdout, "SAX.hasInternalSubset()\n"); return(0); } @@ -127,6 +135,9 @@ hasInternalSubsetDebug(void *ctx ATTRIBUTE_UNUSED) static int hasExternalSubsetDebug(void *ctx ATTRIBUTE_UNUSED) { + callbacks++; + if (quiet) + return(0); fprintf(stdout, "SAX.hasExternalSubset()\n"); return(0); } @@ -141,6 +152,9 @@ static void internalSubsetDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name, const xmlChar *ExternalID, const xmlChar *SystemID) { + callbacks++; + if (quiet) + return; fprintf(stdout, "SAX.internalSubset(%s,", name); if (ExternalID == NULL) fprintf(stdout, " ,"); @@ -162,6 +176,9 @@ static void externalSubsetDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name, const xmlChar *ExternalID, const xmlChar *SystemID) { + callbacks++; + if (quiet) + return; fprintf(stdout, "SAX.externalSubset(%s,", name); if (ExternalID == NULL) fprintf(stdout, " ,"); @@ -190,6 +207,9 @@ externalSubsetDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name, static xmlParserInputPtr resolveEntityDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *publicId, const xmlChar *systemId) { + callbacks++; + if (quiet) + return(NULL); /* xmlParserCtxtPtr ctxt = (xmlParserCtxtPtr) ctx; */ @@ -222,6 +242,9 @@ resolveEntityDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *publicId, const xm static xmlEntityPtr getEntityDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name) { + callbacks++; + if (quiet) + return(NULL); fprintf(stdout, "SAX.getEntity(%s)\n", name); return(NULL); } @@ -238,6 +261,9 @@ getEntityDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name) static xmlEntityPtr getParameterEntityDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name) { + callbacks++; + if (quiet) + return(NULL); fprintf(stdout, "SAX.getParameterEntity(%s)\n", name); return(NULL); } @@ -258,6 +284,9 @@ static void entityDeclDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name, int type, const xmlChar *publicId, const xmlChar *systemId, xmlChar *content) { + callbacks++; + if (quiet) + return; fprintf(stdout, "SAX.entityDecl(%s, %d, %s, %s, %s)\n", name, type, publicId, systemId, content); } @@ -275,6 +304,9 @@ attributeDeclDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *elem, const xmlCha int type, int def, const xmlChar *defaultValue, xmlEnumerationPtr tree ATTRIBUTE_UNUSED) { + callbacks++; + if (quiet) + return; if (defaultValue == NULL) fprintf(stdout, "SAX.attributeDecl(%s, %s, %d, %d, NULL, ...)\n", elem, name, type, def); @@ -296,6 +328,9 @@ static void elementDeclDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name, int type, xmlElementContentPtr content ATTRIBUTE_UNUSED) { + callbacks++; + if (quiet) + return; fprintf(stdout, "SAX.elementDecl(%s, %d, ...)\n", name, type); } @@ -313,6 +348,9 @@ static void notationDeclDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name, const xmlChar *publicId, const xmlChar *systemId) { + callbacks++; + if (quiet) + return; fprintf(stdout, "SAX.notationDecl(%s, %s, %s)\n", (char *) name, (char *) publicId, (char *) systemId); } @@ -332,6 +370,9 @@ unparsedEntityDeclDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name, const xmlChar *publicId, const xmlChar *systemId, const xmlChar *notationName) { + callbacks++; + if (quiet) + return; fprintf(stdout, "SAX.unparsedEntityDecl(%s, %s, %s, %s)\n", (char *) name, (char *) publicId, (char *) systemId, (char *) notationName); @@ -348,6 +389,9 @@ unparsedEntityDeclDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name, static void setDocumentLocatorDebug(void *ctx ATTRIBUTE_UNUSED, xmlSAXLocatorPtr loc ATTRIBUTE_UNUSED) { + callbacks++; + if (quiet) + return; fprintf(stdout, "SAX.setDocumentLocator()\n"); } @@ -360,6 +404,9 @@ setDocumentLocatorDebug(void *ctx ATTRIBUTE_UNUSED, xmlSAXLocatorPtr loc ATTRIBU static void startDocumentDebug(void *ctx ATTRIBUTE_UNUSED) { + callbacks++; + if (quiet) + return; fprintf(stdout, "SAX.startDocument()\n"); } @@ -372,6 +419,9 @@ startDocumentDebug(void *ctx ATTRIBUTE_UNUSED) static void endDocumentDebug(void *ctx ATTRIBUTE_UNUSED) { + callbacks++; + if (quiet) + return; fprintf(stdout, "SAX.endDocument()\n"); } @@ -387,6 +437,9 @@ startElementDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name, const xmlChar { int i; + callbacks++; + if (quiet) + return; fprintf(stdout, "SAX.startElement(%s", (char *) name); if (atts != NULL) { for (i = 0;(atts[i] != NULL);i++) { @@ -408,6 +461,9 @@ startElementDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name, const xmlChar static void endElementDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name) { + callbacks++; + if (quiet) + return; fprintf(stdout, "SAX.endElement(%s)\n", (char *) name); } @@ -426,6 +482,9 @@ charactersDebug(void *ctx ATTRIBUTE_UNUSED, const xmlChar *ch, int len) char output[40]; int i; + callbacks++; + if (quiet) + return; for (i = 0;(ialloc == XML_BUFFER_ALLOC_UNMUTABLE) + return; if (buf == NULL) { #ifdef DEBUG_BUFFER xmlGenericError(xmlGenericErrorContext, @@ -5186,7 +5188,8 @@ xmlBufferFree(xmlBufferPtr buf) { #endif return; } - if (buf->content != NULL) { + if ((buf->content != NULL) && + (buf->alloc != XML_BUFFER_ALLOC_UNMUTABLE)) { xmlFree(buf->content); } xmlFree(buf); @@ -5200,6 +5203,9 @@ xmlBufferFree(xmlBufferPtr buf) { */ void xmlBufferEmpty(xmlBufferPtr buf) { + if (buf->alloc == XML_BUFFER_ALLOC_UNMUTABLE) + return; + if (buf->content == NULL) return; buf->use = 0; memset(buf->content, 0, buf->size); @@ -5219,6 +5225,12 @@ xmlBufferShrink(xmlBufferPtr buf, unsigned int len) { if (len == 0) return(0); if (len > buf->use) return(-1); + if (buf->alloc == XML_BUFFER_ALLOC_UNMUTABLE) { + buf->content = &buf->content[len]; + buf->use -= len; + return(len); + } + buf->use -= len; memmove(buf->content, &buf->content[len], buf->use * sizeof(xmlChar)); @@ -5240,6 +5252,9 @@ xmlBufferGrow(xmlBufferPtr buf, unsigned int len) { int size; xmlChar *newbuf; + if (buf->alloc == XML_BUFFER_ALLOC_UNMUTABLE) + return(-1); + if (len + buf->use < buf->size) return(0); size = buf->use + len + 100; @@ -5334,6 +5349,9 @@ xmlBufferResize(xmlBufferPtr buf, unsigned int size) unsigned int newSize; xmlChar* rebuf = NULL; + if (buf->alloc == XML_BUFFER_ALLOC_UNMUTABLE) + return 0; + /*take care of empty case*/ newSize = (buf->size ? buf->size*2 : size); @@ -5383,6 +5401,9 @@ void xmlBufferAdd(xmlBufferPtr buf, const xmlChar *str, int len) { unsigned int needSize; + if (buf->alloc == XML_BUFFER_ALLOC_UNMUTABLE) + return; + if (str == NULL) { #ifdef DEBUG_BUFFER xmlGenericError(xmlGenericErrorContext, @@ -5431,6 +5452,9 @@ void xmlBufferAddHead(xmlBufferPtr buf, const xmlChar *str, int len) { unsigned int needSize; + if (buf->alloc == XML_BUFFER_ALLOC_UNMUTABLE) + return; + if (str == NULL) { #ifdef DEBUG_BUFFER xmlGenericError(xmlGenericErrorContext, @@ -5491,6 +5515,9 @@ void xmlBufferCCat(xmlBufferPtr buf, const char *str) { const char *cur; + if (buf->alloc == XML_BUFFER_ALLOC_UNMUTABLE) + return; + if (str == NULL) { #ifdef DEBUG_BUFFER xmlGenericError(xmlGenericErrorContext, @@ -5563,6 +5590,9 @@ xmlBufferWriteChar(xmlBufferPtr buf, const char *string) { */ void xmlBufferWriteQuotedString(xmlBufferPtr buf, const xmlChar *string) { + if (buf->alloc == XML_BUFFER_ALLOC_UNMUTABLE) + return; + if (xmlStrchr(string, '"')) { if (xmlStrchr(string, '\'')) { #ifdef DEBUG_BUFFER diff --git a/xmlIO.c b/xmlIO.c index e3c16fb1..2811058c 100644 --- a/xmlIO.c +++ b/xmlIO.c @@ -33,6 +33,14 @@ #include #endif +#ifdef HAVE_SYS_MMAN_H +#include +/* seems needed for Solaris */ +#ifndef MAP_FAILED +#define MAP_FAILED ((void *) -1) +#endif +#endif + /* Figure a portable way to know if a file is a directory. */ #ifndef HAVE_STAT # ifdef HAVE__STAT @@ -1947,6 +1955,84 @@ xmlParserInputBufferCreateMem(const char *mem, int size, xmlCharEncoding enc) { return(ret); } +#ifdef HAVE_SYS_MMAN_H +typedef struct _xmlMMapContext xmlMMapContext; +typedef xmlMMapContext *xmlMMapContextPtr; +struct _xmlMMapContext { + int fd; + const char *mem; + size_t size; +}; + +/** + * xmlParserInputBufferCloseMMapFile: + * @ctxt: the mmaped context + * + * Free up the resources associated to the mmaped file + */ +static void +xmlParserInputBufferCloseMMapFile(xmlMMapContextPtr ctxt) { + if (ctxt == NULL) + return; + if (ctxt->mem != (void *) MAP_FAILED) + munmap((char *) ctxt->mem, ctxt->size); + if (ctxt->fd >= 0) + close(ctxt->fd); + xmlFree(ctxt); +} + +/** + * xmlParserInputBufferCreateMMapFile: + * @fd: the descriptor associated to the mmaped file. + * @base: the mmaped start + * @size: the length of the memory block + * @enc: the charset encoding if known + * + * Create a buffered parser input for the progressive parsing for the input + * from a memory area. + * + * Returns the new parser input or NULL + */ +static xmlParserInputBufferPtr +xmlParserInputBufferCreateMMapFile(int fd, const char *mem, size_t size, + xmlCharEncoding enc) { + xmlParserInputBufferPtr ret; + xmlMMapContextPtr ctxt; + + if (fd < 0) return(NULL); + if (size <= 0) return(NULL); + if (mem == NULL) return(NULL); + + ctxt = (xmlMMapContextPtr) xmlMalloc(sizeof(xmlMMapContext)); + if (ctxt == NULL) + return(NULL); + ctxt->fd = fd; + ctxt->mem = mem; + ctxt->size = size; + + + ret = xmlAllocParserInputBuffer(enc); + if (ret != NULL) { + ret->context = (void *) ctxt; + ret->readcallback = (xmlInputReadCallback) xmlNop; + ret->closecallback = (xmlInputCloseCallback) + xmlParserInputBufferCloseMMapFile; + if (ret->buffer->content != NULL) { + xmlFree(ret->buffer->content); + } + ret->buffer->alloc = XML_BUFFER_ALLOC_UNMUTABLE; + ret->buffer->content = (xmlChar *) mem; + ret->buffer->size = size; + ret->buffer->use = size; + } else { + xmlFree(ctxt); + return(NULL); + } + + return(ret); +} +#endif + /** * xmlOutputBufferCreateFd: * @fd: a file descriptor number @@ -2433,8 +2519,7 @@ xmlParserGetDirectory(const char *filename) { * * ****************************************************************/ -#ifdef LIBXML_CATALOG_ENABLED -static int xmlSysIDExists(const char *URL) { +static const char * xmlSysIDExists(const char *URL, size_t *size) { #ifdef HAVE_STAT int ret; struct stat info; @@ -2454,12 +2539,16 @@ static int xmlSysIDExists(const char *URL) { } else path = URL; ret = stat(path, &info); - if (ret == 0) - return(1); + if (ret == 0) { + if (size) + *size = info.st_size; + return(path); + } #endif - return(0); + if (size) + *size = -1; + return(NULL); } -#endif /** * xmlDefaultExternalEntityLoader: @@ -2480,7 +2569,57 @@ xmlDefaultExternalEntityLoader(const char *URL, const char *ID, #ifdef LIBXML_CATALOG_ENABLED xmlCatalogAllow pref; #endif + const char *exist; + size_t length; + exist = xmlSysIDExists(URL, &length); +#ifdef HAVE_SYS_MMAN_H + /* + * Shortcut, if asked for a file, the file is present, mmap it ! + */ + if ((exist != NULL) && (length > 0)) { + int fd = -1; + const char *base = NULL; + xmlParserInputBufferPtr buf = NULL; + + if ((fd = open(exist, O_RDONLY)) >= 0) { + /* + * Magic test: don't drop back native compressed content support + */ + char tmpbuf[2]; + if (read(fd, tmpbuf, 2) != 2) + goto failed; + if ((tmpbuf[0] == 0x1F) && (tmpbuf[1] == 0x8B)) + goto failed; + + base = mmap(NULL, length, PROT_READ, MAP_SHARED, fd, 0); + if (base != (void *) MAP_FAILED) { + buf = xmlParserInputBufferCreateMMapFile(fd, base, length, + XML_CHAR_ENCODING_NONE); + if (buf != NULL) { + ret = xmlNewInputStream(ctxt); + if (ret != NULL) { + ret->filename = (const char *) xmlCharStrdup(exist); + ret->directory = (const char *) + xmlParserGetDirectory(exist); + ret->buf = buf; + ret->base = ret->buf->buffer->content; + ret->cur = ret->buf->buffer->content; + ret->end = &ret->base[ret->buf->buffer->use]; + return(ret); + } + } + } + } +failed: + if (buf != NULL) + xmlFreeParserInputBuffer(buf); + if (base != (void *) MAP_FAILED) + munmap((char *) base, length); + if (fd >= 0) + close(fd); + } +#endif #ifdef DEBUG_EXTERNAL_ENTITIES xmlGenericError(xmlGenericErrorContext, "xmlDefaultExternalEntityLoader(%s, xxx)\n", URL); @@ -2492,7 +2631,7 @@ xmlDefaultExternalEntityLoader(const char *URL, const char *ID, */ pref = xmlCatalogGetDefaults(); - if ((pref != XML_CATA_ALLOW_NONE) && (!xmlSysIDExists(URL))) { + if ((pref != XML_CATA_ALLOW_NONE) && (exist == NULL)) { /* * Do a local lookup */ @@ -2518,7 +2657,8 @@ xmlDefaultExternalEntityLoader(const char *URL, const char *ID, /* * TODO: do an URI lookup on the reference */ - if ((resource != NULL) && (!xmlSysIDExists((const char *)resource))) { + exist = xmlSysIDExists(URL, &length); + if ((resource != NULL) && (exist == NULL)) { xmlChar *tmp = NULL; if ((ctxt->catalogs != NULL) &&