From 77a90a7f8e3fc4a03d379e79b00509635ca2f0d8 Mon Sep 17 00:00:00 2001 From: Daniel Veillard Date: Sat, 22 Mar 2003 00:04:05 +0000 Subject: [PATCH] patch from johan@evenhuis.nl for #107937 fixing some line counting * HTMLparser.c parser.c parserInternals.c: patch from johan@evenhuis.nl for #107937 fixing some line counting problems, and some other cleanups. * result/HTML/: this result in some line number changes Daniel --- ChangeLog | 7 ++ HTMLparser.c | 12 +- parser.c | 15 ++- parserInternals.c | 230 +++++++++++++++++++------------------ result/HTML/doc2.htm.err | 2 +- result/HTML/doc3.htm.err | 46 ++++---- result/HTML/wired.html.err | 28 ++--- 7 files changed, 186 insertions(+), 154 deletions(-) diff --git a/ChangeLog b/ChangeLog index cfe9e9ca..1609071c 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,10 @@ +Sat Mar 23 01:00:24 CET 2003 Daniel Veillard + + * HTMLparser.c parser.c parserInternals.c: patch from + johan@evenhuis.nl for #107937 fixing some line counting + problems, and some other cleanups. + * result/HTML/: this result in some line number changes + Fri Mar 21 22:19:14 CET 2003 Daniel Veillard * configure.in Makefile.am: fixed Red Hat bug #86118 use libxml2.spec diff --git a/HTMLparser.c b/HTMLparser.c index 38a442c2..24186a24 100644 --- a/HTMLparser.c +++ b/HTMLparser.c @@ -134,7 +134,7 @@ htmlnamePop(htmlParserCtxtPtr ctxt) * UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR * it should be used only to compare on ASCII based substring. * SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined - * strings within the parser. + * strings without newlines within the parser. * * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding * @@ -142,12 +142,13 @@ htmlnamePop(htmlParserCtxtPtr ctxt) * UTF-8 if we are using this mode. It returns an int. * NEXT Skip to the next character, this does the proper decoding * in UTF-8 mode. It also pop-up unfinished entities on the fly. + * NEXTL(l) Skip the current unicode character of l xmlChars long. * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly */ #define UPPER (toupper(*ctxt->input->cur)) -#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val) +#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val) #define NXT(val) ctxt->input->cur[(val)] @@ -167,7 +168,7 @@ htmlnamePop(htmlParserCtxtPtr ctxt) /* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */ #define CUR ((int) (*ctxt->input->cur)) -#define NEXT xmlNextChar(ctxt),ctxt->nbChars++ +#define NEXT xmlNextChar(ctxt) #define RAW (ctxt->token ? -1 : (*ctxt->input->cur)) #define NXT(val) ctxt->input->cur[(val)] @@ -2220,6 +2221,8 @@ htmlParseName(htmlParserCtxtPtr ctxt) { count = in - ctxt->input->cur; ret = xmlStrndup(ctxt->input->cur, count); ctxt->input->cur = in; + ctxt->nbChars += count; + ctxt->input->col += count; return(ret); } } @@ -5203,6 +5206,8 @@ htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data, } memset(ctxt, 0, sizeof(htmlParserCtxt)); htmlInitParserCtxt(ctxt); + if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder) + ctxt->charset=XML_CHAR_ENCODING_UTF8; if (sax != NULL) { if (ctxt->sax != &htmlDefaultSAXHandler) xmlFree(ctxt->sax); @@ -5225,6 +5230,7 @@ htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data, inputStream = htmlNewInputStream(ctxt); if (inputStream == NULL) { xmlFreeParserCtxt(ctxt); + xmlFree(buf); return(NULL); } diff --git a/parser.c b/parser.c index 83db22bf..f29d87d8 100644 --- a/parser.c +++ b/parser.c @@ -339,13 +339,14 @@ static int spacePop(xmlParserCtxtPtr ctxt) { * NXT(n) returns the n'th next xmlChar. Same as CUR is should be used only * to compare on ASCII based substring. * SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined - * strings within the parser. - * + * strings without newlines within the parser. + * NEXT1(l) Skip 1 xmlChar, and must also be used only to skip 1 non-newline ASCII + * defined char within the parser. * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding * * NEXT Skip to the next character, this does the proper decoding * in UTF-8 mode. It also pop-up unfinished entities on the fly. - * NEXTL(l) Skip l xmlChar in the input buffer + * NEXTL(l) Skip the current unicode character of l xmlChars long. * CUR_CHAR(l) returns the current unicode character (int), set l * to the number of xmlChars used for the encoding [0-5]. * CUR_SCHAR same but operate on a string instead of the context @@ -360,7 +361,7 @@ static int spacePop(xmlParserCtxtPtr ctxt) { #define CUR_PTR ctxt->input->cur #define SKIP(val) do { \ - ctxt->nbChars += (val),ctxt->input->cur += (val); \ + ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val); \ if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \ if ((*ctxt->input->cur == 0) && \ (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) \ @@ -392,6 +393,7 @@ static void xmlGROW (xmlParserCtxtPtr ctxt) { #define NEXT xmlNextChar(ctxt) #define NEXT1 { \ + ctxt->input->col++; \ ctxt->input->cur++; \ ctxt->nbChars++; \ if (*ctxt->input->cur == 0) \ @@ -578,6 +580,7 @@ xmlParseCharRef(xmlParserCtxtPtr ctxt) { } if (RAW == ';') { /* on purpose to avoid reentrancy problems with NEXT and SKIP */ + ctxt->input->col++; ctxt->nbChars ++; ctxt->input->cur++; } @@ -606,6 +609,7 @@ xmlParseCharRef(xmlParserCtxtPtr ctxt) { } if (RAW == ';') { /* on purpose to avoid reentrancy problems with NEXT and SKIP */ + ctxt->input->col++; ctxt->nbChars ++; ctxt->input->cur++; } @@ -1897,6 +1901,8 @@ xmlParseName(xmlParserCtxtPtr ctxt) { count = in - ctxt->input->cur; ret = xmlStrndup(ctxt->input->cur, count); ctxt->input->cur = in; + ctxt->nbChars += count; + ctxt->input->col += count; return(ret); } } @@ -9149,6 +9155,7 @@ xmlCreatePushParserCtxt(xmlSAXHandlerPtr sax, void *user_data, inputStream = xmlNewInputStream(ctxt); if (inputStream == NULL) { xmlFreeParserCtxt(ctxt); + xmlFree(buf); return(NULL); } diff --git a/parserInternals.c b/parserInternals.c index 65551441..3900a0e7 100644 --- a/parserInternals.c +++ b/parserInternals.c @@ -1095,120 +1095,131 @@ xmlParserInputShrink(xmlParserInputPtr in) { */ void -xmlNextChar(xmlParserCtxtPtr ctxt) { +xmlNextChar(xmlParserCtxtPtr ctxt) +{ if (ctxt->instate == XML_PARSER_EOF) - return; + return; - /* - * 2.11 End-of-Line Handling - * the literal two-character sequence "#xD#xA" or a standalone - * literal #xD, an XML processor must pass to the application - * the single character #xA. - */ if (ctxt->charset == XML_CHAR_ENCODING_UTF8) { - if ((*ctxt->input->cur == 0) && - (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0) && - (ctxt->instate != XML_PARSER_COMMENT)) { - /* - * If we are at the end of the current entity and - * the context allows it, we pop consumed entities - * automatically. - * the auto closing should be blocked in other cases - */ - xmlPopInput(ctxt); - } else { - if (*(ctxt->input->cur) == '\n') { - ctxt->input->line++; ctxt->input->col = 1; - } else ctxt->input->col++; - if (ctxt->charset == XML_CHAR_ENCODING_UTF8) { - /* - * We are supposed to handle UTF8, check it's valid - * From rfc2044: encoding of the Unicode values on UTF-8: - * - * UCS-4 range (hex.) UTF-8 octet sequence (binary) - * 0000 0000-0000 007F 0xxxxxxx - * 0000 0080-0000 07FF 110xxxxx 10xxxxxx - * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx - * - * Check for the 0x110000 limit too - */ - const unsigned char *cur = ctxt->input->cur; - unsigned char c; + if ((*ctxt->input->cur == 0) && + (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0) && + (ctxt->instate != XML_PARSER_COMMENT)) { + /* + * If we are at the end of the current entity and + * the context allows it, we pop consumed entities + * automatically. + * the auto closing should be blocked in other cases + */ + xmlPopInput(ctxt); + } else { + const unsigned char *cur; + unsigned char c; - c = *cur; - if (c & 0x80) { - if (cur[1] == 0) - xmlParserInputGrow(ctxt->input, INPUT_CHUNK); - if ((cur[1] & 0xc0) != 0x80) - goto encoding_error; - if ((c & 0xe0) == 0xe0) { - unsigned int val; + /* + * 2.11 End-of-Line Handling + * the literal two-character sequence "#xD#xA" or a standalone + * literal #xD, an XML processor must pass to the application + * the single character #xA. + */ + if (*(ctxt->input->cur) == '\n') { + ctxt->input->line++; + ctxt->input->col = 1; + } else + ctxt->input->col++; - if (cur[2] == 0) - xmlParserInputGrow(ctxt->input, INPUT_CHUNK); - if ((cur[2] & 0xc0) != 0x80) - goto encoding_error; - if ((c & 0xf0) == 0xf0) { - if (cur[3] == 0) - xmlParserInputGrow(ctxt->input, INPUT_CHUNK); - if (((c & 0xf8) != 0xf0) || - ((cur[3] & 0xc0) != 0x80)) - goto encoding_error; - /* 4-byte code */ - ctxt->input->cur += 4; - val = (cur[0] & 0x7) << 18; - val |= (cur[1] & 0x3f) << 12; - val |= (cur[2] & 0x3f) << 6; - val |= cur[3] & 0x3f; - } else { - /* 3-byte code */ - ctxt->input->cur += 3; - val = (cur[0] & 0xf) << 12; - val |= (cur[1] & 0x3f) << 6; - val |= cur[2] & 0x3f; - } - if (((val > 0xd7ff) && (val < 0xe000)) || - ((val > 0xfffd) && (val < 0x10000)) || - (val >= 0x110000)) { - if ((ctxt->sax != NULL) && - (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "Char 0x%X out of allowed range\n", val); - ctxt->errNo = XML_ERR_INVALID_ENCODING; - ctxt->wellFormed = 0; - if (ctxt->recovery == 0) ctxt->disableSAX = 1; - } - } else - /* 2-byte code */ - ctxt->input->cur += 2; - } else - /* 1-byte code */ - ctxt->input->cur++; - } else { - /* - * Assume it's a fixed length encoding (1) with - * a compatible encoding for the ASCII set, since - * XML constructs only use < 128 chars - */ - ctxt->input->cur++; - } - ctxt->nbChars++; - if (*ctxt->input->cur == 0) - xmlParserInputGrow(ctxt->input, INPUT_CHUNK); - } + /* + * We are supposed to handle UTF8, check it's valid + * From rfc2044: encoding of the Unicode values on UTF-8: + * + * UCS-4 range (hex.) UTF-8 octet sequence (binary) + * 0000 0000-0000 007F 0xxxxxxx + * 0000 0080-0000 07FF 110xxxxx 10xxxxxx + * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx + * + * Check for the 0x110000 limit too + */ + cur = ctxt->input->cur; + + c = *cur; + if (c & 0x80) { + if (cur[1] == 0) + xmlParserInputGrow(ctxt->input, INPUT_CHUNK); + if ((cur[1] & 0xc0) != 0x80) + goto encoding_error; + if ((c & 0xe0) == 0xe0) { + unsigned int val; + + if (cur[2] == 0) + xmlParserInputGrow(ctxt->input, INPUT_CHUNK); + if ((cur[2] & 0xc0) != 0x80) + goto encoding_error; + if ((c & 0xf0) == 0xf0) { + if (cur[3] == 0) + xmlParserInputGrow(ctxt->input, INPUT_CHUNK); + if (((c & 0xf8) != 0xf0) || + ((cur[3] & 0xc0) != 0x80)) + goto encoding_error; + /* 4-byte code */ + ctxt->input->cur += 4; + val = (cur[0] & 0x7) << 18; + val |= (cur[1] & 0x3f) << 12; + val |= (cur[2] & 0x3f) << 6; + val |= cur[3] & 0x3f; + } else { + /* 3-byte code */ + ctxt->input->cur += 3; + val = (cur[0] & 0xf) << 12; + val |= (cur[1] & 0x3f) << 6; + val |= cur[2] & 0x3f; + } + if (((val > 0xd7ff) && (val < 0xe000)) || + ((val > 0xfffd) && (val < 0x10000)) || + (val >= 0x110000)) { + if ((ctxt->sax != NULL) && + (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "Char 0x%X out of allowed range\n", + val); + ctxt->errNo = XML_ERR_INVALID_ENCODING; + ctxt->wellFormed = 0; + if (ctxt->recovery == 0) + ctxt->disableSAX = 1; + } + } else + /* 2-byte code */ + ctxt->input->cur += 2; + } else + /* 1-byte code */ + ctxt->input->cur++; + + ctxt->nbChars++; + if (*ctxt->input->cur == 0) + xmlParserInputGrow(ctxt->input, INPUT_CHUNK); + } } else { - ctxt->input->cur++; - ctxt->nbChars++; - if (*ctxt->input->cur == 0) - xmlParserInputGrow(ctxt->input, INPUT_CHUNK); + /* + * Assume it's a fixed length encoding (1) with + * a compatible encoding for the ASCII set, since + * XML constructs only use < 128 chars + */ + + if (*(ctxt->input->cur) == '\n') { + ctxt->input->line++; + ctxt->input->col = 1; + } else + ctxt->input->col++; + ctxt->input->cur++; + ctxt->nbChars++; + if (*ctxt->input->cur == 0) + xmlParserInputGrow(ctxt->input, INPUT_CHUNK); } if ((*ctxt->input->cur == '%') && (!ctxt->html)) - xmlParserHandlePEReference(ctxt); + xmlParserHandlePEReference(ctxt); if ((*ctxt->input->cur == 0) && (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) - xmlPopInput(ctxt); + xmlPopInput(ctxt); return; -encoding_error: + encoding_error: /* * If we detect an UTF8 error that probably mean that the * input encoding didn't get properly advertised in the @@ -1217,16 +1228,17 @@ encoding_error: * encoding !) */ if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) { - ctxt->sax->error(ctxt->userData, - "Input is not proper UTF-8, indicate encoding !\n"); - ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n", - ctxt->input->cur[0], ctxt->input->cur[1], - ctxt->input->cur[2], ctxt->input->cur[3]); + ctxt->sax->error(ctxt->userData, + "Input is not proper UTF-8, indicate encoding !\n"); + ctxt->sax->error(ctxt->userData, + "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n", + ctxt->input->cur[0], ctxt->input->cur[1], + ctxt->input->cur[2], ctxt->input->cur[3]); } ctxt->wellFormed = 0; ctxt->errNo = XML_ERR_INVALID_ENCODING; - ctxt->charset = XML_CHAR_ENCODING_8859_1; + ctxt->charset = XML_CHAR_ENCODING_8859_1; ctxt->input->cur++; return; } diff --git a/result/HTML/doc2.htm.err b/result/HTML/doc2.htm.err index d098b471..bf46ffad 100644 --- a/result/HTML/doc2.htm.err +++ b/result/HTML/doc2.htm.err @@ -1,3 +1,3 @@ -./test/HTML/doc2.htm:5: error: Misplaced DOCTYPE declaration +./test/HTML/doc2.htm:10: error: Misplaced DOCTYPE declaration





Code:BP6-hd

Code:BP6-hd

 

^ -./test/HTML/doc3.htm:835: error: Unexpected end tag : td +./test/HTML/doc3.htm:840: error: Unexpected end tag : td
  ^ diff --git a/result/HTML/wired.html.err b/result/HTML/wired.html.err index 82415e0e..bb1c4315 100644 --- a/result/HTML/wired.html.err +++ b/result/HTML/wired.html.err @@ -205,45 +205,45 @@ Readers on Apple's G4 ... AOL's passwords ... MS vs. Linux.

VignetteSpr ^ -./test/HTML/wired.html:406: error: Opening and ending tag mismatch: a and font +./test/HTML/wired.html:408: error: Opening and ending tag mismatch: a and font com&BANNER=Sprint" style="text-decoration:none">Sprint' +./test/HTML/wired.html:408: error: End tag : expected '>' =Sprint" style="text-decoration:none">Sprint ^ -./test/HTML/wired.html:412: error: Opening and ending tag mismatch: td and font +./test/HTML/wired.html:414: error: Opening and ending tag mismatch: td and font ^ -./test/HTML/wired.html:412: error: Opening and ending tag mismatch: td and font +./test/HTML/wired.html:414: error: Opening and ending tag mismatch: td and font ^ -./test/HTML/wired.html:412: error: Opening and ending tag mismatch: td and font +./test/HTML/wired.html:414: error: Opening and ending tag mismatch: td and font ^ -./test/HTML/wired.html:412: error: Opening and ending tag mismatch: td and font +./test/HTML/wired.html:414: error: Opening and ending tag mismatch: td and font ^ -./test/HTML/wired.html:412: error: Opening and ending tag mismatch: td and font +./test/HTML/wired.html:414: error: Opening and ending tag mismatch: td and font ^ -./test/HTML/wired.html:412: error: Opening and ending tag mismatch: td and font +./test/HTML/wired.html:414: error: Opening and ending tag mismatch: td and font ^ -./test/HTML/wired.html:412: error: Opening and ending tag mismatch: td and font +./test/HTML/wired.html:414: error: Opening and ending tag mismatch: td and font ^ -./test/HTML/wired.html:430: error: htmlParseEntityRef: expecting ';' +./test/HTML/wired.html:432: error: htmlParseEntityRef: expecting ';' href="http://www.lycos.com/news/flash/hitlerbunker.html?v=wn1015&lpv=1">Lycos