From 77a90a7f8e3fc4a03d379e79b00509635ca2f0d8 Mon Sep 17 00:00:00 2001
From: Daniel Veillard
Date: Sat, 22 Mar 2003 00:04:05 +0000
Subject: [PATCH] patch from johan@evenhuis.nl for #107937 fixing some line
counting
* HTMLparser.c parser.c parserInternals.c: patch from
johan@evenhuis.nl for #107937 fixing some line counting
problems, and some other cleanups.
* result/HTML/: this results in some line number changes
Daniel
---
ChangeLog | 7 ++
HTMLparser.c | 12 +-
parser.c | 15 ++-
parserInternals.c | 230 +++++++++++++++++++------------------
result/HTML/doc2.htm.err | 2 +-
result/HTML/doc3.htm.err | 46 ++++----
result/HTML/wired.html.err | 28 ++---
7 files changed, 186 insertions(+), 154 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index cfe9e9ca..1609071c 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+Sat Mar 22 01:00:24 CET 2003 Daniel Veillard
+
+ * HTMLparser.c parser.c parserInternals.c: patch from
+ johan@evenhuis.nl for #107937 fixing some line counting
+ problems, and some other cleanups.
+ * result/HTML/: this results in some line number changes
+
Fri Mar 21 22:19:14 CET 2003 Daniel Veillard
* configure.in Makefile.am: fixed Red Hat bug #86118 use libxml2.spec
diff --git a/HTMLparser.c b/HTMLparser.c
index 38a442c2..24186a24 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -134,7 +134,7 @@ htmlnamePop(htmlParserCtxtPtr ctxt)
* UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
* it should be used only to compare on ASCII based substring.
* SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
- * strings within the parser.
+ * strings without newlines within the parser.
*
* Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
*
@@ -142,12 +142,13 @@ htmlnamePop(htmlParserCtxtPtr ctxt)
* UTF-8 if we are using this mode. It returns an int.
* NEXT Skip to the next character, this does the proper decoding
* in UTF-8 mode. It also pop-up unfinished entities on the fly.
+ * NEXTL(l) Skip the current unicode character of l xmlChars long.
* COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
*/
#define UPPER (toupper(*ctxt->input->cur))
-#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val)
+#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val)
#define NXT(val) ctxt->input->cur[(val)]
@@ -167,7 +168,7 @@ htmlnamePop(htmlParserCtxtPtr ctxt)
/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
-#define NEXT xmlNextChar(ctxt),ctxt->nbChars++
+#define NEXT xmlNextChar(ctxt)
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
#define NXT(val) ctxt->input->cur[(val)]
@@ -2220,6 +2221,8 @@ htmlParseName(htmlParserCtxtPtr ctxt) {
count = in - ctxt->input->cur;
ret = xmlStrndup(ctxt->input->cur, count);
ctxt->input->cur = in;
+ ctxt->nbChars += count;
+ ctxt->input->col += count;
return(ret);
}
}
@@ -5203,6 +5206,8 @@ htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
}
memset(ctxt, 0, sizeof(htmlParserCtxt));
htmlInitParserCtxt(ctxt);
+ if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
+ ctxt->charset=XML_CHAR_ENCODING_UTF8;
if (sax != NULL) {
if (ctxt->sax != &htmlDefaultSAXHandler)
xmlFree(ctxt->sax);
@@ -5225,6 +5230,7 @@ htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
inputStream = htmlNewInputStream(ctxt);
if (inputStream == NULL) {
xmlFreeParserCtxt(ctxt);
+ xmlFree(buf);
return(NULL);
}
diff --git a/parser.c b/parser.c
index 83db22bf..f29d87d8 100644
--- a/parser.c
+++ b/parser.c
@@ -339,13 +339,14 @@ static int spacePop(xmlParserCtxtPtr ctxt) {
* NXT(n) returns the n'th next xmlChar. Same as CUR is should be used only
* to compare on ASCII based substring.
* SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
- * strings within the parser.
- *
+ * strings without newlines within the parser.
+ * NEXT1 Skip 1 xmlChar, and must also be used only to skip 1 non-newline ASCII
+ * defined char within the parser.
* Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
*
* NEXT Skip to the next character, this does the proper decoding
* in UTF-8 mode. It also pop-up unfinished entities on the fly.
- * NEXTL(l) Skip l xmlChar in the input buffer
+ * NEXTL(l) Skip the current unicode character of l xmlChars long.
* CUR_CHAR(l) returns the current unicode character (int), set l
* to the number of xmlChars used for the encoding [0-5].
* CUR_SCHAR same but operate on a string instead of the context
@@ -360,7 +361,7 @@ static int spacePop(xmlParserCtxtPtr ctxt) {
#define CUR_PTR ctxt->input->cur
#define SKIP(val) do { \
- ctxt->nbChars += (val),ctxt->input->cur += (val); \
+ ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val); \
if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
if ((*ctxt->input->cur == 0) && \
(xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) \
@@ -392,6 +393,7 @@ static void xmlGROW (xmlParserCtxtPtr ctxt) {
#define NEXT xmlNextChar(ctxt)
#define NEXT1 { \
+ ctxt->input->col++; \
ctxt->input->cur++; \
ctxt->nbChars++; \
if (*ctxt->input->cur == 0) \
@@ -578,6 +580,7 @@ xmlParseCharRef(xmlParserCtxtPtr ctxt) {
}
if (RAW == ';') {
/* on purpose to avoid reentrancy problems with NEXT and SKIP */
+ ctxt->input->col++;
ctxt->nbChars ++;
ctxt->input->cur++;
}
@@ -606,6 +609,7 @@ xmlParseCharRef(xmlParserCtxtPtr ctxt) {
}
if (RAW == ';') {
/* on purpose to avoid reentrancy problems with NEXT and SKIP */
+ ctxt->input->col++;
ctxt->nbChars ++;
ctxt->input->cur++;
}
@@ -1897,6 +1901,8 @@ xmlParseName(xmlParserCtxtPtr ctxt) {
count = in - ctxt->input->cur;
ret = xmlStrndup(ctxt->input->cur, count);
ctxt->input->cur = in;
+ ctxt->nbChars += count;
+ ctxt->input->col += count;
return(ret);
}
}
@@ -9149,6 +9155,7 @@ xmlCreatePushParserCtxt(xmlSAXHandlerPtr sax, void *user_data,
inputStream = xmlNewInputStream(ctxt);
if (inputStream == NULL) {
xmlFreeParserCtxt(ctxt);
+ xmlFree(buf);
return(NULL);
}
diff --git a/parserInternals.c b/parserInternals.c
index 65551441..3900a0e7 100644
--- a/parserInternals.c
+++ b/parserInternals.c
@@ -1095,120 +1095,131 @@ xmlParserInputShrink(xmlParserInputPtr in) {
*/
void
-xmlNextChar(xmlParserCtxtPtr ctxt) {
+xmlNextChar(xmlParserCtxtPtr ctxt)
+{
if (ctxt->instate == XML_PARSER_EOF)
- return;
+ return;
- /*
- * 2.11 End-of-Line Handling
- * the literal two-character sequence "#xD#xA" or a standalone
- * literal #xD, an XML processor must pass to the application
- * the single character #xA.
- */
if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
- if ((*ctxt->input->cur == 0) &&
- (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0) &&
- (ctxt->instate != XML_PARSER_COMMENT)) {
- /*
- * If we are at the end of the current entity and
- * the context allows it, we pop consumed entities
- * automatically.
- * the auto closing should be blocked in other cases
- */
- xmlPopInput(ctxt);
- } else {
- if (*(ctxt->input->cur) == '\n') {
- ctxt->input->line++; ctxt->input->col = 1;
- } else ctxt->input->col++;
- if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
- /*
- * We are supposed to handle UTF8, check it's valid
- * From rfc2044: encoding of the Unicode values on UTF-8:
- *
- * UCS-4 range (hex.) UTF-8 octet sequence (binary)
- * 0000 0000-0000 007F 0xxxxxxx
- * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
- * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
- *
- * Check for the 0x110000 limit too
- */
- const unsigned char *cur = ctxt->input->cur;
- unsigned char c;
+ if ((*ctxt->input->cur == 0) &&
+ (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0) &&
+ (ctxt->instate != XML_PARSER_COMMENT)) {
+ /*
+ * If we are at the end of the current entity and
+ * the context allows it, we pop consumed entities
+ * automatically.
+ * the auto closing should be blocked in other cases
+ */
+ xmlPopInput(ctxt);
+ } else {
+ const unsigned char *cur;
+ unsigned char c;
- c = *cur;
- if (c & 0x80) {
- if (cur[1] == 0)
- xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
- if ((cur[1] & 0xc0) != 0x80)
- goto encoding_error;
- if ((c & 0xe0) == 0xe0) {
- unsigned int val;
+ /*
+ * 2.11 End-of-Line Handling
+ * the literal two-character sequence "#xD#xA" or a standalone
+ * literal #xD, an XML processor must pass to the application
+ * the single character #xA.
+ */
+ if (*(ctxt->input->cur) == '\n') {
+ ctxt->input->line++;
+ ctxt->input->col = 1;
+ } else
+ ctxt->input->col++;
- if (cur[2] == 0)
- xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
- if ((cur[2] & 0xc0) != 0x80)
- goto encoding_error;
- if ((c & 0xf0) == 0xf0) {
- if (cur[3] == 0)
- xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
- if (((c & 0xf8) != 0xf0) ||
- ((cur[3] & 0xc0) != 0x80))
- goto encoding_error;
- /* 4-byte code */
- ctxt->input->cur += 4;
- val = (cur[0] & 0x7) << 18;
- val |= (cur[1] & 0x3f) << 12;
- val |= (cur[2] & 0x3f) << 6;
- val |= cur[3] & 0x3f;
- } else {
- /* 3-byte code */
- ctxt->input->cur += 3;
- val = (cur[0] & 0xf) << 12;
- val |= (cur[1] & 0x3f) << 6;
- val |= cur[2] & 0x3f;
- }
- if (((val > 0xd7ff) && (val < 0xe000)) ||
- ((val > 0xfffd) && (val < 0x10000)) ||
- (val >= 0x110000)) {
- if ((ctxt->sax != NULL) &&
- (ctxt->sax->error != NULL))
- ctxt->sax->error(ctxt->userData,
- "Char 0x%X out of allowed range\n", val);
- ctxt->errNo = XML_ERR_INVALID_ENCODING;
- ctxt->wellFormed = 0;
- if (ctxt->recovery == 0) ctxt->disableSAX = 1;
- }
- } else
- /* 2-byte code */
- ctxt->input->cur += 2;
- } else
- /* 1-byte code */
- ctxt->input->cur++;
- } else {
- /*
- * Assume it's a fixed length encoding (1) with
- * a compatible encoding for the ASCII set, since
- * XML constructs only use < 128 chars
- */
- ctxt->input->cur++;
- }
- ctxt->nbChars++;
- if (*ctxt->input->cur == 0)
- xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
- }
+ /*
+ * We are supposed to handle UTF8, check it's valid
+ * From rfc2044: encoding of the Unicode values on UTF-8:
+ *
+ * UCS-4 range (hex.) UTF-8 octet sequence (binary)
+ * 0000 0000-0000 007F 0xxxxxxx
+ * 0000 0080-0000 07FF 110xxxxx 10xxxxxx
+ * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
+ *
+ * Check for the 0x110000 limit too
+ */
+ cur = ctxt->input->cur;
+
+ c = *cur;
+ if (c & 0x80) {
+ if (cur[1] == 0)
+ xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
+ if ((cur[1] & 0xc0) != 0x80)
+ goto encoding_error;
+ if ((c & 0xe0) == 0xe0) {
+ unsigned int val;
+
+ if (cur[2] == 0)
+ xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
+ if ((cur[2] & 0xc0) != 0x80)
+ goto encoding_error;
+ if ((c & 0xf0) == 0xf0) {
+ if (cur[3] == 0)
+ xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
+ if (((c & 0xf8) != 0xf0) ||
+ ((cur[3] & 0xc0) != 0x80))
+ goto encoding_error;
+ /* 4-byte code */
+ ctxt->input->cur += 4;
+ val = (cur[0] & 0x7) << 18;
+ val |= (cur[1] & 0x3f) << 12;
+ val |= (cur[2] & 0x3f) << 6;
+ val |= cur[3] & 0x3f;
+ } else {
+ /* 3-byte code */
+ ctxt->input->cur += 3;
+ val = (cur[0] & 0xf) << 12;
+ val |= (cur[1] & 0x3f) << 6;
+ val |= cur[2] & 0x3f;
+ }
+ if (((val > 0xd7ff) && (val < 0xe000)) ||
+ ((val > 0xfffd) && (val < 0x10000)) ||
+ (val >= 0x110000)) {
+ if ((ctxt->sax != NULL) &&
+ (ctxt->sax->error != NULL))
+ ctxt->sax->error(ctxt->userData,
+ "Char 0x%X out of allowed range\n",
+ val);
+ ctxt->errNo = XML_ERR_INVALID_ENCODING;
+ ctxt->wellFormed = 0;
+ if (ctxt->recovery == 0)
+ ctxt->disableSAX = 1;
+ }
+ } else
+ /* 2-byte code */
+ ctxt->input->cur += 2;
+ } else
+ /* 1-byte code */
+ ctxt->input->cur++;
+
+ ctxt->nbChars++;
+ if (*ctxt->input->cur == 0)
+ xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
+ }
} else {
- ctxt->input->cur++;
- ctxt->nbChars++;
- if (*ctxt->input->cur == 0)
- xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
+ /*
+ * Assume it's a fixed length encoding (1) with
+ * a compatible encoding for the ASCII set, since
+ * XML constructs only use < 128 chars
+ */
+
+ if (*(ctxt->input->cur) == '\n') {
+ ctxt->input->line++;
+ ctxt->input->col = 1;
+ } else
+ ctxt->input->col++;
+ ctxt->input->cur++;
+ ctxt->nbChars++;
+ if (*ctxt->input->cur == 0)
+ xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
}
if ((*ctxt->input->cur == '%') && (!ctxt->html))
- xmlParserHandlePEReference(ctxt);
+ xmlParserHandlePEReference(ctxt);
if ((*ctxt->input->cur == 0) &&
(xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0))
- xmlPopInput(ctxt);
+ xmlPopInput(ctxt);
return;
-encoding_error:
+ encoding_error:
/*
* If we detect an UTF8 error that probably mean that the
* input encoding didn't get properly advertised in the
@@ -1217,16 +1228,17 @@ encoding_error:
* encoding !)
*/
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
- ctxt->sax->error(ctxt->userData,
- "Input is not proper UTF-8, indicate encoding !\n");
- ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
- ctxt->input->cur[0], ctxt->input->cur[1],
- ctxt->input->cur[2], ctxt->input->cur[3]);
+ ctxt->sax->error(ctxt->userData,
+ "Input is not proper UTF-8, indicate encoding !\n");
+ ctxt->sax->error(ctxt->userData,
+ "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
+ ctxt->input->cur[0], ctxt->input->cur[1],
+ ctxt->input->cur[2], ctxt->input->cur[3]);
}
ctxt->wellFormed = 0;
ctxt->errNo = XML_ERR_INVALID_ENCODING;
- ctxt->charset = XML_CHAR_ENCODING_8859_1;
+ ctxt->charset = XML_CHAR_ENCODING_8859_1;
ctxt->input->cur++;
return;
}
diff --git a/result/HTML/doc2.htm.err b/result/HTML/doc2.htm.err
index d098b471..bf46ffad 100644
--- a/result/HTML/doc2.htm.err
+++ b/result/HTML/doc2.htm.err
@@ -1,3 +1,3 @@
-./test/HTML/doc2.htm:5: error: Misplaced DOCTYPE declaration
+./test/HTML/doc2.htm:10: error: Misplaced DOCTYPE declaration

^
-./test/HTML/doc3.htm:815: error: Unexpected end tag : noscript
+./test/HTML/doc3.htm:820: error: Unexpected end tag : noscript
^
-./test/HTML/doc3.htm:821: error: Opening and ending tag mismatch: form and center
+./test/HTML/doc3.htm:826: error: Opening and ending tag mismatch: form and center
Code:BP6-hd
^
-./test/HTML/doc3.htm:828: error: Opening and ending tag mismatch: center and td
+./test/HTML/doc3.htm:833: error: Opening and ending tag mismatch: center and td
Special
Code:BP6-hd
^
-./test/HTML/doc3.htm:834: error: Unexpected end tag : p
+./test/HTML/doc3.htm:839: error: Unexpected end tag : p
width="100%">
^
-./test/HTML/doc3.htm:835: error: Unexpected end tag : td
+./test/HTML/doc3.htm:840: error: Unexpected end tag : td
|
|
^
diff --git a/result/HTML/wired.html.err b/result/HTML/wired.html.err
index 82415e0e..bb1c4315 100644
--- a/result/HTML/wired.html.err
+++ b/result/HTML/wired.html.err
@@ -205,45 +205,45 @@ Readers on Apple's G4 ... AOL's passwords ... MS vs. Linux.
VignetteSpr
^
-./test/HTML/wired.html:406: error: Opening and ending tag mismatch: a and font
+./test/HTML/wired.html:408: error: Opening and ending tag mismatch: a and font
com&BANNER=Sprint" style="text-decoration:none">Sprint'
+./test/HTML/wired.html:408: error: End tag : expected '>'
=Sprint" style="text-decoration:none">Sprint
^
-./test/HTML/wired.html:412: error: Opening and ending tag mismatch: td and font
+./test/HTML/wired.html:414: error: Opening and ending tag mismatch: td and font
^
-./test/HTML/wired.html:412: error: Opening and ending tag mismatch: td and font
+./test/HTML/wired.html:414: error: Opening and ending tag mismatch: td and font
^
-./test/HTML/wired.html:412: error: Opening and ending tag mismatch: td and font
+./test/HTML/wired.html:414: error: Opening and ending tag mismatch: td and font
^
-./test/HTML/wired.html:412: error: Opening and ending tag mismatch: td and font
+./test/HTML/wired.html:414: error: Opening and ending tag mismatch: td and font
^
-./test/HTML/wired.html:412: error: Opening and ending tag mismatch: td and font
+./test/HTML/wired.html:414: error: Opening and ending tag mismatch: td and font
^
-./test/HTML/wired.html:412: error: Opening and ending tag mismatch: td and font
+./test/HTML/wired.html:414: error: Opening and ending tag mismatch: td and font
^
-./test/HTML/wired.html:412: error: Opening and ending tag mismatch: td and font
+./test/HTML/wired.html:414: error: Opening and ending tag mismatch: td and font
^
-./test/HTML/wired.html:430: error: htmlParseEntityRef: expecting ';'
+./test/HTML/wired.html:432: error: htmlParseEntityRef: expecting ';'
href="http://www.lycos.com/news/flash/hitlerbunker.html?v=wn1015&lpv=1">Lycos
^