mirror of
https://gitlab.gnome.org/GNOME/libxml2.git
synced 2025-07-29 11:41:22 +03:00
patch from johan@evenhuis.nl for #107937 fixing some line counting
* HTMLparser.c parser.c parserInternals.c: patch from johan@evenhuis.nl for #107937 fixing some line counting problems, and some other cleanups. * result/HTML/: this results in some line number changes. Daniel
This commit is contained in:
@ -1095,120 +1095,131 @@ xmlParserInputShrink(xmlParserInputPtr in) {
|
||||
*/
|
||||
|
||||
void
|
||||
xmlNextChar(xmlParserCtxtPtr ctxt) {
|
||||
xmlNextChar(xmlParserCtxtPtr ctxt)
|
||||
{
|
||||
if (ctxt->instate == XML_PARSER_EOF)
|
||||
return;
|
||||
return;
|
||||
|
||||
/*
|
||||
* 2.11 End-of-Line Handling
|
||||
* the literal two-character sequence "#xD#xA" or a standalone
|
||||
* literal #xD, an XML processor must pass to the application
|
||||
* the single character #xA.
|
||||
*/
|
||||
if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
|
||||
if ((*ctxt->input->cur == 0) &&
|
||||
(xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0) &&
|
||||
(ctxt->instate != XML_PARSER_COMMENT)) {
|
||||
/*
|
||||
* If we are at the end of the current entity and
|
||||
* the context allows it, we pop consumed entities
|
||||
* automatically.
|
||||
* the auto closing should be blocked in other cases
|
||||
*/
|
||||
xmlPopInput(ctxt);
|
||||
} else {
|
||||
if (*(ctxt->input->cur) == '\n') {
|
||||
ctxt->input->line++; ctxt->input->col = 1;
|
||||
} else ctxt->input->col++;
|
||||
if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
|
||||
/*
|
||||
* We are supposed to handle UTF8, check it's valid
|
||||
* From rfc2044: encoding of the Unicode values on UTF-8:
|
||||
*
|
||||
* UCS-4 range (hex.) UTF-8 octet sequence (binary)
|
||||
* 0000 0000-0000 007F 0xxxxxxx
|
||||
* 0000 0080-0000 07FF 110xxxxx 10xxxxxx
|
||||
* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
|
||||
*
|
||||
* Check for the 0x110000 limit too
|
||||
*/
|
||||
const unsigned char *cur = ctxt->input->cur;
|
||||
unsigned char c;
|
||||
if ((*ctxt->input->cur == 0) &&
|
||||
(xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0) &&
|
||||
(ctxt->instate != XML_PARSER_COMMENT)) {
|
||||
/*
|
||||
* If we are at the end of the current entity and
|
||||
* the context allows it, we pop consumed entities
|
||||
* automatically.
|
||||
* the auto closing should be blocked in other cases
|
||||
*/
|
||||
xmlPopInput(ctxt);
|
||||
} else {
|
||||
const unsigned char *cur;
|
||||
unsigned char c;
|
||||
|
||||
c = *cur;
|
||||
if (c & 0x80) {
|
||||
if (cur[1] == 0)
|
||||
xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
|
||||
if ((cur[1] & 0xc0) != 0x80)
|
||||
goto encoding_error;
|
||||
if ((c & 0xe0) == 0xe0) {
|
||||
unsigned int val;
|
||||
/*
|
||||
* 2.11 End-of-Line Handling
|
||||
* the literal two-character sequence "#xD#xA" or a standalone
|
||||
* literal #xD, an XML processor must pass to the application
|
||||
* the single character #xA.
|
||||
*/
|
||||
if (*(ctxt->input->cur) == '\n') {
|
||||
ctxt->input->line++;
|
||||
ctxt->input->col = 1;
|
||||
} else
|
||||
ctxt->input->col++;
|
||||
|
||||
if (cur[2] == 0)
|
||||
xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
|
||||
if ((cur[2] & 0xc0) != 0x80)
|
||||
goto encoding_error;
|
||||
if ((c & 0xf0) == 0xf0) {
|
||||
if (cur[3] == 0)
|
||||
xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
|
||||
if (((c & 0xf8) != 0xf0) ||
|
||||
((cur[3] & 0xc0) != 0x80))
|
||||
goto encoding_error;
|
||||
/* 4-byte code */
|
||||
ctxt->input->cur += 4;
|
||||
val = (cur[0] & 0x7) << 18;
|
||||
val |= (cur[1] & 0x3f) << 12;
|
||||
val |= (cur[2] & 0x3f) << 6;
|
||||
val |= cur[3] & 0x3f;
|
||||
} else {
|
||||
/* 3-byte code */
|
||||
ctxt->input->cur += 3;
|
||||
val = (cur[0] & 0xf) << 12;
|
||||
val |= (cur[1] & 0x3f) << 6;
|
||||
val |= cur[2] & 0x3f;
|
||||
}
|
||||
if (((val > 0xd7ff) && (val < 0xe000)) ||
|
||||
((val > 0xfffd) && (val < 0x10000)) ||
|
||||
(val >= 0x110000)) {
|
||||
if ((ctxt->sax != NULL) &&
|
||||
(ctxt->sax->error != NULL))
|
||||
ctxt->sax->error(ctxt->userData,
|
||||
"Char 0x%X out of allowed range\n", val);
|
||||
ctxt->errNo = XML_ERR_INVALID_ENCODING;
|
||||
ctxt->wellFormed = 0;
|
||||
if (ctxt->recovery == 0) ctxt->disableSAX = 1;
|
||||
}
|
||||
} else
|
||||
/* 2-byte code */
|
||||
ctxt->input->cur += 2;
|
||||
} else
|
||||
/* 1-byte code */
|
||||
ctxt->input->cur++;
|
||||
} else {
|
||||
/*
|
||||
* Assume it's a fixed length encoding (1) with
|
||||
* a compatible encoding for the ASCII set, since
|
||||
* XML constructs only use < 128 chars
|
||||
*/
|
||||
ctxt->input->cur++;
|
||||
}
|
||||
ctxt->nbChars++;
|
||||
if (*ctxt->input->cur == 0)
|
||||
xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
|
||||
}
|
||||
/*
|
||||
* We are supposed to handle UTF8, check it's valid
|
||||
* From rfc2044: encoding of the Unicode values on UTF-8:
|
||||
*
|
||||
* UCS-4 range (hex.) UTF-8 octet sequence (binary)
|
||||
* 0000 0000-0000 007F 0xxxxxxx
|
||||
* 0000 0080-0000 07FF 110xxxxx 10xxxxxx
|
||||
* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
|
||||
*
|
||||
* Check for the 0x110000 limit too
|
||||
*/
|
||||
cur = ctxt->input->cur;
|
||||
|
||||
c = *cur;
|
||||
if (c & 0x80) {
|
||||
if (cur[1] == 0)
|
||||
xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
|
||||
if ((cur[1] & 0xc0) != 0x80)
|
||||
goto encoding_error;
|
||||
if ((c & 0xe0) == 0xe0) {
|
||||
unsigned int val;
|
||||
|
||||
if (cur[2] == 0)
|
||||
xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
|
||||
if ((cur[2] & 0xc0) != 0x80)
|
||||
goto encoding_error;
|
||||
if ((c & 0xf0) == 0xf0) {
|
||||
if (cur[3] == 0)
|
||||
xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
|
||||
if (((c & 0xf8) != 0xf0) ||
|
||||
((cur[3] & 0xc0) != 0x80))
|
||||
goto encoding_error;
|
||||
/* 4-byte code */
|
||||
ctxt->input->cur += 4;
|
||||
val = (cur[0] & 0x7) << 18;
|
||||
val |= (cur[1] & 0x3f) << 12;
|
||||
val |= (cur[2] & 0x3f) << 6;
|
||||
val |= cur[3] & 0x3f;
|
||||
} else {
|
||||
/* 3-byte code */
|
||||
ctxt->input->cur += 3;
|
||||
val = (cur[0] & 0xf) << 12;
|
||||
val |= (cur[1] & 0x3f) << 6;
|
||||
val |= cur[2] & 0x3f;
|
||||
}
|
||||
if (((val > 0xd7ff) && (val < 0xe000)) ||
|
||||
((val > 0xfffd) && (val < 0x10000)) ||
|
||||
(val >= 0x110000)) {
|
||||
if ((ctxt->sax != NULL) &&
|
||||
(ctxt->sax->error != NULL))
|
||||
ctxt->sax->error(ctxt->userData,
|
||||
"Char 0x%X out of allowed range\n",
|
||||
val);
|
||||
ctxt->errNo = XML_ERR_INVALID_ENCODING;
|
||||
ctxt->wellFormed = 0;
|
||||
if (ctxt->recovery == 0)
|
||||
ctxt->disableSAX = 1;
|
||||
}
|
||||
} else
|
||||
/* 2-byte code */
|
||||
ctxt->input->cur += 2;
|
||||
} else
|
||||
/* 1-byte code */
|
||||
ctxt->input->cur++;
|
||||
|
||||
ctxt->nbChars++;
|
||||
if (*ctxt->input->cur == 0)
|
||||
xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
|
||||
}
|
||||
} else {
|
||||
ctxt->input->cur++;
|
||||
ctxt->nbChars++;
|
||||
if (*ctxt->input->cur == 0)
|
||||
xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
|
||||
/*
|
||||
* Assume it's a fixed length encoding (1) with
|
||||
* a compatible encoding for the ASCII set, since
|
||||
* XML constructs only use < 128 chars
|
||||
*/
|
||||
|
||||
if (*(ctxt->input->cur) == '\n') {
|
||||
ctxt->input->line++;
|
||||
ctxt->input->col = 1;
|
||||
} else
|
||||
ctxt->input->col++;
|
||||
ctxt->input->cur++;
|
||||
ctxt->nbChars++;
|
||||
if (*ctxt->input->cur == 0)
|
||||
xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
|
||||
}
|
||||
if ((*ctxt->input->cur == '%') && (!ctxt->html))
|
||||
xmlParserHandlePEReference(ctxt);
|
||||
xmlParserHandlePEReference(ctxt);
|
||||
if ((*ctxt->input->cur == 0) &&
|
||||
(xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0))
|
||||
xmlPopInput(ctxt);
|
||||
xmlPopInput(ctxt);
|
||||
return;
|
||||
encoding_error:
|
||||
encoding_error:
|
||||
/*
|
||||
* If we detect an UTF8 error that probably mean that the
|
||||
* input encoding didn't get properly advertised in the
|
||||
@ -1217,16 +1228,17 @@ encoding_error:
|
||||
* encoding !)
|
||||
*/
|
||||
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
|
||||
ctxt->sax->error(ctxt->userData,
|
||||
"Input is not proper UTF-8, indicate encoding !\n");
|
||||
ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
|
||||
ctxt->input->cur[0], ctxt->input->cur[1],
|
||||
ctxt->input->cur[2], ctxt->input->cur[3]);
|
||||
ctxt->sax->error(ctxt->userData,
|
||||
"Input is not proper UTF-8, indicate encoding !\n");
|
||||
ctxt->sax->error(ctxt->userData,
|
||||
"Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
|
||||
ctxt->input->cur[0], ctxt->input->cur[1],
|
||||
ctxt->input->cur[2], ctxt->input->cur[3]);
|
||||
}
|
||||
ctxt->wellFormed = 0;
|
||||
ctxt->errNo = XML_ERR_INVALID_ENCODING;
|
||||
|
||||
ctxt->charset = XML_CHAR_ENCODING_8859_1;
|
||||
ctxt->charset = XML_CHAR_ENCODING_8859_1;
|
||||
ctxt->input->cur++;
|
||||
return;
|
||||
}
|
||||
|
Reference in New Issue
Block a user