diff --git a/HTMLparser.c b/HTMLparser.c index d31e2ec9..ec88eed0 100644 --- a/HTMLparser.c +++ b/HTMLparser.c @@ -414,6 +414,10 @@ htmlFindEncoding(xmlParserCtxtPtr ctxt) { static int htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) { + const unsigned char *cur; + unsigned char c; + unsigned int val; + if (ctxt->instate == XML_PARSER_EOF) return(0); @@ -421,107 +425,23 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) { *len = 0; return(ctxt->token); } - if (ctxt->charset == XML_CHAR_ENCODING_UTF8) { - /* - * We are supposed to handle UTF8, check it's valid - * From rfc2044: encoding of the Unicode values on UTF-8: - * - * UCS-4 range (hex.) UTF-8 octet sequence (binary) - * 0000 0000-0000 007F 0xxxxxxx - * 0000 0080-0000 07FF 110xxxxx 10xxxxxx - * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx - * - * Check for the 0x110000 limit too - */ - const unsigned char *cur = ctxt->input->cur; - unsigned char c; - unsigned int val; - - c = *cur; - if (c & 0x80) { - if ((c & 0x40) == 0) - goto encoding_error; - if (cur[1] == 0) { - xmlParserInputGrow(ctxt->input, INPUT_CHUNK); - cur = ctxt->input->cur; - } - if ((cur[1] & 0xc0) != 0x80) - goto encoding_error; - if ((c & 0xe0) == 0xe0) { - - if (cur[2] == 0) { - xmlParserInputGrow(ctxt->input, INPUT_CHUNK); - cur = ctxt->input->cur; - } - if ((cur[2] & 0xc0) != 0x80) - goto encoding_error; - if ((c & 0xf0) == 0xf0) { - if (cur[3] == 0) { - xmlParserInputGrow(ctxt->input, INPUT_CHUNK); - cur = ctxt->input->cur; - } - if (((c & 0xf8) != 0xf0) || - ((cur[3] & 0xc0) != 0x80)) - goto encoding_error; - /* 4-byte code */ - *len = 4; - val = (cur[0] & 0x7) << 18; - val |= (cur[1] & 0x3f) << 12; - val |= (cur[2] & 0x3f) << 6; - val |= cur[3] & 0x3f; - if (val < 0x10000) - goto encoding_error; - } else { - /* 3-byte code */ - *len = 3; - val = (cur[0] & 0xf) << 12; - val |= (cur[1] & 0x3f) << 6; - val |= cur[2] & 0x3f; - if (val < 0x800) - goto encoding_error; - } - } else { - /* 2-byte code */ - *len = 2; - val = (cur[0] & 0x1f) << 6; - val |= cur[1] & 0x3f; - if (val < 0x80) - goto encoding_error; - } - if (!IS_CHAR(val)) { - htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, - "Char 0x%X out of allowed range\n", val); - } - return(val); - } else { - if ((*ctxt->input->cur == 0) && - (ctxt->input->cur < ctxt->input->end)) { - htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, - "Char 0x%X out of allowed range\n", 0); - *len = 1; - return(' '); - } - /* 1-byte code */ - *len = 1; - return((int) *ctxt->input->cur); - } - } - /* - * Assume it's a fixed length encoding (1) with - * a compatible encoding for the ASCII set, since - * XML constructs only use < 128 chars - */ - *len = 1; - if ((int) *ctxt->input->cur < 0x80) - return((int) *ctxt->input->cur); - - /* - * Humm this is bad, do an automatic flow conversion - */ - { + if (ctxt->charset != XML_CHAR_ENCODING_UTF8) { xmlChar * guess; xmlCharEncodingHandlerPtr handler; + /* + * Assume it's a fixed length encoding (1) with + * a compatible encoding for the ASCII set, since + * HTML constructs only use < 128 chars + */ + if ((int) *ctxt->input->cur < 0x80) { + *len = 1; + return((int) *ctxt->input->cur); + } + + /* + * Humm this is bad, do an automatic flow conversion + */ guess = htmlFindEncoding(ctxt); if (guess == NULL) { xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1); @@ -540,7 +460,86 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) { ctxt->charset = XML_CHAR_ENCODING_UTF8; } - return(xmlCurrentChar(ctxt, len)); + /* + * We are supposed to handle UTF8, check it's valid + * From rfc2044: encoding of the Unicode values on UTF-8: + * + * UCS-4 range (hex.) UTF-8 octet sequence (binary) + * 0000 0000-0000 007F 0xxxxxxx + * 0000 0080-0000 07FF 110xxxxx 10xxxxxx + * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx + * + * Check for the 0x110000 limit too + */ + cur = ctxt->input->cur; + c = *cur; + if (c & 0x80) { + if ((c & 0x40) == 0) + goto encoding_error; + if (cur[1] == 0) { + xmlParserInputGrow(ctxt->input, INPUT_CHUNK); + cur = ctxt->input->cur; + } + if ((cur[1] & 0xc0) != 0x80) + goto encoding_error; + if ((c & 0xe0) == 0xe0) { + + if (cur[2] == 0) { + xmlParserInputGrow(ctxt->input, INPUT_CHUNK); + cur = ctxt->input->cur; + } + if ((cur[2] & 0xc0) != 0x80) + goto encoding_error; + if ((c & 0xf0) == 0xf0) { + if (cur[3] == 0) { + xmlParserInputGrow(ctxt->input, INPUT_CHUNK); + cur = ctxt->input->cur; + } + if (((c & 0xf8) != 0xf0) || + ((cur[3] & 0xc0) != 0x80)) + goto encoding_error; + /* 4-byte code */ + *len = 4; + val = (cur[0] & 0x7) << 18; + val |= (cur[1] & 0x3f) << 12; + val |= (cur[2] & 0x3f) << 6; + val |= cur[3] & 0x3f; + if (val < 0x10000) + goto encoding_error; + } else { + /* 3-byte code */ + *len = 3; + val = (cur[0] & 0xf) << 12; + val |= (cur[1] & 0x3f) << 6; + val |= cur[2] & 0x3f; + if (val < 0x800) + goto encoding_error; + } + } else { + /* 2-byte code */ + *len = 2; + val = (cur[0] & 0x1f) << 6; + val |= cur[1] & 0x3f; + if (val < 0x80) + goto encoding_error; + } + if (!IS_CHAR(val)) { + htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, + "Char 0x%X out of allowed range\n", val); + } + return(val); + } else { + if ((*ctxt->input->cur == 0) && + (ctxt->input->cur < ctxt->input->end)) { + htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, + "Char 0x%X out of allowed range\n", 0); + *len = 1; + return(' '); + } + /* 1-byte code */ + *len = 1; + return((int) *ctxt->input->cur); + } encoding_error: /*