mirror of
https://gitlab.gnome.org/GNOME/libxml2.git
synced 2025-10-26 00:37:43 +03:00
Rework control flow in htmlCurrentChar
Don't call xmlCurrentChar after switching encodings. Rearrange code blocks and fall through to normal UTF-8 handling.
This commit is contained in:
81
HTMLparser.c
81
HTMLparser.c
@@ -414,6 +414,10 @@ htmlFindEncoding(xmlParserCtxtPtr ctxt) {
|
||||
|
||||
static int
|
||||
htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
|
||||
const unsigned char *cur;
|
||||
unsigned char c;
|
||||
unsigned int val;
|
||||
|
||||
if (ctxt->instate == XML_PARSER_EOF)
|
||||
return(0);
|
||||
|
||||
@@ -421,7 +425,41 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
|
||||
*len = 0;
|
||||
return(ctxt->token);
|
||||
}
|
||||
if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
|
||||
if (ctxt->charset != XML_CHAR_ENCODING_UTF8) {
|
||||
xmlChar * guess;
|
||||
xmlCharEncodingHandlerPtr handler;
|
||||
|
||||
/*
|
||||
* Assume it's a fixed length encoding (1) with
|
||||
* a compatible encoding for the ASCII set, since
|
||||
* HTML constructs only use < 128 chars
|
||||
*/
|
||||
if ((int) *ctxt->input->cur < 0x80) {
|
||||
*len = 1;
|
||||
return((int) *ctxt->input->cur);
|
||||
}
|
||||
|
||||
/*
|
||||
* Humm this is bad, do an automatic flow conversion
|
||||
*/
|
||||
guess = htmlFindEncoding(ctxt);
|
||||
if (guess == NULL) {
|
||||
xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
|
||||
} else {
|
||||
if (ctxt->input->encoding != NULL)
|
||||
xmlFree((xmlChar *) ctxt->input->encoding);
|
||||
ctxt->input->encoding = guess;
|
||||
handler = xmlFindCharEncodingHandler((const char *) guess);
|
||||
if (handler != NULL) {
|
||||
xmlSwitchToEncoding(ctxt, handler);
|
||||
} else {
|
||||
htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
|
||||
"Unsupported encoding %s", guess, NULL);
|
||||
}
|
||||
}
|
||||
ctxt->charset = XML_CHAR_ENCODING_UTF8;
|
||||
}
|
||||
|
||||
/*
|
||||
* We are supposed to handle UTF8, check it's valid
|
||||
* From rfc2044: encoding of the Unicode values on UTF-8:
|
||||
@@ -433,10 +471,7 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
|
||||
*
|
||||
* Check for the 0x110000 limit too
|
||||
*/
|
||||
const unsigned char *cur = ctxt->input->cur;
|
||||
unsigned char c;
|
||||
unsigned int val;
|
||||
|
||||
cur = ctxt->input->cur;
|
||||
c = *cur;
|
||||
if (c & 0x80) {
|
||||
if ((c & 0x40) == 0)
|
||||
@@ -505,42 +540,6 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
|
||||
*len = 1;
|
||||
return((int) *ctxt->input->cur);
|
||||
}
|
||||
}
|
||||
/*
|
||||
* Assume it's a fixed length encoding (1) with
|
||||
* a compatible encoding for the ASCII set, since
|
||||
* XML constructs only use < 128 chars
|
||||
*/
|
||||
*len = 1;
|
||||
if ((int) *ctxt->input->cur < 0x80)
|
||||
return((int) *ctxt->input->cur);
|
||||
|
||||
/*
|
||||
* Humm this is bad, do an automatic flow conversion
|
||||
*/
|
||||
{
|
||||
xmlChar * guess;
|
||||
xmlCharEncodingHandlerPtr handler;
|
||||
|
||||
guess = htmlFindEncoding(ctxt);
|
||||
if (guess == NULL) {
|
||||
xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
|
||||
} else {
|
||||
if (ctxt->input->encoding != NULL)
|
||||
xmlFree((xmlChar *) ctxt->input->encoding);
|
||||
ctxt->input->encoding = guess;
|
||||
handler = xmlFindCharEncodingHandler((const char *) guess);
|
||||
if (handler != NULL) {
|
||||
xmlSwitchToEncoding(ctxt, handler);
|
||||
} else {
|
||||
htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
|
||||
"Unsupported encoding %s", guess, NULL);
|
||||
}
|
||||
}
|
||||
ctxt->charset = XML_CHAR_ENCODING_UTF8;
|
||||
}
|
||||
|
||||
return(xmlCurrentChar(ctxt, len));
|
||||
|
||||
encoding_error:
|
||||
/*
|
||||
|
||||
Reference in New Issue
Block a user