1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-10-24 13:33:01 +03:00

parser: Rework EBCDIC code page detection

To detect EBCDIC code pages, we used to switch the encoding twice and
had to be very careful not to decode data after the XML declaration
before the second switch. This relied on a hard-coded expected size of
the XML declaration and was complicated and unreliable.

Now we convert the first 200 bytes to EBCDIC-US and parse the encoding
declaration manually.
This commit is contained in:
Nick Wellnhofer
2023-03-21 19:07:12 +01:00
parent 3eb9f5ca4e
commit 98840d40da
5 changed files with 93 additions and 314 deletions

View File

@@ -2037,7 +2037,7 @@ xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned char *out, int *outlen,
* as the return value is 0, else unpredictable.
* The value of @outlen after return is the number of octets produced.
*/
static int
int
xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
int *outlen, const unsigned char *in, int *inlen, int flush) {
int ret;
@@ -2123,189 +2123,12 @@ xmlEncOutputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
* @out: an xmlBuffer for the output.
* @in: an xmlBuffer for the input
*
* Front-end for the encoding handler input function, but handle only
* the very first line, i.e. limit itself to 45 chars.
*
* Returns the number of byte written if success, or
* -1 general error
* -2 if the transcoding fails (for *in is not valid utf8 string or
* the result of transformation can't fit into the encoding we want), or
DEPRECATED: Don't use.
*/
int
xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
xmlBufferPtr in) {
int ret;
int written;
int toconv;
if (handler == NULL) return(-1);
if (out == NULL) return(-1);
if (in == NULL) return(-1);
/* calculate space available */
written = out->size - out->use - 1; /* count '\0' */
toconv = in->use;
/*
* echo '<?xml version="1.0" encoding="UCS4"?>' | wc -c => 38
* 45 chars should be sufficient to reach the end of the encoding
* declaration without going too far inside the document content.
* on UTF-16 this means 90bytes, on UCS4 this means 180
* The actual value depending on guessed encoding is passed as @len
* if provided
*/
if (toconv > 180)
toconv = 180;
if (toconv * 2 >= written) {
xmlBufferGrow(out, toconv * 2);
written = out->size - out->use - 1;
}
ret = xmlEncInputChunk(handler, &out->content[out->use], &written,
in->content, &toconv, 0);
xmlBufferShrink(in, toconv);
out->use += written;
out->content[out->use] = 0;
if (ret == -1) ret = -3;
#ifdef DEBUG_ENCODING
switch (ret) {
case 0:
xmlGenericError(xmlGenericErrorContext,
"converted %d bytes to %d bytes of input\n",
toconv, written);
break;
case -1:
xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
toconv, written, in->use);
break;
case -2:
xmlGenericError(xmlGenericErrorContext,
"input conversion failed due to input error\n");
break;
case -3:
xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
toconv, written, in->use);
break;
default:
xmlGenericError(xmlGenericErrorContext,"Unknown input conversion failed %d\n", ret);
}
#endif /* DEBUG_ENCODING */
/*
* Ignore when input buffer is not on a boundary
*/
if (ret == -3) ret = 0;
if (ret == -1) ret = 0;
return(written ? written : ret);
}
/**
* xmlCharEncFirstLineInput:
* @input: a parser input buffer
* @len: number of bytes to convert for the first line, or -1
*
* Front-end for the encoding handler input function, but handle only
* the very first line. Point is that this is based on autodetection
* of the encoding and once that first line is converted we may find
* out that a different decoder is needed to process the input.
*
* Returns the number of byte written if success, or
* -1 general error
* -2 if the transcoding fails (for *in is not valid utf8 string or
* the result of transformation can't fit into the encoding we want), or
*/
int
xmlCharEncFirstLineInput(xmlParserInputBufferPtr input, int len)
{
int ret;
size_t written;
size_t toconv;
int c_in;
int c_out;
xmlBufPtr in;
xmlBufPtr out;
if ((input == NULL) || (input->encoder == NULL) ||
(input->buffer == NULL) || (input->raw == NULL))
return (-1);
out = input->buffer;
in = input->raw;
toconv = xmlBufUse(in);
if (toconv == 0)
return (0);
written = xmlBufAvail(out);
/*
* echo '<?xml version="1.0" encoding="UCS4"?>' | wc -c => 38
* 45 chars should be sufficient to reach the end of the encoding
* declaration without going too far inside the document content.
* on UTF-16 this means 90bytes, on UCS4 this means 180
* The actual value depending on guessed encoding is passed as @len
* if provided
*/
if (len >= 0) {
if (toconv > (unsigned int) len)
toconv = len;
} else {
if (toconv > 180)
toconv = 180;
}
if (toconv * 2 >= written) {
xmlBufGrow(out, toconv * 2);
written = xmlBufAvail(out);
}
if (written > 360)
written = 360;
c_in = toconv;
c_out = written;
ret = xmlEncInputChunk(input->encoder, xmlBufEnd(out), &c_out,
xmlBufContent(in), &c_in, 0);
xmlBufShrink(in, c_in);
xmlBufAddLen(out, c_out);
if (ret == -1)
ret = -3;
switch (ret) {
case 0:
#ifdef DEBUG_ENCODING
xmlGenericError(xmlGenericErrorContext,
"converted %d bytes to %d bytes of input\n",
c_in, c_out);
#endif
break;
case -1:
#ifdef DEBUG_ENCODING
xmlGenericError(xmlGenericErrorContext,
"converted %d bytes to %d bytes of input, %d left\n",
c_in, c_out, (int)xmlBufUse(in));
#endif
break;
case -3:
#ifdef DEBUG_ENCODING
xmlGenericError(xmlGenericErrorContext,
"converted %d bytes to %d bytes of input, %d left\n",
c_in, c_out, (int)xmlBufUse(in));
#endif
break;
case -2: {
char buf[50];
const xmlChar *content = xmlBufContent(in);
snprintf(&buf[0], 49, "0x%02X 0x%02X 0x%02X 0x%02X",
content[0], content[1],
content[2], content[3]);
buf[49] = 0;
xmlEncodingErr(XML_I18N_CONV_FAILED,
"input conversion failed due to input error, bytes %s\n",
buf);
}
}
/*
* Ignore when input buffer is not on a boundary
*/
if (ret == -3) ret = 0;
if (ret == -1) ret = 0;
return(c_out ? c_out : ret);
return(xmlCharEncInFunc(handler, out, in));
}
/**

View File

@@ -203,6 +203,7 @@ XMLPUBFUN int
xmlCharEncInFunc (xmlCharEncodingHandler *handler,
xmlBufferPtr out,
xmlBufferPtr in);
XML_DEPRECATED
XMLPUBFUN int
xmlCharEncFirstLine (xmlCharEncodingHandler *handler,
xmlBufferPtr out,

View File

@@ -8,7 +8,8 @@ XML_HIDDEN void
xmlInitEncodingInternal(void);
XML_HIDDEN int
xmlCharEncFirstLineInput(xmlParserInputBufferPtr input, int len);
xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
int *outlen, const unsigned char *in, int *inlen, int flush);
XML_HIDDEN int
xmlCharEncInput(xmlParserInputBufferPtr input, int flush);
XML_HIDDEN int

View File

@@ -10367,6 +10367,7 @@ xmlParseEncodingDecl(xmlParserCtxtPtr ctxt) {
else if ((encoding != NULL) &&
((!xmlStrcasecmp(encoding, BAD_CAST "UTF-8")) ||
(!xmlStrcasecmp(encoding, BAD_CAST "UTF8")))) {
/* TODO: Check for encoding mismatch. */
if (ctxt->encoding != NULL)
xmlFree((xmlChar *) ctxt->encoding);
ctxt->encoding = encoding;
@@ -10692,15 +10693,7 @@ xmlParseDocument(xmlParserCtxtPtr ctxt) {
return(-1);
}
/*
* Check for the XMLDecl in the Prolog.
* do not GROW here to avoid the detected encoder to decode more
* than just the first line, unless the amount of data is really
* too small to hold "<?xml version="1.0" encoding="foo"
*/
if ((ctxt->input->end - ctxt->input->cur) < 35) {
GROW;
}
GROW;
if ((CMP5(CUR_PTR, '<', '?', 'x', 'm', 'l')) && (IS_BLANK_CH(NXT(5)))) {
/*
@@ -11347,12 +11340,9 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
/*
* If we are operating on converted input, try to flush
* remaining chars to avoid them stalling in the non-converted
* buffer. But do not do this in document start where
* encoding="..." may not have been read and we work on a
* guessed encoding.
* buffer.
*/
if ((ctxt->instate != XML_PARSER_START) &&
(ctxt->input->buf->raw != NULL) &&
if ((ctxt->input->buf->raw != NULL) &&
(xmlBufIsEmpty(ctxt->input->buf->raw) == 0)) {
size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer,
ctxt->input);
@@ -11395,6 +11385,13 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
start[2] = NXT(2);
start[3] = NXT(3);
enc = xmlDetectCharEncoding(start, 4);
/*
* We need more bytes to detect EBCDIC code pages.
* See xmlDetectEBCDIC.
*/
if ((enc == XML_CHAR_ENCODING_EBCDIC) &&
(!terminate) && (avail < 200))
goto done;
xmlSwitchEncoding(ctxt, enc);
break;
}
@@ -12186,15 +12183,8 @@ xmlCreatePushParserCtxt(xmlSAXHandlerPtr sax, void *user_data,
xmlParserCtxtPtr ctxt;
xmlParserInputPtr inputStream;
xmlParserInputBufferPtr buf;
xmlCharEncoding enc = XML_CHAR_ENCODING_NONE;
/*
* plug some encoding conversion routines
*/
if ((chunk != NULL) && (size >= 4))
enc = xmlDetectCharEncoding((const xmlChar *) chunk, size);
buf = xmlAllocParserInputBuffer(enc);
buf = xmlAllocParserInputBuffer(XML_CHAR_ENCODING_NONE);
if (buf == NULL) return(NULL);
ctxt = xmlNewSAXParserCtxt(sax, user_data);
@@ -12253,10 +12243,6 @@ xmlCreatePushParserCtxt(xmlSAXHandlerPtr sax, void *user_data,
#endif
}
if (enc != XML_CHAR_ENCODING_NONE) {
xmlSwitchEncoding(ctxt, enc);
}
return(ctxt);
}
#endif /* LIBXML_PUSH_ENABLED */

View File

@@ -990,9 +990,62 @@ xmlCopyChar(int len ATTRIBUTE_UNUSED, xmlChar *out, int val) {
* *
************************************************************************/
static int
xmlSwitchInputEncodingInt(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
xmlCharEncodingHandlerPtr handler, int len);
/**
 * xmlDetectEBCDIC:
 * @input: the parser input whose bytes between @input->cur and
 *         @input->end are inspected (they are not consumed)
 *
 * Pick an encoding handler for input that was auto-detected as EBCDIC.
 * The start of the input is decoded with the generic EBCDIC handler into
 * a small stack buffer, which is then scanned for an
 * encoding="..." pseudo-attribute before the first '>'.
 *
 * Returns the handler looked up by the declared encoding name if a
 * well-formed declaration was found (the result of
 * xmlFindCharEncodingHandler, which may be NULL), otherwise the generic
 * EBCDIC handler, or NULL if the generic handler is unavailable.
 */
static xmlCharEncodingHandlerPtr
xmlDetectEBCDIC(xmlParserInputPtr input) {
    xmlChar out[200];
    xmlCharEncodingHandlerPtr handler;
    int inlen, outlen, res, i;

    /*
     * To detect the EBCDIC code page, we decode the start of the input
     * with the generic EBCDIC handler and try to find the encoding
     * declaration.
     */
    handler = xmlGetCharEncodingHandler(XML_CHAR_ENCODING_EBCDIC);
    if (handler == NULL)
        return(NULL);

    /* Reserve one byte so the decoded data can be NUL-terminated. */
    outlen = sizeof(out) - 1;
    inlen = input->end - input->cur;
    res = xmlEncInputChunk(handler, out, &outlen, input->cur, &inlen, 0);
    /* On a decoding error, keep the generic handler. */
    if (res < 0)
        return(handler);

    /*
     * The scanning loops below advance the index without checking it
     * against outlen. The terminator guarantees that each of them stops
     * inside the initialized part of the buffer: NUL is neither blank,
     * '=', a quote nor a valid encoding name character, and it also
     * ends the xmlStrncmp comparison.
     */
    out[outlen] = 0;

    for (i = 0; i < outlen; i++) {
        /* The declaration must appear before the first '>'. */
        if (out[i] == '>')
            break;

        if ((out[i] == 'e') &&
            (xmlStrncmp(out + i, BAD_CAST "encoding", 8) == 0)) {
            int start, cur, quote;

            i += 8;
            while (IS_BLANK_CH(out[i]))
                i += 1;
            if (out[i++] != '=')
                break;
            while (IS_BLANK_CH(out[i]))
                i += 1;
            quote = out[i++];
            if ((quote != '\'') && (quote != '"'))
                break;

            /* Collect the encoding name up to the closing quote. */
            start = i;
            cur = out[i];
            while (((cur >= 'a') && (cur <= 'z')) ||
                   ((cur >= 'A') && (cur <= 'Z')) ||
                   ((cur >= '0') && (cur <= '9')) ||
                   (cur == '.') || (cur == '_') ||
                   (cur == '-'))
                cur = out[++i];
            if (cur != quote)
                break;

            /* Terminate the name in place and swap handlers. */
            out[i] = 0;
            xmlCharEncCloseFunc(handler);
            handler = xmlFindCharEncodingHandler((char *) out + start);
            break;
        }
    }

    return(handler);
}
/**
* xmlSwitchEncoding:
* @ctxt: the parser context
@@ -1007,7 +1060,6 @@ int
xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc)
{
xmlCharEncodingHandlerPtr handler;
int len = -1;
int ret;
if (ctxt == NULL) return(-1);
@@ -1036,51 +1088,13 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc)
ctxt->input->cur += 3;
}
return(0);
case XML_CHAR_ENCODING_UTF16LE:
case XML_CHAR_ENCODING_UTF16BE:
/*The raw input characters are encoded
*in UTF-16. As we expect this function
*to be called after xmlCharEncInFunc, we expect
*ctxt->input->cur to contain UTF-8 encoded characters.
*So the raw UTF16 Byte Order Mark
*has also been converted into
*an UTF-8 BOM. Let's skip that BOM.
*/
if ((ctxt->input != NULL) && (ctxt->input->cur != NULL) &&
(ctxt->input->cur[0] == 0xEF) &&
(ctxt->input->cur[1] == 0xBB) &&
(ctxt->input->cur[2] == 0xBF)) {
ctxt->input->cur += 3;
}
len = 90;
break;
case XML_CHAR_ENCODING_UCS2:
len = 90;
break;
case XML_CHAR_ENCODING_UCS4BE:
case XML_CHAR_ENCODING_UCS4LE:
case XML_CHAR_ENCODING_UCS4_2143:
case XML_CHAR_ENCODING_UCS4_3412:
len = 180;
break;
case XML_CHAR_ENCODING_EBCDIC:
case XML_CHAR_ENCODING_8859_1:
case XML_CHAR_ENCODING_8859_2:
case XML_CHAR_ENCODING_8859_3:
case XML_CHAR_ENCODING_8859_4:
case XML_CHAR_ENCODING_8859_5:
case XML_CHAR_ENCODING_8859_6:
case XML_CHAR_ENCODING_8859_7:
case XML_CHAR_ENCODING_8859_8:
case XML_CHAR_ENCODING_8859_9:
case XML_CHAR_ENCODING_ASCII:
case XML_CHAR_ENCODING_2022_JP:
case XML_CHAR_ENCODING_SHIFT_JIS:
case XML_CHAR_ENCODING_EUC_JP:
len = 45;
break;
case XML_CHAR_ENCODING_EBCDIC:
handler = xmlDetectEBCDIC(ctxt->input);
break;
default:
handler = xmlGetCharEncodingHandler(enc);
break;
}
handler = xmlGetCharEncodingHandler(enc);
if (handler == NULL) {
/*
* Default handlers.
@@ -1112,7 +1126,7 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc)
return(-1);
}
}
ret = xmlSwitchInputEncodingInt(ctxt, ctxt->input, handler, len);
ret = xmlSwitchInputEncoding(ctxt, ctxt->input, handler);
if ((ret < 0) || (ctxt->errNo == XML_I18N_CONV_FAILED)) {
/*
* on encoding conversion errors, stop the parser
@@ -1124,20 +1138,19 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc)
}
/**
* xmlSwitchInputEncodingInt:
* xmlSwitchInputEncoding:
* @ctxt: the parser context
* @input: the input stream
* @handler: the encoding handler
* @len: the number of bytes to convert for the first line or -1
*
* change the input functions when discovering the character encoding
* of a given entity.
*
* Returns 0 in case of success, -1 otherwise
*/
static int
xmlSwitchInputEncodingInt(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
xmlCharEncodingHandlerPtr handler, int len)
int
xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
xmlCharEncodingHandlerPtr handler)
{
int nbchars;
xmlParserInputBufferPtr in;
@@ -1159,30 +1172,17 @@ xmlSwitchInputEncodingInt(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
return (-1);
}
ctxt->charset = XML_CHAR_ENCODING_UTF8;
if (in->encoder != NULL) {
/*
* Check in case the auto encoding detection triggered
* in already.
* TODO: Detect encoding mismatch. We should start by comparing
* in->encoder->name and handler->name, but there are a few
* compatible encodings like UTF-16 and UCS-2 or UTF-32 and UCS-4.
*/
if (in->encoder == handler)
return (0);
/*
* Note: this is a bit dangerous, but that's what it
* takes to use nearly compatible signature for different
* encodings.
*
* FIXME: Encoders might buffer partial byte sequences, so
* this probably can't work. We should return an error and
* make sure that callers never try to switch the encoding
* twice.
*/
xmlCharEncCloseFunc(in->encoder);
in->encoder = handler;
xmlCharEncCloseFunc(handler);
return (0);
}
ctxt->charset = XML_CHAR_ENCODING_UTF8;
in->encoder = handler;
/*
@@ -1230,20 +1230,7 @@ xmlSwitchInputEncodingInt(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
in->rawconsumed = processed;
use = xmlBufUse(in->raw);
if (ctxt->html) {
/*
* convert as much as possible of the buffer
*/
nbchars = xmlCharEncInput(in, 1);
} else {
/*
* convert just enough to get
* '<?xml version="1.0" encoding="xxx"?>'
* parsed with the autodetected encoding
* into the parser reading buffer.
*/
nbchars = xmlCharEncFirstLineInput(in, len);
}
nbchars = xmlCharEncInput(in, 0);
xmlBufResetInput(in->buffer, input);
if (nbchars < 0) {
xmlErrInternal(ctxt,
@@ -1261,25 +1248,6 @@ xmlSwitchInputEncodingInt(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
return (0);
}
/**
* xmlSwitchInputEncoding:
* @ctxt: the parser context
* @input: the input stream
* @handler: the encoding handler
*
* DEPRECATED: Use xmlSwitchToEncoding
*
* change the input functions when discovering the character encoding
* of a given entity.
*
* Returns 0 in case of success, -1 otherwise
*/
int
xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
xmlCharEncodingHandlerPtr handler) {
return(xmlSwitchInputEncodingInt(ctxt, input, handler, -1));
}
/**
* xmlSwitchToEncoding:
* @ctxt: the parser context
@@ -1295,7 +1263,7 @@ xmlSwitchToEncoding(xmlParserCtxtPtr ctxt, xmlCharEncodingHandlerPtr handler)
{
if (ctxt == NULL)
return(-1);
return(xmlSwitchInputEncodingInt(ctxt, ctxt->input, handler, -1));
return(xmlSwitchInputEncoding(ctxt, ctxt->input, handler));
}
/************************************************************************