mirror of
https://gitlab.gnome.org/GNOME/libxml2.git
synced 2025-10-26 00:37:43 +03:00
Fixed ICU to set flush correctly and provide pivot buffer.
Because flush=TRUE was always passed to ICU, even when the input arrives over multiple reads, ICU could not correctly handle UTF-8 characters truncated at read boundaries. The fix is to set flush=TRUE only on the final read, and to provide a pivot buffer that libxml maintains between calls to ucnv_convertEx.
This commit is contained in:
committed by
Nick Wellnhofer
parent
4b4135977e
commit
0b19f236a2
46
encoding.c
46
encoding.c
@@ -110,6 +110,9 @@ openIcuConverter(const char* name, int toUnicode)
|
|||||||
if (conv == NULL)
|
if (conv == NULL)
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
|
conv->pivot_source = conv->pivot_buf;
|
||||||
|
conv->pivot_target = conv->pivot_buf;
|
||||||
|
|
||||||
conv->uconv = ucnv_open(name, &status);
|
conv->uconv = ucnv_open(name, &status);
|
||||||
if (U_FAILURE(status))
|
if (U_FAILURE(status))
|
||||||
goto error;
|
goto error;
|
||||||
@@ -1850,6 +1853,7 @@ xmlIconvWrapper(iconv_t cd, unsigned char *out, int *outlen,
|
|||||||
* @outlen: the length of @out
|
* @outlen: the length of @out
|
||||||
* @in: a pointer to an array of ISO Latin 1 chars
|
* @in: a pointer to an array of ISO Latin 1 chars
|
||||||
* @inlen: the length of @in
|
* @inlen: the length of @in
|
||||||
|
* @flush: if true, indicates end of input
|
||||||
*
|
*
|
||||||
* Returns 0 if success, or
|
* Returns 0 if success, or
|
||||||
* -1 by lack of space, or
|
* -1 by lack of space, or
|
||||||
@@ -1863,7 +1867,7 @@ xmlIconvWrapper(iconv_t cd, unsigned char *out, int *outlen,
|
|||||||
*/
|
*/
|
||||||
static int
|
static int
|
||||||
xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned char *out, int *outlen,
|
xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned char *out, int *outlen,
|
||||||
const unsigned char *in, int *inlen) {
|
const unsigned char *in, int *inlen, int flush) {
|
||||||
const char *ucv_in = (const char *) in;
|
const char *ucv_in = (const char *) in;
|
||||||
char *ucv_out = (char *) out;
|
char *ucv_out = (char *) out;
|
||||||
UErrorCode err = U_ZERO_ERROR;
|
UErrorCode err = U_ZERO_ERROR;
|
||||||
@@ -1873,33 +1877,31 @@ xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned char *out, int *outlen,
|
|||||||
return(-1);
|
return(-1);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* TODO(jungshik)
|
|
||||||
* 1. is ucnv_convert(To|From)Algorithmic better?
|
|
||||||
* 2. had we better use an explicit pivot buffer?
|
|
||||||
* 3. error returned comes from 'fromUnicode' only even
|
|
||||||
* when toUnicode is true !
|
|
||||||
*/
|
|
||||||
if (toUnicode) {
|
if (toUnicode) {
|
||||||
/* encoding => UTF-16 => UTF-8 */
|
/* encoding => UTF-16 => UTF-8 */
|
||||||
ucnv_convertEx(cd->utf8, cd->uconv, &ucv_out, ucv_out + *outlen,
|
ucnv_convertEx(cd->utf8, cd->uconv, &ucv_out, ucv_out + *outlen,
|
||||||
&ucv_in, ucv_in + *inlen, NULL, NULL, NULL, NULL,
|
&ucv_in, ucv_in + *inlen, cd->pivot_buf,
|
||||||
0, TRUE, &err);
|
&cd->pivot_source, &cd->pivot_target,
|
||||||
|
cd->pivot_buf + ICU_PIVOT_BUF_SIZE, 0, flush, &err);
|
||||||
} else {
|
} else {
|
||||||
/* UTF-8 => UTF-16 => encoding */
|
/* UTF-8 => UTF-16 => encoding */
|
||||||
ucnv_convertEx(cd->uconv, cd->utf8, &ucv_out, ucv_out + *outlen,
|
ucnv_convertEx(cd->uconv, cd->utf8, &ucv_out, ucv_out + *outlen,
|
||||||
&ucv_in, ucv_in + *inlen, NULL, NULL, NULL, NULL,
|
&ucv_in, ucv_in + *inlen, cd->pivot_buf,
|
||||||
0, TRUE, &err);
|
&cd->pivot_source, &cd->pivot_target,
|
||||||
|
cd->pivot_buf + ICU_PIVOT_BUF_SIZE, 0, flush, &err);
|
||||||
}
|
}
|
||||||
*inlen = ucv_in - (const char*) in;
|
*inlen = ucv_in - (const char*) in;
|
||||||
*outlen = ucv_out - (char *) out;
|
*outlen = ucv_out - (char *) out;
|
||||||
if (U_SUCCESS(err))
|
if (U_SUCCESS(err)) {
|
||||||
|
/* reset pivot buf if this is the last call for input (flush==TRUE) */
|
||||||
|
if (flush)
|
||||||
|
cd->pivot_source = cd->pivot_target = cd->pivot_buf;
|
||||||
return 0;
|
return 0;
|
||||||
|
}
|
||||||
if (err == U_BUFFER_OVERFLOW_ERROR)
|
if (err == U_BUFFER_OVERFLOW_ERROR)
|
||||||
return -1;
|
return -1;
|
||||||
if (err == U_INVALID_CHAR_FOUND || err == U_ILLEGAL_CHAR_FOUND)
|
if (err == U_INVALID_CHAR_FOUND || err == U_ILLEGAL_CHAR_FOUND)
|
||||||
return -2;
|
return -2;
|
||||||
/* if (err == U_TRUNCATED_CHAR_FOUND) */
|
|
||||||
return -3;
|
return -3;
|
||||||
}
|
}
|
||||||
#endif /* LIBXML_ICU_ENABLED */
|
#endif /* LIBXML_ICU_ENABLED */
|
||||||
@@ -1912,7 +1914,7 @@ xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned char *out, int *outlen,
|
|||||||
|
|
||||||
static int
|
static int
|
||||||
xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
|
xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
|
||||||
int *outlen, const unsigned char *in, int *inlen) {
|
int *outlen, const unsigned char *in, int *inlen, int flush) {
|
||||||
int ret;
|
int ret;
|
||||||
|
|
||||||
if (handler->input != NULL) {
|
if (handler->input != NULL) {
|
||||||
@@ -1925,7 +1927,8 @@ xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
|
|||||||
#endif /* LIBXML_ICONV_ENABLED */
|
#endif /* LIBXML_ICONV_ENABLED */
|
||||||
#ifdef LIBXML_ICU_ENABLED
|
#ifdef LIBXML_ICU_ENABLED
|
||||||
else if (handler->uconv_in != NULL) {
|
else if (handler->uconv_in != NULL) {
|
||||||
ret = xmlUconvWrapper(handler->uconv_in, 1, out, outlen, in, inlen);
|
ret = xmlUconvWrapper(handler->uconv_in, 1, out, outlen, in, inlen,
|
||||||
|
flush);
|
||||||
}
|
}
|
||||||
#endif /* LIBXML_ICU_ENABLED */
|
#endif /* LIBXML_ICU_ENABLED */
|
||||||
else {
|
else {
|
||||||
@@ -1953,7 +1956,8 @@ xmlEncOutputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
|
|||||||
#endif /* LIBXML_ICONV_ENABLED */
|
#endif /* LIBXML_ICONV_ENABLED */
|
||||||
#ifdef LIBXML_ICU_ENABLED
|
#ifdef LIBXML_ICU_ENABLED
|
||||||
else if (handler->uconv_out != NULL) {
|
else if (handler->uconv_out != NULL) {
|
||||||
ret = xmlUconvWrapper(handler->uconv_out, 0, out, outlen, in, inlen);
|
ret = xmlUconvWrapper(handler->uconv_out, 0, out, outlen, in, inlen,
|
||||||
|
TRUE);
|
||||||
}
|
}
|
||||||
#endif /* LIBXML_ICU_ENABLED */
|
#endif /* LIBXML_ICU_ENABLED */
|
||||||
else {
|
else {
|
||||||
@@ -2015,7 +2019,7 @@ xmlCharEncFirstLineInt(xmlCharEncodingHandler *handler, xmlBufferPtr out,
|
|||||||
}
|
}
|
||||||
|
|
||||||
ret = xmlEncInputChunk(handler, &out->content[out->use], &written,
|
ret = xmlEncInputChunk(handler, &out->content[out->use], &written,
|
||||||
in->content, &toconv);
|
in->content, &toconv, 0);
|
||||||
xmlBufferShrink(in, toconv);
|
xmlBufferShrink(in, toconv);
|
||||||
out->use += written;
|
out->use += written;
|
||||||
out->content[out->use] = 0;
|
out->content[out->use] = 0;
|
||||||
@@ -2133,7 +2137,7 @@ xmlCharEncFirstLineInput(xmlParserInputBufferPtr input, int len)
|
|||||||
c_in = toconv;
|
c_in = toconv;
|
||||||
c_out = written;
|
c_out = written;
|
||||||
ret = xmlEncInputChunk(input->encoder, xmlBufEnd(out), &c_out,
|
ret = xmlEncInputChunk(input->encoder, xmlBufEnd(out), &c_out,
|
||||||
xmlBufContent(in), &c_in);
|
xmlBufContent(in), &c_in, 0);
|
||||||
xmlBufShrink(in, c_in);
|
xmlBufShrink(in, c_in);
|
||||||
xmlBufAddLen(out, c_out);
|
xmlBufAddLen(out, c_out);
|
||||||
if (ret == -1)
|
if (ret == -1)
|
||||||
@@ -2231,7 +2235,7 @@ xmlCharEncInput(xmlParserInputBufferPtr input, int flush)
|
|||||||
c_in = toconv;
|
c_in = toconv;
|
||||||
c_out = written;
|
c_out = written;
|
||||||
ret = xmlEncInputChunk(input->encoder, xmlBufEnd(out), &c_out,
|
ret = xmlEncInputChunk(input->encoder, xmlBufEnd(out), &c_out,
|
||||||
xmlBufContent(in), &c_in);
|
xmlBufContent(in), &c_in, flush);
|
||||||
xmlBufShrink(in, c_in);
|
xmlBufShrink(in, c_in);
|
||||||
xmlBufAddLen(out, c_out);
|
xmlBufAddLen(out, c_out);
|
||||||
if (ret == -1)
|
if (ret == -1)
|
||||||
@@ -2317,7 +2321,7 @@ xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out,
|
|||||||
written = out->size - out->use - 1;
|
written = out->size - out->use - 1;
|
||||||
}
|
}
|
||||||
ret = xmlEncInputChunk(handler, &out->content[out->use], &written,
|
ret = xmlEncInputChunk(handler, &out->content[out->use], &written,
|
||||||
in->content, &toconv);
|
in->content, &toconv, 1);
|
||||||
xmlBufferShrink(in, toconv);
|
xmlBufferShrink(in, toconv);
|
||||||
out->use += written;
|
out->use += written;
|
||||||
out->content[out->use] = 0;
|
out->content[out->use] = 0;
|
||||||
|
|||||||
@@ -129,9 +129,14 @@ typedef int (* xmlCharEncodingOutputFunc)(unsigned char *out, int *outlen,
|
|||||||
* If iconv is supported, there are two extra fields.
|
* If iconv is supported, there are two extra fields.
|
||||||
*/
|
*/
|
||||||
#ifdef LIBXML_ICU_ENABLED
|
#ifdef LIBXML_ICU_ENABLED
|
||||||
|
/* Size of pivot buffer, same as icu/source/common/ucnv.cpp CHUNK_SIZE */
|
||||||
|
#define ICU_PIVOT_BUF_SIZE 1024
|
||||||
struct _uconv_t {
|
struct _uconv_t {
|
||||||
UConverter *uconv; /* for conversion between an encoding and UTF-16 */
|
UConverter *uconv; /* for conversion between an encoding and UTF-16 */
|
||||||
UConverter *utf8; /* for conversion between UTF-8 and UTF-16 */
|
UConverter *utf8; /* for conversion between UTF-8 and UTF-16 */
|
||||||
|
UChar pivot_buf[ICU_PIVOT_BUF_SIZE];
|
||||||
|
UChar *pivot_source;
|
||||||
|
UChar *pivot_target;
|
||||||
};
|
};
|
||||||
typedef struct _uconv_t uconv_t;
|
typedef struct _uconv_t uconv_t;
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
Reference in New Issue
Block a user