diff --git a/doc/src/sgml/ref/create_conversion.sgml b/doc/src/sgml/ref/create_conversion.sgml index e7700fecfc5..75d7b009455 100644 --- a/doc/src/sgml/ref/create_conversion.sgml +++ b/doc/src/sgml/ref/create_conversion.sgml @@ -117,9 +117,15 @@ conv_proc( integer, -- destination encoding ID cstring, -- source string (null terminated C string) internal, -- destination (fill with a null terminated C string) - integer -- source string length -) RETURNS void; - + integer, -- source string length + boolean -- if true, don't throw an error if conversion fails +) RETURNS integer; + + The return value is the number of source bytes that were successfully + converted. If the last argument is false, the function must throw an + error on invalid input, and the return value is always equal to the + source string length. + diff --git a/src/backend/commands/conversioncmds.c b/src/backend/commands/conversioncmds.c index f7ff321de71..5fed97a2f99 100644 --- a/src/backend/commands/conversioncmds.c +++ b/src/backend/commands/conversioncmds.c @@ -45,8 +45,9 @@ CreateConversionCommand(CreateConversionStmt *stmt) const char *from_encoding_name = stmt->for_encoding_name; const char *to_encoding_name = stmt->to_encoding_name; List *func_name = stmt->func_name; - static const Oid funcargs[] = {INT4OID, INT4OID, CSTRINGOID, INTERNALOID, INT4OID}; + static const Oid funcargs[] = {INT4OID, INT4OID, CSTRINGOID, INTERNALOID, INT4OID, BOOLOID}; char result[1]; + Datum funcresult; /* Convert list of names to a name and namespace */ namespaceId = QualifiedNameGetCreationNamespace(stmt->conversion_name, @@ -92,12 +93,12 @@ CreateConversionCommand(CreateConversionStmt *stmt) funcoid = LookupFuncName(func_name, sizeof(funcargs) / sizeof(Oid), funcargs, false); - /* Check it returns VOID, else it's probably the wrong function */ - if (get_func_rettype(funcoid) != VOIDOID) + /* Check it returns int4, else it's probably the wrong function */ + if (get_func_rettype(funcoid) != INT4OID) ereport(ERROR, 
(errcode(ERRCODE_INVALID_OBJECT_DEFINITION), errmsg("encoding conversion function %s must return type %s", - NameListToString(func_name), "void"))); + NameListToString(func_name), "integer"))); /* Check we have EXECUTE rights for the function */ aclresult = pg_proc_aclcheck(funcoid, GetUserId(), ACL_EXECUTE); @@ -111,12 +112,23 @@ CreateConversionCommand(CreateConversionStmt *stmt) * string; the conversion function should throw an error if it can't * perform the requested conversion. */ - OidFunctionCall5(funcoid, - Int32GetDatum(from_encoding), - Int32GetDatum(to_encoding), - CStringGetDatum(""), - CStringGetDatum(result), - Int32GetDatum(0)); + funcresult = OidFunctionCall6(funcoid, + Int32GetDatum(from_encoding), + Int32GetDatum(to_encoding), + CStringGetDatum(""), + CStringGetDatum(result), + Int32GetDatum(0), + BoolGetDatum(false)); + + /* + * The function should return 0 for empty input. Might as well check that, + * too. + */ + if (DatumGetInt32(funcresult) != 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("encoding conversion function %s returned incorrect result for empty input", + NameListToString(func_name)))); /* * All seem ok, go ahead (possible failure would be a duplicate conversion diff --git a/src/backend/utils/error/elog.c b/src/backend/utils/error/elog.c index 9e4ea1b345a..423df2f3006 100644 --- a/src/backend/utils/error/elog.c +++ b/src/backend/utils/error/elog.c @@ -2271,6 +2271,8 @@ write_console(const char *line, int len) * Conversion on non-win32 platforms is not implemented yet. It requires * non-throw version of pg_do_encoding_conversion(), that converts * unconvertable characters to '?' without errors. + * + * XXX: We have a no-throw version now. It doesn't convert to '?' though. 
*/ #endif diff --git a/src/backend/utils/mb/conv.c b/src/backend/utils/mb/conv.c index a07b54bd3b8..33e9c9a9e3c 100644 --- a/src/backend/utils/mb/conv.c +++ b/src/backend/utils/mb/conv.c @@ -25,15 +25,20 @@ * tab holds conversion entries for the source charset * starting from 128 (0x80). each entry in the table holds the corresponding * code point for the target charset, or 0 if there is no equivalent code. + * + * Returns the number of input bytes consumed. If noError is true, this can + * be less than 'len'. */ -void +int local2local(const unsigned char *l, unsigned char *p, int len, int src_encoding, int dest_encoding, - const unsigned char *tab) + const unsigned char *tab, + bool noError) { + const unsigned char *start = l; unsigned char c1, c2; @@ -41,7 +46,11 @@ local2local(const unsigned char *l, { c1 = *l; if (c1 == 0) + { + if (noError) + break; report_invalid_encoding(src_encoding, (const char *) l, len); + } if (!IS_HIGHBIT_SET(c1)) *p++ = c1; else @@ -50,13 +59,19 @@ local2local(const unsigned char *l, if (c2) *p++ = c2; else + { + if (noError) + break; report_untranslatable_char(src_encoding, dest_encoding, (const char *) l, len); + } } l++; len--; } *p = '\0'; + + return l - start; } /* @@ -66,18 +81,26 @@ local2local(const unsigned char *l, * p is the output area (must be large enough!) * lc is the mule character set id for the local encoding * encoding is the PG identifier for the local encoding + * + * Returns the number of input bytes consumed. If noError is true, this can + * be less than 'len'. 
*/ -void +int latin2mic(const unsigned char *l, unsigned char *p, int len, - int lc, int encoding) + int lc, int encoding, bool noError) { + const unsigned char *start = l; int c1; while (len > 0) { c1 = *l; if (c1 == 0) + { + if (noError) + break; report_invalid_encoding(encoding, (const char *) l, len); + } if (IS_HIGHBIT_SET(c1)) *p++ = lc; *p++ = c1; @@ -85,6 +108,8 @@ latin2mic(const unsigned char *l, unsigned char *p, int len, len--; } *p = '\0'; + + return l - start; } /* @@ -94,18 +119,26 @@ latin2mic(const unsigned char *l, unsigned char *p, int len, * p is the output area (must be large enough!) * lc is the mule character set id for the local encoding * encoding is the PG identifier for the local encoding + * + * Returns the number of input bytes consumed. If noError is true, this can + * be less than 'len'. */ -void +int mic2latin(const unsigned char *mic, unsigned char *p, int len, - int lc, int encoding) + int lc, int encoding, bool noError) { + const unsigned char *start = mic; int c1; while (len > 0) { c1 = *mic; if (c1 == 0) + { + if (noError) + break; report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len); + } if (!IS_HIGHBIT_SET(c1)) { /* easy for ASCII */ @@ -118,17 +151,27 @@ mic2latin(const unsigned char *mic, unsigned char *p, int len, int l = pg_mule_mblen(mic); if (len < l) + { + if (noError) + break; report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len); + } if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1])) + { + if (noError) + break; report_untranslatable_char(PG_MULE_INTERNAL, encoding, (const char *) mic, len); + } *p++ = mic[1]; mic += 2; len -= 2; } } *p = '\0'; + + return mic - start; } @@ -143,15 +186,20 @@ mic2latin(const unsigned char *mic, unsigned char *p, int len, * tab holds conversion entries for the local charset * starting from 128 (0x80). each entry in the table holds the corresponding * code point for the mule encoding, or 0 if there is no equivalent code. 
+ * + * Returns the number of input bytes consumed. If noError is true, this can + * be less than 'len'. */ -void +int latin2mic_with_table(const unsigned char *l, unsigned char *p, int len, int lc, int encoding, - const unsigned char *tab) + const unsigned char *tab, + bool noError) { + const unsigned char *start = l; unsigned char c1, c2; @@ -159,7 +207,11 @@ latin2mic_with_table(const unsigned char *l, { c1 = *l; if (c1 == 0) + { + if (noError) + break; report_invalid_encoding(encoding, (const char *) l, len); + } if (!IS_HIGHBIT_SET(c1)) *p++ = c1; else @@ -171,13 +223,19 @@ latin2mic_with_table(const unsigned char *l, *p++ = c2; } else + { + if (noError) + break; report_untranslatable_char(encoding, PG_MULE_INTERNAL, (const char *) l, len); + } } l++; len--; } *p = '\0'; + + return l - start; } /* @@ -191,15 +249,20 @@ latin2mic_with_table(const unsigned char *l, * tab holds conversion entries for the mule internal code's second byte, * starting from 128 (0x80). each entry in the table holds the corresponding * code point for the local charset, or 0 if there is no equivalent code. + * + * Returns the number of input bytes consumed. If noError is true, this can + * be less than 'len'. 
*/ -void +int mic2latin_with_table(const unsigned char *mic, unsigned char *p, int len, int lc, int encoding, - const unsigned char *tab) + const unsigned char *tab, + bool noError) { + const unsigned char *start = mic; unsigned char c1, c2; @@ -207,7 +270,11 @@ mic2latin_with_table(const unsigned char *mic, { c1 = *mic; if (c1 == 0) + { + if (noError) + break; report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len); + } if (!IS_HIGHBIT_SET(c1)) { /* easy for ASCII */ @@ -220,11 +287,17 @@ mic2latin_with_table(const unsigned char *mic, int l = pg_mule_mblen(mic); if (len < l) + { + if (noError) + break; report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len); + } if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]) || (c2 = tab[mic[1] - HIGHBIT]) == 0) { + if (noError) + break; report_untranslatable_char(PG_MULE_INTERNAL, encoding, (const char *) mic, len); break; /* keep compiler quiet */ @@ -235,6 +308,8 @@ mic2latin_with_table(const unsigned char *mic, } } *p = '\0'; + + return mic - start; } /* @@ -424,18 +499,22 @@ pg_mb_radix_conv(const pg_mb_radix_tree *rt, * is applied. An error is raised if no match is found. * * See pg_wchar.h for more details about the data structures used here. + * + * Returns the number of input bytes consumed. If noError is true, this can + * be less than 'len'. 
*/ -void +int UtfToLocal(const unsigned char *utf, int len, unsigned char *iso, const pg_mb_radix_tree *map, const pg_utf_to_local_combined *cmap, int cmapsize, utf_local_conversion_func conv_func, - int encoding) + int encoding, bool noError) { uint32 iutf; int l; const pg_utf_to_local_combined *cp; + const unsigned char *start = utf; if (!PG_VALID_ENCODING(encoding)) ereport(ERROR, @@ -505,10 +584,19 @@ UtfToLocal(const unsigned char *utf, int len, l = pg_utf_mblen(utf); if (len < l) + { + /* need more data to decide if this is a combined char */ + utf -= l_save; break; + } if (!pg_utf8_islegal(utf, l)) + { + if (!noError) + report_invalid_encoding(PG_UTF8, (const char *) utf, len); + utf -= l_save; break; + } /* We assume ASCII character cannot be in combined map */ if (l > 1) @@ -584,15 +672,20 @@ UtfToLocal(const unsigned char *utf, int len, } /* failed to translate this character */ + utf -= l; + if (noError) + break; report_untranslatable_char(PG_UTF8, encoding, - (const char *) (utf - l), len); + (const char *) utf, len); } /* if we broke out of loop early, must be invalid input */ - if (len > 0) + if (len > 0 && !noError) report_invalid_encoding(PG_UTF8, (const char *) utf, len); *iso = '\0'; + + return utf - start; } /* @@ -616,18 +709,23 @@ UtfToLocal(const unsigned char *utf, int len, * (if provided) is applied. An error is raised if no match is found. * * See pg_wchar.h for more details about the data structures used here. + * + * Returns the number of input bytes consumed. If noError is true, this can + * be less than 'len'. 
*/ -void +int LocalToUtf(const unsigned char *iso, int len, unsigned char *utf, const pg_mb_radix_tree *map, const pg_local_to_utf_combined *cmap, int cmapsize, utf_local_conversion_func conv_func, - int encoding) + int encoding, + bool noError) { uint32 iiso; int l; const pg_local_to_utf_combined *cp; + const unsigned char *start = iso; if (!PG_VALID_ENCODING(encoding)) ereport(ERROR, @@ -723,13 +821,18 @@ LocalToUtf(const unsigned char *iso, int len, } /* failed to translate this character */ + iso -= l; + if (noError) + break; report_untranslatable_char(encoding, PG_UTF8, - (const char *) (iso - l), len); + (const char *) iso, len); } /* if we broke out of loop early, must be invalid input */ - if (len > 0) + if (len > 0 && !noError) report_invalid_encoding(encoding, (const char *) iso, len); *utf = '\0'; + + return iso - start; } diff --git a/src/backend/utils/mb/conversion_procs/cyrillic_and_mic/cyrillic_and_mic.c b/src/backend/utils/mb/conversion_procs/cyrillic_and_mic/cyrillic_and_mic.c index 4c5b02654de..368c2deb5e4 100644 --- a/src/backend/utils/mb/conversion_procs/cyrillic_and_mic/cyrillic_and_mic.c +++ b/src/backend/utils/mb/conversion_procs/cyrillic_and_mic/cyrillic_and_mic.c @@ -44,8 +44,11 @@ PG_FUNCTION_INFO_V1(win866_to_iso); * INTEGER, -- destination encoding id * CSTRING, -- source string (null terminated C string) * CSTRING, -- destination string (null terminated C string) - * INTEGER -- source string length - * ) returns VOID; + * INTEGER, -- source string length + * BOOL -- if true, don't throw an error if conversion fails + * ) returns INTEGER; + * + * Returns the number of bytes successfully converted. 
* ---------- */ @@ -306,12 +309,14 @@ koi8r_to_mic(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_KOI8R, PG_MULE_INTERNAL); - latin2mic(src, dest, len, LC_KOI8_R, PG_KOI8R); + converted = latin2mic(src, dest, len, LC_KOI8_R, PG_KOI8R, noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -320,12 +325,14 @@ mic_to_koi8r(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_MULE_INTERNAL, PG_KOI8R); - mic2latin(src, dest, len, LC_KOI8_R, PG_KOI8R); + converted = mic2latin(src, dest, len, LC_KOI8_R, PG_KOI8R, noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -334,12 +341,14 @@ iso_to_mic(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_ISO_8859_5, PG_MULE_INTERNAL); - latin2mic_with_table(src, dest, len, LC_KOI8_R, PG_ISO_8859_5, iso2koi); + converted = latin2mic_with_table(src, dest, len, LC_KOI8_R, PG_ISO_8859_5, iso2koi, noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -348,12 +357,14 @@ mic_to_iso(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_MULE_INTERNAL, PG_ISO_8859_5); - mic2latin_with_table(src, dest, len, LC_KOI8_R, PG_ISO_8859_5, koi2iso); + converted = mic2latin_with_table(src, 
dest, len, LC_KOI8_R, PG_ISO_8859_5, koi2iso, noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -362,12 +373,14 @@ win1251_to_mic(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_WIN1251, PG_MULE_INTERNAL); - latin2mic_with_table(src, dest, len, LC_KOI8_R, PG_WIN1251, win12512koi); + converted = latin2mic_with_table(src, dest, len, LC_KOI8_R, PG_WIN1251, win12512koi, noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -376,12 +389,14 @@ mic_to_win1251(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_MULE_INTERNAL, PG_WIN1251); - mic2latin_with_table(src, dest, len, LC_KOI8_R, PG_WIN1251, koi2win1251); + converted = mic2latin_with_table(src, dest, len, LC_KOI8_R, PG_WIN1251, koi2win1251, noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -390,12 +405,14 @@ win866_to_mic(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_WIN866, PG_MULE_INTERNAL); - latin2mic_with_table(src, dest, len, LC_KOI8_R, PG_WIN866, win8662koi); + converted = latin2mic_with_table(src, dest, len, LC_KOI8_R, PG_WIN866, win8662koi, noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -404,12 +421,14 @@ mic_to_win866(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = 
PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_MULE_INTERNAL, PG_WIN866); - mic2latin_with_table(src, dest, len, LC_KOI8_R, PG_WIN866, koi2win866); + converted = mic2latin_with_table(src, dest, len, LC_KOI8_R, PG_WIN866, koi2win866, noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -418,12 +437,14 @@ koi8r_to_win1251(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_KOI8R, PG_WIN1251); - local2local(src, dest, len, PG_KOI8R, PG_WIN1251, koi2win1251); + converted = local2local(src, dest, len, PG_KOI8R, PG_WIN1251, koi2win1251, noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -432,12 +453,14 @@ win1251_to_koi8r(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_WIN1251, PG_KOI8R); - local2local(src, dest, len, PG_WIN1251, PG_KOI8R, win12512koi); + converted = local2local(src, dest, len, PG_WIN1251, PG_KOI8R, win12512koi, noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -446,12 +469,14 @@ koi8r_to_win866(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_KOI8R, PG_WIN866); - local2local(src, dest, len, PG_KOI8R, PG_WIN866, koi2win866); + converted = local2local(src, dest, len, PG_KOI8R, PG_WIN866, koi2win866, noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -460,12 +485,14 @@ win866_to_koi8r(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) 
PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_WIN866, PG_KOI8R); - local2local(src, dest, len, PG_WIN866, PG_KOI8R, win8662koi); + converted = local2local(src, dest, len, PG_WIN866, PG_KOI8R, win8662koi, noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -474,12 +501,14 @@ win866_to_win1251(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_WIN866, PG_WIN1251); - local2local(src, dest, len, PG_WIN866, PG_WIN1251, win8662win1251); + converted = local2local(src, dest, len, PG_WIN866, PG_WIN1251, win8662win1251, noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -488,12 +517,14 @@ win1251_to_win866(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_WIN1251, PG_WIN866); - local2local(src, dest, len, PG_WIN1251, PG_WIN866, win12512win866); + converted = local2local(src, dest, len, PG_WIN1251, PG_WIN866, win12512win866, noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -502,12 +533,14 @@ iso_to_koi8r(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_ISO_8859_5, PG_KOI8R); - local2local(src, dest, len, PG_ISO_8859_5, PG_KOI8R, iso2koi); + converted = local2local(src, dest, len, PG_ISO_8859_5, PG_KOI8R, iso2koi, noError); - PG_RETURN_VOID(); + 
PG_RETURN_INT32(converted); } Datum @@ -516,12 +549,14 @@ koi8r_to_iso(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_KOI8R, PG_ISO_8859_5); - local2local(src, dest, len, PG_KOI8R, PG_ISO_8859_5, koi2iso); + converted = local2local(src, dest, len, PG_KOI8R, PG_ISO_8859_5, koi2iso, noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -530,12 +565,14 @@ iso_to_win1251(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_ISO_8859_5, PG_WIN1251); - local2local(src, dest, len, PG_ISO_8859_5, PG_WIN1251, iso2win1251); + converted = local2local(src, dest, len, PG_ISO_8859_5, PG_WIN1251, iso2win1251, noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -544,12 +581,14 @@ win1251_to_iso(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_WIN1251, PG_ISO_8859_5); - local2local(src, dest, len, PG_WIN1251, PG_ISO_8859_5, win12512iso); + converted = local2local(src, dest, len, PG_WIN1251, PG_ISO_8859_5, win12512iso, noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -558,12 +597,14 @@ iso_to_win866(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_ISO_8859_5, PG_WIN866); - local2local(src, dest, len, 
PG_ISO_8859_5, PG_WIN866, iso2win866); + converted = local2local(src, dest, len, PG_ISO_8859_5, PG_WIN866, iso2win866, noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -572,10 +613,12 @@ win866_to_iso(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_WIN866, PG_ISO_8859_5); - local2local(src, dest, len, PG_WIN866, PG_ISO_8859_5, win8662iso); + converted = local2local(src, dest, len, PG_WIN866, PG_ISO_8859_5, win8662iso, noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } diff --git a/src/backend/utils/mb/conversion_procs/euc2004_sjis2004/euc2004_sjis2004.c b/src/backend/utils/mb/conversion_procs/euc2004_sjis2004/euc2004_sjis2004.c index 4d7fb116cfd..a3fd35bd406 100644 --- a/src/backend/utils/mb/conversion_procs/euc2004_sjis2004/euc2004_sjis2004.c +++ b/src/backend/utils/mb/conversion_procs/euc2004_sjis2004/euc2004_sjis2004.c @@ -19,8 +19,8 @@ PG_MODULE_MAGIC; PG_FUNCTION_INFO_V1(euc_jis_2004_to_shift_jis_2004); PG_FUNCTION_INFO_V1(shift_jis_2004_to_euc_jis_2004); -static void euc_jis_20042shift_jis_2004(const unsigned char *euc, unsigned char *p, int len); -static void shift_jis_20042euc_jis_2004(const unsigned char *sjis, unsigned char *p, int len); +static int euc_jis_20042shift_jis_2004(const unsigned char *euc, unsigned char *p, int len, bool noError); +static int shift_jis_20042euc_jis_2004(const unsigned char *sjis, unsigned char *p, int len, bool noError); /* ---------- * conv_proc( @@ -28,8 +28,11 @@ static void shift_jis_20042euc_jis_2004(const unsigned char *sjis, unsigned char * INTEGER, -- destination encoding id * CSTRING, -- source string (null terminated C string) * CSTRING, -- destination string (null terminated C string) - * INTEGER -- source string length - * ) returns VOID; + * INTEGER, -- source string length 
+ * BOOL -- if true, don't throw an error if conversion fails + * ) returns INTEGER; + * + * Returns the number of bytes successfully converted. * ---------- */ @@ -39,12 +42,14 @@ euc_jis_2004_to_shift_jis_2004(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_EUC_JIS_2004, PG_SHIFT_JIS_2004); - euc_jis_20042shift_jis_2004(src, dest, len); + converted = euc_jis_20042shift_jis_2004(src, dest, len, noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -53,20 +58,23 @@ shift_jis_2004_to_euc_jis_2004(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_SHIFT_JIS_2004, PG_EUC_JIS_2004); - shift_jis_20042euc_jis_2004(src, dest, len); + converted = shift_jis_20042euc_jis_2004(src, dest, len, noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } /* * EUC_JIS_2004 -> SHIFT_JIS_2004 */ -static void -euc_jis_20042shift_jis_2004(const unsigned char *euc, unsigned char *p, int len) +static int +euc_jis_20042shift_jis_2004(const unsigned char *euc, unsigned char *p, int len, bool noError) { + const unsigned char *start = euc; int c1, ku, ten; @@ -79,8 +87,12 @@ euc_jis_20042shift_jis_2004(const unsigned char *euc, unsigned char *p, int len) { /* ASCII */ if (c1 == 0) + { + if (noError) + break; report_invalid_encoding(PG_EUC_JIS_2004, (const char *) euc, len); + } *p++ = c1; euc++; len--; @@ -90,8 +102,12 @@ euc_jis_20042shift_jis_2004(const unsigned char *euc, unsigned char *p, int len) l = pg_encoding_verifymbchar(PG_EUC_JIS_2004, (const char *) euc, len); if (l < 0) + { + if (noError) + break; report_invalid_encoding(PG_EUC_JIS_2004, (const char 
*) euc, len); + } if (c1 == SS2 && l == 2) /* JIS X 0201 kana? */ { @@ -121,8 +137,12 @@ euc_jis_20042shift_jis_2004(const unsigned char *euc, unsigned char *p, int len) *p++ = (ku + 0x19b) >> 1; } else + { + if (noError) + break; report_invalid_encoding(PG_EUC_JIS_2004, (const char *) euc, len); + } } if (ku % 2) @@ -132,8 +152,12 @@ euc_jis_20042shift_jis_2004(const unsigned char *euc, unsigned char *p, int len) else if (ten >= 64 && ten <= 94) *p++ = ten + 0x40; else + { + if (noError) + break; report_invalid_encoding(PG_EUC_JIS_2004, (const char *) euc, len); + } } else *p++ = ten + 0x9e; @@ -149,8 +173,12 @@ euc_jis_20042shift_jis_2004(const unsigned char *euc, unsigned char *p, int len) else if (ku >= 63 && ku <= 94) *p++ = (ku + 0x181) >> 1; else + { + if (noError) + break; report_invalid_encoding(PG_EUC_JIS_2004, (const char *) euc, len); + } if (ku % 2) { @@ -159,20 +187,30 @@ euc_jis_20042shift_jis_2004(const unsigned char *euc, unsigned char *p, int len) else if (ten >= 64 && ten <= 94) *p++ = ten + 0x40; else + { + if (noError) + break; report_invalid_encoding(PG_EUC_JIS_2004, (const char *) euc, len); + } } else *p++ = ten + 0x9e; } else + { + if (noError) + break; report_invalid_encoding(PG_EUC_JIS_2004, (const char *) euc, len); + } euc += l; len -= l; } *p = '\0'; + + return euc - start; } /* @@ -212,9 +250,10 @@ get_ten(int b, int *ku) * SHIFT_JIS_2004 ---> EUC_JIS_2004 */ -static void -shift_jis_20042euc_jis_2004(const unsigned char *sjis, unsigned char *p, int len) +static int +shift_jis_20042euc_jis_2004(const unsigned char *sjis, unsigned char *p, int len, bool noError) { + const unsigned char *start = sjis; int c1; int ku, ten, @@ -230,8 +269,12 @@ shift_jis_20042euc_jis_2004(const unsigned char *sjis, unsigned char *p, int len { /* ASCII */ if (c1 == 0) + { + if (noError) + break; report_invalid_encoding(PG_SHIFT_JIS_2004, (const char *) sjis, len); + } *p++ = c1; sjis++; len--; @@ -241,8 +284,12 @@ shift_jis_20042euc_jis_2004(const unsigned 
char *sjis, unsigned char *p, int len l = pg_encoding_verifymbchar(PG_SHIFT_JIS_2004, (const char *) sjis, len); if (l < 0 || l > len) + { + if (noError) + break; report_invalid_encoding(PG_SHIFT_JIS_2004, (const char *) sjis, len); + } if (c1 >= 0xa1 && c1 <= 0xdf && l == 1) { @@ -266,8 +313,12 @@ shift_jis_20042euc_jis_2004(const unsigned char *sjis, unsigned char *p, int len ku = (c1 << 1) - 0x100; ten = get_ten(c2, &kubun); if (ten < 0) + { + if (noError) + break; report_invalid_encoding(PG_SHIFT_JIS_2004, (const char *) sjis, len); + } ku -= kubun; } else if (c1 >= 0xe0 && c1 <= 0xef) /* plane 1 62ku-94ku */ @@ -275,9 +326,12 @@ shift_jis_20042euc_jis_2004(const unsigned char *sjis, unsigned char *p, int len ku = (c1 << 1) - 0x180; ten = get_ten(c2, &kubun); if (ten < 0) + { + if (noError) + break; report_invalid_encoding(PG_SHIFT_JIS_2004, - (const char *) sjis, len); + } ku -= kubun; } else if (c1 >= 0xf0 && c1 <= 0xf3) /* plane 2 @@ -286,8 +340,12 @@ shift_jis_20042euc_jis_2004(const unsigned char *sjis, unsigned char *p, int len plane = 2; ten = get_ten(c2, &kubun); if (ten < 0) + { + if (noError) + break; report_invalid_encoding(PG_SHIFT_JIS_2004, (const char *) sjis, len); + } switch (c1) { case 0xf0: @@ -309,16 +367,24 @@ shift_jis_20042euc_jis_2004(const unsigned char *sjis, unsigned char *p, int len plane = 2; ten = get_ten(c2, &kubun); if (ten < 0) + { + if (noError) + break; report_invalid_encoding(PG_SHIFT_JIS_2004, (const char *) sjis, len); + } if (c1 == 0xf4 && kubun == 1) ku = 15; else ku = (c1 << 1) - 0x19a - kubun; } else + { + if (noError) + break; report_invalid_encoding(PG_SHIFT_JIS_2004, (const char *) sjis, len); + } if (plane == 2) *p++ = SS3; @@ -330,4 +396,6 @@ shift_jis_20042euc_jis_2004(const unsigned char *sjis, unsigned char *p, int len len -= l; } *p = '\0'; + + return sjis - start; } diff --git a/src/backend/utils/mb/conversion_procs/euc_cn_and_mic/euc_cn_and_mic.c 
b/src/backend/utils/mb/conversion_procs/euc_cn_and_mic/euc_cn_and_mic.c index e9bb896935f..09b3c2e75bf 100644 --- a/src/backend/utils/mb/conversion_procs/euc_cn_and_mic/euc_cn_and_mic.c +++ b/src/backend/utils/mb/conversion_procs/euc_cn_and_mic/euc_cn_and_mic.c @@ -26,13 +26,16 @@ PG_FUNCTION_INFO_V1(mic_to_euc_cn); * INTEGER, -- destination encoding id * CSTRING, -- source string (null terminated C string) * CSTRING, -- destination string (null terminated C string) - * INTEGER -- source string length - * ) returns VOID; + * INTEGER, -- source string length + * BOOL -- if true, don't throw an error if conversion fails + * ) returns INTEGER; + * + * Returns the number of bytes successfully converted. * ---------- */ -static void euc_cn2mic(const unsigned char *euc, unsigned char *p, int len); -static void mic2euc_cn(const unsigned char *mic, unsigned char *p, int len); +static int euc_cn2mic(const unsigned char *euc, unsigned char *p, int len, bool noError); +static int mic2euc_cn(const unsigned char *mic, unsigned char *p, int len, bool noError); Datum euc_cn_to_mic(PG_FUNCTION_ARGS) @@ -40,12 +43,14 @@ euc_cn_to_mic(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_EUC_CN, PG_MULE_INTERNAL); - euc_cn2mic(src, dest, len); + converted = euc_cn2mic(src, dest, len, noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -54,20 +59,23 @@ mic_to_euc_cn(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_MULE_INTERNAL, PG_EUC_CN); - mic2euc_cn(src, dest, len); + converted = mic2euc_cn(src, dest, len, noError); - PG_RETURN_VOID(); + 
PG_RETURN_INT32(converted); } /* * EUC_CN ---> MIC */ -static void -euc_cn2mic(const unsigned char *euc, unsigned char *p, int len) +static int +euc_cn2mic(const unsigned char *euc, unsigned char *p, int len, bool noError) { + const unsigned char *start = euc; int c1; while (len > 0) @@ -76,7 +84,11 @@ euc_cn2mic(const unsigned char *euc, unsigned char *p, int len) if (IS_HIGHBIT_SET(c1)) { if (len < 2 || !IS_HIGHBIT_SET(euc[1])) + { + if (noError) + break; report_invalid_encoding(PG_EUC_CN, (const char *) euc, len); + } *p++ = LC_GB2312_80; *p++ = c1; *p++ = euc[1]; @@ -86,21 +98,28 @@ euc_cn2mic(const unsigned char *euc, unsigned char *p, int len) else { /* should be ASCII */ if (c1 == 0) + { + if (noError) + break; report_invalid_encoding(PG_EUC_CN, (const char *) euc, len); + } *p++ = c1; euc++; len--; } } *p = '\0'; + + return euc - start; } /* * MIC ---> EUC_CN */ -static void -mic2euc_cn(const unsigned char *mic, unsigned char *p, int len) +static int +mic2euc_cn(const unsigned char *mic, unsigned char *p, int len, bool noError) { + const unsigned char *start = mic; int c1; while (len > 0) @@ -109,11 +128,19 @@ mic2euc_cn(const unsigned char *mic, unsigned char *p, int len) if (IS_HIGHBIT_SET(c1)) { if (c1 != LC_GB2312_80) + { + if (noError) + break; report_untranslatable_char(PG_MULE_INTERNAL, PG_EUC_CN, (const char *) mic, len); + } if (len < 3 || !IS_HIGHBIT_SET(mic[1]) || !IS_HIGHBIT_SET(mic[2])) + { + if (noError) + break; report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len); + } mic++; *p++ = *mic++; *p++ = *mic++; @@ -122,12 +149,18 @@ mic2euc_cn(const unsigned char *mic, unsigned char *p, int len) else { /* should be ASCII */ if (c1 == 0) + { + if (noError) + break; report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len); + } *p++ = c1; mic++; len--; } } *p = '\0'; + + return mic - start; } diff --git a/src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c 
b/src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c index 5059f917a98..2e68708893d 100644 --- a/src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c +++ b/src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c @@ -42,17 +42,20 @@ PG_FUNCTION_INFO_V1(mic_to_sjis); * INTEGER, -- destination encoding id * CSTRING, -- source string (null terminated C string) * CSTRING, -- destination string (null terminated C string) - * INTEGER -- source string length - * ) returns VOID; + * INTEGER, -- source string length + * BOOL -- if true, don't throw an error if conversion fails + * ) returns INTEGER; + * + * Returns the number of bytes successfully converted. * ---------- */ -static void sjis2mic(const unsigned char *sjis, unsigned char *p, int len); -static void mic2sjis(const unsigned char *mic, unsigned char *p, int len); -static void euc_jp2mic(const unsigned char *euc, unsigned char *p, int len); -static void mic2euc_jp(const unsigned char *mic, unsigned char *p, int len); -static void euc_jp2sjis(const unsigned char *mic, unsigned char *p, int len); -static void sjis2euc_jp(const unsigned char *mic, unsigned char *p, int len); +static int sjis2mic(const unsigned char *sjis, unsigned char *p, int len, bool noError); +static int mic2sjis(const unsigned char *mic, unsigned char *p, int len, bool noError); +static int euc_jp2mic(const unsigned char *euc, unsigned char *p, int len, bool noError); +static int mic2euc_jp(const unsigned char *mic, unsigned char *p, int len, bool noError); +static int euc_jp2sjis(const unsigned char *mic, unsigned char *p, int len, bool noError); +static int sjis2euc_jp(const unsigned char *mic, unsigned char *p, int len, bool noError); Datum euc_jp_to_sjis(PG_FUNCTION_ARGS) @@ -60,12 +63,14 @@ euc_jp_to_sjis(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool 
noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_EUC_JP, PG_SJIS); - euc_jp2sjis(src, dest, len); + converted = euc_jp2sjis(src, dest, len, noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -74,12 +79,14 @@ sjis_to_euc_jp(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_SJIS, PG_EUC_JP); - sjis2euc_jp(src, dest, len); + converted = sjis2euc_jp(src, dest, len, noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -88,12 +95,14 @@ euc_jp_to_mic(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_EUC_JP, PG_MULE_INTERNAL); - euc_jp2mic(src, dest, len); + converted = euc_jp2mic(src, dest, len, noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -102,12 +111,14 @@ mic_to_euc_jp(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_MULE_INTERNAL, PG_EUC_JP); - mic2euc_jp(src, dest, len); + converted = mic2euc_jp(src, dest, len, noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -116,12 +127,14 @@ sjis_to_mic(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_SJIS, PG_MULE_INTERNAL); - sjis2mic(src, dest, len); + converted = sjis2mic(src, dest, len, 
noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -130,20 +143,23 @@ mic_to_sjis(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_MULE_INTERNAL, PG_SJIS); - mic2sjis(src, dest, len); + converted = mic2sjis(src, dest, len, noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } /* * SJIS ---> MIC */ -static void -sjis2mic(const unsigned char *sjis, unsigned char *p, int len) +static int +sjis2mic(const unsigned char *sjis, unsigned char *p, int len, bool noError) { + const unsigned char *start = sjis; int c1, c2, i, @@ -167,7 +183,11 @@ sjis2mic(const unsigned char *sjis, unsigned char *p, int len) * JIS X0208, X0212, user defined extended characters */ if (len < 2 || !ISSJISHEAD(c1) || !ISSJISTAIL(sjis[1])) + { + if (noError) + break; report_invalid_encoding(PG_SJIS, (const char *) sjis, len); + } c2 = sjis[1]; k = (c1 << 8) + c2; if (k >= 0xed40 && k < 0xf040) @@ -257,21 +277,28 @@ sjis2mic(const unsigned char *sjis, unsigned char *p, int len) else { /* should be ASCII */ if (c1 == 0) + { + if (noError) + break; report_invalid_encoding(PG_SJIS, (const char *) sjis, len); + } *p++ = c1; sjis++; len--; } } *p = '\0'; + + return sjis - start; } /* * MIC ---> SJIS */ -static void -mic2sjis(const unsigned char *mic, unsigned char *p, int len) +static int +mic2sjis(const unsigned char *mic, unsigned char *p, int len, bool noError) { + const unsigned char *start = mic; int c1, c2, k, @@ -284,8 +311,12 @@ mic2sjis(const unsigned char *mic, unsigned char *p, int len) { /* ASCII */ if (c1 == 0) + { + if (noError) + break; report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len); + } *p++ = c1; mic++; len--; @@ -293,8 +324,12 @@ mic2sjis(const unsigned char *mic, unsigned char *p, int len) } l = 
pg_encoding_verifymbchar(PG_MULE_INTERNAL, (const char *) mic, len); if (l < 0) + { + if (noError) + break; report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len); + } if (c1 == LC_JISX0201K) *p++ = mic[1]; else if (c1 == LC_JISX0208) @@ -350,20 +385,27 @@ mic2sjis(const unsigned char *mic, unsigned char *p, int len) } } else + { + if (noError) + break; report_untranslatable_char(PG_MULE_INTERNAL, PG_SJIS, (const char *) mic, len); + } mic += l; len -= l; } *p = '\0'; + + return mic - start; } /* * EUC_JP ---> MIC */ -static void -euc_jp2mic(const unsigned char *euc, unsigned char *p, int len) +static int +euc_jp2mic(const unsigned char *euc, unsigned char *p, int len, bool noError) { + const unsigned char *start = euc; int c1; int l; @@ -374,8 +416,12 @@ euc_jp2mic(const unsigned char *euc, unsigned char *p, int len) { /* ASCII */ if (c1 == 0) + { + if (noError) + break; report_invalid_encoding(PG_EUC_JP, (const char *) euc, len); + } *p++ = c1; euc++; len--; @@ -383,8 +429,12 @@ euc_jp2mic(const unsigned char *euc, unsigned char *p, int len) } l = pg_encoding_verifymbchar(PG_EUC_JP, (const char *) euc, len); if (l < 0) + { + if (noError) + break; report_invalid_encoding(PG_EUC_JP, (const char *) euc, len); + } if (c1 == SS2) { /* 1 byte kana? 
*/ *p++ = LC_JISX0201K; @@ -406,14 +456,17 @@ euc_jp2mic(const unsigned char *euc, unsigned char *p, int len) len -= l; } *p = '\0'; + + return euc - start; } /* * MIC ---> EUC_JP */ -static void -mic2euc_jp(const unsigned char *mic, unsigned char *p, int len) +static int +mic2euc_jp(const unsigned char *mic, unsigned char *p, int len, bool noError) { + const unsigned char *start = mic; int c1; int l; @@ -424,8 +477,12 @@ mic2euc_jp(const unsigned char *mic, unsigned char *p, int len) { /* ASCII */ if (c1 == 0) + { + if (noError) + break; report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len); + } *p++ = c1; mic++; len--; @@ -433,8 +490,12 @@ mic2euc_jp(const unsigned char *mic, unsigned char *p, int len) } l = pg_encoding_verifymbchar(PG_MULE_INTERNAL, (const char *) mic, len); if (l < 0) + { + if (noError) + break; report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len); + } if (c1 == LC_JISX0201K) { *p++ = SS2; @@ -452,20 +513,27 @@ mic2euc_jp(const unsigned char *mic, unsigned char *p, int len) *p++ = mic[2]; } else + { + if (noError) + break; report_untranslatable_char(PG_MULE_INTERNAL, PG_EUC_JP, (const char *) mic, len); + } mic += l; len -= l; } *p = '\0'; + + return mic - start; } /* * EUC_JP -> SJIS */ -static void -euc_jp2sjis(const unsigned char *euc, unsigned char *p, int len) +static int +euc_jp2sjis(const unsigned char *euc, unsigned char *p, int len, bool noError) { + const unsigned char *start = euc; int c1, c2, k; @@ -478,8 +546,12 @@ euc_jp2sjis(const unsigned char *euc, unsigned char *p, int len) { /* ASCII */ if (c1 == 0) + { + if (noError) + break; report_invalid_encoding(PG_EUC_JP, (const char *) euc, len); + } *p++ = c1; euc++; len--; @@ -487,8 +559,12 @@ euc_jp2sjis(const unsigned char *euc, unsigned char *p, int len) } l = pg_encoding_verifymbchar(PG_EUC_JP, (const char *) euc, len); if (l < 0) + { + if (noError) + break; report_invalid_encoding(PG_EUC_JP, (const char *) euc, len); + } if (c1 == SS2) { /* hankaku 
kana? */ @@ -551,14 +627,17 @@ euc_jp2sjis(const unsigned char *euc, unsigned char *p, int len) len -= l; } *p = '\0'; + + return euc - start; } /* * SJIS ---> EUC_JP */ -static void -sjis2euc_jp(const unsigned char *sjis, unsigned char *p, int len) +static int +sjis2euc_jp(const unsigned char *sjis, unsigned char *p, int len, bool noError) { + const unsigned char *start = sjis; int c1, c2, i, @@ -573,8 +652,12 @@ sjis2euc_jp(const unsigned char *sjis, unsigned char *p, int len) { /* ASCII */ if (c1 == 0) + { + if (noError) + break; report_invalid_encoding(PG_SJIS, (const char *) sjis, len); + } *p++ = c1; sjis++; len--; @@ -582,8 +665,12 @@ sjis2euc_jp(const unsigned char *sjis, unsigned char *p, int len) } l = pg_encoding_verifymbchar(PG_SJIS, (const char *) sjis, len); if (l < 0) + { + if (noError) + break; report_invalid_encoding(PG_SJIS, (const char *) sjis, len); + } if (c1 >= 0xa1 && c1 <= 0xdf) { /* JIS X0201 (1 byte kana) */ @@ -680,4 +767,6 @@ sjis2euc_jp(const unsigned char *sjis, unsigned char *p, int len) len -= l; } *p = '\0'; + + return sjis - start; } diff --git a/src/backend/utils/mb/conversion_procs/euc_kr_and_mic/euc_kr_and_mic.c b/src/backend/utils/mb/conversion_procs/euc_kr_and_mic/euc_kr_and_mic.c index ac823d6c270..3b85f0c1861 100644 --- a/src/backend/utils/mb/conversion_procs/euc_kr_and_mic/euc_kr_and_mic.c +++ b/src/backend/utils/mb/conversion_procs/euc_kr_and_mic/euc_kr_and_mic.c @@ -26,13 +26,16 @@ PG_FUNCTION_INFO_V1(mic_to_euc_kr); * INTEGER, -- destination encoding id * CSTRING, -- source string (null terminated C string) * CSTRING, -- destination string (null terminated C string) - * INTEGER -- source string length - * ) returns VOID; + * INTEGER, -- source string length + * BOOL -- if true, don't throw an error if conversion fails + * ) returns INTEGER; + * + * Returns the number of bytes successfully converted. 
* ---------- */ -static void euc_kr2mic(const unsigned char *euc, unsigned char *p, int len); -static void mic2euc_kr(const unsigned char *mic, unsigned char *p, int len); +static int euc_kr2mic(const unsigned char *euc, unsigned char *p, int len, bool noError); +static int mic2euc_kr(const unsigned char *mic, unsigned char *p, int len, bool noError); Datum euc_kr_to_mic(PG_FUNCTION_ARGS) @@ -40,12 +43,14 @@ euc_kr_to_mic(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_EUC_KR, PG_MULE_INTERNAL); - euc_kr2mic(src, dest, len); + converted = euc_kr2mic(src, dest, len, noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -54,20 +59,23 @@ mic_to_euc_kr(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_MULE_INTERNAL, PG_EUC_KR); - mic2euc_kr(src, dest, len); + converted = mic2euc_kr(src, dest, len, noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } /* * EUC_KR ---> MIC */ -static void -euc_kr2mic(const unsigned char *euc, unsigned char *p, int len) +static int +euc_kr2mic(const unsigned char *euc, unsigned char *p, int len, bool noError) { + const unsigned char *start = euc; int c1; int l; @@ -78,8 +86,12 @@ euc_kr2mic(const unsigned char *euc, unsigned char *p, int len) { l = pg_encoding_verifymbchar(PG_EUC_KR, (const char *) euc, len); if (l != 2) + { + if (noError) + break; report_invalid_encoding(PG_EUC_KR, (const char *) euc, len); + } *p++ = LC_KS5601; *p++ = c1; *p++ = euc[1]; @@ -89,22 +101,29 @@ euc_kr2mic(const unsigned char *euc, unsigned char *p, int len) else { /* should be ASCII */ if (c1 == 0) + { + if (noError) + 
break; report_invalid_encoding(PG_EUC_KR, (const char *) euc, len); + } *p++ = c1; euc++; len--; } } *p = '\0'; + + return euc - start; } /* * MIC ---> EUC_KR */ -static void -mic2euc_kr(const unsigned char *mic, unsigned char *p, int len) +static int +mic2euc_kr(const unsigned char *mic, unsigned char *p, int len, bool noError) { + const unsigned char *start = mic; int c1; int l; @@ -115,8 +134,12 @@ mic2euc_kr(const unsigned char *mic, unsigned char *p, int len) { /* ASCII */ if (c1 == 0) + { + if (noError) + break; report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len); + } *p++ = c1; mic++; len--; @@ -124,18 +147,28 @@ mic2euc_kr(const unsigned char *mic, unsigned char *p, int len) } l = pg_encoding_verifymbchar(PG_MULE_INTERNAL, (const char *) mic, len); if (l < 0) + { + if (noError) + break; report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len); + } if (c1 == LC_KS5601) { *p++ = mic[1]; *p++ = mic[2]; } else + { + if (noError) + break; report_untranslatable_char(PG_MULE_INTERNAL, PG_EUC_KR, (const char *) mic, len); + } mic += l; len -= l; } *p = '\0'; + + return mic - start; } diff --git a/src/backend/utils/mb/conversion_procs/euc_tw_and_big5/euc_tw_and_big5.c b/src/backend/utils/mb/conversion_procs/euc_tw_and_big5/euc_tw_and_big5.c index 66c242d7f36..4bf8acda99f 100644 --- a/src/backend/utils/mb/conversion_procs/euc_tw_and_big5/euc_tw_and_big5.c +++ b/src/backend/utils/mb/conversion_procs/euc_tw_and_big5/euc_tw_and_big5.c @@ -32,17 +32,20 @@ PG_FUNCTION_INFO_V1(mic_to_big5); * INTEGER, -- destination encoding id * CSTRING, -- source string (null terminated C string) * CSTRING, -- destination string (null terminated C string) - * INTEGER -- source string length - * ) returns VOID; + * INTEGER, -- source string length + * BOOL -- if true, don't throw an error if conversion fails + * ) returns INTEGER; + * + * Returns the number of bytes successfully converted. 
* ---------- */ -static void euc_tw2big5(const unsigned char *euc, unsigned char *p, int len); -static void big52euc_tw(const unsigned char *euc, unsigned char *p, int len); -static void big52mic(const unsigned char *big5, unsigned char *p, int len); -static void mic2big5(const unsigned char *mic, unsigned char *p, int len); -static void euc_tw2mic(const unsigned char *euc, unsigned char *p, int len); -static void mic2euc_tw(const unsigned char *mic, unsigned char *p, int len); +static int euc_tw2big5(const unsigned char *euc, unsigned char *p, int len, bool noError); +static int big52euc_tw(const unsigned char *euc, unsigned char *p, int len, bool noError); +static int big52mic(const unsigned char *big5, unsigned char *p, int len, bool noError); +static int mic2big5(const unsigned char *mic, unsigned char *p, int len, bool noError); +static int euc_tw2mic(const unsigned char *euc, unsigned char *p, int len, bool noError); +static int mic2euc_tw(const unsigned char *mic, unsigned char *p, int len, bool noError); Datum euc_tw_to_big5(PG_FUNCTION_ARGS) @@ -50,12 +53,14 @@ euc_tw_to_big5(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_EUC_TW, PG_BIG5); - euc_tw2big5(src, dest, len); + converted = euc_tw2big5(src, dest, len, noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -64,12 +69,14 @@ big5_to_euc_tw(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_BIG5, PG_EUC_TW); - big52euc_tw(src, dest, len); + converted = big52euc_tw(src, dest, len, noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -78,12 +85,14 @@ 
euc_tw_to_mic(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_EUC_TW, PG_MULE_INTERNAL); - euc_tw2mic(src, dest, len); + converted = euc_tw2mic(src, dest, len, noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -92,12 +101,14 @@ mic_to_euc_tw(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_MULE_INTERNAL, PG_EUC_TW); - mic2euc_tw(src, dest, len); + converted = mic2euc_tw(src, dest, len, noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -106,12 +117,14 @@ big5_to_mic(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_BIG5, PG_MULE_INTERNAL); - big52mic(src, dest, len); + converted = big52mic(src, dest, len, noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -120,21 +133,24 @@ mic_to_big5(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_MULE_INTERNAL, PG_BIG5); - mic2big5(src, dest, len); + converted = mic2big5(src, dest, len, noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } /* * EUC_TW ---> Big5 */ -static void -euc_tw2big5(const unsigned char *euc, unsigned char *p, int len) +static int +euc_tw2big5(const unsigned char *euc, unsigned char *p, int len, bool noError) { + 
const unsigned char *start = euc; unsigned char c1; unsigned short big5buf, cnsBuf; @@ -149,8 +165,12 @@ euc_tw2big5(const unsigned char *euc, unsigned char *p, int len) /* Verify and decode the next EUC_TW input character */ l = pg_encoding_verifymbchar(PG_EUC_TW, (const char *) euc, len); if (l < 0) + { + if (noError) + break; report_invalid_encoding(PG_EUC_TW, (const char *) euc, len); + } if (c1 == SS2) { c1 = euc[1]; /* plane No. */ @@ -171,8 +191,12 @@ euc_tw2big5(const unsigned char *euc, unsigned char *p, int len) /* Write it out in Big5 */ big5buf = CNStoBIG5(cnsBuf, lc); if (big5buf == 0) + { + if (noError) + break; report_untranslatable_char(PG_EUC_TW, PG_BIG5, (const char *) euc, len); + } *p++ = (big5buf >> 8) & 0x00ff; *p++ = big5buf & 0x00ff; @@ -182,22 +206,29 @@ euc_tw2big5(const unsigned char *euc, unsigned char *p, int len) else { /* should be ASCII */ if (c1 == 0) + { + if (noError) + break; report_invalid_encoding(PG_EUC_TW, (const char *) euc, len); + } *p++ = c1; euc++; len--; } } *p = '\0'; + + return euc - start; } /* * Big5 ---> EUC_TW */ -static void -big52euc_tw(const unsigned char *big5, unsigned char *p, int len) +static int +big52euc_tw(const unsigned char *big5, unsigned char *p, int len, bool noError) { + const unsigned char *start = big5; unsigned short c1; unsigned short big5buf, cnsBuf; @@ -212,8 +243,12 @@ big52euc_tw(const unsigned char *big5, unsigned char *p, int len) { l = pg_encoding_verifymbchar(PG_BIG5, (const char *) big5, len); if (l < 0) + { + if (noError) + break; report_invalid_encoding(PG_BIG5, (const char *) big5, len); + } big5buf = (c1 << 8) | big5[1]; cnsBuf = BIG5toCNS(big5buf, &lc); @@ -237,8 +272,12 @@ big52euc_tw(const unsigned char *big5, unsigned char *p, int len) *p++ = cnsBuf & 0x00ff; } else + { + if (noError) + break; report_untranslatable_char(PG_BIG5, PG_EUC_TW, (const char *) big5, len); + } big5 += l; len -= l; @@ -256,14 +295,17 @@ big52euc_tw(const unsigned char *big5, unsigned char *p, int len) 
} } *p = '\0'; + + return big5 - start; } /* * EUC_TW ---> MIC */ -static void -euc_tw2mic(const unsigned char *euc, unsigned char *p, int len) +static int +euc_tw2mic(const unsigned char *euc, unsigned char *p, int len, bool noError) { + const unsigned char *start = euc; int c1; int l; @@ -274,8 +316,12 @@ euc_tw2mic(const unsigned char *euc, unsigned char *p, int len) { l = pg_encoding_verifymbchar(PG_EUC_TW, (const char *) euc, len); if (l < 0) + { + if (noError) + break; report_invalid_encoding(PG_EUC_TW, (const char *) euc, len); + } if (c1 == SS2) { c1 = euc[1]; /* plane No. */ @@ -304,22 +350,29 @@ euc_tw2mic(const unsigned char *euc, unsigned char *p, int len) else { /* should be ASCII */ if (c1 == 0) + { + if (noError) + break; report_invalid_encoding(PG_EUC_TW, (const char *) euc, len); + } *p++ = c1; euc++; len--; } } *p = '\0'; + + return euc - start; } /* * MIC ---> EUC_TW */ -static void -mic2euc_tw(const unsigned char *mic, unsigned char *p, int len) +static int +mic2euc_tw(const unsigned char *mic, unsigned char *p, int len, bool noError) { + const unsigned char *start = mic; int c1; int l; @@ -330,8 +383,12 @@ mic2euc_tw(const unsigned char *mic, unsigned char *p, int len) { /* ASCII */ if (c1 == 0) + { + if (noError) + break; report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len); + } *p++ = c1; mic++; len--; @@ -339,8 +396,12 @@ mic2euc_tw(const unsigned char *mic, unsigned char *p, int len) } l = pg_encoding_verifymbchar(PG_MULE_INTERNAL, (const char *) mic, len); if (l < 0) + { + if (noError) + break; report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len); + } if (c1 == LC_CNS11643_1) { *p++ = mic[1]; @@ -362,20 +423,27 @@ mic2euc_tw(const unsigned char *mic, unsigned char *p, int len) *p++ = mic[3]; } else + { + if (noError) + break; report_untranslatable_char(PG_MULE_INTERNAL, PG_EUC_TW, (const char *) mic, len); + } mic += l; len -= l; } *p = '\0'; + + return mic - start; } /* * Big5 ---> MIC */ -static void 
-big52mic(const unsigned char *big5, unsigned char *p, int len) +static int +big52mic(const unsigned char *big5, unsigned char *p, int len, bool noError) { + const unsigned char *start = big5; unsigned short c1; unsigned short big5buf, cnsBuf; @@ -389,8 +457,12 @@ big52mic(const unsigned char *big5, unsigned char *p, int len) { /* ASCII */ if (c1 == 0) + { + if (noError) + break; report_invalid_encoding(PG_BIG5, (const char *) big5, len); + } *p++ = c1; big5++; len--; @@ -398,8 +470,12 @@ big52mic(const unsigned char *big5, unsigned char *p, int len) } l = pg_encoding_verifymbchar(PG_BIG5, (const char *) big5, len); if (l < 0) + { + if (noError) + break; report_invalid_encoding(PG_BIG5, (const char *) big5, len); + } big5buf = (c1 << 8) | big5[1]; cnsBuf = BIG5toCNS(big5buf, &lc); if (lc != 0) @@ -412,20 +488,27 @@ big52mic(const unsigned char *big5, unsigned char *p, int len) *p++ = cnsBuf & 0x00ff; } else + { + if (noError) + break; report_untranslatable_char(PG_BIG5, PG_MULE_INTERNAL, (const char *) big5, len); + } big5 += l; len -= l; } *p = '\0'; + + return big5 - start; } /* * MIC ---> Big5 */ -static void -mic2big5(const unsigned char *mic, unsigned char *p, int len) +static int +mic2big5(const unsigned char *mic, unsigned char *p, int len, bool noError) { + const unsigned char *start = mic; unsigned short c1; unsigned short big5buf, cnsBuf; @@ -438,8 +521,12 @@ mic2big5(const unsigned char *mic, unsigned char *p, int len) { /* ASCII */ if (c1 == 0) + { + if (noError) + break; report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len); + } *p++ = c1; mic++; len--; @@ -447,8 +534,12 @@ mic2big5(const unsigned char *mic, unsigned char *p, int len) } l = pg_encoding_verifymbchar(PG_MULE_INTERNAL, (const char *) mic, len); if (l < 0) + { + if (noError) + break; report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len); + } if (c1 == LC_CNS11643_1 || c1 == LC_CNS11643_2 || c1 == LCPRV2_B) { if (c1 == LCPRV2_B) @@ -462,16 +553,26 @@ mic2big5(const 
unsigned char *mic, unsigned char *p, int len) } big5buf = CNStoBIG5(cnsBuf, c1); if (big5buf == 0) + { + if (noError) + break; report_untranslatable_char(PG_MULE_INTERNAL, PG_BIG5, (const char *) mic, len); + } *p++ = (big5buf >> 8) & 0x00ff; *p++ = big5buf & 0x00ff; } else + { + if (noError) + break; report_untranslatable_char(PG_MULE_INTERNAL, PG_BIG5, (const char *) mic, len); + } mic += l; len -= l; } *p = '\0'; + + return mic - start; } diff --git a/src/backend/utils/mb/conversion_procs/latin2_and_win1250/latin2_and_win1250.c b/src/backend/utils/mb/conversion_procs/latin2_and_win1250/latin2_and_win1250.c index 2e28e6780a5..8610fcb69aa 100644 --- a/src/backend/utils/mb/conversion_procs/latin2_and_win1250/latin2_and_win1250.c +++ b/src/backend/utils/mb/conversion_procs/latin2_and_win1250/latin2_and_win1250.c @@ -30,8 +30,11 @@ PG_FUNCTION_INFO_V1(win1250_to_latin2); * INTEGER, -- destination encoding id * CSTRING, -- source string (null terminated C string) * CSTRING, -- destination string (null terminated C string) - * INTEGER -- source string length - * ) returns VOID; + * INTEGER, -- source string length + * BOOL -- if true, don't throw an error if conversion fails + * ) returns INTEGER; + * + * Returns the number of bytes successfully converted. 
* ---------- */ @@ -82,12 +85,14 @@ latin2_to_mic(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_LATIN2, PG_MULE_INTERNAL); - latin2mic(src, dest, len, LC_ISO8859_2, PG_LATIN2); + converted = latin2mic(src, dest, len, LC_ISO8859_2, PG_LATIN2, noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -96,12 +101,14 @@ mic_to_latin2(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_MULE_INTERNAL, PG_LATIN2); - mic2latin(src, dest, len, LC_ISO8859_2, PG_LATIN2); + converted = mic2latin(src, dest, len, LC_ISO8859_2, PG_LATIN2, noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -110,13 +117,15 @@ win1250_to_mic(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_WIN1250, PG_MULE_INTERNAL); - latin2mic_with_table(src, dest, len, LC_ISO8859_2, PG_WIN1250, - win1250_2_iso88592); + converted = latin2mic_with_table(src, dest, len, LC_ISO8859_2, PG_WIN1250, + win1250_2_iso88592, noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -125,13 +134,15 @@ mic_to_win1250(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_MULE_INTERNAL, PG_WIN1250); - mic2latin_with_table(src, dest, len, LC_ISO8859_2, PG_WIN1250, - 
iso88592_2_win1250); + converted = mic2latin_with_table(src, dest, len, LC_ISO8859_2, PG_WIN1250, + iso88592_2_win1250, noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -140,12 +151,15 @@ latin2_to_win1250(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_LATIN2, PG_WIN1250); - local2local(src, dest, len, PG_LATIN2, PG_WIN1250, iso88592_2_win1250); + converted = local2local(src, dest, len, PG_LATIN2, PG_WIN1250, + iso88592_2_win1250, noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -154,10 +168,13 @@ win1250_to_latin2(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_WIN1250, PG_LATIN2); - local2local(src, dest, len, PG_WIN1250, PG_LATIN2, win1250_2_iso88592); + converted = local2local(src, dest, len, PG_WIN1250, PG_LATIN2, + win1250_2_iso88592, noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } diff --git a/src/backend/utils/mb/conversion_procs/latin_and_mic/latin_and_mic.c b/src/backend/utils/mb/conversion_procs/latin_and_mic/latin_and_mic.c index bc651410f21..bff27d1c295 100644 --- a/src/backend/utils/mb/conversion_procs/latin_and_mic/latin_and_mic.c +++ b/src/backend/utils/mb/conversion_procs/latin_and_mic/latin_and_mic.c @@ -30,8 +30,11 @@ PG_FUNCTION_INFO_V1(mic_to_latin4); * INTEGER, -- destination encoding id * CSTRING, -- source string (null terminated C string) * CSTRING, -- destination string (null terminated C string) - * INTEGER -- source string length - * ) returns VOID; + * INTEGER, -- source string length + * BOOL -- if true, don't throw an error if conversion fails + * ) returns 
INTEGER; + * + * Returns the number of bytes successfully converted. * ---------- */ @@ -42,12 +45,14 @@ latin1_to_mic(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_LATIN1, PG_MULE_INTERNAL); - latin2mic(src, dest, len, LC_ISO8859_1, PG_LATIN1); + converted = latin2mic(src, dest, len, LC_ISO8859_1, PG_LATIN1, noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -56,12 +61,14 @@ mic_to_latin1(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_MULE_INTERNAL, PG_LATIN1); - mic2latin(src, dest, len, LC_ISO8859_1, PG_LATIN1); + converted = mic2latin(src, dest, len, LC_ISO8859_1, PG_LATIN1, noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -70,12 +77,14 @@ latin3_to_mic(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_LATIN3, PG_MULE_INTERNAL); - latin2mic(src, dest, len, LC_ISO8859_3, PG_LATIN3); + converted = latin2mic(src, dest, len, LC_ISO8859_3, PG_LATIN3, noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -84,12 +93,14 @@ mic_to_latin3(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_MULE_INTERNAL, PG_LATIN3); - mic2latin(src, dest, len, LC_ISO8859_3, PG_LATIN3); + converted = 
mic2latin(src, dest, len, LC_ISO8859_3, PG_LATIN3, noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -98,12 +109,14 @@ latin4_to_mic(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_LATIN4, PG_MULE_INTERNAL); - latin2mic(src, dest, len, LC_ISO8859_4, PG_LATIN4); + converted = latin2mic(src, dest, len, LC_ISO8859_4, PG_LATIN4, noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -112,10 +125,12 @@ mic_to_latin4(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_MULE_INTERNAL, PG_LATIN4); - mic2latin(src, dest, len, LC_ISO8859_4, PG_LATIN4); + converted = mic2latin(src, dest, len, LC_ISO8859_4, PG_LATIN4, noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_big5/utf8_and_big5.c b/src/backend/utils/mb/conversion_procs/utf8_and_big5/utf8_and_big5.c index d6067cdc24e..3838b15cab9 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_big5/utf8_and_big5.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_big5/utf8_and_big5.c @@ -28,8 +28,11 @@ PG_FUNCTION_INFO_V1(utf8_to_big5); * INTEGER, -- destination encoding id * CSTRING, -- source string (null terminated C string) * CSTRING, -- destination string (null terminated C string) - * INTEGER -- source string length - * ) returns VOID; + * INTEGER, -- source string length + * BOOL -- if true, don't throw an error if conversion fails + * ) returns INTEGER; + * + * Returns the number of bytes successfully converted. 
* ---------- */ Datum @@ -38,16 +41,19 @@ big5_to_utf8(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_BIG5, PG_UTF8); - LocalToUtf(src, len, dest, - &big5_to_unicode_tree, - NULL, 0, - NULL, - PG_BIG5); + converted = LocalToUtf(src, len, dest, + &big5_to_unicode_tree, + NULL, 0, + NULL, + PG_BIG5, + noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -56,14 +62,17 @@ utf8_to_big5(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_BIG5); - UtfToLocal(src, len, dest, - &big5_from_unicode_tree, - NULL, 0, - NULL, - PG_BIG5); + converted = UtfToLocal(src, len, dest, + &big5_from_unicode_tree, + NULL, 0, + NULL, + PG_BIG5, + noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_cyrillic/utf8_and_cyrillic.c b/src/backend/utils/mb/conversion_procs/utf8_and_cyrillic/utf8_and_cyrillic.c index ed90e8e682e..75719fe5f1b 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_cyrillic/utf8_and_cyrillic.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_cyrillic/utf8_and_cyrillic.c @@ -33,8 +33,11 @@ PG_FUNCTION_INFO_V1(koi8u_to_utf8); * INTEGER, -- destination encoding id * CSTRING, -- source string (null terminated C string) * CSTRING, -- destination string (null terminated C string) - * INTEGER -- source string length - * ) returns VOID; + * INTEGER, -- source string length + * BOOL -- if true, don't throw an error if conversion fails + * ) returns INTEGER; + * + * Returns the number of bytes successfully converted. 
* ---------- */ @@ -44,16 +47,19 @@ utf8_to_koi8r(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_KOI8R); - UtfToLocal(src, len, dest, - &koi8r_from_unicode_tree, - NULL, 0, - NULL, - PG_KOI8R); + converted = UtfToLocal(src, len, dest, + &koi8r_from_unicode_tree, + NULL, 0, + NULL, + PG_KOI8R, + noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -62,16 +68,19 @@ koi8r_to_utf8(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_KOI8R, PG_UTF8); - LocalToUtf(src, len, dest, - &koi8r_to_unicode_tree, - NULL, 0, - NULL, - PG_KOI8R); + converted = LocalToUtf(src, len, dest, + &koi8r_to_unicode_tree, + NULL, 0, + NULL, + PG_KOI8R, + noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -80,16 +89,19 @@ utf8_to_koi8u(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_KOI8U); - UtfToLocal(src, len, dest, - &koi8u_from_unicode_tree, - NULL, 0, - NULL, - PG_KOI8U); + converted = UtfToLocal(src, len, dest, + &koi8u_from_unicode_tree, + NULL, 0, + NULL, + PG_KOI8U, + noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -98,14 +110,17 @@ koi8u_to_utf8(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; 
CHECK_ENCODING_CONVERSION_ARGS(PG_KOI8U, PG_UTF8); - LocalToUtf(src, len, dest, - &koi8u_to_unicode_tree, - NULL, 0, - NULL, - PG_KOI8U); + converted = LocalToUtf(src, len, dest, + &koi8u_to_unicode_tree, + NULL, 0, + NULL, + PG_KOI8U, + noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_euc2004/utf8_and_euc2004.c b/src/backend/utils/mb/conversion_procs/utf8_and_euc2004/utf8_and_euc2004.c index d699affce47..5391001951a 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_euc2004/utf8_and_euc2004.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_euc2004/utf8_and_euc2004.c @@ -28,8 +28,11 @@ PG_FUNCTION_INFO_V1(utf8_to_euc_jis_2004); * INTEGER, -- destination encoding id * CSTRING, -- source string (null terminated C string) * CSTRING, -- destination string (null terminated C string) - * INTEGER -- source string length - * ) returns VOID; + * INTEGER, -- source string length + * BOOL -- if true, don't throw an error if conversion fails + * ) returns INTEGER; + * + * Returns the number of bytes successfully converted. 
* ---------- */ Datum @@ -38,16 +41,19 @@ euc_jis_2004_to_utf8(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_EUC_JIS_2004, PG_UTF8); - LocalToUtf(src, len, dest, - &euc_jis_2004_to_unicode_tree, - LUmapEUC_JIS_2004_combined, lengthof(LUmapEUC_JIS_2004_combined), - NULL, - PG_EUC_JIS_2004); + converted = LocalToUtf(src, len, dest, + &euc_jis_2004_to_unicode_tree, + LUmapEUC_JIS_2004_combined, lengthof(LUmapEUC_JIS_2004_combined), + NULL, + PG_EUC_JIS_2004, + noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -56,14 +62,17 @@ utf8_to_euc_jis_2004(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_EUC_JIS_2004); - UtfToLocal(src, len, dest, - &euc_jis_2004_from_unicode_tree, - ULmapEUC_JIS_2004_combined, lengthof(ULmapEUC_JIS_2004_combined), - NULL, - PG_EUC_JIS_2004); + converted = UtfToLocal(src, len, dest, + &euc_jis_2004_from_unicode_tree, + ULmapEUC_JIS_2004_combined, lengthof(ULmapEUC_JIS_2004_combined), + NULL, + PG_EUC_JIS_2004, + noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_euc_cn/utf8_and_euc_cn.c b/src/backend/utils/mb/conversion_procs/utf8_and_euc_cn/utf8_and_euc_cn.c index d7c0ba6a58b..c87d1bf2398 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_euc_cn/utf8_and_euc_cn.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_euc_cn/utf8_and_euc_cn.c @@ -28,8 +28,11 @@ PG_FUNCTION_INFO_V1(utf8_to_euc_cn); * INTEGER, -- destination encoding id * CSTRING, -- source string (null terminated C string) * CSTRING, -- destination string 
(null terminated C string) - * INTEGER -- source string length - * ) returns VOID; + * INTEGER, -- source string length + * BOOL -- if true, don't throw an error if conversion fails + * ) returns INTEGER; + * + * Returns the number of bytes successfully converted. * ---------- */ Datum @@ -38,16 +41,19 @@ euc_cn_to_utf8(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_EUC_CN, PG_UTF8); - LocalToUtf(src, len, dest, - &euc_cn_to_unicode_tree, - NULL, 0, - NULL, - PG_EUC_CN); + converted = LocalToUtf(src, len, dest, + &euc_cn_to_unicode_tree, + NULL, 0, + NULL, + PG_EUC_CN, + noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -56,14 +62,17 @@ utf8_to_euc_cn(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_EUC_CN); - UtfToLocal(src, len, dest, - &euc_cn_from_unicode_tree, - NULL, 0, - NULL, - PG_EUC_CN); + converted = UtfToLocal(src, len, dest, + &euc_cn_from_unicode_tree, + NULL, 0, + NULL, + PG_EUC_CN, + noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_euc_jp/utf8_and_euc_jp.c b/src/backend/utils/mb/conversion_procs/utf8_and_euc_jp/utf8_and_euc_jp.c index 13a3a23e77b..6a55134db21 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_euc_jp/utf8_and_euc_jp.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_euc_jp/utf8_and_euc_jp.c @@ -28,8 +28,11 @@ PG_FUNCTION_INFO_V1(utf8_to_euc_jp); * INTEGER, -- destination encoding id * CSTRING, -- source string (null terminated C string) * CSTRING, -- destination string (null terminated C string) - * INTEGER 
-- source string length - * ) returns VOID; + * INTEGER, -- source string length + * BOOL -- if true, don't throw an error if conversion fails + * ) returns INTEGER; + * + * Returns the number of bytes successfully converted. * ---------- */ Datum @@ -38,16 +41,19 @@ euc_jp_to_utf8(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_EUC_JP, PG_UTF8); - LocalToUtf(src, len, dest, - &euc_jp_to_unicode_tree, - NULL, 0, - NULL, - PG_EUC_JP); + converted = LocalToUtf(src, len, dest, + &euc_jp_to_unicode_tree, + NULL, 0, + NULL, + PG_EUC_JP, + noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -56,14 +62,17 @@ utf8_to_euc_jp(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_EUC_JP); - UtfToLocal(src, len, dest, - &euc_jp_from_unicode_tree, - NULL, 0, - NULL, - PG_EUC_JP); + converted = UtfToLocal(src, len, dest, + &euc_jp_from_unicode_tree, + NULL, 0, + NULL, + PG_EUC_JP, + noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_euc_kr/utf8_and_euc_kr.c b/src/backend/utils/mb/conversion_procs/utf8_and_euc_kr/utf8_and_euc_kr.c index 1bbb8aaef7b..fe1924e2fec 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_euc_kr/utf8_and_euc_kr.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_euc_kr/utf8_and_euc_kr.c @@ -28,8 +28,11 @@ PG_FUNCTION_INFO_V1(utf8_to_euc_kr); * INTEGER, -- destination encoding id * CSTRING, -- source string (null terminated C string) * CSTRING, -- destination string (null terminated C string) - * INTEGER -- source string length - * ) returns 
VOID; + * INTEGER, -- source string length + * BOOL -- if true, don't throw an error if conversion fails + * ) returns INTEGER; + * + * Returns the number of bytes successfully converted. * ---------- */ Datum @@ -38,16 +41,19 @@ euc_kr_to_utf8(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_EUC_KR, PG_UTF8); - LocalToUtf(src, len, dest, - &euc_kr_to_unicode_tree, - NULL, 0, - NULL, - PG_EUC_KR); + converted = LocalToUtf(src, len, dest, + &euc_kr_to_unicode_tree, + NULL, 0, + NULL, + PG_EUC_KR, + noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -56,14 +62,17 @@ utf8_to_euc_kr(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_EUC_KR); - UtfToLocal(src, len, dest, - &euc_kr_from_unicode_tree, - NULL, 0, - NULL, - PG_EUC_KR); + converted = UtfToLocal(src, len, dest, + &euc_kr_from_unicode_tree, + NULL, 0, + NULL, + PG_EUC_KR, + noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_euc_tw/utf8_and_euc_tw.c b/src/backend/utils/mb/conversion_procs/utf8_and_euc_tw/utf8_and_euc_tw.c index 9830045dccd..68215659b57 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_euc_tw/utf8_and_euc_tw.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_euc_tw/utf8_and_euc_tw.c @@ -28,8 +28,11 @@ PG_FUNCTION_INFO_V1(utf8_to_euc_tw); * INTEGER, -- destination encoding id * CSTRING, -- source string (null terminated C string) * CSTRING, -- destination string (null terminated C string) - * INTEGER -- source string length - * ) returns VOID; + * INTEGER, -- source string 
length + * BOOL -- if true, don't throw an error if conversion fails + * ) returns INTEGER; + * + * Returns the number of bytes successfully converted. * ---------- */ Datum @@ -38,16 +41,19 @@ euc_tw_to_utf8(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_EUC_TW, PG_UTF8); - LocalToUtf(src, len, dest, - &euc_tw_to_unicode_tree, - NULL, 0, - NULL, - PG_EUC_TW); + converted = LocalToUtf(src, len, dest, + &euc_tw_to_unicode_tree, + NULL, 0, + NULL, + PG_EUC_TW, + noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -56,14 +62,17 @@ utf8_to_euc_tw(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_EUC_TW); - UtfToLocal(src, len, dest, - &euc_tw_from_unicode_tree, - NULL, 0, - NULL, - PG_EUC_TW); + converted = UtfToLocal(src, len, dest, + &euc_tw_from_unicode_tree, + NULL, 0, + NULL, + PG_EUC_TW, + noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c b/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c index f86ecf27424..e1a59c39a4d 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c @@ -183,8 +183,11 @@ conv_utf8_to_18030(uint32 code) * INTEGER, -- destination encoding id * CSTRING, -- source string (null terminated C string) * CSTRING, -- destination string (null terminated C string) - * INTEGER -- source string length - * ) returns VOID; + * INTEGER, -- source string length + * BOOL -- if true, don't 
throw an error if conversion fails + * ) returns INTEGER; + * + * Returns the number of bytes successfully converted. * ---------- */ Datum @@ -193,16 +196,19 @@ gb18030_to_utf8(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_GB18030, PG_UTF8); - LocalToUtf(src, len, dest, - &gb18030_to_unicode_tree, - NULL, 0, - conv_18030_to_utf8, - PG_GB18030); + converted = LocalToUtf(src, len, dest, + &gb18030_to_unicode_tree, + NULL, 0, + conv_18030_to_utf8, + PG_GB18030, + noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -211,14 +217,17 @@ utf8_to_gb18030(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_GB18030); - UtfToLocal(src, len, dest, - &gb18030_from_unicode_tree, - NULL, 0, - conv_utf8_to_18030, - PG_GB18030); + converted = UtfToLocal(src, len, dest, + &gb18030_from_unicode_tree, + NULL, 0, + conv_utf8_to_18030, + PG_GB18030, + noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_gbk/utf8_and_gbk.c b/src/backend/utils/mb/conversion_procs/utf8_and_gbk/utf8_and_gbk.c index 2ab8b16c8a8..881386d5347 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_gbk/utf8_and_gbk.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_gbk/utf8_and_gbk.c @@ -28,8 +28,11 @@ PG_FUNCTION_INFO_V1(utf8_to_gbk); * INTEGER, -- destination encoding id * CSTRING, -- source string (null terminated C string) * CSTRING, -- destination string (null terminated C string) - * INTEGER -- source string length - * ) returns VOID; + * INTEGER, -- source string length + * BOOL -- if true, 
don't throw an error if conversion fails + * ) returns INTEGER; + * + * Returns the number of bytes successfully converted. * ---------- */ Datum @@ -38,16 +41,19 @@ gbk_to_utf8(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_GBK, PG_UTF8); - LocalToUtf(src, len, dest, - &gbk_to_unicode_tree, - NULL, 0, - NULL, - PG_GBK); + converted = LocalToUtf(src, len, dest, + &gbk_to_unicode_tree, + NULL, 0, + NULL, + PG_GBK, + noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -56,14 +62,17 @@ utf8_to_gbk(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_GBK); - UtfToLocal(src, len, dest, - &gbk_from_unicode_tree, - NULL, 0, - NULL, - PG_GBK); + converted = UtfToLocal(src, len, dest, + &gbk_from_unicode_tree, + NULL, 0, + NULL, + PG_GBK, + noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_iso8859/utf8_and_iso8859.c b/src/backend/utils/mb/conversion_procs/utf8_and_iso8859/utf8_and_iso8859.c index 3e49f67ea2f..d93a521badf 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_iso8859/utf8_and_iso8859.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_iso8859/utf8_and_iso8859.c @@ -52,8 +52,11 @@ PG_FUNCTION_INFO_V1(utf8_to_iso8859); * INTEGER, -- destination encoding id * CSTRING, -- source string (null terminated C string) * CSTRING, -- destination string (null terminated C string) - * INTEGER -- source string length - * ) returns VOID; + * INTEGER, -- source string length + * BOOL -- if true, don't throw an error if conversion fails + * ) returns INTEGER; + 
* + * Returns the number of bytes successfully converted. * ---------- */ @@ -100,6 +103,7 @@ iso8859_to_utf8(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); int i; CHECK_ENCODING_CONVERSION_ARGS(-1, PG_UTF8); @@ -108,12 +112,15 @@ iso8859_to_utf8(PG_FUNCTION_ARGS) { if (encoding == maps[i].encoding) { - LocalToUtf(src, len, dest, - maps[i].map1, - NULL, 0, - NULL, - encoding); - PG_RETURN_VOID(); + int converted; + + converted = LocalToUtf(src, len, dest, + maps[i].map1, + NULL, 0, + NULL, + encoding, + noError); + PG_RETURN_INT32(converted); } } @@ -122,7 +129,7 @@ iso8859_to_utf8(PG_FUNCTION_ARGS) errmsg("unexpected encoding ID %d for ISO 8859 character sets", encoding))); - PG_RETURN_VOID(); + PG_RETURN_INT32(0); } Datum @@ -132,6 +139,7 @@ utf8_to_iso8859(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); int i; CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, -1); @@ -140,12 +148,15 @@ utf8_to_iso8859(PG_FUNCTION_ARGS) { if (encoding == maps[i].encoding) { - UtfToLocal(src, len, dest, - maps[i].map2, - NULL, 0, - NULL, - encoding); - PG_RETURN_VOID(); + int converted; + + converted = UtfToLocal(src, len, dest, + maps[i].map2, + NULL, 0, + NULL, + encoding, + noError); + PG_RETURN_INT32(converted); } } @@ -154,5 +165,5 @@ utf8_to_iso8859(PG_FUNCTION_ARGS) errmsg("unexpected encoding ID %d for ISO 8859 character sets", encoding))); - PG_RETURN_VOID(); + PG_RETURN_INT32(0); } diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_iso8859_1/utf8_and_iso8859_1.c b/src/backend/utils/mb/conversion_procs/utf8_and_iso8859_1/utf8_and_iso8859_1.c index 67e713cca11..d0dc4cca378 100644 --- 
a/src/backend/utils/mb/conversion_procs/utf8_and_iso8859_1/utf8_and_iso8859_1.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_iso8859_1/utf8_and_iso8859_1.c @@ -26,8 +26,11 @@ PG_FUNCTION_INFO_V1(utf8_to_iso8859_1); * INTEGER, -- destination encoding id * CSTRING, -- source string (null terminated C string) * CSTRING, -- destination string (null terminated C string) - * INTEGER -- source string length - * ) returns VOID; + * INTEGER, -- source string length + * BOOL -- if true, don't throw an error if conversion fails + * ) returns INTEGER; + * + * Returns the number of bytes successfully converted. * ---------- */ @@ -37,6 +40,8 @@ iso8859_1_to_utf8(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + unsigned char *start = src; unsigned short c; CHECK_ENCODING_CONVERSION_ARGS(PG_LATIN1, PG_UTF8); @@ -45,7 +50,11 @@ iso8859_1_to_utf8(PG_FUNCTION_ARGS) { c = *src; if (c == 0) + { + if (noError) + break; report_invalid_encoding(PG_LATIN1, (const char *) src, len); + } if (!IS_HIGHBIT_SET(c)) *dest++ = c; else @@ -58,7 +67,7 @@ iso8859_1_to_utf8(PG_FUNCTION_ARGS) } *dest = '\0'; - PG_RETURN_VOID(); + PG_RETURN_INT32(src - start); } Datum @@ -67,6 +76,8 @@ utf8_to_iso8859_1(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + unsigned char *start = src; unsigned short c, c1; @@ -76,7 +87,11 @@ utf8_to_iso8859_1(PG_FUNCTION_ARGS) { c = *src; if (c == 0) + { + if (noError) + break; report_invalid_encoding(PG_UTF8, (const char *) src, len); + } /* fast path for ASCII-subset characters */ if (!IS_HIGHBIT_SET(c)) { @@ -89,10 +104,18 @@ utf8_to_iso8859_1(PG_FUNCTION_ARGS) int l = pg_utf_mblen(src); if (l > len || !pg_utf8_islegal(src, l)) + { + if (noError) + 
break; report_invalid_encoding(PG_UTF8, (const char *) src, len); + } if (l != 2) + { + if (noError) + break; report_untranslatable_char(PG_UTF8, PG_LATIN1, (const char *) src, len); + } c1 = src[1] & 0x3f; c = ((c & 0x1f) << 6) | c1; if (c >= 0x80 && c <= 0xff) @@ -102,11 +125,15 @@ utf8_to_iso8859_1(PG_FUNCTION_ARGS) len -= 2; } else + { + if (noError) + break; report_untranslatable_char(PG_UTF8, PG_LATIN1, (const char *) src, len); + } } } *dest = '\0'; - PG_RETURN_VOID(); + PG_RETURN_INT32(src - start); } diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_johab/utf8_and_johab.c b/src/backend/utils/mb/conversion_procs/utf8_and_johab/utf8_and_johab.c index 578f5df4e7f..317daa2d5ee 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_johab/utf8_and_johab.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_johab/utf8_and_johab.c @@ -28,8 +28,11 @@ PG_FUNCTION_INFO_V1(utf8_to_johab); * INTEGER, -- destination encoding id * CSTRING, -- source string (null terminated C string) * CSTRING, -- destination string (null terminated C string) - * INTEGER -- source string length - * ) returns VOID; + * INTEGER, -- source string length + * BOOL -- if true, don't throw an error if conversion fails + * ) returns INTEGER; + * + * Returns the number of bytes successfully converted. 
* ---------- */ Datum @@ -38,16 +41,19 @@ johab_to_utf8(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_JOHAB, PG_UTF8); - LocalToUtf(src, len, dest, - &johab_to_unicode_tree, - NULL, 0, - NULL, - PG_JOHAB); + converted = LocalToUtf(src, len, dest, + &johab_to_unicode_tree, + NULL, 0, + NULL, + PG_JOHAB, + noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -56,14 +62,17 @@ utf8_to_johab(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_JOHAB); - UtfToLocal(src, len, dest, - &johab_from_unicode_tree, - NULL, 0, - NULL, - PG_JOHAB); + converted = UtfToLocal(src, len, dest, + &johab_from_unicode_tree, + NULL, 0, + NULL, + PG_JOHAB, + noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_sjis/utf8_and_sjis.c b/src/backend/utils/mb/conversion_procs/utf8_and_sjis/utf8_and_sjis.c index dd9fc2975ad..4c9348aba59 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_sjis/utf8_and_sjis.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_sjis/utf8_and_sjis.c @@ -28,8 +28,11 @@ PG_FUNCTION_INFO_V1(utf8_to_sjis); * INTEGER, -- destination encoding id * CSTRING, -- source string (null terminated C string) * CSTRING, -- destination string (null terminated C string) - * INTEGER -- source string length - * ) returns VOID; + * INTEGER, -- source string length + * BOOL -- if true, don't throw an error if conversion fails + * ) returns INTEGER; + * + * Returns the number of bytes successfully converted. 
* ---------- */ Datum @@ -38,16 +41,19 @@ sjis_to_utf8(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_SJIS, PG_UTF8); - LocalToUtf(src, len, dest, - &sjis_to_unicode_tree, - NULL, 0, - NULL, - PG_SJIS); + converted = LocalToUtf(src, len, dest, + &sjis_to_unicode_tree, + NULL, 0, + NULL, + PG_SJIS, + noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -56,14 +62,17 @@ utf8_to_sjis(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_SJIS); - UtfToLocal(src, len, dest, - &sjis_from_unicode_tree, - NULL, 0, - NULL, - PG_SJIS); + converted = UtfToLocal(src, len, dest, + &sjis_from_unicode_tree, + NULL, 0, + NULL, + PG_SJIS, + noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_sjis2004/utf8_and_sjis2004.c b/src/backend/utils/mb/conversion_procs/utf8_and_sjis2004/utf8_and_sjis2004.c index 4bcc886d674..1fffdc5930c 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_sjis2004/utf8_and_sjis2004.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_sjis2004/utf8_and_sjis2004.c @@ -28,8 +28,11 @@ PG_FUNCTION_INFO_V1(utf8_to_shift_jis_2004); * INTEGER, -- destination encoding id * CSTRING, -- source string (null terminated C string) * CSTRING, -- destination string (null terminated C string) - * INTEGER -- source string length - * ) returns VOID; + * INTEGER, -- source string length + * BOOL -- if true, don't throw an error if conversion fails + * ) returns INTEGER; + * + * Returns the number of bytes successfully converted. 
* ---------- */ Datum @@ -38,16 +41,19 @@ shift_jis_2004_to_utf8(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_SHIFT_JIS_2004, PG_UTF8); - LocalToUtf(src, len, dest, - &shift_jis_2004_to_unicode_tree, - LUmapSHIFT_JIS_2004_combined, lengthof(LUmapSHIFT_JIS_2004_combined), - NULL, - PG_SHIFT_JIS_2004); + converted = LocalToUtf(src, len, dest, + &shift_jis_2004_to_unicode_tree, + LUmapSHIFT_JIS_2004_combined, lengthof(LUmapSHIFT_JIS_2004_combined), + NULL, + PG_SHIFT_JIS_2004, + noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -56,14 +62,17 @@ utf8_to_shift_jis_2004(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_SHIFT_JIS_2004); - UtfToLocal(src, len, dest, - &shift_jis_2004_from_unicode_tree, - ULmapSHIFT_JIS_2004_combined, lengthof(ULmapSHIFT_JIS_2004_combined), - NULL, - PG_SHIFT_JIS_2004); + converted = UtfToLocal(src, len, dest, + &shift_jis_2004_from_unicode_tree, + ULmapSHIFT_JIS_2004_combined, lengthof(ULmapSHIFT_JIS_2004_combined), + NULL, + PG_SHIFT_JIS_2004, + noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_uhc/utf8_and_uhc.c b/src/backend/utils/mb/conversion_procs/utf8_and_uhc/utf8_and_uhc.c index c8e512994a1..d9471dad097 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_uhc/utf8_and_uhc.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_uhc/utf8_and_uhc.c @@ -28,8 +28,11 @@ PG_FUNCTION_INFO_V1(utf8_to_uhc); * INTEGER, -- destination encoding id * CSTRING, -- source string (null terminated C string) * CSTRING, -- 
destination string (null terminated C string) - * INTEGER -- source string length - * ) returns VOID; + * INTEGER, -- source string length + * BOOL -- if true, don't throw an error if conversion fails + * ) returns INTEGER; + * + * Returns the number of bytes successfully converted. * ---------- */ Datum @@ -38,16 +41,19 @@ uhc_to_utf8(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_UHC, PG_UTF8); - LocalToUtf(src, len, dest, - &uhc_to_unicode_tree, - NULL, 0, - NULL, - PG_UHC); + converted = LocalToUtf(src, len, dest, + &uhc_to_unicode_tree, + NULL, 0, + NULL, + PG_UHC, + noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } Datum @@ -56,14 +62,17 @@ utf8_to_uhc(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); + int converted; CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, PG_UHC); - UtfToLocal(src, len, dest, - &uhc_from_unicode_tree, - NULL, 0, - NULL, - PG_UHC); + converted = UtfToLocal(src, len, dest, + &uhc_from_unicode_tree, + NULL, 0, + NULL, + PG_UHC, + noError); - PG_RETURN_VOID(); + PG_RETURN_INT32(converted); } diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_win/utf8_and_win.c b/src/backend/utils/mb/conversion_procs/utf8_and_win/utf8_and_win.c index 0c9493dee56..110ba5677d0 100644 --- a/src/backend/utils/mb/conversion_procs/utf8_and_win/utf8_and_win.c +++ b/src/backend/utils/mb/conversion_procs/utf8_and_win/utf8_and_win.c @@ -48,8 +48,11 @@ PG_FUNCTION_INFO_V1(utf8_to_win); * INTEGER, -- destination encoding id * CSTRING, -- source string (null terminated C string) * CSTRING, -- destination string (null terminated C string) - * INTEGER -- source string length - * ) returns VOID; 
+ * INTEGER, -- source string length + * BOOL -- if true, don't throw an error if conversion fails + * ) returns INTEGER; + * + * Returns the number of bytes successfully converted. * ---------- */ @@ -81,6 +84,7 @@ win_to_utf8(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); int i; CHECK_ENCODING_CONVERSION_ARGS(-1, PG_UTF8); @@ -89,12 +93,15 @@ win_to_utf8(PG_FUNCTION_ARGS) { if (encoding == maps[i].encoding) { - LocalToUtf(src, len, dest, - maps[i].map1, - NULL, 0, - NULL, - encoding); - PG_RETURN_VOID(); + int converted; + + converted = LocalToUtf(src, len, dest, + maps[i].map1, + NULL, 0, + NULL, + encoding, + noError); + PG_RETURN_INT32(converted); } } @@ -103,7 +110,7 @@ win_to_utf8(PG_FUNCTION_ARGS) errmsg("unexpected encoding ID %d for WIN character sets", encoding))); - PG_RETURN_VOID(); + PG_RETURN_INT32(0); } Datum @@ -113,6 +120,7 @@ utf8_to_win(PG_FUNCTION_ARGS) unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2); unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3); int len = PG_GETARG_INT32(4); + bool noError = PG_GETARG_BOOL(5); int i; CHECK_ENCODING_CONVERSION_ARGS(PG_UTF8, -1); @@ -121,12 +129,15 @@ utf8_to_win(PG_FUNCTION_ARGS) { if (encoding == maps[i].encoding) { - UtfToLocal(src, len, dest, - maps[i].map2, - NULL, 0, - NULL, - encoding); - PG_RETURN_VOID(); + int converted; + + converted = UtfToLocal(src, len, dest, + maps[i].map2, + NULL, 0, + NULL, + encoding, + noError); + PG_RETURN_INT32(converted); } } @@ -135,5 +146,5 @@ utf8_to_win(PG_FUNCTION_ARGS) errmsg("unexpected encoding ID %d for WIN character sets", encoding))); - PG_RETURN_VOID(); + PG_RETURN_INT32(0); } diff --git a/src/backend/utils/mb/mbutils.c b/src/backend/utils/mb/mbutils.c index 2578573b0ab..a13c398f4ac 100644 --- a/src/backend/utils/mb/mbutils.c +++ b/src/backend/utils/mb/mbutils.c @@ -406,12 
+406,13 @@ pg_do_encoding_conversion(unsigned char *src, int len, MemoryContextAllocHuge(CurrentMemoryContext, (Size) len * MAX_CONVERSION_GROWTH + 1); - OidFunctionCall5(proc, - Int32GetDatum(src_encoding), - Int32GetDatum(dest_encoding), - CStringGetDatum(src), - CStringGetDatum(result), - Int32GetDatum(len)); + (void) OidFunctionCall6(proc, + Int32GetDatum(src_encoding), + Int32GetDatum(dest_encoding), + CStringGetDatum(src), + CStringGetDatum(result), + Int32GetDatum(len), + BoolGetDatum(false)); /* * If the result is large, it's worth repalloc'ing to release any extra @@ -435,6 +436,62 @@ pg_do_encoding_conversion(unsigned char *src, int len, return result; } +/* + * Convert src string to another encoding. + * + * This function has a different API than the other conversion functions. + * The caller should've looked up the conversion function using + * FindDefaultConversionProc(). Unlike the other functions, the converted + * result is not palloc'd. It is written to the caller-supplied buffer + * instead. + * + * src_encoding - encoding to convert from + * dest_encoding - encoding to convert to + * src, srclen - input buffer and its length in bytes + * dest, destlen - destination buffer and its size in bytes + * + * The output is null-terminated. + * + * If destlen < srclen * MAX_CONVERSION_GROWTH + 1, the converted output + * wouldn't necessarily fit in the output buffer, and the function will not + * convert the whole input. + * + * TODO: The conversion function interface is not great. Firstly, it + * would be nice to pass through the destination buffer size to the + * conversion function, so that if you pass a shorter destination buffer, it + * could still continue to fill up the whole buffer. Currently, we have to + * assume worst case expansion and stop the conversion short, even if there + * is in fact space left in the destination buffer. Secondly, it would be + * nice to return the number of bytes written to the caller, to avoid a call + * to strlen(). 
+ */ +int +pg_do_encoding_conversion_buf(Oid proc, + int src_encoding, + int dest_encoding, + unsigned char *src, int srclen, + unsigned char *dest, int destlen, + bool noError) +{ + Datum result; + + /* + * If the destination buffer is not large enough to hold the result in the + * worst case, limit the input size passed to the conversion function. + */ + if ((Size) srclen >= ((destlen - 1) / (Size) MAX_CONVERSION_GROWTH)) + srclen = ((destlen - 1) / (Size) MAX_CONVERSION_GROWTH); + + result = OidFunctionCall6(proc, + Int32GetDatum(src_encoding), + Int32GetDatum(dest_encoding), + CStringGetDatum(src), + CStringGetDatum(dest), + Int32GetDatum(srclen), + BoolGetDatum(noError)); + return DatumGetInt32(result); +} + /* * Convert string to encoding encoding_name. The source * encoding is the DB encoding. @@ -762,12 +819,13 @@ perform_default_encoding_conversion(const char *src, int len, MemoryContextAllocHuge(CurrentMemoryContext, (Size) len * MAX_CONVERSION_GROWTH + 1); - FunctionCall5(flinfo, + FunctionCall6(flinfo, Int32GetDatum(src_encoding), Int32GetDatum(dest_encoding), CStringGetDatum(src), CStringGetDatum(result), - Int32GetDatum(len)); + Int32GetDatum(len), + BoolGetDatum(false)); /* * Release extra space if there might be a lot --- see comments in @@ -849,12 +907,13 @@ pg_unicode_to_server(pg_wchar c, unsigned char *s) c_as_utf8[c_as_utf8_len] = '\0'; /* Convert, or throw error if we can't */ - FunctionCall5(Utf8ToServerConvProc, + FunctionCall6(Utf8ToServerConvProc, Int32GetDatum(PG_UTF8), Int32GetDatum(server_encoding), CStringGetDatum(c_as_utf8), CStringGetDatum(s), - Int32GetDatum(c_as_utf8_len)); + Int32GetDatum(c_as_utf8_len), + BoolGetDatum(false)); } diff --git a/src/bin/pg_upgrade/check.c b/src/bin/pg_upgrade/check.c index 43fc297eb69..d77183b8d12 100644 --- a/src/bin/pg_upgrade/check.c +++ b/src/bin/pg_upgrade/check.c @@ -28,6 +28,7 @@ static void check_for_reg_data_type_usage(ClusterInfo *cluster); static void check_for_jsonb_9_4_usage(ClusterInfo 
*cluster); static void check_for_pg_role_prefix(ClusterInfo *cluster); static void check_for_new_tablespace_dir(ClusterInfo *new_cluster); +static void check_for_user_defined_encoding_conversions(ClusterInfo *cluster); static char *get_canonical_locale_name(int category, const char *locale); @@ -102,6 +103,15 @@ check_and_dump_old_cluster(bool live_check) check_for_reg_data_type_usage(&old_cluster); check_for_isn_and_int8_passing_mismatch(&old_cluster); + /* + * PG 14 changed the function signature of encoding conversion functions. + * Conversions from older versions cannot be upgraded automatically + * because the user-defined functions used by the encoding conversions + * need to be changed to match the new signature. + */ + if (GET_MAJOR_VERSION(old_cluster.major_version) <= 1300) + check_for_user_defined_encoding_conversions(&old_cluster); + /* * Pre-PG 14 allowed user defined postfix operators, which are not * supported anymore. Verify there are none, iff applicable. @@ -1268,6 +1278,91 @@ check_for_pg_role_prefix(ClusterInfo *cluster) check_ok(); } +/* + * Verify that no user-defined encoding conversions exist. 
+ */ +static void +check_for_user_defined_encoding_conversions(ClusterInfo *cluster) +{ + int dbnum; + FILE *script = NULL; + bool found = false; + char output_path[MAXPGPATH]; + + prep_status("Checking for user-defined encoding conversions"); + + snprintf(output_path, sizeof(output_path), + "encoding_conversions.txt"); + + /* Find any user defined encoding conversions */ + for (dbnum = 0; dbnum < cluster->dbarr.ndbs; dbnum++) + { + PGresult *res; + bool db_used = false; + int ntups; + int rowno; + int i_conoid, + i_conname, + i_nspname; + DbInfo *active_db = &cluster->dbarr.dbs[dbnum]; + PGconn *conn = connectToServer(cluster, active_db->db_name); + + /* + * The query below hardcodes FirstNormalObjectId as 16384 rather than + * interpolating that C #define into the query because, if that + * #define is ever changed, the cutoff we want to use is the value + * used by pre-version 14 servers, not that of some future version. + */ + res = executeQueryOrDie(conn, + "SELECT c.oid as conoid, c.conname, n.nspname " + "FROM pg_catalog.pg_conversion c, " + " pg_catalog.pg_namespace n " + "WHERE c.connamespace = n.oid AND " + " c.oid >= 16384"); + ntups = PQntuples(res); + i_conoid = PQfnumber(res, "conoid"); + i_conname = PQfnumber(res, "conname"); + i_nspname = PQfnumber(res, "nspname"); + for (rowno = 0; rowno < ntups; rowno++) + { + found = true; + if (script == NULL && + (script = fopen_priv(output_path, "w")) == NULL) + pg_fatal("could not open file \"%s\": %s\n", + output_path, strerror(errno)); + if (!db_used) + { + fprintf(script, "In database: %s\n", active_db->db_name); + db_used = true; + } + fprintf(script, " (oid=%s) %s.%s\n", + PQgetvalue(res, rowno, i_conoid), + PQgetvalue(res, rowno, i_nspname), + PQgetvalue(res, rowno, i_conname)); + } + + PQclear(res); + + PQfinish(conn); + } + + if (script) + fclose(script); + + if (found) + { + pg_log(PG_REPORT, "fatal\n"); + pg_fatal("Your installation contains user-defined encoding conversions.\n" + "The conversion 
function parameters changed in PostgreSQL version 14\n" + "so this cluster cannot currently be upgraded. You can remove the\n" + "encoding conversions in the old cluster and restart the upgrade.\n" + "A list of user-defined encoding conversions is in the file:\n" + " %s\n\n", output_path); + } + else + check_ok(); +} + /* * get_canonical_locale_name diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 489f5be427f..6a61c8f64f0 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -53,6 +53,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 202103291 +#define CATALOG_VERSION_NO 202104011 #endif diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index bfb89e0575d..69ffd0c3f4d 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -10914,388 +10914,388 @@ # conversion functions { oid => '4302', descr => 'internal conversion function for KOI8R to MULE_INTERNAL', - proname => 'koi8r_to_mic', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'koi8r_to_mic', + proname => 'koi8r_to_mic', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'koi8r_to_mic', probin => '$libdir/cyrillic_and_mic' }, { oid => '4303', descr => 'internal conversion function for MULE_INTERNAL to KOI8R', - proname => 'mic_to_koi8r', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'mic_to_koi8r', + proname => 'mic_to_koi8r', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'mic_to_koi8r', probin => '$libdir/cyrillic_and_mic' }, { oid => '4304', descr => 'internal conversion function for ISO-8859-5 to MULE_INTERNAL', - proname => 'iso_to_mic', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'iso_to_mic', + proname => 
'iso_to_mic', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'iso_to_mic', probin => '$libdir/cyrillic_and_mic' }, { oid => '4305', descr => 'internal conversion function for MULE_INTERNAL to ISO-8859-5', - proname => 'mic_to_iso', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'mic_to_iso', + proname => 'mic_to_iso', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'mic_to_iso', probin => '$libdir/cyrillic_and_mic' }, { oid => '4306', descr => 'internal conversion function for WIN1251 to MULE_INTERNAL', - proname => 'win1251_to_mic', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'win1251_to_mic', + proname => 'win1251_to_mic', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'win1251_to_mic', probin => '$libdir/cyrillic_and_mic' }, { oid => '4307', descr => 'internal conversion function for MULE_INTERNAL to WIN1251', - proname => 'mic_to_win1251', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'mic_to_win1251', + proname => 'mic_to_win1251', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'mic_to_win1251', probin => '$libdir/cyrillic_and_mic' }, { oid => '4308', descr => 'internal conversion function for WIN866 to MULE_INTERNAL', - proname => 'win866_to_mic', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'win866_to_mic', + proname => 'win866_to_mic', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'win866_to_mic', probin => '$libdir/cyrillic_and_mic' }, { oid => '4309', descr => 'internal conversion function for MULE_INTERNAL to WIN866', - proname => 'mic_to_win866', prolang 
=> 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'mic_to_win866', + proname => 'mic_to_win866', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'mic_to_win866', probin => '$libdir/cyrillic_and_mic' }, { oid => '4310', descr => 'internal conversion function for KOI8R to WIN1251', - proname => 'koi8r_to_win1251', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', + proname => 'koi8r_to_win1251', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'koi8r_to_win1251', probin => '$libdir/cyrillic_and_mic' }, { oid => '4311', descr => 'internal conversion function for WIN1251 to KOI8R', - proname => 'win1251_to_koi8r', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', + proname => 'win1251_to_koi8r', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'win1251_to_koi8r', probin => '$libdir/cyrillic_and_mic' }, { oid => '4312', descr => 'internal conversion function for KOI8R to WIN866', - proname => 'koi8r_to_win866', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'koi8r_to_win866', + proname => 'koi8r_to_win866', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'koi8r_to_win866', probin => '$libdir/cyrillic_and_mic' }, { oid => '4313', descr => 'internal conversion function for WIN866 to KOI8R', - proname => 'win866_to_koi8r', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'win866_to_koi8r', + proname => 'win866_to_koi8r', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'win866_to_koi8r', probin => '$libdir/cyrillic_and_mic' }, { oid => '4314', descr => 'internal conversion 
function for WIN866 to WIN1251', - proname => 'win866_to_win1251', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', + proname => 'win866_to_win1251', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'win866_to_win1251', probin => '$libdir/cyrillic_and_mic' }, { oid => '4315', descr => 'internal conversion function for WIN1251 to WIN866', - proname => 'win1251_to_win866', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', + proname => 'win1251_to_win866', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'win1251_to_win866', probin => '$libdir/cyrillic_and_mic' }, { oid => '4316', descr => 'internal conversion function for ISO-8859-5 to KOI8R', - proname => 'iso_to_koi8r', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'iso_to_koi8r', + proname => 'iso_to_koi8r', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'iso_to_koi8r', probin => '$libdir/cyrillic_and_mic' }, { oid => '4317', descr => 'internal conversion function for KOI8R to ISO-8859-5', - proname => 'koi8r_to_iso', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'koi8r_to_iso', + proname => 'koi8r_to_iso', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'koi8r_to_iso', probin => '$libdir/cyrillic_and_mic' }, { oid => '4318', descr => 'internal conversion function for ISO-8859-5 to WIN1251', - proname => 'iso_to_win1251', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'iso_to_win1251', + proname => 'iso_to_win1251', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'iso_to_win1251', probin => 
'$libdir/cyrillic_and_mic' }, { oid => '4319', descr => 'internal conversion function for WIN1251 to ISO-8859-5', - proname => 'win1251_to_iso', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'win1251_to_iso', + proname => 'win1251_to_iso', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'win1251_to_iso', probin => '$libdir/cyrillic_and_mic' }, { oid => '4320', descr => 'internal conversion function for ISO-8859-5 to WIN866', - proname => 'iso_to_win866', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'iso_to_win866', + proname => 'iso_to_win866', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'iso_to_win866', probin => '$libdir/cyrillic_and_mic' }, { oid => '4321', descr => 'internal conversion function for WIN866 to ISO-8859-5', - proname => 'win866_to_iso', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'win866_to_iso', + proname => 'win866_to_iso', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'win866_to_iso', probin => '$libdir/cyrillic_and_mic' }, { oid => '4322', descr => 'internal conversion function for EUC_CN to MULE_INTERNAL', - proname => 'euc_cn_to_mic', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'euc_cn_to_mic', + proname => 'euc_cn_to_mic', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'euc_cn_to_mic', probin => '$libdir/euc_cn_and_mic' }, { oid => '4323', descr => 'internal conversion function for MULE_INTERNAL to EUC_CN', - proname => 'mic_to_euc_cn', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'mic_to_euc_cn', + proname => 'mic_to_euc_cn', prolang => 
'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'mic_to_euc_cn', probin => '$libdir/euc_cn_and_mic' }, { oid => '4324', descr => 'internal conversion function for EUC_JP to SJIS', - proname => 'euc_jp_to_sjis', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'euc_jp_to_sjis', + proname => 'euc_jp_to_sjis', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'euc_jp_to_sjis', probin => '$libdir/euc_jp_and_sjis' }, { oid => '4325', descr => 'internal conversion function for SJIS to EUC_JP', - proname => 'sjis_to_euc_jp', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'sjis_to_euc_jp', + proname => 'sjis_to_euc_jp', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'sjis_to_euc_jp', probin => '$libdir/euc_jp_and_sjis' }, { oid => '4326', descr => 'internal conversion function for EUC_JP to MULE_INTERNAL', - proname => 'euc_jp_to_mic', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'euc_jp_to_mic', + proname => 'euc_jp_to_mic', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'euc_jp_to_mic', probin => '$libdir/euc_jp_and_sjis' }, { oid => '4327', descr => 'internal conversion function for SJIS to MULE_INTERNAL', - proname => 'sjis_to_mic', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'sjis_to_mic', + proname => 'sjis_to_mic', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'sjis_to_mic', probin => '$libdir/euc_jp_and_sjis' }, { oid => '4328', descr => 'internal conversion function for MULE_INTERNAL to EUC_JP', - proname => 'mic_to_euc_jp', prolang => 'c', prorettype => 'void', - proargtypes => 
'int4 int4 cstring internal int4', prosrc => 'mic_to_euc_jp', + proname => 'mic_to_euc_jp', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'mic_to_euc_jp', probin => '$libdir/euc_jp_and_sjis' }, { oid => '4329', descr => 'internal conversion function for MULE_INTERNAL to SJIS', - proname => 'mic_to_sjis', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'mic_to_sjis', + proname => 'mic_to_sjis', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'mic_to_sjis', probin => '$libdir/euc_jp_and_sjis' }, { oid => '4330', descr => 'internal conversion function for EUC_KR to MULE_INTERNAL', - proname => 'euc_kr_to_mic', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'euc_kr_to_mic', + proname => 'euc_kr_to_mic', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'euc_kr_to_mic', probin => '$libdir/euc_kr_and_mic' }, { oid => '4331', descr => 'internal conversion function for MULE_INTERNAL to EUC_KR', - proname => 'mic_to_euc_kr', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'mic_to_euc_kr', + proname => 'mic_to_euc_kr', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'mic_to_euc_kr', probin => '$libdir/euc_kr_and_mic' }, { oid => '4332', descr => 'internal conversion function for EUC_TW to BIG5', - proname => 'euc_tw_to_big5', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'euc_tw_to_big5', + proname => 'euc_tw_to_big5', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'euc_tw_to_big5', probin => '$libdir/euc_tw_and_big5' }, { oid => '4333', descr => 'internal conversion function for BIG5 to 
EUC_TW', - proname => 'big5_to_euc_tw', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'big5_to_euc_tw', + proname => 'big5_to_euc_tw', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'big5_to_euc_tw', probin => '$libdir/euc_tw_and_big5' }, { oid => '4334', descr => 'internal conversion function for EUC_TW to MULE_INTERNAL', - proname => 'euc_tw_to_mic', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'euc_tw_to_mic', + proname => 'euc_tw_to_mic', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'euc_tw_to_mic', probin => '$libdir/euc_tw_and_big5' }, { oid => '4335', descr => 'internal conversion function for BIG5 to MULE_INTERNAL', - proname => 'big5_to_mic', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'big5_to_mic', + proname => 'big5_to_mic', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'big5_to_mic', probin => '$libdir/euc_tw_and_big5' }, { oid => '4336', descr => 'internal conversion function for MULE_INTERNAL to EUC_TW', - proname => 'mic_to_euc_tw', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'mic_to_euc_tw', + proname => 'mic_to_euc_tw', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'mic_to_euc_tw', probin => '$libdir/euc_tw_and_big5' }, { oid => '4337', descr => 'internal conversion function for MULE_INTERNAL to BIG5', - proname => 'mic_to_big5', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'mic_to_big5', + proname => 'mic_to_big5', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'mic_to_big5', probin => 
'$libdir/euc_tw_and_big5' }, { oid => '4338', descr => 'internal conversion function for LATIN2 to MULE_INTERNAL', - proname => 'latin2_to_mic', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'latin2_to_mic', + proname => 'latin2_to_mic', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'latin2_to_mic', probin => '$libdir/latin2_and_win1250' }, { oid => '4339', descr => 'internal conversion function for MULE_INTERNAL to LATIN2', - proname => 'mic_to_latin2', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'mic_to_latin2', + proname => 'mic_to_latin2', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'mic_to_latin2', probin => '$libdir/latin2_and_win1250' }, { oid => '4340', descr => 'internal conversion function for WIN1250 to MULE_INTERNAL', - proname => 'win1250_to_mic', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'win1250_to_mic', + proname => 'win1250_to_mic', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'win1250_to_mic', probin => '$libdir/latin2_and_win1250' }, { oid => '4341', descr => 'internal conversion function for MULE_INTERNAL to WIN1250', - proname => 'mic_to_win1250', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'mic_to_win1250', + proname => 'mic_to_win1250', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'mic_to_win1250', probin => '$libdir/latin2_and_win1250' }, { oid => '4342', descr => 'internal conversion function for LATIN2 to WIN1250', - proname => 'latin2_to_win1250', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', + proname => 'latin2_to_win1250', prolang => 
'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'latin2_to_win1250', probin => '$libdir/latin2_and_win1250' }, { oid => '4343', descr => 'internal conversion function for WIN1250 to LATIN2', - proname => 'win1250_to_latin2', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', + proname => 'win1250_to_latin2', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'win1250_to_latin2', probin => '$libdir/latin2_and_win1250' }, { oid => '4344', descr => 'internal conversion function for LATIN1 to MULE_INTERNAL', - proname => 'latin1_to_mic', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'latin1_to_mic', + proname => 'latin1_to_mic', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'latin1_to_mic', probin => '$libdir/latin_and_mic' }, { oid => '4345', descr => 'internal conversion function for MULE_INTERNAL to LATIN1', - proname => 'mic_to_latin1', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'mic_to_latin1', + proname => 'mic_to_latin1', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'mic_to_latin1', probin => '$libdir/latin_and_mic' }, { oid => '4346', descr => 'internal conversion function for LATIN3 to MULE_INTERNAL', - proname => 'latin3_to_mic', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'latin3_to_mic', + proname => 'latin3_to_mic', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'latin3_to_mic', probin => '$libdir/latin_and_mic' }, { oid => '4347', descr => 'internal conversion function for MULE_INTERNAL to LATIN3', - proname => 'mic_to_latin3', prolang => 'c', prorettype => 'void', - proargtypes => 
'int4 int4 cstring internal int4', prosrc => 'mic_to_latin3', + proname => 'mic_to_latin3', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'mic_to_latin3', probin => '$libdir/latin_and_mic' }, { oid => '4348', descr => 'internal conversion function for LATIN4 to MULE_INTERNAL', - proname => 'latin4_to_mic', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'latin4_to_mic', + proname => 'latin4_to_mic', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'latin4_to_mic', probin => '$libdir/latin_and_mic' }, { oid => '4349', descr => 'internal conversion function for MULE_INTERNAL to LATIN4', - proname => 'mic_to_latin4', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'mic_to_latin4', + proname => 'mic_to_latin4', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'mic_to_latin4', probin => '$libdir/latin_and_mic' }, { oid => '4352', descr => 'internal conversion function for BIG5 to UTF8', - proname => 'big5_to_utf8', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'big5_to_utf8', + proname => 'big5_to_utf8', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'big5_to_utf8', probin => '$libdir/utf8_and_big5' }, { oid => '4353', descr => 'internal conversion function for UTF8 to BIG5', - proname => 'utf8_to_big5', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'utf8_to_big5', + proname => 'utf8_to_big5', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'utf8_to_big5', probin => '$libdir/utf8_and_big5' }, { oid => '4354', descr => 'internal conversion function for UTF8 to KOI8R', - proname => 
'utf8_to_koi8r', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'utf8_to_koi8r', + proname => 'utf8_to_koi8r', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'utf8_to_koi8r', probin => '$libdir/utf8_and_cyrillic' }, { oid => '4355', descr => 'internal conversion function for KOI8R to UTF8', - proname => 'koi8r_to_utf8', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'koi8r_to_utf8', + proname => 'koi8r_to_utf8', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'koi8r_to_utf8', probin => '$libdir/utf8_and_cyrillic' }, { oid => '4356', descr => 'internal conversion function for UTF8 to KOI8U', - proname => 'utf8_to_koi8u', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'utf8_to_koi8u', + proname => 'utf8_to_koi8u', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'utf8_to_koi8u', probin => '$libdir/utf8_and_cyrillic' }, { oid => '4357', descr => 'internal conversion function for KOI8U to UTF8', - proname => 'koi8u_to_utf8', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'koi8u_to_utf8', + proname => 'koi8u_to_utf8', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'koi8u_to_utf8', probin => '$libdir/utf8_and_cyrillic' }, { oid => '4358', descr => 'internal conversion function for UTF8 to WIN', - proname => 'utf8_to_win', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'utf8_to_win', + proname => 'utf8_to_win', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'utf8_to_win', probin => '$libdir/utf8_and_win' }, { oid => '4359', descr 
=> 'internal conversion function for WIN to UTF8', - proname => 'win_to_utf8', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'win_to_utf8', + proname => 'win_to_utf8', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'win_to_utf8', probin => '$libdir/utf8_and_win' }, { oid => '4360', descr => 'internal conversion function for EUC_CN to UTF8', - proname => 'euc_cn_to_utf8', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'euc_cn_to_utf8', + proname => 'euc_cn_to_utf8', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'euc_cn_to_utf8', probin => '$libdir/utf8_and_euc_cn' }, { oid => '4361', descr => 'internal conversion function for UTF8 to EUC_CN', - proname => 'utf8_to_euc_cn', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'utf8_to_euc_cn', + proname => 'utf8_to_euc_cn', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'utf8_to_euc_cn', probin => '$libdir/utf8_and_euc_cn' }, { oid => '4362', descr => 'internal conversion function for EUC_JP to UTF8', - proname => 'euc_jp_to_utf8', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'euc_jp_to_utf8', + proname => 'euc_jp_to_utf8', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'euc_jp_to_utf8', probin => '$libdir/utf8_and_euc_jp' }, { oid => '4363', descr => 'internal conversion function for UTF8 to EUC_JP', - proname => 'utf8_to_euc_jp', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'utf8_to_euc_jp', + proname => 'utf8_to_euc_jp', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 
'utf8_to_euc_jp', probin => '$libdir/utf8_and_euc_jp' }, { oid => '4364', descr => 'internal conversion function for EUC_KR to UTF8', - proname => 'euc_kr_to_utf8', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'euc_kr_to_utf8', + proname => 'euc_kr_to_utf8', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'euc_kr_to_utf8', probin => '$libdir/utf8_and_euc_kr' }, { oid => '4365', descr => 'internal conversion function for UTF8 to EUC_KR', - proname => 'utf8_to_euc_kr', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'utf8_to_euc_kr', + proname => 'utf8_to_euc_kr', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'utf8_to_euc_kr', probin => '$libdir/utf8_and_euc_kr' }, { oid => '4366', descr => 'internal conversion function for EUC_TW to UTF8', - proname => 'euc_tw_to_utf8', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'euc_tw_to_utf8', + proname => 'euc_tw_to_utf8', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'euc_tw_to_utf8', probin => '$libdir/utf8_and_euc_tw' }, { oid => '4367', descr => 'internal conversion function for UTF8 to EUC_TW', - proname => 'utf8_to_euc_tw', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'utf8_to_euc_tw', + proname => 'utf8_to_euc_tw', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'utf8_to_euc_tw', probin => '$libdir/utf8_and_euc_tw' }, { oid => '4368', descr => 'internal conversion function for GB18030 to UTF8', - proname => 'gb18030_to_utf8', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'gb18030_to_utf8', + proname => 'gb18030_to_utf8', 
prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'gb18030_to_utf8', probin => '$libdir/utf8_and_gb18030' }, { oid => '4369', descr => 'internal conversion function for UTF8 to GB18030', - proname => 'utf8_to_gb18030', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'utf8_to_gb18030', + proname => 'utf8_to_gb18030', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'utf8_to_gb18030', probin => '$libdir/utf8_and_gb18030' }, { oid => '4370', descr => 'internal conversion function for GBK to UTF8', - proname => 'gbk_to_utf8', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'gbk_to_utf8', + proname => 'gbk_to_utf8', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'gbk_to_utf8', probin => '$libdir/utf8_and_gbk' }, { oid => '4371', descr => 'internal conversion function for UTF8 to GBK', - proname => 'utf8_to_gbk', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'utf8_to_gbk', + proname => 'utf8_to_gbk', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'utf8_to_gbk', probin => '$libdir/utf8_and_gbk' }, { oid => '4372', descr => 'internal conversion function for UTF8 to ISO-8859 2-16', - proname => 'utf8_to_iso8859', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'utf8_to_iso8859', + proname => 'utf8_to_iso8859', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'utf8_to_iso8859', probin => '$libdir/utf8_and_iso8859' }, { oid => '4373', descr => 'internal conversion function for ISO-8859 2-16 to UTF8', - proname => 'iso8859_to_utf8', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 
int4 cstring internal int4', prosrc => 'iso8859_to_utf8', + proname => 'iso8859_to_utf8', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'iso8859_to_utf8', probin => '$libdir/utf8_and_iso8859' }, { oid => '4374', descr => 'internal conversion function for LATIN1 to UTF8', - proname => 'iso8859_1_to_utf8', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', + proname => 'iso8859_1_to_utf8', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'iso8859_1_to_utf8', probin => '$libdir/utf8_and_iso8859_1' }, { oid => '4375', descr => 'internal conversion function for UTF8 to LATIN1', - proname => 'utf8_to_iso8859_1', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', + proname => 'utf8_to_iso8859_1', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'utf8_to_iso8859_1', probin => '$libdir/utf8_and_iso8859_1' }, { oid => '4376', descr => 'internal conversion function for JOHAB to UTF8', - proname => 'johab_to_utf8', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'johab_to_utf8', + proname => 'johab_to_utf8', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'johab_to_utf8', probin => '$libdir/utf8_and_johab' }, { oid => '4377', descr => 'internal conversion function for UTF8 to JOHAB', - proname => 'utf8_to_johab', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'utf8_to_johab', + proname => 'utf8_to_johab', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'utf8_to_johab', probin => '$libdir/utf8_and_johab' }, { oid => '4378', descr => 'internal conversion function for SJIS to UTF8', - proname => 'sjis_to_utf8', prolang 
=> 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'sjis_to_utf8', + proname => 'sjis_to_utf8', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'sjis_to_utf8', probin => '$libdir/utf8_and_sjis' }, { oid => '4379', descr => 'internal conversion function for UTF8 to SJIS', - proname => 'utf8_to_sjis', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'utf8_to_sjis', + proname => 'utf8_to_sjis', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'utf8_to_sjis', probin => '$libdir/utf8_and_sjis' }, { oid => '4380', descr => 'internal conversion function for UHC to UTF8', - proname => 'uhc_to_utf8', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'uhc_to_utf8', + proname => 'uhc_to_utf8', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'uhc_to_utf8', probin => '$libdir/utf8_and_uhc' }, { oid => '4381', descr => 'internal conversion function for UTF8 to UHC', - proname => 'utf8_to_uhc', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', prosrc => 'utf8_to_uhc', + proname => 'utf8_to_uhc', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'utf8_to_uhc', probin => '$libdir/utf8_and_uhc' }, { oid => '4382', descr => 'internal conversion function for EUC_JIS_2004 to UTF8', - proname => 'euc_jis_2004_to_utf8', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', + proname => 'euc_jis_2004_to_utf8', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'euc_jis_2004_to_utf8', probin => '$libdir/utf8_and_euc2004' }, { oid => '4383', descr => 'internal conversion function for UTF8 to 
EUC_JIS_2004', - proname => 'utf8_to_euc_jis_2004', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', + proname => 'utf8_to_euc_jis_2004', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'utf8_to_euc_jis_2004', probin => '$libdir/utf8_and_euc2004' }, { oid => '4384', descr => 'internal conversion function for SHIFT_JIS_2004 to UTF8', - proname => 'shift_jis_2004_to_utf8', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', + proname => 'shift_jis_2004_to_utf8', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'shift_jis_2004_to_utf8', probin => '$libdir/utf8_and_sjis2004' }, { oid => '4385', descr => 'internal conversion function for UTF8 to SHIFT_JIS_2004', - proname => 'utf8_to_shift_jis_2004', prolang => 'c', prorettype => 'void', - proargtypes => 'int4 int4 cstring internal int4', + proname => 'utf8_to_shift_jis_2004', prolang => 'c', prorettype => 'int4', + proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'utf8_to_shift_jis_2004', probin => '$libdir/utf8_and_sjis2004' }, { oid => '4386', descr => 'internal conversion function for EUC_JIS_2004 to SHIFT_JIS_2004', proname => 'euc_jis_2004_to_shift_jis_2004', prolang => 'c', - prorettype => 'void', proargtypes => 'int4 int4 cstring internal int4', + prorettype => 'int4', proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'euc_jis_2004_to_shift_jis_2004', probin => '$libdir/euc2004_sjis2004' }, { oid => '4387', descr => 'internal conversion function for SHIFT_JIS_2004 to EUC_JIS_2004', proname => 'shift_jis_2004_to_euc_jis_2004', prolang => 'c', - prorettype => 'void', proargtypes => 'int4 int4 cstring internal int4', + prorettype => 'int4', proargtypes => 'int4 int4 cstring internal int4 bool', prosrc => 'shift_jis_2004_to_euc_jis_2004', probin => '$libdir/euc2004_sjis2004' }, diff --git 
a/src/include/mb/pg_wchar.h b/src/include/mb/pg_wchar.h index 64b22e4b0d4..a9aaff9e6dc 100644 --- a/src/include/mb/pg_wchar.h +++ b/src/include/mb/pg_wchar.h @@ -616,6 +616,12 @@ extern int pg_bind_textdomain_codeset(const char *domainname); extern unsigned char *pg_do_encoding_conversion(unsigned char *src, int len, int src_encoding, int dest_encoding); +extern int pg_do_encoding_conversion_buf(Oid proc, + int src_encoding, + int dest_encoding, + unsigned char *src, int srclen, + unsigned char *dst, int dstlen, + bool noError); extern char *pg_client_to_server(const char *s, int len); extern char *pg_server_to_client(const char *s, int len); @@ -627,18 +633,18 @@ extern void pg_unicode_to_server(pg_wchar c, unsigned char *s); extern unsigned short BIG5toCNS(unsigned short big5, unsigned char *lc); extern unsigned short CNStoBIG5(unsigned short cns, unsigned char lc); -extern void UtfToLocal(const unsigned char *utf, int len, +extern int UtfToLocal(const unsigned char *utf, int len, unsigned char *iso, const pg_mb_radix_tree *map, const pg_utf_to_local_combined *cmap, int cmapsize, utf_local_conversion_func conv_func, - int encoding); -extern void LocalToUtf(const unsigned char *iso, int len, + int encoding, bool noError); +extern int LocalToUtf(const unsigned char *iso, int len, unsigned char *utf, const pg_mb_radix_tree *map, const pg_local_to_utf_combined *cmap, int cmapsize, utf_local_conversion_func conv_func, - int encoding); + int encoding, bool noError); extern bool pg_verifymbstr(const char *mbstr, int len, bool noError); extern bool pg_verify_mbstr(int encoding, const char *mbstr, int len, @@ -656,18 +662,19 @@ extern void report_invalid_encoding(int encoding, const char *mbstr, int len) pg extern void report_untranslatable_char(int src_encoding, int dest_encoding, const char *mbstr, int len) pg_attribute_noreturn(); -extern void local2local(const unsigned char *l, unsigned char *p, int len, - int src_encoding, int dest_encoding, const unsigned char 
*tab); -extern void latin2mic(const unsigned char *l, unsigned char *p, int len, - int lc, int encoding); -extern void mic2latin(const unsigned char *mic, unsigned char *p, int len, - int lc, int encoding); -extern void latin2mic_with_table(const unsigned char *l, unsigned char *p, +extern int local2local(const unsigned char *l, unsigned char *p, int len, + int src_encoding, int dest_encoding, + const unsigned char *tab, bool noError); +extern int latin2mic(const unsigned char *l, unsigned char *p, int len, + int lc, int encoding, bool noError); +extern int mic2latin(const unsigned char *mic, unsigned char *p, int len, + int lc, int encoding, bool noError); +extern int latin2mic_with_table(const unsigned char *l, unsigned char *p, int len, int lc, int encoding, - const unsigned char *tab); -extern void mic2latin_with_table(const unsigned char *mic, unsigned char *p, + const unsigned char *tab, bool noError); +extern int mic2latin_with_table(const unsigned char *mic, unsigned char *p, int len, int lc, int encoding, - const unsigned char *tab); + const unsigned char *tab, bool noError); #ifdef WIN32 extern WCHAR *pgwin32_message_to_UTF16(const char *str, int len, int *utf16len); diff --git a/src/test/regress/expected/conversion.out b/src/test/regress/expected/conversion.out index 62c10671685..e34ab20974d 100644 --- a/src/test/regress/expected/conversion.out +++ b/src/test/regress/expected/conversion.out @@ -37,3 +37,522 @@ DROP CONVERSION mydef; -- RESET SESSION AUTHORIZATION; DROP USER regress_conversion_user; +-- +-- Test built-in conversion functions. +-- +-- Helper function to test a conversion. Uses the test_enc_conversion function +-- that was created in the create_function_1 test. +create or replace function test_conv( + input IN bytea, + src_encoding IN text, + dst_encoding IN text, + result OUT bytea, + errorat OUT bytea, + error OUT text) +language plpgsql as +$$ +declare + validlen int; +begin + -- First try to perform the conversion with noError = false. 
If that errors out, + -- capture the error message, and try again with noError = true. The second call + -- should succeed and return the position of the error, return that too. + begin + select * into validlen, result from test_enc_conversion(input, src_encoding, dst_encoding, false); + errorat = NULL; + error := NULL; + exception when others then + error := sqlerrm; + select * into validlen, result from test_enc_conversion(input, src_encoding, dst_encoding, true); + errorat = substr(input, validlen + 1); + end; + return; +end; +$$; +-- +-- UTF-8 +-- +CREATE TABLE utf8_inputs (inbytes bytea, description text); +insert into utf8_inputs values + ('\x666f6f', 'valid, pure ASCII'), + ('\xc3a4c3b6', 'valid, extra latin chars'), + ('\xd184d0bed0be', 'valid, cyrillic'), + ('\x666f6fe8b1a1', 'valid, kanji/Chinese'), + ('\xe382abe3829a', 'valid, two chars that combine to one in EUC_JIS_2004'), + ('\xe382ab', 'only first half of combined char in EUC_JIS_2004'), + ('\xe382abe382', 'incomplete combination when converted EUC_JIS_2004'), + ('\xecbd94eb81bceba6ac', 'valid, Hangul, Korean'), + ('\x666f6fefa8aa', 'valid, needs mapping function to convert to GB18030'), + ('\x66e8b1ff6f6f', 'invalid byte sequence'), + ('\x66006f', 'invalid, NUL byte'), + ('\x666f6fe8b100', 'invalid, NUL byte'), + ('\x666f6fe8b1', 'incomplete character at end'); +-- Test UTF-8 verification +select description, (test_conv(inbytes, 'utf8', 'utf8')).* from utf8_inputs; + description | result | errorat | error +------------------------------------------------------+----------------------+--------------+----------------------------------------------------------- + valid, pure ASCII | \x666f6f | | + valid, extra latin chars | \xc3a4c3b6 | | + valid, cyrillic | \xd184d0bed0be | | + valid, kanji/Chinese | \x666f6fe8b1a1 | | + valid, two chars that combine to one in EUC_JIS_2004 | \xe382abe3829a | | + only first half of combined char in EUC_JIS_2004 | \xe382ab | | + incomplete combination when converted 
EUC_JIS_2004 | \xe382ab | \xe382 | invalid byte sequence for encoding "UTF8": 0xe3 0x82 + valid, Hangul, Korean | \xecbd94eb81bceba6ac | | + valid, needs mapping function to convert to GB18030 | \x666f6fefa8aa | | + invalid byte sequence | \x66 | \xe8b1ff6f6f | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0xff + invalid, NUL byte | \x66 | \x006f | invalid byte sequence for encoding "UTF8": 0x00 + invalid, NUL byte | \x666f6f | \xe8b100 | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0x00 + incomplete character at end | \x666f6f | \xe8b1 | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 +(13 rows) + +-- Test conversions from UTF-8 +select description, inbytes, (test_conv(inbytes, 'utf8', 'euc_jis_2004')).* from utf8_inputs; + description | inbytes | result | errorat | error +------------------------------------------------------+----------------------+----------------+----------------------+------------------------------------------------------------------------------------------------------------- + valid, pure ASCII | \x666f6f | \x666f6f | | + valid, extra latin chars | \xc3a4c3b6 | \xa9daa9ec | | + valid, cyrillic | \xd184d0bed0be | \xa7e6a7e0a7e0 | | + valid, kanji/Chinese | \x666f6fe8b1a1 | \x666f6fbedd | | + valid, two chars that combine to one in EUC_JIS_2004 | \xe382abe3829a | \xa5f7 | | + only first half of combined char in EUC_JIS_2004 | \xe382ab | \xa5ab | | + incomplete combination when converted EUC_JIS_2004 | \xe382abe382 | \x | \xe382abe382 | invalid byte sequence for encoding "UTF8": 0xe3 0x82 + valid, Hangul, Korean | \xecbd94eb81bceba6ac | \x | \xecbd94eb81bceba6ac | character with byte sequence 0xec 0xbd 0x94 in encoding "UTF8" has no equivalent in encoding "EUC_JIS_2004" + valid, needs mapping function to convert to GB18030 | \x666f6fefa8aa | \x666f6f | \xefa8aa | character with byte sequence 0xef 0xa8 0xaa in encoding "UTF8" has no equivalent in encoding "EUC_JIS_2004" + invalid byte sequence | \x66e8b1ff6f6f | \x66 | 
\xe8b1ff6f6f | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0xff + invalid, NUL byte | \x66006f | \x66 | \x006f | invalid byte sequence for encoding "UTF8": 0x00 + invalid, NUL byte | \x666f6fe8b100 | \x666f6f | \xe8b100 | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0x00 + incomplete character at end | \x666f6fe8b1 | \x666f6f | \xe8b1 | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 +(13 rows) + +select description, inbytes, (test_conv(inbytes, 'utf8', 'latin1')).* from utf8_inputs; + description | inbytes | result | errorat | error +------------------------------------------------------+----------------------+----------+----------------------+------------------------------------------------------------------------------------------------------- + valid, pure ASCII | \x666f6f | \x666f6f | | + valid, extra latin chars | \xc3a4c3b6 | \xe4f6 | | + valid, cyrillic | \xd184d0bed0be | \x | \xd184d0bed0be | character with byte sequence 0xd1 0x84 in encoding "UTF8" has no equivalent in encoding "LATIN1" + valid, kanji/Chinese | \x666f6fe8b1a1 | \x666f6f | \xe8b1a1 | character with byte sequence 0xe8 0xb1 0xa1 in encoding "UTF8" has no equivalent in encoding "LATIN1" + valid, two chars that combine to one in EUC_JIS_2004 | \xe382abe3829a | \x | \xe382abe3829a | character with byte sequence 0xe3 0x82 0xab in encoding "UTF8" has no equivalent in encoding "LATIN1" + only first half of combined char in EUC_JIS_2004 | \xe382ab | \x | \xe382ab | character with byte sequence 0xe3 0x82 0xab in encoding "UTF8" has no equivalent in encoding "LATIN1" + incomplete combination when converted EUC_JIS_2004 | \xe382abe382 | \x | \xe382abe382 | character with byte sequence 0xe3 0x82 0xab in encoding "UTF8" has no equivalent in encoding "LATIN1" + valid, Hangul, Korean | \xecbd94eb81bceba6ac | \x | \xecbd94eb81bceba6ac | character with byte sequence 0xec 0xbd 0x94 in encoding "UTF8" has no equivalent in encoding "LATIN1" + valid, needs mapping function to convert to 
GB18030 | \x666f6fefa8aa | \x666f6f | \xefa8aa | character with byte sequence 0xef 0xa8 0xaa in encoding "UTF8" has no equivalent in encoding "LATIN1" + invalid byte sequence | \x66e8b1ff6f6f | \x66 | \xe8b1ff6f6f | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0xff + invalid, NUL byte | \x66006f | \x66 | \x006f | invalid byte sequence for encoding "UTF8": 0x00 + invalid, NUL byte | \x666f6fe8b100 | \x666f6f | \xe8b100 | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0x00 + incomplete character at end | \x666f6fe8b1 | \x666f6f | \xe8b1 | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 +(13 rows) + +select description, inbytes, (test_conv(inbytes, 'utf8', 'latin2')).* from utf8_inputs; + description | inbytes | result | errorat | error +------------------------------------------------------+----------------------+----------+----------------------+------------------------------------------------------------------------------------------------------- + valid, pure ASCII | \x666f6f | \x666f6f | | + valid, extra latin chars | \xc3a4c3b6 | \xe4f6 | | + valid, cyrillic | \xd184d0bed0be | \x | \xd184d0bed0be | character with byte sequence 0xd1 0x84 in encoding "UTF8" has no equivalent in encoding "LATIN2" + valid, kanji/Chinese | \x666f6fe8b1a1 | \x666f6f | \xe8b1a1 | character with byte sequence 0xe8 0xb1 0xa1 in encoding "UTF8" has no equivalent in encoding "LATIN2" + valid, two chars that combine to one in EUC_JIS_2004 | \xe382abe3829a | \x | \xe382abe3829a | character with byte sequence 0xe3 0x82 0xab in encoding "UTF8" has no equivalent in encoding "LATIN2" + only first half of combined char in EUC_JIS_2004 | \xe382ab | \x | \xe382ab | character with byte sequence 0xe3 0x82 0xab in encoding "UTF8" has no equivalent in encoding "LATIN2" + incomplete combination when converted EUC_JIS_2004 | \xe382abe382 | \x | \xe382abe382 | character with byte sequence 0xe3 0x82 0xab in encoding "UTF8" has no equivalent in encoding "LATIN2" + valid, Hangul, Korean | 
\xecbd94eb81bceba6ac | \x | \xecbd94eb81bceba6ac | character with byte sequence 0xec 0xbd 0x94 in encoding "UTF8" has no equivalent in encoding "LATIN2" + valid, needs mapping function to convert to GB18030 | \x666f6fefa8aa | \x666f6f | \xefa8aa | character with byte sequence 0xef 0xa8 0xaa in encoding "UTF8" has no equivalent in encoding "LATIN2" + invalid byte sequence | \x66e8b1ff6f6f | \x66 | \xe8b1ff6f6f | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0xff + invalid, NUL byte | \x66006f | \x66 | \x006f | invalid byte sequence for encoding "UTF8": 0x00 + invalid, NUL byte | \x666f6fe8b100 | \x666f6f | \xe8b100 | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0x00 + incomplete character at end | \x666f6fe8b1 | \x666f6f | \xe8b1 | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 +(13 rows) + +select description, inbytes, (test_conv(inbytes, 'utf8', 'latin5')).* from utf8_inputs; + description | inbytes | result | errorat | error +------------------------------------------------------+----------------------+----------+----------------------+------------------------------------------------------------------------------------------------------- + valid, pure ASCII | \x666f6f | \x666f6f | | + valid, extra latin chars | \xc3a4c3b6 | \xe4f6 | | + valid, cyrillic | \xd184d0bed0be | \x | \xd184d0bed0be | character with byte sequence 0xd1 0x84 in encoding "UTF8" has no equivalent in encoding "LATIN5" + valid, kanji/Chinese | \x666f6fe8b1a1 | \x666f6f | \xe8b1a1 | character with byte sequence 0xe8 0xb1 0xa1 in encoding "UTF8" has no equivalent in encoding "LATIN5" + valid, two chars that combine to one in EUC_JIS_2004 | \xe382abe3829a | \x | \xe382abe3829a | character with byte sequence 0xe3 0x82 0xab in encoding "UTF8" has no equivalent in encoding "LATIN5" + only first half of combined char in EUC_JIS_2004 | \xe382ab | \x | \xe382ab | character with byte sequence 0xe3 0x82 0xab in encoding "UTF8" has no equivalent in encoding "LATIN5" + incomplete 
combination when converted EUC_JIS_2004 | \xe382abe382 | \x | \xe382abe382 | character with byte sequence 0xe3 0x82 0xab in encoding "UTF8" has no equivalent in encoding "LATIN5" + valid, Hangul, Korean | \xecbd94eb81bceba6ac | \x | \xecbd94eb81bceba6ac | character with byte sequence 0xec 0xbd 0x94 in encoding "UTF8" has no equivalent in encoding "LATIN5" + valid, needs mapping function to convert to GB18030 | \x666f6fefa8aa | \x666f6f | \xefa8aa | character with byte sequence 0xef 0xa8 0xaa in encoding "UTF8" has no equivalent in encoding "LATIN5" + invalid byte sequence | \x66e8b1ff6f6f | \x66 | \xe8b1ff6f6f | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0xff + invalid, NUL byte | \x66006f | \x66 | \x006f | invalid byte sequence for encoding "UTF8": 0x00 + invalid, NUL byte | \x666f6fe8b100 | \x666f6f | \xe8b100 | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0x00 + incomplete character at end | \x666f6fe8b1 | \x666f6f | \xe8b1 | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 +(13 rows) + +select description, inbytes, (test_conv(inbytes, 'utf8', 'koi8r')).* from utf8_inputs; + description | inbytes | result | errorat | error +------------------------------------------------------+----------------------+----------+----------------------+------------------------------------------------------------------------------------------------------ + valid, pure ASCII | \x666f6f | \x666f6f | | + valid, extra latin chars | \xc3a4c3b6 | \x | \xc3a4c3b6 | character with byte sequence 0xc3 0xa4 in encoding "UTF8" has no equivalent in encoding "KOI8R" + valid, cyrillic | \xd184d0bed0be | \xc6cfcf | | + valid, kanji/Chinese | \x666f6fe8b1a1 | \x666f6f | \xe8b1a1 | character with byte sequence 0xe8 0xb1 0xa1 in encoding "UTF8" has no equivalent in encoding "KOI8R" + valid, two chars that combine to one in EUC_JIS_2004 | \xe382abe3829a | \x | \xe382abe3829a | character with byte sequence 0xe3 0x82 0xab in encoding "UTF8" has no equivalent in encoding "KOI8R" + 
only first half of combined char in EUC_JIS_2004 | \xe382ab | \x | \xe382ab | character with byte sequence 0xe3 0x82 0xab in encoding "UTF8" has no equivalent in encoding "KOI8R" + incomplete combination when converted EUC_JIS_2004 | \xe382abe382 | \x | \xe382abe382 | character with byte sequence 0xe3 0x82 0xab in encoding "UTF8" has no equivalent in encoding "KOI8R" + valid, Hangul, Korean | \xecbd94eb81bceba6ac | \x | \xecbd94eb81bceba6ac | character with byte sequence 0xec 0xbd 0x94 in encoding "UTF8" has no equivalent in encoding "KOI8R" + valid, needs mapping function to convert to GB18030 | \x666f6fefa8aa | \x666f6f | \xefa8aa | character with byte sequence 0xef 0xa8 0xaa in encoding "UTF8" has no equivalent in encoding "KOI8R" + invalid byte sequence | \x66e8b1ff6f6f | \x66 | \xe8b1ff6f6f | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0xff + invalid, NUL byte | \x66006f | \x66 | \x006f | invalid byte sequence for encoding "UTF8": 0x00 + invalid, NUL byte | \x666f6fe8b100 | \x666f6f | \xe8b100 | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0x00 + incomplete character at end | \x666f6fe8b1 | \x666f6f | \xe8b1 | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 +(13 rows) + +select description, inbytes, (test_conv(inbytes, 'utf8', 'gb18030')).* from utf8_inputs; + description | inbytes | result | errorat | error +------------------------------------------------------+----------------------+----------------------------+--------------+----------------------------------------------------------- + valid, pure ASCII | \x666f6f | \x666f6f | | + valid, extra latin chars | \xc3a4c3b6 | \x81308a3181308b32 | | + valid, cyrillic | \xd184d0bed0be | \xa7e6a7e0a7e0 | | + valid, kanji/Chinese | \x666f6fe8b1a1 | \x666f6fcff3 | | + valid, two chars that combine to one in EUC_JIS_2004 | \xe382abe3829a | \xa5ab8139a732 | | + only first half of combined char in EUC_JIS_2004 | \xe382ab | \xa5ab | | + incomplete combination when converted EUC_JIS_2004 | 
\xe382abe382 | \xa5ab | \xe382 | invalid byte sequence for encoding "UTF8": 0xe3 0x82 + valid, Hangul, Korean | \xecbd94eb81bceba6ac | \x8334e5398238c4338330b335 | | + valid, needs mapping function to convert to GB18030 | \x666f6fefa8aa | \x666f6f84309c38 | | + invalid byte sequence | \x66e8b1ff6f6f | \x66 | \xe8b1ff6f6f | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0xff + invalid, NUL byte | \x66006f | \x66 | \x006f | invalid byte sequence for encoding "UTF8": 0x00 + invalid, NUL byte | \x666f6fe8b100 | \x666f6f | \xe8b100 | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 0x00 + incomplete character at end | \x666f6fe8b1 | \x666f6f | \xe8b1 | invalid byte sequence for encoding "UTF8": 0xe8 0xb1 +(13 rows) + +-- +-- EUC_JIS_2004 +-- +CREATE TABLE euc_jis_2004_inputs (inbytes bytea, description text); +insert into euc_jis_2004_inputs values + ('\x666f6f', 'valid, pure ASCII'), + ('\x666f6fbedd', 'valid'), + ('\xa5f7', 'valid, translates to two UTF-8 chars '), + ('\xbeddbe', 'incomplete char '), + ('\x666f6f00bedd', 'invalid, NUL byte'), + ('\x666f6fbe00dd', 'invalid, NUL byte'), + ('\x666f6fbedd00', 'invalid, NUL byte'), + ('\xbe04', 'invalid byte sequence'); +-- Test EUC_JIS_2004 verification +select description, inbytes, (test_conv(inbytes, 'euc_jis_2004', 'euc_jis_2004')).* from euc_jis_2004_inputs; + description | inbytes | result | errorat | error +---------------------------------------+----------------+--------------+----------+-------------------------------------------------------------- + valid, pure ASCII | \x666f6f | \x666f6f | | + valid | \x666f6fbedd | \x666f6fbedd | | + valid, translates to two UTF-8 chars | \xa5f7 | \xa5f7 | | + incomplete char | \xbeddbe | \xbedd | \xbe | invalid byte sequence for encoding "EUC_JIS_2004": 0xbe + invalid, NUL byte | \x666f6f00bedd | \x666f6f | \x00bedd | invalid byte sequence for encoding "EUC_JIS_2004": 0x00 + invalid, NUL byte | \x666f6fbe00dd | \x666f6f | \xbe00dd | invalid byte sequence for encoding 
"EUC_JIS_2004": 0xbe 0x00 + invalid, NUL byte | \x666f6fbedd00 | \x666f6fbedd | \x00 | invalid byte sequence for encoding "EUC_JIS_2004": 0x00 + invalid byte sequence | \xbe04 | \x | \xbe04 | invalid byte sequence for encoding "EUC_JIS_2004": 0xbe 0x04 +(8 rows) + +-- Test conversions from EUC_JIS_2004 +select description, inbytes, (test_conv(inbytes, 'euc_jis_2004', 'utf8')).* from euc_jis_2004_inputs; + description | inbytes | result | errorat | error +---------------------------------------+----------------+----------------+----------+-------------------------------------------------------------- + valid, pure ASCII | \x666f6f | \x666f6f | | + valid | \x666f6fbedd | \x666f6fe8b1a1 | | + valid, translates to two UTF-8 chars | \xa5f7 | \xe382abe3829a | | + incomplete char | \xbeddbe | \xe8b1a1 | \xbe | invalid byte sequence for encoding "EUC_JIS_2004": 0xbe + invalid, NUL byte | \x666f6f00bedd | \x666f6f | \x00bedd | invalid byte sequence for encoding "EUC_JIS_2004": 0x00 + invalid, NUL byte | \x666f6fbe00dd | \x666f6f | \xbe00dd | invalid byte sequence for encoding "EUC_JIS_2004": 0xbe 0x00 + invalid, NUL byte | \x666f6fbedd00 | \x666f6fe8b1a1 | \x00 | invalid byte sequence for encoding "EUC_JIS_2004": 0x00 + invalid byte sequence | \xbe04 | \x | \xbe04 | invalid byte sequence for encoding "EUC_JIS_2004": 0xbe 0x04 +(8 rows) + +-- +-- SHIFT-JIS-2004 +-- +CREATE TABLE shiftjis2004_inputs (inbytes bytea, description text); +insert into shiftjis2004_inputs values + ('\x666f6f', 'valid, pure ASCII'), + ('\x666f6f8fdb', 'valid'), + ('\x666f6f81c0', 'valid, no translation to UTF-8'), + ('\x666f6f82f5', 'valid, translates to two UTF-8 chars '), + ('\x666f6f8fdb8f', 'incomplete char '), + ('\x666f6f820a', 'incomplete char, followed by newline '), + ('\x666f6f008fdb', 'invalid, NUL byte'), + ('\x666f6f8f00db', 'invalid, NUL byte'), + ('\x666f6f8fdb00', 'invalid, NUL byte'); +-- Test SHIFT-JIS-2004 verification +select description, inbytes, (test_conv(inbytes, 
'shiftjis2004', 'shiftjis2004')).* from shiftjis2004_inputs; + description | inbytes | result | errorat | error +---------------------------------------+----------------+--------------+----------+---------------------------------------------------------------- + valid, pure ASCII | \x666f6f | \x666f6f | | + valid | \x666f6f8fdb | \x666f6f8fdb | | + valid, no translation to UTF-8 | \x666f6f81c0 | \x666f6f81c0 | | + valid, translates to two UTF-8 chars | \x666f6f82f5 | \x666f6f82f5 | | + incomplete char | \x666f6f8fdb8f | \x666f6f8fdb | \x8f | invalid byte sequence for encoding "SHIFT_JIS_2004": 0x8f + incomplete char, followed by newline | \x666f6f820a | \x666f6f | \x820a | invalid byte sequence for encoding "SHIFT_JIS_2004": 0x82 0x0a + invalid, NUL byte | \x666f6f008fdb | \x666f6f | \x008fdb | invalid byte sequence for encoding "SHIFT_JIS_2004": 0x00 + invalid, NUL byte | \x666f6f8f00db | \x666f6f | \x8f00db | invalid byte sequence for encoding "SHIFT_JIS_2004": 0x8f 0x00 + invalid, NUL byte | \x666f6f8fdb00 | \x666f6f8fdb | \x00 | invalid byte sequence for encoding "SHIFT_JIS_2004": 0x00 +(9 rows) + +-- Test conversions from SHIFT-JIS-2004 +select description, inbytes, (test_conv(inbytes, 'shiftjis2004', 'utf8')).* from shiftjis2004_inputs; + description | inbytes | result | errorat | error +---------------------------------------+----------------+----------------------+----------+---------------------------------------------------------------- + valid, pure ASCII | \x666f6f | \x666f6f | | + valid | \x666f6f8fdb | \x666f6fe8b1a1 | | + valid, no translation to UTF-8 | \x666f6f81c0 | \x666f6fe28a84 | | + valid, translates to two UTF-8 chars | \x666f6f82f5 | \x666f6fe3818be3829a | | + incomplete char | \x666f6f8fdb8f | \x666f6fe8b1a1 | \x8f | invalid byte sequence for encoding "SHIFT_JIS_2004": 0x8f + incomplete char, followed by newline | \x666f6f820a | \x666f6f | \x820a | invalid byte sequence for encoding "SHIFT_JIS_2004": 0x82 0x0a + invalid, NUL byte | 
\x666f6f008fdb | \x666f6f | \x008fdb | invalid byte sequence for encoding "SHIFT_JIS_2004": 0x00 + invalid, NUL byte | \x666f6f8f00db | \x666f6f | \x8f00db | invalid byte sequence for encoding "SHIFT_JIS_2004": 0x8f 0x00 + invalid, NUL byte | \x666f6f8fdb00 | \x666f6fe8b1a1 | \x00 | invalid byte sequence for encoding "SHIFT_JIS_2004": 0x00 +(9 rows) + +select description, inbytes, (test_conv(inbytes, 'shiftjis2004', 'euc_jis_2004')).* from shiftjis2004_inputs; + description | inbytes | result | errorat | error +---------------------------------------+----------------+--------------+----------+---------------------------------------------------------------- + valid, pure ASCII | \x666f6f | \x666f6f | | + valid | \x666f6f8fdb | \x666f6fbedd | | + valid, no translation to UTF-8 | \x666f6f81c0 | \x666f6fa2c2 | | + valid, translates to two UTF-8 chars | \x666f6f82f5 | \x666f6fa4f7 | | + incomplete char | \x666f6f8fdb8f | \x666f6fbedd | \x8f | invalid byte sequence for encoding "SHIFT_JIS_2004": 0x8f + incomplete char, followed by newline | \x666f6f820a | \x666f6f | \x820a | invalid byte sequence for encoding "SHIFT_JIS_2004": 0x82 0x0a + invalid, NUL byte | \x666f6f008fdb | \x666f6f | \x008fdb | invalid byte sequence for encoding "SHIFT_JIS_2004": 0x00 + invalid, NUL byte | \x666f6f8f00db | \x666f6f | \x8f00db | invalid byte sequence for encoding "SHIFT_JIS_2004": 0x8f 0x00 + invalid, NUL byte | \x666f6f8fdb00 | \x666f6fbedd | \x00 | invalid byte sequence for encoding "SHIFT_JIS_2004": 0x00 +(9 rows) + +-- +-- GB18030 +-- +CREATE TABLE gb18030_inputs (inbytes bytea, description text); +insert into gb18030_inputs values + ('\x666f6f', 'valid, pure ASCII'), + ('\x666f6fcff3', 'valid'), + ('\x666f6f8431a530', 'valid, no translation to UTF-8'), + ('\x666f6f84309c38', 'valid, translates to UTF-8 by mapping function'), + ('\x666f6f84309c', 'incomplete char '), + ('\x666f6f84309c0a', 'incomplete char, followed by newline '), + ('\x666f6f84309c3800', 'invalid, NUL byte'), + 
('\x666f6f84309c0038', 'invalid, NUL byte'); +-- Test GB18030 verification +select description, inbytes, (test_conv(inbytes, 'gb18030', 'gb18030')).* from gb18030_inputs; + description | inbytes | result | errorat | error +------------------------------------------------+--------------------+------------------+--------------+------------------------------------------------------------------- + valid, pure ASCII | \x666f6f | \x666f6f | | + valid | \x666f6fcff3 | \x666f6fcff3 | | + valid, no translation to UTF-8 | \x666f6f8431a530 | \x666f6f8431a530 | | + valid, translates to UTF-8 by mapping function | \x666f6f84309c38 | \x666f6f84309c38 | | + incomplete char | \x666f6f84309c | \x666f6f | \x84309c | invalid byte sequence for encoding "GB18030": 0x84 0x30 0x9c + incomplete char, followed by newline | \x666f6f84309c0a | \x666f6f | \x84309c0a | invalid byte sequence for encoding "GB18030": 0x84 0x30 0x9c 0x0a + invalid, NUL byte | \x666f6f84309c3800 | \x666f6f84309c38 | \x00 | invalid byte sequence for encoding "GB18030": 0x00 + invalid, NUL byte | \x666f6f84309c0038 | \x666f6f | \x84309c0038 | invalid byte sequence for encoding "GB18030": 0x84 0x30 0x9c 0x00 +(8 rows) + +-- Test conversions from GB18030 +select description, inbytes, (test_conv(inbytes, 'gb18030', 'utf8')).* from gb18030_inputs; + description | inbytes | result | errorat | error +------------------------------------------------+--------------------+----------------+--------------+------------------------------------------------------------------------------------------------------------- + valid, pure ASCII | \x666f6f | \x666f6f | | + valid | \x666f6fcff3 | \x666f6fe8b1a1 | | + valid, no translation to UTF-8 | \x666f6f8431a530 | \x666f6f | \x8431a530 | character with byte sequence 0x84 0x31 0xa5 0x30 in encoding "GB18030" has no equivalent in encoding "UTF8" + valid, translates to UTF-8 by mapping function | \x666f6f84309c38 | \x666f6fefa8aa | | + incomplete char | \x666f6f84309c | \x666f6f | \x84309c 
| invalid byte sequence for encoding "GB18030": 0x84 0x30 0x9c + incomplete char, followed by newline | \x666f6f84309c0a | \x666f6f | \x84309c0a | invalid byte sequence for encoding "GB18030": 0x84 0x30 0x9c 0x0a + invalid, NUL byte | \x666f6f84309c3800 | \x666f6fefa8aa | \x00 | invalid byte sequence for encoding "GB18030": 0x00 + invalid, NUL byte | \x666f6f84309c0038 | \x666f6f | \x84309c0038 | invalid byte sequence for encoding "GB18030": 0x84 0x30 0x9c 0x00 +(8 rows) + +-- +-- ISO-8859-5 +-- +CREATE TABLE iso8859_5_inputs (inbytes bytea, description text); +insert into iso8859_5_inputs values + ('\x666f6f', 'valid, pure ASCII'), + ('\xe4dede', 'valid'), + ('\x00', 'invalid, NUL byte'), + ('\xe400dede', 'invalid, NUL byte'), + ('\xe4dede00', 'invalid, NUL byte'); +-- Test ISO-8859-5 verification +select description, inbytes, (test_conv(inbytes, 'iso8859-5', 'iso8859-5')).* from iso8859_5_inputs; + description | inbytes | result | errorat | error +-------------------+------------+----------+----------+------------------------------------------------------- + valid, pure ASCII | \x666f6f | \x666f6f | | + valid | \xe4dede | \xe4dede | | + invalid, NUL byte | \x00 | \x | \x00 | invalid byte sequence for encoding "ISO_8859_5": 0x00 + invalid, NUL byte | \xe400dede | \xe4 | \x00dede | invalid byte sequence for encoding "ISO_8859_5": 0x00 + invalid, NUL byte | \xe4dede00 | \xe4dede | \x00 | invalid byte sequence for encoding "ISO_8859_5": 0x00 +(5 rows) + +-- Test conversions from ISO-8859-5 +select description, inbytes, (test_conv(inbytes, 'iso8859-5', 'utf8')).* from iso8859_5_inputs; + description | inbytes | result | errorat | error +-------------------+------------+----------------+----------+------------------------------------------------------- + valid, pure ASCII | \x666f6f | \x666f6f | | + valid | \xe4dede | \xd184d0bed0be | | + invalid, NUL byte | \x00 | \x | \x00 | invalid byte sequence for encoding "ISO_8859_5": 0x00 + invalid, NUL byte | \xe400dede | 
\xd184 | \x00dede | invalid byte sequence for encoding "ISO_8859_5": 0x00 + invalid, NUL byte | \xe4dede00 | \xd184d0bed0be | \x00 | invalid byte sequence for encoding "ISO_8859_5": 0x00 +(5 rows) + +select description, inbytes, (test_conv(inbytes, 'iso8859-5', 'koi8r')).* from iso8859_5_inputs; + description | inbytes | result | errorat | error +-------------------+------------+----------+----------+------------------------------------------------------- + valid, pure ASCII | \x666f6f | \x666f6f | | + valid | \xe4dede | \xc6cfcf | | + invalid, NUL byte | \x00 | \x | \x00 | invalid byte sequence for encoding "ISO_8859_5": 0x00 + invalid, NUL byte | \xe400dede | \xc6 | \x00dede | invalid byte sequence for encoding "ISO_8859_5": 0x00 + invalid, NUL byte | \xe4dede00 | \xc6cfcf | \x00 | invalid byte sequence for encoding "ISO_8859_5": 0x00 +(5 rows) + +select description, inbytes, (test_conv(inbytes, 'iso8859_5', 'mule_internal')).* from iso8859_5_inputs; + description | inbytes | result | errorat | error +-------------------+------------+----------------+----------+------------------------------------------------------- + valid, pure ASCII | \x666f6f | \x666f6f | | + valid | \xe4dede | \x8bc68bcf8bcf | | + invalid, NUL byte | \x00 | \x | \x00 | invalid byte sequence for encoding "ISO_8859_5": 0x00 + invalid, NUL byte | \xe400dede | \x8bc6 | \x00dede | invalid byte sequence for encoding "ISO_8859_5": 0x00 + invalid, NUL byte | \xe4dede00 | \x8bc68bcf8bcf | \x00 | invalid byte sequence for encoding "ISO_8859_5": 0x00 +(5 rows) + +-- +-- Big5 +-- +CREATE TABLE big5_inputs (inbytes bytea, description text); +insert into big5_inputs values + ('\x666f6f', 'valid, pure ASCII'), + ('\x666f6fb648', 'valid'), + ('\x666f6fa27f', 'valid, no translation to UTF-8'), + ('\x666f6fb60048', 'invalid, NUL byte'), + ('\x666f6fb64800', 'invalid, NUL byte'); +-- Test Big5 verification +select description, inbytes, (test_conv(inbytes, 'big5', 'big5')).* from big5_inputs; + description | 
inbytes | result | errorat | error +--------------------------------+----------------+--------------+----------+------------------------------------------------------ + valid, pure ASCII | \x666f6f | \x666f6f | | + valid | \x666f6fb648 | \x666f6fb648 | | + valid, no translation to UTF-8 | \x666f6fa27f | \x666f6fa27f | | + invalid, NUL byte | \x666f6fb60048 | \x666f6f | \xb60048 | invalid byte sequence for encoding "BIG5": 0xb6 0x00 + invalid, NUL byte | \x666f6fb64800 | \x666f6fb648 | \x00 | invalid byte sequence for encoding "BIG5": 0x00 +(5 rows) + +-- Test conversions from Big5 +select description, inbytes, (test_conv(inbytes, 'big5', 'utf8')).* from big5_inputs; + description | inbytes | result | errorat | error +--------------------------------+----------------+----------------+----------+------------------------------------------------------------------------------------------------ + valid, pure ASCII | \x666f6f | \x666f6f | | + valid | \x666f6fb648 | \x666f6fe8b1a1 | | + valid, no translation to UTF-8 | \x666f6fa27f | \x666f6f | \xa27f | character with byte sequence 0xa2 0x7f in encoding "BIG5" has no equivalent in encoding "UTF8" + invalid, NUL byte | \x666f6fb60048 | \x666f6f | \xb60048 | invalid byte sequence for encoding "BIG5": 0xb6 0x00 + invalid, NUL byte | \x666f6fb64800 | \x666f6fe8b1a1 | \x00 | invalid byte sequence for encoding "BIG5": 0x00 +(5 rows) + +select description, inbytes, (test_conv(inbytes, 'big5', 'mule_internal')).* from big5_inputs; + description | inbytes | result | errorat | error +--------------------------------+----------------+----------------+----------+------------------------------------------------------ + valid, pure ASCII | \x666f6f | \x666f6f | | + valid | \x666f6fb648 | \x666f6f95e2af | | + valid, no translation to UTF-8 | \x666f6fa27f | \x666f6f95a3c1 | | + invalid, NUL byte | \x666f6fb60048 | \x666f6f | \xb60048 | invalid byte sequence for encoding "BIG5": 0xb6 0x00 + invalid, NUL byte | \x666f6fb64800 | 
\x666f6f95e2af | \x00 | invalid byte sequence for encoding "BIG5": 0x00 +(5 rows) + +-- +-- MULE_INTERNAL +-- +CREATE TABLE mic_inputs (inbytes bytea, description text); +insert into mic_inputs values + ('\x666f6f', 'valid, pure ASCII'), + ('\x8bc68bcf8bcf', 'valid (in KOI8R)'), + ('\x8bc68bcf8b', 'invalid,incomplete char'), + ('\x92bedd', 'valid (in SHIFT_JIS)'), + ('\x92be', 'invalid, incomplete char)'), + ('\x666f6f95a3c1', 'valid (in Big5)'), + ('\x666f6f95a3', 'invalid, incomplete char'), + ('\x9200bedd', 'invalid, NUL byte'), + ('\x92bedd00', 'invalid, NUL byte'), + ('\x8b00c68bcf8bcf', 'invalid, NUL byte'); +-- Test MULE_INTERNAL verification +select description, inbytes, (test_conv(inbytes, 'mule_internal', 'mule_internal')).* from mic_inputs; + description | inbytes | result | errorat | error +---------------------------+------------------+----------------+------------------+-------------------------------------------------------------------- + valid, pure ASCII | \x666f6f | \x666f6f | | + valid (in KOI8R) | \x8bc68bcf8bcf | \x8bc68bcf8bcf | | + invalid,incomplete char | \x8bc68bcf8b | \x8bc68bcf | \x8b | invalid byte sequence for encoding "MULE_INTERNAL": 0x8b + valid (in SHIFT_JIS) | \x92bedd | \x92bedd | | + invalid, incomplete char) | \x92be | \x | \x92be | invalid byte sequence for encoding "MULE_INTERNAL": 0x92 0xbe + valid (in Big5) | \x666f6f95a3c1 | \x666f6f95a3c1 | | + invalid, incomplete char | \x666f6f95a3 | \x666f6f | \x95a3 | invalid byte sequence for encoding "MULE_INTERNAL": 0x95 0xa3 + invalid, NUL byte | \x9200bedd | \x | \x9200bedd | invalid byte sequence for encoding "MULE_INTERNAL": 0x92 0x00 0xbe + invalid, NUL byte | \x92bedd00 | \x92bedd | \x00 | invalid byte sequence for encoding "MULE_INTERNAL": 0x00 + invalid, NUL byte | \x8b00c68bcf8bcf | \x | \x8b00c68bcf8bcf | invalid byte sequence for encoding "MULE_INTERNAL": 0x8b 0x00 +(10 rows) + +-- Test conversions from MULE_INTERNAL +select description, inbytes, (test_conv(inbytes, 
'mule_internal', 'koi8r')).* from mic_inputs; + description | inbytes | result | errorat | error +---------------------------+------------------+----------+------------------+--------------------------------------------------------------------------------------------------------------- + valid, pure ASCII | \x666f6f | \x666f6f | | + valid (in KOI8R) | \x8bc68bcf8bcf | \xc6cfcf | | + invalid,incomplete char | \x8bc68bcf8b | \xc6cf | \x8b | invalid byte sequence for encoding "MULE_INTERNAL": 0x8b + valid (in SHIFT_JIS) | \x92bedd | \x | \x92bedd | character with byte sequence 0x92 0xbe 0xdd in encoding "MULE_INTERNAL" has no equivalent in encoding "KOI8R" + invalid, incomplete char) | \x92be | \x | \x92be | invalid byte sequence for encoding "MULE_INTERNAL": 0x92 0xbe + valid (in Big5) | \x666f6f95a3c1 | \x666f6f | \x95a3c1 | character with byte sequence 0x95 0xa3 0xc1 in encoding "MULE_INTERNAL" has no equivalent in encoding "KOI8R" + invalid, incomplete char | \x666f6f95a3 | \x666f6f | \x95a3 | invalid byte sequence for encoding "MULE_INTERNAL": 0x95 0xa3 + invalid, NUL byte | \x9200bedd | \x | \x9200bedd | character with byte sequence 0x92 0x00 0xbe in encoding "MULE_INTERNAL" has no equivalent in encoding "KOI8R" + invalid, NUL byte | \x92bedd00 | \x | \x92bedd00 | character with byte sequence 0x92 0xbe 0xdd in encoding "MULE_INTERNAL" has no equivalent in encoding "KOI8R" + invalid, NUL byte | \x8b00c68bcf8bcf | \x | \x8b00c68bcf8bcf | character with byte sequence 0x8b 0x00 in encoding "MULE_INTERNAL" has no equivalent in encoding "KOI8R" +(10 rows) + +select description, inbytes, (test_conv(inbytes, 'mule_internal', 'iso8859-5')).* from mic_inputs; + description | inbytes | result | errorat | error +---------------------------+------------------+----------+------------------+-------------------------------------------------------------------------------------------------------------------- + valid, pure ASCII | \x666f6f | \x666f6f | | + valid (in KOI8R) | 
\x8bc68bcf8bcf | \xe4dede | | + invalid,incomplete char | \x8bc68bcf8b | \xe4de | \x8b | invalid byte sequence for encoding "MULE_INTERNAL": 0x8b + valid (in SHIFT_JIS) | \x92bedd | \x | \x92bedd | character with byte sequence 0x92 0xbe 0xdd in encoding "MULE_INTERNAL" has no equivalent in encoding "ISO_8859_5" + invalid, incomplete char) | \x92be | \x | \x92be | invalid byte sequence for encoding "MULE_INTERNAL": 0x92 0xbe + valid (in Big5) | \x666f6f95a3c1 | \x666f6f | \x95a3c1 | character with byte sequence 0x95 0xa3 0xc1 in encoding "MULE_INTERNAL" has no equivalent in encoding "ISO_8859_5" + invalid, incomplete char | \x666f6f95a3 | \x666f6f | \x95a3 | invalid byte sequence for encoding "MULE_INTERNAL": 0x95 0xa3 + invalid, NUL byte | \x9200bedd | \x | \x9200bedd | character with byte sequence 0x92 0x00 0xbe in encoding "MULE_INTERNAL" has no equivalent in encoding "ISO_8859_5" + invalid, NUL byte | \x92bedd00 | \x | \x92bedd00 | character with byte sequence 0x92 0xbe 0xdd in encoding "MULE_INTERNAL" has no equivalent in encoding "ISO_8859_5" + invalid, NUL byte | \x8b00c68bcf8bcf | \x | \x8b00c68bcf8bcf | character with byte sequence 0x8b 0x00 in encoding "MULE_INTERNAL" has no equivalent in encoding "ISO_8859_5" +(10 rows) + +select description, inbytes, (test_conv(inbytes, 'mule_internal', 'sjis')).* from mic_inputs; + description | inbytes | result | errorat | error +---------------------------+------------------+----------+------------------+-------------------------------------------------------------------------------------------------------------- + valid, pure ASCII | \x666f6f | \x666f6f | | + valid (in KOI8R) | \x8bc68bcf8bcf | \x | \x8bc68bcf8bcf | character with byte sequence 0x8b 0xc6 in encoding "MULE_INTERNAL" has no equivalent in encoding "SJIS" + invalid,incomplete char | \x8bc68bcf8b | \x | \x8bc68bcf8b | character with byte sequence 0x8b 0xc6 in encoding "MULE_INTERNAL" has no equivalent in encoding "SJIS" + valid (in SHIFT_JIS) | \x92bedd | 
\x8fdb | | + invalid, incomplete char) | \x92be | \x | \x92be | invalid byte sequence for encoding "MULE_INTERNAL": 0x92 0xbe + valid (in Big5) | \x666f6f95a3c1 | \x666f6f | \x95a3c1 | character with byte sequence 0x95 0xa3 0xc1 in encoding "MULE_INTERNAL" has no equivalent in encoding "SJIS" + invalid, incomplete char | \x666f6f95a3 | \x666f6f | \x95a3 | invalid byte sequence for encoding "MULE_INTERNAL": 0x95 0xa3 + invalid, NUL byte | \x9200bedd | \x | \x9200bedd | invalid byte sequence for encoding "MULE_INTERNAL": 0x92 0x00 0xbe + invalid, NUL byte | \x92bedd00 | \x8fdb | \x00 | invalid byte sequence for encoding "MULE_INTERNAL": 0x00 + invalid, NUL byte | \x8b00c68bcf8bcf | \x | \x8b00c68bcf8bcf | invalid byte sequence for encoding "MULE_INTERNAL": 0x8b 0x00 +(10 rows) + +select description, inbytes, (test_conv(inbytes, 'mule_internal', 'big5')).* from mic_inputs; + description | inbytes | result | errorat | error +---------------------------+------------------+--------------+------------------+-------------------------------------------------------------------------------------------------------------- + valid, pure ASCII | \x666f6f | \x666f6f | | + valid (in KOI8R) | \x8bc68bcf8bcf | \x | \x8bc68bcf8bcf | character with byte sequence 0x8b 0xc6 in encoding "MULE_INTERNAL" has no equivalent in encoding "BIG5" + invalid,incomplete char | \x8bc68bcf8b | \x | \x8bc68bcf8b | character with byte sequence 0x8b 0xc6 in encoding "MULE_INTERNAL" has no equivalent in encoding "BIG5" + valid (in SHIFT_JIS) | \x92bedd | \x | \x92bedd | character with byte sequence 0x92 0xbe 0xdd in encoding "MULE_INTERNAL" has no equivalent in encoding "BIG5" + invalid, incomplete char) | \x92be | \x | \x92be | invalid byte sequence for encoding "MULE_INTERNAL": 0x92 0xbe + valid (in Big5) | \x666f6f95a3c1 | \x666f6fa2a1 | | + invalid, incomplete char | \x666f6f95a3 | \x666f6f | \x95a3 | invalid byte sequence for encoding "MULE_INTERNAL": 0x95 0xa3 + invalid, NUL byte | \x9200bedd | \x | 
\x9200bedd | invalid byte sequence for encoding "MULE_INTERNAL": 0x92 0x00 0xbe + invalid, NUL byte | \x92bedd00 | \x | \x92bedd00 | character with byte sequence 0x92 0xbe 0xdd in encoding "MULE_INTERNAL" has no equivalent in encoding "BIG5" + invalid, NUL byte | \x8b00c68bcf8bcf | \x | \x8b00c68bcf8bcf | invalid byte sequence for encoding "MULE_INTERNAL": 0x8b 0x00 +(10 rows) + +select description, inbytes, (test_conv(inbytes, 'mule_internal', 'euc_jp')).* from mic_inputs; + description | inbytes | result | errorat | error +---------------------------+------------------+----------+------------------+---------------------------------------------------------------------------------------------------------------- + valid, pure ASCII | \x666f6f | \x666f6f | | + valid (in KOI8R) | \x8bc68bcf8bcf | \x | \x8bc68bcf8bcf | character with byte sequence 0x8b 0xc6 in encoding "MULE_INTERNAL" has no equivalent in encoding "EUC_JP" + invalid,incomplete char | \x8bc68bcf8b | \x | \x8bc68bcf8b | character with byte sequence 0x8b 0xc6 in encoding "MULE_INTERNAL" has no equivalent in encoding "EUC_JP" + valid (in SHIFT_JIS) | \x92bedd | \xbedd | | + invalid, incomplete char) | \x92be | \x | \x92be | invalid byte sequence for encoding "MULE_INTERNAL": 0x92 0xbe + valid (in Big5) | \x666f6f95a3c1 | \x666f6f | \x95a3c1 | character with byte sequence 0x95 0xa3 0xc1 in encoding "MULE_INTERNAL" has no equivalent in encoding "EUC_JP" + invalid, incomplete char | \x666f6f95a3 | \x666f6f | \x95a3 | invalid byte sequence for encoding "MULE_INTERNAL": 0x95 0xa3 + invalid, NUL byte | \x9200bedd | \x | \x9200bedd | invalid byte sequence for encoding "MULE_INTERNAL": 0x92 0x00 0xbe + invalid, NUL byte | \x92bedd00 | \xbedd | \x00 | invalid byte sequence for encoding "MULE_INTERNAL": 0x00 + invalid, NUL byte | \x8b00c68bcf8bcf | \x | \x8b00c68bcf8bcf | invalid byte sequence for encoding "MULE_INTERNAL": 0x8b 0x00 +(10 rows) + diff --git a/src/test/regress/expected/opr_sanity.out 
b/src/test/regress/expected/opr_sanity.out index ef4b4444b90..fa26bf76104 100644 --- a/src/test/regress/expected/opr_sanity.out +++ b/src/test/regress/expected/opr_sanity.out @@ -1052,13 +1052,14 @@ WHERE p1.conproc = 0 OR SELECT p.oid, p.proname, c.oid, c.conname FROM pg_proc p, pg_conversion c WHERE p.oid = c.conproc AND - (p.prorettype != 'void'::regtype OR p.proretset OR - p.pronargs != 5 OR + (p.prorettype != 'int4'::regtype OR p.proretset OR + p.pronargs != 6 OR p.proargtypes[0] != 'int4'::regtype OR p.proargtypes[1] != 'int4'::regtype OR p.proargtypes[2] != 'cstring'::regtype OR p.proargtypes[3] != 'internal'::regtype OR - p.proargtypes[4] != 'int4'::regtype); + p.proargtypes[4] != 'int4'::regtype OR + p.proargtypes[5] != 'bool'::regtype); oid | proname | oid | conname -----+---------+-----+--------- (0 rows) diff --git a/src/test/regress/input/create_function_1.source b/src/test/regress/input/create_function_1.source index 412e339fcf2..6ba37fe63b6 100644 --- a/src/test/regress/input/create_function_1.source +++ b/src/test/regress/input/create_function_1.source @@ -78,6 +78,10 @@ CREATE FUNCTION test_opclass_options_func(internal) AS '@libdir@/regress@DLSUFFIX@', 'test_opclass_options_func' LANGUAGE C; +CREATE FUNCTION test_enc_conversion(bytea, name, name, bool, validlen OUT int, result OUT bytea) + AS '@libdir@/regress@DLSUFFIX@', 'test_enc_conversion' + LANGUAGE C; + -- Things that shouldn't work: CREATE FUNCTION test1 (int) RETURNS int LANGUAGE SQL diff --git a/src/test/regress/output/create_function_1.source b/src/test/regress/output/create_function_1.source index 4d78fa12289..cb38a039bf4 100644 --- a/src/test/regress/output/create_function_1.source +++ b/src/test/regress/output/create_function_1.source @@ -68,6 +68,9 @@ CREATE FUNCTION test_opclass_options_func(internal) RETURNS void AS '@libdir@/regress@DLSUFFIX@', 'test_opclass_options_func' LANGUAGE C; +CREATE FUNCTION test_enc_conversion(bytea, name, name, bool, validlen OUT int, result OUT bytea) 
+ AS '@libdir@/regress@DLSUFFIX@', 'test_enc_conversion' + LANGUAGE C; -- Things that shouldn't work: CREATE FUNCTION test1 (int) RETURNS int LANGUAGE SQL AS 'SELECT ''not an integer'';'; diff --git a/src/test/regress/regress.c b/src/test/regress/regress.c index 32ab9ed6b53..1990cbb6a13 100644 --- a/src/test/regress/regress.c +++ b/src/test/regress/regress.c @@ -23,12 +23,15 @@ #include "access/htup_details.h" #include "access/transam.h" #include "access/xact.h" +#include "catalog/namespace.h" #include "catalog/pg_operator.h" #include "catalog/pg_type.h" #include "commands/sequence.h" #include "commands/trigger.h" #include "executor/executor.h" #include "executor/spi.h" +#include "funcapi.h" +#include "mb/pg_wchar.h" #include "miscadmin.h" #include "nodes/supportnodes.h" #include "optimizer/optimizer.h" @@ -1060,3 +1063,134 @@ test_opclass_options_func(PG_FUNCTION_ARGS) { PG_RETURN_NULL(); } + +/* + * Call an encoding conversion or verification function. + * + * Arguments: + * string bytea -- string to convert + * src_enc name -- source encoding + * dest_enc name -- destination encoding + * noError bool -- if set, don't ereport() on invalid or untranslatable + * input + * + * Result is a tuple with two attributes: + * int4 -- number of input bytes successfully converted + * bytea -- converted string + */ +PG_FUNCTION_INFO_V1(test_enc_conversion); +Datum +test_enc_conversion(PG_FUNCTION_ARGS) +{ + bytea *string = PG_GETARG_BYTEA_PP(0); + char *src_encoding_name = NameStr(*PG_GETARG_NAME(1)); + int src_encoding = pg_char_to_encoding(src_encoding_name); + char *dest_encoding_name = NameStr(*PG_GETARG_NAME(2)); + int dest_encoding = pg_char_to_encoding(dest_encoding_name); + bool noError = PG_GETARG_BOOL(3); + TupleDesc tupdesc; + char *src; + char *dst; + bytea *retval; + Size srclen; + Size dstsize; + Oid proc; + int convertedbytes; + int dstlen; + Datum values[2]; + bool nulls[2]; + HeapTuple tuple; + + if (src_encoding < 0) /* a negative encoding ID is reported as an invalid encoding name */ + ereport(ERROR, 
(errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid source encoding name \"%s\"", + src_encoding_name))); + if (dest_encoding < 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid destination encoding name \"%s\"", + dest_encoding_name))); + + /* Build a tuple descriptor for our result type; a non-composite result type is an error */ + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + tupdesc = BlessTupleDesc(tupdesc); + + srclen = VARSIZE_ANY_EXHDR(string); /* input length in bytes, as passed to the verify and convert calls below */ + src = VARDATA_ANY(string); /* start of the input bytes */ + + if (src_encoding == dest_encoding) + { + /* same encoding on both sides: just check that the source string is valid */ + int oklen; + + oklen = pg_encoding_verifymbstr(src_encoding, src, srclen); /* used below as the length of the leading valid part of src */ + + if (oklen == srclen) + { + convertedbytes = oklen; + retval = string; /* fully valid: return the input bytea itself */ + } + else if (!noError) + { + report_invalid_encoding(src_encoding, src + oklen, srclen - oklen); /* NOTE(review): presumably raises an error and never returns; otherwise retval and convertedbytes would be read uninitialized below -- confirm */ + } + else + { + /* + * noError is set: build a bytea holding only the first oklen bytes of the input. + */ + Assert(oklen < srclen); + convertedbytes = oklen; + retval = (bytea *) palloc(oklen + VARHDRSZ); + SET_VARSIZE(retval, oklen + VARHDRSZ); + memcpy(VARDATA(retval), src, oklen); + } + } + else + { + proc = FindDefaultConversionProc(src_encoding, dest_encoding); + if (!OidIsValid(proc)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_FUNCTION), + errmsg("default conversion function for encoding \"%s\" to \"%s\" does not exist", + pg_encoding_to_char(src_encoding), + pg_encoding_to_char(dest_encoding)))); + + if (srclen >= (MaxAllocSize / (Size) MAX_CONVERSION_GROWTH)) /* keeps srclen * MAX_CONVERSION_GROWTH below MaxAllocSize */ + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("out of memory"), + errdetail("String of %d bytes is too long for encoding conversion.", + (int) srclen))); + + dstsize = (Size) srclen * MAX_CONVERSION_GROWTH + 1; /* worst-case output size plus one byte, presumably for a terminating NUL (strlen(dst) is used below) */ + dst = MemoryContextAlloc(CurrentMemoryContext, dstsize); + + /* perform conversion; the result is returned to the caller as the count of converted input bytes */ + convertedbytes = pg_do_encoding_conversion_buf(proc, + src_encoding, + dest_encoding, + (unsigned char *) src, srclen, + (unsigned 
char *) dst, dstsize, + noError); + dstlen = strlen(dst); /* output length is taken up to the first NUL byte in dst */ + + /* + * build bytea data type structure from the converted output. + */ + retval = (bytea *) palloc(dstlen + VARHDRSZ); + SET_VARSIZE(retval, dstlen + VARHDRSZ); + memcpy(VARDATA(retval), dst, dstlen); + + pfree(dst); /* the converted bytes were copied into retval above */ + } + + MemSet(nulls, 0, sizeof(nulls)); /* neither result attribute is NULL */ + values[0] = Int32GetDatum(convertedbytes); /* int4: number of input bytes converted */ + values[1] = PointerGetDatum(retval); /* bytea: converted (or verified) string */ + tuple = heap_form_tuple(tupdesc, values, nulls); + + PG_RETURN_DATUM(HeapTupleGetDatum(tuple)); +} diff --git a/src/test/regress/sql/conversion.sql b/src/test/regress/sql/conversion.sql index 02cf39f1ce9..ea85f20ed83 100644 --- a/src/test/regress/sql/conversion.sql +++ b/src/test/regress/sql/conversion.sql @@ -34,3 +34,188 @@ DROP CONVERSION mydef; -- RESET SESSION AUTHORIZATION; DROP USER regress_conversion_user; + +-- +-- Test built-in conversion functions. +-- + +-- Helper function to test a conversion. Uses the test_enc_conversion function +-- that was created in the create_function_1 test. +create or replace function test_conv( + input IN bytea, + src_encoding IN text, + dst_encoding IN text, + + result OUT bytea, + errorat OUT bytea, + error OUT text) +language plpgsql as +$$ +declare + validlen int; +begin + -- First try to perform the conversion with noError = false. If that errors out, + -- capture the error message, and try again with noError = true. The second call + -- should succeed and return the position of the error, return that too. 
+ begin + select * into validlen, result from test_enc_conversion(input, src_encoding, dst_encoding, false); + errorat = NULL; + error := NULL; + exception when others then + error := sqlerrm; + select * into validlen, result from test_enc_conversion(input, src_encoding, dst_encoding, true); + errorat = substr(input, validlen + 1); + end; + return; +end; +$$; + + +-- +-- UTF-8 +-- +CREATE TABLE utf8_inputs (inbytes bytea, description text); +insert into utf8_inputs values + ('\x666f6f', 'valid, pure ASCII'), + ('\xc3a4c3b6', 'valid, extra latin chars'), + ('\xd184d0bed0be', 'valid, cyrillic'), + ('\x666f6fe8b1a1', 'valid, kanji/Chinese'), + ('\xe382abe3829a', 'valid, two chars that combine to one in EUC_JIS_2004'), + ('\xe382ab', 'only first half of combined char in EUC_JIS_2004'), + ('\xe382abe382', 'incomplete combination when converted EUC_JIS_2004'), + ('\xecbd94eb81bceba6ac', 'valid, Hangul, Korean'), + ('\x666f6fefa8aa', 'valid, needs mapping function to convert to GB18030'), + ('\x66e8b1ff6f6f', 'invalid byte sequence'), + ('\x66006f', 'invalid, NUL byte'), + ('\x666f6fe8b100', 'invalid, NUL byte'), + ('\x666f6fe8b1', 'incomplete character at end'); + +-- Test UTF-8 verification +select description, (test_conv(inbytes, 'utf8', 'utf8')).* from utf8_inputs; +-- Test conversions from UTF-8 +select description, inbytes, (test_conv(inbytes, 'utf8', 'euc_jis_2004')).* from utf8_inputs; +select description, inbytes, (test_conv(inbytes, 'utf8', 'latin1')).* from utf8_inputs; +select description, inbytes, (test_conv(inbytes, 'utf8', 'latin2')).* from utf8_inputs; +select description, inbytes, (test_conv(inbytes, 'utf8', 'latin5')).* from utf8_inputs; +select description, inbytes, (test_conv(inbytes, 'utf8', 'koi8r')).* from utf8_inputs; +select description, inbytes, (test_conv(inbytes, 'utf8', 'gb18030')).* from utf8_inputs; + +-- +-- EUC_JIS_2004 +-- +CREATE TABLE euc_jis_2004_inputs (inbytes bytea, description text); +insert into euc_jis_2004_inputs values + 
('\x666f6f', 'valid, pure ASCII'), + ('\x666f6fbedd', 'valid'), + ('\xa5f7', 'valid, translates to two UTF-8 chars '), + ('\xbeddbe', 'incomplete char '), + ('\x666f6f00bedd', 'invalid, NUL byte'), + ('\x666f6fbe00dd', 'invalid, NUL byte'), + ('\x666f6fbedd00', 'invalid, NUL byte'), + ('\xbe04', 'invalid byte sequence'); + +-- Test EUC_JIS_2004 verification +select description, inbytes, (test_conv(inbytes, 'euc_jis_2004', 'euc_jis_2004')).* from euc_jis_2004_inputs; +-- Test conversions from EUC_JIS_2004 +select description, inbytes, (test_conv(inbytes, 'euc_jis_2004', 'utf8')).* from euc_jis_2004_inputs; + +-- +-- SHIFT-JIS-2004 +-- +CREATE TABLE shiftjis2004_inputs (inbytes bytea, description text); +insert into shiftjis2004_inputs values + ('\x666f6f', 'valid, pure ASCII'), + ('\x666f6f8fdb', 'valid'), + ('\x666f6f81c0', 'valid, no translation to UTF-8'), + ('\x666f6f82f5', 'valid, translates to two UTF-8 chars '), + ('\x666f6f8fdb8f', 'incomplete char '), + ('\x666f6f820a', 'incomplete char, followed by newline '), + ('\x666f6f008fdb', 'invalid, NUL byte'), + ('\x666f6f8f00db', 'invalid, NUL byte'), + ('\x666f6f8fdb00', 'invalid, NUL byte'); + +-- Test SHIFT-JIS-2004 verification +select description, inbytes, (test_conv(inbytes, 'shiftjis2004', 'shiftjis2004')).* from shiftjis2004_inputs; +-- Test conversions from SHIFT-JIS-2004 +select description, inbytes, (test_conv(inbytes, 'shiftjis2004', 'utf8')).* from shiftjis2004_inputs; +select description, inbytes, (test_conv(inbytes, 'shiftjis2004', 'euc_jis_2004')).* from shiftjis2004_inputs; + +-- +-- GB18030 +-- +CREATE TABLE gb18030_inputs (inbytes bytea, description text); +insert into gb18030_inputs values + ('\x666f6f', 'valid, pure ASCII'), + ('\x666f6fcff3', 'valid'), + ('\x666f6f8431a530', 'valid, no translation to UTF-8'), + ('\x666f6f84309c38', 'valid, translates to UTF-8 by mapping function'), + ('\x666f6f84309c', 'incomplete char '), + ('\x666f6f84309c0a', 'incomplete char, followed by newline '), + 
('\x666f6f84309c3800', 'invalid, NUL byte'), + ('\x666f6f84309c0038', 'invalid, NUL byte'); + +-- Test GB18030 verification +select description, inbytes, (test_conv(inbytes, 'gb18030', 'gb18030')).* from gb18030_inputs; +-- Test conversions from GB18030 +select description, inbytes, (test_conv(inbytes, 'gb18030', 'utf8')).* from gb18030_inputs; + + +-- +-- ISO-8859-5 +-- +CREATE TABLE iso8859_5_inputs (inbytes bytea, description text); +insert into iso8859_5_inputs values + ('\x666f6f', 'valid, pure ASCII'), + ('\xe4dede', 'valid'), + ('\x00', 'invalid, NUL byte'), + ('\xe400dede', 'invalid, NUL byte'), + ('\xe4dede00', 'invalid, NUL byte'); + +-- Test ISO-8859-5 verification +select description, inbytes, (test_conv(inbytes, 'iso8859-5', 'iso8859-5')).* from iso8859_5_inputs; +-- Test conversions from ISO-8859-5 +select description, inbytes, (test_conv(inbytes, 'iso8859-5', 'utf8')).* from iso8859_5_inputs; +select description, inbytes, (test_conv(inbytes, 'iso8859-5', 'koi8r')).* from iso8859_5_inputs; +select description, inbytes, (test_conv(inbytes, 'iso8859_5', 'mule_internal')).* from iso8859_5_inputs; + +-- +-- Big5 +-- +CREATE TABLE big5_inputs (inbytes bytea, description text); +insert into big5_inputs values + ('\x666f6f', 'valid, pure ASCII'), + ('\x666f6fb648', 'valid'), + ('\x666f6fa27f', 'valid, no translation to UTF-8'), + ('\x666f6fb60048', 'invalid, NUL byte'), + ('\x666f6fb64800', 'invalid, NUL byte'); + +-- Test Big5 verification +select description, inbytes, (test_conv(inbytes, 'big5', 'big5')).* from big5_inputs; +-- Test conversions from Big5 +select description, inbytes, (test_conv(inbytes, 'big5', 'utf8')).* from big5_inputs; +select description, inbytes, (test_conv(inbytes, 'big5', 'mule_internal')).* from big5_inputs; + +-- +-- MULE_INTERNAL +-- +CREATE TABLE mic_inputs (inbytes bytea, description text); +insert into mic_inputs values + ('\x666f6f', 'valid, pure ASCII'), + ('\x8bc68bcf8bcf', 'valid (in KOI8R)'), + ('\x8bc68bcf8b', 
'invalid,incomplete char'), + ('\x92bedd', 'valid (in SHIFT_JIS)'), + ('\x92be', 'invalid, incomplete char)'), + ('\x666f6f95a3c1', 'valid (in Big5)'), + ('\x666f6f95a3', 'invalid, incomplete char'), + ('\x9200bedd', 'invalid, NUL byte'), + ('\x92bedd00', 'invalid, NUL byte'), + ('\x8b00c68bcf8bcf', 'invalid, NUL byte'); + +-- Test MULE_INTERNAL verification +select description, inbytes, (test_conv(inbytes, 'mule_internal', 'mule_internal')).* from mic_inputs; +-- Test conversions from MULE_INTERNAL +select description, inbytes, (test_conv(inbytes, 'mule_internal', 'koi8r')).* from mic_inputs; +select description, inbytes, (test_conv(inbytes, 'mule_internal', 'iso8859-5')).* from mic_inputs; +select description, inbytes, (test_conv(inbytes, 'mule_internal', 'sjis')).* from mic_inputs; +select description, inbytes, (test_conv(inbytes, 'mule_internal', 'big5')).* from mic_inputs; +select description, inbytes, (test_conv(inbytes, 'mule_internal', 'euc_jp')).* from mic_inputs; diff --git a/src/test/regress/sql/opr_sanity.sql b/src/test/regress/sql/opr_sanity.sql index bbd3834b634..04691745981 100644 --- a/src/test/regress/sql/opr_sanity.sql +++ b/src/test/regress/sql/opr_sanity.sql @@ -556,13 +556,14 @@ WHERE p1.conproc = 0 OR SELECT p.oid, p.proname, c.oid, c.conname FROM pg_proc p, pg_conversion c WHERE p.oid = c.conproc AND - (p.prorettype != 'void'::regtype OR p.proretset OR - p.pronargs != 5 OR + (p.prorettype != 'int4'::regtype OR p.proretset OR + p.pronargs != 6 OR p.proargtypes[0] != 'int4'::regtype OR p.proargtypes[1] != 'int4'::regtype OR p.proargtypes[2] != 'cstring'::regtype OR p.proargtypes[3] != 'internal'::regtype OR - p.proargtypes[4] != 'int4'::regtype); + p.proargtypes[4] != 'int4'::regtype OR + p.proargtypes[5] != 'bool'::regtype); -- Check for conprocs that don't perform the specific conversion that -- pg_conversion alleges they do, by trying to invoke each conversion