1
0
mirror of https://github.com/postgres/postgres.git synced 2025-05-28 05:21:27 +03:00
Heikki Linnakangas ea1b99a661 Add 'noError' argument to encoding conversion functions.
With the 'noError' argument, you can try to convert a buffer without
knowing the character boundaries beforehand. The functions now need to
return the number of input bytes successfully converted.

This is a backwards-incompatible change, if you have created a custom
encoding conversion with CREATE CONVERSION. This adds a check to
pg_upgrade for that, refusing the upgrade if there are any user-defined
encoding conversions. Custom conversions are very rare; there are no
commonly used extensions that I know of that use that feature. No other
objects can depend on conversions, so if you do have one, you can fairly
easily drop it before upgrading, and recreate it after the upgrade with
an updated version.

Add regression tests for built-in encoding conversions. This doesn't cover
every conversion, but it covers all the internal functions in conv.c that
are used to implement the conversions.

Reviewed-by: John Naylor
Discussion: https://www.postgresql.org/message-id/e7861509-3960-538a-9025-b75a61188e01%40iki.fi
2021-04-01 11:45:22 +03:00

839 lines
18 KiB
C

/*-------------------------------------------------------------------------
*
* Utility functions for conversion procs.
*
* Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* src/backend/utils/mb/conv.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "mb/pg_wchar.h"
/*
 * local2local: table-driven conversion between two single-byte encodings
 * that are both supersets of ASCII.
 *
 * l              source bytes; 'len' of them are available
 * p              output area; receives at most one byte per input byte plus
 *                a terminating NUL (the caller must provide the space)
 * len            number of source bytes
 * src_encoding   PG identifier for the source encoding (for error reports)
 * dest_encoding  PG identifier for the target encoding (for error reports)
 * tab            translation table indexed by (source byte - HIGHBIT); an
 *                entry is the target byte, or 0 when no equivalent exists
 * noError        if true, stop quietly at the first byte that cannot be
 *                converted instead of reporting an error
 *
 * Returns the number of input bytes consumed, which can be less than 'len'
 * only when noError is true.
 */
int
local2local(const unsigned char *l,
            unsigned char *p,
            int len,
            int src_encoding,
            int dest_encoding,
            const unsigned char *tab,
            bool noError)
{
    const unsigned char *const begin = l;

    for (; len > 0; l++, len--)
    {
        const unsigned char ch = *l;

        /* an embedded NUL byte is never acceptable input */
        if (ch == 0)
        {
            if (noError)
                break;
            report_invalid_encoding(src_encoding, (const char *) l, len);
        }

        if (IS_HIGHBIT_SET(ch))
        {
            /* high half: translate through the table */
            const unsigned char mapped = tab[ch - HIGHBIT];

            if (mapped == 0)
            {
                if (noError)
                    break;
                report_untranslatable_char(src_encoding, dest_encoding,
                                           (const char *) l, len);
            }
            else
                *p++ = mapped;
        }
        else
        {
            /* ASCII passes through unchanged */
            *p++ = ch;
        }
    }

    *p = '\0';
    return l - begin;
}
/*
 * latin2mic: LATINn ---> MIC, for charsets whose local codes can be used
 * unchanged as the second byte of the MIC representation.
 *
 * l         source bytes; 'len' of them are available
 * p         output area; receives at most two bytes per input byte plus a
 *           terminating NUL (the caller must provide the space)
 * len       number of source bytes
 * lc        mule character set id, emitted in front of every high-bit byte
 * encoding  PG identifier for the local encoding (for error reports)
 * noError   if true, stop quietly at an embedded NUL byte instead of
 *           reporting an error
 *
 * Returns the number of input bytes consumed, which can be less than 'len'
 * only when noError is true.
 */
int
latin2mic(const unsigned char *l, unsigned char *p, int len,
          int lc, int encoding, bool noError)
{
    const unsigned char *const begin = l;

    for (; len > 0; l++, len--)
    {
        const int   ch = *l;

        if (ch == 0)
        {
            if (noError)
                break;
            report_invalid_encoding(encoding, (const char *) l, len);
        }

        /* non-ASCII bytes get the charset id as a prefix */
        if (IS_HIGHBIT_SET(ch))
            *p++ = lc;
        *p++ = ch;
    }

    *p = '\0';
    return l - begin;
}
/*
 * mic2latin: MIC ---> LATINn, for charsets whose local codes are stored
 * unchanged as the second byte of the MIC representation.
 *
 * mic       source bytes in mule internal code; 'len' of them are available
 * p         output area; receives at most one byte per input character plus
 *           a terminating NUL (the caller must provide the space)
 * len       number of source bytes
 * lc        mule character set id that every non-ASCII character must carry
 * encoding  PG identifier for the local encoding (for error reports)
 * noError   if true, stop quietly at the first character that cannot be
 *           converted instead of reporting an error
 *
 * Returns the number of input bytes consumed, which can be less than 'len'
 * only when noError is true.
 */
int
mic2latin(const unsigned char *mic, unsigned char *p, int len,
          int lc, int encoding, bool noError)
{
    const unsigned char *const begin = mic;

    while (len > 0)
    {
        const int   lead = *mic;
        int         charlen;

        if (lead == 0)
        {
            if (noError)
                break;
            report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
        }

        if (!IS_HIGHBIT_SET(lead))
        {
            /* plain ASCII: copy and move on */
            *p++ = lead;
            mic++;
            len--;
            continue;
        }

        /* multibyte character: make sure all of it is present */
        charlen = pg_mule_mblen(mic);
        if (len < charlen)
        {
            if (noError)
                break;
            report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
                                    len);
        }

        /*
         * Only a two-byte character of the expected charset, with a
         * high-bit second byte, has an equivalent in the local encoding.
         */
        if (!(charlen == 2 && lead == lc && IS_HIGHBIT_SET(mic[1])))
        {
            if (noError)
                break;
            report_untranslatable_char(PG_MULE_INTERNAL, encoding,
                                       (const char *) mic, len);
        }

        *p++ = mic[1];
        mic += 2;
        len -= 2;
    }

    *p = '\0';
    return mic - begin;
}
/*
 * latin2mic_with_table: table-driven conversion from a single-byte local
 * charset to the mule internal code.
 *
 * l         source bytes; 'len' of them are available
 * p         output area; receives at most two bytes per input byte plus a
 *           terminating NUL (the caller must provide the space)
 * len       number of source bytes
 * lc        mule character set id, emitted in front of every translated byte
 * encoding  PG identifier for the local encoding (for error reports)
 * tab       translation table indexed by (source byte - HIGHBIT); an entry
 *           is the mule code point, or 0 when no equivalent exists
 * noError   if true, stop quietly at the first byte that cannot be
 *           converted instead of reporting an error
 *
 * Returns the number of input bytes consumed, which can be less than 'len'
 * only when noError is true.
 */
int
latin2mic_with_table(const unsigned char *l,
                     unsigned char *p,
                     int len,
                     int lc,
                     int encoding,
                     const unsigned char *tab,
                     bool noError)
{
    const unsigned char *const begin = l;

    for (; len > 0; l++, len--)
    {
        const unsigned char ch = *l;

        if (ch == 0)
        {
            if (noError)
                break;
            report_invalid_encoding(encoding, (const char *) l, len);
        }

        if (IS_HIGHBIT_SET(ch))
        {
            /* high half: look up the mule code point */
            const unsigned char mapped = tab[ch - HIGHBIT];

            if (mapped == 0)
            {
                if (noError)
                    break;
                report_untranslatable_char(encoding, PG_MULE_INTERNAL,
                                           (const char *) l, len);
            }
            else
            {
                *p++ = lc;
                *p++ = mapped;
            }
        }
        else
        {
            /* ASCII passes through unchanged */
            *p++ = ch;
        }
    }

    *p = '\0';
    return l - begin;
}
/*
 * mic2latin_with_table: table-driven conversion from the mule internal
 * code to a single-byte local charset.
 *
 * mic       source bytes in mule internal code; 'len' of them are available
 * p         output area; receives at most one byte per input character plus
 *           a terminating NUL (the caller must provide the space)
 * len       number of source bytes
 * lc        mule character set id that every non-ASCII character must carry
 * encoding  PG identifier for the local encoding (for error reports)
 * tab       translation table indexed by (second mule byte - HIGHBIT); an
 *           entry is the local code point, or 0 when no equivalent exists
 * noError   if true, stop quietly at the first character that cannot be
 *           converted instead of reporting an error
 *
 * Returns the number of input bytes consumed, which can be less than 'len'
 * only when noError is true.
 */
int
mic2latin_with_table(const unsigned char *mic,
                     unsigned char *p,
                     int len,
                     int lc,
                     int encoding,
                     const unsigned char *tab,
                     bool noError)
{
    const unsigned char *const begin = mic;

    while (len > 0)
    {
        const unsigned char lead = *mic;
        unsigned char mapped;
        int         charlen;

        if (lead == 0)
        {
            if (noError)
                break;
            report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
        }

        if (!IS_HIGHBIT_SET(lead))
        {
            /* plain ASCII: copy and move on */
            *p++ = lead;
            mic++;
            len--;
            continue;
        }

        /* multibyte character: make sure all of it is present */
        charlen = pg_mule_mblen(mic);
        if (len < charlen)
        {
            if (noError)
                break;
            report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
                                    len);
        }

        /*
         * The table is consulted only for a two-byte character of the
         * expected charset with a high-bit second byte; anything else is
         * treated like a missing table entry.
         */
        mapped = 0;
        if (charlen == 2 && lead == lc && IS_HIGHBIT_SET(mic[1]))
            mapped = tab[mic[1] - HIGHBIT];

        if (mapped == 0)
        {
            if (noError)
                break;
            report_untranslatable_char(PG_MULE_INTERNAL, encoding,
                                       (const char *) mic, len);
            break;              /* keep compiler quiet */
        }

        *p++ = mapped;
        mic += 2;
        len -= 2;
    }

    *p = '\0';
    return mic - begin;
}
/*
* comparison routine for bsearch()
* this routine is intended for combined UTF8 -> local code
*/
static int
compare3(const void *p1, const void *p2)
{
uint32 s1,
s2,
d1,
d2;
s1 = *(const uint32 *) p1;
s2 = *((const uint32 *) p1 + 1);
d1 = ((const pg_utf_to_local_combined *) p2)->utf1;
d2 = ((const pg_utf_to_local_combined *) p2)->utf2;
return (s1 > d1 || (s1 == d1 && s2 > d2)) ? 1 : ((s1 == d1 && s2 == d2) ? 0 : -1);
}
/*
 * bsearch() comparator for the local code -> combined UTF8 map.
 *
 * p1 points to the search key, a single uint32 local code.  p2 points to a
 * pg_local_to_utf_combined entry, which is ordered by its 'code' field.
 */
static int
compare4(const void *p1, const void *p2)
{
    const uint32 key = *(const uint32 *) p1;
    const uint32 code = ((const pg_local_to_utf_combined *) p2)->code;

    if (key > code)
        return 1;
    if (key == code)
        return 0;
    return -1;
}
/*
 * Append the 32-bit character representation 'code' to a multibyte stream.
 *
 * The four bytes of 'code' are visited from most to least significant, and
 * every non-zero byte is written to 'dest'; zero bytes are skipped.
 * Returns the position just past the last byte written.
 */
static inline unsigned char *
store_coded_char(unsigned char *dest, uint32 code)
{
    for (int shift = 24; shift >= 0; shift -= 8)
    {
        const unsigned char byte = (unsigned char) ((code >> shift) & 0xff);

        if (byte != 0)
            *dest++ = byte;
    }
    return dest;
}
/*
* Convert a character using a conversion radix tree.
*
* 'l' is the length of the input character in bytes, and b1-b4 are
* the input character's bytes.
*/
static inline uint32
pg_mb_radix_conv(const pg_mb_radix_tree *rt,
int l,
unsigned char b1,
unsigned char b2,
unsigned char b3,
unsigned char b4)
{
if (l == 4)
{
/* 4-byte code */
/* check code validity */
if (b1 < rt->b4_1_lower || b1 > rt->b4_1_upper ||
b2 < rt->b4_2_lower || b2 > rt->b4_2_upper ||
b3 < rt->b4_3_lower || b3 > rt->b4_3_upper ||
b4 < rt->b4_4_lower || b4 > rt->b4_4_upper)
return 0;
/* perform lookup */
if (rt->chars32)
{
uint32 idx = rt->b4root;
idx = rt->chars32[b1 + idx - rt->b4_1_lower];
idx = rt->chars32[b2 + idx - rt->b4_2_lower];
idx = rt->chars32[b3 + idx - rt->b4_3_lower];
return rt->chars32[b4 + idx - rt->b4_4_lower];
}
else
{
uint16 idx = rt->b4root;
idx = rt->chars16[b1 + idx - rt->b4_1_lower];
idx = rt->chars16[b2 + idx - rt->b4_2_lower];
idx = rt->chars16[b3 + idx - rt->b4_3_lower];
return rt->chars16[b4 + idx - rt->b4_4_lower];
}
}
else if (l == 3)
{
/* 3-byte code */
/* check code validity */
if (b2 < rt->b3_1_lower || b2 > rt->b3_1_upper ||
b3 < rt->b3_2_lower || b3 > rt->b3_2_upper ||
b4 < rt->b3_3_lower || b4 > rt->b3_3_upper)
return 0;
/* perform lookup */
if (rt->chars32)
{
uint32 idx = rt->b3root;
idx = rt->chars32[b2 + idx - rt->b3_1_lower];
idx = rt->chars32[b3 + idx - rt->b3_2_lower];
return rt->chars32[b4 + idx - rt->b3_3_lower];
}
else
{
uint16 idx = rt->b3root;
idx = rt->chars16[b2 + idx - rt->b3_1_lower];
idx = rt->chars16[b3 + idx - rt->b3_2_lower];
return rt->chars16[b4 + idx - rt->b3_3_lower];
}
}
else if (l == 2)
{
/* 2-byte code */
/* check code validity - first byte */
if (b3 < rt->b2_1_lower || b3 > rt->b2_1_upper ||
b4 < rt->b2_2_lower || b4 > rt->b2_2_upper)
return 0;
/* perform lookup */
if (rt->chars32)
{
uint32 idx = rt->b2root;
idx = rt->chars32[b3 + idx - rt->b2_1_lower];
return rt->chars32[b4 + idx - rt->b2_2_lower];
}
else
{
uint16 idx = rt->b2root;
idx = rt->chars16[b3 + idx - rt->b2_1_lower];
return rt->chars16[b4 + idx - rt->b2_2_lower];
}
}
else if (l == 1)
{
/* 1-byte code */
/* check code validity - first byte */
if (b4 < rt->b1_lower || b4 > rt->b1_upper)
return 0;
/* perform lookup */
if (rt->chars32)
return rt->chars32[b4 + rt->b1root - rt->b1_lower];
else
return rt->chars16[b4 + rt->b1root - rt->b1_lower];
}
return 0; /* shouldn't happen */
}
/*
 * UTF8 ---> local code
 *
 * utf: input string in UTF8 encoding (need not be null-terminated)
 * len: length of input string (in bytes)
 * iso: pointer to the output area (must be large enough!)
 *		  (output string will be null-terminated)
 * map: conversion map for single characters
 * cmap: conversion map for combined characters
 *		  (optional, pass NULL if none)
 * cmapsize: number of entries in the conversion map for combined characters
 *		  (optional, pass 0 if none)
 * conv_func: algorithmic encoding conversion function
 *		  (optional, pass NULL if none)
 * encoding: PG identifier for the local encoding
 * noError: if true, stop at the first invalid or untranslatable character
 *		  instead of reporting an error
 *
 * For each character, the cmap (if provided) is consulted first; if no match,
 * the map is consulted next; if still no match, the conv_func (if provided)
 * is applied.  If no match is found, an error is reported, or, when noError
 * is true, conversion stops in front of that character.
 *
 * ASCII characters are copied through unchanged without consulting any map.
 *
 * See pg_wchar.h for more details about the data structures used here.
 *
 * Returns the number of input bytes consumed.  If noError is true, this can
 * be less than 'len'.
 */
int
UtfToLocal(const unsigned char *utf, int len,
           unsigned char *iso,
           const pg_mb_radix_tree *map,
           const pg_utf_to_local_combined *cmap, int cmapsize,
           utf_local_conversion_func conv_func,
           int encoding, bool noError)
{
    uint32      iutf;
    int         l;
    const pg_utf_to_local_combined *cp;
    const unsigned char *start = utf;

    if (!PG_VALID_ENCODING(encoding))
        ereport(ERROR,
                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                 errmsg("invalid encoding number: %d", encoding)));

    for (; len > 0; len -= l)
    {
        unsigned char b1 = 0;
        unsigned char b2 = 0;
        unsigned char b3 = 0;
        unsigned char b4 = 0;

        /* "break" cases all represent errors */
        if (*utf == '\0')
            break;

        l = pg_utf_mblen(utf);
        if (len < l)
            break;

        if (!pg_utf8_islegal(utf, l))
            break;

        if (l == 1)
        {
            /* ASCII case is easy, assume it's one-to-one conversion */
            *iso++ = *utf++;
            continue;
        }

        /* collect coded char of length l, right-aligned in b1..b4 */
        if (l == 2)
        {
            b3 = *utf++;
            b4 = *utf++;
        }
        else if (l == 3)
        {
            b2 = *utf++;
            b3 = *utf++;
            b4 = *utf++;
        }
        else if (l == 4)
        {
            b1 = *utf++;
            b2 = *utf++;
            b3 = *utf++;
            b4 = *utf++;
        }
        else
        {
            elog(ERROR, "unsupported character length %d", l);
        }

        /*
         * Widen to uint32 before shifting: a lead byte >= 0x80 shifted left
         * by 24 as a plain int would overflow into the sign bit.
         */
        iutf = ((uint32) b1 << 24 | (uint32) b2 << 16 | (uint32) b3 << 8 | b4);

        /* First, try with combined map if possible */
        if (cmap && len > l)
        {
            const unsigned char *utf_save = utf;
            int         len_save = len;
            int         l_save = l;

            /* collect next character, same as above */
            len -= l;

            l = pg_utf_mblen(utf);
            if (len < l)
            {
                /*
                 * The second character is cut short.  In error mode, report
                 * it here, while 'utf' and 'len' still describe the
                 * truncated character itself rather than the valid one
                 * before it.
                 */
                if (!noError)
                    report_invalid_encoding(PG_UTF8, (const char *) utf, len);

                /* need more data to decide if this is a combined char */
                utf -= l_save;
                break;
            }

            if (!pg_utf8_islegal(utf, l))
            {
                if (!noError)
                    report_invalid_encoding(PG_UTF8, (const char *) utf, len);
                /* don't count the first character as consumed either */
                utf -= l_save;
                break;
            }

            /* We assume ASCII character cannot be in combined map */
            if (l > 1)
            {
                uint32      iutf2;
                uint32      cutf[2];

                if (l == 2)
                {
                    iutf2 = (uint32) *utf++ << 8;
                    iutf2 |= *utf++;
                }
                else if (l == 3)
                {
                    iutf2 = (uint32) *utf++ << 16;
                    iutf2 |= (uint32) *utf++ << 8;
                    iutf2 |= *utf++;
                }
                else if (l == 4)
                {
                    iutf2 = (uint32) *utf++ << 24;
                    iutf2 |= (uint32) *utf++ << 16;
                    iutf2 |= (uint32) *utf++ << 8;
                    iutf2 |= *utf++;
                }
                else
                {
                    elog(ERROR, "unsupported character length %d", l);
                    iutf2 = 0;  /* keep compiler quiet */
                }

                cutf[0] = iutf;
                cutf[1] = iutf2;

                cp = bsearch(cutf, cmap, cmapsize,
                             sizeof(pg_utf_to_local_combined), compare3);

                if (cp)
                {
                    /*
                     * Both characters are consumed: 'len' already excludes
                     * the first one, and the loop step subtracts the second.
                     */
                    iso = store_coded_char(iso, cp->code);
                    continue;
                }
            }

            /* fail, so back up to reprocess second character next time */
            utf = utf_save;
            len = len_save;
            l = l_save;
        }

        /* Now check ordinary map */
        if (map)
        {
            uint32      converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);

            if (converted)
            {
                iso = store_coded_char(iso, converted);
                continue;
            }
        }

        /* if there's a conversion function, try that */
        if (conv_func)
        {
            uint32      converted = (*conv_func) (iutf);

            if (converted)
            {
                iso = store_coded_char(iso, converted);
                continue;
            }
        }

        /* failed to translate this character; point back at its first byte */
        utf -= l;
        if (noError)
            break;
        report_untranslatable_char(PG_UTF8, encoding,
                                   (const char *) utf, len);
    }

    /* if we broke out of loop early, must be invalid input */
    if (len > 0 && !noError)
        report_invalid_encoding(PG_UTF8, (const char *) utf, len);

    *iso = '\0';

    return utf - start;
}
/*
 * local code ---> UTF8
 *
 * iso: input string in local encoding (need not be null-terminated)
 * len: length of input string (in bytes)
 * utf: pointer to the output area (must be large enough!)
 *		  (output string will be null-terminated)
 * map: conversion map for single characters
 * cmap: conversion map for combined characters
 *		  (optional, pass NULL if none)
 * cmapsize: number of entries in the conversion map for combined characters
 *		  (optional, pass 0 if none)
 * conv_func: algorithmic encoding conversion function
 *		  (optional, pass NULL if none)
 * encoding: PG identifier for the local encoding
 * noError: if true, stop at the first invalid or untranslatable character
 *		  instead of reporting an error
 *
 * For each character, the map is consulted first; if no match, the cmap
 * (if provided) is consulted next; if still no match, the conv_func
 * (if provided) is applied.  Note that the cmap is only consulted when a
 * map was supplied.  If no match is found, an error is reported, or, when
 * noError is true, conversion stops in front of that character.
 *
 * ASCII characters are copied through unchanged without consulting any map.
 *
 * See pg_wchar.h for more details about the data structures used here.
 *
 * Returns the number of input bytes consumed.  If noError is true, this can
 * be less than 'len'.
 */
int
LocalToUtf(const unsigned char *iso, int len,
           unsigned char *utf,
           const pg_mb_radix_tree *map,
           const pg_local_to_utf_combined *cmap, int cmapsize,
           utf_local_conversion_func conv_func,
           int encoding,
           bool noError)
{
    uint32      iiso;
    int         l;
    const pg_local_to_utf_combined *cp;
    const unsigned char *start = iso;

    if (!PG_VALID_ENCODING(encoding))
        ereport(ERROR,
                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                 errmsg("invalid encoding number: %d", encoding)));

    for (; len > 0; len -= l)
    {
        unsigned char b1 = 0;
        unsigned char b2 = 0;
        unsigned char b3 = 0;
        unsigned char b4 = 0;

        /* "break" cases all represent errors */
        if (*iso == '\0')
            break;

        if (!IS_HIGHBIT_SET(*iso))
        {
            /* ASCII case is easy, assume it's one-to-one conversion */
            *utf++ = *iso++;
            l = 1;
            continue;
        }

        /* a negative result means the character is not valid */
        l = pg_encoding_verifymbchar(encoding, (const char *) iso, len);
        if (l < 0)
            break;

        /* collect coded char of length l, right-aligned in b1..b4 */
        if (l == 1)
            b4 = *iso++;
        else if (l == 2)
        {
            b3 = *iso++;
            b4 = *iso++;
        }
        else if (l == 3)
        {
            b2 = *iso++;
            b3 = *iso++;
            b4 = *iso++;
        }
        else if (l == 4)
        {
            b1 = *iso++;
            b2 = *iso++;
            b3 = *iso++;
            b4 = *iso++;
        }
        else
        {
            elog(ERROR, "unsupported character length %d", l);
        }

        /*
         * Widen to uint32 before shifting: a lead byte >= 0x80 shifted left
         * by 24 as a plain int would overflow into the sign bit.
         */
        iiso = ((uint32) b1 << 24 | (uint32) b2 << 16 | (uint32) b3 << 8 | b4);

        if (map)
        {
            uint32      converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);

            if (converted)
            {
                utf = store_coded_char(utf, converted);
                continue;
            }

            /* If there's a combined character map, try that */
            if (cmap)
            {
                cp = bsearch(&iiso, cmap, cmapsize,
                             sizeof(pg_local_to_utf_combined), compare4);

                if (cp)
                {
                    /* one local character becomes two UTF8 characters */
                    utf = store_coded_char(utf, cp->utf1);
                    utf = store_coded_char(utf, cp->utf2);
                    continue;
                }
            }
        }

        /* if there's a conversion function, try that */
        if (conv_func)
        {
            uint32      converted = (*conv_func) (iiso);

            if (converted)
            {
                utf = store_coded_char(utf, converted);
                continue;
            }
        }

        /* failed to translate this character; point back at its first byte */
        iso -= l;
        if (noError)
            break;
        report_untranslatable_char(encoding, PG_UTF8,
                                   (const char *) iso, len);
    }

    /* if we broke out of loop early, must be invalid input */
    if (len > 0 && !noError)
        report_invalid_encoding(encoding, (const char *) iso, len);

    *utf = '\0';

    return iso - start;
}