1
0
mirror of https://github.com/postgres/postgres.git synced 2025-07-08 11:42:09 +03:00
Files
postgres/src/backend/utils/mb/conv.c
Tom Lane 0ab1a2e39b Remove dead encoding-conversion functions.
The code for conversions SQL_ASCII <-> MULE_INTERNAL and
SQL_ASCII <-> UTF8 was unreachable, because we long ago changed
the wrapper functions pg_do_encoding_conversion() et al so that
they have hard-wired behaviors for conversions involving SQL_ASCII.
(At least some of those fast paths date back to 2002, though it
looks like we may not have been totally consistent about this until
later.)  Given the lack of complaints, nobody is dissatisfied with
this state of affairs.  Hence, let's just remove the unreachable code.

Also, change CREATE CONVERSION so that it rejects attempts to
define such conversions.  Since we consider that SQL_ASCII represents
lack of knowledge about the encoding in use, such a conversion would
be semantically dubious even if it were reachable.

Adjust a couple of regression test cases that had randomly decided
to rely on these conversion functions rather than any other ones.

Discussion: https://postgr.es/m/41163.1559156593@sss.pgh.pa.us
2019-07-05 14:17:27 -04:00

736 lines
16 KiB
C

/*-------------------------------------------------------------------------
*
* Utility functions for conversion procs.
*
* Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* src/backend/utils/mb/conv.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "mb/pg_wchar.h"
/*
 * local2local: table-driven conversion between two single-byte encodings
 * that are both supersets of ASCII.
 *
 * l is the source string; len is its length in bytes
 * p is the output area (the caller must make it large enough; this routine
 *   writes at most one byte per input byte plus a terminating NUL)
 * src_encoding is the PG identifier for the source encoding
 * dest_encoding is the PG identifier for the target encoding
 * tab is the translation table for source bytes 0x80 and up, indexed by
 *   (byte - HIGHBIT); an entry is the target-encoding byte, or 0 if the
 *   source byte has no equivalent
 *
 * A zero byte in the input is reported with report_invalid_encoding();
 * a high-bit byte whose table entry is 0 is reported with
 * report_untranslatable_char().
 */
void
local2local(const unsigned char *l,
            unsigned char *p,
            int len,
            int src_encoding,
            int dest_encoding,
            const unsigned char *tab)
{
    for (; len > 0; l++, len--)
    {
        unsigned char src = *l;
        unsigned char dst;

        /* embedded NUL is never acceptable */
        if (src == 0)
            report_invalid_encoding(src_encoding, (const char *) l, len);

        /* ASCII passes through unchanged */
        if (!IS_HIGHBIT_SET(src))
        {
            *p++ = src;
            continue;
        }

        /* everything else goes through the table */
        dst = tab[src - HIGHBIT];
        if (dst == 0)
            report_untranslatable_char(src_encoding, dest_encoding,
                                       (const char *) l, len);
        else
            *p++ = dst;
    }

    *p = '\0';
}
/*
 * LATINn ---> MIC, for charsets whose local codes can be used unchanged
 * as the second byte of the MIC representation.
 *
 * l is the source string; len is its length in bytes
 * p is the output area (the caller must make it large enough; a high-bit
 *   input byte produces two output bytes)
 * lc is the mule character set id for the local encoding; it is emitted
 *   in front of every high-bit byte
 * encoding is the PG identifier for the local encoding (used only for
 *   error reporting)
 *
 * A zero byte in the input is reported with report_invalid_encoding().
 * The output is NUL-terminated.
 */
void
latin2mic(const unsigned char *l, unsigned char *p, int len,
          int lc, int encoding)
{
    for (; len > 0; l++, len--)
    {
        int         ch = *l;

        if (ch == 0)
            report_invalid_encoding(encoding, (const char *) l, len);

        /* non-ASCII bytes are prefixed with the charset id */
        if (IS_HIGHBIT_SET(ch))
            *p++ = lc;

        *p++ = ch;
    }

    *p = '\0';
}
/*
 * MIC ---> LATINn, for charsets whose local codes are stored unchanged
 * as the second byte of the MIC representation.
 *
 * mic is the source string; len is its length in bytes
 * p is the output area (must be large enough!)
 * lc is the mule character set id for the local encoding
 * encoding is the PG identifier for the local encoding
 *
 * ASCII bytes are copied as-is.  A non-ASCII character is accepted only if
 * it is exactly two bytes long, its first byte equals lc, and its second
 * byte has the high bit set; that second byte is then the output.  Anything
 * else is reported with report_untranslatable_char().  A zero byte, or a
 * character that claims more bytes than remain, is reported with
 * report_invalid_encoding().  The output is NUL-terminated.
 */
void
mic2latin(const unsigned char *mic, unsigned char *p, int len,
          int lc, int encoding)
{
    while (len > 0)
    {
        int         lead = *mic;
        int         mblen;

        if (lead == 0)
            report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);

        if (!IS_HIGHBIT_SET(lead))
        {
            /* plain ASCII: one byte in, one byte out */
            *p++ = lead;
            mic++;
            len--;
            continue;
        }

        /* multibyte MIC character: make sure it is complete ... */
        mblen = pg_mic_mblen(mic);
        if (mblen > len)
            report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
                                    len);

        /* ... and that it belongs to the expected charset */
        if (!(mblen == 2 && lead == lc && IS_HIGHBIT_SET(mic[1])))
            report_untranslatable_char(PG_MULE_INTERNAL, encoding,
                                       (const char *) mic, len);

        *p++ = mic[1];
        mic += 2;
        len -= 2;
    }

    *p = '\0';
}
/*
 * latin2mic_with_table: table-driven conversion from a single-byte local
 * charset to the mule internal code.
 *
 * l is the source string; len is its length in bytes
 * p is the output area (the caller must make it large enough; a high-bit
 *   input byte produces two output bytes)
 * lc is the mule character set id for the local encoding
 * encoding is the PG identifier for the local encoding
 * tab is the translation table for local bytes 0x80 and up, indexed by
 *   (byte - HIGHBIT); an entry is the byte to emit after lc, or 0 if the
 *   local byte has no equivalent
 *
 * A zero byte in the input is reported with report_invalid_encoding();
 * a high-bit byte whose table entry is 0 is reported with
 * report_untranslatable_char().  The output is NUL-terminated.
 */
void
latin2mic_with_table(const unsigned char *l,
                     unsigned char *p,
                     int len,
                     int lc,
                     int encoding,
                     const unsigned char *tab)
{
    for (; len > 0; l++, len--)
    {
        unsigned char src = *l;
        unsigned char mapped;

        if (src == 0)
            report_invalid_encoding(encoding, (const char *) l, len);

        /* ASCII needs neither the table nor a charset prefix */
        if (!IS_HIGHBIT_SET(src))
        {
            *p++ = src;
            continue;
        }

        mapped = tab[src - HIGHBIT];
        if (mapped == 0)
            report_untranslatable_char(encoding, PG_MULE_INTERNAL,
                                       (const char *) l, len);
        else
        {
            *p++ = lc;
            *p++ = mapped;
        }
    }

    *p = '\0';
}
/*
 * mic2latin_with_table: table-driven conversion from the mule internal
 * code to a single-byte local charset.
 *
 * mic is the source string; len is its length in bytes
 * p is the output area (must be large enough!)
 * lc is the mule character set id for the local encoding
 * encoding is the PG identifier for the local encoding
 * tab is the translation table for the second byte of a MIC character,
 *   indexed by (byte - HIGHBIT); an entry is the local-charset byte, or 0
 *   if there is no equivalent
 *
 * ASCII bytes are copied as-is.  A non-ASCII character is translated only
 * if it is exactly two bytes long, starts with lc, has a high-bit second
 * byte, and that byte has a nonzero table entry; otherwise it is reported
 * with report_untranslatable_char().  A zero byte, or a character that
 * claims more bytes than remain, is reported with report_invalid_encoding().
 * The output is NUL-terminated.
 */
void
mic2latin_with_table(const unsigned char *mic,
                     unsigned char *p,
                     int len,
                     int lc,
                     int encoding,
                     const unsigned char *tab)
{
    while (len > 0)
    {
        unsigned char lead = *mic;
        unsigned char local = 0;    /* 0 means "no translation found" */
        int         mblen;

        if (lead == 0)
            report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);

        if (!IS_HIGHBIT_SET(lead))
        {
            /* plain ASCII: one byte in, one byte out */
            *p++ = lead;
            mic++;
            len--;
            continue;
        }

        mblen = pg_mic_mblen(mic);
        if (mblen > len)
            report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
                                    len);

        /* consult the table only for a well-formed char of our charset */
        if (mblen == 2 && lead == lc && IS_HIGHBIT_SET(mic[1]))
            local = tab[mic[1] - HIGHBIT];

        if (local == 0)
        {
            report_untranslatable_char(PG_MULE_INTERNAL, encoding,
                                       (const char *) mic, len);
            break;              /* keep compiler quiet */
        }

        *p++ = local;
        mic += 2;
        len -= 2;
    }

    *p = '\0';
}
/*
* comparison routine for bsearch()
* this routine is intended for combined UTF8 -> local code
*/
static int
compare3(const void *p1, const void *p2)
{
uint32 s1,
s2,
d1,
d2;
s1 = *(const uint32 *) p1;
s2 = *((const uint32 *) p1 + 1);
d1 = ((const pg_utf_to_local_combined *) p2)->utf1;
d2 = ((const pg_utf_to_local_combined *) p2)->utf2;
return (s1 > d1 || (s1 == d1 && s2 > d2)) ? 1 : ((s1 == d1 && s2 == d2) ? 0 : -1);
}
/*
 * bsearch() comparator for the local code -> combined UTF8 map.
 *
 * p1 is the search key, a single uint32 local code.  p2 is a
 * pg_local_to_utf_combined table entry, compared by its "code" field.
 *
 * Returns 1, 0 or -1 as the key sorts after, equal to, or before the entry.
 */
static int
compare4(const void *p1, const void *p2)
{
    uint32      key = *(const uint32 *) p1;
    uint32      entry_code = ((const pg_local_to_utf_combined *) p2)->code;

    if (key == entry_code)
        return 0;

    return (key > entry_code) ? 1 : -1;
}
/*
 * Store a 32-bit character representation into a multibyte stream.
 *
 * The four bytes of "code" are examined from most to least significant,
 * and each nonzero byte is appended to dest; zero bytes are skipped
 * wherever they occur.  Returns the advanced output pointer.
 */
static inline unsigned char *
store_coded_char(unsigned char *dest, uint32 code)
{
    int         shift;

    for (shift = 24; shift >= 0; shift -= 8)
    {
        unsigned char byte = (code >> shift) & 0xff;

        if (byte != 0)
            *dest++ = byte;
    }

    return dest;
}
/*
* Convert a character using a conversion radix tree.
*
* 'l' is the length of the input character in bytes, and b1-b4 are
* the input character's bytes.
*/
static inline uint32
pg_mb_radix_conv(const pg_mb_radix_tree *rt,
int l,
unsigned char b1,
unsigned char b2,
unsigned char b3,
unsigned char b4)
{
if (l == 4)
{
/* 4-byte code */
/* check code validity */
if (b1 < rt->b4_1_lower || b1 > rt->b4_1_upper ||
b2 < rt->b4_2_lower || b2 > rt->b4_2_upper ||
b3 < rt->b4_3_lower || b3 > rt->b4_3_upper ||
b4 < rt->b4_4_lower || b4 > rt->b4_4_upper)
return 0;
/* perform lookup */
if (rt->chars32)
{
uint32 idx = rt->b4root;
idx = rt->chars32[b1 + idx - rt->b4_1_lower];
idx = rt->chars32[b2 + idx - rt->b4_2_lower];
idx = rt->chars32[b3 + idx - rt->b4_3_lower];
return rt->chars32[b4 + idx - rt->b4_4_lower];
}
else
{
uint16 idx = rt->b4root;
idx = rt->chars16[b1 + idx - rt->b4_1_lower];
idx = rt->chars16[b2 + idx - rt->b4_2_lower];
idx = rt->chars16[b3 + idx - rt->b4_3_lower];
return rt->chars16[b4 + idx - rt->b4_4_lower];
}
}
else if (l == 3)
{
/* 3-byte code */
/* check code validity */
if (b2 < rt->b3_1_lower || b2 > rt->b3_1_upper ||
b3 < rt->b3_2_lower || b3 > rt->b3_2_upper ||
b4 < rt->b3_3_lower || b4 > rt->b3_3_upper)
return 0;
/* perform lookup */
if (rt->chars32)
{
uint32 idx = rt->b3root;
idx = rt->chars32[b2 + idx - rt->b3_1_lower];
idx = rt->chars32[b3 + idx - rt->b3_2_lower];
return rt->chars32[b4 + idx - rt->b3_3_lower];
}
else
{
uint16 idx = rt->b3root;
idx = rt->chars16[b2 + idx - rt->b3_1_lower];
idx = rt->chars16[b3 + idx - rt->b3_2_lower];
return rt->chars16[b4 + idx - rt->b3_3_lower];
}
}
else if (l == 2)
{
/* 2-byte code */
/* check code validity - first byte */
if (b3 < rt->b2_1_lower || b3 > rt->b2_1_upper ||
b4 < rt->b2_2_lower || b4 > rt->b2_2_upper)
return 0;
/* perform lookup */
if (rt->chars32)
{
uint32 idx = rt->b2root;
idx = rt->chars32[b3 + idx - rt->b2_1_lower];
return rt->chars32[b4 + idx - rt->b2_2_lower];
}
else
{
uint16 idx = rt->b2root;
idx = rt->chars16[b3 + idx - rt->b2_1_lower];
return rt->chars16[b4 + idx - rt->b2_2_lower];
}
}
else if (l == 1)
{
/* 1-byte code */
/* check code validity - first byte */
if (b4 < rt->b1_lower || b4 > rt->b1_upper)
return 0;
/* perform lookup */
if (rt->chars32)
return rt->chars32[b4 + rt->b1root - rt->b1_lower];
else
return rt->chars16[b4 + rt->b1root - rt->b1_lower];
}
return 0; /* shouldn't happen */
}
/*
 * UTF8 ---> local code
 *
 * utf: input string in UTF8 encoding (need not be null-terminated)
 * len: length of input string (in bytes)
 * iso: pointer to the output area (must be large enough!)
 *      (output string will be null-terminated)
 * map: conversion map for single characters
 * cmap: conversion map for combined characters
 *      (optional, pass NULL if none)
 * cmapsize: number of entries in the conversion map for combined characters
 *      (optional, pass 0 if none)
 * conv_func: algorithmic encoding conversion function
 *      (optional, pass NULL if none)
 * encoding: PG identifier for the local encoding
 *
 * For each character, the cmap (if provided) is consulted first; if no match,
 * the map is consulted next; if still no match, the conv_func (if provided)
 * is applied.  An error is raised if no match is found.
 *
 * ASCII (single-byte) characters are copied through without consulting any
 * map.  An embedded zero byte, a truncated character, or a sequence rejected
 * by pg_utf8_islegal() is reported with report_invalid_encoding(); a valid
 * character with no translation is reported with
 * report_untranslatable_char().
 *
 * See pg_wchar.h for more details about the data structures used here.
 */
void
UtfToLocal(const unsigned char *utf, int len,
           unsigned char *iso,
           const pg_mb_radix_tree *map,
           const pg_utf_to_local_combined *cmap, int cmapsize,
           utf_local_conversion_func conv_func,
           int encoding)
{
    uint32      iutf;
    int         l;
    const pg_utf_to_local_combined *cp;

    if (!PG_VALID_ENCODING(encoding))
        ereport(ERROR,
                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                 errmsg("invalid encoding number: %d", encoding)));

    /* each iteration consumes l bytes; "continue" still runs len -= l */
    for (; len > 0; len -= l)
    {
        unsigned char b1 = 0;
        unsigned char b2 = 0;
        unsigned char b3 = 0;
        unsigned char b4 = 0;

        /* "break" cases all represent errors */
        if (*utf == '\0')
            break;

        l = pg_utf_mblen(utf);
        if (len < l)
            break;

        if (!pg_utf8_islegal(utf, l))
            break;

        if (l == 1)
        {
            /* ASCII case is easy, assume it's one-to-one conversion */
            *iso++ = *utf++;
            continue;
        }

        /* collect coded char of length l, right-aligned in b1..b4 */
        if (l == 2)
        {
            b3 = *utf++;
            b4 = *utf++;
        }
        else if (l == 3)
        {
            b2 = *utf++;
            b3 = *utf++;
            b4 = *utf++;
        }
        else if (l == 4)
        {
            b1 = *utf++;
            b2 = *utf++;
            b3 = *utf++;
            b4 = *utf++;
        }
        else
            elog(ERROR, "unsupported character length %d", l);

        /*
         * Widen to uint32 before shifting: the bytes would otherwise be
         * promoted to int, and shifting a byte >= 0x80 left by 24 overflows
         * a signed int, which is undefined behavior.
         */
        iutf = ((uint32) b1 << 24) | ((uint32) b2 << 16) |
            ((uint32) b3 << 8) | (uint32) b4;

        /* First, try with combined map if possible */
        if (cmap && len > l)
        {
            const unsigned char *utf_save = utf;
            int         len_save = len;
            int         l_save = l;

            /* collect next character, same as above */
            len -= l;

            l = pg_utf_mblen(utf);
            if (len < l)
                break;

            if (!pg_utf8_islegal(utf, l))
                break;

            /* We assume ASCII character cannot be in combined map */
            if (l > 1)
            {
                uint32      iutf2;
                uint32      cutf[2];

                /* same uint32 widening as above, to avoid signed overflow */
                if (l == 2)
                {
                    iutf2 = ((uint32) *utf++) << 8;
                    iutf2 |= *utf++;
                }
                else if (l == 3)
                {
                    iutf2 = ((uint32) *utf++) << 16;
                    iutf2 |= ((uint32) *utf++) << 8;
                    iutf2 |= *utf++;
                }
                else if (l == 4)
                {
                    iutf2 = ((uint32) *utf++) << 24;
                    iutf2 |= ((uint32) *utf++) << 16;
                    iutf2 |= ((uint32) *utf++) << 8;
                    iutf2 |= *utf++;
                }
                else
                {
                    elog(ERROR, "unsupported character length %d", l);
                    iutf2 = 0;  /* keep compiler quiet */
                }

                cutf[0] = iutf;
                cutf[1] = iutf2;

                cp = bsearch(cutf, cmap, cmapsize,
                             sizeof(pg_utf_to_local_combined), compare3);

                if (cp)
                {
                    /*
                     * Both characters are consumed: len was already reduced
                     * by the first one, and the loop step subtracts the
                     * second one's length.
                     */
                    iso = store_coded_char(iso, cp->code);
                    continue;
                }
            }

            /* fail, so back up to reprocess second character next time */
            utf = utf_save;
            len = len_save;
            l = l_save;
        }

        /* Now check ordinary map */
        if (map)
        {
            uint32      converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);

            if (converted)
            {
                iso = store_coded_char(iso, converted);
                continue;
            }
        }

        /* if there's a conversion function, try that */
        if (conv_func)
        {
            uint32      converted = (*conv_func) (iutf);

            if (converted)
            {
                iso = store_coded_char(iso, converted);
                continue;
            }
        }

        /* failed to translate this character */
        report_untranslatable_char(PG_UTF8, encoding,
                                   (const char *) (utf - l), len);
    }

    /* if we broke out of loop early, must be invalid input */
    if (len > 0)
        report_invalid_encoding(PG_UTF8, (const char *) utf, len);

    *iso = '\0';
}
/*
 * local code ---> UTF8
 *
 * iso: input string in local encoding (need not be null-terminated)
 * len: length of input string (in bytes)
 * utf: pointer to the output area (must be large enough!)
 *      (output string will be null-terminated)
 * map: conversion map for single characters
 * cmap: conversion map for combined characters
 *      (optional, pass NULL if none)
 * cmapsize: number of entries in the conversion map for combined characters
 *      (optional, pass 0 if none)
 * conv_func: algorithmic encoding conversion function
 *      (optional, pass NULL if none)
 * encoding: PG identifier for the local encoding
 *
 * For each character, the map is consulted first; if no match, the cmap
 * (if provided) is consulted next; if still no match, the conv_func
 * (if provided) is applied.  An error is raised if no match is found.
 * Note that the cmap is only consulted when a map was supplied.
 *
 * Bytes without the high bit set are copied through without consulting any
 * map.  An embedded zero byte, or a character for which
 * pg_encoding_verifymb() returns a negative length, is reported with
 * report_invalid_encoding(); a valid character with no translation is
 * reported with report_untranslatable_char().
 *
 * See pg_wchar.h for more details about the data structures used here.
 */
void
LocalToUtf(const unsigned char *iso, int len,
           unsigned char *utf,
           const pg_mb_radix_tree *map,
           const pg_local_to_utf_combined *cmap, int cmapsize,
           utf_local_conversion_func conv_func,
           int encoding)
{
    uint32      iiso;
    int         l;
    const pg_local_to_utf_combined *cp;

    if (!PG_VALID_ENCODING(encoding))
        ereport(ERROR,
                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                 errmsg("invalid encoding number: %d", encoding)));

    /* each iteration consumes l bytes; "continue" still runs len -= l */
    for (; len > 0; len -= l)
    {
        unsigned char b1 = 0;
        unsigned char b2 = 0;
        unsigned char b3 = 0;
        unsigned char b4 = 0;

        /* "break" cases all represent errors */
        if (*iso == '\0')
            break;

        if (!IS_HIGHBIT_SET(*iso))
        {
            /* ASCII case is easy, assume it's one-to-one conversion */
            *utf++ = *iso++;
            l = 1;
            continue;
        }

        l = pg_encoding_verifymb(encoding, (const char *) iso, len);
        if (l < 0)
            break;

        /* collect coded char of length l, right-aligned in b1..b4 */
        if (l == 1)
            b4 = *iso++;
        else if (l == 2)
        {
            b3 = *iso++;
            b4 = *iso++;
        }
        else if (l == 3)
        {
            b2 = *iso++;
            b3 = *iso++;
            b4 = *iso++;
        }
        else if (l == 4)
        {
            b1 = *iso++;
            b2 = *iso++;
            b3 = *iso++;
            b4 = *iso++;
        }
        else
            elog(ERROR, "unsupported character length %d", l);

        /*
         * Widen to uint32 before shifting: the bytes would otherwise be
         * promoted to int, and shifting a byte >= 0x80 left by 24 overflows
         * a signed int, which is undefined behavior.
         */
        iiso = ((uint32) b1 << 24) | ((uint32) b2 << 16) |
            ((uint32) b3 << 8) | (uint32) b4;

        if (map)
        {
            uint32      converted = pg_mb_radix_conv(map, l, b1, b2, b3, b4);

            if (converted)
            {
                utf = store_coded_char(utf, converted);
                continue;
            }

            /* If there's a combined character map, try that */
            if (cmap)
            {
                cp = bsearch(&iiso, cmap, cmapsize,
                             sizeof(pg_local_to_utf_combined), compare4);

                if (cp)
                {
                    /* one local character expands to two UTF8 characters */
                    utf = store_coded_char(utf, cp->utf1);
                    utf = store_coded_char(utf, cp->utf2);
                    continue;
                }
            }
        }

        /* if there's a conversion function, try that */
        if (conv_func)
        {
            uint32      converted = (*conv_func) (iiso);

            if (converted)
            {
                utf = store_coded_char(utf, converted);
                continue;
            }
        }

        /* failed to translate this character */
        report_untranslatable_char(encoding, PG_UTF8,
                                   (const char *) (iso - l), len);
    }

    /* if we broke out of loop early, must be invalid input */
    if (len > 0)
        report_invalid_encoding(encoding, (const char *) iso, len);

    *utf = '\0';
}