1
0
mirror of https://github.com/postgres/postgres.git synced 2025-11-21 00:42:43 +03:00

Add new encoding EUC_JIS_2004 and SHIFT_JIS_2004,

along with new conversions among EUC_JIS_2004, SHIFT_JIS_2004 and UTF-8.
catalog version has been bump up.
This commit is contained in:
Tatsuo Ishii
2007-03-25 11:56:04 +00:00
parent 7b4726e6c3
commit 75c6519ff6
41 changed files with 70345 additions and 119 deletions

View File

@@ -6,7 +6,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/utils/mb/conv.c,v 1.62 2007/01/05 22:19:44 momjian Exp $
* $PostgreSQL: pgsql/src/backend/utils/mb/conv.c,v 1.63 2007/03/25 11:56:02 ishii Exp $
*
*-------------------------------------------------------------------------
*/
@@ -244,10 +244,10 @@ mic2latin_with_table(const unsigned char *mic,
static int
compare1(const void *p1, const void *p2)
{
unsigned int v1,
uint32 v1,
v2;
v1 = *(unsigned int *) p1;
v1 = *(uint32 *) p1;
v2 = ((pg_utf_to_local *) p2)->utf;
return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
}
@@ -259,31 +259,86 @@ compare1(const void *p1, const void *p2)
static int
compare2(const void *p1, const void *p2)
{
unsigned int v1,
uint32 v1,
v2;
v1 = *(unsigned int *) p1;
v1 = *(uint32 *) p1;
v2 = ((pg_local_to_utf *) p2)->code;
return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
}
/*
* comparison routine for bsearch()
* this routine is intended for combined UTF8 -> local code
*/
static int
compare3(const void *p1, const void *p2)
{
uint32 s1, s2, d1, d2;
s1 = *(uint32 *)p1;
s2 = *((uint32 *)p1 + 1);
d1 = ((pg_utf_to_local_combined *) p2)->utf1;
d2 = ((pg_utf_to_local_combined *) p2)->utf2;
return (s1 > d1 || (s1 == d1 && s2 > d2)) ? 1 : ((s1 == d1 && s2 == d2) ? 0 : -1);
}
/*
* comparison routine for bsearch()
* this routine is intended for local code -> combined UTF8
*/
static int
compare4(const void *p1, const void *p2)
{
uint32 v1,
v2;
v1 = *(uint32 *) p1;
v2 = ((pg_local_to_utf_combined *) p2)->code;
return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
}
/*
* convert 32bit wide character to mutibye stream pointed to by iso
*/
static unsigned char *set_iso_code(unsigned char *iso, uint32 code)
{
if (code & 0xff000000)
*iso++ = code >> 24;
if (code & 0x00ff0000)
*iso++ = (code & 0x00ff0000) >> 16;
if (code & 0x0000ff00)
*iso++ = (code & 0x0000ff00) >> 8;
if (code & 0x000000ff)
*iso++ = code & 0x000000ff;
return iso;
}
/*
* UTF8 ---> local code
*
* utf: input UTF8 string (need not be null-terminated).
* iso: pointer to the output area (must be large enough!)
* map: the conversion map.
* size: the size of the conversion map.
* cmap: the conversion map for combined characters.
* (optional)
* size1: the size of the conversion map.
* size2: the size of the conversion map for combined characters
* (optional)
* encoding: the PG identifier for the local encoding.
* len: length of input string.
*/
void
UtfToLocal(const unsigned char *utf, unsigned char *iso,
const pg_utf_to_local *map, int size, int encoding, int len)
const pg_utf_to_local *map, const pg_utf_to_local_combined *cmap,
int size1, int size2, int encoding, int len)
{
unsigned int iutf;
int l;
uint32 iutf;
uint32 cutf[2];
uint32 code;
pg_utf_to_local *p;
pg_utf_to_local_combined *cp;
int l;
for (; len > 0; len -= l)
{
@@ -324,21 +379,94 @@ UtfToLocal(const unsigned char *utf, unsigned char *iso,
iutf |= *utf++;
}
p = bsearch(&iutf, map, size,
sizeof(pg_utf_to_local), compare1);
/*
* first, try with combined map if possible
*/
if (cmap && len > l)
{
const unsigned char *utf_save = utf;
int len_save = len;
int l_save = l;
len -= l;
if (p == NULL)
report_untranslatable_char(PG_UTF8, encoding,
(const char *) (utf - l), len);
l = pg_utf_mblen(utf);
if (len < l)
break;
if (p->code & 0xff000000)
*iso++ = p->code >> 24;
if (p->code & 0x00ff0000)
*iso++ = (p->code & 0x00ff0000) >> 16;
if (p->code & 0x0000ff00)
*iso++ = (p->code & 0x0000ff00) >> 8;
if (p->code & 0x000000ff)
*iso++ = p->code & 0x000000ff;
if (!pg_utf8_islegal(utf, l))
break;
cutf[0] = iutf;
if (l == 1)
{
if (len_save > 1)
{
p = bsearch(&cutf[0], map, size1,
sizeof(pg_utf_to_local), compare1);
if (p == NULL)
report_untranslatable_char(PG_UTF8, encoding,
(const char *) (utf_save - l_save), len_save);
iso = set_iso_code(iso, p->code);
}
/* ASCII case is easy */
*iso++ = *utf++;
continue;
}
else if (l == 2)
{
iutf = *utf++ << 8;
iutf |= *utf++;
}
else if (l == 3)
{
iutf = *utf++ << 16;
iutf |= *utf++ << 8;
iutf |= *utf++;
}
else if (l == 4)
{
iutf = *utf++ << 24;
iutf |= *utf++ << 16;
iutf |= *utf++ << 8;
iutf |= *utf++;
}
cutf[1] = iutf;
cp = bsearch(cutf, cmap, size2,
sizeof(pg_utf_to_local_combined), compare3);
if (cp)
code = cp->code;
else
{
/* not found in combined map. try with ordinary map */
p = bsearch(&cutf[0], map, size1,
sizeof(pg_utf_to_local), compare1);
if (p == NULL)
report_untranslatable_char(PG_UTF8, encoding,
(const char *) (utf_save - l_save), len_save);
iso = set_iso_code(iso, p->code);
p = bsearch(&cutf[1], map, size1,
sizeof(pg_utf_to_local), compare1);
if (p == NULL)
report_untranslatable_char(PG_UTF8, encoding,
(const char *) (utf - l), len);
code = p->code;
}
}
else /* no cmap or no remaining data */
{
p = bsearch(&iutf, map, size1,
sizeof(pg_utf_to_local), compare1);
if (p == NULL)
report_untranslatable_char(PG_UTF8, encoding,
(const char *) (utf - l), len);
code = p->code;
}
iso = set_iso_code(iso, code);
}
if (len > 0)
@@ -353,17 +481,23 @@ UtfToLocal(const unsigned char *utf, unsigned char *iso,
* iso: input local string (need not be null-terminated).
* utf: pointer to the output area (must be large enough!)
* map: the conversion map.
* size: the size of the conversion map.
* cmap: the conversion map for combined characters.
* (optional)
* size1: the size of the conversion map.
* size2: the size of the conversion map for combined characters
* (optional)
* encoding: the PG identifier for the local encoding.
* len: length of input string.
*/
void
LocalToUtf(const unsigned char *iso, unsigned char *utf,
const pg_local_to_utf *map, int size, int encoding, int len)
const pg_local_to_utf *map, const pg_local_to_utf_combined *cmap,
int size1, int size2, int encoding, int len)
{
unsigned int iiso;
int l;
pg_local_to_utf *p;
pg_local_to_utf_combined *cp;
if (!PG_VALID_ENCODING(encoding))
ereport(ERROR,
@@ -409,20 +543,59 @@ LocalToUtf(const unsigned char *iso, unsigned char *utf,
iiso |= *iso++;
}
p = bsearch(&iiso, map, size,
p = bsearch(&iiso, map, size1,
sizeof(pg_local_to_utf), compare2);
if (p == NULL)
{
/*
* not found in the ordinary map. if there's a combined
* character map, try with it
*/
if (cmap)
{
cp = bsearch(&iiso, cmap, size2,
sizeof(pg_local_to_utf_combined), compare4);
if (cp)
{
if (cp->utf1 & 0xff000000)
*utf++ = cp->utf1 >> 24;
if (cp->utf1 & 0x00ff0000)
*utf++ = (cp->utf1 & 0x00ff0000) >> 16;
if (cp->utf1 & 0x0000ff00)
*utf++ = (cp->utf1 & 0x0000ff00) >> 8;
if (cp->utf1 & 0x000000ff)
*utf++ = cp->utf1 & 0x000000ff;
if (cp->utf2 & 0xff000000)
*utf++ = cp->utf2 >> 24;
if (cp->utf2 & 0x00ff0000)
*utf++ = (cp->utf2 & 0x00ff0000) >> 16;
if (cp->utf2 & 0x0000ff00)
*utf++ = (cp->utf2 & 0x0000ff00) >> 8;
if (cp->utf2 & 0x000000ff)
*utf++ = cp->utf2 & 0x000000ff;
continue;
}
}
report_untranslatable_char(encoding, PG_UTF8,
(const char *) (iso - l), len);
if (p->utf & 0xff000000)
*utf++ = p->utf >> 24;
if (p->utf & 0x00ff0000)
*utf++ = (p->utf & 0x00ff0000) >> 16;
if (p->utf & 0x0000ff00)
*utf++ = (p->utf & 0x0000ff00) >> 8;
if (p->utf & 0x000000ff)
*utf++ = p->utf & 0x000000ff;
}
else
{
if (p->utf & 0xff000000)
*utf++ = p->utf >> 24;
if (p->utf & 0x00ff0000)
*utf++ = (p->utf & 0x00ff0000) >> 16;
if (p->utf & 0x0000ff00)
*utf++ = (p->utf & 0x0000ff00) >> 8;
if (p->utf & 0x000000ff)
*utf++ = p->utf & 0x000000ff;
}
}
if (len > 0)