I really hope that I haven't missed anything in this one...

From: t-ishii@sra.co.jp Attached are patches to enhance the multi-byte support. (patches are against 7/18 snapshot) * determine encoding at initdb/createdb rather than compile time Now initdb/createdb has an option to specify the encoding. Also, I modified the syntax of CREATE DATABASE to accept an encoding option. See README.mb for more details. For this purpose I have added a new column "encoding" to pg_database. Also pg_attribute and pg_class are changed to catch up with the modification to pg_database. Actually I have added pg_database_mb.h, pg_attribute_mb.h and pg_class_mb.h. These are used only when MB is enabled. The reason for having separate files is that I couldn't find a way to use ifdef or whatever in those files. I have to admit it looks ugly. No way. * support for PGCLIENTENCODING when issuing COPY command commands/copy.c modified. * support for SQL92 syntax "SET NAMES" See gram.y. * support for LATIN2-5 * add UNICODE regression test case * new test suite for MB New directory test/mb added. * clean up source files Basic idea is to have MB's own subdirectory for easier maintenance. These are include/mb and backend/utils/mb.
2025-11-12 05:01:15 +03:00 · 1998-07-24 03:32:46 +00:00
parent 6e66468f3a
commit bf00bbb0c4
82 changed files with 2161 additions and 759 deletions
--- a/src/backend/utils/mb/Makefile
+++ b/src/backend/utils/mb/Makefile
@@ -0,0 +1,35 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+#    Makefile for utils/mb
+#
+# IDENTIFICATION
+#    $Header: /cvsroot/pgsql/src/backend/utils/mb/Makefile,v 1.1 1998/07/24 03:31:54 scrappy Exp $
+#
+#-------------------------------------------------------------------------
+
+SRCDIR = ../../..
+include ../../../Makefile.global
+
+# Search the backend source root for headers; when MB is set, hand its
+# value to the compiler as -DMB=<value>.
+CFLAGS += -I../..
+ifdef MB
+CFLAGS += -DMB=$(MB)
+endif
+
+OBJS = common.o conv.o mbutils.o wchar.o wstrcmp.o wstrncmp.o variable.o
+
+# "all" and "clean" never produce a file of that name, so mark them phony;
+# otherwise a stray file called "all" or "clean" would silently disable them.
+# NOTE(review): "depend" is intentionally left out because the file it
+# writes is included at the bottom of this Makefile -- confirm how make
+# treats a phony included makefile before adding it here.
+.PHONY: all clean
+
+all: SUBSYS.o
+
+# Combine every object of this directory into one relocatable object.
+SUBSYS.o: $(OBJS)
+	$(LD) -r -o SUBSYS.o $(OBJS)
+
+# Write the header dependencies of all C files here into the file "depend".
+depend dep:
+	$(CC) -MM $(CFLAGS) *.c >depend
+
+clean:
+	rm -f SUBSYS.o $(OBJS)
+
+# Use the generated dependency file when it exists.
+ifeq (depend,$(wildcard depend))
+include depend
+endif
+
--- a/src/backend/utils/mb/README
+++ b/src/backend/utils/mb/README
@@ -0,0 +1,10 @@
+common.c:	public functions for both the backend and the frontend.
+		requires conv.c and wchar.c
+conv.c:		static functions and a public table for code conversion
+wchar.c:	mostly static functions and a public table for mb string and
+		multi-byte conversion
+mbutils.c:	public functions for the backend only.
+		requires conv.c and wchar.c
+wstrcmp.c:	strcmp for mb
+wstrncmp.c:	strncmp for mb
+variable.c:	public functions for show/set/reset variable commands
--- a/src/backend/utils/mb/common.c
+++ b/src/backend/utils/mb/common.c
@@ -0,0 +1,67 @@
+/*
+ * This file contains some public functions
+ * usable for both the backend and the frontend.
+ * Tatsuo Ishii
+ * $Id: common.c,v 1.1 1998/07/24 03:31:56 scrappy Exp $ */
+
+#include <stdio.h>
+#include <string.h>
+
+#include "mb/pg_wchar.h"
+
+/*
+ * Look up an encoding by name in pg_conv_tbl, ignoring case.
+ * Returns the encoding id of the first entry whose name matches; if none
+ * matches, returns the (negative) encoding value of the table's end mark.
+ */
+int pg_char_to_encoding(const char *s)
+{
+  pg_encoding_conv_tbl *entry;
+
+  for (entry = pg_conv_tbl; entry->encoding >= 0; entry++) {
+    if (strcasecmp(s, entry->name) == 0) {
+      return(entry->encoding);
+    }
+  }
+  /* no match: entry now points at the terminating end mark */
+  return(entry->encoding);
+}
+
+/*
+ * Check a client encoding name.  Delegates to pg_char_to_encoding(), so
+ * the result is the encoding id for a known name and negative otherwise.
+ */
+int pg_valid_client_encoding(const char *name)
+{
+  int encoding = pg_char_to_encoding(name);
+
+  return(encoding);
+}
+
+/*
+ * Return the pg_conv_tbl entry whose encoding id equals "encoding",
+ * or a null pointer when the table holds no such entry.
+ */
+pg_encoding_conv_tbl *pg_get_encent_by_encoding(int encoding)
+{
+  pg_encoding_conv_tbl *entry = pg_conv_tbl;
+
+  /* the table ends at the first entry with a negative encoding */
+  while (entry->encoding >= 0) {
+    if (entry->encoding == encoding) {
+      return(entry);
+    }
+    entry++;
+  }
+  return(0);
+}
+
+/*
+ * Map an encoding id to its name.
+ * Returns "" when the id has no entry in the conversion table.
+ */
+const char *pg_encoding_to_char(int encoding)
+{
+  pg_encoding_conv_tbl *entry = pg_get_encent_by_encoding(encoding);
+
+  if (entry) {
+    return(entry->name);
+  }
+  return("");
+}
+
+/*
+ * Byte length of the multi-byte character starting at mbstr, as reported
+ * by the mblen routine registered for "encoding" in pg_wchar_table.
+ * "encoding" is used directly as the table index and is not range-checked.
+ */
+int pg_encoding_mblen(int encoding, const unsigned char *mbstr)
+{
+  pg_wchar_tbl *handlers = &pg_wchar_table[encoding];
+
+  return((*handlers->mblen)(mbstr));
+}
--- a/src/backend/utils/mb/conv.c
+++ b/src/backend/utils/mb/conv.c
@@ -0,0 +1,386 @@
+/*
+ * conversion between client encoding and server internal encoding
+ * (currently mule internal code (mic) is used)
+ * Tatsuo Ishii
+ * $Id: conv.c,v 1.1 1998/07/24 03:31:56 scrappy Exp $
+ */
+#include <stdio.h>
+#include <string.h>
+
+#include "mb/pg_wchar.h"
+
+/*
+ * Emit a MIC character that the target encoding cannot represent as
+ * "(" + two lowercase hex digits per byte + ")".
+ * The character length comes from pg_mic_mblen(); both *mic and *p are
+ * advanced past what was read and written.
+ */
+static void printBogusChar(unsigned char **mic, unsigned char **p)
+{
+  static const char hexdigits[] = "0123456789abcdef";
+  int remaining;
+
+  *(*p)++ = '(';
+  for (remaining = pg_mic_mblen(*mic); remaining != 0; remaining--) {
+    unsigned char byte = *(*mic)++;
+
+    *(*p)++ = hexdigits[byte >> 4];
+    *(*p)++ = hexdigits[byte & 0x0f];
+  }
+  *(*p)++ = ')';
+}
+
+/*
+ * SJIS ---> MIC
+ * Converts at most len input bytes (stopping early at a NUL byte) and
+ * NUL-terminates the output in p.
+ */
+static void sjis2mic(unsigned char *sjis, unsigned char *p, int len)
+{
+  int lead, trail;
+
+  while (len > 0) {
+    lead = *sjis++;
+    if (lead == 0)
+      break;
+
+    if (lead >= 0xa1 && lead <= 0xdf) {
+      /* 1 byte kana */
+      *p++ = LC_JISX0201K;
+      *p++ = lead;
+      len -= 1;
+    } else if (lead > 0x7f) {
+      /* 2 byte kanji */
+      trail = *sjis++;
+      *p++ = LC_JISX0208;
+      *p++ = ((lead & 0x3f)<<1) + 0x9f + (trail > 0x9e);
+      *p++ = trail + ((trail > 0x9e)? 2 : 0x60) + (trail < 0x80);
+      len -= 2;
+    } else {
+      /* should be ASCII */
+      *p++ = lead;
+      len -= 1;
+    }
+  }
+  *p = '\0';
+}
+
+/*
+ * MIC ---> SJIS
+ * Characters from charsets other than JIS X0201 kana, JIS X0208 and
+ * ASCII are written in the printBogusChar() hex form.
+ */
+static void mic2sjis(unsigned char *mic, unsigned char *p, int len)
+{
+  int lead, hi, lo;
+
+  while (len > 0) {
+    lead = *mic;
+    if (lead == 0)
+      break;
+    len -= pg_mic_mblen(mic);
+    mic++;	/* step over the leading byte */
+
+    if (lead == LC_JISX0201K) {
+      *p++ = *mic++;
+    } else if (lead == LC_JISX0208) {
+      hi = *mic++;
+      lo = *mic++;
+      *p++ = ((hi - 0xa1)>>1) + ((hi < 0xdf)? 0x81 : 0xc1);
+      *p++ = lo - ((hi & 1)? ((lo < 0xe0)? 0x61 : 0x60) : 2);
+    } else if (lead > 0x7f) {
+      /* cannot convert to SJIS: back up so the whole char is dumped */
+      mic--;
+      printBogusChar(&mic, &p);
+    } else {
+      /* should be ASCII */
+      *p++ = lead;
+    }
+  }
+  *p = '\0';
+}
+
+/*
+ * EUC_JP ---> MIC
+ * Converts at most len input bytes (stopping early at a NUL byte) and
+ * NUL-terminates the output in p.
+ */
+static void euc_jp2mic(unsigned char *euc, unsigned char *p, int len)
+{
+  int lead;
+
+  while (len > 0) {
+    lead = *euc++;
+    if (lead == 0)
+      break;
+
+    if (lead == SS2) {
+      /* 1 byte kana */
+      *p++ = LC_JISX0201K;
+      *p++ = *euc++;
+      len -= 2;
+    } else if (lead == SS3) {
+      /* JIS X0212 kanji */
+      *p++ = LC_JISX0212;
+      *p++ = *euc++;
+      *p++ = *euc++;
+      len -= 3;
+    } else if (lead & 0x80) {
+      /* JIS X0208 kanji */
+      *p++ = LC_JISX0208;
+      *p++ = lead;
+      *p++ = *euc++;
+      len -= 2;
+    } else {
+      /* should be ASCII */
+      *p++ = lead;
+      len -= 1;
+    }
+  }
+  *p = '\0';
+}
+
+/*
+ * MIC ---> EUC_JP
+ * Characters from charsets EUC_JP cannot hold are written in the
+ * printBogusChar() hex form.
+ */
+static void mic2euc_jp(unsigned char *mic, unsigned char *p, int len)
+{
+  int lead;
+
+  while (len > 0) {
+    lead = *mic;
+    if (lead == 0)
+      break;
+    len -= pg_mic_mblen(mic);
+    mic++;	/* step over the leading byte */
+
+    if (lead == LC_JISX0201K) {
+      *p++ = SS2;
+      *p++ = *mic++;
+    } else if (lead == LC_JISX0212) {
+      *p++ = SS3;
+      *p++ = *mic++;
+      *p++ = *mic++;
+    } else if (lead == LC_JISX0208) {
+      *p++ = *mic++;
+      *p++ = *mic++;
+    } else if (lead > 0x7f) {
+      /* cannot convert to EUC_JP: back up so the whole char is dumped */
+      mic--;
+      printBogusChar(&mic, &p);
+    } else {
+      /* should be ASCII */
+      *p++ = lead;
+    }
+  }
+  *p = '\0';
+}
+
+/*
+ * EUC_KR ---> MIC
+ * Every byte with the high bit set starts a two-byte KS C 5601 character.
+ */
+static void euc_kr2mic(unsigned char *euc, unsigned char *p, int len)
+{
+  int lead;
+
+  while (len > 0) {
+    lead = *euc++;
+    if (lead == 0)
+      break;
+
+    if (lead & 0x80) {
+      *p++ = LC_KS5601;
+      *p++ = lead;
+      *p++ = *euc++;
+      len -= 2;
+    } else {
+      /* should be ASCII */
+      *p++ = lead;
+      len -= 1;
+    }
+  }
+  *p = '\0';
+}
+
+/*
+ * MIC ---> EUC_KR
+ * Anything other than KS C 5601 and ASCII is written in the
+ * printBogusChar() hex form.
+ */
+static void mic2euc_kr(unsigned char *mic, unsigned char *p, int len)
+{
+  int lead;
+
+  while (len > 0) {
+    lead = *mic;
+    if (lead == 0)
+      break;
+    len -= pg_mic_mblen(mic);
+    mic++;	/* step over the leading byte */
+
+    if (lead == LC_KS5601) {
+      *p++ = *mic++;
+      *p++ = *mic++;
+    } else if (lead > 0x7f) {
+      /* cannot convert to EUC_KR: back up so the whole char is dumped */
+      mic--;
+      printBogusChar(&mic, &p);
+    } else {
+      /* should be ASCII */
+      *p++ = lead;
+    }
+  }
+  *p = '\0';
+}
+
+/*
+ * EUC_CN ---> MIC
+ * Every byte with the high bit set starts a two-byte GB 2312-80 character.
+ */
+static void euc_cn2mic(unsigned char *euc, unsigned char *p, int len)
+{
+  int lead;
+
+  while (len > 0) {
+    lead = *euc++;
+    if (lead == 0)
+      break;
+
+    if (lead & 0x80) {
+      *p++ = LC_GB2312_80;
+      *p++ = lead;
+      *p++ = *euc++;
+      len -= 2;
+    } else {
+      /* should be ASCII */
+      *p++ = lead;
+      len -= 1;
+    }
+  }
+  *p = '\0';
+}
+
+/*
+ * MIC ---> EUC_CN
+ * Anything other than GB 2312-80 and ASCII is written in the
+ * printBogusChar() hex form.
+ */
+static void mic2euc_cn(unsigned char *mic, unsigned char *p, int len)
+{
+  int lead;
+
+  while (len > 0) {
+    lead = *mic;
+    if (lead == 0)
+      break;
+    len -= pg_mic_mblen(mic);
+    mic++;	/* step over the leading byte */
+
+    if (lead == LC_GB2312_80) {
+      *p++ = *mic++;
+      *p++ = *mic++;
+    } else if (lead > 0x7f) {
+      /* cannot convert to EUC_CN: back up so the whole char is dumped */
+      mic--;
+      printBogusChar(&mic, &p);
+    } else {
+      /* should be ASCII */
+      *p++ = lead;
+    }
+  }
+  *p = '\0';
+}
+
+/*
+ * EUC_TW ---> MIC
+ * Converts at most len input bytes (stopping early at a NUL byte) and
+ * NUL-terminates the output in p.
+ *
+ * A four-byte sequence is SS2, a plane byte (0xa1 = plane 1, 0xa2 =
+ * plane 2, ...) and two code bytes.  Planes 1 and 2 have their own MIC
+ * leading bytes; higher planes are written as the 0x9d (LCPRV2) prefix
+ * followed by a charset byte counted upward from LC_CNS11643_3 for
+ * plane 0xa3.
+ */
+static void euc_tw2mic(unsigned char *euc, unsigned char *p, int len)
+{
+  int c1;
+
+  while (len > 0 && (c1 = *euc++)) {
+    if (c1 == SS2) {
+      len -= 4;
+      c1 = *euc++;	/* plane No. */
+      if (c1 == 0xa1) {
+        *p++ = LC_CNS11643_1;
+      } else if (c1 == 0xa2) {
+        *p++ = LC_CNS11643_2;
+      } else {
+        *p++ = 0x9d;	/* LCPRV2 */
+        /*
+         * plane 0xa3 maps to LC_CNS11643_3, 0xa4 to the next id, and so
+         * on.  (The offset used to be computed as 0xa3 - c1, which
+         * counted downward and picked the wrong charset for every plane
+         * above 3.)
+         */
+        *p++ = c1 - 0xa3 + LC_CNS11643_3;
+      }
+      *p++ = *euc++;
+      *p++ = *euc++;
+    } else if (c1 & 0x80) {	/* CNS11643-1 */
+      len -= 2;
+      *p++ = LC_CNS11643_1;
+      *p++ = c1;
+      *p++ = *euc++;
+    } else {	/* should be ASCII */
+      len --;
+      *p++ = c1;
+    }
+  }
+  *p = '\0';
+}
+
+/*
+ * MIC ---> EUC_TW
+ * Converts at most len input bytes (stopping early at a NUL byte) and
+ * NUL-terminates the output in p.
+ *
+ * Plane 1 is written as its two code bytes, plane 2 as SS2 0xa2 plus the
+ * code bytes, and 0x9d-prefixed (LCPRV2) CNS charsets as SS2, the plane
+ * byte, and the code bytes.  Everything else that is not ASCII is
+ * written in the printBogusChar() hex form.
+ */
+static void mic2euc_tw(unsigned char *mic, unsigned char *p, int len)
+{
+  int c1;
+
+  while (len > 0 && (c1 = *mic)) {
+    len -= pg_mic_mblen(mic++);
+
+    if (c1 == LC_CNS11643_1) {
+      *p++ = *mic++;
+      *p++ = *mic++;
+    } else if (c1 == LC_CNS11643_2) {
+      /* plane 2 needs the explicit SS2 + plane prefix in EUC_TW */
+      *p++ = SS2;
+      *p++ = 0xa2;
+      *p++ = *mic++;
+      *p++ = *mic++;
+    } else if (c1 == 0x9d &&	/* LCPRV2? */
+	       *mic >= LC_CNS11643_3 && *mic <= LC_CNS11643_3 + 4) {
+      /*
+       * mic now points at the charset byte that follows the 0x9d prefix;
+       * the plane number has to be derived from that byte (planes 3..7),
+       * not from the prefix itself.
+       */
+      *p++ = SS2;
+      *p++ = *mic++ - LC_CNS11643_3 + 0xa3;
+      *p++ = *mic++;
+      *p++ = *mic++;
+    } else if (c1 > 0x7f) {	/* cannot convert to EUC_TW! */
+      mic--;
+      printBogusChar(&mic, &p);
+    } else {	/* should be ASCII */
+      *p++ = c1;
+    }
+  }
+  *p = '\0';
+}
+
+/*
+ * LATINn ---> MIC
+ * Each byte above 0x7f is prefixed with the leading byte "lc"; ASCII is
+ * copied unchanged.  Stops after len bytes or at a NUL byte.
+ */
+static void latin2mic(unsigned char *l, unsigned char *p, int len, int lc)
+{
+  int ch;
+
+  for (; len > 0; len--) {
+    ch = *l++;
+    if (ch == 0)
+      break;
+    if (ch > 0x7f) {
+      /* non-ASCII: tag it with the charset's leading byte */
+      *p++ = lc;
+    }
+    *p++ = ch;
+  }
+  *p = '\0';
+}
+
+/*
+ * MIC ---> LATINn
+ * Characters whose leading byte is "lc" are reduced to their single code
+ * byte; other non-ASCII characters are written in the printBogusChar()
+ * hex form.
+ */
+static void mic2latin(unsigned char *mic, unsigned char *p, int len, int lc)
+{
+  int lead;
+
+  while (len > 0) {
+    lead = *mic;
+    if (lead == 0)
+      break;
+    len -= pg_mic_mblen(mic);
+    mic++;	/* step over the leading byte */
+
+    if (lead == lc) {
+      *p++ = *mic++;
+    } else if (lead > 0x7f) {
+      /* some other charset: back up so the whole char is dumped */
+      mic--;
+      printBogusChar(&mic, &p);
+    } else {
+      /* should be ASCII */
+      *p++ = lead;
+    }
+  }
+  *p = '\0';
+}
+
+/*
+ * Three-argument wrappers that bind latin2mic()/mic2latin() to one
+ * ISO 8859 leading byte each, so they can be stored in pg_conv_tbl.
+ */
+
+/* ISO 8859-1 */
+static void latin12mic(unsigned char *l, unsigned char *p, int len)
+{
+  latin2mic(l, p, len, LC_ISO8859_1);
+}
+
+static void mic2latin1(unsigned char *mic, unsigned char *p, int len)
+{
+  mic2latin(mic, p, len, LC_ISO8859_1);
+}
+
+/* ISO 8859-2 */
+static void latin22mic(unsigned char *l, unsigned char *p, int len)
+{
+  latin2mic(l, p, len, LC_ISO8859_2);
+}
+
+static void mic2latin2(unsigned char *mic, unsigned char *p, int len)
+{
+  mic2latin(mic, p, len, LC_ISO8859_2);
+}
+
+/* ISO 8859-3 */
+static void latin32mic(unsigned char *l, unsigned char *p, int len)
+{
+  latin2mic(l, p, len, LC_ISO8859_3);
+}
+
+static void mic2latin3(unsigned char *mic, unsigned char *p, int len)
+{
+  mic2latin(mic, p, len, LC_ISO8859_3);
+}
+
+/* ISO 8859-4 */
+static void latin42mic(unsigned char *l, unsigned char *p, int len)
+{
+  latin2mic(l, p, len, LC_ISO8859_4);
+}
+
+static void mic2latin4(unsigned char *mic, unsigned char *p, int len)
+{
+  mic2latin(mic, p, len, LC_ISO8859_4);
+}
+
+/* ISO 8859-5 */
+static void latin52mic(unsigned char *l, unsigned char *p, int len)
+{
+  latin2mic(l, p, len, LC_ISO8859_5);
+}
+
+static void mic2latin5(unsigned char *mic, unsigned char *p, int len)
+{
+  mic2latin(mic, p, len, LC_ISO8859_5);
+}
+
+/*
+ * Table of known encodings, scanned linearly until the end mark (the
+ * entry whose encoding is negative).  Each row holds: encoding id, name,
+ * a flag, the "to MIC" converter and the "from MIC" converter.
+ * A null converter pair means no conversion routine is available for
+ * that encoding (UNICODE and MULE_INTERNAL here).
+ * NOTE(review): the flag is 1 only for SJIS; it presumably marks
+ * encodings usable on the client side only -- confirm in mb/pg_wchar.h.
+ */
+pg_encoding_conv_tbl pg_conv_tbl[] = {
+  {EUC_JP, "EUC_JP", 0, euc_jp2mic, mic2euc_jp},	/* EUC_JP */
+  {EUC_CN, "EUC_CN", 0, euc_cn2mic, mic2euc_cn},	/* EUC_CN */
+  {EUC_KR, "EUC_KR", 0, euc_kr2mic, mic2euc_kr},	/* EUC_KR */
+  {EUC_TW, "EUC_TW", 0, euc_tw2mic, mic2euc_tw},	/* EUC_TW */
+  {UNICODE, "UNICODE", 0, 0, 0},			/* UNICODE */
+  {MULE_INTERNAL, "MULE_INTERNAL", 0, 0, 0},		/* MULE_INTERNAL */
+  {LATIN1, "LATIN1", 0, latin12mic, mic2latin1},	/* ISO 8859 Latin 1 */
+  {LATIN2, "LATIN2", 0, latin22mic, mic2latin2},	/* ISO 8859 Latin 2 */
+  {LATIN3, "LATIN3", 0, latin32mic, mic2latin3},	/* ISO 8859 Latin 3 */
+  {LATIN4, "LATIN4", 0, latin42mic, mic2latin4},	/* ISO 8859 Latin 4 */
+  {LATIN5, "LATIN5", 0, latin52mic, mic2latin5},	/* ISO 8859 Latin 5 */
+  {SJIS, "SJIS", 1, sjis2mic, mic2sjis},		/* SJIS */
+  {-1, "", 0, 0, 0}					/* end mark */
+};
--- a/src/backend/utils/mb/mbutils.c
+++ b/src/backend/utils/mb/mbutils.c
@@ -0,0 +1,216 @@
+/*
+ * This file contains public functions for conversion between
+ * client encoding and server internal encoding.
+ * (currently mule internal code (mic) is used)
+ * Tatsuo Ishii
+ * $Id: mbutils.c,v 1.1 1998/07/24 03:31:56 scrappy Exp $ */
+
+#include <stdio.h>
+#include <string.h>
+
+#include "mb/pg_wchar.h"
+
+/*
+ * Current client encoding id; -1 until it is first set or queried.
+ * (Declared with an explicit type: implicit int is not valid C99.)
+ */
+static int client_encoding = -1;
+
+/*
+ * Conversion routines chosen by pg_set_client_encoding().  A null
+ * pointer means the corresponding step is skipped.
+ */
+static void (*client_to_mic)();	/* something to MIC */
+static void (*client_from_mic)();	/* MIC to something */
+static void (*server_to_mic)();	/* something to MIC */
+static void (*server_from_mic)();	/* MIC to something */
+
+/*
+ * Return the pg_conv_tbl entry for "encoding", or a null pointer if the
+ * table (terminated by a negative encoding) has none.
+ */
+static pg_encoding_conv_tbl *get_enc_ent(int encoding)
+{
+  pg_encoding_conv_tbl *entry = pg_conv_tbl;
+
+  while (entry->encoding >= 0) {
+    if (entry->encoding == encoding) {
+      return(entry);
+    }
+    entry++;
+  }
+  return(0);
+}
+
+/*
+ * Set the client encoding and select the conversion routines between it
+ * and the database encoding.
+ *
+ * Returns 0 on success.  Returns -1 if either encoding has no table
+ * entry or lacks a required conversion routine; in that case the
+ * previous client encoding and conversion routines are left untouched.
+ *
+ * No routines are needed when both encodings are equal.  A side that is
+ * MULE_INTERNAL needs no routine of its own, because MIC is the pivot
+ * format of every conversion.
+ */
+int pg_set_client_encoding(int encoding)
+{
+  int current_server_encoding = GetDatabaseEncoding();
+  pg_encoding_conv_tbl *client_ent = 0;
+  pg_encoding_conv_tbl *server_ent = 0;
+
+  if (encoding != current_server_encoding) {
+    if (encoding != MULE_INTERNAL) {
+      /* client side must be convertible to and from MIC */
+      client_ent = get_enc_ent(encoding);
+      if (client_ent == 0 ||
+	  client_ent->to_mic == 0 || client_ent->from_mic == 0) {
+	return(-1);
+      }
+    }
+    if (current_server_encoding != MULE_INTERNAL) {
+      /* server side must be convertible to and from MIC */
+      server_ent = get_enc_ent(current_server_encoding);
+      if (server_ent == 0 ||
+	  server_ent->to_mic == 0 || server_ent->from_mic == 0) {
+	return(-1);
+      }
+    }
+  }
+
+  /* everything checked out: commit the new settings in one go */
+  client_encoding = encoding;
+  client_to_mic = client_from_mic = 0;
+  server_to_mic = server_from_mic = 0;
+  if (client_ent) {
+    client_to_mic = client_ent->to_mic;
+    client_from_mic = client_ent->from_mic;
+  }
+  if (server_ent) {
+    server_to_mic = server_ent->to_mic;
+    server_from_mic = server_ent->from_mic;
+  }
+  return(0);
+}
+
+/*
+ * Return the current client encoding.  If none has been set yet (-1),
+ * the database encoding is adopted and remembered.
+ */
+int pg_get_client_encoding()
+{
+  if (client_encoding != -1) {
+    return(client_encoding);
+  }
+  /* first call: default to the database encoding */
+  client_encoding = GetDatabaseEncoding();
+  return(client_encoding);
+}
+
+/*
+ * Convert a string from the client encoding to the server encoding.
+ *
+ * Returns s itself when both encodings are equal or no conversion
+ * routine is set; otherwise returns a pointer to a static buffer that
+ * the next call overwrites.  The buffers have a fixed size and are not
+ * bounds-checked here.  So be careful.
+ */
+unsigned char *pg_client_to_server(unsigned char *s, int len)
+{
+  static unsigned char mic_buf[MAX_PARSE_BUFFER*4];	/* is this enough? */
+  static unsigned char out_buf[MAX_PARSE_BUFFER*4];	/* is this enough? */
+  unsigned char *result = s;
+
+  if (client_encoding != GetDatabaseEncoding()) {
+    if (client_to_mic) {
+      /* step 1: client encoding -> MIC */
+      (*client_to_mic)(s, mic_buf, len);
+      result = mic_buf;
+      len = strlen(mic_buf);
+    }
+    if (server_from_mic) {
+      /* step 2: MIC -> server encoding */
+      (*server_from_mic)(result, out_buf, len);
+      result = out_buf;
+    }
+  }
+  return(result);
+}
+
+/*
+ * Convert a string from the server encoding to the client encoding.
+ *
+ * Returns s itself when both encodings are equal or no conversion
+ * routine is set; otherwise returns a pointer to a static buffer that
+ * the next call overwrites.  The buffers have a fixed size and are not
+ * bounds-checked here.  So be careful.
+ */
+unsigned char *pg_server_to_client(unsigned char *s, int len)
+{
+  static unsigned char mic_buf[MAX_PARSE_BUFFER*4];	/* is this enough? */
+  static unsigned char out_buf[MAX_PARSE_BUFFER*4];	/* is this enough? */
+  unsigned char *result = s;
+
+  if (client_encoding != GetDatabaseEncoding()) {
+    if (server_to_mic) {
+      /* step 1: server encoding -> MIC */
+      (*server_to_mic)(s, mic_buf, len);
+      result = mic_buf;
+      len = strlen(mic_buf);
+    }
+    if (client_from_mic) {
+      /* step 2: MIC -> client encoding */
+      (*client_from_mic)(result, out_buf, len);
+      result = out_buf;
+    }
+  }
+  return(result);
+}
+
+/*
+ * Convert a NUL-terminated multi-byte string in the database encoding
+ * to a pg_wchar string.
+ */
+void pg_mb2wchar(const unsigned char *from, pg_wchar *to)
+{
+  int encoding = GetDatabaseEncoding();
+
+  (*pg_wchar_table[encoding].mb2wchar_with_len)(from, to, strlen(from));
+}
+
+/*
+ * Convert at most len bytes of a multi-byte string in the database
+ * encoding to a pg_wchar string.
+ */
+void pg_mb2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
+{
+  int encoding = GetDatabaseEncoding();
+
+  (*pg_wchar_table[encoding].mb2wchar_with_len)(from, to, len);
+}
+
+/*
+ * Byte length of the multi-byte character at mbstr, according to the
+ * database encoding.
+ */
+int pg_mblen(const unsigned char *mbstr)
+{
+  int encoding = GetDatabaseEncoding();
+
+  return((*pg_wchar_table[encoding].mblen)(mbstr));
+}
+
+/*
+ * Number of characters (not bytes) in a NUL-terminated multi-byte
+ * string, stepping by pg_mblen().
+ */
+int pg_mbstrlen(const unsigned char *mbstr)
+{
+  int nchars;
+
+  for (nchars = 0; *mbstr; nchars++) {
+    mbstr += pg_mblen(mbstr);
+  }
+  return(nchars);
+}
+
+/*
+ * Number of characters (not bytes) in a multi-byte string that need not
+ * be NUL terminated; counting stops at a NUL byte or once "limit" bytes
+ * have been consumed.
+ */
+int pg_mbstrlen_with_len(const unsigned char *mbstr, int limit)
+{
+  int nchars;
+
+  for (nchars = 0; *mbstr && limit > 0; nchars++) {
+    int width = pg_mblen(mbstr);
+
+    mbstr += width;
+    limit -= width;
+  }
+  return(nchars);
+}
+
+/*
+ * functions for utils/init
+ */
+
+/* encoding of the current database; starts out as the MB build setting */
+static int DatabaseEncoding = MB;
+
+/* record the encoding of the current database */
+void
+SetDatabaseEncoding(int encoding)
+{
+  DatabaseEncoding = encoding;
+}
+
+/* return the encoding recorded by SetDatabaseEncoding() */
+int
+GetDatabaseEncoding()
+{
+  return DatabaseEncoding;
+}
+
+/*
+ * for builtin-function: name of the current database encoding
+ * ("" if the id has no table entry)
+ */
+const char *
+getdatabaseencoding()
+{
+  const char *name = pg_encoding_to_char(DatabaseEncoding);
+
+  return(name);
+}
+
+/* set and get template1 database encoding */
+static int templateEncoding;
+
+/* remember the encoding of the template1 database */
+void SetTemplateEncoding(int encoding)
+{
+  templateEncoding = encoding;
+}
+
+/* return the encoding stored by SetTemplateEncoding() */
+int GetTemplateEncoding()
+{
+  return templateEncoding;
+}
--- a/src/backend/utils/mb/utftest.c
+++ b/src/backend/utils/mb/utftest.c
@@ -0,0 +1,33 @@
+/*
+ * testing of utf2wchar()
+ * $Id: utftest.c,v 1.1 1998/07/24 03:31:57 scrappy Exp $
+ */
+#include <regex/regex.h>
+#include <regex/utils.h>
+#include <regex/regex2.h>
+
+#include <regex/pg_wchar.h>
+
+/*
+ * Decode the three UTF-8 samples from RFC 2044 with pg_utf2wchar() and
+ * print every resulting code as four hex digits, one sample per line.
+ */
+int
+main(void)
+{
+  /*
+   * The samples contain bytes above 0x7f, so they are stored as unsigned
+   * char; such values do not fit a plain char where char is signed.
+   */
+  /* Example 1 from RFC2044 */
+  unsigned char utf1[] = {0x41,0xe2,0x89,0xa2,0xce,0x91,0x2e,0};
+  /* Example 2 from RFC2044 */
+  unsigned char utf2[] = {0x48,0x69,0x20,0x4d,0x6f,0x6d,0x20,0xe2,0x98,0xba,0x21,0};
+  /* Example 3 from RFC2044 */
+  unsigned char utf3[] = {0xe6,0x97,0xa5,0xe6,0x9c,0xac,0xe8,0xaa,0x9e,0};
+  unsigned char *utf[] = {utf1,utf2,utf3};
+  pg_wchar ucs[128];
+  pg_wchar *p;
+  int i;
+  int nsamples = sizeof(utf)/sizeof(utf[0]);
+
+  for (i=0;i<nsamples;i++) {
+    pg_utf2wchar(utf[i],ucs);
+    for (p = ucs; *p; p++) {
+      printf("%04x ",*p);
+    }
+    printf("\n");
+  }
+  return(0);
+}
--- a/src/backend/utils/mb/variable.c
+++ b/src/backend/utils/mb/variable.c
@@ -0,0 +1,73 @@
+/*
+ * This file contains some public functions
+ * related to show/set/reset variable commands.
+ * Tatsuo Ishii
+ * $Id: variable.c,v 1.1 1998/07/24 03:31:57 scrappy Exp $
+ */
+
+#include "mb/pg_wchar.h"
+
+/*
+ * SET CLIENT_ENCODING: switch to the encoding named by "value".
+ * Reports an ERROR through elog() if the name is unknown or if no
+ * conversion to the database encoding is available.
+ */
+bool
+parse_client_encoding(const char *value)
+{
+  int encoding = pg_valid_client_encoding(value);
+
+  if (encoding < 0) {
+    elog(ERROR, "Client encoding %s is not supported", value);
+  } else if (pg_set_client_encoding(encoding)) {
+    elog(ERROR, "Conversion between %s and %s is not supported",
+	 value, pg_encoding_to_char(GetDatabaseEncoding()));
+  }
+  return TRUE;
+}
+
+/* SHOW CLIENT_ENCODING: report the current client encoding as a NOTICE */
+bool
+show_client_encoding()
+{
+  const char *name = pg_encoding_to_char(pg_get_client_encoding());
+
+  elog(NOTICE, "Current client encoding is %s", name);
+  return TRUE;
+}
+
+/*
+ * RESET CLIENT_ENCODING: use the encoding named by the PGCLIENTENCODING
+ * environment variable; if it is unset or names no known encoding, use
+ * the database encoding.  The result of pg_set_client_encoding() is
+ * not checked.
+ */
+bool
+reset_client_encoding()
+{
+  char *env = getenv("PGCLIENTENCODING");
+  int encoding = env ? pg_char_to_encoding(env) : -1;
+
+  if (encoding < 0) {
+    /* unset or unrecognized */
+    encoding = GetDatabaseEncoding();
+  }
+  pg_set_client_encoding(encoding);
+  return TRUE;
+}
+
+/*
+ * SET SERVER_ENCODING: not supported.  "value" is ignored; only a
+ * NOTICE is issued.
+ */
+bool
+parse_server_encoding(const char *value)
+{
+  (void) value;
+
+  elog(NOTICE, "SET SERVER_ENCODING is not supported");
+  return TRUE;
+}
+
+/* SHOW SERVER_ENCODING: report the database encoding as a NOTICE */
+bool
+show_server_encoding()
+{
+  const char *name = pg_encoding_to_char(GetDatabaseEncoding());
+
+  elog(NOTICE, "Current server encoding is %s", name);
+  return TRUE;
+}
+
+/* RESET SERVER_ENCODING: not supported; only a NOTICE is issued */
+bool
+reset_server_encoding()
+{
+  elog(NOTICE, "RESET SERVER_ENCODING is not supported");
+
+  return TRUE;
+}
--- a/src/backend/utils/mb/wchar.c
+++ b/src/backend/utils/mb/wchar.c
@@ -0,0 +1,358 @@
+/*
+ * conversion functions between pg_wchar and multi-byte streams.
+ * Tatsuo Ishii
+ * $Id: wchar.c,v 1.1 1998/07/24 03:31:57 scrappy Exp $
+ */
+
+#include "mb/pg_wchar.h"
+
+/*
+ * conversion to pg_wchar is done by "table driven."
+ * to add an encoding support, define mb2wchar_with_len(), mblen()
+ * for the particular encoding. Note that if the encoding is only
+ * supported in the client, you don't need to define 
+ * mb2wchar_with_len() function (SJIS is the case).
+ */
+/*
+ * Generic EUC to pg_wchar conversion.  Reads until a NUL byte or until
+ * len bytes have been consumed and zero-terminates "to".
+ *   SS2 x   -> x
+ *   SS3 x y -> (x << 8) | (y & 0x3f)
+ *   x y     -> (x << 8) | y	(x has the high bit set)
+ *   x       -> x
+ */
+static void pg_euc2wchar_with_len
+(const unsigned char *from, pg_wchar *to, int len)
+{
+  for (; *from && len > 0; to++) {
+    if (*from == SS2) {
+      *to = 0xff & from[1];
+      from += 2;
+      len -= 2;
+    } else if (*from == SS3) {
+      *to = (from[1] << 8) | (0x3f & from[2]);
+      from += 3;
+      len -= 3;
+    } else if (*from & 0x80) {
+      *to = (from[0] << 8) | from[1];
+      from += 2;
+      len -= 2;
+    } else {
+      *to = *from++;
+      len--;
+    }
+  }
+  *to = 0;
+}
+
+/*
+ * Byte length of the EUC character starting at s:
+ * 2 after SS2, 3 after SS3, 2 for any other high-bit byte, else 1.
+ */
+static int pg_euc_mblen(const unsigned char *s)
+{
+  if (*s == SS2) {
+    return(2);
+  }
+  if (*s == SS3) {
+    return(3);
+  }
+  return((*s & 0x80) ? 2 : 1);
+}
+
+/*
+ * EUC_JP: handled entirely by the generic EUC routines.
+ */
+static void pg_eucjp2wchar_with_len
+(const unsigned char *from, pg_wchar *to, int len)
+{
+  pg_euc2wchar_with_len(from, to, len);
+}
+
+static int pg_eucjp_mblen(const unsigned char *s)
+{
+  int width = pg_euc_mblen(s);
+
+  return(width);
+}
+
+/*
+ * EUC_KR: handled entirely by the generic EUC routines.
+ */
+static void pg_euckr2wchar_with_len
+(const unsigned char *from, pg_wchar *to, int len)
+{
+  pg_euc2wchar_with_len(from, to, len);
+}
+
+static int pg_euckr_mblen(const unsigned char *s)
+{
+  int width = pg_euc_mblen(s);
+
+  return(width);
+}
+
+/*
+ * EUC_CN to pg_wchar conversion.  Reads until a NUL byte or until len
+ * bytes have been consumed and zero-terminates "to".
+ *   SS2 x y -> ((x << 8) & 0x3f00) | y
+ *   SS3 x y -> (x << 8) | (y & 0x3f)
+ *   x y     -> (x << 8) | y	(x has the high bit set)
+ *   x       -> x
+ */
+static void pg_euccn2wchar_with_len
+(const unsigned char *from, pg_wchar *to, int len)
+{
+  while (*from && len > 0) {
+    if (*from == SS2) {
+      from++;
+      len--;
+      *to = 0x3f00 & (*from++ << 8);
+      /*
+       * merge the second byte into the value; a plain assignment here
+       * would throw away the first byte stored just above
+       */
+      *to |= *from++;
+      len -= 2;
+    } else if (*from == SS3) {
+      from++;
+      *to = *from++ << 8;
+      *to |= 0x3f & *from++;
+      len -= 3;
+    } else if (*from & 0x80) {
+      *to = *from++ << 8;
+      *to |= *from++;
+      len -= 2;
+    } else {
+      *to = *from++;
+      len--;
+    }
+    to++;
+  }
+  *to = 0;
+}
+
+/*
+ * Byte length of the EUC_CN character starting at s:
+ * 3 after SS2 or SS3, 2 for any other high-bit byte, else 1.
+ */
+static int pg_euccn_mblen(const unsigned char *s)
+{
+  if (*s == SS2) {
+    return(3);
+  }
+  if (*s == SS3) {
+    return(3);
+  }
+  return((*s & 0x80) ? 2 : 1);
+}
+
+/*
+ * EUC_TW to pg_wchar conversion.  Reads until a NUL byte or until len
+ * bytes have been consumed and zero-terminates "to".
+ *   SS2 x y z -> (x << 16) | (y << 8) | z
+ *   SS3 x y   -> (x << 8) | (y & 0x3f)
+ *   x y       -> (x << 8) | y	(x has the high bit set)
+ *   x         -> x
+ */
+static void pg_euctw2wchar_with_len
+(const unsigned char *from, pg_wchar *to, int len)
+{
+  for (; *from && len > 0; to++) {
+    if (*from == SS2) {
+      *to = (from[1] << 16) | (from[2] << 8) | from[3];
+      from += 4;
+      len -= 4;
+    } else if (*from == SS3) {
+      *to = (from[1] << 8) | (0x3f & from[2]);
+      from += 3;
+      len -= 3;
+    } else if (*from & 0x80) {
+      *to = (from[0] << 8) | from[1];
+      from += 2;
+      len -= 2;
+    } else {
+      *to = *from++;
+      len--;
+    }
+  }
+  *to = 0;
+}
+
+/*
+ * Byte length of the EUC_TW character starting at s:
+ * 4 after SS2, 3 after SS3, 2 for any other high-bit byte, else 1.
+ */
+static int pg_euctw_mblen(const unsigned char *s)
+{
+  if (*s == SS2) {
+    return(4);
+  }
+  if (*s == SS3) {
+    return(3);
+  }
+  return((*s & 0x80) ? 2 : 1);
+}
+
+/*
+ * convert UTF-8 to pg_wchar (UCS-2)
+ * caller should allocate enough space for "to"
+ * len: length of from.
+ * "from" not necessarily null terminated.
+ *
+ * One-, two- and three-byte sequences are decoded.  A byte that cannot
+ * start a sequence (0x80..0xbf) is copied through as a single code so
+ * that the scan always makes progress.
+ */
+static void pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
+{
+  unsigned char c1,c2,c3;
+  while (*from && len > 0) {
+    if ((*from & 0x80) == 0) {
+      *to = *from++;
+      len--;
+    } else if ((*from & 0xe0) == 0xc0) {
+      c1 = *from++ & 0x1f;
+      c2 = *from++ & 0x3f;
+      len -= 2;
+      *to = c1 << 6;
+      *to |= c2;
+    } else if ((*from & 0xe0) == 0xe0) {
+      c1 = *from++ & 0x0f;
+      c2 = *from++ & 0x3f;
+      c3 = *from++ & 0x3f;
+      len -= 3;
+      *to = c1 << 12;
+      *to |= c2 << 6;
+      *to |= c3;
+    } else {
+      /*
+       * stray continuation byte: without consuming it here neither
+       * "from" nor "len" would change, and the loop would run forever
+       * while writing past the end of "to"
+       */
+      *to = *from++;
+      len--;
+    }
+    to++;
+  }
+  *to = 0;
+}
+
+/*
+ * Byte length of the UTF-8 sequence whose first byte is *s:
+ * 2 for 110xxxxx, 3 for 111xxxxx, and 1 for everything else
+ * (ASCII as well as stray continuation bytes).
+ */
+static int pg_utf_mblen(const unsigned char *s)
+{
+  unsigned char lead = *s;
+
+  if ((lead & 0xe0) == 0xc0) {
+    return(2);
+  }
+  if ((lead & 0xe0) == 0xe0) {
+    return(3);
+  }
+  return(1);
+}
+
+/*
+ * convert mule internal code to pg_wchar
+ * caller should allocate enough space for "to"
+ * len: length of from.
+ * "from" not necessarily null terminated.
+ *
+ * The leading (or, after a private prefix, charset) byte is placed at
+ * bit 16 and the code bytes below it; the private prefix byte itself is
+ * dropped.
+ */
+static void pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
+{
+  for (; *from && len > 0; to++) {
+    if (IS_LC1(*from)) {
+      *to = (from[0] << 16) | from[1];
+      from += 2;
+      len -= 2;
+    } else if (IS_LCPRV1(*from)) {
+      *to = (from[1] << 16) | from[2];
+      from += 3;
+      len -= 3;
+    } else if (IS_LC2(*from)) {
+      *to = (from[0] << 16) | (from[1] << 8) | from[2];
+      from += 3;
+      len -= 3;
+    } else if (IS_LCPRV2(*from)) {
+      *to = (from[1] << 16) | (from[2] << 8) | from[3];
+      from += 4;
+      len -= 4;
+    } else {
+      /* assume ASCII */
+      *to = (unsigned char)*from++;
+      len--;
+    }
+  }
+  *to = 0;
+}
+
+/*
+ * Byte length of the mule internal code character starting at s:
+ * 2 for IS_LC1, 3 for IS_LCPRV1 and IS_LC2, 4 for IS_LCPRV2,
+ * otherwise 1 (assumed ASCII).
+ */
+int pg_mule_mblen(const unsigned char *s)
+{
+  if (IS_LC1(*s)) {
+    return(2);
+  }
+  if (IS_LCPRV1(*s)) {
+    return(3);
+  }
+  if (IS_LC2(*s)) {
+    return(3);
+  }
+  if (IS_LCPRV2(*s)) {
+    return(4);
+  }
+  return(1);	/* assume ASCII */
+}
+
+/*
+ * ISO8859-1
+ * Every byte becomes one pg_wchar of the same value; stops at a NUL
+ * byte or after len bytes and zero-terminates "to".
+ */
+static void pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
+{
+  for (; *from && len > 0; len--) {
+    *to++ = *from++;
+  }
+  *to = 0;
+}
+
+/* every character is exactly one byte; s is not inspected */
+static int pg_latin1_mblen(const unsigned char *s)
+{
+  (void) s;
+
+  return 1;
+}
+
+/*
+ * SJIS
+ * Byte length of the character starting at s: 1 for ASCII and for the
+ * one-byte kana range 0xa1..0xdf, 2 for any other byte above 0x7f.
+ */
+static int pg_sjis_mblen(const unsigned char *s)
+{
+  unsigned char lead = *s;
+  int is_kana = (lead >= 0xa1 && lead <= 0xdf);
+
+  if (lead > 0x7f && !is_kana) {
+    return(2);	/* kanji */
+  }
+  return(1);	/* ASCII or 1 byte kana */
+}
+
+/*
+ * Per-encoding handlers: {mb2wchar_with_len, mblen}.  Callers index this
+ * table directly with an encoding id.
+ * Slots 0..10 are filled in, slots 11..31 are empty placeholders, and
+ * slot 32 provides only an mblen routine (SJIS has no mb2wchar routine).
+ * NOTE(review): the slot order presumably follows the numeric values of
+ * the encoding ids declared in mb/pg_wchar.h -- confirm before adding or
+ * reordering entries.
+ */
+pg_wchar_tbl pg_wchar_table[] = {
+  {pg_eucjp2wchar_with_len, pg_eucjp_mblen},
+  {pg_euccn2wchar_with_len, pg_euccn_mblen},
+  {pg_euckr2wchar_with_len, pg_euckr_mblen},
+  {pg_euctw2wchar_with_len, pg_euctw_mblen},
+  {pg_utf2wchar_with_len, pg_utf_mblen},
+  {pg_mule2wchar_with_len, pg_mule_mblen},
+  {pg_latin12wchar_with_len, pg_latin1_mblen},
+  {pg_latin12wchar_with_len, pg_latin1_mblen},
+  {pg_latin12wchar_with_len, pg_latin1_mblen},
+  {pg_latin12wchar_with_len, pg_latin1_mblen},
+  {pg_latin12wchar_with_len, pg_latin1_mblen},
+  {0, 0},
+  {0, 0},
+  {0, 0},
+  {0, 0},
+  {0, 0},
+  {0, 0},
+  {0, 0},
+  {0, 0},
+  {0, 0},
+  {0, 0},
+  {0, 0},
+  {0, 0},
+  {0, 0},
+  {0, 0},
+  {0, 0},
+  {0, 0},
+  {0, 0},
+  {0, 0},
+  {0, 0},
+  {0, 0},
+  {0, 0},
+  {0, pg_sjis_mblen}
+};
+
+/*
+ * Byte length of the mule internal code character starting at mbstr
+ * (public alias for pg_mule_mblen()).
+ */
+int pg_mic_mblen(const unsigned char *mbstr)
+{
+  int width = pg_mule_mblen(mbstr);
+
+  return(width);
+}
--- a/src/backend/utils/mb/wstrcmp.c
+++ b/src/backend/utils/mb/wstrcmp.c
@@ -0,0 +1,48 @@
+/*-
+ * Copyright (c) 1990, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Chris Torek.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <mb/pg_wchar.h>
+
+/*
+ * strcmp() between a char string and a pg_wchar string.
+ * Returns 0 when both are equal up to and including the terminator,
+ * otherwise the difference between the first pair of unequal elements
+ * (the char side read as unsigned char).
+ */
+int
+pg_char_and_wchar_strcmp(const char *s1, const pg_wchar *s2)
+{
+	for (; (pg_wchar) *s1 == *s2; s1++, s2++)
+	{
+		if (*s1 == 0)
+			return (0);
+	}
+	return (*(const unsigned char *) s1 - *s2);
+}
--- a/src/backend/utils/mb/wstrncmp.c
+++ b/src/backend/utils/mb/wstrncmp.c
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 1989, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from FreeBSD 2.2.1-RELEASE software.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by the University of
+ *	California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <mb/pg_wchar.h>
+
+/*
+ * strncmp() for pg_wchar strings: compares at most n elements and stops
+ * early at a terminator.  Returns 0 if no difference was found, else
+ * the difference between the first pair of unequal elements.
+ */
+int
+pg_wchar_strncmp(const pg_wchar *s1, const pg_wchar *s2, size_t n)
+{
+	for (; n != 0; n--, s1++, s2++)
+	{
+		if (*s1 != *s2)
+			return (*s1 - *s2);
+		if (*s1 == 0)
+			break;
+	}
+	return (0);
+}
+
+/*
+ * strncmp() between a char string and a pg_wchar string: compares at
+ * most n elements and stops early at a terminator.  Returns 0 if no
+ * difference was found, else the difference between the first pair of
+ * unequal elements (the char side read as unsigned char, exactly as in
+ * pg_char_and_wchar_strcmp()).
+ */
+int
+pg_char_and_wchar_strncmp(const char *s1, const pg_wchar *s2, size_t n)
+{
+
+	if (n == 0)
+		return (0);
+	do {
+		if ((pg_wchar) *s1 != *s2++)
+			/*
+			 * s1 points at a single char; it must not be read through
+			 * a pg_wchar pointer, which would fetch a whole wide
+			 * element (and possibly bytes past the end of the string).
+			 */
+			return (*(const unsigned char *) s1 -
+				*(const pg_wchar *) (s2 - 1));
+		if (*s1++ == 0)
+			break;
+	} while (--n != 0);
+	return (0);
+}
+
+/* number of pg_wchar elements before the terminating zero */
+size_t
+pg_wchar_strlen(const pg_wchar *str)
+{
+	const pg_wchar *end = str;
+
+	while (*end)
+		end++;
+	return (end - str);
+}