From: t-ishii@sra.co.jp

Included are patches intended to allow PostgreSQL to handle multi-byte character sets such as EUC (Extended Unix Code), Unicode and Mule internal code. With the MB patch you can use multi-byte character sets in regexp and LIKE. The encoding system is chosen at compile time. To enable the MB extension, you need to define a variable "MB" in Makefile.global or in Makefile.custom. For further information please take a look at README.mb under the doc directory. (Note that, unlike the "jp patch", I no longer use a modified GNU regexp; instead I changed the Henry Spencer regexp that comes with PostgreSQL.)
2025-08-24 09:27:52 +03:00 · 1998-03-15 07:39:04 +00:00
parent 31a925c4d0
commit 661ecf3c48
23 changed files with 1104 additions and 135 deletions
--- a/src/backend/regex/regcomp.c
+++ b/src/backend/regex/regcomp.c
@@ -62,8 +62,8 @@ static char sccsid[] = "@(#)regcomp.c	8.5 (Berkeley) 3/20/94";
 */
 struct parse
 {
-	char	   *next;			/* next character in RE */
-	char	   *end;			/* end of string (-> NUL normally) */
+	pg_wchar   *next;			/* next character in RE */
+	pg_wchar   *end;			/* end of string (-> NUL normally) */
 	int			error;			/* has an error been seen? */
 	sop		   *strip;			/* malloced strip */
 	sopno		ssize;			/* malloced strip size (allocated) */
@@ -93,7 +93,7 @@ extern		"C"
 	static void p_b_term(struct parse * p, cset *cs);
 	static void p_b_cclass(struct parse * p, cset *cs);
 	static void p_b_eclass(struct parse * p, cset *cs);
-	static char p_b_symbol(struct parse * p);
+	static pg_wchar p_b_symbol(struct parse * p);
 	static char p_b_coll_elem(struct parse * p, int endc);
 	static char othercase(int ch);
 	static void bothcases(struct parse * p, int ch);
@@ -120,6 +120,10 @@ extern		"C"
 	static void stripsnug(struct parse * p, struct re_guts * g);
 	static void findmust(struct parse * p, struct re_guts * g);
 	static sopno pluscount(struct parse * p, struct re_guts * g);
+	static int pg_isdigit(int c);
+	static int pg_isalpha(int c);
+	static int pg_isupper(int c);
+	static int pg_islower(int c);

 #ifdef __cplusplus
 }
@@ -127,7 +131,7 @@ extern		"C"
 #endif
 /* ========= end header generated by ./mkh ========= */

-static char nuls[10];			/* place to point scanner in event of
+static pg_wchar nuls[10];			/* place to point scanner in event of
 								 * error */

 /*
@@ -190,6 +194,9 @@ int			cflags;
 	struct parse *p = &pa;
 	int			i;
 	size_t		len;
+#ifdef MB
+ 	pg_wchar *wcp;
+#endif

 #ifdef REDEBUG
 #define  GOODFLAGS(f)	 (f)
@@ -203,12 +210,31 @@ int			cflags;

 	if (cflags & REG_PEND)
 	{
+#ifdef MB
+	        wcp = preg->patsave;
+		if (preg->re_endp < wcp)
+			return (REG_INVARG);
+		len = preg->re_endp - wcp;
+#else
 		if (preg->re_endp < pattern)
 			return (REG_INVARG);
 		len = preg->re_endp - pattern;
+#endif
+	}
+	else {
+#ifdef MB
+	  wcp = (pg_wchar *)malloc((strlen(pattern)+1) * sizeof(pg_wchar));
+	  if (wcp == NULL) {
+	    return (REG_ESPACE);
+	  }
+	  preg->patsave = wcp;
+	  (void)pg_mb2wchar((unsigned char *)pattern,wcp);
+	  len = pg_wchar_strlen(wcp);
+#else
+
+	  len = strlen((char *) pattern);
+#endif
 	}
-	else
-		len = strlen((char *) pattern);

 	/* do the mallocs early so failure handling is easy */
 	g = (struct re_guts *) malloc(sizeof(struct re_guts) +
@@ -227,7 +253,11 @@ int			cflags;

 	/* set things up */
 	p->g = g;
-	p->next = (char *) pattern; /* convenience; we do not modify it */
+#ifdef MB
+	p->next = wcp;
+#else
+	p->next = pattern; /* convenience; we do not modify it */
+#endif
 	p->end = p->next + len;
 	p->error = 0;
 	p->ncsalloc = 0;
@@ -342,7 +372,7 @@ static void
 p_ere_exp(p)
 struct parse *p;
 {
-	char		c;
+	pg_wchar	c;
 	sopno		pos;
 	int			count;
 	int			count2;
@@ -420,7 +450,7 @@ struct parse *p;
 			break;
 		case '{':				/* okay as ordinary except if digit
 								 * follows */
-			REQUIRE(!MORE() || !isdigit(PEEK()), REG_BADRPT);
+			REQUIRE(!MORE() || !pg_isdigit(PEEK()), REG_BADRPT);
 			/* FALLTHROUGH */
 		default:
 			ordinary(p, c);
@@ -432,7 +462,7 @@ struct parse *p;
 	c = PEEK();
 	/* we call { a repetition if followed by a digit */
 	if (!(c == '*' || c == '+' || c == '?' ||
-		  (c == '{' && MORE2() && isdigit(PEEK2()))))
+		  (c == '{' && MORE2() && pg_isdigit(PEEK2()))))
 		return;					/* no repetition, we're done */
 	NEXT();

@@ -463,7 +493,7 @@ struct parse *p;
 			count = p_count(p);
 			if (EAT(','))
 			{
-				if (isdigit(PEEK()))
+				if (pg_isdigit(PEEK()))
 				{
 					count2 = p_count(p);
 					REQUIRE(count <= count2, REG_BADBR);
@@ -490,7 +520,7 @@ struct parse *p;
 		return;
 	c = PEEK();
 	if (!(c == '*' || c == '+' || c == '?' ||
-		  (c == '{' && MORE2() && isdigit(PEEK2()))))
+		  (c == '{' && MORE2() && pg_isdigit(PEEK2()))))
 		return;
 	SETERROR(REG_BADRPT);
 }
@@ -568,7 +598,7 @@ int			starordinary;		/* is a leading * an ordinary character? */
 	int			i;
 	sopno		subno;

-#define  BACKSL  (1<<CHAR_BIT)
+#define  BACKSL  (1<<24)

 	pos = HERE();				/* repetion op, if any, covers from here */

@@ -577,7 +607,11 @@ int			starordinary;		/* is a leading * an ordinary character? */
 	if (c == '\\')
 	{
 		REQUIRE(MORE(), REG_EESCAPE);
+#ifdef MB
+		c = BACKSL | (pg_wchar) GETNEXT();
+#else
 		c = BACKSL | (unsigned char) GETNEXT();
+#endif
 	}
 	switch (c)
 	{
@@ -660,7 +694,7 @@ int			starordinary;		/* is a leading * an ordinary character? */
 		count = p_count(p);
 		if (EAT(','))
 		{
-			if (MORE() && isdigit(PEEK()))
+			if (MORE() && pg_isdigit(PEEK()))
 			{
 				count2 = p_count(p);
 				REQUIRE(count <= count2, REG_BADBR);
@@ -698,7 +732,7 @@ struct parse *p;
 	int			count = 0;
 	int			ndigits = 0;

-	while (MORE() && isdigit(PEEK()) && count <= DUPMAX)
+	while (MORE() && pg_isdigit(PEEK()) && count <= DUPMAX)
 	{
 		count = count * 10 + (GETNEXT() - '0');
 		ndigits++;
@@ -721,15 +755,27 @@ struct parse *p;
 {
 	cset	   *cs = allocset(p);
 	int			invert = 0;
+#ifdef MB
+	pg_wchar sp1[] = {'[', ':', '<', ':', ']', ']'};
+	pg_wchar sp2[] = {'[', ':', '>', ':', ']', ']'};
+#endif

 	/* Dept of Truly Sickening Special-Case Kludges */
+#ifdef MB
+ 	if (p->next + 5 < p->end && pg_wchar_strncmp(p->next, sp1, 6) == 0)
+#else
 	if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0)
+#endif
 	{
 		EMIT(OBOW, 0);
 		NEXTn(6);
 		return;
 	}
+#ifdef MB
+ 	if (p->next + 5 < p->end && pg_wchar_strncmp(p->next, sp2, 6) == 0)
+#else
 	if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0)
+#endif
 	{
 		EMIT(OEOW, 0);
 		NEXTn(6);
@@ -757,7 +803,7 @@ struct parse *p;
 		int			ci;

 		for (i = p->g->csetsize - 1; i >= 0; i--)
-			if (CHIN(cs, i) && isalpha(i))
+			if (CHIN(cs, i) && pg_isalpha(i))
 			{
 				ci = othercase(i);
 				if (ci != i)
@@ -801,8 +847,8 @@ p_b_term(p, cs)
 struct parse *p;
 cset	   *cs;
 {
-	char		c;
-	char		start,
+	pg_wchar		c;
+	pg_wchar		start,
 				finish;
 	int			i;

@@ -857,6 +903,11 @@ cset	   *cs;
 				finish = start;
 /* xxx what about signed chars here... */
 			REQUIRE(start <= finish, REG_ERANGE);
+#ifdef MB
+		  if (CHlc(start) != CHlc(finish)) {
+		    SETERROR(REG_ERANGE);
+		  }
+#endif
 			for (i = start; i <= finish; i++)
 				CHadd(cs, i);
 			break;
@@ -872,17 +923,21 @@ p_b_cclass(p, cs)
 struct parse *p;
 cset	   *cs;
 {
-	char	   *sp = p->next;
+	pg_wchar   *sp = p->next;
 	struct cclass *cp;
 	size_t		len;
 	char	   *u;
 	char		c;

-	while (MORE() && isalpha(PEEK()))
+	while (MORE() && pg_isalpha(PEEK()))
 		NEXT();
 	len = p->next - sp;
 	for (cp = cclasses; cp->name != NULL; cp++)
+#ifdef MB
+		if (pg_char_and_wchar_strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
+#else
 		if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
+#endif
 			break;
 	if (cp->name == NULL)
 	{
@@ -919,11 +974,11 @@ cset	   *cs;
 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
 == static char p_b_symbol(struct parse *p);
 */
-static char						/* value of symbol */
+static pg_wchar						/* value of symbol */
 p_b_symbol(p)
 struct parse *p;
 {
-	char		value;
+	pg_wchar		value;

 	REQUIRE(MORE(), REG_EBRACK);
 	if (!EATTWO('[', '.'))
@@ -944,7 +999,7 @@ p_b_coll_elem(p, endc)
 struct parse *p;
 int			endc;				/* name ended by endc,']' */
 {
-	char	   *sp = p->next;
+	pg_wchar	   *sp = p->next;
 	struct cname *cp;
 	int			len;

@@ -957,7 +1012,11 @@ int			endc;				/* name ended by endc,']' */
 	}
 	len = p->next - sp;
 	for (cp = cnames; cp->name != NULL; cp++)
+#ifdef MB
+		if (pg_char_and_wchar_strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
+#else
 		if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
+#endif
 			return (cp->code);	/* known name */
 	if (len == 1)
 		return (*sp);			/* single character */
@@ -973,10 +1032,10 @@ static char						/* if no counterpart, return ch */
 othercase(ch)
 int			ch;
 {
-	assert(isalpha(ch));
-	if (isupper(ch))
+	assert(pg_isalpha(ch));
+	if (pg_isupper(ch))
 		return (tolower(ch));
-	else if (islower(ch))
+	else if (pg_islower(ch))
 		return (toupper(ch));
 	else
 /* peculiar, but could happen */
@@ -994,9 +1053,9 @@ bothcases(p, ch)
 struct parse *p;
 int			ch;
 {
-	char	   *oldnext = p->next;
-	char	   *oldend = p->end;
-	char		bracket[3];
+	pg_wchar	   *oldnext = p->next;
+	pg_wchar	   *oldend = p->end;
+	pg_wchar		bracket[3];

 	assert(othercase(ch) != ch);/* p_bracket() would recurse */
 	p->next = bracket;
@@ -1021,12 +1080,16 @@ int			ch;
 {
 	cat_t	   *cap = p->g->categories;

-	if ((p->g->cflags & REG_ICASE) && isalpha(ch) && othercase(ch) != ch)
+	if ((p->g->cflags & REG_ICASE) && pg_isalpha(ch) && othercase(ch) != ch)
 		bothcases(p, ch);
 	else
 	{
+#ifdef MB
+		EMIT(OCHAR, (pg_wchar) ch);
+#else
 		EMIT(OCHAR, (unsigned char) ch);
-		if (cap[ch] == 0)
+#endif
+		if (ch >= CHAR_MIN && ch <= CHAR_MAX && cap[ch] == 0)
 			cap[ch] = p->g->ncategories++;
 	}
 }
@@ -1041,9 +1104,9 @@ static void
 nonnewline(p)
 struct parse *p;
 {
-	char	   *oldnext = p->next;
-	char	   *oldend = p->end;
-	char		bracket[4];
+	pg_wchar	   *oldnext = p->next;
+	pg_wchar	   *oldend = p->end;
+	pg_wchar		bracket[4];

 	p->next = bracket;
 	p->end = bracket + 3;
@@ -1674,7 +1737,7 @@ struct re_guts *g;
 	sop		   *newstart = 0;
 	sopno		newlen;
 	sop			s;
-	char	   *cp;
+	pg_wchar	   *cp;
 	sopno		i;

 	/* avoid making error situations worse */
@@ -1729,7 +1792,11 @@ struct re_guts *g;
 		return;

 	/* turn it into a character string */
+#ifdef MB
+	g->must = (pg_wchar *)malloc((size_t) (g->mlen + 1)*sizeof(pg_wchar));
+#else
 	g->must = malloc((size_t) g->mlen + 1);
+#endif
 	if (g->must == NULL)
 	{							/* argh; just forget it */
 		g->mlen = 0;
@@ -1742,7 +1809,7 @@ struct re_guts *g;
 		while (OP(s = *scan++) != OCHAR)
 			continue;
 		assert(cp < g->must + g->mlen);
-		*cp++ = (char) OPND(s);
+		*cp++ = (pg_wchar) OPND(s);
 	}
 	assert(cp == g->must + g->mlen);
 	*cp++ = '\0';				/* just on general principles */
@@ -1785,3 +1852,42 @@ struct re_guts *g;
 		g->iflags |= BAD;
 	return (maxnest);
 }
+
+/*
+ * some ctype functions with a non-ASCII character guard
+ */
+static int pg_isdigit(int c)	/* isdigit() wrapper: c is a character code; result is nonzero when c is a digit */
+{
+#ifdef MB
+  return(c >= 0 && c <= UCHAR_MAX && isdigit(c));	/* codes outside 0..UCHAR_MAX yield 0 and never reach isdigit() */
+#else
+  return(isdigit(c));	/* non-MB build: c is handed to isdigit() with no range check */
+#endif
+}
+
+static int pg_isalpha(int c)	/* isalpha() wrapper: c is a character code; result is nonzero when c is alphabetic */
+{
+#ifdef MB
+  return(c >= 0 && c <= UCHAR_MAX && isalpha(c));	/* codes outside 0..UCHAR_MAX yield 0 and never reach isalpha() */
+#else
+  return(isalpha(c));	/* non-MB build: c is handed to isalpha() with no range check */
+#endif
+}
+
+static int pg_isupper(int c)	/* isupper() wrapper: c is a character code; result is nonzero when c is upper case */
+{
+#ifdef MB
+  return(c >= 0 && c <= UCHAR_MAX && isupper(c));	/* codes outside 0..UCHAR_MAX yield 0 and never reach isupper() */
+#else
+  return(isupper(c));	/* non-MB build: c is handed to isupper() with no range check */
+#endif
+}
+
+static int pg_islower(int c)	/* islower() wrapper: c is a character code; result is nonzero when c is lower case */
+{
+#ifdef MB
+  return(c >= 0 && c <= UCHAR_MAX && islower(c));	/* codes outside 0..UCHAR_MAX yield 0 and never reach islower() */
+#else
+  return(islower(c));	/* non-MB build: c is handed to islower() with no range check */
+#endif
+}