1
0
mirror of https://github.com/postgres/postgres.git synced 2025-07-03 20:02:46 +03:00
Included are patches intended to allow PostgreSQL to handle
multi-byte character sets such as EUC (Extended Unix Code), Unicode and
Mule internal code. With the MB patch you can use multi-byte character
sets in regexp and LIKE. The encoding system is chosen at
compile time.

To enable the MB extension, you need to define a variable "MB" in
Makefile.global or in Makefile.custom. For further information, please
take a look at README.mb under the doc directory.

(Note that unlike "jp patch" I do not use modified GNU regexp any
more. I changed Henry Spencer's regexp coming with PostgreSQL.)
This commit is contained in:
Marc G. Fournier
1998-03-15 07:39:04 +00:00
parent 31a925c4d0
commit 661ecf3c48
23 changed files with 1104 additions and 135 deletions

View File

@ -62,8 +62,8 @@ static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94";
*/
struct parse
{
char *next; /* next character in RE */
char *end; /* end of string (-> NUL normally) */
pg_wchar *next; /* next character in RE */
pg_wchar *end; /* end of string (-> NUL normally) */
int error; /* has an error been seen? */
sop *strip; /* malloced strip */
sopno ssize; /* malloced strip size (allocated) */
@ -93,7 +93,7 @@ extern "C"
static void p_b_term(struct parse * p, cset *cs);
static void p_b_cclass(struct parse * p, cset *cs);
static void p_b_eclass(struct parse * p, cset *cs);
static char p_b_symbol(struct parse * p);
static pg_wchar p_b_symbol(struct parse * p);
static char p_b_coll_elem(struct parse * p, int endc);
static char othercase(int ch);
static void bothcases(struct parse * p, int ch);
@ -120,6 +120,10 @@ extern "C"
static void stripsnug(struct parse * p, struct re_guts * g);
static void findmust(struct parse * p, struct re_guts * g);
static sopno pluscount(struct parse * p, struct re_guts * g);
static int pg_isdigit(int c);
static int pg_isalpha(int c);
static int pg_isupper(int c);
static int pg_islower(int c);
#ifdef __cplusplus
}
@ -127,7 +131,7 @@ extern "C"
#endif
/* ========= end header generated by ./mkh ========= */
static char nuls[10]; /* place to point scanner in event of
static pg_wchar nuls[10]; /* place to point scanner in event of
* error */
/*
@ -190,6 +194,9 @@ int cflags;
struct parse *p = &pa;
int i;
size_t len;
#ifdef MB
pg_wchar *wcp;
#endif
#ifdef REDEBUG
#define GOODFLAGS(f) (f)
@ -203,12 +210,31 @@ int cflags;
if (cflags & REG_PEND)
{
#ifdef MB
wcp = preg->patsave;
if (preg->re_endp < wcp)
return (REG_INVARG);
len = preg->re_endp - wcp;
#else
if (preg->re_endp < pattern)
return (REG_INVARG);
len = preg->re_endp - pattern;
#endif
}
else {
#ifdef MB
wcp = (pg_wchar *)malloc((strlen(pattern)+1) * sizeof(pg_wchar));
if (wcp == NULL) {
return (REG_ESPACE);
}
preg->patsave = wcp;
(void)pg_mb2wchar((unsigned char *)pattern,wcp);
len = pg_wchar_strlen(wcp);
#else
len = strlen((char *) pattern);
#endif
}
else
len = strlen((char *) pattern);
/* do the mallocs early so failure handling is easy */
g = (struct re_guts *) malloc(sizeof(struct re_guts) +
@ -227,7 +253,11 @@ int cflags;
/* set things up */
p->g = g;
p->next = (char *) pattern; /* convenience; we do not modify it */
#ifdef MB
p->next = wcp;
#else
p->next = pattern; /* convenience; we do not modify it */
#endif
p->end = p->next + len;
p->error = 0;
p->ncsalloc = 0;
@ -342,7 +372,7 @@ static void
p_ere_exp(p)
struct parse *p;
{
char c;
pg_wchar c;
sopno pos;
int count;
int count2;
@ -420,7 +450,7 @@ struct parse *p;
break;
case '{': /* okay as ordinary except if digit
* follows */
REQUIRE(!MORE() || !isdigit(PEEK()), REG_BADRPT);
REQUIRE(!MORE() || !pg_isdigit(PEEK()), REG_BADRPT);
/* FALLTHROUGH */
default:
ordinary(p, c);
@ -432,7 +462,7 @@ struct parse *p;
c = PEEK();
/* we call { a repetition if followed by a digit */
if (!(c == '*' || c == '+' || c == '?' ||
(c == '{' && MORE2() && isdigit(PEEK2()))))
(c == '{' && MORE2() && pg_isdigit(PEEK2()))))
return; /* no repetition, we're done */
NEXT();
@ -463,7 +493,7 @@ struct parse *p;
count = p_count(p);
if (EAT(','))
{
if (isdigit(PEEK()))
if (pg_isdigit(PEEK()))
{
count2 = p_count(p);
REQUIRE(count <= count2, REG_BADBR);
@ -490,7 +520,7 @@ struct parse *p;
return;
c = PEEK();
if (!(c == '*' || c == '+' || c == '?' ||
(c == '{' && MORE2() && isdigit(PEEK2()))))
(c == '{' && MORE2() && pg_isdigit(PEEK2()))))
return;
SETERROR(REG_BADRPT);
}
@ -568,7 +598,7 @@ int starordinary; /* is a leading * an ordinary character? */
int i;
sopno subno;
#define BACKSL (1<<CHAR_BIT)
#define BACKSL (1<<24)
pos = HERE(); /* repetion op, if any, covers from here */
@ -577,7 +607,11 @@ int starordinary; /* is a leading * an ordinary character? */
if (c == '\\')
{
REQUIRE(MORE(), REG_EESCAPE);
#ifdef MB
c = BACKSL | (pg_wchar) GETNEXT();
#else
c = BACKSL | (unsigned char) GETNEXT();
#endif
}
switch (c)
{
@ -660,7 +694,7 @@ int starordinary; /* is a leading * an ordinary character? */
count = p_count(p);
if (EAT(','))
{
if (MORE() && isdigit(PEEK()))
if (MORE() && pg_isdigit(PEEK()))
{
count2 = p_count(p);
REQUIRE(count <= count2, REG_BADBR);
@ -698,7 +732,7 @@ struct parse *p;
int count = 0;
int ndigits = 0;
while (MORE() && isdigit(PEEK()) && count <= DUPMAX)
while (MORE() && pg_isdigit(PEEK()) && count <= DUPMAX)
{
count = count * 10 + (GETNEXT() - '0');
ndigits++;
@ -721,15 +755,27 @@ struct parse *p;
{
cset *cs = allocset(p);
int invert = 0;
#ifdef MB
pg_wchar sp1[] = {'[', ':', '<', ':', ']', ']'};
pg_wchar sp2[] = {'[', ':', '>', ':', ']', ']'};
#endif
/* Dept of Truly Sickening Special-Case Kludges */
#ifdef MB
if (p->next + 5 < p->end && pg_wchar_strncmp(p->next, sp1, 6) == 0)
#else
if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0)
#endif
{
EMIT(OBOW, 0);
NEXTn(6);
return;
}
#ifdef MB
if (p->next + 5 < p->end && pg_wchar_strncmp(p->next, sp2, 6) == 0)
#else
if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0)
#endif
{
EMIT(OEOW, 0);
NEXTn(6);
@ -757,7 +803,7 @@ struct parse *p;
int ci;
for (i = p->g->csetsize - 1; i >= 0; i--)
if (CHIN(cs, i) && isalpha(i))
if (CHIN(cs, i) && pg_isalpha(i))
{
ci = othercase(i);
if (ci != i)
@ -801,8 +847,8 @@ p_b_term(p, cs)
struct parse *p;
cset *cs;
{
char c;
char start,
pg_wchar c;
pg_wchar start,
finish;
int i;
@ -857,6 +903,11 @@ cset *cs;
finish = start;
/* xxx what about signed chars here... */
REQUIRE(start <= finish, REG_ERANGE);
#ifdef MB
if (CHlc(start) != CHlc(finish)) {
SETERROR(REG_ERANGE);
}
#endif
for (i = start; i <= finish; i++)
CHadd(cs, i);
break;
@ -872,17 +923,21 @@ p_b_cclass(p, cs)
struct parse *p;
cset *cs;
{
char *sp = p->next;
pg_wchar *sp = p->next;
struct cclass *cp;
size_t len;
char *u;
char c;
while (MORE() && isalpha(PEEK()))
while (MORE() && pg_isalpha(PEEK()))
NEXT();
len = p->next - sp;
for (cp = cclasses; cp->name != NULL; cp++)
#ifdef MB
if (pg_char_and_wchar_strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
#else
if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
#endif
break;
if (cp->name == NULL)
{
@ -919,11 +974,11 @@ cset *cs;
- p_b_symbol - parse a character or [..]ed multicharacter collating symbol
== static char p_b_symbol(struct parse *p);
*/
static char /* value of symbol */
static pg_wchar /* value of symbol */
p_b_symbol(p)
struct parse *p;
{
char value;
pg_wchar value;
REQUIRE(MORE(), REG_EBRACK);
if (!EATTWO('[', '.'))
@ -944,7 +999,7 @@ p_b_coll_elem(p, endc)
struct parse *p;
int endc; /* name ended by endc,']' */
{
char *sp = p->next;
pg_wchar *sp = p->next;
struct cname *cp;
int len;
@ -957,7 +1012,11 @@ int endc; /* name ended by endc,']' */
}
len = p->next - sp;
for (cp = cnames; cp->name != NULL; cp++)
#ifdef MB
if (pg_char_and_wchar_strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
#else
if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
#endif
return (cp->code); /* known name */
if (len == 1)
return (*sp); /* single character */
@ -973,10 +1032,10 @@ static char /* if no counterpart, return ch */
othercase(ch)
int ch;
{
assert(isalpha(ch));
if (isupper(ch))
assert(pg_isalpha(ch));
if (pg_isupper(ch))
return (tolower(ch));
else if (islower(ch))
else if (pg_islower(ch))
return (toupper(ch));
else
/* peculiar, but could happen */
@ -994,9 +1053,9 @@ bothcases(p, ch)
struct parse *p;
int ch;
{
char *oldnext = p->next;
char *oldend = p->end;
char bracket[3];
pg_wchar *oldnext = p->next;
pg_wchar *oldend = p->end;
pg_wchar bracket[3];
assert(othercase(ch) != ch);/* p_bracket() would recurse */
p->next = bracket;
@ -1021,12 +1080,16 @@ int ch;
{
cat_t *cap = p->g->categories;
if ((p->g->cflags & REG_ICASE) && isalpha(ch) && othercase(ch) != ch)
if ((p->g->cflags & REG_ICASE) && pg_isalpha(ch) && othercase(ch) != ch)
bothcases(p, ch);
else
{
#ifdef MB
EMIT(OCHAR, (pg_wchar) ch);
#else
EMIT(OCHAR, (unsigned char) ch);
if (cap[ch] == 0)
#endif
if (ch >= CHAR_MIN && ch <= CHAR_MAX && cap[ch] == 0)
cap[ch] = p->g->ncategories++;
}
}
@ -1041,9 +1104,9 @@ static void
nonnewline(p)
struct parse *p;
{
char *oldnext = p->next;
char *oldend = p->end;
char bracket[4];
pg_wchar *oldnext = p->next;
pg_wchar *oldend = p->end;
pg_wchar bracket[4];
p->next = bracket;
p->end = bracket + 3;
@ -1674,7 +1737,7 @@ struct re_guts *g;
sop *newstart = 0;
sopno newlen;
sop s;
char *cp;
pg_wchar *cp;
sopno i;
/* avoid making error situations worse */
@ -1729,7 +1792,11 @@ struct re_guts *g;
return;
/* turn it into a character string */
#ifdef MB
g->must = (pg_wchar *)malloc((size_t) (g->mlen + 1)*sizeof(pg_wchar));
#else
g->must = malloc((size_t) g->mlen + 1);
#endif
if (g->must == NULL)
{ /* argh; just forget it */
g->mlen = 0;
@ -1742,7 +1809,7 @@ struct re_guts *g;
while (OP(s = *scan++) != OCHAR)
continue;
assert(cp < g->must + g->mlen);
*cp++ = (char) OPND(s);
*cp++ = (pg_wchar) OPND(s);
}
assert(cp == g->must + g->mlen);
*cp++ = '\0'; /* just on general principles */
@ -1785,3 +1852,42 @@ struct re_guts *g;
g->iflags |= BAD;
return (maxnest);
}
/*
* some ctype functions with a non-ASCII-character guard
*/
/*
 * pg_isdigit - isdigit() wrapper that never hands <ctype.h> an
 * out-of-range argument.
 *
 * c: character code to classify.
 *
 * Returns nonzero if c is a decimal digit according to isdigit(),
 * and 0 for any code that cannot be classified.
 */
static int pg_isdigit(int c)
{
#ifdef MB
	/* codes that do not fit in one byte are never digits here */
	return (c >= 0 && c <= UCHAR_MAX && isdigit(c));
#else
	/*
	 * isdigit() is only defined for EOF and values representable as
	 * unsigned char.  A high-bit byte read through a signed plain char
	 * arrives here negative, so reject anything that cannot have come
	 * from a single char and convert the rest before the lookup.
	 */
	return ((c >= CHAR_MIN && c <= UCHAR_MAX) ? isdigit((unsigned char) c) : 0);
#endif
}
/*
 * pg_isalpha - isalpha() wrapper that never hands <ctype.h> an
 * out-of-range argument.
 *
 * c: character code to classify.
 *
 * Returns nonzero if c is alphabetic according to isalpha(),
 * and 0 for any code that cannot be classified.
 */
static int pg_isalpha(int c)
{
#ifdef MB
	/* codes that do not fit in one byte are never treated as letters */
	return (c >= 0 && c <= UCHAR_MAX && isalpha(c));
#else
	/*
	 * isalpha() is only defined for EOF and values representable as
	 * unsigned char.  A high-bit byte read through a signed plain char
	 * arrives here negative, so reject anything that cannot have come
	 * from a single char and convert the rest before the lookup.
	 */
	return ((c >= CHAR_MIN && c <= UCHAR_MAX) ? isalpha((unsigned char) c) : 0);
#endif
}
/*
 * pg_isupper - isupper() wrapper that never hands <ctype.h> an
 * out-of-range argument.
 *
 * c: character code to classify.
 *
 * Returns nonzero if c is an upper-case letter according to isupper(),
 * and 0 for any code that cannot be classified.
 */
static int pg_isupper(int c)
{
#ifdef MB
	/* codes that do not fit in one byte have no case here */
	return (c >= 0 && c <= UCHAR_MAX && isupper(c));
#else
	/*
	 * isupper() is only defined for EOF and values representable as
	 * unsigned char.  A high-bit byte read through a signed plain char
	 * arrives here negative, so reject anything that cannot have come
	 * from a single char and convert the rest before the lookup.
	 */
	return ((c >= CHAR_MIN && c <= UCHAR_MAX) ? isupper((unsigned char) c) : 0);
#endif
}
/*
 * pg_islower - islower() wrapper that never hands <ctype.h> an
 * out-of-range argument.
 *
 * c: character code to classify.
 *
 * Returns nonzero if c is a lower-case letter according to islower(),
 * and 0 for any code that cannot be classified.
 */
static int pg_islower(int c)
{
#ifdef MB
	/* codes that do not fit in one byte have no case here */
	return (c >= 0 && c <= UCHAR_MAX && islower(c));
#else
	/*
	 * islower() is only defined for EOF and values representable as
	 * unsigned char.  A high-bit byte read through a signed plain char
	 * arrives here negative, so reject anything that cannot have come
	 * from a single char and convert the rest before the lookup.
	 */
	return ((c >= CHAR_MIN && c <= UCHAR_MAX) ? islower((unsigned char) c) : 0);
#endif
}