mirror of
https://github.com/postgres/postgres.git
synced 2025-07-03 20:02:46 +03:00
From: t-ishii@sra.co.jp
Included are patches intended to allow PostgreSQL to handle multi-byte character sets such as EUC (Extended Unix Code), Unicode and Mule internal code. With the MB patch you can use multi-byte character sets in regexp and LIKE. The encoding system is chosen at compile time. To enable the MB extension, you need to define a variable "MB" in Makefile.global or in Makefile.custom. For further information please take a look at README.mb under the doc directory. (Note that unlike the "jp patch" I do not use a modified GNU regexp any more. I changed Henry Spencer's regexp that comes with PostgreSQL.)
This commit is contained in:
@ -62,8 +62,8 @@ static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94";
|
||||
*/
|
||||
struct parse
|
||||
{
|
||||
char *next; /* next character in RE */
|
||||
char *end; /* end of string (-> NUL normally) */
|
||||
pg_wchar *next; /* next character in RE */
|
||||
pg_wchar *end; /* end of string (-> NUL normally) */
|
||||
int error; /* has an error been seen? */
|
||||
sop *strip; /* malloced strip */
|
||||
sopno ssize; /* malloced strip size (allocated) */
|
||||
@ -93,7 +93,7 @@ extern "C"
|
||||
static void p_b_term(struct parse * p, cset *cs);
|
||||
static void p_b_cclass(struct parse * p, cset *cs);
|
||||
static void p_b_eclass(struct parse * p, cset *cs);
|
||||
static char p_b_symbol(struct parse * p);
|
||||
static pg_wchar p_b_symbol(struct parse * p);
|
||||
static char p_b_coll_elem(struct parse * p, int endc);
|
||||
static char othercase(int ch);
|
||||
static void bothcases(struct parse * p, int ch);
|
||||
@ -120,6 +120,10 @@ extern "C"
|
||||
static void stripsnug(struct parse * p, struct re_guts * g);
|
||||
static void findmust(struct parse * p, struct re_guts * g);
|
||||
static sopno pluscount(struct parse * p, struct re_guts * g);
|
||||
static int pg_isdigit(int c);
|
||||
static int pg_isalpha(int c);
|
||||
static int pg_isupper(int c);
|
||||
static int pg_islower(int c);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
@ -127,7 +131,7 @@ extern "C"
|
||||
#endif
|
||||
/* ========= end header generated by ./mkh ========= */
|
||||
|
||||
static char nuls[10]; /* place to point scanner in event of
|
||||
static pg_wchar nuls[10]; /* place to point scanner in event of
|
||||
* error */
|
||||
|
||||
/*
|
||||
@ -190,6 +194,9 @@ int cflags;
|
||||
struct parse *p = &pa;
|
||||
int i;
|
||||
size_t len;
|
||||
#ifdef MB
|
||||
pg_wchar *wcp;
|
||||
#endif
|
||||
|
||||
#ifdef REDEBUG
|
||||
#define GOODFLAGS(f) (f)
|
||||
@ -203,12 +210,31 @@ int cflags;
|
||||
|
||||
if (cflags & REG_PEND)
|
||||
{
|
||||
#ifdef MB
|
||||
wcp = preg->patsave;
|
||||
if (preg->re_endp < wcp)
|
||||
return (REG_INVARG);
|
||||
len = preg->re_endp - wcp;
|
||||
#else
|
||||
if (preg->re_endp < pattern)
|
||||
return (REG_INVARG);
|
||||
len = preg->re_endp - pattern;
|
||||
#endif
|
||||
}
|
||||
else {
|
||||
#ifdef MB
|
||||
wcp = (pg_wchar *)malloc((strlen(pattern)+1) * sizeof(pg_wchar));
|
||||
if (wcp == NULL) {
|
||||
return (REG_ESPACE);
|
||||
}
|
||||
preg->patsave = wcp;
|
||||
(void)pg_mb2wchar((unsigned char *)pattern,wcp);
|
||||
len = pg_wchar_strlen(wcp);
|
||||
#else
|
||||
|
||||
len = strlen((char *) pattern);
|
||||
#endif
|
||||
}
|
||||
else
|
||||
len = strlen((char *) pattern);
|
||||
|
||||
/* do the mallocs early so failure handling is easy */
|
||||
g = (struct re_guts *) malloc(sizeof(struct re_guts) +
|
||||
@ -227,7 +253,11 @@ int cflags;
|
||||
|
||||
/* set things up */
|
||||
p->g = g;
|
||||
p->next = (char *) pattern; /* convenience; we do not modify it */
|
||||
#ifdef MB
|
||||
p->next = wcp;
|
||||
#else
|
||||
p->next = pattern; /* convenience; we do not modify it */
|
||||
#endif
|
||||
p->end = p->next + len;
|
||||
p->error = 0;
|
||||
p->ncsalloc = 0;
|
||||
@ -342,7 +372,7 @@ static void
|
||||
p_ere_exp(p)
|
||||
struct parse *p;
|
||||
{
|
||||
char c;
|
||||
pg_wchar c;
|
||||
sopno pos;
|
||||
int count;
|
||||
int count2;
|
||||
@ -420,7 +450,7 @@ struct parse *p;
|
||||
break;
|
||||
case '{': /* okay as ordinary except if digit
|
||||
* follows */
|
||||
REQUIRE(!MORE() || !isdigit(PEEK()), REG_BADRPT);
|
||||
REQUIRE(!MORE() || !pg_isdigit(PEEK()), REG_BADRPT);
|
||||
/* FALLTHROUGH */
|
||||
default:
|
||||
ordinary(p, c);
|
||||
@ -432,7 +462,7 @@ struct parse *p;
|
||||
c = PEEK();
|
||||
/* we call { a repetition if followed by a digit */
|
||||
if (!(c == '*' || c == '+' || c == '?' ||
|
||||
(c == '{' && MORE2() && isdigit(PEEK2()))))
|
||||
(c == '{' && MORE2() && pg_isdigit(PEEK2()))))
|
||||
return; /* no repetition, we're done */
|
||||
NEXT();
|
||||
|
||||
@ -463,7 +493,7 @@ struct parse *p;
|
||||
count = p_count(p);
|
||||
if (EAT(','))
|
||||
{
|
||||
if (isdigit(PEEK()))
|
||||
if (pg_isdigit(PEEK()))
|
||||
{
|
||||
count2 = p_count(p);
|
||||
REQUIRE(count <= count2, REG_BADBR);
|
||||
@ -490,7 +520,7 @@ struct parse *p;
|
||||
return;
|
||||
c = PEEK();
|
||||
if (!(c == '*' || c == '+' || c == '?' ||
|
||||
(c == '{' && MORE2() && isdigit(PEEK2()))))
|
||||
(c == '{' && MORE2() && pg_isdigit(PEEK2()))))
|
||||
return;
|
||||
SETERROR(REG_BADRPT);
|
||||
}
|
||||
@ -568,7 +598,7 @@ int starordinary; /* is a leading * an ordinary character? */
|
||||
int i;
|
||||
sopno subno;
|
||||
|
||||
#define BACKSL (1<<CHAR_BIT)
|
||||
#define BACKSL (1<<24)
|
||||
|
||||
pos = HERE(); /* repetion op, if any, covers from here */
|
||||
|
||||
@ -577,7 +607,11 @@ int starordinary; /* is a leading * an ordinary character? */
|
||||
if (c == '\\')
|
||||
{
|
||||
REQUIRE(MORE(), REG_EESCAPE);
|
||||
#ifdef MB
|
||||
c = BACKSL | (pg_wchar) GETNEXT();
|
||||
#else
|
||||
c = BACKSL | (unsigned char) GETNEXT();
|
||||
#endif
|
||||
}
|
||||
switch (c)
|
||||
{
|
||||
@ -660,7 +694,7 @@ int starordinary; /* is a leading * an ordinary character? */
|
||||
count = p_count(p);
|
||||
if (EAT(','))
|
||||
{
|
||||
if (MORE() && isdigit(PEEK()))
|
||||
if (MORE() && pg_isdigit(PEEK()))
|
||||
{
|
||||
count2 = p_count(p);
|
||||
REQUIRE(count <= count2, REG_BADBR);
|
||||
@ -698,7 +732,7 @@ struct parse *p;
|
||||
int count = 0;
|
||||
int ndigits = 0;
|
||||
|
||||
while (MORE() && isdigit(PEEK()) && count <= DUPMAX)
|
||||
while (MORE() && pg_isdigit(PEEK()) && count <= DUPMAX)
|
||||
{
|
||||
count = count * 10 + (GETNEXT() - '0');
|
||||
ndigits++;
|
||||
@ -721,15 +755,27 @@ struct parse *p;
|
||||
{
|
||||
cset *cs = allocset(p);
|
||||
int invert = 0;
|
||||
#ifdef MB
|
||||
pg_wchar sp1[] = {'[', ':', '<', ':', ']', ']'};
|
||||
pg_wchar sp2[] = {'[', ':', '>', ':', ']', ']'};
|
||||
#endif
|
||||
|
||||
/* Dept of Truly Sickening Special-Case Kludges */
|
||||
#ifdef MB
|
||||
if (p->next + 5 < p->end && pg_wchar_strncmp(p->next, sp1, 6) == 0)
|
||||
#else
|
||||
if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0)
|
||||
#endif
|
||||
{
|
||||
EMIT(OBOW, 0);
|
||||
NEXTn(6);
|
||||
return;
|
||||
}
|
||||
#ifdef MB
|
||||
if (p->next + 5 < p->end && pg_wchar_strncmp(p->next, sp2, 6) == 0)
|
||||
#else
|
||||
if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0)
|
||||
#endif
|
||||
{
|
||||
EMIT(OEOW, 0);
|
||||
NEXTn(6);
|
||||
@ -757,7 +803,7 @@ struct parse *p;
|
||||
int ci;
|
||||
|
||||
for (i = p->g->csetsize - 1; i >= 0; i--)
|
||||
if (CHIN(cs, i) && isalpha(i))
|
||||
if (CHIN(cs, i) && pg_isalpha(i))
|
||||
{
|
||||
ci = othercase(i);
|
||||
if (ci != i)
|
||||
@ -801,8 +847,8 @@ p_b_term(p, cs)
|
||||
struct parse *p;
|
||||
cset *cs;
|
||||
{
|
||||
char c;
|
||||
char start,
|
||||
pg_wchar c;
|
||||
pg_wchar start,
|
||||
finish;
|
||||
int i;
|
||||
|
||||
@ -857,6 +903,11 @@ cset *cs;
|
||||
finish = start;
|
||||
/* xxx what about signed chars here... */
|
||||
REQUIRE(start <= finish, REG_ERANGE);
|
||||
#ifdef MB
|
||||
if (CHlc(start) != CHlc(finish)) {
|
||||
SETERROR(REG_ERANGE);
|
||||
}
|
||||
#endif
|
||||
for (i = start; i <= finish; i++)
|
||||
CHadd(cs, i);
|
||||
break;
|
||||
@ -872,17 +923,21 @@ p_b_cclass(p, cs)
|
||||
struct parse *p;
|
||||
cset *cs;
|
||||
{
|
||||
char *sp = p->next;
|
||||
pg_wchar *sp = p->next;
|
||||
struct cclass *cp;
|
||||
size_t len;
|
||||
char *u;
|
||||
char c;
|
||||
|
||||
while (MORE() && isalpha(PEEK()))
|
||||
while (MORE() && pg_isalpha(PEEK()))
|
||||
NEXT();
|
||||
len = p->next - sp;
|
||||
for (cp = cclasses; cp->name != NULL; cp++)
|
||||
#ifdef MB
|
||||
if (pg_char_and_wchar_strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
|
||||
#else
|
||||
if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
|
||||
#endif
|
||||
break;
|
||||
if (cp->name == NULL)
|
||||
{
|
||||
@ -919,11 +974,11 @@ cset *cs;
|
||||
- p_b_symbol - parse a character or [..]ed multicharacter collating symbol
|
||||
== static char p_b_symbol(struct parse *p);
|
||||
*/
|
||||
static char /* value of symbol */
|
||||
static pg_wchar /* value of symbol */
|
||||
p_b_symbol(p)
|
||||
struct parse *p;
|
||||
{
|
||||
char value;
|
||||
pg_wchar value;
|
||||
|
||||
REQUIRE(MORE(), REG_EBRACK);
|
||||
if (!EATTWO('[', '.'))
|
||||
@ -944,7 +999,7 @@ p_b_coll_elem(p, endc)
|
||||
struct parse *p;
|
||||
int endc; /* name ended by endc,']' */
|
||||
{
|
||||
char *sp = p->next;
|
||||
pg_wchar *sp = p->next;
|
||||
struct cname *cp;
|
||||
int len;
|
||||
|
||||
@ -957,7 +1012,11 @@ int endc; /* name ended by endc,']' */
|
||||
}
|
||||
len = p->next - sp;
|
||||
for (cp = cnames; cp->name != NULL; cp++)
|
||||
#ifdef MB
|
||||
if (pg_char_and_wchar_strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
|
||||
#else
|
||||
if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
|
||||
#endif
|
||||
return (cp->code); /* known name */
|
||||
if (len == 1)
|
||||
return (*sp); /* single character */
|
||||
@ -973,10 +1032,10 @@ static char /* if no counterpart, return ch */
|
||||
othercase(ch)
|
||||
int ch;
|
||||
{
|
||||
assert(isalpha(ch));
|
||||
if (isupper(ch))
|
||||
assert(pg_isalpha(ch));
|
||||
if (pg_isupper(ch))
|
||||
return (tolower(ch));
|
||||
else if (islower(ch))
|
||||
else if (pg_islower(ch))
|
||||
return (toupper(ch));
|
||||
else
|
||||
/* peculiar, but could happen */
|
||||
@ -994,9 +1053,9 @@ bothcases(p, ch)
|
||||
struct parse *p;
|
||||
int ch;
|
||||
{
|
||||
char *oldnext = p->next;
|
||||
char *oldend = p->end;
|
||||
char bracket[3];
|
||||
pg_wchar *oldnext = p->next;
|
||||
pg_wchar *oldend = p->end;
|
||||
pg_wchar bracket[3];
|
||||
|
||||
assert(othercase(ch) != ch);/* p_bracket() would recurse */
|
||||
p->next = bracket;
|
||||
@ -1021,12 +1080,16 @@ int ch;
|
||||
{
|
||||
cat_t *cap = p->g->categories;
|
||||
|
||||
if ((p->g->cflags & REG_ICASE) && isalpha(ch) && othercase(ch) != ch)
|
||||
if ((p->g->cflags & REG_ICASE) && pg_isalpha(ch) && othercase(ch) != ch)
|
||||
bothcases(p, ch);
|
||||
else
|
||||
{
|
||||
#ifdef MB
|
||||
EMIT(OCHAR, (pg_wchar) ch);
|
||||
#else
|
||||
EMIT(OCHAR, (unsigned char) ch);
|
||||
if (cap[ch] == 0)
|
||||
#endif
|
||||
if (ch >= CHAR_MIN && ch <= CHAR_MAX && cap[ch] == 0)
|
||||
cap[ch] = p->g->ncategories++;
|
||||
}
|
||||
}
|
||||
@ -1041,9 +1104,9 @@ static void
|
||||
nonnewline(p)
|
||||
struct parse *p;
|
||||
{
|
||||
char *oldnext = p->next;
|
||||
char *oldend = p->end;
|
||||
char bracket[4];
|
||||
pg_wchar *oldnext = p->next;
|
||||
pg_wchar *oldend = p->end;
|
||||
pg_wchar bracket[4];
|
||||
|
||||
p->next = bracket;
|
||||
p->end = bracket + 3;
|
||||
@ -1674,7 +1737,7 @@ struct re_guts *g;
|
||||
sop *newstart = 0;
|
||||
sopno newlen;
|
||||
sop s;
|
||||
char *cp;
|
||||
pg_wchar *cp;
|
||||
sopno i;
|
||||
|
||||
/* avoid making error situations worse */
|
||||
@ -1729,7 +1792,11 @@ struct re_guts *g;
|
||||
return;
|
||||
|
||||
/* turn it into a character string */
|
||||
#ifdef MB
|
||||
g->must = (pg_wchar *)malloc((size_t) (g->mlen + 1)*sizeof(pg_wchar));
|
||||
#else
|
||||
g->must = malloc((size_t) g->mlen + 1);
|
||||
#endif
|
||||
if (g->must == NULL)
|
||||
{ /* argh; just forget it */
|
||||
g->mlen = 0;
|
||||
@ -1742,7 +1809,7 @@ struct re_guts *g;
|
||||
while (OP(s = *scan++) != OCHAR)
|
||||
continue;
|
||||
assert(cp < g->must + g->mlen);
|
||||
*cp++ = (char) OPND(s);
|
||||
*cp++ = (pg_wchar) OPND(s);
|
||||
}
|
||||
assert(cp == g->must + g->mlen);
|
||||
*cp++ = '\0'; /* just on general principles */
|
||||
@ -1785,3 +1852,42 @@ struct re_guts *g;
|
||||
g->iflags |= BAD;
|
||||
return (maxnest);
|
||||
}
|
||||
|
||||
/*
|
||||
* some ctype functions with a non-ASCII character guard
|
||||
*/
|
||||
/*
 * pg_isdigit - isdigit() wrapper that tolerates wide character values.
 *
 * With MB defined, the result is 1 only when c lies in [0, UCHAR_MAX]
 * and isdigit(c) is true; any value outside that range yields 0 and is
 * never handed to isdigit().  Without MB, this is a direct call to
 * isdigit(c) and returns whatever isdigit() returns.
 *
 * NOTE(review): the non-MB branch applies no range check, so it relies
 * on callers passing a value isdigit() accepts -- confirm at call sites.
 */
static int pg_isdigit(int c)
|
||||
{
|
||||
#ifdef MB
|
||||
return(c >= 0 && c <= UCHAR_MAX && isdigit(c));
|
||||
#else
|
||||
return(isdigit(c));
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
 * pg_isalpha - isalpha() wrapper that tolerates wide character values.
 *
 * With MB defined, the result is 1 only when c lies in [0, UCHAR_MAX]
 * and isalpha(c) is true; any value outside that range yields 0 and is
 * never handed to isalpha().  Without MB, this is a direct call to
 * isalpha(c) and returns whatever isalpha() returns.
 *
 * NOTE(review): the non-MB branch applies no range check, so it relies
 * on callers passing a value isalpha() accepts -- confirm at call sites.
 */
static int pg_isalpha(int c)
|
||||
{
|
||||
#ifdef MB
|
||||
return(c >= 0 && c <= UCHAR_MAX && isalpha(c));
|
||||
#else
|
||||
return(isalpha(c));
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
 * pg_isupper - isupper() wrapper that tolerates wide character values.
 *
 * With MB defined, the result is 1 only when c lies in [0, UCHAR_MAX]
 * and isupper(c) is true; any value outside that range yields 0 and is
 * never handed to isupper().  Without MB, this is a direct call to
 * isupper(c) and returns whatever isupper() returns.
 *
 * NOTE(review): the non-MB branch applies no range check, so it relies
 * on callers passing a value isupper() accepts -- confirm at call sites.
 */
static int pg_isupper(int c)
|
||||
{
|
||||
#ifdef MB
|
||||
return(c >= 0 && c <= UCHAR_MAX && isupper(c));
|
||||
#else
|
||||
return(isupper(c));
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
 * pg_islower - islower() wrapper that tolerates wide character values.
 *
 * With MB defined, the result is 1 only when c lies in [0, UCHAR_MAX]
 * and islower(c) is true; any value outside that range yields 0 and is
 * never handed to islower().  Without MB, this is a direct call to
 * islower(c) and returns whatever islower() returns.
 *
 * NOTE(review): the non-MB branch applies no range check, so it relies
 * on callers passing a value islower() accepts -- confirm at call sites.
 */
static int pg_islower(int c)
|
||||
{
|
||||
#ifdef MB
|
||||
return(c >= 0 && c <= UCHAR_MAX && islower(c));
|
||||
#else
|
||||
return(islower(c));
|
||||
#endif
|
||||
}
|
||||
|
Reference in New Issue
Block a user