mirror of
https://github.com/postgres/postgres.git
synced 2025-07-02 09:02:37 +03:00
From: t-ishii@sra.co.jp
Included are patches intended to allow PostgreSQL to handle multi-byte character sets such as EUC (Extended Unix Code), Unicode and Mule internal code. With the MB patch you can use multi-byte character sets in regexp and LIKE. The encoding system is chosen at compile time. To enable the MB extension, you need to define a variable "MB" in Makefile.global or in Makefile.custom. For further information, please take a look at README.mb under the doc directory. (Note that, unlike the "jp patch", I no longer use a modified GNU regexp. Instead, I changed Henry Spencer's regexp that comes with PostgreSQL.)
This commit is contained in:
44
src/include/regex/pg_wchar.h
Normal file
44
src/include/regex/pg_wchar.h
Normal file
@ -0,0 +1,44 @@
|
||||
/* $Id: pg_wchar.h,v 1.1 1998/03/15 07:38:47 scrappy Exp $ */
|
||||
|
||||
#ifndef PG_WCHAR_H
|
||||
#define PG_WCHAR_H
|
||||
|
||||
#include <sys/types.h>
|
||||
|
||||
#define EUC_JP 0 /* EUC for Japanese */
|
||||
#define EUC_CN 1 /* EUC for Chinese */
|
||||
#define EUC_KR 2 /* EUC for Korean */
|
||||
#define EUC_TW 3 /* EUC for Taiwan */
|
||||
#define UNICODE 4 /* Unicode UTF-8 */
|
||||
#define MULE_INTERNAL 5 /* Mule internal code */
|
||||
|
||||
#ifdef MB
|
||||
typedef unsigned int pg_wchar;
|
||||
#else
|
||||
#define pg_wchar char
|
||||
#endif
|
||||
|
||||
/*
|
||||
* various definitions for EUC
|
||||
*/
|
||||
#define SS2 0x8e /* single shift 2 */
|
||||
#define SS3 0x8f /* single shift 3 */
|
||||
|
||||
/*
|
||||
* various definitions for mule internal code
|
||||
*/
|
||||
/* True when byte c lies in 0x81..0x8f.  c is evaluated exactly once, so
 * arguments with side effects (e.g. *p++) are safe.  A value below 0x81
 * wraps to a large unsigned number and fails the comparison. */
#define IS_LC1(c)	((unsigned) ((unsigned char) (c) - 0x81) <= 0x8fu - 0x81u)
|
||||
/* True when byte c is 0x9a or 0x9b.  c is evaluated exactly once, so
 * arguments with side effects (e.g. *p++) are safe. */
#define IS_LCPRV1(c)	((unsigned) ((unsigned char) (c) - 0x9a) <= 0x9bu - 0x9au)
|
||||
/* True when byte c lies in 0x90..0x99.  c is evaluated exactly once, so
 * arguments with side effects (e.g. *p++) are safe.  A value below 0x90
 * wraps to a large unsigned number and fails the comparison. */
#define IS_LC2(c)	((unsigned) ((unsigned char) (c) - 0x90) <= 0x99u - 0x90u)
|
||||
/* True when byte c is 0x9c or 0x9d.  c is evaluated exactly once, so
 * arguments with side effects (e.g. *p++) are safe. */
#define IS_LCPRV2(c)	((unsigned) ((unsigned char) (c) - 0x9c) <= 0x9du - 0x9cu)
|
||||
|
||||
#ifdef MB
|
||||
extern void pg_mb2wchar(const unsigned char *, pg_wchar *);
|
||||
extern void pg_mb2wchar_with_len(const unsigned char *, pg_wchar *, int);
|
||||
extern int pg_char_and_wchar_strcmp(const char *, const pg_wchar *);
|
||||
extern int pg_wchar_strncmp(const pg_wchar *, const pg_wchar *, size_t);
|
||||
extern int pg_char_and_wchar_strncmp(const char *, const pg_wchar *, size_t);
|
||||
extern size_t pg_wchar_strlen(const pg_wchar *);
|
||||
#endif
|
||||
|
||||
#endif
|
@ -41,6 +41,7 @@
|
||||
#define _REGEX_H_
|
||||
|
||||
#include <sys/types.h>
|
||||
#include <regex/pg_wchar.h>
|
||||
|
||||
/* types */
|
||||
typedef off_t regoff_t;
|
||||
@ -49,8 +50,12 @@ typedef struct
|
||||
{
|
||||
int re_magic;
|
||||
size_t re_nsub; /* number of parenthesized subexpressions */
|
||||
const char *re_endp; /* end pointer for REG_PEND */
|
||||
const pg_wchar *re_endp; /* end pointer for REG_PEND */
|
||||
struct re_guts *re_g; /* none of your business :-) */
|
||||
#ifdef MB
|
||||
pg_wchar *patsave; /* mee too :-) */
|
||||
#endif
|
||||
|
||||
} regex_t;
|
||||
|
||||
typedef struct
|
||||
|
@ -127,12 +127,29 @@ typedef struct
|
||||
{
|
||||
uch *ptr; /* -> uch [csetsize] */
|
||||
uch mask; /* bit within array */
|
||||
uch hash; /* hash code */
|
||||
#ifdef MB
|
||||
pg_wchar hash; /* hash code */
|
||||
unsigned int lc; /* leading character (character-set) */
|
||||
#else
|
||||
uch hash; /* hash code */
|
||||
#endif
|
||||
size_t smultis;
|
||||
char *multis; /* -> char[smulti] ab\0cd\0ef\0\0 */
|
||||
} cset;
|
||||
|
||||
/* note that CHadd and CHsub are unsafe, and CHIN doesn't yield 0/1 */
|
||||
#ifdef MB
|
||||
#define CHlc(c) (((unsigned)(c)&0xff0000)>>16)
|
||||
#define CHadd(cs, c) ((cs)->ptr[(unsigned)(c)&0xffff] |= (cs)->mask, (cs)->hash += (unsigned)(c)&0xffff,\
|
||||
(cs)->lc = CHlc(c))
|
||||
#define CHsub(cs, c) ((cs)->ptr[(unsigned)(c)&0xffff] &= ~(cs)->mask, (cs)->hash -= (unsigned)(c)&0xffff)
|
||||
#define CHIN(cs, c) ((cs)->ptr[(unsigned)(c)&0xffff] & (cs)->mask && \
|
||||
((cs)->lc == CHlc(c)))
|
||||
#define MCadd(p, cs, cp) mcadd(p, cs, cp) /* regcomp() internal
|
||||
* fns */
|
||||
#define MCsub(p, cs, cp) mcsub(p, cs, cp)
|
||||
#define MCin(p, cs, cp) mcin(p, cs, cp)
|
||||
#else
|
||||
#define CHadd(cs, c) ((cs)->ptr[(uch)(c)] |= (cs)->mask, (cs)->hash += (c))
|
||||
#define CHsub(cs, c) ((cs)->ptr[(uch)(c)] &= ~(cs)->mask, (cs)->hash -= (c))
|
||||
#define CHIN(cs, c) ((cs)->ptr[(uch)(c)] & (cs)->mask)
|
||||
@ -140,6 +157,7 @@ typedef struct
|
||||
* fns */
|
||||
#define MCsub(p, cs, cp) mcsub(p, cs, cp)
|
||||
#define MCin(p, cs, cp) mcin(p, cs, cp)
|
||||
#endif
|
||||
|
||||
/* stuff for character categories */
|
||||
typedef unsigned char cat_t;
|
||||
@ -168,7 +186,7 @@ struct re_guts
|
||||
int neol; /* number of $ used */
|
||||
int ncategories; /* how many character categories */
|
||||
cat_t *categories; /* ->catspace[-CHAR_MIN] */
|
||||
char *must; /* match must contain this string */
|
||||
pg_wchar *must; /* match must contain this string */
|
||||
int mlen; /* length of must */
|
||||
size_t nsub; /* copy of re_nsub */
|
||||
int backrefs; /* does it use back references? */
|
||||
@ -178,5 +196,21 @@ struct re_guts
|
||||
};
|
||||
|
||||
/* misc utilities */
|
||||
#define OUT (CHAR_MAX+1) /* a non-character value */
|
||||
#define ISWORD(c) (isalnum(c) || (c) == '_')
|
||||
/*
 * OUT is a non-character value: one past the largest character code of the
 * encoding selected at compile time through MB.  Without MB it is one past
 * CHAR_MAX.  An MB value that matches none of the known encodings is
 * rejected here with a clear message instead of leaving OUT undefined.
 */
#ifdef MB
# if MB == MULE_INTERNAL
#  define OUT (16777216+1)	/* 16777216 == 2^24 == 3 bytes */
# elif MB == EUC_JP || MB == EUC_CN || MB == EUC_KR || MB == EUC_TW
#  define OUT (USHRT_MAX+1)	/* 2 bytes */
# elif MB == UNICODE
#  define OUT (USHRT_MAX+1)	/* 2 bytes. assuming UCS-2 */
# else
#  error "MB is defined to an unsupported encoding; OUT cannot be determined"
# endif
#else
# define OUT (CHAR_MAX+1)	/* a non-character value */
#endif
|
||||
|
||||
#ifdef MB
|
||||
/*
 * ISWORD(c): nonzero when c is a single-byte alphanumeric character or '_'.
 * The range test runs first so that values above UCHAR_MAX (wide character
 * codes) never reach isalnum(), whose argument must fit in unsigned char.
 * Every use of c is parenthesized so expression arguments such as (x | y)
 * keep their meaning.  Note that c is still evaluated more than once, so do
 * not pass an argument with side effects.
 */
#define ISWORD(c)	(((c) >= 0 && (c) <= UCHAR_MAX) && \
					 (isalnum(c) || (c) == '_'))
|
||||
#else
|
||||
#define ISWORD(c) (isalnum(c) || (c) == '_')
|
||||
#endif
|
||||
|
@ -42,7 +42,12 @@
|
||||
/* utility definitions */
|
||||
#define DUPMAX 100000000 /* xxx is this right? */
|
||||
#define INFINITY (DUPMAX + 1)
|
||||
|
||||
#ifdef MB
|
||||
#define NC (SHRT_MAX - SHRT_MIN + 1)
|
||||
#else
|
||||
#define NC (CHAR_MAX - CHAR_MIN + 1)
|
||||
#endif
|
||||
typedef unsigned char uch;
|
||||
|
||||
/* switch off assertions (if not already off) if no REDEBUG */
|
||||
|
Reference in New Issue
Block a user