1
0
mirror of https://github.com/MariaDB/server.git synced 2026-01-06 05:22:24 +03:00

Adding a shared include file ctype-mb.ic and removing a number

of very similar copies of my_well_formed_len_xxx(), implemented
for big5, cp932, euckr, eucjpms, gb2312, gbk, sjis, ujis.
This commit is contained in:
Alexander Barkov
2015-03-04 09:16:43 +04:00
parent d8c1165c28
commit a7ed8523e3
9 changed files with 184 additions and 353 deletions

View File

@@ -34,6 +34,7 @@
/*
Support for Chinese(BIG5) characters, by jou@nematic.ieo.nctu.edu.tw
CP950 and HKSCS additional characters are also accepted.
modified by Wei He (hewei@mail.ied.ac.cn)
modified by Alex Barkov <bar@udm.net>
*/
@@ -47,6 +48,12 @@
/* Bits 8..15 of the code value "e", as uchar */
#define big5head(e) ((uchar)(e>>8))
/* Bits 0..7 of the code value "e", as uchar */
#define big5tail(e) ((uchar)(e&0xff))
/*
  Parameters for the shared template ctype-mb.ic:
  - MY_FUNCTION_NAME(x) pastes the name my_<x>_big5
  - IS_MB2_CHAR(x,y) accepts a byte pair when isbig5head(x) and
    isbig5tail(y) both hold
  - WELL_FORMED_LEN enables the well_formed_len function in the template
*/
#define MY_FUNCTION_NAME(x) my_ ## x ## _big5
#define IS_MB2_CHAR(x,y) (isbig5head(x) && isbig5tail(y))
#define WELL_FORMED_LEN
#include "ctype-mb.ic"
static const uchar ctype_big5[257] =
{
0, /* For standard library */
@@ -6843,42 +6850,6 @@ my_mb_wc_big5(CHARSET_INFO *cs __attribute__((unused)),
}
/*
  Returns a well formed length of a BIG5 string.
  CP950 and HKSCS additional characters are also accepted.

  Scans at most "pos" characters of the string [b, e). A character is
  either one byte below 128, or two bytes accepted by isbig5code().

  cs     - character set handler, not used
  b      - start of the string
  e      - end of the string (one past the last byte)
  pos    - maximum number of characters to scan
  error  - set to 0 on entry, and to 1 if a wrong byte sequence
           (including a two-byte character cut by "e") stopped the scan

  Returns the number of bytes occupied by the scanned well formed prefix.
*/
static
size_t my_well_formed_len_big5(CHARSET_INFO *cs __attribute__((unused)),
                               const char *b, const char *e,
                               size_t pos, int *error)
{
  const char *b0= b;

  *error= 0;
  while (pos-- && b < e)
  {
    if ((uchar) b[0] < 128)
    {
      /* Single byte ascii character */
      b++;
    }
    else if ((e - b) > 1 && isbig5code((uchar) *b, (uchar) b[1]))
    {
      /*
        Double byte character.
        The remaining length is tested with "e - b" rather than against
        a precomputed "e - 1": for an empty string "e - 1" would point
        before the start of the buffer.
      */
      b+= 2;
    }
    else
    {
      /* Wrong byte sequence */
      *error= 1;
      break;
    }
  }
  return (size_t) (b - b0);
}
static MY_COLLATION_HANDLER my_collation_big5_chinese_ci_handler =
{
NULL, /* init */

View File

@@ -176,10 +176,18 @@ static const uchar sort_order_cp932[]=
(uchar) '\370',(uchar) '\371',(uchar) '\372',(uchar) '\373',(uchar) '\374',(uchar) '\375',(uchar) '\376',(uchar) '\377'
};
/*
  Lead byte (0x81..0x9f or 0xe0..0xfc) and trail byte (0x40..0x7e or
  0x80..0xfc) tests, without a cast of the argument.
  NOTE(review): the same two names are defined again right below with
  (uchar) casts; in this diff view the uncast forms are presumably the
  pre-change definitions -- confirm against the original diff.
*/
#define iscp932head(c) ((0x81<=(c) && (c)<=0x9f) || \
((0xe0<=(c)) && (c)<=0xfc))
#define iscp932tail(c) ((0x40<=(c) && (c)<=0x7e) || \
(0x80<=(c) && (c)<=0xfc))
/*
  Same byte ranges, but the argument is cast to uchar first, so the
  tests also work when a plain (possibly signed) char is passed.
*/
#define iscp932head(c) ((0x81 <= (uchar) (c) && (uchar) (c) <= 0x9f) || \
(0xe0 <= (uchar) (c) && (uchar) (c) <= 0xfc))
#define iscp932tail(c) ((0x40 <= (uchar) (c) && (uchar) (c) <= 0x7e) || \
(0x80 <= (uchar) (c) && (uchar) (c) <= 0xfc))
/* Single byte in 0xA1..0xDF (half width kana) */
#define iscp932kata(c) (0xA1 <= (uchar) (c) && (uchar) (c) <= 0xDF)
/*
  Parameters for the shared template ctype-mb.ic:
  - MY_FUNCTION_NAME(x) pastes the name my_<x>_cp932
  - IS_8BIT_CHAR(x) accepts a single non-ASCII byte (half width kana)
  - IS_MB2_CHAR(x,y) accepts a lead byte followed by a trail byte
  - WELL_FORMED_LEN enables the well_formed_len function in the template
*/
#define MY_FUNCTION_NAME(x) my_ ## x ## _cp932
#define IS_8BIT_CHAR(x) iscp932kata(x)
#define IS_MB2_CHAR(x,y) (iscp932head(x) && iscp932tail(y))
#define WELL_FORMED_LEN
#include "ctype-mb.ic"
static uint ismbchar_cp932(CHARSET_INFO *cs __attribute__((unused)),
@@ -34711,50 +34719,6 @@ size_t my_numcells_cp932(CHARSET_INFO *cs __attribute__((unused)),
return clen;
}
/*
  Returns a well formed length of a cp932 string.
  cp932 additional characters are also accepted.

  At most "pos" characters of [b, e) are scanned. Accepted characters:
  - one byte below 0x80
  - a lead byte followed by a trail byte (two bytes)
  - one byte in 0xA1..0xDF (half width kana)

  "error" is set to 0 on entry and to 1 if the scan was stopped
  by any other byte sequence.
  Returns the number of bytes in the scanned well formed prefix.
*/
static
size_t my_well_formed_len_cp932(CHARSET_INFO *cs __attribute__((unused)),
                                const char *b, const char *e,
                                size_t pos, int *error)
{
  const char *start= b;

  *error= 0;
  for ( ; pos-- && b < e ; )
  {
    const uchar lead= (uchar) *b;

    if (lead < 0x80)
    {
      b++;                              /* Single byte ascii character */
      continue;
    }
    if (iscp932head(lead) && (e - b) > 1 && iscp932tail((uchar) b[1]))
    {
      b+= 2;                            /* Double byte character */
      continue;
    }
    if (lead >= 0xA1 && lead <= 0xDF)
    {
      b++;                              /* Half width kana */
      continue;
    }
    *error= 1;                          /* Wrong byte sequence */
    break;
  }
  return (size_t) (b - start);
}
static MY_COLLATION_HANDLER my_collation_ci_handler =
{

View File

@@ -202,6 +202,12 @@ static const uchar sort_order_euc_kr[]=
iseuc_kr_tail3(c))
/*
  Parameters for the shared template ctype-mb.ic:
  - MY_FUNCTION_NAME(x) pastes the name my_<x>_euckr
  - IS_MB2_CHAR(x,y) accepts a byte pair when iseuc_kr_head(x) and
    iseuc_kr_tail(y) both hold
  - WELL_FORMED_LEN enables the well_formed_len function in the template
*/
#define MY_FUNCTION_NAME(x) my_ ## x ## _euckr
#define IS_MB2_CHAR(x,y) (iseuc_kr_head(x) && iseuc_kr_tail(y))
#define WELL_FORMED_LEN
#include "ctype-mb.ic"
static uint ismbchar_euc_kr(CHARSET_INFO *cs __attribute__((unused)),
const char* p, const char *e)
{
@@ -9929,41 +9935,6 @@ my_mb_wc_euc_kr(CHARSET_INFO *cs __attribute__((unused)),
}
/*
  Returns well formed length of a EUC-KR string.

  Scans at most "pos" characters of the string [b, e). A character is
  either one byte below 128, or a byte accepted by iseuc_kr_head()
  followed by a byte accepted by iseuc_kr_tail().

  cs     - character set handler, not used
  b      - start of the string
  e      - end of the string (one past the last byte)
  pos    - maximum number of characters to scan
  error  - set to 0 on entry, and to 1 if a wrong byte sequence
           (including a two-byte character cut by "e") stopped the scan

  Returns the number of bytes occupied by the scanned well formed prefix.
*/
static size_t
my_well_formed_len_euckr(CHARSET_INFO *cs __attribute__((unused)),
                         const char *b, const char *e,
                         size_t pos, int *error)
{
  const char *b0= b;

  *error= 0;
  while (pos-- && b < e)
  {
    if ((uchar) b[0] < 128)
    {
      /* Single byte ascii character */
      b++;
    }
    else if ((e - b) > 1 && iseuc_kr_head(*b) && iseuc_kr_tail(b[1]))
    {
      /*
        Double byte character.
        The remaining length is tested with "e - b" rather than against
        a precomputed "e - 1": for an empty string "e - 1" would point
        before the start of the buffer.
      */
      b+= 2;
    }
    else
    {
      /* Wrong byte sequence */
      *error= 1;
      break;
    }
  }
  return (size_t) (b - b0);
}
static MY_COLLATION_HANDLER my_collation_ci_handler =
{
NULL, /* init */

View File

@@ -180,10 +180,26 @@ static const uchar sort_order_eucjpms[]=
};
/*
  Byte class tests that mask the argument with 0xff.
  NOTE(review): the same four names are defined again below with (uchar)
  casts; in this diff view the masked forms are presumably the
  pre-change definitions -- confirm against the original diff.
*/
#define iseucjpms(c) ((0xa1<=((c)&0xff) && ((c)&0xff)<=0xfe))
#define iskata(c) ((0xa1<=((c)&0xff) && ((c)&0xff)<=0xdf))
#define iseucjpms_ss2(c) (((c)&0xff) == 0x8e)
#define iseucjpms_ss3(c) (((c)&0xff) == 0x8f)
/*
EUCJPMS encoding subcomponents:
[x00-x7F] # ASCII/JIS-Roman (one-byte/character)
[x8E][xA1-xDF] # half-width katakana (two bytes/char)
[x8F][xA1-xFE][xA1-xFE] # JIS X 0212-1990 (three bytes/char)
[xA1-xFE][xA1-xFE] # JIS X 0208:1997 (two bytes/char)
*/
/* Byte in 0xA1..0xFE: any byte of a JIS X 0208 pair, or a trailing JIS X 0212 byte */
#define iseucjpms(c) (0xa1 <= (uchar) (c) && (uchar) (c) <= 0xfe)
/* Byte in 0xA1..0xDF: second byte of a half-width katakana character */
#define iskata(c) (0xa1 <= (uchar) (c) && (uchar) (c) <= 0xdf)
/* Prefix byte 0x8E (introduces half-width katakana) */
#define iseucjpms_ss2(c) ((uchar) (c) == 0x8e)
/* Prefix byte 0x8F (introduces a three-byte JIS X 0212 character) */
#define iseucjpms_ss3(c) ((uchar) (c) == 0x8f)
/*
  Parameters for the shared template ctype-mb.ic:
  - MY_FUNCTION_NAME(x) pastes the name my_<x>_eucjpms
  - IS_MB2_CHAR(x,y) accepts either kind of two-byte character
    (IS_MB2_KATA or IS_MB2_JIS)
  - IS_MB3_CHAR(x,y,z) accepts 0x8F followed by a JIS byte pair
  - WELL_FORMED_LEN enables the well_formed_len function in the template
*/
#define MY_FUNCTION_NAME(x) my_ ## x ## _eucjpms
#define IS_MB2_JIS(x,y) (iseucjpms(x) && iseucjpms(y))
#define IS_MB2_KATA(x,y) (iseucjpms_ss2(x) && iskata(y))
#define IS_MB2_CHAR(x,y) (IS_MB2_KATA(x,y) || IS_MB2_JIS(x,y))
#define IS_MB3_CHAR(x,y,z) (iseucjpms_ss3(x) && IS_MB2_JIS(y,z))
#define WELL_FORMED_LEN
#include "ctype-mb.ic"
static uint ismbchar_eucjpms(CHARSET_INFO *cs __attribute__((unused)),
@@ -67416,61 +67432,6 @@ my_wc_mb_eucjpms(CHARSET_INFO *cs __attribute__((unused)),
}
/*
  EUCJPMS encoding subcomponents:
  [x00-x7F]               # ASCII/JIS-Roman (one-byte/character)
  [x8E][xA1-xDF]          # half-width katakana (two bytes/char)
  [x8F][xA1-xFE][xA1-xFE] # JIS X 0212-1990 (three bytes/char)
  [xA1-xFE][xA1-xFE]      # JIS X 0208:1997 (two bytes/char)
*/

/*
  Returns well formed length of an EUCJPMS string.

  Scans at most "pos" characters of the string [beg, end).

  cs     - character set handler, not used
  beg    - start of the string
  end    - end of the string (one past the last byte)
  pos    - maximum number of characters to scan
  error  - set to 0 on entry, and to 1 if the scan was stopped by an
           invalid sequence or by a character cut by "end"

  Returns the offset in bytes of the first character that was not
  accepted, or the number of bytes scanned if all characters were
  accepted.

  Differences from the previous implementation:
  - offsets are returned as size_t instead of being cast through uint,
    which could truncate them where size_t is wider than uint
  - a multi-byte lead byte that is the last byte of the string now sets
    *error to 1, like every other incomplete or invalid sequence
*/
static
size_t my_well_formed_len_eucjpms(CHARSET_INFO *cs __attribute__((unused)),
                                  const char *beg, const char *end, size_t pos,
                                  int *error)
{
  const uchar *b= (const uchar *) beg;
  const uchar *e= (const uchar *) end;
  size_t len;

  for (*error= 0 ; pos && b < e ; pos--, b+= len)
  {
    const size_t avail= (size_t) (e - b);
    const uchar lead= b[0];

    if (lead <= 0x7F)                     /* one byte */
      len= 1;
    else if (avail < 2)                   /* need more bytes */
      len= 0;
    else if (iseucjpms_ss2(lead))         /* [x8E][xA1-xDF] */
      len= iskata(b[1]) ? 2 : 0;
    else if (iseucjpms_ss3(lead))         /* [x8F][xA1-xFE][xA1-xFE] */
      len= (avail >= 3 && iseucjpms(b[1]) && iseucjpms(b[2])) ? 3 : 0;
    else                                  /* [xA1-xFE][xA1-xFE] */
      len= (iseucjpms(lead) && iseucjpms(b[1])) ? 2 : 0;

    if (!len)
    {
      /* Invalid sequence or unexpected end of the string */
      *error= 1;
      break;
    }
  }
  return (size_t) (b - (const uchar *) beg);
}
static
size_t my_numcells_eucjpms(CHARSET_INFO *cs __attribute__((unused)),
const char *str, const char *str_end)

View File

@@ -165,6 +165,12 @@ static const uchar sort_order_gb2312[]=
/* Trail byte test: 0xA1..0xFE, argument is cast to uchar */
#define isgb2312tail(c) (0xa1<=(uchar)(c) && (uchar)(c)<=0xfe)
/*
  Parameters for the shared template ctype-mb.ic:
  - MY_FUNCTION_NAME(x) pastes the name my_<x>_gb2312
  - IS_MB2_CHAR(x,y) accepts a byte pair when isgb2312head(x) and
    isgb2312tail(y) both hold
  - WELL_FORMED_LEN enables the well_formed_len function in the template
*/
#define MY_FUNCTION_NAME(x) my_ ## x ## _gb2312
#define IS_MB2_CHAR(x,y) (isgb2312head(x) && isgb2312tail(y))
#define WELL_FORMED_LEN
#include "ctype-mb.ic"
static uint ismbchar_gb2312(CHARSET_INFO *cs __attribute__((unused)),
const char* p, const char *e)
{
@@ -6332,41 +6338,6 @@ my_mb_wc_gb2312(CHARSET_INFO *cs __attribute__((unused)),
}
/*
  Returns well formed length of a GB2312 string.

  Scans at most "pos" characters of the string [b, e). A character is
  either one byte below 128, or a byte accepted by isgb2312head()
  followed by a byte accepted by isgb2312tail().

  cs     - character set handler, not used
  b      - start of the string
  e      - end of the string (one past the last byte)
  pos    - maximum number of characters to scan
  error  - set to 0 on entry, and to 1 if a wrong byte sequence
           (including a two-byte character cut by "e") stopped the scan

  Returns the number of bytes occupied by the scanned well formed prefix.
*/
static size_t
my_well_formed_len_gb2312(CHARSET_INFO *cs __attribute__((unused)),
                          const char *b, const char *e,
                          size_t pos, int *error)
{
  const char *b0= b;

  *error= 0;
  while (pos-- && b < e)
  {
    if ((uchar) b[0] < 128)
    {
      /* Single byte ascii character */
      b++;
    }
    else if ((e - b) > 1 && isgb2312head(*b) && isgb2312tail(b[1]))
    {
      /*
        Double byte character.
        The remaining length is tested with "e - b" rather than against
        a precomputed "e - 1": for an empty string "e - 1" would point
        before the start of the buffer.
      */
      b+= 2;
    }
    else
    {
      /* Wrong byte sequence */
      *error= 1;
      break;
    }
  }
  return (size_t) (b - b0);
}
static MY_COLLATION_HANDLER my_collation_ci_handler =
{
NULL, /* init */

View File

@@ -43,6 +43,12 @@
/* Bits 8..15 of the code value "e", as uchar */
#define gbkhead(e) ((uchar)(e>>8))
/* Bits 0..7 of the code value "e", as uchar */
#define gbktail(e) ((uchar)(e&0xff))
/*
  Parameters for the shared template ctype-mb.ic:
  - MY_FUNCTION_NAME(x) pastes the name my_<x>_gbk
  - IS_MB2_CHAR(x,y) accepts a byte pair when isgbkhead(x) and
    isgbktail(y) both hold
  - WELL_FORMED_LEN enables the well_formed_len function in the template
*/
#define MY_FUNCTION_NAME(x) my_ ## x ## _gbk
#define IS_MB2_CHAR(x,y) (isgbkhead(x) && isgbktail(y))
#define WELL_FORMED_LEN
#include "ctype-mb.ic"
static const uchar ctype_gbk[257] =
{
0, /* For standard library */
@@ -10726,43 +10732,6 @@ my_mb_wc_gbk(CHARSET_INFO *cs __attribute__((unused)),
}
/*
  Returns well formed length of a GBK string.

  Scans at most "pos" characters of the string [b, e). A character is
  either one byte below 128, or two bytes accepted by isgbkcode().

  cs     - character set handler, not used
  b      - start of the string
  e      - end of the string (one past the last byte)
  pos    - maximum number of characters to scan
  error  - set to 0 on entry, and to 1 if a wrong byte sequence
           (including a two-byte character cut by "e") stopped the scan

  Returns the number of bytes occupied by the scanned well formed prefix.
*/
static
size_t my_well_formed_len_gbk(CHARSET_INFO *cs __attribute__((unused)),
                              const char *b, const char *e,
                              size_t pos, int *error)
{
  const char *b0= b;

  *error= 0;
  while (pos-- && b < e)
  {
    if ((uchar) b[0] < 128)
    {
      /* Single byte ascii character */
      b++;
    }
    else if ((e - b) > 1 && isgbkcode((uchar) *b, (uchar) b[1]))
    {
      /*
        Double byte character.
        The remaining length is tested with "e - b" rather than against
        a precomputed "e - 1": for an empty string "e - 1" would point
        before the start of the buffer.
      */
      b+= 2;
    }
    else
    {
      /* Wrong byte sequence */
      *error= 1;
      break;
    }
  }
  return (size_t) (b - b0);
}
static MY_COLLATION_HANDLER my_collation_ci_handler =
{
NULL, /* init */

94
strings/ctype-mb.ic Normal file
View File

@@ -0,0 +1,94 @@
/*
Copyright (c) 2015, MariaDB Foundation
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/*
  Sanity checks on the macros the including file must define
  before including this file.
*/
/* MY_FUNCTION_NAME(x) is used to build the names of the generated functions */
#ifndef MY_FUNCTION_NAME
#error MY_FUNCTION_NAME is not defined
#endif
/* A three-byte character test is only accepted together with a two-byte test */
#if defined(IS_MB3_CHAR) && !defined(IS_MB2_CHAR)
#error IS_MB3_CHAR is defined, while IS_MB2_CHAR is not!
#endif
/* A four-byte character test is only accepted together with a three-byte test */
#if defined(IS_MB4_CHAR) && !defined(IS_MB3_CHAR)
#error IS_MB4_CHAR is defined, while IS_MB3_CHAR is not!
#endif
#ifdef WELL_FORMED_LEN
/**
  Returns well formed length of a character string with
  variable character length for character sets with:
  - mbminlen == 1
  - mbmaxlen == 2, 3, or 4

  The character tests are supplied by the including file:
  IS_MB2_CHAR is always used; IS_MB3_CHAR, IS_MB4_CHAR and IS_8BIT_CHAR
  are used only when defined. Bytes below 128 are always accepted as
  single byte characters.

  @param cs      Character set handler (only used in debug assertions)
  @param b       Start of the string
  @param e       End of the string (one past the last byte)
  @param nchars  Maximum number of characters to scan
  @param error   Set to 0 on entry, and to 1 if the scan was stopped by a
                 wrong byte sequence (including a multi-byte character
                 cut by "e")

  @return Number of bytes occupied by the scanned well formed prefix
*/
static size_t
MY_FUNCTION_NAME(well_formed_len)(CHARSET_INFO *cs __attribute__((unused)),
                                  const char *b, const char *e,
                                  size_t nchars, int *error)
{
  const char *b0= b;
  DBUG_ASSERT(cs->mbminlen == 1);
  DBUG_ASSERT(cs->mbmaxlen <= 4);

  /*
    The number of remaining bytes is tested as "e - b >= N" rather than
    "b + N <= e": near the end of the string "b + N" would point more
    than one byte past the end of the buffer, which pointer arithmetic
    does not allow.
  */
  for (*error= 0 ; b < e && nchars-- ; )
  {
    if ((uchar) b[0] < 128)
    {
      b++;                      /* Single byte ASCII character */
      continue;
    }

    if (e - b >= 2 && IS_MB2_CHAR(b[0], b[1]))
    {
      b+= 2;                    /* Double byte character */
      continue;
    }

#ifdef IS_MB3_CHAR
    if (e - b >= 3 && IS_MB3_CHAR(b[0], b[1], b[2]))
    {
      b+= 3;                    /* Three-byte character */
      continue;
    }
#endif

#ifdef IS_MB4_CHAR
    if (e - b >= 4 && IS_MB4_CHAR(b[0], b[1], b[2], b[3]))
    {
      b+= 4;                    /* Four-byte character */
      continue;
    }
#endif

#ifdef IS_8BIT_CHAR
    if (IS_8BIT_CHAR(b[0]))
    {
      b++;                      /* Single byte non-ASCII character, e.g. half width kana in sjis */
      continue;
    }
#endif

    /* Wrong byte sequence */
    *error= 1;
    break;
  }
  return (size_t) (b - b0);
}
#endif /* WELL_FORMED_LEN */

View File

@@ -176,10 +176,19 @@ static const uchar sort_order_sjis[]=
(uchar) '\370',(uchar) '\371',(uchar) '\372',(uchar) '\373',(uchar) '\374',(uchar) '\375',(uchar) '\376',(uchar) '\377'
};
/*
  Lead byte (0x81..0x9f or 0xe0..0xfc) and trail byte (0x40..0x7e or
  0x80..0xfc) tests, without a cast of the argument.
  NOTE(review): the same two names are defined again right below with
  (uchar) casts; in this diff view the uncast forms are presumably the
  pre-change definitions -- confirm against the original diff.
*/
#define issjishead(c) ((0x81<=(c) && (c)<=0x9f) || \
((0xe0<=(c)) && (c)<=0xfc))
#define issjistail(c) ((0x40<=(c) && (c)<=0x7e) || \
(0x80<=(c) && (c)<=0xfc))
/*
  Same byte ranges, but the argument is cast to uchar first, so the
  tests also work when a plain (possibly signed) char is passed.
*/
#define issjishead(c) ((0x81 <= (uchar) (c) && (uchar) (c) <= 0x9f) || \
(0xe0 <= (uchar) (c) && (uchar) (c) <= 0xfc))
#define issjistail(c) ((0x40 <= (uchar) (c) && (uchar) (c) <= 0x7e) || \
(0x80 <= (uchar) (c) && (uchar) (c) <= 0xfc))
/* Single byte in 0xA1..0xDF (half width kana) */
#define issjiskata(c) ((0xA1 <= (uchar) (c) && (uchar) (c) <= 0xDF))
/*
  Parameters for the shared template ctype-mb.ic:
  - MY_FUNCTION_NAME(x) pastes the name my_<x>_sjis
  - IS_8BIT_CHAR(x) accepts a single non-ASCII byte (half width kana)
  - IS_MB2_CHAR(x,y) accepts a lead byte followed by a trail byte
  - WELL_FORMED_LEN enables the well_formed_len function in the template
*/
#define MY_FUNCTION_NAME(x) my_ ## x ## _sjis
#define IS_8BIT_CHAR(x) issjiskata(x)
#define IS_MB2_CHAR(x,y) (issjishead(x) && issjistail(y))
#define WELL_FORMED_LEN
#include "ctype-mb.ic"
static uint ismbchar_sjis(CHARSET_INFO *cs __attribute__((unused)),
@@ -34089,44 +34098,6 @@ size_t my_numcells_sjis(CHARSET_INFO *cs __attribute__((unused)),
return clen;
}
/*
  Returns a well formed length of a SJIS string.
  CP932 additional characters are also accepted.

  At most "pos" characters of [b, e) are scanned. Accepted characters:
  - one byte below 128
  - a lead byte followed by a trail byte (two bytes)
  - one byte in 0xA1..0xDF (half width kana)

  "error" is set to 0 on entry and to 1 if the scan was stopped
  by any other byte sequence.
  Returns the number of bytes in the scanned well formed prefix.
*/
static
size_t my_well_formed_len_sjis(CHARSET_INFO *cs __attribute__((unused)),
                               const char *b, const char *e,
                               size_t pos, int *error)
{
  const char *origin= b;
  size_t step;

  /* "break" leaves the loop without applying the "b+= step" advance */
  for (*error= 0 ; pos-- && b < e ; b+= step)
  {
    const uchar first= (uchar) b[0];

    if (first < 128)
      step= 1;                          /* Single byte ascii character */
    else if (issjishead(first) && (e - b) > 1 && issjistail((uchar) b[1]))
      step= 2;                          /* Double byte character */
    else if (first >= 0xA1 && first <= 0xDF)
      step= 1;                          /* Half width kana */
    else
    {
      *error= 1;                        /* Wrong byte sequence */
      break;
    }
  }
  return (size_t) (b - origin);
}
static MY_COLLATION_HANDLER my_collation_ci_handler =
{

View File

@@ -179,10 +179,26 @@ static const uchar sort_order_ujis[]=
};
/*
  Byte class tests that mask the argument with 0xff.
  NOTE(review): the same four names are defined again below with (uchar)
  casts; in this diff view the masked forms are presumably the
  pre-change definitions -- confirm against the original diff.
*/
#define isujis(c) ((0xa1<=((c)&0xff) && ((c)&0xff)<=0xfe))
#define iskata(c) ((0xa1<=((c)&0xff) && ((c)&0xff)<=0xdf))
#define isujis_ss2(c) (((c)&0xff) == 0x8e)
#define isujis_ss3(c) (((c)&0xff) == 0x8f)
/*
EUC-JP encoding subcomponents:
[x00-x7F] # ASCII/JIS-Roman (one-byte/character)
[x8E][xA1-xDF] # half-width katakana (two bytes/char)
[x8F][xA1-xFE][xA1-xFE] # JIS X 0212-1990 (three bytes/char)
[xA1-xFE][xA1-xFE] # JIS X 0208:1997 (two bytes/char)
*/
/* Byte in 0xA1..0xFE: any byte of a JIS X 0208 pair, or a trailing JIS X 0212 byte */
#define isujis(c) (0xa1 <= (uchar) (c) && (uchar) (c) <= 0xfe)
/* Byte in 0xA1..0xDF: second byte of a half-width katakana character */
#define iskata(c) (0xa1 <= (uchar) (c) && (uchar) (c) <= 0xdf)
/* Prefix byte 0x8E (introduces half-width katakana) */
#define isujis_ss2(c) ((uchar) (c) == 0x8e)
/* Prefix byte 0x8F (introduces a three-byte JIS X 0212 character) */
#define isujis_ss3(c) ((uchar) (c) == 0x8f)
/*
  Parameters for the shared template ctype-mb.ic:
  - MY_FUNCTION_NAME(x) pastes the name my_<x>_ujis
  - IS_MB2_CHAR(x, y) accepts either kind of two-byte character
    (IS_MB2_KATA or IS_MB2_JIS)
  - IS_MB3_CHAR(x, y, z) accepts 0x8F followed by a JIS byte pair
  - WELL_FORMED_LEN enables the well_formed_len function in the template
*/
#define MY_FUNCTION_NAME(x) my_ ## x ## _ujis
#define IS_MB2_JIS(x,y) (isujis(x) && isujis(y))
#define IS_MB2_KATA(x,y) (isujis_ss2(x) && iskata(y))
#define IS_MB2_CHAR(x, y) (IS_MB2_KATA(x,y) || IS_MB2_JIS(x,y))
#define IS_MB3_CHAR(x, y, z) (isujis_ss3(x) && IS_MB2_JIS(y,z))
#define WELL_FORMED_LEN
#include "ctype-mb.ic"
static uint ismbchar_ujis(CHARSET_INFO *cs __attribute__((unused)),
@@ -201,63 +217,6 @@ static uint mbcharlen_ujis(CHARSET_INFO *cs __attribute__((unused)),uint c)
}
/*
  EUC-JP encoding subcomponents:
  [x00-x7F]               # ASCII/JIS-Roman (one-byte/character)
  [x8E][xA1-xDF]          # half-width katakana (two bytes/char)
  [x8F][xA1-xFE][xA1-xFE] # JIS X 0212-1990 (three bytes/char)
  [xA1-xFE][xA1-xFE]      # JIS X 0208:1997 (two bytes/char)
*/

/*
  Returns well formed length of an EUC-JP string.

  At most "pos" characters of [beg, end) are scanned. "error" is set
  to 0 on entry and to 1 if the scan was stopped by an invalid sequence
  or by a character cut by "end".
  Returns the offset in bytes of the first character that was not
  accepted, or the number of bytes scanned if all were accepted.
*/
static
size_t my_well_formed_len_ujis(CHARSET_INFO *cs __attribute__((unused)),
                               const char *beg, const char *end,
                               size_t pos, int *error)
{
  const uchar *cur= (const uchar *) beg;
  const uchar *lim= (const uchar *) end;

  *error= 0;
  while (pos && cur < lim)
  {
    const size_t avail= (size_t) (lim - cur);
    const uchar lead= cur[0];
    size_t len;

    if (lead <= 0x7F)
      len= 1;                                       /* one byte */
    else if (avail < 2)
      len= 0;                                       /* unexpected EOL */
    else if (isujis_ss2(lead))
      len= iskata(cur[1]) ? 2 : 0;                  /* [x8E][xA1-xDF] */
    else if (isujis_ss3(lead))                      /* [x8F][xA1-xFE][xA1-xFE] */
      len= (avail >= 3 && isujis(cur[1]) && isujis(cur[2])) ? 3 : 0;
    else
      len= (isujis(lead) && isujis(cur[1])) ? 2 : 0; /* [xA1-xFE][xA1-xFE] */

    if (len == 0)
    {
      /* Invalid or incomplete sequence: report where the character starts */
      *error= 1;
      return (size_t) (cur - (const uchar *) beg);
    }
    cur+= len;
    pos--;
  }
  return (size_t) (cur - (const uchar *) beg);
}
static
size_t my_numcells_eucjp(CHARSET_INFO *cs __attribute__((unused)),
const char *str, const char *str_end)