mirror of
https://github.com/postgres/postgres.git
synced 2025-04-22 23:02:54 +03:00
Improve support of multibyte encoding:
- tsvector_(in|out) - tsquery_(in|out) - to_tsvector - to_tsquery, plainto_tsquery - 'simple' dictionary
This commit is contained in:
parent
ec0baf949e
commit
cb4ea994c6
@ -14,7 +14,6 @@ void sortstoplist(StopList * s);
|
||||
void freestoplist(StopList * s);
|
||||
void readstoplist(text *in, StopList * s);
|
||||
bool searchstoplist(StopList * s, char *key);
|
||||
char *lowerstr(char *str);
|
||||
|
||||
typedef struct
|
||||
{
|
||||
|
@ -6,6 +6,7 @@
|
||||
|
||||
#include "dict.h"
|
||||
#include "common.h"
|
||||
#include "ts_locale.h"
|
||||
|
||||
typedef struct
|
||||
{
|
||||
|
@ -9,6 +9,7 @@
|
||||
#include "dict.h"
|
||||
#include "common.h"
|
||||
#include "ispell/spell.h"
|
||||
#include "ts_locale.h"
|
||||
|
||||
typedef struct
|
||||
{
|
||||
|
@ -10,6 +10,7 @@
|
||||
#include "snowball/header.h"
|
||||
#include "snowball/english_stem.h"
|
||||
#include "snowball/russian_stem.h"
|
||||
#include "ts_locale.h"
|
||||
|
||||
typedef struct
|
||||
{
|
||||
|
@ -8,6 +8,7 @@
|
||||
|
||||
#include "dict.h"
|
||||
#include "common.h"
|
||||
#include "ts_locale.h"
|
||||
|
||||
#define SYNBUFLEN 4096
|
||||
typedef struct
|
||||
|
@ -12,6 +12,7 @@
|
||||
#include "common.h"
|
||||
#include "snowball/header.h"
|
||||
#include "subinclude.h"
|
||||
#include "ts_locale.h"
|
||||
|
||||
typedef struct {
|
||||
struct SN_env *z;
|
||||
|
@ -12,6 +12,7 @@
|
||||
#include "common.h"
|
||||
|
||||
#include "subinclude.h"
|
||||
#include "ts_locale.h"
|
||||
|
||||
HASINIT typedef struct {
|
||||
HASINIT StopList stoplist;
|
||||
|
@ -6,6 +6,7 @@
|
||||
#include "postgres.h"
|
||||
|
||||
#include "spell.h"
|
||||
#include "ts_locale.h"
|
||||
|
||||
#define MAX_NORM 1024
|
||||
#define MAXNORMLEN 256
|
||||
@ -30,18 +31,6 @@ cmpspellaffix(const void *s1, const void *s2)
|
||||
return (strcmp(((const SPELL *) s1)->p.flag, ((const SPELL *) s2)->p.flag));
|
||||
}
|
||||
|
||||
static void
|
||||
strlower(char *str)
|
||||
{
|
||||
unsigned char *ptr = (unsigned char *) str;
|
||||
|
||||
while (*ptr)
|
||||
{
|
||||
*ptr = tolower(*ptr);
|
||||
ptr++;
|
||||
}
|
||||
}
|
||||
|
||||
static char *
|
||||
strnduplicate(char *s, int len)
|
||||
{
|
||||
@ -175,7 +164,7 @@ NIImportDictionary(IspellDict * Conf, const char *filename)
|
||||
}
|
||||
else
|
||||
flag = "";
|
||||
strlower(str);
|
||||
lowerstr(str);
|
||||
/* Don't load words if first letter is not required */
|
||||
/* This allows optimizing loading at search time */
|
||||
s = str;
|
||||
@ -385,7 +374,7 @@ NIImportAffixes(IspellDict * Conf, const char *filename)
|
||||
*s = 0;
|
||||
if (!*str)
|
||||
continue;
|
||||
strlower(str);
|
||||
lowerstr(str);
|
||||
strcpy(mask, "");
|
||||
strcpy(find, "");
|
||||
strcpy(repl, "");
|
||||
@ -851,7 +840,7 @@ NormalizeSubWord(IspellDict * Conf, char *word, char flag)
|
||||
|
||||
if (wrdlen > MAXNORMLEN)
|
||||
return NULL;
|
||||
strlower(word);
|
||||
lowerstr(word);
|
||||
cur = forms = (char **) palloc(MAX_NORM * sizeof(char *));
|
||||
*cur = NULL;
|
||||
|
||||
|
@ -8,6 +8,7 @@
|
||||
|
||||
#include "dict.h"
|
||||
#include "common.h"
|
||||
#include "ts_locale.h"
|
||||
|
||||
#define CS_WAITKEY 0
|
||||
#define CS_INKEY 1
|
||||
@ -30,11 +31,11 @@ nstrdup(char *ptr, int len)
|
||||
cptr = ptr = res;
|
||||
while (*ptr)
|
||||
{
|
||||
if (*ptr == '\\')
|
||||
if (t_iseq(ptr, '\\'))
|
||||
ptr++;
|
||||
*cptr = *ptr;
|
||||
ptr++;
|
||||
cptr++;
|
||||
COPYCHAR( cptr, ptr );
|
||||
cptr+=pg_mblen(ptr);
|
||||
ptr+=pg_mblen(ptr);
|
||||
}
|
||||
*cptr = '\0';
|
||||
|
||||
@ -52,9 +53,9 @@ parse_cfgdict(text *in, Map ** m)
|
||||
|
||||
while (ptr - VARDATA(in) < VARSIZE(in) - VARHDRSZ)
|
||||
{
|
||||
if (*ptr == ',')
|
||||
if ( t_iseq(ptr, ',') )
|
||||
num++;
|
||||
ptr++;
|
||||
ptr+=pg_mblen(ptr);
|
||||
}
|
||||
|
||||
*m = mptr = (Map *) palloc(sizeof(Map) * (num + 2));
|
||||
@ -64,56 +65,56 @@ parse_cfgdict(text *in, Map ** m)
|
||||
{
|
||||
if (state == CS_WAITKEY)
|
||||
{
|
||||
if (isalpha((unsigned char) *ptr))
|
||||
if (t_isalpha(ptr))
|
||||
{
|
||||
begin = ptr;
|
||||
state = CS_INKEY;
|
||||
}
|
||||
else if (!isspace((unsigned char) *ptr))
|
||||
else if (!t_isspace(ptr))
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_SYNTAX_ERROR),
|
||||
errmsg("syntax error"),
|
||||
errdetail("Syntax error in position %d near \"%c\"",
|
||||
(int) (ptr - VARDATA(in)), *ptr)));
|
||||
errdetail("Syntax error in position %d",
|
||||
(int) (ptr - VARDATA(in)))));
|
||||
}
|
||||
else if (state == CS_INKEY)
|
||||
{
|
||||
if (isspace((unsigned char) *ptr))
|
||||
if (t_isspace(ptr))
|
||||
{
|
||||
mptr->key = nstrdup(begin, ptr - begin);
|
||||
state = CS_WAITEQ;
|
||||
}
|
||||
else if (*ptr == '=')
|
||||
else if (t_iseq(ptr,'='))
|
||||
{
|
||||
mptr->key = nstrdup(begin, ptr - begin);
|
||||
state = CS_WAITVALUE;
|
||||
}
|
||||
else if (!isalpha((unsigned char) *ptr))
|
||||
else if (!t_isalpha(ptr))
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_SYNTAX_ERROR),
|
||||
errmsg("syntax error"),
|
||||
errdetail("Syntax error in position %d near \"%c\"",
|
||||
(int) (ptr - VARDATA(in)), *ptr)));
|
||||
errdetail("Syntax error in position %d",
|
||||
(int) (ptr - VARDATA(in)))));
|
||||
}
|
||||
else if (state == CS_WAITEQ)
|
||||
{
|
||||
if (*ptr == '=')
|
||||
if (t_iseq(ptr, '='))
|
||||
state = CS_WAITVALUE;
|
||||
else if (!isspace((unsigned char) *ptr))
|
||||
else if (!t_isspace(ptr))
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_SYNTAX_ERROR),
|
||||
errmsg("syntax error"),
|
||||
errdetail("Syntax error in position %d near \"%c\"",
|
||||
(int) (ptr - VARDATA(in)), *ptr)));
|
||||
errdetail("Syntax error in position %d",
|
||||
(int) (ptr - VARDATA(in)))));
|
||||
}
|
||||
else if (state == CS_WAITVALUE)
|
||||
{
|
||||
if (*ptr == '"')
|
||||
if (t_iseq(ptr, '"'))
|
||||
{
|
||||
begin = ptr + 1;
|
||||
state = CS_INVALUE;
|
||||
}
|
||||
else if (!isspace((unsigned char) *ptr))
|
||||
else if (!t_isspace(ptr))
|
||||
{
|
||||
begin = ptr;
|
||||
state = CS_IN2VALUE;
|
||||
@ -121,36 +122,36 @@ parse_cfgdict(text *in, Map ** m)
|
||||
}
|
||||
else if (state == CS_INVALUE)
|
||||
{
|
||||
if (*ptr == '"')
|
||||
if (t_iseq(ptr, '"'))
|
||||
{
|
||||
mptr->value = nstrdup(begin, ptr - begin);
|
||||
mptr++;
|
||||
state = CS_WAITDELIM;
|
||||
}
|
||||
else if (*ptr == '\\')
|
||||
else if (t_iseq(ptr, '\\'))
|
||||
state = CS_INESC;
|
||||
}
|
||||
else if (state == CS_IN2VALUE)
|
||||
{
|
||||
if (isspace((unsigned char) *ptr) || *ptr == ',')
|
||||
if (t_isspace(ptr) || t_iseq(ptr, ','))
|
||||
{
|
||||
mptr->value = nstrdup(begin, ptr - begin);
|
||||
mptr++;
|
||||
state = (*ptr == ',') ? CS_WAITKEY : CS_WAITDELIM;
|
||||
state = (t_iseq(ptr, ',')) ? CS_WAITKEY : CS_WAITDELIM;
|
||||
}
|
||||
else if (*ptr == '\\')
|
||||
else if (t_iseq(ptr, '\\'))
|
||||
state = CS_INESC;
|
||||
}
|
||||
else if (state == CS_WAITDELIM)
|
||||
{
|
||||
if (*ptr == ',')
|
||||
if (t_iseq(ptr, ','))
|
||||
state = CS_WAITKEY;
|
||||
else if (!isspace((unsigned char) *ptr))
|
||||
else if (!t_isspace(ptr))
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_SYNTAX_ERROR),
|
||||
errmsg("syntax error"),
|
||||
errdetail("Syntax error in position %d near \"%c\"",
|
||||
(int) (ptr - VARDATA(in)), *ptr)));
|
||||
errdetail("Syntax error in position %d",
|
||||
(int) (ptr - VARDATA(in)))));
|
||||
}
|
||||
else if (state == CS_INESC)
|
||||
state = CS_INVALUE;
|
||||
@ -160,9 +161,9 @@ parse_cfgdict(text *in, Map ** m)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_SYNTAX_ERROR),
|
||||
errmsg("bad parser state"),
|
||||
errdetail("%d at position %d near \"%c\"",
|
||||
state, (int) (ptr - VARDATA(in)), *ptr)));
|
||||
ptr++;
|
||||
errdetail("%d at position %d",
|
||||
state, (int) (ptr - VARDATA(in)))));
|
||||
ptr+=pg_mblen(ptr);
|
||||
}
|
||||
|
||||
if (state == CS_IN2VALUE)
|
||||
|
@ -25,7 +25,7 @@
|
||||
#include "query.h"
|
||||
#include "query_cleanup.h"
|
||||
#include "common.h"
|
||||
|
||||
#include "ts_locale.h"
|
||||
|
||||
PG_FUNCTION_INFO_V1(tsquery_in);
|
||||
Datum tsquery_in(PG_FUNCTION_ARGS);
|
||||
@ -108,24 +108,28 @@ get_weight(char *buf, int2 *weight)
|
||||
{
|
||||
*weight = 0;
|
||||
|
||||
if (*buf != ':')
|
||||
if ( !t_iseq(buf, ':') )
|
||||
return buf;
|
||||
|
||||
buf++;
|
||||
while (*buf)
|
||||
while ( *buf && pg_mblen(buf) == 1 )
|
||||
{
|
||||
switch (tolower(*buf))
|
||||
switch (*buf)
|
||||
{
|
||||
case 'a':
|
||||
case 'A':
|
||||
*weight |= 1 << 3;
|
||||
break;
|
||||
case 'b':
|
||||
case 'B':
|
||||
*weight |= 1 << 2;
|
||||
break;
|
||||
case 'c':
|
||||
case 'C':
|
||||
*weight |= 1 << 1;
|
||||
break;
|
||||
case 'd':
|
||||
case 'D':
|
||||
*weight |= 1;
|
||||
break;
|
||||
default:
|
||||
@ -149,25 +153,25 @@ gettoken_query(QPRS_STATE * state, int4 *val, int4 *lenval, char **strval, int2
|
||||
{
|
||||
case WAITFIRSTOPERAND:
|
||||
case WAITOPERAND:
|
||||
if (*(state->buf) == '!')
|
||||
if ( t_iseq(state->buf, '!') )
|
||||
{
|
||||
(state->buf)++;
|
||||
(state->buf)++; /* can safely ++, t_iseq guarantees that pg_mblen()==1 */
|
||||
*val = (int4) '!';
|
||||
return OPR;
|
||||
}
|
||||
else if (*(state->buf) == '(')
|
||||
else if ( t_iseq(state->buf, '(') )
|
||||
{
|
||||
state->count++;
|
||||
(state->buf)++;
|
||||
return OPEN;
|
||||
}
|
||||
else if (*(state->buf) == ':')
|
||||
else if ( t_iseq(state->buf, ':') )
|
||||
{
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_SYNTAX_ERROR),
|
||||
errmsg("error at start of operand")));
|
||||
}
|
||||
else if (*(state->buf) != ' ')
|
||||
else if ( !t_isspace(state->buf) )
|
||||
{
|
||||
state->valstate.prsbuf = state->buf;
|
||||
if (gettoken_tsvector(&(state->valstate)))
|
||||
@ -187,14 +191,14 @@ gettoken_query(QPRS_STATE * state, int4 *val, int4 *lenval, char **strval, int2
|
||||
}
|
||||
break;
|
||||
case WAITOPERATOR:
|
||||
if (*(state->buf) == '&' || *(state->buf) == '|')
|
||||
if ( t_iseq(state->buf, '&') || t_iseq(state->buf, '|') )
|
||||
{
|
||||
state->state = WAITOPERAND;
|
||||
*val = (int4) *(state->buf);
|
||||
(state->buf)++;
|
||||
return OPR;
|
||||
}
|
||||
else if (*(state->buf) == ')')
|
||||
else if ( t_iseq(state->buf, ')') )
|
||||
{
|
||||
(state->buf)++;
|
||||
state->count--;
|
||||
@ -202,7 +206,7 @@ gettoken_query(QPRS_STATE * state, int4 *val, int4 *lenval, char **strval, int2
|
||||
}
|
||||
else if (*(state->buf) == '\0')
|
||||
return (state->count) ? ERR : END;
|
||||
else if (*(state->buf) != ' ')
|
||||
else if ( !t_isspace(state->buf) )
|
||||
return ERR;
|
||||
break;
|
||||
case WAITSINGLEOPERAND:
|
||||
@ -217,7 +221,7 @@ gettoken_query(QPRS_STATE * state, int4 *val, int4 *lenval, char **strval, int2
|
||||
return ERR;
|
||||
break;
|
||||
}
|
||||
(state->buf)++;
|
||||
state->buf+=pg_mblen(state->buf);
|
||||
}
|
||||
return END;
|
||||
}
|
||||
@ -697,8 +701,11 @@ static QUERYTYPE *
|
||||
Datum
|
||||
tsquery_in(PG_FUNCTION_ARGS)
|
||||
{
|
||||
char * in = (char*)PG_GETARG_POINTER(0);
|
||||
pg_verifymbstr( in, strlen(in), false);
|
||||
|
||||
SET_FUNCOID();
|
||||
PG_RETURN_POINTER(queryin((char *) PG_GETARG_POINTER(0), pushval_asis, 0, false));
|
||||
PG_RETURN_POINTER(queryin((char *) in, pushval_asis, 0, false));
|
||||
}
|
||||
|
||||
/*
|
||||
@ -732,20 +739,23 @@ infix(INFIX * in, bool first)
|
||||
if (in->curpol->type == VAL)
|
||||
{
|
||||
char *op = in->op + in->curpol->distance;
|
||||
int clen;
|
||||
|
||||
RESIZEBUF(in, in->curpol->length * 2 + 2 + 5);
|
||||
RESIZEBUF(in, in->curpol->length * (pg_database_encoding_max_length()+1) + 2 + 5);
|
||||
*(in->cur) = '\'';
|
||||
in->cur++;
|
||||
while (*op)
|
||||
{
|
||||
if (*op == '\'')
|
||||
if ( t_iseq(op, '\'') )
|
||||
{
|
||||
*(in->cur) = '\\';
|
||||
in->cur++;
|
||||
}
|
||||
*(in->cur) = *op;
|
||||
op++;
|
||||
in->cur++;
|
||||
COPYCHAR(in->cur,op);
|
||||
|
||||
clen = pg_mblen(op);
|
||||
op+=clen;
|
||||
in->cur+=clen;
|
||||
}
|
||||
*(in->cur) = '\'';
|
||||
in->cur++;
|
||||
|
@ -4,7 +4,7 @@
|
||||
#define BS_DEBUG
|
||||
*/
|
||||
|
||||
|
||||
#include "ts_locale.h"
|
||||
/*
|
||||
* item in polish notation with back link
|
||||
* to left operand
|
||||
@ -38,7 +38,7 @@ typedef struct
|
||||
#define GETQUERY(x) (ITEM*)( (char*)(x)+HDRSIZEQT )
|
||||
#define GETOPERAND(x) ( (char*)GETQUERY(x) + ((QUERYTYPE*)(x))->size * sizeof(ITEM) )
|
||||
|
||||
#define ISOPERATOR(x) ( (x)=='!' || (x)=='&' || (x)=='|' || (x)=='(' || (x)==')' )
|
||||
#define ISOPERATOR(x) ( pg_mblen(x)==1 && ( *(x)=='!' || *(x)=='&' || *(x)=='|' || *(x)=='(' || *(x)==')' ) )
|
||||
|
||||
#define END 0
|
||||
#define ERR 1
|
||||
|
@ -10,22 +10,10 @@
|
||||
|
||||
#include "common.h"
|
||||
#include "dict.h"
|
||||
#include "ts_locale.h"
|
||||
|
||||
#define STOPBUFLEN 4096
|
||||
|
||||
char *
|
||||
lowerstr(char *str)
|
||||
{
|
||||
char *ptr = str;
|
||||
|
||||
while (*ptr)
|
||||
{
|
||||
*ptr = tolower(*(unsigned char *) ptr);
|
||||
ptr++;
|
||||
}
|
||||
return str;
|
||||
}
|
||||
|
||||
void
|
||||
freestoplist(StopList * s)
|
||||
{
|
||||
@ -60,10 +48,16 @@ readstoplist(text *in, StopList * s)
|
||||
{
|
||||
char sharepath[MAXPGPATH];
|
||||
char *absfn;
|
||||
#ifdef WIN32
|
||||
char delim = '\\';
|
||||
#else
|
||||
char delim = '/';
|
||||
#endif
|
||||
|
||||
get_share_path(my_exec_path, sharepath);
|
||||
absfn = palloc(strlen(sharepath) + strlen(filename) + 2);
|
||||
sprintf(absfn, "%s/%s", sharepath, filename);
|
||||
sprintf(absfn, "%s%c%s", sharepath, delim, filename);
|
||||
|
||||
pfree(filename);
|
||||
filename = absfn;
|
||||
}
|
||||
|
@ -5,7 +5,9 @@
|
||||
#include "mb/pg_wchar.h"
|
||||
|
||||
|
||||
#if defined(TS_USE_WIDE) && defined(WIN32)
|
||||
#ifdef TS_USE_WIDE
|
||||
|
||||
#ifdef WIN32
|
||||
|
||||
size_t
|
||||
wchar2char(char *to, const wchar_t *from, size_t len)
|
||||
@ -69,4 +71,59 @@ char2wchar(wchar_t *to, const char *from, size_t len)
|
||||
return mbstowcs(to, from, len);
|
||||
}
|
||||
|
||||
#endif /* WIN32 */
|
||||
|
||||
/*
 * _t_isalpha - alphabetic test for the character starting at ptr.
 *
 * This is the helper the t_isalpha() macro falls back to when pg_mblen()
 * reports a multibyte character.  The first character at ptr is converted
 * to a wide character with char2wchar() and classified with iswalpha().
 *
 * Returns non-zero if the character is alphabetic, 0 otherwise.  If the
 * conversion fails (char2wchar() reports (size_t) -1, as mbstowcs() does
 * for an invalid sequence) the character is treated as non-alphabetic
 * instead of classifying an uninitialized wchar_t.
 */
int
_t_isalpha( char *ptr )
{
	wchar_t		character = 0;

	if (char2wchar(&character, ptr, 1) == (size_t) -1)
		return 0;

	return iswalpha((wint_t) character);
}
|
||||
|
||||
/*
 * _t_isprint - printable test for the character starting at ptr.
 *
 * This is the helper the t_isprint() macro falls back to when pg_mblen()
 * reports a multibyte character.  The first character at ptr is converted
 * to a wide character with char2wchar() and classified with iswprint().
 *
 * Returns non-zero if the character is printable, 0 otherwise.  If the
 * conversion fails (char2wchar() reports (size_t) -1, as mbstowcs() does
 * for an invalid sequence) the character is treated as non-printable
 * instead of classifying an uninitialized wchar_t.
 */
int
_t_isprint( char *ptr )
{
	wchar_t		character = 0;

	if (char2wchar(&character, ptr, 1) == (size_t) -1)
		return 0;

	return iswprint((wint_t) character);
}
|
||||
|
||||
#endif /* TS_USE_WIDE */
|
||||
|
||||
/*
 * lowerstr - lower-case a NUL-terminated string in place.
 *
 * The buffer passed in is overwritten and the same pointer is returned, so
 * callers may either use the return value or ignore it.
 *
 * With TS_USE_WIDE, a multibyte database encoding and a non-C ctype locale,
 * the string is converted to wide characters, folded with towlower() and
 * converted back into the caller's buffer.  In every other case each byte
 * is folded with tolower().
 *
 * The result never occupies more than the original strlen(str) bytes: if
 * the lower-cased text would need more room, it is cut at the last whole
 * character that fits and terminated there, so no stale bytes of the
 * original string are left behind.  A string that cannot be converted to
 * or from wide characters raises an error.
 */
char *
lowerstr(char *str)
{
	char	   *ptr;

#ifdef TS_USE_WIDE

	/*
	 * Use wide char code only when max encoding length > 1 and ctype != C.
	 * Some operating systems fail with multi-byte encodings and a C locale.
	 * Also, for a C locale there is no need to process as multibyte. From
	 * backend/utils/adt/oracle_compat.c Teodor
	 */
	if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c())
	{
		size_t		len = strlen(str);
		size_t		nbytes;
		wchar_t    *wstr;
		wchar_t    *wptr;

		wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1));

		/*
		 * A failed conversion leaves wstr without a terminator; stop here
		 * rather than scanning uninitialized memory below.
		 */
		if (char2wchar(wstr, str, len + 1) == (size_t) -1)
			ereport(ERROR,
					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
					 errmsg("invalid multibyte character for locale")));

		for (wptr = wstr; *wptr; wptr++)
			*wptr = towlower((wint_t) *wptr);

		/*
		 * The limit is len, not len + 1, so the terminator already sitting
		 * at str[len] can never be overwritten.
		 */
		nbytes = wchar2char(str, wstr, len);
		if (nbytes == (size_t) -1)
			ereport(ERROR,
					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
					 errmsg("could not convert lower-cased string from wide characters")));

		/*
		 * If the conversion stopped early because the result did not fit,
		 * no terminator was written; cut the string at the last complete
		 * character.
		 */
		if (nbytes < len)
			str[nbytes] = '\0';

		pfree(wstr);
		return str;
	}
#endif

	for (ptr = str; *ptr; ptr++)
		*ptr = tolower(*(unsigned char *) ptr);

	return str;
}
|
||||
|
||||
|
@ -2,6 +2,8 @@
|
||||
#define __TSLOCALE_H__
|
||||
|
||||
#include "postgres.h"
|
||||
#include "utils/pg_locale.h"
|
||||
#include "mb/pg_wchar.h"
|
||||
|
||||
#include <ctype.h>
|
||||
#include <limits.h>
|
||||
@ -19,18 +21,58 @@
|
||||
|
||||
#if defined(HAVE_WCSTOMBS) && defined(HAVE_TOWLOWER)
|
||||
#define TS_USE_WIDE
|
||||
#endif
|
||||
|
||||
#ifdef TS_USE_WIDE
|
||||
#endif /* TS_USE_WIDE */
|
||||
|
||||
|
||||
#define TOUCHAR(x) (*((unsigned char*)(x)))
|
||||
|
||||
#ifdef TS_USE_WIDE
|
||||
|
||||
#ifdef WIN32
|
||||
|
||||
size_t wchar2char(char *to, const wchar_t *from, size_t len);
|
||||
size_t char2wchar(wchar_t *to, const char *from, size_t len);
|
||||
#else /* WIN32 */
|
||||
#else /* WIN32 */
|
||||
|
||||
/* correct mbstowcs */
|
||||
#define char2wchar mbstowcs
|
||||
#define wchar2char wcstombs
|
||||
#endif /* WIN32 */
|
||||
#endif /* defined(HAVE_WCSTOMBS) &&
|
||||
* defined(HAVE_TOWLOWER) */
|
||||
|
||||
#define t_isdigit(x) ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
|
||||
#define t_isspace(x) ( pg_mblen(x)==1 && isspace( TOUCHAR(x) ) )
|
||||
int _t_isalpha( char *ptr );
|
||||
#define t_isalpha(x) ( (pg_mblen(x)==1) ? isalpha( TOUCHAR(x) ) : _t_isalpha(x) )
|
||||
int _t_isprint( char *ptr );
|
||||
#define t_isprint(x) ( (pg_mblen(x)==1) ? isprint( TOUCHAR(x) ) : _t_isprint(x) )
|
||||
/*
|
||||
* t_iseq() should be called only for ASCII symbols
|
||||
*/
|
||||
#define t_iseq(x,c) ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false )
|
||||
|
||||
/*
 * COPYCHAR(d, s) - copy the one character that starts at s to d.
 *
 * The number of bytes copied is pg_mblen(s).  Neither pointer is advanced;
 * callers step both by pg_mblen() themselves.  Both arguments are
 * parenthesized before the byte offset is added, so pointer expressions
 * such as "cond ? a : b" expand correctly.  The arguments are still
 * evaluated more than once, so they must not have side effects.
 */
#define COPYCHAR(d,s) do { \
	int lll = pg_mblen( s ); \
	\
	while( lll-- ) \
		TOUCHAR((d)+lll) = TOUCHAR((s)+lll); \
} while(0)
|
||||
|
||||
|
||||
#else /* not def TS_USE_WIDE */
|
||||
|
||||
#define t_isdigit(x) isdigit( TOUCHAR(x) )
|
||||
#define t_isspace(x) isspace( TOUCHAR(x) )
|
||||
#define t_isalpha(x) isalpha( TOUCHAR(x) )
|
||||
#define t_isprint(x) isprint( TOUCHAR(x) )
|
||||
#define t_iseq(x,c) ( TOUCHAR(x) == ((unsigned char)(c)) )
|
||||
|
||||
#define COPYCHAR(d,s) TOUCHAR(d) = TOUCHAR(s)
|
||||
|
||||
#endif
|
||||
|
||||
char* lowerstr(char *str);
|
||||
|
||||
#endif /* __TSLOCALE_H__ */
|
||||
|
@ -8,6 +8,7 @@
|
||||
#include "catalog/pg_type.h"
|
||||
#include "executor/spi.h"
|
||||
#include "common.h"
|
||||
#include "ts_locale.h"
|
||||
|
||||
PG_FUNCTION_INFO_V1(tsstat_in);
|
||||
Datum tsstat_in(PG_FUNCTION_ARGS);
|
||||
@ -476,24 +477,30 @@ ts_stat_sql(text *txt, text *ws)
|
||||
buf = VARDATA(ws);
|
||||
while (buf - VARDATA(ws) < VARSIZE(ws) - VARHDRSZ)
|
||||
{
|
||||
switch (tolower(*buf))
|
||||
{
|
||||
case 'a':
|
||||
stat->weight |= 1 << 3;
|
||||
break;
|
||||
case 'b':
|
||||
stat->weight |= 1 << 2;
|
||||
break;
|
||||
case 'c':
|
||||
stat->weight |= 1 << 1;
|
||||
break;
|
||||
case 'd':
|
||||
stat->weight |= 1;
|
||||
break;
|
||||
default:
|
||||
stat->weight |= 0;
|
||||
if ( pg_mblen(buf) == 1 ) {
|
||||
switch (*buf)
|
||||
{
|
||||
case 'A':
|
||||
case 'a':
|
||||
stat->weight |= 1 << 3;
|
||||
break;
|
||||
case 'B':
|
||||
case 'b':
|
||||
stat->weight |= 1 << 2;
|
||||
break;
|
||||
case 'C':
|
||||
case 'c':
|
||||
stat->weight |= 1 << 1;
|
||||
break;
|
||||
case 'D':
|
||||
case 'd':
|
||||
stat->weight |= 1;
|
||||
break;
|
||||
default:
|
||||
stat->weight |= 0;
|
||||
}
|
||||
}
|
||||
buf++;
|
||||
buf+=pg_mblen(buf);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -16,8 +16,9 @@
|
||||
#include "catalog/namespace.h"
|
||||
|
||||
#include "utils/pg_locale.h"
|
||||
#include "mb/pg_wchar.h"
|
||||
|
||||
#include <ctype.h> /* tolower */
|
||||
#include <ctype.h>
|
||||
#include "tsvector.h"
|
||||
#include "query.h"
|
||||
#include "ts_cfg.h"
|
||||
@ -173,7 +174,7 @@ uniqueentry(WordEntryIN * a, int4 l, char *buf, int4 *outbuflen)
|
||||
|
||||
#define RESIZEPRSBUF \
|
||||
do { \
|
||||
if ( state->curpos - state->word + 1 >= state->len ) \
|
||||
if ( state->curpos - state->word + pg_database_encoding_max_length() >= state->len ) \
|
||||
{ \
|
||||
int4 clen = state->curpos - state->word; \
|
||||
state->len *= 2; \
|
||||
@ -182,6 +183,7 @@ do { \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
|
||||
int4
|
||||
gettoken_tsvector(TI_IN_STATE * state)
|
||||
{
|
||||
@ -197,21 +199,21 @@ gettoken_tsvector(TI_IN_STATE * state)
|
||||
{
|
||||
if (*(state->prsbuf) == '\0')
|
||||
return 0;
|
||||
else if (*(state->prsbuf) == '\'')
|
||||
else if ( t_iseq(state->prsbuf, '\'') )
|
||||
state->state = WAITENDCMPLX;
|
||||
else if (*(state->prsbuf) == '\\')
|
||||
else if ( t_iseq(state->prsbuf, '\\') )
|
||||
{
|
||||
state->state = WAITNEXTCHAR;
|
||||
oldstate = WAITENDWORD;
|
||||
}
|
||||
else if (state->oprisdelim && ISOPERATOR(*(state->prsbuf)))
|
||||
else if (state->oprisdelim && ISOPERATOR(state->prsbuf))
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_SYNTAX_ERROR),
|
||||
errmsg("syntax error")));
|
||||
else if (*(state->prsbuf) != ' ')
|
||||
else if (!t_isspace(state->prsbuf))
|
||||
{
|
||||
*(state->curpos) = *(state->prsbuf);
|
||||
state->curpos++;
|
||||
COPYCHAR(state->curpos, state->prsbuf);
|
||||
state->curpos+=pg_mblen(state->prsbuf);
|
||||
state->state = WAITENDWORD;
|
||||
}
|
||||
}
|
||||
@ -224,20 +226,20 @@ gettoken_tsvector(TI_IN_STATE * state)
|
||||
else
|
||||
{
|
||||
RESIZEPRSBUF;
|
||||
*(state->curpos) = *(state->prsbuf);
|
||||
state->curpos++;
|
||||
COPYCHAR(state->curpos, state->prsbuf);
|
||||
state->curpos+=pg_mblen(state->prsbuf);
|
||||
state->state = oldstate;
|
||||
}
|
||||
}
|
||||
else if (state->state == WAITENDWORD)
|
||||
{
|
||||
if (*(state->prsbuf) == '\\')
|
||||
if ( t_iseq(state->prsbuf, '\\') )
|
||||
{
|
||||
state->state = WAITNEXTCHAR;
|
||||
oldstate = WAITENDWORD;
|
||||
}
|
||||
else if (*(state->prsbuf) == ' ' || *(state->prsbuf) == '\0' ||
|
||||
(state->oprisdelim && ISOPERATOR(*(state->prsbuf))))
|
||||
else if ( t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' ||
|
||||
(state->oprisdelim && ISOPERATOR(state->prsbuf)))
|
||||
{
|
||||
RESIZEPRSBUF;
|
||||
if (state->curpos == state->word)
|
||||
@ -247,7 +249,7 @@ gettoken_tsvector(TI_IN_STATE * state)
|
||||
*(state->curpos) = '\0';
|
||||
return 1;
|
||||
}
|
||||
else if (*(state->prsbuf) == ':')
|
||||
else if ( t_iseq(state->prsbuf,':') )
|
||||
{
|
||||
if (state->curpos == state->word)
|
||||
ereport(ERROR,
|
||||
@ -262,13 +264,13 @@ gettoken_tsvector(TI_IN_STATE * state)
|
||||
else
|
||||
{
|
||||
RESIZEPRSBUF;
|
||||
*(state->curpos) = *(state->prsbuf);
|
||||
state->curpos++;
|
||||
COPYCHAR(state->curpos, state->prsbuf);
|
||||
state->curpos+=pg_mblen(state->prsbuf);
|
||||
}
|
||||
}
|
||||
else if (state->state == WAITENDCMPLX)
|
||||
{
|
||||
if (*(state->prsbuf) == '\'')
|
||||
if ( t_iseq(state->prsbuf, '\'') )
|
||||
{
|
||||
RESIZEPRSBUF;
|
||||
*(state->curpos) = '\0';
|
||||
@ -278,13 +280,13 @@ gettoken_tsvector(TI_IN_STATE * state)
|
||||
errmsg("syntax error")));
|
||||
if (state->oprisdelim)
|
||||
{
|
||||
state->prsbuf++;
|
||||
state->prsbuf+=pg_mblen(state->prsbuf);
|
||||
return 1;
|
||||
}
|
||||
else
|
||||
state->state = WAITPOSINFO;
|
||||
}
|
||||
else if (*(state->prsbuf) == '\\')
|
||||
else if ( t_iseq(state->prsbuf, '\\') )
|
||||
{
|
||||
state->state = WAITNEXTCHAR;
|
||||
oldstate = WAITENDCMPLX;
|
||||
@ -296,20 +298,20 @@ gettoken_tsvector(TI_IN_STATE * state)
|
||||
else
|
||||
{
|
||||
RESIZEPRSBUF;
|
||||
*(state->curpos) = *(state->prsbuf);
|
||||
state->curpos++;
|
||||
COPYCHAR(state->curpos, state->prsbuf);
|
||||
state->curpos+=pg_mblen(state->prsbuf);
|
||||
}
|
||||
}
|
||||
else if (state->state == WAITPOSINFO)
|
||||
{
|
||||
if (*(state->prsbuf) == ':')
|
||||
if ( t_iseq(state->prsbuf, ':') )
|
||||
state->state = INPOSINFO;
|
||||
else
|
||||
return 1;
|
||||
}
|
||||
else if (state->state == INPOSINFO)
|
||||
{
|
||||
if (isdigit((unsigned char) *(state->prsbuf)))
|
||||
if (t_isdigit(state->prsbuf))
|
||||
{
|
||||
if (state->alen == 0)
|
||||
{
|
||||
@ -338,9 +340,9 @@ gettoken_tsvector(TI_IN_STATE * state)
|
||||
}
|
||||
else if (state->state == WAITPOSDELIM)
|
||||
{
|
||||
if (*(state->prsbuf) == ',')
|
||||
if ( t_iseq(state->prsbuf, ',') )
|
||||
state->state = INPOSINFO;
|
||||
else if (tolower(*(state->prsbuf)) == 'a' || *(state->prsbuf) == '*')
|
||||
else if ( t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*') )
|
||||
{
|
||||
if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)]))
|
||||
ereport(ERROR,
|
||||
@ -348,7 +350,7 @@ gettoken_tsvector(TI_IN_STATE * state)
|
||||
errmsg("syntax error")));
|
||||
WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 3);
|
||||
}
|
||||
else if (tolower(*(state->prsbuf)) == 'b')
|
||||
else if ( t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B') )
|
||||
{
|
||||
if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)]))
|
||||
ereport(ERROR,
|
||||
@ -356,7 +358,7 @@ gettoken_tsvector(TI_IN_STATE * state)
|
||||
errmsg("syntax error")));
|
||||
WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 2);
|
||||
}
|
||||
else if (tolower(*(state->prsbuf)) == 'c')
|
||||
else if ( t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C') )
|
||||
{
|
||||
if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)]))
|
||||
ereport(ERROR,
|
||||
@ -364,7 +366,7 @@ gettoken_tsvector(TI_IN_STATE * state)
|
||||
errmsg("syntax error")));
|
||||
WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 1);
|
||||
}
|
||||
else if (tolower(*(state->prsbuf)) == 'd')
|
||||
else if ( t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D') )
|
||||
{
|
||||
if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)]))
|
||||
ereport(ERROR,
|
||||
@ -372,10 +374,10 @@ gettoken_tsvector(TI_IN_STATE * state)
|
||||
errmsg("syntax error")));
|
||||
WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 0);
|
||||
}
|
||||
else if (isspace((unsigned char) *(state->prsbuf)) ||
|
||||
else if (t_isspace(state->prsbuf) ||
|
||||
*(state->prsbuf) == '\0')
|
||||
return 1;
|
||||
else if (!isdigit((unsigned char) *(state->prsbuf)))
|
||||
else if (!t_isdigit(state->prsbuf))
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_SYNTAX_ERROR),
|
||||
errmsg("syntax error")));
|
||||
@ -383,7 +385,7 @@ gettoken_tsvector(TI_IN_STATE * state)
|
||||
else
|
||||
/* internal error */
|
||||
elog(ERROR, "internal error");
|
||||
state->prsbuf++;
|
||||
state->prsbuf+=pg_mblen(state->prsbuf);
|
||||
}
|
||||
|
||||
return 0;
|
||||
@ -405,6 +407,8 @@ tsvector_in(PG_FUNCTION_ARGS)
|
||||
buflen = 256;
|
||||
|
||||
SET_FUNCOID();
|
||||
|
||||
pg_verifymbstr( buf, strlen(buf), false );
|
||||
state.prsbuf = buf;
|
||||
state.len = 32;
|
||||
state.word = (char *) palloc(state.len);
|
||||
@ -495,17 +499,16 @@ tsvector_out(PG_FUNCTION_ARGS)
|
||||
tsvector *out = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
|
||||
char *outbuf;
|
||||
int4 i,
|
||||
j,
|
||||
lenbuf = 0,
|
||||
pp;
|
||||
WordEntry *ptr = ARRPTR(out);
|
||||
char *curin,
|
||||
char *curbegin, *curin,
|
||||
*curout;
|
||||
|
||||
lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ;
|
||||
for (i = 0; i < out->size; i++)
|
||||
{
|
||||
lenbuf += ptr[i].len * 2 /* for escape */ ;
|
||||
lenbuf += ptr[i].len * 2 * pg_database_encoding_max_length()/* for escape */ ;
|
||||
if (ptr[i].haspos)
|
||||
lenbuf += 7 * POSDATALEN(out, &(ptr[i]));
|
||||
}
|
||||
@ -513,14 +516,14 @@ tsvector_out(PG_FUNCTION_ARGS)
|
||||
curout = outbuf = (char *) palloc(lenbuf);
|
||||
for (i = 0; i < out->size; i++)
|
||||
{
|
||||
curin = STRPTR(out) + ptr->pos;
|
||||
curbegin = curin = STRPTR(out) + ptr->pos;
|
||||
if (i != 0)
|
||||
*curout++ = ' ';
|
||||
*curout++ = '\'';
|
||||
j = ptr->len;
|
||||
while (j--)
|
||||
while ( curin-curbegin < ptr->len )
|
||||
{
|
||||
if (*curin == '\'')
|
||||
int len = pg_mblen(curin);
|
||||
if ( t_iseq(curin, '\'') )
|
||||
{
|
||||
int4 pos = curout - outbuf;
|
||||
|
||||
@ -528,7 +531,8 @@ tsvector_out(PG_FUNCTION_ARGS)
|
||||
curout = outbuf + pos;
|
||||
*curout++ = '\\';
|
||||
}
|
||||
*curout++ = *curin++;
|
||||
while(len--)
|
||||
*curout++ = *curin++;
|
||||
}
|
||||
*curout++ = '\'';
|
||||
if ((pp = POSDATALEN(out, ptr)) != 0)
|
||||
|
@ -15,7 +15,6 @@
|
||||
|
||||
#include "utils/pg_locale.h"
|
||||
|
||||
#include <ctype.h> /* tolower */
|
||||
#include "tsvector.h"
|
||||
#include "query.h"
|
||||
#include "ts_cfg.h"
|
||||
@ -76,17 +75,21 @@ setweight(PG_FUNCTION_ARGS)
|
||||
WordEntryPos *p;
|
||||
int w = 0;
|
||||
|
||||
switch (tolower(cw))
|
||||
switch (cw)
|
||||
{
|
||||
case 'A':
|
||||
case 'a':
|
||||
w = 3;
|
||||
break;
|
||||
case 'B':
|
||||
case 'b':
|
||||
w = 2;
|
||||
break;
|
||||
case 'C':
|
||||
case 'c':
|
||||
w = 1;
|
||||
break;
|
||||
case 'D':
|
||||
case 'd':
|
||||
w = 0;
|
||||
break;
|
||||
|
@ -71,8 +71,11 @@ TParserClose(TParser * prs)
|
||||
prs->state = ptr;
|
||||
}
|
||||
|
||||
#ifdef TS_USE_WIDE
|
||||
if (prs->wstr)
|
||||
pfree(prs->wstr);
|
||||
#endif
|
||||
|
||||
pfree(prs);
|
||||
}
|
||||
|
||||
|
@ -134,8 +134,10 @@ typedef struct TParser
|
||||
/* string and position information */
|
||||
char *str; /* multibyte string */
|
||||
int lenstr; /* length of mbstring */
|
||||
#ifdef TS_USE_WIDE
|
||||
wchar_t *wstr; /* wide character string */
|
||||
int lenwstr; /* length of wstring */
|
||||
#endif
|
||||
|
||||
/* State of parse */
|
||||
int charmaxlen;
|
||||
|
Loading…
x
Reference in New Issue
Block a user