1
0
mirror of https://github.com/postgres/postgres.git synced 2025-09-02 04:21:28 +03:00

Tsearch2 functionality migrates to core. The bulk of this work is by

Oleg Bartunov and Teodor Sigaev, but I did a lot of editorializing,
so anything that's broken is probably my fault.

Documentation is nonexistent as yet, but let's land the patch so we can
get some portability testing done.
This commit is contained in:
Tom Lane
2007-08-21 01:11:32 +00:00
parent 4e94d1f952
commit 140d4ebcb4
200 changed files with 54388 additions and 147 deletions

View File

@@ -0,0 +1,51 @@
#-------------------------------------------------------------------------
#
# Makefile for backend/tsearch
#
# Copyright (c) 2006-2007, PostgreSQL Global Development Group
#
# $PostgreSQL: pgsql/src/backend/tsearch/Makefile,v 1.1 2007/08/21 01:11:18 tgl Exp $
#
#-------------------------------------------------------------------------

subdir = src/backend/tsearch
top_builddir = ../../..
include $(top_builddir)/src/Makefile.global

# Sample dictionary files, and the subdirectory of $(datadir) they go into.
DICTDIR=tsearch_data
DICTFILES=synonym.syn.sample thesaurus.ths.sample

OBJS = ts_locale.o ts_parse.o wparser.o wparser_def.o dict.o \
	dict_simple.o dict_synonym.o dict_thesaurus.o \
	dict_ispell.o regis.o spell.o \
	to_tsany.o ts_utils.o

all: SUBSYS.o

# Recipe lines must begin with a TAB character.
SUBSYS.o: $(OBJS)
	$(LD) $(LDREL) $(LDOUT) SUBSYS.o $^

depend dep:
	$(CC) -MM $(CFLAGS) *.c >depend

# Copy each sample dictionary file into $(datadir)/$(DICTDIR).
.PHONY: install-data
install-data: $(DICTFILES) installdirs
	for i in $(DICTFILES); \
		do $(INSTALL_DATA) $$i '$(DESTDIR)$(datadir)/$(DICTDIR)/'$$i; \
	done

installdirs:
	$(mkinstalldirs) '$(DESTDIR)$(datadir)' '$(DESTDIR)$(datadir)/$(DICTDIR)'

# Remove the files copied by install-data.
.PHONY: uninstall-data
uninstall-data:
	for i in $(DICTFILES); \
		do rm -rf '$(DESTDIR)$(datadir)/$(DICTDIR)/'$$i ; \
	done

clean distclean maintainer-clean:
	rm -f SUBSYS.o $(OBJS)

# Pull in generated dependencies only if "make depend" has been run.
ifeq (depend,$(wildcard depend))
include depend
endif

131
src/backend/tsearch/dict.c Normal file
View File

@@ -0,0 +1,131 @@
/*-------------------------------------------------------------------------
*
* dict.c
* Standard interface to dictionary
*
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/tsearch/dict.c,v 1.1 2007/08/21 01:11:18 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "funcapi.h"
#include "access/genam.h"
#include "access/heapam.h"
#include "access/skey.h"
#include "catalog/indexing.h"
#include "catalog/namespace.h"
#include "catalog/pg_ts_dict.h"
#include "catalog/pg_type.h"
#include "tsearch/ts_cache.h"
#include "tsearch/ts_public.h"
#include "tsearch/ts_utils.h"
#include "utils/array.h"
#include "utils/builtins.h"
#include "utils/fmgroids.h"
#include "utils/rel.h"
#include "utils/syscache.h"
/*
 * Feed one word to a single dictionary and hand back the lexemes it
 * produced as a text[] array.  Mostly useful for debugging dictionaries.
 *
 * If the dictionary asks for more input (dstate.getnext), it is called a
 * second time with isend set, and a non-NULL second answer replaces the
 * first.  Returns NULL when the final lexeme pointer is NULL.
 */
static ArrayType *
ts_lexize_workhorse(Oid dictId, text *in)
{
	TSDictionaryCacheEntry *dict;
	DictSubState dstate = {false, false, NULL};
	TSLexeme   *res;
	Datum	   *da;
	ArrayType  *a;
	int			nlex;
	int			i;

	dict = lookup_ts_dictionary_cache(dictId);

	res = (TSLexeme *) DatumGetPointer(FunctionCall4(&dict->lexize,
													 PointerGetDatum(dict->dictData),
													 PointerGetDatum(VARDATA(in)),
													 Int32GetDatum(VARSIZE(in) - VARHDRSZ),
													 PointerGetDatum(&dstate)));

	if (dstate.getnext)
	{
		TSLexeme   *again;

		/* tell the dictionary that no further words will follow */
		dstate.isend = true;
		again = (TSLexeme *) DatumGetPointer(FunctionCall4(&dict->lexize,
														   PointerGetDatum(dict->dictData),
														   PointerGetDatum(VARDATA(in)),
														   Int32GetDatum(VARSIZE(in) - VARHDRSZ),
														   PointerGetDatum(&dstate)));
		if (again != NULL)
			res = again;
	}

	if (res == NULL)
		return NULL;

	/* the lexeme list is terminated by an entry with a NULL lexeme */
	for (nlex = 0; res[nlex].lexeme != NULL; nlex++)
		;

	da = (Datum *) palloc(sizeof(Datum) * (nlex + 1));
	for (i = 0; i < nlex; i++)
		da[i] = DirectFunctionCall1(textin, CStringGetDatum(res[i].lexeme));

	a = construct_array(da,
						nlex,
						TEXTOID,
						-1,
						false,
						'i');

	/* release the temporary text datums and the dictionary's output */
	for (i = 0; i < nlex; i++)
	{
		pfree(DatumGetPointer(da[i]));
		pfree(res[i].lexeme);
	}
	pfree(res);
	pfree(da);

	return a;
}
Datum
ts_lexize_byid(PG_FUNCTION_ARGS)
{
Oid dictId = PG_GETARG_OID(0);
text *in = PG_GETARG_TEXT_P(1);
ArrayType *a;
a = ts_lexize_workhorse(dictId, in);
if (a)
PG_RETURN_POINTER(a);
else
PG_RETURN_NULL();
}
Datum
ts_lexize_byname(PG_FUNCTION_ARGS)
{
text *dictname = PG_GETARG_TEXT_P(0);
text *in = PG_GETARG_TEXT_P(1);
Oid dictId;
ArrayType *a;
dictId = TSDictionaryGetDictid(textToQualifiedNameList(dictname), false);
a = ts_lexize_workhorse(dictId, in);
if (a)
PG_RETURN_POINTER(a);
else
PG_RETURN_NULL();
}

View File

@@ -0,0 +1,164 @@
/*-------------------------------------------------------------------------
*
* dict_ispell.c
* Ispell dictionary interface
*
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/tsearch/dict_ispell.c,v 1.1 2007/08/21 01:11:18 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "tsearch/dicts/spell.h"
#include "tsearch/ts_locale.h"
#include "tsearch/ts_public.h"
#include "tsearch/ts_utils.h"
#include "utils/builtins.h"
#include "utils/memutils.h"
/*
 * Private state of one ISpell dictionary instance: built by dispell_init
 * and consulted by dispell_lexize.
 */
typedef struct
{
StopList stoplist; /* stop words; matches are dropped from the result */
IspellDict obj; /* dictionary and affix data loaded by the NI* routines */
} DictISpell;
Datum
dispell_init(PG_FUNCTION_ARGS)
{
DictISpell *d;
Map *cfg,
*pcfg;
bool affloaded = false,
dictloaded = false,
stoploaded = false;
text *in;
/* init functions must defend against NULLs for themselves */
if (PG_ARGISNULL(0) || PG_GETARG_POINTER(0) == NULL)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("NULL config not allowed for ISpell")));
in = PG_GETARG_TEXT_P(0);
parse_keyvalpairs(in, &cfg);
PG_FREE_IF_COPY(in, 0);
d = (DictISpell *) palloc0(sizeof(DictISpell));
d->stoplist.wordop = recode_and_lowerstr;
pcfg = cfg;
while (pcfg->key)
{
if (pg_strcasecmp("DictFile", pcfg->key) == 0)
{
if (dictloaded)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("multiple DictFile parameters")));
NIImportDictionary(&(d->obj),
get_tsearch_config_filename(pcfg->value,
"dict"));
dictloaded = true;
}
else if (pg_strcasecmp("AffFile", pcfg->key) == 0)
{
if (affloaded)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("multiple AffFile parameters")));
NIImportAffixes(&(d->obj),
get_tsearch_config_filename(pcfg->value,
"affix"));
affloaded = true;
}
else if (pg_strcasecmp("StopWords", pcfg->key) == 0)
{
if (stoploaded)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("multiple StopWords parameters")));
readstoplist(pcfg->value, &(d->stoplist));
sortstoplist(&(d->stoplist));
stoploaded = true;
}
else
{
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("unrecognized ISpell parameter: \"%s\"",
pcfg->key)));
}
pfree(pcfg->key);
pfree(pcfg->value);
pcfg++;
}
pfree(cfg);
if (affloaded && dictloaded)
{
NISortDictionary(&(d->obj));
NISortAffixes(&(d->obj));
}
else if (!affloaded)
{
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("missing AffFile parameter")));
}
else
{
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("missing DictFile parameter")));
}
MemoryContextDeleteChildren(CurrentMemoryContext);
PG_RETURN_POINTER(d);
}
/*
 * Normalize one word with the ISpell dictionary.
 *
 * Arguments: 0 = DictISpell state, 1 = pointer to the word's bytes,
 * 2 = its length in bytes.
 *
 * Returns NULL for an empty word or when NINormalizeWord returns NULL.
 * Otherwise returns the normalized lexeme array with stop words removed;
 * the array is terminated by an entry whose lexeme is NULL (and may consist
 * of just that terminator if every candidate was a stop word).
 */
Datum
dispell_lexize(PG_FUNCTION_ARGS)
{
	DictISpell *d = (DictISpell *) PG_GETARG_POINTER(0);
	char	   *in = (char *) PG_GETARG_POINTER(1);
	int32		len = PG_GETARG_INT32(2);
	char	   *txt;
	TSLexeme   *res;
	TSLexeme   *ptr,
			   *cptr;

	if (len <= 0)
		PG_RETURN_POINTER(NULL);

	txt = lowerstr_with_len(in, len);
	res = NINormalizeWord(&(d->obj), txt);

	if (res == NULL)
		PG_RETURN_POINTER(NULL);

	/* compact the array in place: ptr reads, cptr writes */
	ptr = cptr = res;
	while (ptr->lexeme)
	{
		if (searchstoplist(&(d->stoplist), ptr->lexeme))
		{
			pfree(ptr->lexeme);
			ptr->lexeme = NULL;
			ptr++;
		}
		else
		{
			/*
			 * Until the first stop word is dropped, source and destination
			 * are the same element; memcpy on overlapping objects is
			 * undefined, so skip the copy in that case.
			 */
			if (cptr != ptr)
				memcpy(cptr, ptr, sizeof(TSLexeme));
			cptr++;
			ptr++;
		}
	}
	cptr->lexeme = NULL;

	PG_RETURN_POINTER(res);
}

View File

@@ -0,0 +1,65 @@
/*-------------------------------------------------------------------------
*
* dict_simple.c
* Simple dictionary: just lowercase and check for stopword
*
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/tsearch/dict_simple.c,v 1.1 2007/08/21 01:11:18 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "tsearch/ts_locale.h"
#include "tsearch/ts_public.h"
#include "tsearch/ts_utils.h"
#include "utils/builtins.h"
/*
 * Private state of the "simple" dictionary: only a stop-word list.
 */
typedef struct
{
StopList stoplist; /* filled by dsimple_init when a file name is supplied */
} DictExample;
/*
 * Create a simple dictionary.  Argument 0, when not NULL, is the name of a
 * stop-word file that is read and sorted; with a NULL argument the stop
 * list stays empty.
 */
Datum
dsimple_init(PG_FUNCTION_ARGS)
{
	DictExample *d;

	d = (DictExample *) palloc0(sizeof(DictExample));
	d->stoplist.wordop = recode_and_lowerstr;

	if (PG_ARGISNULL(0) || PG_GETARG_POINTER(0) == NULL)
		PG_RETURN_POINTER(d);

	{
		text	   *in = PG_GETARG_TEXT_P(0);
		char	   *stopfile = TextPGetCString(in);

		readstoplist(stopfile, &d->stoplist);
		sortstoplist(&d->stoplist);
		pfree(stopfile);
	}

	PG_RETURN_POINTER(d);
}
Datum
dsimple_lexize(PG_FUNCTION_ARGS)
{
DictExample *d = (DictExample *) PG_GETARG_POINTER(0);
char *in = (char *) PG_GETARG_POINTER(1);
int32 len = PG_GETARG_INT32(2);
char *txt = lowerstr_with_len(in, len);
TSLexeme *res = palloc0(sizeof(TSLexeme) * 2);
if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
{
pfree(txt);
}
else
res[0].lexeme = txt;
PG_RETURN_POINTER(res);
}

View File

@@ -0,0 +1,176 @@
/*-------------------------------------------------------------------------
*
* dict_synonym.c
* Synonym dictionary: replace word by its synonym
*
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/tsearch/dict_synonym.c,v 1.1 2007/08/21 01:11:18 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "storage/fd.h"
#include "tsearch/ts_locale.h"
#include "tsearch/ts_public.h"
#include "tsearch/ts_utils.h"
#include "utils/builtins.h"
/* Size in bytes of the buffer dsynonym_init passes to fgets for each line */
#define SYNBUFLEN 4096
/*
 * One synonym pair.  Arrays of Syn are sorted and searched by "in"
 * (see compareSyn).
 */
typedef struct
{
char *in; /* word to look up */
char *out; /* replacement returned by dsynonym_lexize */
} Syn;
/*
 * Private state of a synonym dictionary.
 */
typedef struct
{
int len; /* allocated size while loading; number of pairs once loaded */
Syn *syn; /* array of pairs, sorted by Syn.in after loading */
} DictSyn;
/*
 * Locate the next whitespace-delimited word in "in".
 *
 * Returns a pointer to the word's first byte and stores in *end a pointer
 * just past its last byte.  If only whitespace (or nothing) remains,
 * returns NULL and leaves *end set to NULL.  The input is not modified.
 */
static char *
findwrd(char *in, char **end)
{
	char	   *word;

	*end = NULL;

	/* skip leading whitespace, one (possibly multibyte) character at a time */
	for (; *in && t_isspace(in); in += pg_mblen(in))
		;

	if (*in == '\0')
		return NULL;

	word = in;

	/* advance to the first whitespace character or the end of the string */
	for (; *in && !t_isspace(in); in += pg_mblen(in))
		;

	*end = in;
	return word;
}
static int
compareSyn(const void *a, const void *b)
{
return strcmp(((Syn *) a)->in, ((Syn *) b)->in);
}
/*
 * Load a synonym dictionary.
 *
 * Argument 0 names the synonym file (resolved with the "syn" extension).
 * Each line supplies a word and its replacement, separated by whitespace;
 * lines with fewer than two words are skipped.  Both words are passed
 * through recode_and_lowerstr and the resulting array is sorted by the
 * source word.
 */
Datum
dsynonym_init(PG_FUNCTION_ARGS)
{
	text	   *in;
	DictSyn    *d;
	FILE	   *fin;
	char	   *filename;
	char		buf[SYNBUFLEN];
	int			nused = 0;

	/* init functions must defend against NULLs for themselves */
	if (PG_ARGISNULL(0) || PG_GETARG_POINTER(0) == NULL)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("NULL config not allowed for Synonym")));

	in = PG_GETARG_TEXT_P(0);
	filename = get_tsearch_config_filename(TextPGetCString(in), "syn");
	PG_FREE_IF_COPY(in, 0);

	fin = AllocateFile(filename, "r");
	if (fin == NULL)
		ereport(ERROR,
				(errcode(ERRCODE_CONFIG_FILE_ERROR),
				 errmsg("could not open synonym file \"%s\": %m",
						filename)));

	d = (DictSyn *) palloc0(sizeof(DictSyn));

	while (fgets(buf, SYNBUFLEN, fin))
	{
		char	   *starti;
		char	   *starto;
		char	   *end = NULL;
		int			slen = strlen(buf);

		pg_verifymbstr(buf, slen, false);

		/* make sure there is room for one more pair before parsing */
		if (nused == d->len)
		{
			if (d->len == 0)
			{
				d->len = 16;
				d->syn = (Syn *) palloc(sizeof(Syn) * d->len);
			}
			else
			{
				d->len *= 2;
				d->syn = (Syn *) repalloc(d->syn, sizeof(Syn) * d->len);
			}
		}

		/* first word: the term to be replaced */
		starti = findwrd(buf, &end);
		if (!starti)
			continue;
		*end = '\0';
		if (end >= buf + slen)
			continue;			/* the line ended right after the first word */

		/* second word: the replacement */
		starto = findwrd(end + 1, &end);
		if (!starto)
			continue;
		*end = '\0';

		d->syn[nused].in = recode_and_lowerstr(starti);
		d->syn[nused].out = recode_and_lowerstr(starto);
		if (!(d->syn[nused].in && d->syn[nused].out))
		{
			FreeFile(fin);
			ereport(ERROR,
					(errcode(ERRCODE_OUT_OF_MEMORY),
					 errmsg("out of memory")));
		}

		nused++;
	}

	FreeFile(fin);

	/* from here on, len is the number of pairs actually stored */
	d->len = nused;
	if (nused > 1)
		qsort(d->syn, d->len, sizeof(Syn), compareSyn);

	pfree(filename);

	PG_RETURN_POINTER(d);
}
/*
 * Look up one word in the synonym table.
 *
 * Returns NULL for an empty word or when the lowercased word has no entry.
 * Otherwise returns a NULL-terminated TSLexeme array holding a copy of the
 * replacement word.
 */
Datum
dsynonym_lexize(PG_FUNCTION_ARGS)
{
	DictSyn    *d = (DictSyn *) PG_GETARG_POINTER(0);
	char	   *in = (char *) PG_GETARG_POINTER(1);
	int32		len = PG_GETARG_INT32(2);
	Syn			probe;
	Syn		   *match;
	TSLexeme   *res;

	if (len <= 0)
		PG_RETURN_POINTER(NULL);

	probe.in = lowerstr_with_len(in, len);
	probe.out = NULL;

	match = (Syn *) bsearch(&probe, d->syn, d->len, sizeof(Syn), compareSyn);
	pfree(probe.in);

	if (match == NULL)
		PG_RETURN_POINTER(NULL);

	/* two zeroed slots: the result and the terminator */
	res = palloc0(sizeof(TSLexeme) * 2);
	res[0].lexeme = pstrdup(match->out);

	PG_RETURN_POINTER(res);
}

View File

@@ -0,0 +1,887 @@
/*-------------------------------------------------------------------------
*
* dict_thesaurus.c
* Thesaurus dictionary: phrase to phrase substitution
*
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/tsearch/dict_thesaurus.c,v 1.1 2007/08/21 01:11:18 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "catalog/namespace.h"
#include "storage/fd.h"
#include "tsearch/ts_cache.h"
#include "tsearch/ts_locale.h"
#include "tsearch/ts_public.h"
#include "tsearch/ts_utils.h"
#include "utils/builtins.h"
/*
 * Temporarily we use TSLexeme.flags for internal purposes: this bit marks a
 * substitute word that must be stored as-is instead of being passed to the
 * subdictionary (set in addWrd, consumed in compileTheSubstitute).
 */
#define DT_USEASIS 0x1000
/*
 * One occurrence of a sample lexeme within a thesaurus rule.
 *
 * nextentry chains the occurrences that share the same lexeme (linked in
 * compileTheLexeme); nextvariant chains the candidate rules collected for
 * the current input position (linked in findVariant).
 */
typedef struct LexemeInfo
{
uint16 idsubst; /* entry's number in DictThesaurus->subst */
uint16 posinsubst; /* pos info in entry */
uint16 tnvariant; /* total num lexemes in one variant */
struct LexemeInfo *nextentry;
struct LexemeInfo *nextvariant;
} LexemeInfo;
/*
 * A sample lexeme plus the list of places where it occurs.  A NULL lexeme
 * stands for "any stop word" (see compileTheLexeme and cmpLexeme).
 * Field order matters: findTheLexeme initializes this struct positionally.
 */
typedef struct
{
char *lexeme;
LexemeInfo *entries;
} TheLexeme;
/*
 * The replacement side of one thesaurus rule.
 */
typedef struct
{
uint16 lastlexeme; /* number lexemes to substitute */
uint16 reslen; /* number of entries in res; set by compileTheSubstitute */
TSLexeme *res; /* prepared substituted result */
} TheSubstitute;
/*
 * Private state of a thesaurus dictionary.
 */
typedef struct
{
/* subdictionary to normalize lexemes */
Oid subdictOid;
TSDictionaryCacheEntry *subdict;
/* Array to search lexeme by exact match */
TheLexeme *wrds;
int nwrds; /* number of used elements in wrds */
int ntwrds; /* number of allocated elements in wrds */
/*
* Storage of substituted result, n-th element is for n-th expression
*/
TheSubstitute *subst;
int nsubst; /* allocated size while reading; number of rules afterwards */
} DictThesaurus;
/*
 * Append the sample word [b, e) to d->wrds, growing the array as needed,
 * and attach a fresh LexemeInfo recording the rule number (idsubst) and the
 * word's position within the rule (posinsubst).
 */
static void
newLexeme(DictThesaurus * d, char *b, char *e, uint16 idsubst, uint16 posinsubst)
{
	TheLexeme  *slot;
	LexemeInfo *entry;

	/* start with 16 slots, then double whenever the array is full */
	if (d->nwrds >= d->ntwrds)
	{
		if (d->ntwrds == 0)
		{
			d->ntwrds = 16;
			d->wrds = (TheLexeme *) palloc(sizeof(TheLexeme) * d->ntwrds);
		}
		else
		{
			d->ntwrds *= 2;
			d->wrds = (TheLexeme *) repalloc(d->wrds, sizeof(TheLexeme) * d->ntwrds);
		}
	}

	slot = &d->wrds[d->nwrds];
	d->nwrds++;

	/* store a NUL-terminated copy of the word */
	slot->lexeme = palloc(e - b + 1);
	memcpy(slot->lexeme, b, e - b);
	slot->lexeme[e - b] = '\0';

	entry = (LexemeInfo *) palloc(sizeof(LexemeInfo));
	slot->entries = entry;
	entry->nextentry = NULL;
	entry->idsubst = idsubst;
	entry->posinsubst = posinsubst;
}
/*
 * Append the substitute word [b, e) to the result list of rule idsubst.
 *
 * nwrd is the index of this word within the rule's substitution; a value of
 * 0 marks the first word of a new rule and resets the per-rule counters.
 * posinsubst is the number of sample lexemes read for the rule, and useasis
 * requests that the word be flagged DT_USEASIS.
 *
 * The function keeps the used/allocated counts of the current rule's res
 * array in static variables, so calls for one rule must be consecutive and
 * start with nwrd == 0.
 */
static void
addWrd(DictThesaurus * d, char *b, char *e, uint16 idsubst, uint16 nwrd, uint16 posinsubst, bool useasis)
{
/* used and allocated entries of the res array being built */
static int nres = 0;
static int ntres = 0;
TheSubstitute *ptr;
if (nwrd == 0)
{
/* first word of a rule: restart counters, make room in d->subst */
nres = ntres = 0;
if (idsubst >= d->nsubst)
{
if (d->nsubst == 0)
{
d->nsubst = 16;
d->subst = (TheSubstitute *) palloc(sizeof(TheSubstitute) * d->nsubst);
}
else
{
d->nsubst *= 2;
d->subst = (TheSubstitute *) repalloc(d->subst, sizeof(TheSubstitute) * d->nsubst);
}
}
}
ptr = d->subst + idsubst;
ptr->lastlexeme = posinsubst - 1;
/* keep one spare slot for the NULL terminator written below */
if (nres + 1 >= ntres)
{
if (ntres == 0)
{
ntres = 2;
ptr->res = (TSLexeme *) palloc(sizeof(TSLexeme) * ntres);
}
else
{
ntres *= 2;
ptr->res = (TSLexeme *) repalloc(ptr->res, sizeof(TSLexeme) * ntres);
}
}
ptr->res[nres].lexeme = palloc(e - b + 1);
memcpy(ptr->res[nres].lexeme, b, e - b);
ptr->res[nres].lexeme[e - b] = '\0';
ptr->res[nres].nvariant = nwrd;
if (useasis)
ptr->res[nres].flags = DT_USEASIS;
else
ptr->res[nres].flags = 0;
/* advance and terminate the list */
ptr->res[++nres].lexeme = NULL;
}
/* Parser states used by thesaurusRead while scanning one line */
#define TR_WAITLEX 1 /* before ':' — expecting a sample word or ':' */
#define TR_INLEX 2 /* inside a sample word */
#define TR_WAITSUBS 3 /* after ':' — expecting a substitute word */
#define TR_INSUBS 4 /* inside a substitute word */
/*
 * Read a thesaurus file into d.
 *
 * filename is resolved with the "ths" extension.  Every rule line has the
 * form "sample words : substitute words"; each sample word is recorded with
 * newLexeme and each substitute word with addWrd.  A substitute word
 * prefixed by '*' is flagged to be used as-is.  Lines that are empty,
 * contain only whitespace, or whose first non-blank character is '#' are
 * ignored.  On return d->nsubst is the number of rules read.
 *
 * Raises an error if the file cannot be opened or a line is malformed.
 */
static void
thesaurusRead(char *filename, DictThesaurus * d)
{
	FILE	   *fh;
	char		str[BUFSIZ];
	int			lineno = 0;
	uint16		idsubst = 0;
	bool		useasis = false;

	filename = get_tsearch_config_filename(filename, "ths");
	fh = AllocateFile(filename, "r");
	if (!fh)
		ereport(ERROR,
				(errcode(ERRCODE_CONFIG_FILE_ERROR),
				 errmsg("could not open thesaurus file \"%s\": %m",
						filename)));

	while (fgets(str, sizeof(str), fh))
	{
		char	   *ptr,
				   *recoded;
		int			state = TR_WAITLEX;
		char	   *beginwrd = NULL;
		uint16		posinsubst = 0;
		uint16		nwrd = 0;

		/*
		 * NOTE(review): the arguments request a conversion from the database
		 * encoding to UTF-8; confirm this is the intended direction for text
		 * that was just read from a file.
		 */
		ptr = recoded = (char *) pg_do_encoding_conversion((unsigned char *) str, strlen(str),
														   GetDatabaseEncoding(), PG_UTF8);
		if (recoded == NULL)
			elog(ERROR, "encoding conversion failed");

		lineno++;

		/* skip leading whitespace, then test what follows it */
		while (*ptr && t_isspace(ptr))
			ptr += pg_mblen(ptr);

		/* is it a comment or a blank line? */
		if (t_iseq(ptr, '#') || *ptr == '\0' || t_iseq(ptr, '\n') || t_iseq(ptr, '\r'))
		{
			/* the conversion may have allocated a new buffer */
			if (recoded != str)
				pfree(recoded);
			continue;
		}

		while (*ptr)
		{
			if (state == TR_WAITLEX)
			{
				if (t_iseq(ptr, ':'))
				{
					/* a ':' before any sample word is an error */
					if (posinsubst == 0)
					{
						FreeFile(fh);
						ereport(ERROR,
								(errcode(ERRCODE_CONFIG_FILE_ERROR),
								 errmsg("unexpected delimiter at line %d of thesaurus file \"%s\"",
										lineno, filename)));
					}
					state = TR_WAITSUBS;
				}
				else if (!t_isspace(ptr))
				{
					beginwrd = ptr;
					state = TR_INLEX;
				}
			}
			else if (state == TR_INLEX)
			{
				if (t_iseq(ptr, ':'))
				{
					newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
					state = TR_WAITSUBS;
				}
				else if (t_isspace(ptr))
				{
					newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
					state = TR_WAITLEX;
				}
			}
			else if (state == TR_WAITSUBS)
			{
				if (t_iseq(ptr, '*'))
				{
					/* '*' prefix: keep the word as-is */
					useasis = true;
					state = TR_INSUBS;
					beginwrd = ptr + pg_mblen(ptr);
				}
				else if (t_iseq(ptr, '\\'))
				{
					/* backslash prefix: word starts after it, normal handling */
					useasis = false;
					state = TR_INSUBS;
					beginwrd = ptr + pg_mblen(ptr);
				}
				else if (!t_isspace(ptr))
				{
					useasis = false;
					beginwrd = ptr;
					state = TR_INSUBS;
				}
			}
			else if (state == TR_INSUBS)
			{
				if (t_isspace(ptr))
				{
					/* a bare '*' or '\' followed by whitespace */
					if (ptr == beginwrd)
						ereport(ERROR,
								(errcode(ERRCODE_CONFIG_FILE_ERROR),
								 errmsg("unexpected end of line or lexeme at line %d of thesaurus file \"%s\"",
										lineno, filename)));
					addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis);
					state = TR_WAITSUBS;
				}
			}
			else
				elog(ERROR, "unrecognized thesaurus state: %d", state);

			ptr += pg_mblen(ptr);
		}

		/* the line ended in the middle of a substitute word */
		if (state == TR_INSUBS)
		{
			if (ptr == beginwrd)
				ereport(ERROR,
						(errcode(ERRCODE_CONFIG_FILE_ERROR),
						 errmsg("unexpected end of line or lexeme at line %d of thesaurus file \"%s\"",
								lineno, filename)));
			addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis);
		}

		idsubst++;

		/* a rule needs at least one sample word and one substitute word */
		if (!(nwrd && posinsubst))
		{
			FreeFile(fh);
			ereport(ERROR,
					(errcode(ERRCODE_CONFIG_FILE_ERROR),
					 errmsg("unexpected end of line at line %d of thesaurus file \"%s\"",
							lineno, filename)));
		}

		if (recoded != str)
			pfree(recoded);
	}

	d->nsubst = idsubst;

	FreeFile(fh);
}
/*
 * Append one normalized lexeme to the newwrds array and return the
 * (possibly reallocated) array.
 *
 * *nnw is the number of used slots and *tnm the allocated size; both are
 * updated.  A NULL lexeme (or one whose string is NULL) is stored as a NULL
 * entry with tnvariant 1; otherwise the string is copied and tnvariant is
 * taken from the argument.  Rule number and position are copied from src.
 */
static TheLexeme *
addCompiledLexeme(TheLexeme * newwrds, int *nnw, int *tnm, TSLexeme * lexeme, LexemeInfo * src, uint16 tnvariant)
{
	TheLexeme  *slot;
	LexemeInfo *entry;

	if (*nnw >= *tnm)
	{
		*tnm *= 2;
		newwrds = (TheLexeme *) repalloc(newwrds, sizeof(TheLexeme) * *tnm);
	}

	slot = &newwrds[*nnw];
	entry = (LexemeInfo *) palloc(sizeof(LexemeInfo));
	slot->entries = entry;

	if (lexeme && lexeme->lexeme)
	{
		slot->lexeme = pstrdup(lexeme->lexeme);
		entry->tnvariant = tnvariant;
	}
	else
	{
		slot->lexeme = NULL;
		entry->tnvariant = 1;
	}

	entry->idsubst = src->idsubst;
	entry->posinsubst = src->posinsubst;
	entry->nextentry = NULL;

	(*nnw)++;
	return newwrds;
}
/*
 * Three-way comparison of two LexemeInfo records by idsubst, then
 * posinsubst, then tnvariant.  Returns 0 if either pointer is NULL.
 */
static int
cmpLexemeInfo(LexemeInfo * a, LexemeInfo * b)
{
	if (a == NULL || b == NULL)
		return 0;

	if (a->idsubst != b->idsubst)
		return (a->idsubst > b->idsubst) ? 1 : -1;

	if (a->posinsubst != b->posinsubst)
		return (a->posinsubst > b->posinsubst) ? 1 : -1;

	if (a->tnvariant != b->tnvariant)
		return (a->tnvariant > b->tnvariant) ? 1 : -1;

	return 0;
}
/*
 * Compare two TheLexeme entries by their lexeme string.  A NULL lexeme
 * sorts after every non-NULL one; two NULL lexemes compare equal.
 */
static int
cmpLexeme(TheLexeme * a, TheLexeme * b)
{
	if (a->lexeme == NULL || b->lexeme == NULL)
	{
		if (a->lexeme == b->lexeme)
			return 0;			/* both are NULL */
		return (a->lexeme == NULL) ? 1 : -1;
	}

	return strcmp(a->lexeme, b->lexeme);
}
static int
cmpLexemeQ(const void *a, const void *b)
{
return cmpLexeme((TheLexeme *) a, (TheLexeme *) b);
}
/*
 * qsort comparator for TheLexeme: order by lexeme first; entries with equal
 * lexemes are ordered by their first LexemeInfo in reverse (descending).
 */
static int
cmpTheLexeme(const void *a, const void *b)
{
	TheLexeme  *la = (TheLexeme *) a;
	TheLexeme  *lb = (TheLexeme *) b;
	int			res = cmpLexeme(la, lb);

	if (res == 0)
		res = -cmpLexemeInfo(la->entries, lb->entries);

	return res;
}
/*
 * Replace the raw sample words in d->wrds by their normalized forms.
 *
 * Each word is passed to the subdictionary's lexize function.  A NULL
 * result is an error; an empty result (stop word) is stored as a NULL
 * lexeme after a NOTICE.  Otherwise every returned lexeme is added, tagged
 * with the number of lexemes in its variant.  The new array is then sorted
 * and entries with equal lexemes are merged, chaining their LexemeInfo
 * records through nextentry.
 */
static void
compileTheLexeme(DictThesaurus * d)
{
int i,
nnw = 0,
tnm = 16;
TheLexeme *newwrds = (TheLexeme *) palloc(sizeof(TheLexeme) * tnm),
*ptrwrds;
for (i = 0; i < d->nwrds; i++)
{
TSLexeme *ptr;
ptr = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
PointerGetDatum(d->subdict->dictData),
PointerGetDatum(d->wrds[i].lexeme),
Int32GetDatum(strlen(d->wrds[i].lexeme)),
PointerGetDatum(NULL)));
if (!(ptr && ptr->lexeme))
{
/* NULL: word unknown to the subdictionary; empty list: stop word */
if (!ptr)
elog(ERROR, "thesaurus word-sample \"%s\" isn't recognized by subdictionary (rule %d)",
d->wrds[i].lexeme, d->wrds[i].entries->idsubst + 1);
else
elog(NOTICE, "thesaurus word-sample \"%s\" is recognized as stop-word, assign any stop-word (rule %d)",
d->wrds[i].lexeme, d->wrds[i].entries->idsubst + 1);
newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, NULL, d->wrds[i].entries, 0);
}
else
{
/* walk the result one variant (run of equal nvariant) at a time */
while (ptr->lexeme)
{
TSLexeme *remptr = ptr + 1;
int tnvar = 1;
int curvar = ptr->nvariant;
/* compute n words in one variant */
while (remptr->lexeme)
{
if (remptr->nvariant != (remptr - 1)->nvariant)
break;
tnvar++;
remptr++;
}
/* add every lexeme of this variant, then move on to the next one */
remptr = ptr;
while (remptr->lexeme && remptr->nvariant == curvar)
{
newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, remptr, d->wrds[i].entries, tnvar);
remptr++;
}
ptr = remptr;
}
}
/* the raw word and its info are no longer needed */
pfree(d->wrds[i].lexeme);
pfree(d->wrds[i].entries);
}
pfree(d->wrds);
d->wrds = newwrds;
d->nwrds = nnw;
d->ntwrds = tnm;
if (d->nwrds > 1)
{
qsort(d->wrds, d->nwrds, sizeof(TheLexeme), cmpTheLexeme);
/* uniq */
/* newwrds is the last kept element, ptrwrds the element being examined */
newwrds = d->wrds;
ptrwrds = d->wrds + 1;
while (ptrwrds - d->wrds < d->nwrds)
{
if (cmpLexeme(ptrwrds, newwrds) == 0)
{
/* same lexeme: keep the info only if it differs from the list head */
if (cmpLexemeInfo(ptrwrds->entries, newwrds->entries))
{
ptrwrds->entries->nextentry = newwrds->entries;
newwrds->entries = ptrwrds->entries;
}
else
pfree(ptrwrds->entries);
if (ptrwrds->lexeme)
pfree(ptrwrds->lexeme);
}
else
{
newwrds++;
*newwrds = *ptrwrds;
}
ptrwrds++;
}
/* shrink the array to the number of distinct lexemes */
d->nwrds = newwrds - d->wrds + 1;
d->wrds = (TheLexeme *) repalloc(d->wrds, sizeof(TheLexeme) * d->nwrds);
}
}
/*
 * Replace the raw substitute words of every rule by their final lexemes.
 *
 * Words flagged DT_USEASIS are copied unchanged (with flags cleared); all
 * others are passed to the subdictionary.  A word the subdictionary does
 * not recognize is an error, a stop word is skipped with a NOTICE, and a
 * rule whose substitution ends up empty is an error.  The first lexeme
 * produced by each source word after the first output gets TSL_ADDPOS.
 */
static void
compileTheSubstitute(DictThesaurus * d)
{
int i;
for (i = 0; i < d->nsubst; i++)
{
/* rem keeps the raw list so it can be freed once the rule is rebuilt */
TSLexeme *rem = d->subst[i].res,
*outptr,
*inptr;
int n = 2;
outptr = d->subst[i].res = (TSLexeme *) palloc(sizeof(TSLexeme) * n);
outptr->lexeme = NULL;
inptr = rem;
while (inptr && inptr->lexeme)
{
TSLexeme *lexized,
tmplex[2];
if (inptr->flags & DT_USEASIS)
{ /* do not lexize */
tmplex[0] = *inptr;
tmplex[0].flags = 0;
tmplex[1].lexeme = NULL;
lexized = tmplex;
}
else
{
lexized = (TSLexeme *) DatumGetPointer(
FunctionCall4(
&(d->subdict->lexize),
PointerGetDatum(d->subdict->dictData),
PointerGetDatum(inptr->lexeme),
Int32GetDatum(strlen(inptr->lexeme)),
PointerGetDatum(NULL)
)
);
}
if (lexized && lexized->lexeme)
{
/* index of the first lexeme added for this word, or -1 if it is the very first output */
int toset = (lexized->lexeme && outptr != d->subst[i].res) ? (outptr - d->subst[i].res) : -1;
while (lexized->lexeme)
{
if (outptr - d->subst[i].res + 1 >= n)
{
/* grow the output and re-base outptr onto the new block */
int diff = outptr - d->subst[i].res;
n *= 2;
d->subst[i].res = (TSLexeme *) repalloc(d->subst[i].res, sizeof(TSLexeme) * n);
outptr = d->subst[i].res + diff;
}
*outptr = *lexized;
outptr->lexeme = pstrdup(lexized->lexeme);
outptr++;
lexized++;
}
if (toset > 0)
d->subst[i].res[toset].flags |= TSL_ADDPOS;
}
else if (lexized)
{
elog(NOTICE, "thesaurus word \"%s\" in substitution is a stop-word, ignored (rule %d)", inptr->lexeme, i + 1);
}
else
{
elog(ERROR, "thesaurus word \"%s\" in substitution isn't recognized (rule %d)", inptr->lexeme, i + 1);
}
if (inptr->lexeme)
pfree(inptr->lexeme);
inptr++;
}
if (outptr == d->subst[i].res)
elog(ERROR, "all words in thesaurus substitution are stop words (rule %d)", i + 1);
d->subst[i].reslen = outptr - d->subst[i].res;
pfree(rem);
}
}
/*
 * Build a thesaurus dictionary from a key/value configuration string.
 *
 * Recognized keys (case-insensitive): DictFile (the thesaurus file) and
 * Dictionary (name of the subdictionary used for normalization).  Both are
 * mandatory and may appear only once.  After reading the file, the sample
 * and substitute words are compiled with the subdictionary.
 */
Datum
thesaurus_init(PG_FUNCTION_ARGS)
{
	DictThesaurus *d;
	Map		   *cfg;
	Map		   *item;
	text	   *in;
	char	   *subdictname = NULL;
	bool		have_file = false;

	/* init functions must defend against NULLs for themselves */
	if (PG_ARGISNULL(0) || PG_GETARG_POINTER(0) == NULL)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("NULL config not allowed for Thesaurus")));

	in = PG_GETARG_TEXT_P(0);
	parse_keyvalpairs(in, &cfg);
	PG_FREE_IF_COPY(in, 0);

	d = (DictThesaurus *) palloc0(sizeof(DictThesaurus));

	/* the parsed option list ends with an entry whose key is NULL */
	for (item = cfg; item->key; item++)
	{
		if (pg_strcasecmp("DictFile", item->key) == 0)
		{
			if (have_file)
				ereport(ERROR,
						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						 errmsg("multiple DictFile parameters")));
			thesaurusRead(item->value, d);
			have_file = true;
		}
		else if (pg_strcasecmp("Dictionary", item->key) == 0)
		{
			if (subdictname)
				ereport(ERROR,
						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						 errmsg("multiple Dictionary parameters")));
			subdictname = pstrdup(item->value);
		}
		else
		{
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("unrecognized Thesaurus parameter: \"%s\"",
							item->key)));
		}

		pfree(item->key);
		pfree(item->value);
	}
	pfree(cfg);

	if (!have_file)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("missing DictFile parameter")));
	if (!subdictname)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("missing Dictionary parameter")));

	/* resolve the subdictionary, then normalize both sides of every rule */
	d->subdictOid = TSDictionaryGetDictid(stringToQualifiedNameList(subdictname), false);
	d->subdict = lookup_ts_dictionary_cache(d->subdictOid);

	compileTheLexeme(d);
	compileTheSubstitute(d);

	PG_RETURN_POINTER(d);
}
/*
 * Binary-search d->wrds for the given lexeme (NULL looks up the stop-word
 * entry) and return its list of occurrences, or NULL if it is not present.
 */
static LexemeInfo *
findTheLexeme(DictThesaurus * d, char *lexeme)
{
	TheLexeme	key;
	TheLexeme  *hit;

	if (d->nwrds == 0)
		return NULL;

	key.lexeme = lexeme;
	key.entries = NULL;

	hit = bsearch(&key, d->wrds, d->nwrds, sizeof(TheLexeme), cmpLexemeQ);

	return (hit != NULL) ? hit->entries : NULL;
}
/*
 * Report whether rule idsubst appears in the nextvariant chain starting at
 * stored.  An empty chain (NULL) matches everything.
 */
static bool
matchIdSubst(LexemeInfo * stored, uint16 idsubst)
{
	LexemeInfo *cur;

	if (stored == NULL)
		return true;

	for (cur = stored; cur != NULL; cur = cur->nextvariant)
	{
		if (cur->idsubst == idsubst)
			return true;
	}

	return false;
}
/*
 * Extend the candidate list "in" with rules that match the current variant.
 *
 * newin[0..newn-1] are cursors into the nextentry lists of the variant's
 * lexemes; the cursors are advanced in place.  A rule is accepted when every
 * cursor can be positioned on an entry with the same idsubst whose
 * posinsubst equals curpos and whose tnvariant equals newn, the rule passes
 * matchIdSubst against "stored", and it is not already present in "in".
 * Accepted entries are pushed onto the front of "in" through nextvariant.
 *
 * Returns the resulting list head as soon as any cursor is exhausted.
 * NOTE(review): the scan appears to rely on each nextentry list being
 * ordered by ascending idsubst — confirm against compileTheLexeme's sort.
 */
static LexemeInfo *
findVariant(LexemeInfo * in, LexemeInfo * stored, uint16 curpos, LexemeInfo ** newin, int newn)
{
for (;;)
{
int i;
/* ptr holds the candidate rule that all cursors must agree on */
LexemeInfo *ptr = newin[0];
for (i = 0; i < newn; i++)
{
/* catch this cursor up to the candidate rule */
while (newin[i] && newin[i]->idsubst < ptr->idsubst)
newin[i] = newin[i]->nextentry;
if (newin[i] == NULL)
return in;
if (newin[i]->idsubst > ptr->idsubst)
{
/* overshoot: adopt the larger rule and restart from cursor 0 */
ptr = newin[i];
i = -1;
continue;
}
/* same rule: look for an entry at the wanted position and variant size */
while (newin[i]->idsubst == ptr->idsubst)
{
if (newin[i]->posinsubst == curpos && newin[i]->tnvariant == newn)
{
ptr = newin[i];
break;
}
newin[i] = newin[i]->nextentry;
if (newin[i] == NULL)
return in;
}
if (newin[i]->idsubst != ptr->idsubst)
{
/* ran past the rule without a match: restart with the new rule */
ptr = newin[i];
i = -1;
continue;
}
}
if (i == newn && matchIdSubst(stored, ptr->idsubst) && (in == NULL || !matchIdSubst(in, ptr->idsubst)))
{ /* found */
ptr->nextvariant = in;
in = ptr;
}
/* step forward */
for (i = 0; i < newn; i++)
newin[i] = newin[i]->nextentry;
}
return NULL;
}
/*
 * Return a freshly allocated copy of a rule's substitution: reslen entries
 * with duplicated lexeme strings, followed by a NULL-lexeme terminator.
 */
static TSLexeme *
copyTSLexeme(TheSubstitute * ts)
{
	TSLexeme   *copy;
	uint16		idx;

	copy = (TSLexeme *) palloc(sizeof(TSLexeme) * (ts->reslen + 1));

	for (idx = 0; idx < ts->reslen; idx++)
	{
		/* copy all fields, then give the copy its own string */
		copy[idx] = ts->res[idx];
		copy[idx].lexeme = pstrdup(ts->res[idx].lexeme);
	}
	copy[ts->reslen].lexeme = NULL;

	return copy;
}
/*
 * Walk the candidate chain looking for a rule whose last sample lexeme is
 * at position curpos; return a copy of that rule's substitution, or NULL if
 * no candidate is complete yet.
 *
 * *moreres is set to true if any visited candidate has a successor in the
 * chain, and to false otherwise.
 */
static TSLexeme *
checkMatch(DictThesaurus * d, LexemeInfo * info, uint16 curpos, bool *moreres)
{
	LexemeInfo *cur;

	*moreres = false;

	for (cur = info; cur != NULL; cur = cur->nextvariant)
	{
		Assert(cur->idsubst < d->nsubst);

		if (cur->nextvariant)
			*moreres = true;

		if (d->subst[cur->idsubst].lastlexeme == curpos)
			return copyTSLexeme(d->subst + cur->idsubst);
	}

	return NULL;
}
/*
 * Lexize function of the thesaurus dictionary.
 *
 * Arguments: 0 = DictThesaurus state, 1 and 2 = the word and its length
 * (forwarded unchanged to the subdictionary), 3 = DictSubState carrying the
 * match state between consecutive calls.  A missing or NULL fourth argument
 * is an error.
 *
 * dstate->private holds the candidate list from the previous word; the
 * current word is matched at the position following it.  The function
 * returns a substitution when a rule is complete, and NULL otherwise;
 * dstate->getnext tells the caller whether further words should be fed in.
 */
Datum
thesaurus_lexize(PG_FUNCTION_ARGS)
{
DictThesaurus *d = (DictThesaurus *) PG_GETARG_POINTER(0);
DictSubState *dstate = (DictSubState *) PG_GETARG_POINTER(3);
TSLexeme *res = NULL;
LexemeInfo *stored,
*info = NULL;
uint16 curpos = 0;
bool moreres = false;
if (PG_NARGS() < 4 || dstate == NULL)
elog(ERROR, "forbidden call of thesaurus or nested call");
/* end of input: nothing more to report */
if (dstate->isend)
PG_RETURN_POINTER(NULL);
/* continue a partial match at the position after the previous word */
stored = (LexemeInfo *) dstate->private;
if (stored)
curpos = stored->posinsubst + 1;
/* refresh the cached subdictionary entry if it has been invalidated */
if (!d->subdict->isvalid)
d->subdict = lookup_ts_dictionary_cache(d->subdictOid);
res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
PointerGetDatum(d->subdict->dictData),
PG_GETARG_DATUM(1),
PG_GETARG_DATUM(2),
PointerGetDatum(NULL)));
if (res && res->lexeme)
{
TSLexeme *ptr = res,
*basevar;
/* process the subdictionary output one variant at a time */
while (ptr->lexeme)
{
uint16 nv = ptr->nvariant;
uint16 i,
nlex = 0;
LexemeInfo **infos;
basevar = ptr;
while (ptr->lexeme && nv == ptr->nvariant)
{
nlex++;
ptr++;
}
/* every lexeme of the variant must be a known sample word */
infos = (LexemeInfo **) palloc(sizeof(LexemeInfo *) * nlex);
for (i = 0; i < nlex; i++)
if ((infos[i] = findTheLexeme(d, basevar[i].lexeme)) == NULL)
break;
if (i < nlex)
{
/* no chance to find */
pfree(infos);
continue;
}
info = findVariant(info, stored, curpos, infos, nlex);
}
}
else if (res)
{ /* stop-word */
/* stop words are stored under a NULL lexeme */
LexemeInfo *infos = findTheLexeme(d, NULL);
info = findVariant(NULL, stored, curpos, &infos, 1);
}
else
{
info = NULL; /* word isn't recognized */
}
/* remember the candidates for the next call */
dstate->private = (void *) info;
if (!info)
{
dstate->getnext = false;
PG_RETURN_POINTER(NULL);
}
if ((res = checkMatch(d, info, curpos, &moreres)) != NULL)
{
/* a rule is complete; ask for more words only if longer candidates remain */
dstate->getnext = moreres;
PG_RETURN_POINTER(res);
}
/* partial match only: request the next word */
dstate->getnext = true;
PG_RETURN_POINTER(NULL);
}

236
src/backend/tsearch/regis.c Normal file
View File

@@ -0,0 +1,236 @@
/*-------------------------------------------------------------------------
*
* regis.c
* Fast regex subset
*
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/tsearch/regis.c,v 1.1 2007/08/21 01:11:18 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "tsearch/dicts/regis.h"
#include "tsearch/ts_locale.h"
/*
 * Quick test whether str uses only characters the regis subset can handle:
 * alphabetic characters, '[', ']' and '^'.  A NULL or empty string is
 * accepted.
 */
bool
RS_isRegis(const char *str)
{
	if (str == NULL)
		return true;

	for (; *str; str += pg_mblen(str))
	{
		bool		allowed = t_isalpha(str) ||
			t_iseq(str, '[') ||
			t_iseq(str, ']') ||
			t_iseq(str, '^');

		if (!allowed)
			return false;
	}

	return true;
}
/* Parser states used by RS_compile */
#define RS_IN_ONEOF 1 /* just after '[' */
#define RS_IN_ONEOF_IN 2 /* inside a [...] class, after its first character */
#define RS_IN_NONEOF 3 /* inside a [^...] class */
#define RS_IN_WAIT 4 /* outside any class, expecting a character or '[' */
/*
 * Allocate a zeroed RegisNode with room for len data bytes plus a
 * terminator, and link it after prev when prev is not NULL.
 */
static RegisNode *
newRegisNode(RegisNode * prev, int len)
{
	RegisNode  *node = (RegisNode *) palloc0(RNHDRSZ + len + 1);

	if (prev != NULL)
		prev->next = node;

	return node;
}
/*
 * Compile the regis pattern str into r.
 *
 * The pattern is a sequence of items, each matching one character: a single
 * alphabetic character, a class "[abc]" (one of), or a negated class
 * "[^abc]" (none of).  issuffix records whether the pattern is to be
 * matched against the end of a word.  On return r->node is the list of
 * items and r->nchar their count.
 *
 * Raises an error for any other character, and for a pattern that ends
 * inside an unclosed '[' class.
 */
void
RS_compile(Regis * r, bool issuffix, char *str)
{
	int			len = strlen(str);
	int			state = RS_IN_WAIT;
	char	   *c = (char *) str;
	RegisNode  *ptr = NULL;

	memset(r, 0, sizeof(Regis));
	r->issuffix = (issuffix) ? 1 : 0;

	while (*c)
	{
		if (state == RS_IN_WAIT)
		{
			if (t_isalpha(c))
			{
				/* a bare character is a one-element "one of" class */
				if (ptr)
					ptr = newRegisNode(ptr, len);
				else
					ptr = r->node = newRegisNode(NULL, len);
				COPYCHAR(ptr->data, c);
				ptr->type = RSF_ONEOF;
				ptr->len = pg_mblen(c);
			}
			else if (t_iseq(c, '['))
			{
				if (ptr)
					ptr = newRegisNode(ptr, len);
				else
					ptr = r->node = newRegisNode(NULL, len);
				ptr->type = RSF_ONEOF;
				state = RS_IN_ONEOF;
			}
			else
				ereport(ERROR,
						(errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
						 errmsg("invalid regis pattern: \"%s\"",
								str)));
		}
		else if (state == RS_IN_ONEOF)
		{
			if (t_iseq(c, '^'))
			{
				ptr->type = RSF_NONEOF;
				state = RS_IN_NONEOF;
			}
			else if (t_isalpha(c))
			{
				COPYCHAR(ptr->data, c);
				ptr->len = pg_mblen(c);
				state = RS_IN_ONEOF_IN;
			}
			else
				ereport(ERROR,
						(errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
						 errmsg("invalid regis pattern: \"%s\"",
								str)));
		}
		else if (state == RS_IN_ONEOF_IN || state == RS_IN_NONEOF)
		{
			if (t_isalpha(c))
			{
				/* append the character to the class collected so far */
				COPYCHAR(ptr->data + ptr->len, c);
				ptr->len += pg_mblen(c);
			}
			else if (t_iseq(c, ']'))
				state = RS_IN_WAIT;
			else
				ereport(ERROR,
						(errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
						 errmsg("invalid regis pattern: \"%s\"",
								str)));
		}
		else
			elog(ERROR, "internal error in RS_compile: state %d", state);

		c += pg_mblen(c);
	}

	/* the input ended inside a '[' class that was never closed */
	if (state != RS_IN_WAIT)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
				 errmsg("invalid regis pattern: \"%s\"",
						str)));

	/* count the items; this is the number of characters the pattern spans */
	ptr = r->node;
	while (ptr)
	{
		r->nchar++;
		ptr = ptr->next;
	}
}
/*
 * Release every node of a compiled pattern and reset r->node to NULL.
 */
void
RS_free(Regis * r)
{
	RegisNode  *node;
	RegisNode  *following;

	for (node = r->node; node != NULL; node = following)
	{
		/* fetch the link before the node is freed */
		following = node->next;
		pfree(node);
	}

	r->node = NULL;
}
#ifdef TS_USE_WIDE
/*
 * Multibyte-aware strchr: report whether the character at c occurs in str.
 * Characters are compared byte by byte, and only characters of the same
 * byte length are compared.
 */
static bool
mb_strchr(char *str, char *c)
{
	int			clen = pg_mblen(c);
	char	   *cur = str;

	while (*cur)
	{
		int			plen = pg_mblen(cur);

		if (plen == clen)
		{
			int			i;

			/* compare from the last byte down to the first */
			for (i = plen - 1; i >= 0; i--)
			{
				if (cur[i] != c[i])
					break;
			}
			if (i < 0)
				return true;	/* all bytes matched */
		}
		cur += plen;
	}

	return false;
}
#else
/* single-byte build: a plain strchr on the first byte of c suffices */
#define mb_strchr(s,c) ( (strchr((s),*(c)) == NULL) ? false : true )
#endif
/*
 * Match str against a compiled pattern.
 *
 * The string must contain at least r->nchar characters.  For a suffix
 * pattern the last r->nchar characters are tested, otherwise the first
 * r->nchar.  Returns true if every item accepts its character.
 */
bool
RS_execute(Regis * r, char *str)
{
	RegisNode  *node;
	char	   *c;
	int			nchars = 0;

	/* count characters, not bytes */
	for (c = str; *c; c += pg_mblen(c))
		nchars++;

	if (nchars < r->nchar)
		return false;

	c = str;
	if (r->issuffix)
	{
		/* skip the leading characters that the pattern does not cover */
		int			skip = nchars - r->nchar;

		while (skip-- > 0)
			c += pg_mblen(c);
	}

	for (node = r->node; node != NULL; node = node->next)
	{
		switch (node->type)
		{
			case RSF_ONEOF:
				if (!mb_strchr((char *) node->data, c))
					return false;
				break;
			case RSF_NONEOF:
				if (mb_strchr((char *) node->data, c))
					return false;
				break;
			default:
				elog(ERROR, "unrecognized regis node type: %d", node->type);
		}
		c += pg_mblen(c);
	}

	return true;
}

1747
src/backend/tsearch/spell.c Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,3 @@
skies sky
booking book
bookings book

View File

@@ -0,0 +1,20 @@
#
# Thesaurus config file. Character ':' separates string from replacement, eg
# sample-words : substitute-words
#
# Any substitute-word can be marked by preceding '*' character,
# which means do not lexize this word
# Docs: http://www.sai.msu.su/~megera/oddmuse/index.cgi/Thesaurus_dictionary
one two three : *123
one two : *12
one : *1
two : *2
#foo bar : blah blah
#f bar : fbar
#e bar : ebar
#g bar bar : gbarbar
#asd:sdffff
#qwerty:qwer wert erty

View File

@@ -0,0 +1,363 @@
/*-------------------------------------------------------------------------
*
* to_tsany.c
* to_ts* function definitions
*
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/tsearch/to_tsany.c,v 1.1 2007/08/21 01:11:18 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "catalog/namespace.h"
#include "tsearch/ts_cache.h"
#include "tsearch/ts_utils.h"
#include "utils/builtins.h"
#include "utils/syscache.h"
Datum
get_current_ts_config(PG_FUNCTION_ARGS)
{
PG_RETURN_OID(getTSCurrentConfig(true));
}
/*
* to_tsvector
*/
static int
compareWORD(const void *a, const void *b)
{
if (((ParsedWord *) a)->len == ((ParsedWord *) b)->len)
{
int res = strncmp(
((ParsedWord *) a)->word,
((ParsedWord *) b)->word,
((ParsedWord *) b)->len);
if (res == 0)
return (((ParsedWord *) a)->pos.pos > ((ParsedWord *) b)->pos.pos) ? 1 : -1;
return res;
}
return (((ParsedWord *) a)->len > ((ParsedWord *) b)->len) ? 1 : -1;
}
/*
 * uniqueWORD
 *		Sort the l parsed words in a[] and squeeze out duplicates in place.
 *
 * Each surviving entry gets a palloc'd position array in pos.apos, where
 * apos[0] is the number of positions stored and apos[1..] are the
 * positions (clamped with LIMITPOS).  The word strings of dropped
 * duplicates are pfree'd.  Returns the number of distinct words kept.
 */
static int
uniqueWORD(ParsedWord * a, int4 l)
{
	ParsedWord *last = a;		/* most recent distinct entry */
	ParsedWord *scan;
	int			curpos;

	/* a single element needs no sorting */
	if (l != 1)
		qsort((void *) a, l, sizeof(ParsedWord), compareWORD);

	/*
	 * Read pos.pos into a local before assigning pos.apos, as the original
	 * does (presumably the two members share storage -- confirm against the
	 * ParsedWord declaration).
	 */
	curpos = LIMITPOS(a->pos.pos);
	a->alen = 2;
	a->pos.apos = (uint16 *) palloc(sizeof(uint16) * a->alen);
	a->pos.apos[0] = 1;
	a->pos.apos[1] = curpos;

	for (scan = a + 1; scan - a < l; scan++)
	{
		bool		sameword = (scan->len == last->len &&
								strncmp(scan->word, last->word, last->len) == 0);

		if (!sameword)
		{
			/* start a new distinct entry */
			last++;
			last->len = scan->len;
			last->word = scan->word;
			curpos = LIMITPOS(scan->pos.pos);
			last->alen = 2;
			last->pos.apos = (uint16 *) palloc(sizeof(uint16) * last->alen);
			last->pos.apos[0] = 1;
			last->pos.apos[1] = curpos;
			continue;
		}

		/* duplicate: its text is no longer needed */
		pfree(scan->word);

		if (last->pos.apos[0] < MAXNUMPOS - 1 &&
			last->pos.apos[last->pos.apos[0]] != MAXENTRYPOS - 1)
		{
			/* grow the position array when the next slot would not fit */
			if (last->pos.apos[0] + 1 >= last->alen)
			{
				last->alen *= 2;
				last->pos.apos = (uint16 *) repalloc(last->pos.apos,
											   sizeof(uint16) * last->alen);
			}

			/* append the position unless it repeats the last stored one */
			if (last->pos.apos[0] == 0 ||
				last->pos.apos[last->pos.apos[0]] != LIMITPOS(scan->pos.pos))
			{
				last->pos.apos[last->pos.apos[0] + 1] = LIMITPOS(scan->pos.pos);
				last->pos.apos[0]++;
			}
		}
	}

	return last + 1 - a;
}
/*
 * make_tsvector
 *		Build a tsvector value from parsed text.
 *
 * prs->words is deduplicated with uniqueWORD() first.  The result is a
 * single palloc0'd chunk: an array of WordEntry (one per distinct word)
 * followed by the string area holding each word's bytes and, for words
 * that carry positions, a uint16 count plus that many WordEntryPos items.
 *
 * Every word string, every position array and prs->words itself are
 * pfree'd here, so the caller must not touch them afterwards.
 * Raises an error if the string area offset exceeds MAXSTRPOS.
 */
TSVector
make_tsvector(ParsedText *prs)
{
int4 i,
j,
lenstr = 0,
totallen;
TSVector in;
WordEntry *ptr;
char *str,
*cur;
prs->curwords = uniqueWORD(prs->words, prs->curwords);
/* first pass: size of the string area (words plus position lists) */
for (i = 0; i < prs->curwords; i++)
{
lenstr += SHORTALIGN(prs->words[i].len);
if (prs->words[i].alen)
lenstr += sizeof(uint16) + prs->words[i].pos.apos[0] * sizeof(WordEntryPos);
}
totallen = CALCDATASIZE(prs->curwords, lenstr);
in = (TSVector) palloc0(totallen);
SET_VARSIZE(in, totallen);
in->size = prs->curwords;
ptr = ARRPTR(in);
cur = str = STRPTR(in);
/* second pass: fill one WordEntry per word and copy its data */
for (i = 0; i < prs->curwords; i++)
{
ptr->len = prs->words[i].len;
if (cur - str > MAXSTRPOS)
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("string is too long for tsvector")));
ptr->pos = cur - str;
memcpy((void *) cur, (void *) prs->words[i].word, prs->words[i].len);
pfree(prs->words[i].word);
cur += SHORTALIGN(prs->words[i].len);
if (prs->words[i].alen)
{
WordEntryPos *wptr;
ptr->haspos = 1;
/* apos[0] holds the number of positions that follow */
*(uint16 *) cur = prs->words[i].pos.apos[0];
wptr = POSDATAPTR(in, ptr);
for (j = 0; j < *(uint16 *) cur; j++)
{
WEP_SETWEIGHT(wptr[j], 0);
WEP_SETPOS(wptr[j], prs->words[i].pos.apos[j + 1]);
}
cur += sizeof(uint16) + prs->words[i].pos.apos[0] * sizeof(WordEntryPos);
pfree(prs->words[i].pos.apos);
}
else
ptr->haspos = 0;
ptr++;
}
pfree(prs->words);
return in;
}
/*
 * to_tsvector_byid
 *		SQL-callable: convert text (arg 1) to a tsvector using the text
 *		search configuration whose OID is arg 0.
 *
 * When parsing yields no words an empty tsvector is returned.
 */
Datum
to_tsvector_byid(PG_FUNCTION_ARGS)
{
	Oid			cfgId = PG_GETARG_OID(0);
	text	   *in = PG_GETARG_TEXT_P(1);
	ParsedText	prs;
	TSVector	out;

	/* rough guess at the number of words; never start with zero slots */
	prs.lenwords = (VARSIZE(in) - VARHDRSZ) / 6;
	if (prs.lenwords == 0)
		prs.lenwords = 2;
	prs.curwords = 0;
	prs.pos = 0;
	prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords);

	parsetext(cfgId, &prs, VARDATA(in), VARSIZE(in) - VARHDRSZ);
	PG_FREE_IF_COPY(in, 1);

	if (prs.curwords == 0)
	{
		/* nothing indexable: hand back an empty vector */
		pfree(prs.words);
		out = palloc(CALCDATASIZE(0, 0));
		SET_VARSIZE(out, CALCDATASIZE(0, 0));
		out->size = 0;
		PG_RETURN_POINTER(out);
	}

	out = make_tsvector(&prs);
	PG_RETURN_POINTER(out);
}
/*
 * to_tsvector
 *		SQL-callable: like to_tsvector_byid, using the configuration
 *		returned by getTSCurrentConfig(true).
 */
Datum
to_tsvector(PG_FUNCTION_ARGS)
{
	text	   *document = PG_GETARG_TEXT_P(0);
	Oid			config = getTSCurrentConfig(true);
	Datum		vector;

	vector = DirectFunctionCall2(to_tsvector_byid,
								 ObjectIdGetDatum(config),
								 PointerGetDatum(document));
	PG_RETURN_DATUM(vector);
}
/*
* to_tsquery
*/
/*
 * pushval_morph
 *		Value-pushing callback handed to parse_tsquery() by the to_tsquery
 *		family: runs the operand text through parsetext() with the
 *		configuration in state->cfg_id and pushes the resulting lexemes.
 *
 * The parsed words are consumed in order and grouped by position and,
 * within a position, by variant number:
 *	- lexemes of one variant are joined with '&'
 *	- variants at one position are joined with '|'
 *	- successive positions are joined with '&'
 * If parsing produces no words, a VALSTOP placeholder is pushed instead.
 *
 * typeval is not used by this function.
 */
static void
pushval_morph(TSQueryParserState * state, int typeval, char *strval, int lenval, int2 weight)
{
int4 count = 0;
ParsedText prs;
uint32 variant,
pos,
cntvar = 0,
cntpos = 0,
cnt = 0;
prs.lenwords = 4;
prs.curwords = 0;
prs.pos = 0;
prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords);
parsetext(state->cfg_id, &prs, strval, lenval);
if (prs.curwords > 0)
{
while (count < prs.curwords)
{
/* one iteration per distinct position */
pos = prs.words[count].pos.pos;
cntvar = 0;
while (count < prs.curwords && pos == prs.words[count].pos.pos)
{
/* one iteration per variant at this position */
variant = prs.words[count].nvariant;
cnt = 0;
while (count < prs.curwords && pos == prs.words[count].pos.pos && variant == prs.words[count].nvariant)
{
pushval_asis(state, VAL, prs.words[count].word, prs.words[count].len, weight);
pfree(prs.words[count].word);
/* operator is pushed after its second operand */
if (cnt)
pushquery(state, OPR, (int4) '&', 0, 0, 0);
cnt++;
count++;
}
if (cntvar)
pushquery(state, OPR, (int4) '|', 0, 0, 0);
cntvar++;
}
if (cntpos)
pushquery(state, OPR, (int4) '&', 0, 0, 0);
cntpos++;
}
pfree(prs.words);
}
else
pushval_asis(state, VALSTOP, NULL, 0, 0);
}
/*
 * to_tsquery_byid
 *		SQL-callable: parse text (arg 1) into a tsquery, normalizing the
 *		operands with the configuration whose OID is arg 0.
 *
 * After parsing, clean_fakeval() is applied; if it returns NULL the
 * query is truncated to an empty one.
 */
Datum
to_tsquery_byid(PG_FUNCTION_ARGS)
{
	Oid			cfgid = PG_GETARG_OID(0);
	text	   *in = PG_GETARG_TEXT_P(1);
	char	   *querystr = TextPGetCString(in);
	TSQuery		query;
	QueryItem  *cleaned;
	int4		nitems;

	query = parse_tsquery(querystr, pushval_morph, cfgid, false);

	if (query->size == 0)
		PG_RETURN_TSQUERY(query);

	cleaned = clean_fakeval(GETQUERY(query), &nitems);
	if (cleaned == NULL)
	{
		/* nothing survived cleaning: shrink to a header-only query */
		SET_VARSIZE(query, HDRSIZETQ);
		query->size = 0;
		PG_RETURN_POINTER(query);
	}

	memcpy((void *) GETQUERY(query), (void *) cleaned,
		   nitems * sizeof(QueryItem));
	pfree(cleaned);

	PG_RETURN_TSQUERY(query);
}
/*
 * to_tsquery
 *		SQL-callable: like to_tsquery_byid, using the configuration
 *		returned by getTSCurrentConfig(true).
 */
Datum
to_tsquery(PG_FUNCTION_ARGS)
{
	text	   *querytext = PG_GETARG_TEXT_P(0);
	Oid			config = getTSCurrentConfig(true);
	Datum		query;

	query = DirectFunctionCall2(to_tsquery_byid,
								ObjectIdGetDatum(config),
								PointerGetDatum(querytext));
	PG_RETURN_DATUM(query);
}
/*
 * plainto_tsquery_byid
 *		SQL-callable: same processing as to_tsquery_byid, except that
 *		parse_tsquery() is called with its last argument set to true.
 */
Datum
plainto_tsquery_byid(PG_FUNCTION_ARGS)
{
	Oid			cfgid = PG_GETARG_OID(0);
	text	   *in = PG_GETARG_TEXT_P(1);
	char	   *querystr = TextPGetCString(in);
	TSQuery		query;
	QueryItem  *cleaned;
	int4		nitems;

	query = parse_tsquery(querystr, pushval_morph, cfgid, true);

	if (query->size == 0)
		PG_RETURN_TSQUERY(query);

	cleaned = clean_fakeval(GETQUERY(query), &nitems);
	if (cleaned == NULL)
	{
		/* nothing survived cleaning: shrink to a header-only query */
		SET_VARSIZE(query, HDRSIZETQ);
		query->size = 0;
		PG_RETURN_POINTER(query);
	}

	memcpy((void *) GETQUERY(query), (void *) cleaned,
		   nitems * sizeof(QueryItem));
	pfree(cleaned);

	PG_RETURN_POINTER(query);
}
/*
 * plainto_tsquery
 *		SQL-callable: like plainto_tsquery_byid, using the configuration
 *		returned by getTSCurrentConfig(true).
 */
Datum
plainto_tsquery(PG_FUNCTION_ARGS)
{
	text	   *querytext = PG_GETARG_TEXT_P(0);
	Oid			config = getTSCurrentConfig(true);
	Datum		query;

	query = DirectFunctionCall2(plainto_tsquery_byid,
								ObjectIdGetDatum(config),
								PointerGetDatum(querytext));
	PG_RETURN_DATUM(query);
}

View File

@@ -0,0 +1,241 @@
/*-------------------------------------------------------------------------
*
* ts_locale.c
* locale compatibility layer for tsearch
*
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/tsearch/ts_locale.c,v 1.1 2007/08/21 01:11:18 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "tsearch/ts_locale.h"
#include "tsearch/ts_public.h"
#ifdef TS_USE_WIDE
#ifdef WIN32
/*
 * wchar2char (WIN32 only)
 *		Convert a wide-character string to the server encoding.
 *
 * For a UTF8 database the conversion is done with WideCharToMultiByte(),
 * and a failure is reported as an error; for any other encoding
 * wcstombs() is used.  len is the size of the output buffer; the return
 * value is whatever the underlying conversion routine returned.
 */
size_t
wchar2char(char *to, const wchar_t *from, size_t len)
{
	int			nbytes;

	if (len == 0)
		return 0;

	if (GetDatabaseEncoding() != PG_UTF8)
		return wcstombs(to, from, len);

	nbytes = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len,
								 NULL, NULL);
	if (nbytes == 0)
		ereport(ERROR,
				(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
				 errmsg("UTF-16 to UTF-8 translation failed: %lu",
						GetLastError())));
	Assert(nbytes <= len);

	return nbytes;
}
#endif /* WIN32 */
/*
 * char2wchar
 *		Convert len bytes of server-encoded text at 'from' into wide
 *		characters stored at 'to'.
 *
 * 'from' need not be zero-terminated.  The same 'len' is also used as the
 * output limit handed to the conversion routines, so 'to' must have room
 * for at least len wide characters (plus a terminator on the C-locale
 * path, see below).
 *
 * Returns the value produced by the underlying conversion routine; on the
 * mbstowcs() path that can be (size_t) -1 for an invalid sequence.
 */
size_t
char2wchar(wchar_t *to, const char *from, size_t len)
{
if (len == 0)
return 0;
#ifdef WIN32
if (GetDatabaseEncoding() == PG_UTF8)
{
int r;
r = MultiByteToWideChar(CP_UTF8, 0, from, len, to, len);
if (!r)
{
/* let pg_verifymbstr report a bad sequence itself, if that is the cause */
pg_verifymbstr(from, len, false);
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
errmsg("invalid multibyte character for locale"),
errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
}
Assert(r <= len);
return r;
}
else
#endif /* WIN32 */
if (lc_ctype_is_c())
{
/*
* pg_mb2wchar_with_len always adds trailing '\0', so 'to' should be
* allocated with sufficient space
*/
return pg_mb2wchar_with_len(from, (pg_wchar *) to, len);
}
else
{
/*
* mbstowcs requires a zero-terminated source, so work on a
* terminated copy of the first len bytes
*/
char *str = pnstrdup(from, len);
size_t tolen;
tolen = mbstowcs(to, str, len);
pfree(str);
return tolen;
}
}
/*
 * _t_isalpha
 *		Locale-aware isalpha() for the (possibly multibyte) character that
 *		ptr points at.
 *
 * In the C locale the first byte is classified with isalpha().  Otherwise
 * the whole character is converted to wchar_t and classified with
 * iswalpha().  char2wchar() takes the input length in bytes, so the full
 * pg_mblen() of the character must be passed; passing 1 would hand it a
 * truncated multibyte sequence.
 *
 * Returns 0 if the character cannot be converted.
 */
int
_t_isalpha(const char *ptr)
{
	/*
	 * char2wchar() also uses the byte count as its output limit, so keep
	 * room for more wide characters than any single multibyte character
	 * has bytes.
	 */
	wchar_t		character[8];
	int			clen;
	size_t		nconv;

	if (lc_ctype_is_c())
		return isalpha(TOUCHAR(ptr));

	clen = pg_mblen(ptr);
	if (clen >= (int) (sizeof(character) / sizeof(character[0])))
		return 0;

	nconv = char2wchar(character, ptr, clen);
	if (nconv == 0 || nconv == (size_t) -1)
		return 0;				/* empty or unconvertible: not alphabetic */

	return iswalpha((wint_t) character[0]);
}
/*
 * _t_isprint
 *		Locale-aware isprint() for the (possibly multibyte) character that
 *		ptr points at.
 *
 * In the C locale the first byte is classified with isprint().  Otherwise
 * the whole character is converted to wchar_t and classified with
 * iswprint().  char2wchar() takes the input length in bytes, so the full
 * pg_mblen() of the character must be passed; passing 1 would hand it a
 * truncated multibyte sequence.
 *
 * Returns 0 if the character cannot be converted.
 */
int
_t_isprint(const char *ptr)
{
	/*
	 * char2wchar() also uses the byte count as its output limit, so keep
	 * room for more wide characters than any single multibyte character
	 * has bytes.
	 */
	wchar_t		character[8];
	int			clen;
	size_t		nconv;

	if (lc_ctype_is_c())
		return isprint(TOUCHAR(ptr));

	clen = pg_mblen(ptr);
	if (clen >= (int) (sizeof(character) / sizeof(character[0])))
		return 0;

	nconv = char2wchar(character, ptr, clen);
	if (nconv == 0 || nconv == (size_t) -1)
		return 0;				/* empty or unconvertible: not printable */

	return iswprint((wint_t) character[0]);
}
#endif /* TS_USE_WIDE */
/*
 * recode_and_lowerstr
 *		Convert a zero-terminated UTF8 string to the database encoding and
 *		return a lower-cased copy of the converted text.
 *
 * The intermediate converted string is released when the conversion
 * routine allocated a new one.  The result comes from lowerstr().
 */
char *
recode_and_lowerstr(char *str)
{
	char	   *converted;
	char	   *lowered;

	converted = (char *) pg_do_encoding_conversion((unsigned char *) str,
												   strlen(str),
												   PG_UTF8,
												   GetDatabaseEncoding());
	if (converted == NULL)
		elog(ERROR, "encoding conversion failed");

	lowered = lowerstr(converted);

	/* the converter hands back its input when no conversion was needed */
	if (converted != str)
		pfree(converted);

	return lowered;
}
/*
 * lowerstr
 *		Lower-case a zero-terminated string; see lowerstr_with_len().
 */
char *
lowerstr(char *str)
{
	int			nbytes = strlen(str);

	return lowerstr_with_len(str, nbytes);
}
/*
 * lowerstr_with_len
 *		Return a palloc'd, zero-terminated, lower-cased copy of the first
 *		len bytes of str.
 *
 * With TS_USE_WIDE, a multibyte database encoding and a non-C LC_CTYPE,
 * the text is converted to wchar_t, folded with towlower() and converted
 * back.  Otherwise each byte is folded with tolower(), stopping early at
 * a zero byte.
 */
char *
lowerstr_with_len(char *str, int len)
{
char *ptr = str;
char *out;
if (len == 0)
return pstrdup("");
#ifdef TS_USE_WIDE
/*
* Use wide char code only when max encoding length > 1 and ctype != C.
* Some operating systems fail with multi-byte encodings and a C locale.
* Also, for a C locale there is no need to process as multibyte. From
* backend/utils/adt/oracle_compat.c Teodor
*/
if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c())
{
wchar_t *wstr,
*wptr;
int wlen;
/*
* Allocate wchar_t's for the worst case: the number of characters
* cannot exceed len, the number of bytes.  One extra wchar_t holds
* the terminating zero, because wchar2char (really wcstombs) wants
* a zero-terminated string.
*/
wptr = wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1));
/*
* wlen receives the number of converted characters
*/
wlen = char2wchar(wstr, str, len);
if (wlen < 0)
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
errmsg("translation failed from server encoding to wchar_t")));
Assert(wlen <= len);
wstr[wlen] = 0;
while (*wptr)
{
*wptr = towlower((wint_t) *wptr);
wptr++;
}
/*
* Alloc result string for worst case + '\0'
*/
len = sizeof(char) * pg_database_encoding_max_length() *(wlen + 1);
out = (char *) palloc(len);
/*
* wlen now is number of bytes which is always >= number of characters
*/
wlen = wchar2char(out, wstr, len);
pfree(wstr);
if (wlen < 0)
ereport(ERROR,
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
errmsg("translation failed from wchar_t to server encoding %d", errno)));
Assert(wlen <= len);
out[wlen] = '\0';
}
else
#endif
{
/* byte-wise path */
char *outptr;
outptr = out = (char *) palloc(sizeof(char) * (len + 1));
while (*ptr && ptr - str < len)
{
*outptr++ = tolower(*(unsigned char *) ptr);
ptr++;
}
*outptr = '\0';
}
return out;
}

View File

@@ -0,0 +1,626 @@
/*-------------------------------------------------------------------------
*
* ts_parse.c
* main parse functions for tsearch
*
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/tsearch/ts_parse.c,v 1.1 2007/08/21 01:11:18 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "tsearch/ts_cache.h"
#include "tsearch/ts_public.h"
#include "tsearch/ts_utils.h"
/*
 * When defined, parsetext() and hlparsetext() skip a token whose length is
 * MAXSTRLEN or more after issuing a NOTICE; when undefined they raise an
 * ERROR instead.
 */
#define IGNORE_LONGLEXEME 1
/*
* Lexize subsystem
*/
/*
 * One token waiting to be (or already) lexized, kept in a singly linked
 * list (see ListParsedLex).
 */
typedef struct ParsedLex
{
int type; /* token type as returned by the parser's prstoken method */
char *lemm; /* token text; the pointer is stored as given, not copied */
int lenlemm; /* length of lemm as reported by the parser */
bool resfollow; /* NOTE(review): not referenced in the code visible here */
struct ParsedLex *next; /* next list member, NULL at the tail */
} ParsedLex;
/*
 * Singly linked list of ParsedLex; head and tail are both NULL when the
 * list is empty.
 */
typedef struct ListParsedLex
{
ParsedLex *head; /* first element, removed by LPLRemoveHead */
ParsedLex *tail; /* last element, appended to by LPLAddTail */
} ListParsedLex;
/*
 * Working state of the lexize machinery (see LexizeExec).
 */
typedef struct
{
TSConfigCacheEntry *cfg; /* configuration supplying the token-type -> dictionaries map */
Oid curDictId; /* InvalidOid in one-word mode, else the dictionary that asked for more words */
int posDict; /* index in the current map's dictionary list to resume from */
DictSubState dictState; /* state passed to the dictionary's lexize function */
ParsedLex *curSub; /* next token to feed to curDictId in multiword mode */
ListParsedLex towork; /* current list to work */
ListParsedLex waste; /* list of lexemes that already lexized */
/*
* Fields storing the last result variant of a multiword dictionary
* (basically thesaurus or similar, which wants several lexemes)
*/
ParsedLex *lastRes; /* token at which tmpRes was produced */
TSLexeme *tmpRes; /* result saved while the dictionary asks for more */
} LexizeData;
/*
 * LexizeInit
 *		Put a LexizeData into its initial state for configuration cfg:
 *		no current dictionary, empty work and waste lists, no saved result.
 */
static void
LexizeInit(LexizeData * ld, TSConfigCacheEntry * cfg)
{
	ld->cfg = cfg;

	/* start in one-word mode at the first dictionary */
	ld->curDictId = InvalidOid;
	ld->posDict = 0;
	ld->curSub = NULL;

	/* both token lists are empty */
	ld->towork.head = NULL;
	ld->towork.tail = NULL;
	ld->waste.head = NULL;
	ld->waste.tail = NULL;

	/* no pending multiword result */
	ld->lastRes = NULL;
	ld->tmpRes = NULL;
}
/*
 * LPLAddTail
 *		Append newpl to the end of list; newpl becomes the tail and its
 *		next pointer is cleared.
 */
static void
LPLAddTail(ListParsedLex * list, ParsedLex * newpl)
{
	if (list->tail == NULL)
	{
		/* empty list: the new element is both ends */
		list->head = newpl;
		list->tail = newpl;
	}
	else
	{
		list->tail->next = newpl;
		list->tail = newpl;
	}

	newpl->next = NULL;
}
/*
 * LPLRemoveHead
 *		Detach and return the first element of list, or NULL if the list
 *		is empty.  The tail is cleared when the list becomes empty.
 */
static ParsedLex *
LPLRemoveHead(ListParsedLex * list)
{
	ParsedLex  *first = list->head;

	if (first != NULL)
		list->head = first->next;

	if (list->head == NULL)
		list->tail = NULL;

	return first;
}
/*
 * LexizeAddLemm
 *		Queue one parser token for lexizing.
 *
 * A new ParsedLex is allocated, filled from the arguments (lemm is stored
 * by pointer, not copied) and appended to ld->towork.  ld->curSub is set
 * to the new tail.
 */
static void
LexizeAddLemm(LexizeData * ld, int type, char *lemm, int lenlemm)
{
	/* allocate exactly once; a second palloc here would leak the first */
	ParsedLex  *newpl = (ParsedLex *) palloc(sizeof(ParsedLex));

	newpl->type = type;
	newpl->lemm = lemm;
	newpl->lenlemm = lenlemm;
	LPLAddTail(&ld->towork, newpl);
	ld->curSub = ld->towork.tail;
}
/*
 * RemoveHead
 *		Move the first token of ld->towork to the end of ld->waste and
 *		restart dictionary scanning at index 0.
 */
static void
RemoveHead(LexizeData * ld)
{
	ParsedLex  *done = LPLRemoveHead(&ld->towork);

	LPLAddTail(&ld->waste, done);
	ld->posDict = 0;
}
/*
 * setCorrLex
 *		Dispose of the waste list.
 *
 * If correspondLexem is not NULL the list is handed to the caller through
 * it (the caller then owns the nodes); otherwise every node is pfree'd.
 * Either way ld->waste ends up empty.
 */
static void
setCorrLex(LexizeData * ld, ParsedLex ** correspondLexem)
{
	if (correspondLexem != NULL)
		*correspondLexem = ld->waste.head;
	else
	{
		ParsedLex  *cur = ld->waste.head;

		while (cur != NULL)
		{
			ParsedLex  *following = cur->next;

			pfree(cur);
			cur = following;
		}
	}

	ld->waste.head = NULL;
	ld->waste.tail = NULL;
}
/*
 * moveToWaste
 *		Move tokens from the front of ld->towork to ld->waste, up to and
 *		including 'stop'.  When 'stop' is reached, ld->curSub is set to
 *		the token that followed it.
 */
static void
moveToWaste(LexizeData * ld, ParsedLex * stop)
{
	while (ld->towork.head != NULL)
	{
		bool		reached = (ld->towork.head == stop);

		/* must read stop->next before RemoveHead relinks the node */
		if (reached)
			ld->curSub = stop->next;

		RemoveHead(ld);

		if (reached)
			break;
	}
}
/*
 * setNewTmpRes
 *		Remember 'res' as the pending multiword result obtained at token
 *		'lex', freeing any previously saved result (its lexeme strings and
 *		the array itself).
 */
static void
setNewTmpRes(LexizeData * ld, ParsedLex * lex, TSLexeme * res)
{
	TSLexeme   *old = ld->tmpRes;

	if (old != NULL)
	{
		TSLexeme   *item = old;

		/* the array ends at the entry whose lexeme is NULL */
		while (item->lexeme)
		{
			pfree(item->lexeme);
			item++;
		}
		pfree(old);
	}

	ld->tmpRes = res;
	ld->lastRes = lex;
}
/*
 * LexizeExec
 *		Run the queued tokens (ld->towork) through the dictionaries mapped
 *		to their token types and return the next array of normalized
 *		lexemes, or NULL when nothing more can be produced right now.
 *
 * Two modes, selected by ld->curDictId:
 *	- InvalidOid: each token is offered in turn to the dictionaries listed
 *	  for its type until one returns a result.
 *	- valid OID: that dictionary has asked (dictState.getnext) to see the
 *	  following tokens; they are fed to it starting from ld->curSub.
 *
 * Consumed tokens are moved to ld->waste.  Before returning, setCorrLex()
 * either hands the waste list to the caller via correspondLexem or frees
 * it when correspondLexem is NULL.
 */
static TSLexeme *
LexizeExec(LexizeData * ld, ParsedLex ** correspondLexem)
{
int i;
ListDictionary *map;
TSDictionaryCacheEntry *dict;
TSLexeme *res;
if (ld->curDictId == InvalidOid)
{
/*
* usual mode: dictionary wants only one word, but we should keep in
* mind that we should go through all stack
*/
while (ld->towork.head)
{
ParsedLex *curVal = ld->towork.head;
map = ld->cfg->map + curVal->type;
if (curVal->type == 0 || curVal->type >= ld->cfg->lenmap || map->len == 0)
{
/* skip this type of lexeme */
RemoveHead(ld);
continue;
}
for (i = ld->posDict; i < map->len; i++)
{
dict = lookup_ts_dictionary_cache(map->dictIds[i]);
ld->dictState.isend = ld->dictState.getnext = false;
ld->dictState.private = NULL;
res = (TSLexeme *) DatumGetPointer(FunctionCall4(
&(dict->lexize),
PointerGetDatum(dict->dictData),
PointerGetDatum(curVal->lemm),
Int32GetDatum(curVal->lenlemm),
PointerGetDatum(&ld->dictState)
));
if (ld->dictState.getnext)
{
/*
* dictionary wants next word, so setup and store current
* position and go to multiword mode
*/
ld->curDictId = DatumGetObjectId(map->dictIds[i]);
ld->posDict = i + 1;
ld->curSub = curVal->next;
if (res)
setNewTmpRes(ld, curVal, res);
return LexizeExec(ld, correspondLexem);
}
if (!res) /* dictionary doesn't know this lexeme */
continue;
RemoveHead(ld);
setCorrLex(ld, correspondLexem);
return res;
}
/* no dictionary recognized the token: drop it and go on */
RemoveHead(ld);
}
}
else
{ /* curDictId is valid */
dict = lookup_ts_dictionary_cache(ld->curDictId);
/*
* Dictionary ld->curDictId asks us about following words
*/
while (ld->curSub)
{
ParsedLex *curVal = ld->curSub;
map = ld->cfg->map + curVal->type;
if (curVal->type != 0)
{
bool dictExists = false;
if (curVal->type >= ld->cfg->lenmap || map->len == 0)
{
/* skip this type of lexeme */
ld->curSub = curVal->next;
continue;
}
/*
* We should be sure that the current type of lexeme is
* recognized by our dictionary: we just check whether it
* exists in the list of dictionaries for this type.
*/
for (i = 0; i < map->len && !dictExists; i++)
if (ld->curDictId == DatumGetObjectId(map->dictIds[i]))
dictExists = true;
if (!dictExists)
{
/*
* Dictionary can't work with current type of lexeme,
* return to basic mode and redo all stored lexemes
*/
ld->curDictId = InvalidOid;
return LexizeExec(ld, correspondLexem);
}
}
/* a token of type 0 tells the dictionary that input has ended */
ld->dictState.isend = (curVal->type == 0) ? true : false;
ld->dictState.getnext = false;
res = (TSLexeme *) DatumGetPointer(FunctionCall4(
&(dict->lexize),
PointerGetDatum(dict->dictData),
PointerGetDatum(curVal->lemm),
Int32GetDatum(curVal->lenlemm),
PointerGetDatum(&ld->dictState)
));
if (ld->dictState.getnext)
{
/* Dictionary wants one more */
ld->curSub = curVal->next;
if (res)
setNewTmpRes(ld, curVal, res);
continue;
}
if (res || ld->tmpRes)
{
/*
* Dictionary normalizes lexemes, so we remove from stack all
* used lexemes, return to basic mode and redo end of stack
* (if it exists)
*/
if (res)
{
moveToWaste(ld, ld->curSub);
}
else
{
/* fall back to the result saved at an earlier token */
res = ld->tmpRes;
moveToWaste(ld, ld->lastRes);
}
/* reset to initial state */
ld->curDictId = InvalidOid;
ld->posDict = 0;
ld->lastRes = NULL;
ld->tmpRes = NULL;
setCorrLex(ld, correspondLexem);
return res;
}
/*
* Dictionary doesn't want the next lexeme and didn't recognize
* anything; redo from ld->towork.head
*/
ld->curDictId = InvalidOid;
return LexizeExec(ld, correspondLexem);
}
}
setCorrLex(ld, correspondLexem);
return NULL;
}
/*
 * parsetext
 *		Parse buf (buflen bytes) with the parser of configuration cfgId,
 *		lexize every token, and append the resulting lexemes to prs->words.
 *
 * prs->words is enlarged with repalloc as needed.  Each stored word takes
 * over the lexeme string returned by the dictionary (the pointer is stored,
 * not copied) and records the current position, clamped with LIMITPOS.
 * prs->pos advances once per returned lexeme array, and once more for each
 * lexeme flagged TSL_ADDPOS.
 */
void
parsetext(Oid cfgId, ParsedText * prs, char *buf, int4 buflen)
{
int type,
lenlemm;
char *lemm = NULL;
LexizeData ldata;
TSLexeme *norms;
TSConfigCacheEntry *cfg;
TSParserCacheEntry *prsobj;
void *prsdata;
cfg = lookup_ts_config_cache(cfgId);
prsobj = lookup_ts_parser_cache(cfg->prsId);
prsdata = (void *) DatumGetPointer(FunctionCall2(&prsobj->prsstart,
PointerGetDatum(buf),
Int32GetDatum(buflen)));
LexizeInit(&ldata, cfg);
do
{
type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
PointerGetDatum(prsdata),
PointerGetDatum(&lemm),
PointerGetDatum(&lenlemm)));
if (type > 0 && lenlemm >= MAXSTRLEN)
{
#ifdef IGNORE_LONGLEXEME
ereport(NOTICE,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("word is too long to be indexed"),
errdetail("Words longer than %d characters are ignored.",
MAXSTRLEN)));
/* jumps to the loop test; type > 0 here, so parsing goes on */
continue;
#else
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("word is too long to be indexed")));
#endif
}
/* the final token (type <= 0) is queued too, before the loop ends */
LexizeAddLemm(&ldata, type, lemm, lenlemm);
while ((norms = LexizeExec(&ldata, NULL)) != NULL)
{
TSLexeme *ptr = norms;
prs->pos++; /* set pos */
while (ptr->lexeme)
{
if (prs->curwords == prs->lenwords)
{
prs->lenwords *= 2;
prs->words = (ParsedWord *) repalloc((void *) prs->words, prs->lenwords * sizeof(ParsedWord));
}
if (ptr->flags & TSL_ADDPOS)
prs->pos++;
prs->words[prs->curwords].len = strlen(ptr->lexeme);
prs->words[prs->curwords].word = ptr->lexeme;
prs->words[prs->curwords].nvariant = ptr->nvariant;
prs->words[prs->curwords].alen = 0;
prs->words[prs->curwords].pos.pos = LIMITPOS(prs->pos);
ptr++;
prs->curwords++;
}
/* only the array is freed; the lexeme strings now belong to prs->words */
pfree(norms);
}
} while (type > 0);
FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
}
/*
 * Headline framework
 */

/*
 * hladdword
 *		Append one token to the headline word array, growing the array when
 *		it is full.  The new entry is zeroed, then given the token type, the
 *		length and a palloc'd copy of the buflen bytes at buf (the copy is
 *		not zero-terminated).
 */
static void
hladdword(HeadlineText * prs, char *buf, int4 buflen, int type)
{
	HeadlineWord *slot;

	while (prs->curwords >= prs->lenwords)
	{
		prs->lenwords *= 2;
		prs->words = (HeadlineWord *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWord));
	}

	slot = &(prs->words[prs->curwords]);
	memset(slot, 0, sizeof(HeadlineWord));
	slot->type = (uint8) type;
	slot->len = buflen;
	slot->word = palloc(buflen);
	memcpy(slot->word, buf, buflen);

	prs->curwords++;
}
/*
 * hlfinditem
 *		Link the most recently added headline word to every query operand
 *		equal to the lexeme buf/buflen.
 *
 * The first matching operand is stored in the word's item field.  For each
 * further match a copy of the word is appended with its own item and
 * repeated = 1.  The array is grown up front so that query->size extra
 * entries always fit.
 */
static void
hlfinditem(HeadlineText * prs, TSQuery query, char *buf, int buflen)
{
	QueryItem  *item;
	HeadlineWord *word;
	int			i;

	while (prs->curwords + query->size >= prs->lenwords)
	{
		prs->lenwords *= 2;
		prs->words = (HeadlineWord *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWord));
	}

	/* take the address only after the array can no longer move */
	word = &(prs->words[prs->curwords - 1]);

	for (i = 0, item = GETQUERY(query); i < query->size; i++, item++)
	{
		if (item->type != VAL)
			continue;
		if (item->length != buflen)
			continue;
		if (strncmp(GETOPERAND(query) + item->distance, buf, buflen) != 0)
			continue;

		if (!word->item)
		{
			word->item = item;
			continue;
		}

		/* the word already has an operand: add a repeated copy */
		memcpy(&(prs->words[prs->curwords]), word, sizeof(HeadlineWord));
		prs->words[prs->curwords].item = item;
		prs->words[prs->curwords].repeated = 1;
		prs->curwords++;
	}
}
/*
 * addHLParsedLex
 *		Add the source tokens in 'lexs' to the headline and associate each
 *		of them with the query operands matching the lexemes in 'norms'.
 *
 * Both inputs are consumed: every ParsedLex node is pfree'd, and when
 * norms is not NULL its lexeme strings and the array are pfree'd too.
 */
static void
addHLParsedLex(HeadlineText * prs, TSQuery query, ParsedLex * lexs, TSLexeme * norms)
{
	TSLexeme   *lexeme;

	while (lexs != NULL)
	{
		ParsedLex  *following;

		if (lexs->type > 0)
			hladdword(prs, lexs->lemm, lexs->lenlemm, lexs->type);

		for (lexeme = norms; lexeme && lexeme->lexeme; lexeme++)
			hlfinditem(prs, query, lexeme->lexeme, strlen(lexeme->lexeme));

		following = lexs->next;
		pfree(lexs);
		lexs = following;
	}

	if (norms == NULL)
		return;

	for (lexeme = norms; lexeme->lexeme; lexeme++)
		pfree(lexeme->lexeme);
	pfree(norms);
}
/*
 * hlparsetext
 *		Headline counterpart of parsetext(): parse buf (buflen bytes) with
 *		the parser of configuration cfgId, lexize every token, and pass the
 *		original tokens together with their normalized lexemes to
 *		addHLParsedLex(), which records them in prs and matches them
 *		against query.
 */
void
hlparsetext(Oid cfgId, HeadlineText * prs, TSQuery query, char *buf, int4 buflen)
{
int type,
lenlemm;
char *lemm = NULL;
LexizeData ldata;
TSLexeme *norms;
ParsedLex *lexs;
TSConfigCacheEntry *cfg;
TSParserCacheEntry *prsobj;
void *prsdata;
cfg = lookup_ts_config_cache(cfgId);
prsobj = lookup_ts_parser_cache(cfg->prsId);
prsdata = (void *) DatumGetPointer(FunctionCall2(&(prsobj->prsstart),
PointerGetDatum(buf),
Int32GetDatum(buflen)));
LexizeInit(&ldata, cfg);
do
{
type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
PointerGetDatum(prsdata),
PointerGetDatum(&lemm),
PointerGetDatum(&lenlemm)));
if (type > 0 && lenlemm >= MAXSTRLEN)
{
#ifdef IGNORE_LONGLEXEME
ereport(NOTICE,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("word is too long to be indexed"),
errdetail("Words longer than %d characters are ignored.",
MAXSTRLEN)));
/* jumps to the loop test; type > 0 here, so parsing goes on */
continue;
#else
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("word is too long to be indexed")));
#endif
}
LexizeAddLemm(&ldata, type, lemm, lenlemm);
/*
* Unlike parsetext(), the consumed tokens are requested back (lexs) so
* they can be shown in the headline even when no lexemes came out.
*/
do
{
if ((norms = LexizeExec(&ldata, &lexs)) != NULL)
addHLParsedLex(prs, query, lexs, norms);
else
addHLParsedLex(prs, query, lexs, NULL);
} while (norms);
} while (type > 0);
FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
}
/*
 * generatHeadline
 *		Assemble the headline text from the words in prs.
 *
 * Words with 'in' set and 'repeated' clear are emitted: a word flagged
 * 'replace' becomes a single space, any other word is copied verbatim and,
 * if 'selected', wrapped in prs->startsel / prs->stopsel.  Words that are
 * not emitted and not 'repeated' have their text pfree'd.
 *
 * Returns a palloc'd text value whose size is set to the bytes written.
 */
text *
generatHeadline(HeadlineText * prs)
{
text *out;
int len = 128;
char *ptr;
HeadlineWord *wrd = prs->words;
out = (text *) palloc(len);
ptr = ((char *) out) + VARHDRSZ;
while (wrd - prs->words < prs->curwords)
{
/* make room for the word plus both selection markers */
while (wrd->len + prs->stopsellen + prs->startsellen + (ptr - ((char *) out)) >= len)
{
/* repalloc may move the buffer, so keep the offset, not the pointer */
int dist = ptr - ((char *) out);
len *= 2;
out = (text *) repalloc(out, len);
ptr = ((char *) out) + dist;
}
if (wrd->in && !wrd->repeated)
{
if (wrd->replace)
{
*ptr = ' ';
ptr++;
}
else
{
if (wrd->selected)
{
memcpy(ptr, prs->startsel, prs->startsellen);
ptr += prs->startsellen;
}
memcpy(ptr, wrd->word, wrd->len);
ptr += wrd->len;
if (wrd->selected)
{
memcpy(ptr, prs->stopsel, prs->stopsellen);
ptr += prs->stopsellen;
}
}
}
else if (!wrd->repeated)
pfree(wrd->word);
wrd++;
}
SET_VARSIZE(out, ptr - ((char *) out));
return out;
}

View File

@@ -0,0 +1,330 @@
/*-------------------------------------------------------------------------
*
* ts_utils.c
* various support functions
*
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/tsearch/ts_utils.c,v 1.1 2007/08/21 01:11:18 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <ctype.h>
#include "miscadmin.h"
#include "storage/fd.h"
#include "tsearch/ts_locale.h"
#include "tsearch/ts_public.h"
#include "tsearch/ts_utils.h"
#include "utils/builtins.h"
/*
 * Scanner states used by parse_keyvalpairs()
 */
#define CS_WAITKEY 0 /* before a key: whitespace allowed, alpha starts the key */
#define CS_INKEY 1 /* inside a key */
#define CS_WAITEQ 2 /* key finished, waiting for '=' */
#define CS_WAITVALUE 3 /* after '=', waiting for the value to start */
#define CS_INVALUE 4 /* inside a double-quoted value */
#define CS_IN2VALUE 5 /* inside an unquoted value */
#define CS_WAITDELIM 6 /* value finished, waiting for ',' */
#define CS_INESC 7 /* one escaped character; next state is CS_INVALUE */
#define CS_IN2ESC 8 /* one escaped character; next state is CS_IN2VALUE */
/*
 * nstrdup
 *		Return a palloc'd, zero-terminated copy of the first len bytes of
 *		ptr with backslash escapes removed: each backslash is dropped and
 *		the character after it is kept literally.
 *
 * The copy is compacted in place, so the write cursor never runs ahead of
 * the read cursor.  Two details matter for safety:
 *	- the character length is taken before anything is moved and the move
 *	  uses memmove, because once an escape has been dropped source and
 *	  destination overlap and copying first could clobber the lead byte
 *	  the length is derived from;
 *	- a backslash that is the very last byte has nothing to escape and is
 *	  dropped, instead of stepping the read cursor past the terminator.
 */
static char *
nstrdup(char *ptr, int len)
{
	char	   *res = palloc(len + 1),
			   *cptr;

	memcpy(res, ptr, len);
	res[len] = '\0';

	cptr = ptr = res;
	while (*ptr)
	{
		int			clen;

		if (t_iseq(ptr, '\\'))
		{
			ptr++;
			if (*ptr == '\0')
				break;			/* dangling backslash at end of input */
		}

		clen = pg_mblen(ptr);
		if (cptr != ptr)
			memmove(cptr, ptr, clen);
		cptr += clen;
		ptr += clen;
	}
	*cptr = '\0';

	return res;
}
/*
 * parse_keyvalpairs
 *		Parse a parameter string consisting of "key = value" clauses
 *		separated by commas.
 *
 * Keys consist of alphabetic characters.  A value is either enclosed in
 * double quotes or runs up to the next whitespace or comma; in both forms
 * a backslash escapes the following character.
 *
 * *m receives a palloc'd, zero-filled array of Map entries with the
 * palloc'd key and value strings; the entry after the last pair has NULL
 * fields and serves as terminator.
 *
 * Raises a syntax error for malformed input.
 */
void
parse_keyvalpairs(text *in, Map ** m)
{
	Map		   *mptr;
	char	   *ptr = VARDATA(in),
			   *begin = NULL;
	int			num = 0;		/* int, not char: many commas must not wrap */
	int			state = CS_WAITKEY;

	/* count commas to get an upper bound on the number of pairs */
	while (ptr - VARDATA(in) < VARSIZE(in) - VARHDRSZ)
	{
		if (t_iseq(ptr, ','))
			num++;
		ptr += pg_mblen(ptr);
	}

	/* one pair more than commas, plus the terminating entry */
	*m = mptr = (Map *) palloc(sizeof(Map) * (num + 2));
	memset(mptr, 0, sizeof(Map) * (num + 2));

	ptr = VARDATA(in);
	while (ptr - VARDATA(in) < VARSIZE(in) - VARHDRSZ)
	{
		if (state == CS_WAITKEY)
		{
			if (t_isalpha(ptr))
			{
				begin = ptr;
				state = CS_INKEY;
			}
			else if (!t_isspace(ptr))
				ereport(ERROR,
						(errcode(ERRCODE_SYNTAX_ERROR),
						 errmsg("invalid parameter list format: \"%s\"",
								TextPGetCString(in))));
		}
		else if (state == CS_INKEY)
		{
			if (t_isspace(ptr))
			{
				mptr->key = nstrdup(begin, ptr - begin);
				state = CS_WAITEQ;
			}
			else if (t_iseq(ptr, '='))
			{
				mptr->key = nstrdup(begin, ptr - begin);
				state = CS_WAITVALUE;
			}
			else if (!t_isalpha(ptr))
				ereport(ERROR,
						(errcode(ERRCODE_SYNTAX_ERROR),
						 errmsg("invalid parameter list format: \"%s\"",
								TextPGetCString(in))));
		}
		else if (state == CS_WAITEQ)
		{
			if (t_iseq(ptr, '='))
				state = CS_WAITVALUE;
			else if (!t_isspace(ptr))
				ereport(ERROR,
						(errcode(ERRCODE_SYNTAX_ERROR),
						 errmsg("invalid parameter list format: \"%s\"",
								TextPGetCString(in))));
		}
		else if (state == CS_WAITVALUE)
		{
			if (t_iseq(ptr, '"'))
			{
				begin = ptr + 1;
				state = CS_INVALUE;
			}
			else if (!t_isspace(ptr))
			{
				begin = ptr;
				state = CS_IN2VALUE;
			}
		}
		else if (state == CS_INVALUE)
		{
			if (t_iseq(ptr, '"'))
			{
				mptr->value = nstrdup(begin, ptr - begin);
				mptr++;
				state = CS_WAITDELIM;
			}
			else if (t_iseq(ptr, '\\'))
				state = CS_INESC;
		}
		else if (state == CS_IN2VALUE)
		{
			if (t_isspace(ptr) || t_iseq(ptr, ','))
			{
				mptr->value = nstrdup(begin, ptr - begin);
				mptr++;
				state = (t_iseq(ptr, ',')) ? CS_WAITKEY : CS_WAITDELIM;
			}
			else if (t_iseq(ptr, '\\'))
			{
				/*
				 * Escape inside an unquoted value: must come back to
				 * CS_IN2VALUE afterwards, not to the quoted-value state.
				 */
				state = CS_IN2ESC;
			}
		}
		else if (state == CS_WAITDELIM)
		{
			if (t_iseq(ptr, ','))
				state = CS_WAITKEY;
			else if (!t_isspace(ptr))
				ereport(ERROR,
						(errcode(ERRCODE_SYNTAX_ERROR),
						 errmsg("invalid parameter list format: \"%s\"",
								TextPGetCString(in))));
		}
		else if (state == CS_INESC)
			state = CS_INVALUE;
		else if (state == CS_IN2ESC)
			state = CS_IN2VALUE;
		else
			elog(ERROR, "unrecognized parse_keyvalpairs state: %d", state);

		ptr += pg_mblen(ptr);
	}

	if (state == CS_IN2VALUE)
	{
		/* input ended inside an unquoted value: that value is complete */
		mptr->value = nstrdup(begin, ptr - begin);
		mptr++;
	}
	else if (!(state == CS_WAITDELIM || state == CS_WAITKEY))
		ereport(ERROR,
				(errcode(ERRCODE_SYNTAX_ERROR),
				 errmsg("invalid parameter list format: \"%s\"",
						TextPGetCString(in))));
}
/*
 * get_tsearch_config_filename
 *		Build the full path of a tsearch configuration file from its base
 *		name and extension:  <sharedir>/tsearch_data/<basename>.<extension>
 *
 * The base name is treated as user-supplied and must consist solely of
 * alphabetic characters; anything else (notably '/') is rejected with an
 * error so that nothing outside tsearch_data can be reached.  The
 * extension is trusted.
 *
 * The result is a palloc'd string of MAXPGPATH bytes.
 */
char *
get_tsearch_config_filename(const char *basename,
							const char *extension)
{
	char		sharepath[MAXPGPATH];
	char	   *fullpath;
	const char *ch = basename;

	/* same restriction as applied to timezonesets names */
	while (*ch != '\0')
	{
		if (!isalpha((unsigned char) *ch))
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("invalid text search configuration file name \"%s\"",
							basename)));
		ch++;
	}

	get_share_path(my_exec_path, sharepath);

	fullpath = palloc(MAXPGPATH);
	snprintf(fullpath, MAXPGPATH, "%s/tsearch_data/%s.%s",
			 sharepath, basename, extension);

	return fullpath;
}
#define STOPBUFLEN 4096

/*
 * readstoplist
 *		Load a stop-word list into *s.
 *
 * 'in' is the base name of the stop-word file; it is resolved with
 * get_tsearch_config_filename(in, "stop").  Each line contributes its
 * first whitespace-delimited word; empty lines are skipped.  Every word
 * is checked with pg_verifymbstr and then stored either through
 * s->wordop (when set) or as a pstrdup'd copy.
 *
 * If 'in' is NULL or empty, the list is left empty (s->len = 0,
 * s->stop = NULL).  Errors are raised when the file cannot be opened or
 * contains an invalid multibyte sequence.
 */
void
readstoplist(char *in, StopList * s)
{
	char	  **stop = NULL;

	s->len = 0;
	if (in && *in)
	{
		char	   *filename = get_tsearch_config_filename(in, "stop");
		FILE	   *hin;
		char		buf[STOPBUFLEN];
		int			reallen = 0;
		int			line = 0;

		if ((hin = AllocateFile(filename, "r")) == NULL)
			ereport(ERROR,
					(errcode(ERRCODE_CONFIG_FILE_ERROR),
					 errmsg("could not open stopword file \"%s\": %m",
							filename)));

		while (fgets(buf, STOPBUFLEN, hin))
		{
			char	   *pbuf = buf;

			line++;

			/*
			 * Cut the line at the first whitespace.  The byte must go to
			 * isspace() as unsigned char: bytes of multibyte characters
			 * are negative as plain char on many platforms, which is
			 * undefined behavior for the <ctype.h> functions.
			 */
			while (*pbuf && !isspace((unsigned char) *pbuf))
				pbuf++;
			*pbuf = '\0';

			if (*buf == '\0')
				continue;

			if (!pg_verifymbstr(buf, strlen(buf), true))
			{
				FreeFile(hin);
				ereport(ERROR,
						(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
						 errmsg("invalid multibyte encoding at line %d in file \"%s\"",
								line, filename)));
			}

			/* grow the pointer array geometrically */
			if (s->len >= reallen)
			{
				if (reallen == 0)
				{
					reallen = 16;
					stop = (char **) palloc(sizeof(char *) * reallen);
				}
				else
				{
					reallen *= 2;
					stop = (char **) repalloc((void *) stop, sizeof(char *) * reallen);
				}
			}

			if (s->wordop)
				stop[s->len] = s->wordop(buf);
			else
				stop[s->len] = pstrdup(buf);

			(s->len)++;
		}
		FreeFile(hin);
		pfree(filename);
	}

	s->stop = stop;
}
/*
 * comparestr
 *		qsort/bsearch comparator for arrays of C strings: both arguments
 *		point at a "char *" element, and the strings are ordered by strcmp.
 */
static int
comparestr(const void *a, const void *b)
{
	const char *left = *(char *const *) a;
	const char *right = *(char *const *) b;

	return strcmp(left, right);
}
/*
 * sortstoplist
 *		Sort the stop words of s with comparestr so that searchstoplist()
 *		can use bsearch.  Does nothing for an empty list.
 */
void
sortstoplist(StopList * s)
{
	if (!s->stop || s->len <= 0)
		return;

	qsort(s->stop, s->len, sizeof(char *), comparestr);
}
/*
 * searchstoplist
 *		Return true if key is found in the stop-word list by bsearch with
 *		comparestr; false for an empty list or no match.
 */
bool
searchstoplist(StopList * s, char *key)
{
	if (!s->stop || s->len <= 0)
		return false;

	if (bsearch(&key, s->stop, s->len, sizeof(char *), comparestr))
		return true;

	return false;
}
/*
 * pnstrdup
 *		Return a palloc'd, zero-terminated copy of the first len bytes
 *		of in.
 */
char *
pnstrdup(const char *in, int len)
{
	char	   *copy = palloc(len + 1);

	copy[len] = '\0';
	memcpy(copy, in, len);

	return copy;
}

View File

@@ -0,0 +1,360 @@
/*-------------------------------------------------------------------------
*
* wparser.c
* Standard interface to word parser
*
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/tsearch/wparser.c,v 1.1 2007/08/21 01:11:18 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "funcapi.h"
#include "access/genam.h"
#include "access/heapam.h"
#include "access/skey.h"
#include "catalog/indexing.h"
#include "catalog/namespace.h"
#include "catalog/pg_ts_parser.h"
#include "catalog/pg_type.h"
#include "tsearch/ts_cache.h"
#include "tsearch/ts_public.h"
#include "tsearch/ts_utils.h"
#include "utils/builtins.h"
#include "utils/fmgroids.h"
#include "utils/rel.h"
#include "utils/syscache.h"
/******sql-level interface******/
/*
 * Per-query state of the ts_token_type set-returning functions, kept in
 * funcctx->user_fctx between calls.
 */
typedef struct
{
int cur; /* index of the next list entry to return */
LexDescr *list; /* array returned by the parser's lextype method; the entry with lexid == 0 ends it */
} TSTokenTypeStorage;
/*
 * First-call setup shared by the ts_token_type functions.
 *
 * Looks up parser "prsid", raises an error if it has no lextype method,
 * then (in the multi-call memory context) fetches the parser's token type
 * list, stores the iteration state in funcctx->user_fctx, and builds the
 * attribute metadata for the (tokid int4, alias text, description text)
 * result rows.
 */
static void
tt_setup_firstcall(FuncCallContext *funcctx, Oid prsid)
{
	TSParserCacheEntry *parser = lookup_ts_parser_cache(prsid);
	TSTokenTypeStorage *state;
	MemoryContext callerctx;
	TupleDesc	rowdesc;
	Datum		lexlist;

	if (!OidIsValid(parser->lextypeOid))
		elog(ERROR, "method lextype isn't defined for text search parser %u",
			 prsid);

	/* everything below must survive across calls */
	callerctx = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);

	state = (TSTokenTypeStorage *) palloc(sizeof(TSTokenTypeStorage));
	state->cur = 0;

	/* there is no OidFunctionCall0, so pass a single dummy argument */
	lexlist = OidFunctionCall1(parser->lextypeOid, (Datum) 0);
	state->list = (LexDescr *) DatumGetPointer(lexlist);

	funcctx->user_fctx = (void *) state;

	rowdesc = CreateTemplateTupleDesc(3, false);
	TupleDescInitEntry(rowdesc, (AttrNumber) 1, "tokid",
					   INT4OID, -1, 0);
	TupleDescInitEntry(rowdesc, (AttrNumber) 2, "alias",
					   TEXTOID, -1, 0);
	TupleDescInitEntry(rowdesc, (AttrNumber) 3, "description",
					   TEXTOID, -1, 0);
	funcctx->attinmeta = TupleDescGetAttInMetadata(rowdesc);

	MemoryContextSwitchTo(callerctx);
}
/*
 * Per-call worker shared by the ts_token_type functions.
 *
 * Returns the next (tokid, alias, description) row as a Datum, or
 * (Datum) 0 when there is nothing left.  The list ends at the first entry
 * whose lexid is zero.  Each entry's alias and descr strings are pfree'd
 * once its tuple has been built; when iteration finishes, the list array
 * (if any) and the state struct are released as well.
 */
static Datum
tt_process_call(FuncCallContext *funcctx)
{
	TSTokenTypeStorage *state = (TSTokenTypeStorage *) funcctx->user_fctx;
	LexDescr   *entry;
	char	   *fields[3];
	char		idbuf[16];
	HeapTuple	row;
	Datum		rowdatum;

	/* end of data: clean up and signal completion */
	if (state->list == NULL || state->list[state->cur].lexid == 0)
	{
		if (state->list)
			pfree(state->list);
		pfree(state);
		return (Datum) 0;
	}

	entry = &state->list[state->cur];

	sprintf(idbuf, "%d", entry->lexid);
	fields[0] = idbuf;
	fields[1] = entry->alias;
	fields[2] = entry->descr;

	row = BuildTupleFromCStrings(funcctx->attinmeta, fields);
	rowdatum = HeapTupleGetDatum(row);

	/* the strings are no longer needed once the tuple exists */
	pfree(fields[1]);
	pfree(fields[2]);
	state->cur++;

	return rowdatum;
}
/*
 * Set-returning function: list the token types of a text search parser.
 *
 * Argument 0 is the parser's OID.  Rows are produced by tt_process_call
 * until it returns (Datum) 0.
 */
Datum
ts_token_type_byid(PG_FUNCTION_ARGS)
{
	FuncCallContext *funcctx;
	Datum		row;

	if (SRF_IS_FIRSTCALL())
	{
		funcctx = SRF_FIRSTCALL_INIT();
		tt_setup_firstcall(funcctx, PG_GETARG_OID(0));
	}

	funcctx = SRF_PERCALL_SETUP();

	row = tt_process_call(funcctx);
	if (row != (Datum) 0)
	{
		SRF_RETURN_NEXT(funcctx, row);
	}

	SRF_RETURN_DONE(funcctx);
}
Datum
ts_token_type_byname(PG_FUNCTION_ARGS)
{
FuncCallContext *funcctx;
Datum result;
if (SRF_IS_FIRSTCALL())
{
text *prsname = PG_GETARG_TEXT_P(0);
Oid prsId;
funcctx = SRF_FIRSTCALL_INIT();
prsId = TSParserGetPrsid(textToQualifiedNameList(prsname), false);
tt_setup_firstcall(funcctx, prsId);
}
funcctx = SRF_PERCALL_SETUP();
if ((result = tt_process_call(funcctx)) != (Datum) 0)
SRF_RETURN_NEXT(funcctx, result);
SRF_RETURN_DONE(funcctx);
}
/*
 * One token collected by prs_setup_firstcall for the ts_parse functions.
 */
typedef struct
{
/* value returned by the parser's prstoken call for this token */
int type;
/* palloc'd, NUL-terminated copy of the token text */
char *lexeme;
} LexemeEntry;
/*
 * Cross-call state for the ts_parse set-returning functions.
 * It is stored in funcctx->user_fctx by prs_setup_firstcall and consumed
 * by prs_process_call.
 */
typedef struct
{
/*
 * while collecting: number of entries filled so far;
 * while returning rows: index of the next entry to return
 */
int cur;
/*
 * while collecting: allocated size of list, in entries;
 * afterwards: number of entries actually collected
 */
int len;
/* array of collected tokens */
LexemeEntry *list;
} PrsStorage;
/*
 * First-call setup shared by the ts_parse functions.
 *
 * Runs parser "prsid" over the contents of "txt" and collects every token
 * (type plus a NUL-terminated copy of its text) into a PrsStorage that is
 * kept in funcctx->user_fctx.  Also builds the attribute metadata for the
 * (tokid int4, token text) result rows.  All allocations are made in the
 * multi-call memory context.
 */
static void
prs_setup_firstcall(FuncCallContext *funcctx, Oid prsid, text *txt)
{
	TSParserCacheEntry *parser = lookup_ts_parser_cache(prsid);
	PrsStorage *state;
	MemoryContext callerctx;
	TupleDesc	rowdesc;
	void	   *parserstate;
	char	   *lex = NULL;
	int			llen = 0,
				type = 0;

	callerctx = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);

	state = (PrsStorage *) palloc(sizeof(PrsStorage));
	state->cur = 0;
	state->len = 16;
	state->list = (LexemeEntry *) palloc(sizeof(LexemeEntry) * state->len);

	parserstate = (void *) DatumGetPointer(FunctionCall2(&parser->prsstart,
											PointerGetDatum(VARDATA(txt)),
								Int32GetDatum(VARSIZE(txt) - VARHDRSZ)));

	for (;;)
	{
		LexemeEntry *slot;

		type = DatumGetInt32(FunctionCall3(&parser->prstoken,
										   PointerGetDatum(parserstate),
										   PointerGetDatum(&lex),
										   PointerGetDatum(&llen)));
		/* a zero token type marks the end of the input */
		if (type == 0)
			break;

		/* double the array when it is full */
		if (state->cur >= state->len)
		{
			state->len = 2 * state->len;
			state->list = (LexemeEntry *) repalloc(state->list,
										  sizeof(LexemeEntry) * state->len);
		}

		slot = &state->list[state->cur];
		slot->lexeme = palloc(llen + 1);
		memcpy(slot->lexeme, lex, llen);
		slot->lexeme[llen] = '\0';
		slot->type = type;
		state->cur++;
	}

	FunctionCall1(&parser->prsend, PointerGetDatum(parserstate));

	/* switch from "fill" bookkeeping to "read" bookkeeping */
	state->len = state->cur;
	state->cur = 0;

	funcctx->user_fctx = (void *) state;

	rowdesc = CreateTemplateTupleDesc(2, false);
	TupleDescInitEntry(rowdesc, (AttrNumber) 1, "tokid",
					   INT4OID, -1, 0);
	TupleDescInitEntry(rowdesc, (AttrNumber) 2, "token",
					   TEXTOID, -1, 0);
	funcctx->attinmeta = TupleDescGetAttInMetadata(rowdesc);

	MemoryContextSwitchTo(callerctx);
}
/*
 * Per-call worker shared by the ts_parse functions.
 *
 * Returns the next (tokid, token) row as a Datum, or (Datum) 0 when all
 * collected tokens have been returned.  Each token string is pfree'd once
 * its tuple has been built; at the end the token array (if any) and the
 * state struct are released.
 */
static Datum
prs_process_call(FuncCallContext *funcctx)
{
	PrsStorage *state = (PrsStorage *) funcctx->user_fctx;
	LexemeEntry *entry;
	char	   *fields[2];
	char		idbuf[16];
	HeapTuple	row;
	Datum		rowdatum;

	/* nothing left: clean up and signal completion */
	if (state->cur >= state->len)
	{
		if (state->list)
			pfree(state->list);
		pfree(state);
		return (Datum) 0;
	}

	entry = &state->list[state->cur];

	fields[0] = idbuf;
	sprintf(idbuf, "%d", entry->type);
	fields[1] = entry->lexeme;

	row = BuildTupleFromCStrings(funcctx->attinmeta, fields);
	rowdatum = HeapTupleGetDatum(row);

	/* the token text is no longer needed once the tuple exists */
	pfree(fields[1]);
	state->cur++;

	return rowdatum;
}
/*
 * Set-returning function: split a text value into tokens with a given
 * text search parser.
 *
 * Argument 0 is the parser's OID and argument 1 is the text to parse.
 * All tokens are collected on the first call; later calls just hand them
 * out one row at a time.
 */
Datum
ts_parse_byid(PG_FUNCTION_ARGS)
{
	FuncCallContext *funcctx;
	Datum		row;

	if (SRF_IS_FIRSTCALL())
	{
		text	   *input = PG_GETARG_TEXT_P(1);

		funcctx = SRF_FIRSTCALL_INIT();
		prs_setup_firstcall(funcctx, PG_GETARG_OID(0), input);
		PG_FREE_IF_COPY(input, 1);
	}

	funcctx = SRF_PERCALL_SETUP();

	row = prs_process_call(funcctx);
	if (row != (Datum) 0)
	{
		SRF_RETURN_NEXT(funcctx, row);
	}

	SRF_RETURN_DONE(funcctx);
}
Datum
ts_parse_byname(PG_FUNCTION_ARGS)
{
FuncCallContext *funcctx;
Datum result;
if (SRF_IS_FIRSTCALL())
{
text *prsname = PG_GETARG_TEXT_P(0);
text *txt = PG_GETARG_TEXT_P(1);
Oid prsId;
funcctx = SRF_FIRSTCALL_INIT();
prsId = TSParserGetPrsid(textToQualifiedNameList(prsname), false);
prs_setup_firstcall(funcctx, prsId, txt);
}
funcctx = SRF_PERCALL_SETUP();
if ((result = prs_process_call(funcctx)) != (Datum) 0)
SRF_RETURN_NEXT(funcctx, result);
SRF_RETURN_DONE(funcctx);
}
Datum
ts_headline_byid_opt(PG_FUNCTION_ARGS)
{
text *in = PG_GETARG_TEXT_P(1);
TSQuery query = PG_GETARG_TSQUERY(2);
text *opt = (PG_NARGS() > 3 && PG_GETARG_POINTER(3)) ? PG_GETARG_TEXT_P(3) : NULL;
HeadlineText prs;
text *out;
TSConfigCacheEntry *cfg;
TSParserCacheEntry *prsobj;
cfg = lookup_ts_config_cache(PG_GETARG_OID(0));
prsobj = lookup_ts_parser_cache(cfg->prsId);
memset(&prs, 0, sizeof(HeadlineText));
prs.lenwords = 32;
prs.words = (HeadlineWord *) palloc(sizeof(HeadlineWord) * prs.lenwords);
hlparsetext(cfg->cfgId, &prs, query, VARDATA(in), VARSIZE(in) - VARHDRSZ);
FunctionCall3(&(prsobj->prsheadline),
PointerGetDatum(&prs),
PointerGetDatum(opt),
PointerGetDatum(query));
out = generatHeadline(&prs);
PG_FREE_IF_COPY(in, 1);
PG_FREE_IF_COPY(query, 2);
if (opt)
PG_FREE_IF_COPY(opt, 3);
pfree(prs.words);
pfree(prs.startsel);
pfree(prs.stopsel);
PG_RETURN_POINTER(out);
}
/*
 * Headline for (configuration OID, document, query) without options.
 *
 * Forwards its three arguments unchanged to ts_headline_byid_opt.
 */
Datum
ts_headline_byid(PG_FUNCTION_ARGS)
{
	Datum		cfgid = PG_GETARG_DATUM(0);
	Datum		document = PG_GETARG_DATUM(1);
	Datum		tsq = PG_GETARG_DATUM(2);

	PG_RETURN_DATUM(DirectFunctionCall3(ts_headline_byid_opt,
										cfgid, document, tsq));
}
/*
 * Headline for (document, query) without options.
 *
 * Uses the configuration returned by getTSCurrentConfig(true) and
 * forwards to ts_headline_byid_opt.
 */
Datum
ts_headline(PG_FUNCTION_ARGS)
{
	Datum		cfgid = ObjectIdGetDatum(getTSCurrentConfig(true));
	Datum		document = PG_GETARG_DATUM(0);
	Datum		tsq = PG_GETARG_DATUM(1);

	PG_RETURN_DATUM(DirectFunctionCall3(ts_headline_byid_opt,
										cfgid, document, tsq));
}
/*
 * Headline for (document, query, options).
 *
 * Uses the configuration returned by getTSCurrentConfig(true) and
 * forwards to ts_headline_byid_opt.
 */
Datum
ts_headline_opt(PG_FUNCTION_ARGS)
{
	Datum		cfgid = ObjectIdGetDatum(getTSCurrentConfig(true));
	Datum		document = PG_GETARG_DATUM(0);
	Datum		tsq = PG_GETARG_DATUM(1);
	Datum		options = PG_GETARG_DATUM(2);

	PG_RETURN_DATUM(DirectFunctionCall4(ts_headline_byid_opt,
										cfgid, document, tsq, options));
}

File diff suppressed because it is too large Load Diff