Tsearch2 functionality migrates to core. The bulk of this work is by
Oleg Bartunov and Teodor Sigaev, but I did a lot of editorializing, so anything that's broken is probably my fault. Documentation is nonexistent as yet, but let's land the patch so we can get some portability testing done.
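The new files below supply the C entry points behind the text-search SQL interface (ts_lexize, to_tsvector, to_tsquery, plainto_tsquery). As a minimal, hedged usage sketch — assuming the SQL-level wrappers and default configurations as they appear in the eventual release, which are wired up elsewhere in this patch series rather than in this commit:

    SELECT to_tsvector('english', 'The quick brown foxes jumped') @@
           to_tsquery('english', 'fox & jump');      -- true
    SELECT ts_lexize('english_stem', 'bookings');    -- {book}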
51
src/backend/tsearch/Makefile
Normal file
@@ -0,0 +1,51 @@
#-------------------------------------------------------------------------
#
# Makefile for backend/tsearch
#
# Copyright (c) 2006-2007, PostgreSQL Global Development Group
#
# $PostgreSQL: pgsql/src/backend/tsearch/Makefile,v 1.1 2007/08/21 01:11:18 tgl Exp $
#
#-------------------------------------------------------------------------
subdir = src/backend/tsearch
top_builddir = ../../..
include $(top_builddir)/src/Makefile.global

DICTDIR=tsearch_data

DICTFILES=synonym.syn.sample thesaurus.ths.sample

OBJS = ts_locale.o ts_parse.o wparser.o wparser_def.o dict.o \
	dict_simple.o dict_synonym.o dict_thesaurus.o \
	dict_ispell.o regis.o spell.o \
	to_tsany.o ts_utils.o

all: SUBSYS.o

SUBSYS.o: $(OBJS)
	$(LD) $(LDREL) $(LDOUT) SUBSYS.o $^

depend dep:
	$(CC) -MM $(CFLAGS) *.c >depend

.PHONY: install-data
install-data: $(DICTFILES) installdirs
	for i in $(DICTFILES); \
	do $(INSTALL_DATA) $$i '$(DESTDIR)$(datadir)/$(DICTDIR)/'$$i; \
	done

installdirs:
	$(mkinstalldirs) '$(DESTDIR)$(datadir)' '$(DESTDIR)$(datadir)/$(DICTDIR)'

.PHONY: uninstall-data
uninstall-data:
	for i in $(DICTFILES); \
	do rm -rf '$(DESTDIR)$(datadir)/$(DICTDIR)/'$$i ; \
	done

clean distclean maintainer-clean:
	rm -f SUBSYS.o $(OBJS)

ifeq (depend,$(wildcard depend))
include depend
endif
131
src/backend/tsearch/dict.c
Normal file
@@ -0,0 +1,131 @@
/*-------------------------------------------------------------------------
 *
 * dict.c
 *     Standard interface to dictionary
 *
 * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
 *
 *
 * IDENTIFICATION
 *     $PostgreSQL: pgsql/src/backend/tsearch/dict.c,v 1.1 2007/08/21 01:11:18 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "funcapi.h"
#include "access/genam.h"
#include "access/heapam.h"
#include "access/skey.h"
#include "catalog/indexing.h"
#include "catalog/namespace.h"
#include "catalog/pg_ts_dict.h"
#include "catalog/pg_type.h"
#include "tsearch/ts_cache.h"
#include "tsearch/ts_public.h"
#include "tsearch/ts_utils.h"
#include "utils/array.h"
#include "utils/builtins.h"
#include "utils/fmgroids.h"
#include "utils/rel.h"
#include "utils/syscache.h"


/*
 * Lexize one word by dictionary, mostly debug function
 */
static ArrayType *
ts_lexize_workhorse(Oid dictId, text *in)
{
    TSDictionaryCacheEntry *dict;
    TSLexeme   *res,
               *ptr;
    Datum      *da;
    ArrayType  *a;
    DictSubState dstate = {false, false, NULL};

    dict = lookup_ts_dictionary_cache(dictId);

    res = (TSLexeme *) DatumGetPointer(FunctionCall4(&dict->lexize,
                                       PointerGetDatum(dict->dictData),
                                       PointerGetDatum(VARDATA(in)),
                                       Int32GetDatum(VARSIZE(in) - VARHDRSZ),
                                       PointerGetDatum(&dstate)));

    if (dstate.getnext)
    {
        dstate.isend = true;
        ptr = (TSLexeme *) DatumGetPointer(FunctionCall4(&dict->lexize,
                                       PointerGetDatum(dict->dictData),
                                       PointerGetDatum(VARDATA(in)),
                                       Int32GetDatum(VARSIZE(in) - VARHDRSZ),
                                       PointerGetDatum(&dstate)));
        if (ptr != NULL)
            res = ptr;
    }

    if (!res)
        return NULL;

    ptr = res;
    while (ptr->lexeme)
        ptr++;
    da = (Datum *) palloc(sizeof(Datum) * (ptr - res + 1));
    ptr = res;
    while (ptr->lexeme)
    {
        da[ptr - res] = DirectFunctionCall1(textin, CStringGetDatum(ptr->lexeme));
        ptr++;
    }

    a = construct_array(da,
                        ptr - res,
                        TEXTOID,
                        -1,
                        false,
                        'i');

    ptr = res;
    while (ptr->lexeme)
    {
        pfree(DatumGetPointer(da[ptr - res]));
        pfree(ptr->lexeme);
        ptr++;
    }
    pfree(res);
    pfree(da);

    return a;
}

Datum
ts_lexize_byid(PG_FUNCTION_ARGS)
{
    Oid         dictId = PG_GETARG_OID(0);
    text       *in = PG_GETARG_TEXT_P(1);
    ArrayType  *a;

    a = ts_lexize_workhorse(dictId, in);

    if (a)
        PG_RETURN_POINTER(a);
    else
        PG_RETURN_NULL();
}

Datum
ts_lexize_byname(PG_FUNCTION_ARGS)
{
    text       *dictname = PG_GETARG_TEXT_P(0);
    text       *in = PG_GETARG_TEXT_P(1);
    Oid         dictId;
    ArrayType  *a;

    dictId = TSDictionaryGetDictid(textToQualifiedNameList(dictname), false);
    a = ts_lexize_workhorse(dictId, in);

    if (a)
        PG_RETURN_POINTER(a);
    else
        PG_RETURN_NULL();
}
164
src/backend/tsearch/dict_ispell.c
Normal file
@@ -0,0 +1,164 @@
/*-------------------------------------------------------------------------
 *
 * dict_ispell.c
 *     Ispell dictionary interface
 *
 * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
 *
 *
 * IDENTIFICATION
 *     $PostgreSQL: pgsql/src/backend/tsearch/dict_ispell.c,v 1.1 2007/08/21 01:11:18 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "tsearch/dicts/spell.h"
#include "tsearch/ts_locale.h"
#include "tsearch/ts_public.h"
#include "tsearch/ts_utils.h"
#include "utils/builtins.h"
#include "utils/memutils.h"


typedef struct
{
    StopList    stoplist;
    IspellDict  obj;
} DictISpell;

Datum
dispell_init(PG_FUNCTION_ARGS)
{
    DictISpell *d;
    Map        *cfg,
               *pcfg;
    bool        affloaded = false,
                dictloaded = false,
                stoploaded = false;
    text       *in;

    /* init functions must defend against NULLs for themselves */
    if (PG_ARGISNULL(0) || PG_GETARG_POINTER(0) == NULL)
        ereport(ERROR,
                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                 errmsg("NULL config not allowed for ISpell")));
    in = PG_GETARG_TEXT_P(0);

    parse_keyvalpairs(in, &cfg);
    PG_FREE_IF_COPY(in, 0);

    d = (DictISpell *) palloc0(sizeof(DictISpell));
    d->stoplist.wordop = recode_and_lowerstr;

    pcfg = cfg;
    while (pcfg->key)
    {
        if (pg_strcasecmp("DictFile", pcfg->key) == 0)
        {
            if (dictloaded)
                ereport(ERROR,
                        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                         errmsg("multiple DictFile parameters")));
            NIImportDictionary(&(d->obj),
                               get_tsearch_config_filename(pcfg->value,
                                                           "dict"));
            dictloaded = true;
        }
        else if (pg_strcasecmp("AffFile", pcfg->key) == 0)
        {
            if (affloaded)
                ereport(ERROR,
                        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                         errmsg("multiple AffFile parameters")));
            NIImportAffixes(&(d->obj),
                            get_tsearch_config_filename(pcfg->value,
                                                        "affix"));
            affloaded = true;
        }
        else if (pg_strcasecmp("StopWords", pcfg->key) == 0)
        {
            if (stoploaded)
                ereport(ERROR,
                        (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                         errmsg("multiple StopWords parameters")));
            readstoplist(pcfg->value, &(d->stoplist));
            sortstoplist(&(d->stoplist));
            stoploaded = true;
        }
        else
        {
            ereport(ERROR,
                    (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                     errmsg("unrecognized ISpell parameter: \"%s\"",
                            pcfg->key)));
        }
        pfree(pcfg->key);
        pfree(pcfg->value);
        pcfg++;
    }
    pfree(cfg);

    if (affloaded && dictloaded)
    {
        NISortDictionary(&(d->obj));
        NISortAffixes(&(d->obj));
    }
    else if (!affloaded)
    {
        ereport(ERROR,
                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                 errmsg("missing AffFile parameter")));
    }
    else
    {
        ereport(ERROR,
                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                 errmsg("missing DictFile parameter")));
    }

    MemoryContextDeleteChildren(CurrentMemoryContext);

    PG_RETURN_POINTER(d);
}

Datum
dispell_lexize(PG_FUNCTION_ARGS)
{
    DictISpell *d = (DictISpell *) PG_GETARG_POINTER(0);
    char       *in = (char *) PG_GETARG_POINTER(1);
    int32       len = PG_GETARG_INT32(2);
    char       *txt;
    TSLexeme   *res;
    TSLexeme   *ptr,
               *cptr;

    if (len <= 0)
        PG_RETURN_POINTER(NULL);

    txt = lowerstr_with_len(in, len);
    res = NINormalizeWord(&(d->obj), txt);

    if (res == NULL)
        PG_RETURN_POINTER(NULL);

    ptr = cptr = res;
    while (ptr->lexeme)
    {
        if (searchstoplist(&(d->stoplist), ptr->lexeme))
        {
            pfree(ptr->lexeme);
            ptr->lexeme = NULL;
            ptr++;
        }
        else
        {
            memcpy(cptr, ptr, sizeof(TSLexeme));
            cptr++;
            ptr++;
        }
    }
    cptr->lexeme = NULL;

    PG_RETURN_POINTER(res);
}
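dispell_init above consumes DictFile, AffFile, and StopWords key/value pairs. As a hedged illustration only — using the SQL syntax this feature ends up with in the release, and hypothetical dictionary/file names — an Ispell dictionary would be configured roughly like this:

    CREATE TEXT SEARCH DICTIONARY hypothetical_ispell (
        TEMPLATE  = ispell,
        DictFile  = en_us,      -- resolved to tsearch_data/en_us.dict
        AffFile   = en_us,      -- resolved to tsearch_data/en_us.affix
        StopWords = english
    );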
65
src/backend/tsearch/dict_simple.c
Normal file
@@ -0,0 +1,65 @@
/*-------------------------------------------------------------------------
 *
 * dict_simple.c
 *     Simple dictionary: just lowercase and check for stopword
 *
 * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
 *
 *
 * IDENTIFICATION
 *     $PostgreSQL: pgsql/src/backend/tsearch/dict_simple.c,v 1.1 2007/08/21 01:11:18 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "tsearch/ts_locale.h"
#include "tsearch/ts_public.h"
#include "tsearch/ts_utils.h"
#include "utils/builtins.h"


typedef struct
{
    StopList    stoplist;
} DictExample;


Datum
dsimple_init(PG_FUNCTION_ARGS)
{
    DictExample *d = (DictExample *) palloc0(sizeof(DictExample));

    d->stoplist.wordop = recode_and_lowerstr;

    if (!PG_ARGISNULL(0) && PG_GETARG_POINTER(0) != NULL)
    {
        text       *in = PG_GETARG_TEXT_P(0);
        char       *filename = TextPGetCString(in);

        readstoplist(filename, &d->stoplist);
        sortstoplist(&d->stoplist);
        pfree(filename);
    }

    PG_RETURN_POINTER(d);
}

Datum
dsimple_lexize(PG_FUNCTION_ARGS)
{
    DictExample *d = (DictExample *) PG_GETARG_POINTER(0);
    char       *in = (char *) PG_GETARG_POINTER(1);
    int32       len = PG_GETARG_INT32(2);
    char       *txt = lowerstr_with_len(in, len);
    TSLexeme   *res = palloc0(sizeof(TSLexeme) * 2);

    if (*txt == '\0' || searchstoplist(&(d->stoplist), txt))
    {
        pfree(txt);
    }
    else
        res[0].lexeme = txt;

    PG_RETURN_POINTER(res);
}
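dsimple_lexize lowercases its input and drops stop words. In the released form this pair of functions backs the built-in "simple" dictionary template, so a hedged usage sketch is:

    SELECT ts_lexize('simple', 'Stars');   -- {stars}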
176
src/backend/tsearch/dict_synonym.c
Normal file
@@ -0,0 +1,176 @@
/*-------------------------------------------------------------------------
 *
 * dict_synonym.c
 *     Synonym dictionary: replace word by its synonym
 *
 * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
 *
 *
 * IDENTIFICATION
 *     $PostgreSQL: pgsql/src/backend/tsearch/dict_synonym.c,v 1.1 2007/08/21 01:11:18 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "storage/fd.h"
#include "tsearch/ts_locale.h"
#include "tsearch/ts_public.h"
#include "tsearch/ts_utils.h"
#include "utils/builtins.h"


#define SYNBUFLEN 4096
typedef struct
{
    char       *in;
    char       *out;
} Syn;

typedef struct
{
    int         len;
    Syn        *syn;
} DictSyn;

static char *
findwrd(char *in, char **end)
{
    char       *start;

    *end = NULL;
    while (*in && t_isspace(in))
        in += pg_mblen(in);

    if (*in == '\0')
        return NULL;
    start = in;

    while (*in && !t_isspace(in))
        in += pg_mblen(in);

    *end = in;
    return start;
}

static int
compareSyn(const void *a, const void *b)
{
    return strcmp(((Syn *) a)->in, ((Syn *) b)->in);
}


Datum
dsynonym_init(PG_FUNCTION_ARGS)
{
    text       *in;
    DictSyn    *d;
    int         cur = 0;
    FILE       *fin;
    char       *filename;
    char        buf[SYNBUFLEN];
    char       *starti,
               *starto,
               *end = NULL;
    int         slen;

    /* init functions must defend against NULLs for themselves */
    if (PG_ARGISNULL(0) || PG_GETARG_POINTER(0) == NULL)
        ereport(ERROR,
                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                 errmsg("NULL config not allowed for Synonym")));
    in = PG_GETARG_TEXT_P(0);

    filename = get_tsearch_config_filename(TextPGetCString(in), "syn");

    PG_FREE_IF_COPY(in, 0);

    if ((fin = AllocateFile(filename, "r")) == NULL)
        ereport(ERROR,
                (errcode(ERRCODE_CONFIG_FILE_ERROR),
                 errmsg("could not open synonym file \"%s\": %m",
                        filename)));

    d = (DictSyn *) palloc0(sizeof(DictSyn));

    while (fgets(buf, SYNBUFLEN, fin))
    {
        slen = strlen(buf);
        pg_verifymbstr(buf, slen, false);
        if (cur == d->len)
        {
            if (d->len == 0)
            {
                d->len = 16;
                d->syn = (Syn *) palloc(sizeof(Syn) * d->len);
            }
            else
            {
                d->len *= 2;
                d->syn = (Syn *) repalloc(d->syn, sizeof(Syn) * d->len);
            }
        }

        starti = findwrd(buf, &end);
        if (!starti)
            continue;
        *end = '\0';
        if (end >= buf + slen)
            continue;

        starto = findwrd(end + 1, &end);
        if (!starto)
            continue;
        *end = '\0';

        d->syn[cur].in = recode_and_lowerstr(starti);
        d->syn[cur].out = recode_and_lowerstr(starto);
        if (!(d->syn[cur].in && d->syn[cur].out))
        {
            FreeFile(fin);
            ereport(ERROR,
                    (errcode(ERRCODE_OUT_OF_MEMORY),
                     errmsg("out of memory")));
        }

        cur++;
    }

    FreeFile(fin);

    d->len = cur;
    if (cur > 1)
        qsort(d->syn, d->len, sizeof(Syn), compareSyn);

    pfree(filename);
    PG_RETURN_POINTER(d);
}

Datum
dsynonym_lexize(PG_FUNCTION_ARGS)
{
    DictSyn    *d = (DictSyn *) PG_GETARG_POINTER(0);
    char       *in = (char *) PG_GETARG_POINTER(1);
    int32       len = PG_GETARG_INT32(2);
    Syn         key,
               *found;
    TSLexeme   *res;

    if (len <= 0)
        PG_RETURN_POINTER(NULL);

    key.in = lowerstr_with_len(in, len);
    key.out = NULL;

    found = (Syn *) bsearch(&key, d->syn, d->len, sizeof(Syn), compareSyn);
    pfree(key.in);

    if (!found)
        PG_RETURN_POINTER(NULL);

    res = palloc(sizeof(TSLexeme) * 2);
    memset(res, 0, sizeof(TSLexeme) * 2);
    res[0].lexeme = pstrdup(found->out);

    PG_RETURN_POINTER(res);
}
887
src/backend/tsearch/dict_thesaurus.c
Normal file
@@ -0,0 +1,887 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* dict_thesaurus.c
|
||||
* Thesaurus dictionary: phrase to phrase substitution
|
||||
*
|
||||
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/tsearch/dict_thesaurus.c,v 1.1 2007/08/21 01:11:18 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#include "postgres.h"
|
||||
|
||||
#include "catalog/namespace.h"
|
||||
#include "storage/fd.h"
|
||||
#include "tsearch/ts_cache.h"
|
||||
#include "tsearch/ts_locale.h"
|
||||
#include "tsearch/ts_public.h"
|
||||
#include "tsearch/ts_utils.h"
|
||||
#include "utils/builtins.h"
|
||||
|
||||
|
||||
/*
|
||||
* Temporarily we use TSLexeme.flags for internal purposes...
|
||||
*/
|
||||
#define DT_USEASIS 0x1000
|
||||
|
||||
typedef struct LexemeInfo
|
||||
{
|
||||
uint16 idsubst; /* entry's number in DictThesaurus->subst */
|
||||
uint16 posinsubst; /* pos info in entry */
|
||||
uint16 tnvariant; /* total num lexemes in one variant */
|
||||
struct LexemeInfo *nextentry;
|
||||
struct LexemeInfo *nextvariant;
|
||||
} LexemeInfo;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
char *lexeme;
|
||||
LexemeInfo *entries;
|
||||
} TheLexeme;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
uint16 lastlexeme; /* number lexemes to substitute */
|
||||
uint16 reslen;
|
||||
TSLexeme *res; /* prepared substituted result */
|
||||
} TheSubstitute;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
/* subdictionary to normalize lexemes */
|
||||
Oid subdictOid;
|
||||
TSDictionaryCacheEntry *subdict;
|
||||
|
||||
/* Array to search lexeme by exact match */
|
||||
TheLexeme *wrds;
|
||||
int nwrds;
|
||||
int ntwrds;
|
||||
|
||||
/*
|
||||
* Storage of substituted result, n-th element is for n-th expression
|
||||
*/
|
||||
TheSubstitute *subst;
|
||||
int nsubst;
|
||||
} DictThesaurus;
|
||||
|
||||
|
||||
static void
|
||||
newLexeme(DictThesaurus * d, char *b, char *e, uint16 idsubst, uint16 posinsubst)
|
||||
{
|
||||
TheLexeme *ptr;
|
||||
|
||||
if (d->nwrds >= d->ntwrds)
|
||||
{
|
||||
if (d->ntwrds == 0)
|
||||
{
|
||||
d->ntwrds = 16;
|
||||
d->wrds = (TheLexeme *) palloc(sizeof(TheLexeme) * d->ntwrds);
|
||||
}
|
||||
else
|
||||
{
|
||||
d->ntwrds *= 2;
|
||||
d->wrds = (TheLexeme *) repalloc(d->wrds, sizeof(TheLexeme) * d->ntwrds);
|
||||
}
|
||||
}
|
||||
|
||||
ptr = d->wrds + d->nwrds;
|
||||
d->nwrds++;
|
||||
|
||||
ptr->lexeme = palloc(e - b + 1);
|
||||
|
||||
memcpy(ptr->lexeme, b, e - b);
|
||||
ptr->lexeme[e - b] = '\0';
|
||||
|
||||
ptr->entries = (LexemeInfo *) palloc(sizeof(LexemeInfo));
|
||||
|
||||
ptr->entries->nextentry = NULL;
|
||||
ptr->entries->idsubst = idsubst;
|
||||
ptr->entries->posinsubst = posinsubst;
|
||||
}
|
||||
|
||||
static void
|
||||
addWrd(DictThesaurus * d, char *b, char *e, uint16 idsubst, uint16 nwrd, uint16 posinsubst, bool useasis)
|
||||
{
|
||||
static int nres = 0;
|
||||
static int ntres = 0;
|
||||
TheSubstitute *ptr;
|
||||
|
||||
if (nwrd == 0)
|
||||
{
|
||||
nres = ntres = 0;
|
||||
|
||||
if (idsubst >= d->nsubst)
|
||||
{
|
||||
if (d->nsubst == 0)
|
||||
{
|
||||
d->nsubst = 16;
|
||||
d->subst = (TheSubstitute *) palloc(sizeof(TheSubstitute) * d->nsubst);
|
||||
}
|
||||
else
|
||||
{
|
||||
d->nsubst *= 2;
|
||||
d->subst = (TheSubstitute *) repalloc(d->subst, sizeof(TheSubstitute) * d->nsubst);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ptr = d->subst + idsubst;
|
||||
|
||||
ptr->lastlexeme = posinsubst - 1;
|
||||
|
||||
if (nres + 1 >= ntres)
|
||||
{
|
||||
if (ntres == 0)
|
||||
{
|
||||
ntres = 2;
|
||||
ptr->res = (TSLexeme *) palloc(sizeof(TSLexeme) * ntres);
|
||||
}
|
||||
else
|
||||
{
|
||||
ntres *= 2;
|
||||
ptr->res = (TSLexeme *) repalloc(ptr->res, sizeof(TSLexeme) * ntres);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
ptr->res[nres].lexeme = palloc(e - b + 1);
|
||||
memcpy(ptr->res[nres].lexeme, b, e - b);
|
||||
ptr->res[nres].lexeme[e - b] = '\0';
|
||||
|
||||
ptr->res[nres].nvariant = nwrd;
|
||||
if (useasis)
|
||||
ptr->res[nres].flags = DT_USEASIS;
|
||||
else
|
||||
ptr->res[nres].flags = 0;
|
||||
|
||||
ptr->res[++nres].lexeme = NULL;
|
||||
}
|
||||
|
||||
#define TR_WAITLEX 1
|
||||
#define TR_INLEX 2
|
||||
#define TR_WAITSUBS 3
|
||||
#define TR_INSUBS 4
|
||||
|
||||
static void
|
||||
thesaurusRead(char *filename, DictThesaurus * d)
|
||||
{
|
||||
FILE *fh;
|
||||
char str[BUFSIZ];
|
||||
int lineno = 0;
|
||||
uint16 idsubst = 0;
|
||||
bool useasis = false;
|
||||
|
||||
filename = get_tsearch_config_filename(filename, "ths");
|
||||
fh = AllocateFile(filename, "r");
|
||||
if (!fh)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_CONFIG_FILE_ERROR),
|
||||
errmsg("could not open thesaurus file \"%s\": %m",
|
||||
filename)));
|
||||
|
||||
while (fgets(str, sizeof(str), fh))
|
||||
{
|
||||
char *ptr,
|
||||
*recoded;
|
||||
int state = TR_WAITLEX;
|
||||
char *beginwrd = NULL;
|
||||
uint16 posinsubst = 0;
|
||||
uint16 nwrd = 0;
|
||||
|
||||
ptr = recoded = (char *) pg_do_encoding_conversion((unsigned char *) str, strlen(str),
|
||||
GetDatabaseEncoding(), PG_UTF8);
|
||||
if (recoded == NULL)
|
||||
elog(ERROR, "encoding conversion failed");
|
||||
|
||||
lineno++;
|
||||
|
||||
/* is it comment ? */
|
||||
while (t_isspace(ptr))
|
||||
ptr += pg_mblen(ptr);
|
||||
if (t_iseq(recoded, '#') || *recoded == '\0' || t_iseq(recoded, '\n') || t_iseq(recoded, '\r'))
|
||||
continue;
|
||||
|
||||
while (*ptr)
|
||||
{
|
||||
if (state == TR_WAITLEX)
|
||||
{
|
||||
if (t_iseq(ptr, ':'))
|
||||
{
|
||||
if (posinsubst == 0)
|
||||
{
|
||||
FreeFile(fh);
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_CONFIG_FILE_ERROR),
|
||||
errmsg("unexpected delimiter at line %d of thesaurus file \"%s\"",
|
||||
lineno, filename)));
|
||||
}
|
||||
state = TR_WAITSUBS;
|
||||
}
|
||||
else if (!t_isspace(ptr))
|
||||
{
|
||||
beginwrd = ptr;
|
||||
state = TR_INLEX;
|
||||
}
|
||||
}
|
||||
else if (state == TR_INLEX)
|
||||
{
|
||||
if (t_iseq(ptr, ':'))
|
||||
{
|
||||
newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
|
||||
state = TR_WAITSUBS;
|
||||
}
|
||||
else if (t_isspace(ptr))
|
||||
{
|
||||
newLexeme(d, beginwrd, ptr, idsubst, posinsubst++);
|
||||
state = TR_WAITLEX;
|
||||
}
|
||||
}
|
||||
else if (state == TR_WAITSUBS)
|
||||
{
|
||||
if (t_iseq(ptr, '*'))
|
||||
{
|
||||
useasis = true;
|
||||
state = TR_INSUBS;
|
||||
beginwrd = ptr + pg_mblen(ptr);
|
||||
}
|
||||
else if (t_iseq(ptr, '\\'))
|
||||
{
|
||||
useasis = false;
|
||||
state = TR_INSUBS;
|
||||
beginwrd = ptr + pg_mblen(ptr);
|
||||
}
|
||||
else if (!t_isspace(ptr))
|
||||
{
|
||||
useasis = false;
|
||||
beginwrd = ptr;
|
||||
state = TR_INSUBS;
|
||||
}
|
||||
}
|
||||
else if (state == TR_INSUBS)
|
||||
{
|
||||
if (t_isspace(ptr))
|
||||
{
|
||||
if (ptr == beginwrd)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_CONFIG_FILE_ERROR),
|
||||
errmsg("unexpected end of line or lexeme at line %d of thesaurus file \"%s\"",
|
||||
lineno, filename)));
|
||||
addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis);
|
||||
state = TR_WAITSUBS;
|
||||
}
|
||||
}
|
||||
else
|
||||
elog(ERROR, "unrecognized thesaurus state: %d", state);
|
||||
|
||||
ptr += pg_mblen(ptr);
|
||||
}
|
||||
|
||||
if (state == TR_INSUBS)
|
||||
{
|
||||
if (ptr == beginwrd)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_CONFIG_FILE_ERROR),
|
||||
errmsg("unexpected end of line or lexeme at line %d of thesaurus file \"%s\"",
|
||||
lineno, filename)));
|
||||
addWrd(d, beginwrd, ptr, idsubst, nwrd++, posinsubst, useasis);
|
||||
}
|
||||
|
||||
idsubst++;
|
||||
|
||||
if (!(nwrd && posinsubst))
|
||||
{
|
||||
FreeFile(fh);
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_CONFIG_FILE_ERROR),
|
||||
errmsg("unexpected end of line at line %d of thesaurus file \"%s\"",
|
||||
lineno, filename)));
|
||||
}
|
||||
|
||||
if (recoded != str)
|
||||
pfree(recoded);
|
||||
}
|
||||
|
||||
d->nsubst = idsubst;
|
||||
|
||||
FreeFile(fh);
|
||||
}
|
||||
|
||||
static TheLexeme *
|
||||
addCompiledLexeme(TheLexeme * newwrds, int *nnw, int *tnm, TSLexeme * lexeme, LexemeInfo * src, uint16 tnvariant)
|
||||
{
|
||||
|
||||
if (*nnw >= *tnm)
|
||||
{
|
||||
*tnm *= 2;
|
||||
newwrds = (TheLexeme *) repalloc(newwrds, sizeof(TheLexeme) * *tnm);
|
||||
}
|
||||
|
||||
newwrds[*nnw].entries = (LexemeInfo *) palloc(sizeof(LexemeInfo));
|
||||
|
||||
if (lexeme && lexeme->lexeme)
|
||||
{
|
||||
newwrds[*nnw].lexeme = pstrdup(lexeme->lexeme);
|
||||
newwrds[*nnw].entries->tnvariant = tnvariant;
|
||||
}
|
||||
else
|
||||
{
|
||||
newwrds[*nnw].lexeme = NULL;
|
||||
newwrds[*nnw].entries->tnvariant = 1;
|
||||
}
|
||||
|
||||
newwrds[*nnw].entries->idsubst = src->idsubst;
|
||||
newwrds[*nnw].entries->posinsubst = src->posinsubst;
|
||||
|
||||
newwrds[*nnw].entries->nextentry = NULL;
|
||||
|
||||
(*nnw)++;
|
||||
return newwrds;
|
||||
}
|
||||
|
||||
static int
|
||||
cmpLexemeInfo(LexemeInfo * a, LexemeInfo * b)
|
||||
{
|
||||
if (a == NULL || b == NULL)
|
||||
return 0;
|
||||
|
||||
if (a->idsubst == b->idsubst)
|
||||
{
|
||||
if (a->posinsubst == b->posinsubst)
|
||||
{
|
||||
if (a->tnvariant == b->tnvariant)
|
||||
return 0;
|
||||
|
||||
return (a->tnvariant > b->tnvariant) ? 1 : -1;
|
||||
}
|
||||
|
||||
return (a->posinsubst > b->posinsubst) ? 1 : -1;
|
||||
}
|
||||
|
||||
return (a->idsubst > b->idsubst) ? 1 : -1;
|
||||
}
|
||||
|
||||
static int
|
||||
cmpLexeme(TheLexeme * a, TheLexeme * b)
|
||||
{
|
||||
if (a->lexeme == NULL)
|
||||
{
|
||||
if (b->lexeme == NULL)
|
||||
return 0;
|
||||
else
|
||||
return 1;
|
||||
}
|
||||
else if (b->lexeme == NULL)
|
||||
return -1;
|
||||
|
||||
return strcmp(a->lexeme, b->lexeme);
|
||||
}
|
||||
|
||||
static int
|
||||
cmpLexemeQ(const void *a, const void *b)
|
||||
{
|
||||
return cmpLexeme((TheLexeme *) a, (TheLexeme *) b);
|
||||
}
|
||||
|
||||
static int
|
||||
cmpTheLexeme(const void *a, const void *b)
|
||||
{
|
||||
TheLexeme *la = (TheLexeme *) a;
|
||||
TheLexeme *lb = (TheLexeme *) b;
|
||||
int res;
|
||||
|
||||
if ((res = cmpLexeme(la, lb)) != 0)
|
||||
return res;
|
||||
|
||||
return -cmpLexemeInfo(la->entries, lb->entries);
|
||||
}
|
||||
|
||||
static void
|
||||
compileTheLexeme(DictThesaurus * d)
|
||||
{
|
||||
int i,
|
||||
nnw = 0,
|
||||
tnm = 16;
|
||||
TheLexeme *newwrds = (TheLexeme *) palloc(sizeof(TheLexeme) * tnm),
|
||||
*ptrwrds;
|
||||
|
||||
for (i = 0; i < d->nwrds; i++)
|
||||
{
|
||||
TSLexeme *ptr;
|
||||
|
||||
ptr = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
|
||||
PointerGetDatum(d->subdict->dictData),
|
||||
PointerGetDatum(d->wrds[i].lexeme),
|
||||
Int32GetDatum(strlen(d->wrds[i].lexeme)),
|
||||
PointerGetDatum(NULL)));
|
||||
|
||||
if (!(ptr && ptr->lexeme))
|
||||
{
|
||||
if (!ptr)
|
||||
elog(ERROR, "thesaurus word-sample \"%s\" isn't recognized by subdictionary (rule %d)",
|
||||
d->wrds[i].lexeme, d->wrds[i].entries->idsubst + 1);
|
||||
else
|
||||
elog(NOTICE, "thesaurus word-sample \"%s\" is recognized as stop-word, assign any stop-word (rule %d)",
|
||||
d->wrds[i].lexeme, d->wrds[i].entries->idsubst + 1);
|
||||
|
||||
newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, NULL, d->wrds[i].entries, 0);
|
||||
}
|
||||
else
|
||||
{
|
||||
while (ptr->lexeme)
|
||||
{
|
||||
TSLexeme *remptr = ptr + 1;
|
||||
int tnvar = 1;
|
||||
int curvar = ptr->nvariant;
|
||||
|
||||
/* compute n words in one variant */
|
||||
while (remptr->lexeme)
|
||||
{
|
||||
if (remptr->nvariant != (remptr - 1)->nvariant)
|
||||
break;
|
||||
tnvar++;
|
||||
remptr++;
|
||||
}
|
||||
|
||||
remptr = ptr;
|
||||
while (remptr->lexeme && remptr->nvariant == curvar)
|
||||
{
|
||||
newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, remptr, d->wrds[i].entries, tnvar);
|
||||
remptr++;
|
||||
}
|
||||
|
||||
ptr = remptr;
|
||||
}
|
||||
}
|
||||
|
||||
pfree(d->wrds[i].lexeme);
|
||||
pfree(d->wrds[i].entries);
|
||||
}
|
||||
|
||||
pfree(d->wrds);
|
||||
d->wrds = newwrds;
|
||||
d->nwrds = nnw;
|
||||
d->ntwrds = tnm;
|
||||
|
||||
if (d->nwrds > 1)
|
||||
{
|
||||
qsort(d->wrds, d->nwrds, sizeof(TheLexeme), cmpTheLexeme);
|
||||
|
||||
/* uniq */
|
||||
newwrds = d->wrds;
|
||||
ptrwrds = d->wrds + 1;
|
||||
while (ptrwrds - d->wrds < d->nwrds)
|
||||
{
|
||||
if (cmpLexeme(ptrwrds, newwrds) == 0)
|
||||
{
|
||||
if (cmpLexemeInfo(ptrwrds->entries, newwrds->entries))
|
||||
{
|
||||
ptrwrds->entries->nextentry = newwrds->entries;
|
||||
newwrds->entries = ptrwrds->entries;
|
||||
}
|
||||
else
|
||||
pfree(ptrwrds->entries);
|
||||
|
||||
if (ptrwrds->lexeme)
|
||||
pfree(ptrwrds->lexeme);
|
||||
}
|
||||
else
|
||||
{
|
||||
newwrds++;
|
||||
*newwrds = *ptrwrds;
|
||||
}
|
||||
|
||||
ptrwrds++;
|
||||
}
|
||||
|
||||
d->nwrds = newwrds - d->wrds + 1;
|
||||
d->wrds = (TheLexeme *) repalloc(d->wrds, sizeof(TheLexeme) * d->nwrds);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
compileTheSubstitute(DictThesaurus * d)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < d->nsubst; i++)
|
||||
{
|
||||
TSLexeme *rem = d->subst[i].res,
|
||||
*outptr,
|
||||
*inptr;
|
||||
int n = 2;
|
||||
|
||||
outptr = d->subst[i].res = (TSLexeme *) palloc(sizeof(TSLexeme) * n);
|
||||
outptr->lexeme = NULL;
|
||||
inptr = rem;
|
||||
|
||||
while (inptr && inptr->lexeme)
|
||||
{
|
||||
TSLexeme *lexized,
|
||||
tmplex[2];
|
||||
|
||||
if (inptr->flags & DT_USEASIS)
|
||||
{ /* do not lexize */
|
||||
tmplex[0] = *inptr;
|
||||
tmplex[0].flags = 0;
|
||||
tmplex[1].lexeme = NULL;
|
||||
lexized = tmplex;
|
||||
}
|
||||
else
|
||||
{
|
||||
lexized = (TSLexeme *) DatumGetPointer(
|
||||
FunctionCall4(
|
||||
&(d->subdict->lexize),
|
||||
PointerGetDatum(d->subdict->dictData),
|
||||
PointerGetDatum(inptr->lexeme),
|
||||
Int32GetDatum(strlen(inptr->lexeme)),
|
||||
PointerGetDatum(NULL)
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
if (lexized && lexized->lexeme)
|
||||
{
|
||||
int toset = (lexized->lexeme && outptr != d->subst[i].res) ? (outptr - d->subst[i].res) : -1;
|
||||
|
||||
while (lexized->lexeme)
|
||||
{
|
||||
if (outptr - d->subst[i].res + 1 >= n)
|
||||
{
|
||||
int diff = outptr - d->subst[i].res;
|
||||
|
||||
n *= 2;
|
||||
d->subst[i].res = (TSLexeme *) repalloc(d->subst[i].res, sizeof(TSLexeme) * n);
|
||||
outptr = d->subst[i].res + diff;
|
||||
}
|
||||
|
||||
*outptr = *lexized;
|
||||
outptr->lexeme = pstrdup(lexized->lexeme);
|
||||
|
||||
outptr++;
|
||||
lexized++;
|
||||
}
|
||||
|
||||
if (toset > 0)
|
||||
d->subst[i].res[toset].flags |= TSL_ADDPOS;
|
||||
}
|
||||
else if (lexized)
|
||||
{
|
||||
elog(NOTICE, "thesaurus word \"%s\" in substitution is a stop-word, ignored (rule %d)", inptr->lexeme, i + 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
elog(ERROR, "thesaurus word \"%s\" in substitution isn't recognized (rule %d)", inptr->lexeme, i + 1);
|
||||
}
|
||||
|
||||
if (inptr->lexeme)
|
||||
pfree(inptr->lexeme);
|
||||
inptr++;
|
||||
}
|
||||
|
||||
if (outptr == d->subst[i].res)
|
||||
elog(ERROR, "all words in thesaurus substitution are stop words (rule %d)", i + 1);
|
||||
|
||||
d->subst[i].reslen = outptr - d->subst[i].res;
|
||||
|
||||
pfree(rem);
|
||||
}
|
||||
}
|
||||
|
||||
Datum
|
||||
thesaurus_init(PG_FUNCTION_ARGS)
|
||||
{
|
||||
DictThesaurus *d;
|
||||
Map *cfg,
|
||||
*pcfg;
|
||||
text *in;
|
||||
char *subdictname = NULL;
|
||||
bool fileloaded = false;
|
||||
|
||||
/* init functions must defend against NULLs for themselves */
|
||||
if (PG_ARGISNULL(0) || PG_GETARG_POINTER(0) == NULL)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
||||
errmsg("NULL config not allowed for Thesaurus")));
|
||||
in = PG_GETARG_TEXT_P(0);
|
||||
|
||||
parse_keyvalpairs(in, &cfg);
|
||||
PG_FREE_IF_COPY(in, 0);
|
||||
|
||||
d = (DictThesaurus *) palloc0(sizeof(DictThesaurus));
|
||||
|
||||
pcfg = cfg;
|
||||
while (pcfg->key)
|
||||
{
|
||||
if (pg_strcasecmp("DictFile", pcfg->key) == 0)
|
||||
{
|
||||
if (fileloaded)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
||||
errmsg("multiple DictFile parameters")));
|
||||
thesaurusRead(pcfg->value, d);
|
||||
fileloaded = true;
|
||||
}
|
||||
else if (pg_strcasecmp("Dictionary", pcfg->key) == 0)
|
||||
{
|
||||
if (subdictname)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
||||
errmsg("multiple Dictionary parameters")));
|
||||
subdictname = pstrdup(pcfg->value);
|
||||
}
|
||||
else
|
||||
{
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
||||
errmsg("unrecognized Thesaurus parameter: \"%s\"",
|
||||
pcfg->key)));
|
||||
}
|
||||
pfree(pcfg->key);
|
||||
pfree(pcfg->value);
|
||||
pcfg++;
|
||||
}
|
||||
pfree(cfg);
|
||||
|
||||
if (!fileloaded)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
||||
errmsg("missing DictFile parameter")));
|
||||
if (!subdictname)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
||||
errmsg("missing Dictionary parameter")));
|
||||
|
||||
d->subdictOid = TSDictionaryGetDictid(stringToQualifiedNameList(subdictname), false);
|
||||
d->subdict = lookup_ts_dictionary_cache(d->subdictOid);
|
||||
|
||||
compileTheLexeme(d);
|
||||
compileTheSubstitute(d);
|
||||
|
||||
PG_RETURN_POINTER(d);
|
||||
}
|
||||
|
||||
static LexemeInfo *
|
||||
findTheLexeme(DictThesaurus * d, char *lexeme)
|
||||
{
|
||||
TheLexeme key = {lexeme, NULL}, *res;
|
||||
|
||||
if (d->nwrds == 0)
|
||||
return NULL;
|
||||
|
||||
res = bsearch(&key, d->wrds, d->nwrds, sizeof(TheLexeme), cmpLexemeQ);
|
||||
|
||||
if (res == NULL)
|
||||
return NULL;
|
||||
return res->entries;
|
||||
}
|
||||
|
||||
static bool
|
||||
matchIdSubst(LexemeInfo * stored, uint16 idsubst)
|
||||
{
|
||||
bool res = true;
|
||||
|
||||
if (stored)
|
||||
{
|
||||
res = false;
|
||||
|
||||
for (; stored; stored = stored->nextvariant)
|
||||
if (stored->idsubst == idsubst)
|
||||
{
|
||||
res = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
static LexemeInfo *
|
||||
findVariant(LexemeInfo * in, LexemeInfo * stored, uint16 curpos, LexemeInfo ** newin, int newn)
|
||||
{
|
||||
for (;;)
|
||||
{
|
||||
int i;
|
||||
LexemeInfo *ptr = newin[0];
|
||||
|
||||
for (i = 0; i < newn; i++)
|
||||
{
|
||||
while (newin[i] && newin[i]->idsubst < ptr->idsubst)
|
||||
newin[i] = newin[i]->nextentry;
|
||||
|
||||
if (newin[i] == NULL)
|
||||
return in;
|
||||
|
||||
if (newin[i]->idsubst > ptr->idsubst)
|
||||
{
|
||||
ptr = newin[i];
|
||||
i = -1;
|
||||
continue;
|
||||
}
|
||||
|
||||
while (newin[i]->idsubst == ptr->idsubst)
|
||||
{
|
||||
if (newin[i]->posinsubst == curpos && newin[i]->tnvariant == newn)
|
||||
{
|
||||
ptr = newin[i];
|
||||
break;
|
||||
}
|
||||
|
||||
newin[i] = newin[i]->nextentry;
|
||||
if (newin[i] == NULL)
|
||||
return in;
|
||||
}
|
||||
|
||||
if (newin[i]->idsubst != ptr->idsubst)
|
||||
{
|
||||
ptr = newin[i];
|
||||
i = -1;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if (i == newn && matchIdSubst(stored, ptr->idsubst) && (in == NULL || !matchIdSubst(in, ptr->idsubst)))
|
||||
{ /* found */
|
||||
|
||||
ptr->nextvariant = in;
|
||||
in = ptr;
|
||||
}
|
||||
|
||||
/* step forward */
|
||||
for (i = 0; i < newn; i++)
|
||||
newin[i] = newin[i]->nextentry;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static TSLexeme *
|
||||
copyTSLexeme(TheSubstitute * ts)
|
||||
{
|
||||
TSLexeme *res;
|
||||
uint16 i;
|
||||
|
||||
res = (TSLexeme *) palloc(sizeof(TSLexeme) * (ts->reslen + 1));
|
||||
for (i = 0; i < ts->reslen; i++)
|
||||
{
|
||||
res[i] = ts->res[i];
|
||||
res[i].lexeme = pstrdup(ts->res[i].lexeme);
|
||||
}
|
||||
|
||||
res[ts->reslen].lexeme = NULL;
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
static TSLexeme *
|
||||
checkMatch(DictThesaurus * d, LexemeInfo * info, uint16 curpos, bool *moreres)
|
||||
{
|
||||
*moreres = false;
|
||||
while (info)
|
||||
{
|
||||
Assert(info->idsubst < d->nsubst);
|
||||
if (info->nextvariant)
|
||||
*moreres = true;
|
||||
if (d->subst[info->idsubst].lastlexeme == curpos)
|
||||
return copyTSLexeme(d->subst + info->idsubst);
|
||||
info = info->nextvariant;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
Datum
|
||||
thesaurus_lexize(PG_FUNCTION_ARGS)
|
||||
{
|
||||
DictThesaurus *d = (DictThesaurus *) PG_GETARG_POINTER(0);
|
||||
DictSubState *dstate = (DictSubState *) PG_GETARG_POINTER(3);
|
||||
TSLexeme *res = NULL;
|
||||
LexemeInfo *stored,
|
||||
*info = NULL;
|
||||
uint16 curpos = 0;
|
||||
bool moreres = false;
|
||||
|
||||
if (PG_NARGS() < 4 || dstate == NULL)
|
||||
elog(ERROR, "forbidden call of thesaurus or nested call");
|
||||
|
||||
if (dstate->isend)
|
||||
PG_RETURN_POINTER(NULL);
|
||||
stored = (LexemeInfo *) dstate->private;
|
||||
|
||||
if (stored)
|
||||
curpos = stored->posinsubst + 1;
|
||||
|
||||
if (!d->subdict->isvalid)
|
||||
d->subdict = lookup_ts_dictionary_cache(d->subdictOid);
|
||||
|
||||
res = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
|
||||
PointerGetDatum(d->subdict->dictData),
|
||||
PG_GETARG_DATUM(1),
|
||||
PG_GETARG_DATUM(2),
|
||||
PointerGetDatum(NULL)));
|
||||
|
||||
if (res && res->lexeme)
|
||||
{
|
||||
TSLexeme *ptr = res,
|
||||
*basevar;
|
||||
|
||||
while (ptr->lexeme)
|
||||
{
|
||||
uint16 nv = ptr->nvariant;
|
||||
uint16 i,
|
||||
nlex = 0;
|
||||
LexemeInfo **infos;
|
||||
|
||||
basevar = ptr;
|
||||
while (ptr->lexeme && nv == ptr->nvariant)
|
||||
{
|
||||
nlex++;
|
||||
ptr++;
|
||||
}
|
||||
|
||||
infos = (LexemeInfo **) palloc(sizeof(LexemeInfo *) * nlex);
|
||||
for (i = 0; i < nlex; i++)
|
||||
if ((infos[i] = findTheLexeme(d, basevar[i].lexeme)) == NULL)
|
||||
break;
|
||||
|
||||
if (i < nlex)
|
||||
{
|
||||
/* no chance to find */
|
||||
pfree(infos);
|
||||
continue;
|
||||
}
|
||||
|
||||
info = findVariant(info, stored, curpos, infos, nlex);
|
||||
}
|
||||
}
|
||||
else if (res)
|
||||
{ /* stop-word */
|
||||
LexemeInfo *infos = findTheLexeme(d, NULL);
|
||||
|
||||
info = findVariant(NULL, stored, curpos, &infos, 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
info = NULL; /* word isn't recognized */
|
||||
}
|
||||
|
||||
dstate->private = (void *) info;
|
||||
|
||||
if (!info)
|
||||
{
|
||||
dstate->getnext = false;
|
||||
PG_RETURN_POINTER(NULL);
|
||||
}
|
||||
|
||||
if ((res = checkMatch(d, info, curpos, &moreres)) != NULL)
|
||||
{
|
||||
dstate->getnext = moreres;
|
||||
PG_RETURN_POINTER(res);
|
||||
}
|
||||
|
||||
dstate->getnext = true;
|
||||
|
||||
PG_RETURN_POINTER(NULL);
|
||||
}
|
236
src/backend/tsearch/regis.c
Normal file
@@ -0,0 +1,236 @@
/*-------------------------------------------------------------------------
 *
 * regis.c
 *     Fast regex subset
 *
 * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
 *
 *
 * IDENTIFICATION
 *     $PostgreSQL: pgsql/src/backend/tsearch/regis.c,v 1.1 2007/08/21 01:11:18 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "tsearch/dicts/regis.h"
#include "tsearch/ts_locale.h"

bool
RS_isRegis(const char *str)
{
    while (str && *str)
    {
        if (t_isalpha(str) ||
            t_iseq(str, '[') ||
            t_iseq(str, ']') ||
            t_iseq(str, '^'))
            str += pg_mblen(str);
        else
            return false;
    }
    return true;
}

#define RS_IN_ONEOF 1
#define RS_IN_ONEOF_IN 2
#define RS_IN_NONEOF 3
#define RS_IN_WAIT 4

static RegisNode *
newRegisNode(RegisNode * prev, int len)
{
    RegisNode  *ptr;

    ptr = (RegisNode *) palloc0(RNHDRSZ + len + 1);
    if (prev)
        prev->next = ptr;
    return ptr;
}

void
RS_compile(Regis * r, bool issuffix, char *str)
{
    int         len = strlen(str);
    int         state = RS_IN_WAIT;
    char       *c = (char *) str;
    RegisNode  *ptr = NULL;

    memset(r, 0, sizeof(Regis));
    r->issuffix = (issuffix) ? 1 : 0;

    while (*c)
    {
        if (state == RS_IN_WAIT)
        {
            if (t_isalpha(c))
            {
                if (ptr)
                    ptr = newRegisNode(ptr, len);
                else
                    ptr = r->node = newRegisNode(NULL, len);
                COPYCHAR(ptr->data, c);
                ptr->type = RSF_ONEOF;
                ptr->len = pg_mblen(c);
            }
            else if (t_iseq(c, '['))
            {
                if (ptr)
                    ptr = newRegisNode(ptr, len);
                else
                    ptr = r->node = newRegisNode(NULL, len);
                ptr->type = RSF_ONEOF;
                state = RS_IN_ONEOF;
            }
            else
                ereport(ERROR,
                        (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
                         errmsg("invalid regis pattern: \"%s\"",
                                str)));
        }
        else if (state == RS_IN_ONEOF)
        {
            if (t_iseq(c, '^'))
            {
                ptr->type = RSF_NONEOF;
                state = RS_IN_NONEOF;
            }
            else if (t_isalpha(c))
            {
                COPYCHAR(ptr->data, c);
                ptr->len = pg_mblen(c);
                state = RS_IN_ONEOF_IN;
            }
            else
                ereport(ERROR,
                        (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
                         errmsg("invalid regis pattern: \"%s\"",
                                str)));
        }
        else if (state == RS_IN_ONEOF_IN || state == RS_IN_NONEOF)
        {
            if (t_isalpha(c))
            {
                COPYCHAR(ptr->data + ptr->len, c);
                ptr->len += pg_mblen(c);
            }
            else if (t_iseq(c, ']'))
                state = RS_IN_WAIT;
            else
                ereport(ERROR,
                        (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
                         errmsg("invalid regis pattern: \"%s\"",
                                str)));
        }
        else
            elog(ERROR, "internal error in RS_compile: state %d", state);
        c += pg_mblen(c);
    }

    ptr = r->node;
    while (ptr)
    {
        r->nchar++;
        ptr = ptr->next;
    }
}

void
RS_free(Regis * r)
{
    RegisNode  *ptr = r->node,
               *tmp;

    while (ptr)
    {
        tmp = ptr->next;
        pfree(ptr);
        ptr = tmp;
    }

    r->node = NULL;
}

#ifdef TS_USE_WIDE
static bool
mb_strchr(char *str, char *c)
{
    int         clen = pg_mblen(c),
                plen,
                i;
    char       *ptr = str;
    bool        res = false;

    clen = pg_mblen(c);
    while (*ptr && !res)
    {
        plen = pg_mblen(ptr);
        if (plen == clen)
        {
            i = plen;
            res = true;
            while (i--)
                if (*(ptr + i) != *(c + i))
                {
                    res = false;
                    break;
                }
        }

        ptr += plen;
    }

    return res;
}
#else
#define mb_strchr(s,c) ( (strchr((s),*(c)) == NULL) ? false : true )
#endif


bool
RS_execute(Regis * r, char *str)
{
    RegisNode  *ptr = r->node;
    char       *c = str;
    int         len = 0;

    while (*c)
    {
        len++;
        c += pg_mblen(c);
    }

    if (len < r->nchar)
        return 0;

    c = str;
    if (r->issuffix)
    {
        len -= r->nchar;
        while (len-- > 0)
            c += pg_mblen(c);
    }


    while (ptr)
    {
        switch (ptr->type)
        {
            case RSF_ONEOF:
                if (mb_strchr((char *) ptr->data, c) != true)
                    return false;
                break;
            case RSF_NONEOF:
                if (mb_strchr((char *) ptr->data, c) == true)
                    return false;
                break;
            default:
                elog(ERROR, "unrecognized regis node type: %d", ptr->type);
        }
        ptr = ptr->next;
        c += pg_mblen(c);
    }

    return true;
}
1747
src/backend/tsearch/spell.c
Normal file
File diff suppressed because it is too large
3
src/backend/tsearch/synonym.syn.sample
Normal file
@@ -0,0 +1,3 @@
skies sky
booking book
bookings book
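A hedged sketch of how this sample is meant to be used once a synonym dictionary points at it (the dictionary name is hypothetical, and the creation syntax shown is the one the feature ends up with in the release, whereas dsynonym_init in this commit still takes the file name directly):

    CREATE TEXT SEARCH DICTIONARY my_synonyms (TEMPLATE = synonym, SYNONYMS = synonym_sample);
    SELECT ts_lexize('my_synonyms', 'bookings');   -- {book}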
20
src/backend/tsearch/thesaurus.ths.sample
Normal file
@@ -0,0 +1,20 @@
#
# Thesaurus config file. Character ':' separates string from replacement, e.g.
# sample-words : substitute-words
#
# Any substitute-word can be marked by preceding '*' character,
# which means do not lexize this word
# Docs: http://www.sai.msu.su/~megera/oddmuse/index.cgi/Thesaurus_dictionary

one two three : *123
one two : *12
one : *1
two : *2

#foo bar : blah blah
#f bar : fbar
#e bar : ebar
#g bar bar : gbarbar
#asd:sdffff
#qwerty:qwer wert erty
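The DictFile and Dictionary parameters handled by thesaurus_init in dict_thesaurus.c map onto this file plus a normalizing subdictionary. A minimal sketch, again using the release-level SQL syntax and a hypothetical dictionary name:

    CREATE TEXT SEARCH DICTIONARY thesaurus_sample_dict (
        TEMPLATE   = thesaurus,
        DictFile   = thesaurus_sample,          -- this .ths file
        Dictionary = pg_catalog.english_stem    -- subdictionary used to normalize sample words
    );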
363
src/backend/tsearch/to_tsany.c
Normal file
@@ -0,0 +1,363 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* to_tsany.c
|
||||
* to_ts* function definitions
|
||||
*
|
||||
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/tsearch/to_tsany.c,v 1.1 2007/08/21 01:11:18 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#include "postgres.h"
|
||||
|
||||
#include "catalog/namespace.h"
|
||||
#include "tsearch/ts_cache.h"
|
||||
#include "tsearch/ts_utils.h"
|
||||
#include "utils/builtins.h"
|
||||
#include "utils/syscache.h"
|
||||
|
||||
|
||||
Datum
|
||||
get_current_ts_config(PG_FUNCTION_ARGS)
|
||||
{
|
||||
PG_RETURN_OID(getTSCurrentConfig(true));
|
||||
}
|
||||
|
||||
/*
|
||||
* to_tsvector
|
||||
*/
|
||||
static int
|
||||
compareWORD(const void *a, const void *b)
|
||||
{
|
||||
if (((ParsedWord *) a)->len == ((ParsedWord *) b)->len)
|
||||
{
|
||||
int res = strncmp(
|
||||
((ParsedWord *) a)->word,
|
||||
((ParsedWord *) b)->word,
|
||||
((ParsedWord *) b)->len);
|
||||
|
||||
if (res == 0)
|
||||
return (((ParsedWord *) a)->pos.pos > ((ParsedWord *) b)->pos.pos) ? 1 : -1;
|
||||
return res;
|
||||
}
|
||||
return (((ParsedWord *) a)->len > ((ParsedWord *) b)->len) ? 1 : -1;
|
||||
}
|
||||
|
||||
static int
|
||||
uniqueWORD(ParsedWord * a, int4 l)
|
||||
{
|
||||
ParsedWord *ptr,
|
||||
*res;
|
||||
int tmppos;
|
||||
|
||||
if (l == 1)
|
||||
{
|
||||
tmppos = LIMITPOS(a->pos.pos);
|
||||
a->alen = 2;
|
||||
a->pos.apos = (uint16 *) palloc(sizeof(uint16) * a->alen);
|
||||
a->pos.apos[0] = 1;
|
||||
a->pos.apos[1] = tmppos;
|
||||
return l;
|
||||
}
|
||||
|
||||
res = a;
|
||||
ptr = a + 1;
|
||||
|
||||
qsort((void *) a, l, sizeof(ParsedWord), compareWORD);
|
||||
tmppos = LIMITPOS(a->pos.pos);
|
||||
a->alen = 2;
|
||||
a->pos.apos = (uint16 *) palloc(sizeof(uint16) * a->alen);
|
||||
a->pos.apos[0] = 1;
|
||||
a->pos.apos[1] = tmppos;
|
||||
|
||||
while (ptr - a < l)
|
||||
{
|
||||
if (!(ptr->len == res->len &&
|
||||
strncmp(ptr->word, res->word, res->len) == 0))
|
||||
{
|
||||
res++;
|
||||
res->len = ptr->len;
|
||||
res->word = ptr->word;
|
||||
tmppos = LIMITPOS(ptr->pos.pos);
|
||||
res->alen = 2;
|
||||
res->pos.apos = (uint16 *) palloc(sizeof(uint16) * res->alen);
|
||||
res->pos.apos[0] = 1;
|
||||
res->pos.apos[1] = tmppos;
|
||||
}
|
||||
else
|
||||
{
|
||||
pfree(ptr->word);
|
||||
if (res->pos.apos[0] < MAXNUMPOS - 1 && res->pos.apos[res->pos.apos[0]] != MAXENTRYPOS - 1)
|
||||
{
|
||||
if (res->pos.apos[0] + 1 >= res->alen)
|
||||
{
|
||||
res->alen *= 2;
|
||||
res->pos.apos = (uint16 *) repalloc(res->pos.apos, sizeof(uint16) * res->alen);
|
||||
}
|
||||
if (res->pos.apos[0] == 0 || res->pos.apos[res->pos.apos[0]] != LIMITPOS(ptr->pos.pos))
|
||||
{
|
||||
res->pos.apos[res->pos.apos[0] + 1] = LIMITPOS(ptr->pos.pos);
|
||||
res->pos.apos[0]++;
|
||||
}
|
||||
}
|
||||
}
|
||||
ptr++;
|
||||
}
|
||||
|
||||
return res + 1 - a;
|
||||
}
|
||||
|
||||
/*
|
||||
* make value of tsvector, given parsed text
|
||||
*/
|
||||
TSVector
|
||||
make_tsvector(ParsedText *prs)
|
||||
{
|
||||
int4 i,
|
||||
j,
|
||||
lenstr = 0,
|
||||
totallen;
|
||||
TSVector in;
|
||||
WordEntry *ptr;
|
||||
char *str,
|
||||
*cur;
|
||||
|
||||
prs->curwords = uniqueWORD(prs->words, prs->curwords);
|
||||
for (i = 0; i < prs->curwords; i++)
|
||||
{
|
||||
lenstr += SHORTALIGN(prs->words[i].len);
|
||||
|
||||
if (prs->words[i].alen)
|
||||
lenstr += sizeof(uint16) + prs->words[i].pos.apos[0] * sizeof(WordEntryPos);
|
||||
}
|
||||
|
||||
totallen = CALCDATASIZE(prs->curwords, lenstr);
|
||||
in = (TSVector) palloc0(totallen);
|
||||
SET_VARSIZE(in, totallen);
|
||||
in->size = prs->curwords;
|
||||
|
||||
ptr = ARRPTR(in);
|
||||
cur = str = STRPTR(in);
|
||||
for (i = 0; i < prs->curwords; i++)
|
||||
{
|
||||
ptr->len = prs->words[i].len;
|
||||
if (cur - str > MAXSTRPOS)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_SYNTAX_ERROR),
|
||||
errmsg("string is too long for tsvector")));
|
||||
ptr->pos = cur - str;
|
||||
memcpy((void *) cur, (void *) prs->words[i].word, prs->words[i].len);
|
||||
pfree(prs->words[i].word);
|
||||
cur += SHORTALIGN(prs->words[i].len);
|
||||
if (prs->words[i].alen)
|
||||
{
|
||||
WordEntryPos *wptr;
|
||||
|
||||
ptr->haspos = 1;
|
||||
*(uint16 *) cur = prs->words[i].pos.apos[0];
|
||||
wptr = POSDATAPTR(in, ptr);
|
||||
for (j = 0; j < *(uint16 *) cur; j++)
|
||||
{
|
||||
WEP_SETWEIGHT(wptr[j], 0);
|
||||
WEP_SETPOS(wptr[j], prs->words[i].pos.apos[j + 1]);
|
||||
}
|
||||
cur += sizeof(uint16) + prs->words[i].pos.apos[0] * sizeof(WordEntryPos);
|
||||
pfree(prs->words[i].pos.apos);
|
||||
}
|
||||
else
|
||||
ptr->haspos = 0;
|
||||
ptr++;
|
||||
}
|
||||
pfree(prs->words);
|
||||
return in;
|
||||
}
|
||||
|
||||
Datum
|
||||
to_tsvector_byid(PG_FUNCTION_ARGS)
|
||||
{
|
||||
Oid cfgId = PG_GETARG_OID(0);
|
||||
text *in = PG_GETARG_TEXT_P(1);
|
||||
ParsedText prs;
|
||||
TSVector out;
|
||||
|
||||
prs.lenwords = (VARSIZE(in) - VARHDRSZ) / 6; /* just estimation of
|
||||
* word's number */
|
||||
if (prs.lenwords == 0)
|
||||
prs.lenwords = 2;
|
||||
prs.curwords = 0;
|
||||
prs.pos = 0;
|
||||
prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords);
|
||||
|
||||
parsetext(cfgId, &prs, VARDATA(in), VARSIZE(in) - VARHDRSZ);
|
||||
PG_FREE_IF_COPY(in, 1);
|
||||
|
||||
if (prs.curwords)
|
||||
out = make_tsvector(&prs);
|
||||
else
|
||||
{
|
||||
pfree(prs.words);
|
||||
out = palloc(CALCDATASIZE(0, 0));
|
||||
SET_VARSIZE(out, CALCDATASIZE(0, 0));
|
||||
out->size = 0;
|
||||
}
|
||||
|
||||
PG_RETURN_POINTER(out);
|
||||
}
|
||||
|
||||
Datum
|
||||
to_tsvector(PG_FUNCTION_ARGS)
|
||||
{
|
||||
text *in = PG_GETARG_TEXT_P(0);
|
||||
Oid cfgId;
|
||||
|
||||
cfgId = getTSCurrentConfig(true);
|
||||
PG_RETURN_DATUM(DirectFunctionCall2(to_tsvector_byid,
|
||||
ObjectIdGetDatum(cfgId),
|
||||
PointerGetDatum(in)));
|
||||
}
|
||||
|
||||
/*
|
||||
* to_tsquery
|
||||
*/
|
||||
|
||||
|
||||
/*
|
||||
* This function is used for morph parsing
|
||||
*/
|
||||
static void
|
||||
pushval_morph(TSQueryParserState * state, int typeval, char *strval, int lenval, int2 weight)
|
||||
{
|
||||
int4 count = 0;
|
||||
ParsedText prs;
|
||||
uint32 variant,
|
||||
pos,
|
||||
cntvar = 0,
|
||||
cntpos = 0,
|
||||
cnt = 0;
|
||||
|
||||
prs.lenwords = 4;
|
||||
prs.curwords = 0;
|
||||
prs.pos = 0;
|
||||
prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords);
|
||||
|
||||
parsetext(state->cfg_id, &prs, strval, lenval);
|
||||
|
||||
if (prs.curwords > 0)
|
||||
{
|
||||
|
||||
while (count < prs.curwords)
|
||||
{
|
||||
pos = prs.words[count].pos.pos;
|
||||
cntvar = 0;
|
||||
while (count < prs.curwords && pos == prs.words[count].pos.pos)
|
||||
{
|
||||
variant = prs.words[count].nvariant;
|
||||
|
||||
cnt = 0;
|
||||
while (count < prs.curwords && pos == prs.words[count].pos.pos && variant == prs.words[count].nvariant)
|
||||
{
|
||||
|
||||
pushval_asis(state, VAL, prs.words[count].word, prs.words[count].len, weight);
|
||||
pfree(prs.words[count].word);
|
||||
if (cnt)
|
||||
pushquery(state, OPR, (int4) '&', 0, 0, 0);
|
||||
cnt++;
|
||||
count++;
|
||||
}
|
||||
|
||||
if (cntvar)
|
||||
pushquery(state, OPR, (int4) '|', 0, 0, 0);
|
||||
cntvar++;
|
||||
}
|
||||
|
||||
if (cntpos)
|
||||
pushquery(state, OPR, (int4) '&', 0, 0, 0);
|
||||
|
||||
cntpos++;
|
||||
}
|
||||
|
||||
pfree(prs.words);
|
||||
|
||||
}
|
||||
else
|
||||
pushval_asis(state, VALSTOP, NULL, 0, 0);
|
||||
}
|
||||
|
||||
Datum
|
||||
to_tsquery_byid(PG_FUNCTION_ARGS)
|
||||
{
|
||||
Oid cfgid = PG_GETARG_OID(0);
|
||||
text *in = PG_GETARG_TEXT_P(1);
|
||||
TSQuery query;
|
||||
QueryItem *res;
|
||||
int4 len;
|
||||
|
||||
query = parse_tsquery(TextPGetCString(in), pushval_morph, cfgid, false);
|
||||
|
||||
if (query->size == 0)
|
||||
PG_RETURN_TSQUERY(query);
|
||||
|
||||
res = clean_fakeval(GETQUERY(query), &len);
|
||||
if (!res)
|
||||
{
|
||||
SET_VARSIZE(query, HDRSIZETQ);
|
||||
query->size = 0;
|
||||
PG_RETURN_POINTER(query);
|
||||
}
|
||||
memcpy((void *) GETQUERY(query), (void *) res, len * sizeof(QueryItem));
|
||||
pfree(res);
|
||||
PG_RETURN_TSQUERY(query);
|
||||
}
|
||||
|
||||
Datum
|
||||
to_tsquery(PG_FUNCTION_ARGS)
|
||||
{
|
||||
text *in = PG_GETARG_TEXT_P(0);
|
||||
Oid cfgId;
|
||||
|
||||
cfgId = getTSCurrentConfig(true);
|
||||
PG_RETURN_DATUM(DirectFunctionCall2(to_tsquery_byid,
|
||||
ObjectIdGetDatum(cfgId),
|
||||
PointerGetDatum(in)));
|
||||
}
|
||||
|
||||
Datum
|
||||
plainto_tsquery_byid(PG_FUNCTION_ARGS)
|
||||
{
|
||||
Oid cfgid = PG_GETARG_OID(0);
|
||||
text *in = PG_GETARG_TEXT_P(1);
|
||||
TSQuery query;
|
||||
QueryItem *res;
|
||||
int4 len;
|
||||
|
||||
query = parse_tsquery(TextPGetCString(in), pushval_morph, cfgid, true);
|
||||
|
||||
if (query->size == 0)
|
||||
PG_RETURN_TSQUERY(query);
|
||||
|
||||
res = clean_fakeval(GETQUERY(query), &len);
|
||||
if (!res)
|
||||
{
|
||||
SET_VARSIZE(query, HDRSIZETQ);
|
||||
query->size = 0;
|
||||
PG_RETURN_POINTER(query);
|
||||
}
|
||||
memcpy((void *) GETQUERY(query), (void *) res, len * sizeof(QueryItem));
|
||||
pfree(res);
|
||||
PG_RETURN_POINTER(query);
|
||||
}
|
||||
|
||||
Datum
|
||||
plainto_tsquery(PG_FUNCTION_ARGS)
|
||||
{
|
||||
text *in = PG_GETARG_TEXT_P(0);
|
||||
Oid cfgId;
|
||||
|
||||
cfgId = getTSCurrentConfig(true);
|
||||
PG_RETURN_DATUM(DirectFunctionCall2(plainto_tsquery_byid,
|
||||
ObjectIdGetDatum(cfgId),
|
||||
PointerGetDatum(in)));
|
||||
}
|
241
src/backend/tsearch/ts_locale.c
Normal file
@@ -0,0 +1,241 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* ts_locale.c
|
||||
* locale compatibility layer for tsearch
|
||||
*
|
||||
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/tsearch/ts_locale.c,v 1.1 2007/08/21 01:11:18 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#include "postgres.h"
|
||||
|
||||
#include "tsearch/ts_locale.h"
|
||||
#include "tsearch/ts_public.h"
|
||||
|
||||
#ifdef TS_USE_WIDE
|
||||
|
||||
#ifdef WIN32
|
||||
|
||||
size_t
|
||||
wchar2char(char *to, const wchar_t *from, size_t len)
|
||||
{
|
||||
if (len == 0)
|
||||
return 0;
|
||||
|
||||
if (GetDatabaseEncoding() == PG_UTF8)
|
||||
{
|
||||
int r;
|
||||
|
||||
r = WideCharToMultiByte(CP_UTF8, 0, from, -1, to, len,
|
||||
NULL, NULL);
|
||||
|
||||
if (r == 0)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
|
||||
errmsg("UTF-16 to UTF-8 translation failed: %lu",
|
||||
GetLastError())));
|
||||
Assert(r <= len);
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
return wcstombs(to, from, len);
|
||||
}
|
||||
#endif /* WIN32 */
|
||||
|
||||
size_t
|
||||
char2wchar(wchar_t *to, const char *from, size_t len)
|
||||
{
|
||||
if (len == 0)
|
||||
return 0;
|
||||
|
||||
#ifdef WIN32
|
||||
if (GetDatabaseEncoding() == PG_UTF8)
|
||||
{
|
||||
int r;
|
||||
|
||||
r = MultiByteToWideChar(CP_UTF8, 0, from, len, to, len);
|
||||
|
||||
if (!r)
|
||||
{
|
||||
pg_verifymbstr(from, len, false);
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
|
||||
errmsg("invalid multibyte character for locale"),
|
||||
errhint("The server's LC_CTYPE locale is probably incompatible with the database encoding.")));
|
||||
}
|
||||
|
||||
Assert(r <= len);
|
||||
|
||||
return r;
|
||||
}
|
||||
else
|
||||
#endif /* WIN32 */
|
||||
if (lc_ctype_is_c())
|
||||
{
|
||||
/*
|
||||
* pg_mb2wchar_with_len always adds trailing '\0', so 'to' should be
|
||||
* allocated with sufficient space
|
||||
*/
|
||||
return pg_mb2wchar_with_len(from, (pg_wchar *) to, len);
|
||||
}
|
||||
else
|
||||
{
|
||||
/*
|
||||
* mbstowcs requires a terminating '\0'
|
||||
*/
|
||||
char *str = pnstrdup(from, len);
|
||||
size_t tolen;
|
||||
|
||||
tolen = mbstowcs(to, str, len);
|
||||
pfree(str);
|
||||
|
||||
return tolen;
|
||||
}
|
||||
}
|
||||
|
||||
int
|
||||
_t_isalpha(const char *ptr)
|
||||
{
|
||||
wchar_t character[2];
|
||||
|
||||
if (lc_ctype_is_c())
|
||||
return isalpha(TOUCHAR(ptr));
|
||||
|
||||
char2wchar(character, ptr, 1);
|
||||
|
||||
return iswalpha((wint_t) *character);
|
||||
}
|
||||
|
||||
int
|
||||
_t_isprint(const char *ptr)
|
||||
{
|
||||
wchar_t character[2];
|
||||
|
||||
if (lc_ctype_is_c())
|
||||
return isprint(TOUCHAR(ptr));
|
||||
|
||||
char2wchar(character, ptr, 1);
|
||||
|
||||
return iswprint((wint_t) *character);
|
||||
}
|
||||
#endif /* TS_USE_WIDE */
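
For illustration only (not part of the patch): the same trick used by _t_isalpha and _t_isprint above -- converting a single multibyte character to a wide character before classifying it -- can be sketched in plain standard C with mbtowc. All names below are hypothetical.

#include <locale.h>
#include <stdlib.h>
#include <wctype.h>

/* Hypothetical helper: is the multibyte character at *ptr alphabetic? */
static int
mb_isalpha(const char *ptr)
{
	wchar_t		wc;

	/* convert at most MB_CUR_MAX bytes (one character) to a wide char */
	if (mbtowc(&wc, ptr, MB_CUR_MAX) <= 0)
		return 0;				/* empty or invalid sequence */
	return iswalpha((wint_t) wc);
}

int
main(void)
{
	setlocale(LC_CTYPE, "");	/* use the environment's locale */
	return mb_isalpha("a") ? 0 : 1;
}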
|
||||
|
||||
/*
|
||||
* Convert C-string from UTF8 to server encoding and
|
||||
* lower it
|
||||
*/
|
||||
char *
|
||||
recode_and_lowerstr(char *str)
|
||||
{
|
||||
char *recoded;
|
||||
char *ret;
|
||||
|
||||
recoded = (char *) pg_do_encoding_conversion((unsigned char *) str, strlen(str),
|
||||
PG_UTF8, GetDatabaseEncoding());
|
||||
|
||||
if (recoded == NULL)
|
||||
elog(ERROR, "encoding conversion failed");
|
||||
|
||||
ret = lowerstr(recoded);
|
||||
|
||||
if (recoded != str)
|
||||
pfree(recoded);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
char *
|
||||
lowerstr(char *str)
|
||||
{
|
||||
return lowerstr_with_len(str, strlen(str));
|
||||
}
|
||||
|
||||
char *
|
||||
lowerstr_with_len(char *str, int len)
|
||||
{
|
||||
char *ptr = str;
|
||||
char *out;
|
||||
|
||||
if (len == 0)
|
||||
return pstrdup("");
|
||||
|
||||
#ifdef TS_USE_WIDE
|
||||
|
||||
/*
|
||||
* Use wide char code only when max encoding length > 1 and ctype != C.
|
||||
* Some operating systems fail with multi-byte encodings and a C locale.
|
||||
* Also, for a C locale there is no need to process as multibyte.
* Adapted from backend/utils/adt/oracle_compat.c (Teodor)
|
||||
*/
|
||||
if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c())
|
||||
{
|
||||
wchar_t *wstr,
|
||||
*wptr;
|
||||
int wlen;
|
||||
|
||||
/*
|
||||
* Allocate one wchar_t per byte for the worst case (the number of
* characters is at most len, the number of bytes), plus 1 wchar_t for
* the trailing zero, because wchar2char (really wcstombs) wants a
* zero-terminated string
|
||||
*/
|
||||
wptr = wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1));
|
||||
|
||||
/*
|
||||
* str SHOULD be a null-terminated C string, so wlen contains the
* number of converted characters
|
||||
*/
|
||||
wlen = char2wchar(wstr, str, len);
|
||||
if (wlen < 0)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
|
||||
errmsg("translation failed from server encoding to wchar_t")));
|
||||
|
||||
Assert(wlen <= len);
|
||||
wstr[wlen] = 0;
|
||||
|
||||
while (*wptr)
|
||||
{
|
||||
*wptr = towlower((wint_t) *wptr);
|
||||
wptr++;
|
||||
}
|
||||
|
||||
/*
|
||||
* Alloc result string for worst case + '\0'
|
||||
*/
|
||||
len = sizeof(char) * pg_database_encoding_max_length() * (wlen + 1);
|
||||
out = (char *) palloc(len);
|
||||
|
||||
/*
|
||||
* wlen is now a number of bytes, which is always >= the number of characters
|
||||
*/
|
||||
wlen = wchar2char(out, wstr, len);
|
||||
pfree(wstr);
|
||||
|
||||
if (wlen < 0)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
|
||||
errmsg("translation failed from wchar_t to server encoding %d", errno)));
|
||||
Assert(wlen <= len);
|
||||
out[wlen] = '\0';
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
char *outptr;
|
||||
|
||||
outptr = out = (char *) palloc(sizeof(char) * (len + 1));
|
||||
while (*ptr && ptr - str < len)
|
||||
{
|
||||
*outptr++ = tolower(*(unsigned char *) ptr);
|
||||
ptr++;
|
||||
}
|
||||
*outptr = '\0';
|
||||
}
|
||||
|
||||
return out;
|
||||
}
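
For illustration only (not part of the patch): a minimal standalone sketch of the wide-character lowercasing path above, using the standard mbstowcs/towlower/wcstombs calls in place of the backend wrappers and allocator. The function name and error handling here are hypothetical.

#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>
#include <wctype.h>

/* Hypothetical standalone version of the wide-character lowercasing path. */
static char *
lower_mb(const char *str)
{
	size_t		len = strlen(str);
	wchar_t    *wstr = malloc(sizeof(wchar_t) * (len + 1));
	char	   *out = malloc(MB_CUR_MAX * (len + 1));
	size_t		wlen,
				i;

	if (wstr == NULL || out == NULL)
	{
		free(wstr);
		free(out);
		return NULL;
	}

	/* bytes >= characters, so len + 1 wide chars is always enough */
	wlen = mbstowcs(wstr, str, len + 1);
	if (wlen == (size_t) -1)
	{
		free(wstr);
		free(out);
		return NULL;			/* invalid multibyte input */
	}
	for (i = 0; i < wlen; i++)
		wstr[i] = towlower((wint_t) wstr[i]);
	wstr[wlen] = L'\0';

	/* worst case: every character expands to MB_CUR_MAX bytes, plus '\0' */
	if (wcstombs(out, wstr, MB_CUR_MAX * (len + 1)) == (size_t) -1)
	{
		free(wstr);
		free(out);
		return NULL;
	}
	free(wstr);
	return out;
}

int
main(void)
{
	char	   *s;

	setlocale(LC_CTYPE, "");	/* use the environment's locale */
	s = lower_mb("Hello, World");
	if (s != NULL)
	{
		puts(s);
		free(s);
	}
	return 0;
}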
|
626
src/backend/tsearch/ts_parse.c
Normal file
@@ -0,0 +1,626 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* ts_parse.c
|
||||
* main parse functions for tsearch
|
||||
*
|
||||
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/tsearch/ts_parse.c,v 1.1 2007/08/21 01:11:18 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
#include "postgres.h"
|
||||
|
||||
#include "tsearch/ts_cache.h"
|
||||
#include "tsearch/ts_public.h"
|
||||
#include "tsearch/ts_utils.h"
|
||||
|
||||
#define IGNORE_LONGLEXEME 1
|
||||
|
||||
/*
|
||||
* Lexize subsystem
|
||||
*/
|
||||
|
||||
typedef struct ParsedLex
|
||||
{
|
||||
int type;
|
||||
char *lemm;
|
||||
int lenlemm;
|
||||
bool resfollow;
|
||||
struct ParsedLex *next;
|
||||
} ParsedLex;
|
||||
|
||||
typedef struct ListParsedLex
|
||||
{
|
||||
ParsedLex *head;
|
||||
ParsedLex *tail;
|
||||
} ListParsedLex;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
TSConfigCacheEntry *cfg;
|
||||
Oid curDictId;
|
||||
int posDict;
|
||||
DictSubState dictState;
|
||||
ParsedLex *curSub;
|
||||
ListParsedLex towork; /* current list to work on */
ListParsedLex waste; /* list of lexemes that have already been lexized */
|
||||
|
||||
/*
|
||||
* fields to store the last variant to lexize (used by a thesaurus or
* similar dictionary, which wants several lexemes)
*/
|
||||
|
||||
ParsedLex *lastRes;
|
||||
TSLexeme *tmpRes;
|
||||
} LexizeData;
|
||||
|
||||
static void
|
||||
LexizeInit(LexizeData * ld, TSConfigCacheEntry * cfg)
|
||||
{
|
||||
ld->cfg = cfg;
|
||||
ld->curDictId = InvalidOid;
|
||||
ld->posDict = 0;
|
||||
ld->towork.head = ld->towork.tail = ld->curSub = NULL;
|
||||
ld->waste.head = ld->waste.tail = NULL;
|
||||
ld->lastRes = NULL;
|
||||
ld->tmpRes = NULL;
|
||||
}
|
||||
|
||||
static void
|
||||
LPLAddTail(ListParsedLex * list, ParsedLex * newpl)
|
||||
{
|
||||
if (list->tail)
|
||||
{
|
||||
list->tail->next = newpl;
|
||||
list->tail = newpl;
|
||||
}
|
||||
else
|
||||
list->head = list->tail = newpl;
|
||||
newpl->next = NULL;
|
||||
}
|
||||
|
||||
static ParsedLex *
|
||||
LPLRemoveHead(ListParsedLex * list)
|
||||
{
|
||||
ParsedLex *res = list->head;
|
||||
|
||||
if (list->head)
|
||||
list->head = list->head->next;
|
||||
|
||||
if (list->head == NULL)
|
||||
list->tail = NULL;
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
static void
|
||||
LexizeAddLemm(LexizeData * ld, int type, char *lemm, int lenlemm)
|
||||
{
|
||||
ParsedLex *newpl = (ParsedLex *) palloc(sizeof(ParsedLex));
|
||||
newpl->type = type;
|
||||
newpl->lemm = lemm;
|
||||
newpl->lenlemm = lenlemm;
|
||||
LPLAddTail(&ld->towork, newpl);
|
||||
ld->curSub = ld->towork.tail;
|
||||
}
|
||||
|
||||
static void
|
||||
RemoveHead(LexizeData * ld)
|
||||
{
|
||||
LPLAddTail(&ld->waste, LPLRemoveHead(&ld->towork));
|
||||
|
||||
ld->posDict = 0;
|
||||
}
|
||||
|
||||
static void
|
||||
setCorrLex(LexizeData * ld, ParsedLex ** correspondLexem)
|
||||
{
|
||||
if (correspondLexem)
|
||||
{
|
||||
*correspondLexem = ld->waste.head;
|
||||
}
|
||||
else
|
||||
{
|
||||
ParsedLex *tmp,
|
||||
*ptr = ld->waste.head;
|
||||
|
||||
while (ptr)
|
||||
{
|
||||
tmp = ptr->next;
|
||||
pfree(ptr);
|
||||
ptr = tmp;
|
||||
}
|
||||
}
|
||||
ld->waste.head = ld->waste.tail = NULL;
|
||||
}
|
||||
|
||||
static void
|
||||
moveToWaste(LexizeData * ld, ParsedLex * stop)
|
||||
{
|
||||
bool go = true;
|
||||
|
||||
while (ld->towork.head && go)
|
||||
{
|
||||
if (ld->towork.head == stop)
|
||||
{
|
||||
ld->curSub = stop->next;
|
||||
go = false;
|
||||
}
|
||||
RemoveHead(ld);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
setNewTmpRes(LexizeData * ld, ParsedLex * lex, TSLexeme * res)
|
||||
{
|
||||
if (ld->tmpRes)
|
||||
{
|
||||
TSLexeme *ptr;
|
||||
|
||||
for (ptr = ld->tmpRes; ptr->lexeme; ptr++)
|
||||
pfree(ptr->lexeme);
|
||||
pfree(ld->tmpRes);
|
||||
}
|
||||
ld->tmpRes = res;
|
||||
ld->lastRes = lex;
|
||||
}
|
||||
|
||||
static TSLexeme *
|
||||
LexizeExec(LexizeData * ld, ParsedLex ** correspondLexem)
|
||||
{
|
||||
int i;
|
||||
ListDictionary *map;
|
||||
TSDictionaryCacheEntry *dict;
|
||||
TSLexeme *res;
|
||||
|
||||
if (ld->curDictId == InvalidOid)
|
||||
{
|
||||
/*
|
||||
* usual mode: dictionary wants only one word, but we should keep in
* mind that we must go through the whole stack
|
||||
*/
|
||||
|
||||
while (ld->towork.head)
|
||||
{
|
||||
ParsedLex *curVal = ld->towork.head;
|
||||
|
||||
map = ld->cfg->map + curVal->type;
|
||||
|
||||
if (curVal->type == 0 || curVal->type >= ld->cfg->lenmap || map->len == 0)
|
||||
{
|
||||
/* skip this type of lexeme */
|
||||
RemoveHead(ld);
|
||||
continue;
|
||||
}
|
||||
|
||||
for (i = ld->posDict; i < map->len; i++)
|
||||
{
|
||||
dict = lookup_ts_dictionary_cache(map->dictIds[i]);
|
||||
|
||||
ld->dictState.isend = ld->dictState.getnext = false;
|
||||
ld->dictState.private = NULL;
|
||||
res = (TSLexeme *) DatumGetPointer(FunctionCall4(
|
||||
&(dict->lexize),
|
||||
PointerGetDatum(dict->dictData),
|
||||
PointerGetDatum(curVal->lemm),
|
||||
Int32GetDatum(curVal->lenlemm),
|
||||
PointerGetDatum(&ld->dictState)
|
||||
));
|
||||
|
||||
if (ld->dictState.getnext)
|
||||
{
|
||||
/*
|
||||
* dictionary wants next word, so setup and store current
|
||||
* position and go to multiword mode
|
||||
*/
|
||||
|
||||
ld->curDictId = DatumGetObjectId(map->dictIds[i]);
|
||||
ld->posDict = i + 1;
|
||||
ld->curSub = curVal->next;
|
||||
if (res)
|
||||
setNewTmpRes(ld, curVal, res);
|
||||
return LexizeExec(ld, correspondLexem);
|
||||
}
|
||||
|
||||
if (!res) /* dictionary doesn't know this lexeme */
|
||||
continue;
|
||||
|
||||
RemoveHead(ld);
|
||||
setCorrLex(ld, correspondLexem);
|
||||
return res;
|
||||
}
|
||||
|
||||
RemoveHead(ld);
|
||||
}
|
||||
}
|
||||
else
|
||||
{ /* curDictId is valid */
|
||||
dict = lookup_ts_dictionary_cache(ld->curDictId);
|
||||
|
||||
/*
|
||||
* Dictionary ld->curDictId asks us about following words
|
||||
*/
|
||||
|
||||
while (ld->curSub)
|
||||
{
|
||||
ParsedLex *curVal = ld->curSub;
|
||||
|
||||
map = ld->cfg->map + curVal->type;
|
||||
|
||||
if (curVal->type != 0)
|
||||
{
|
||||
bool dictExists = false;
|
||||
|
||||
if (curVal->type >= ld->cfg->lenmap || map->len == 0)
|
||||
{
|
||||
/* skip this type of lexeme */
|
||||
ld->curSub = curVal->next;
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* We should be sure that the current type of lexeme is recognized
* by our dictionary: we just check whether it exists in the list of
* dictionaries
|
||||
*/
|
||||
for (i = 0; i < map->len && !dictExists; i++)
|
||||
if (ld->curDictId == DatumGetObjectId(map->dictIds[i]))
|
||||
dictExists = true;
|
||||
|
||||
if (!dictExists)
|
||||
{
|
||||
/*
|
||||
* Dictionary can't work with the current type of lexeme,
* return to basic mode and redo all stored lexemes
|
||||
*/
|
||||
ld->curDictId = InvalidOid;
|
||||
return LexizeExec(ld, correspondLexem);
|
||||
}
|
||||
}
|
||||
|
||||
ld->dictState.isend = (curVal->type == 0) ? true : false;
|
||||
ld->dictState.getnext = false;
|
||||
|
||||
res = (TSLexeme *) DatumGetPointer(FunctionCall4(
|
||||
&(dict->lexize),
|
||||
PointerGetDatum(dict->dictData),
|
||||
PointerGetDatum(curVal->lemm),
|
||||
Int32GetDatum(curVal->lenlemm),
|
||||
PointerGetDatum(&ld->dictState)
|
||||
));
|
||||
|
||||
if (ld->dictState.getnext)
|
||||
{
|
||||
/* Dictionary wants one more */
|
||||
ld->curSub = curVal->next;
|
||||
if (res)
|
||||
setNewTmpRes(ld, curVal, res);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (res || ld->tmpRes)
|
||||
{
|
||||
/*
|
||||
* Dictionary normalized the lexemes, so we remove all used
* lexemes from the stack, return to basic mode and redo the end
* of the stack (if it exists)
|
||||
*/
|
||||
if (res)
|
||||
{
|
||||
moveToWaste(ld, ld->curSub);
|
||||
}
|
||||
else
|
||||
{
|
||||
res = ld->tmpRes;
|
||||
moveToWaste(ld, ld->lastRes);
|
||||
}
|
||||
|
||||
/* reset to initial state */
|
||||
ld->curDictId = InvalidOid;
|
||||
ld->posDict = 0;
|
||||
ld->lastRes = NULL;
|
||||
ld->tmpRes = NULL;
|
||||
setCorrLex(ld, correspondLexem);
|
||||
return res;
|
||||
}
|
||||
|
||||
/*
|
||||
* Dictionary doesn't want the next lexeme and didn't recognize
* anything, redo from ld->towork.head
|
||||
*/
|
||||
ld->curDictId = InvalidOid;
|
||||
return LexizeExec(ld, correspondLexem);
|
||||
}
|
||||
}
|
||||
|
||||
setCorrLex(ld, correspondLexem);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Parse string and lexize words
|
||||
*/
|
||||
void
|
||||
parsetext(Oid cfgId, ParsedText * prs, char *buf, int4 buflen)
|
||||
{
|
||||
int type,
|
||||
lenlemm;
|
||||
char *lemm = NULL;
|
||||
LexizeData ldata;
|
||||
TSLexeme *norms;
|
||||
TSConfigCacheEntry *cfg;
|
||||
TSParserCacheEntry *prsobj;
|
||||
void *prsdata;
|
||||
|
||||
cfg = lookup_ts_config_cache(cfgId);
|
||||
prsobj = lookup_ts_parser_cache(cfg->prsId);
|
||||
|
||||
prsdata = (void *) DatumGetPointer(FunctionCall2(&prsobj->prsstart,
|
||||
PointerGetDatum(buf),
|
||||
Int32GetDatum(buflen)));
|
||||
|
||||
LexizeInit(&ldata, cfg);
|
||||
|
||||
do
|
||||
{
|
||||
type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
|
||||
PointerGetDatum(prsdata),
|
||||
PointerGetDatum(&lemm),
|
||||
PointerGetDatum(&lenlemm)));
|
||||
|
||||
if (type > 0 && lenlemm >= MAXSTRLEN)
|
||||
{
|
||||
#ifdef IGNORE_LONGLEXEME
|
||||
ereport(NOTICE,
|
||||
(errcode(ERRCODE_SYNTAX_ERROR),
|
||||
errmsg("word is too long to be indexed"),
|
||||
errdetail("Words longer than %d characters are ignored.",
|
||||
MAXSTRLEN)));
|
||||
continue;
|
||||
#else
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_SYNTAX_ERROR),
|
||||
errmsg("word is too long to be indexed")));
|
||||
#endif
|
||||
}
|
||||
|
||||
LexizeAddLemm(&ldata, type, lemm, lenlemm);
|
||||
|
||||
while ((norms = LexizeExec(&ldata, NULL)) != NULL)
|
||||
{
|
||||
TSLexeme *ptr = norms;
|
||||
|
||||
prs->pos++; /* set pos */
|
||||
|
||||
while (ptr->lexeme)
|
||||
{
|
||||
if (prs->curwords == prs->lenwords)
|
||||
{
|
||||
prs->lenwords *= 2;
|
||||
prs->words = (ParsedWord *) repalloc((void *) prs->words, prs->lenwords * sizeof(ParsedWord));
|
||||
}
|
||||
|
||||
if (ptr->flags & TSL_ADDPOS)
|
||||
prs->pos++;
|
||||
prs->words[prs->curwords].len = strlen(ptr->lexeme);
|
||||
prs->words[prs->curwords].word = ptr->lexeme;
|
||||
prs->words[prs->curwords].nvariant = ptr->nvariant;
|
||||
prs->words[prs->curwords].alen = 0;
|
||||
prs->words[prs->curwords].pos.pos = LIMITPOS(prs->pos);
|
||||
ptr++;
|
||||
prs->curwords++;
|
||||
}
|
||||
pfree(norms);
|
||||
}
|
||||
} while (type > 0);
|
||||
|
||||
FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
|
||||
}
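
parsetext grows prs->words geometrically, doubling the array with repalloc whenever it fills up. Below is a rough standalone sketch of that growth pattern (not the patch's code), assuming plain malloc/realloc and a hypothetical StrList type.

#include <stdlib.h>
#include <string.h>

/* Hypothetical growable array of strings, mirroring the doubling pattern. */
typedef struct
{
	char	  **items;			/* allocated array, may be NULL initially */
	int			len;			/* allocated slots */
	int			cur;			/* slots in use */
} StrList;

/* Append a copy of 'word'; double the array whenever it fills up. */
static int
strlist_add(StrList *l, const char *word)
{
	if (l->cur == l->len)
	{
		int			newlen = (l->len == 0) ? 16 : l->len * 2;
		char	  **tmp = realloc(l->items, newlen * sizeof(char *));

		if (tmp == NULL)
			return -1;			/* out of memory */
		l->items = tmp;
		l->len = newlen;
	}
	l->items[l->cur] = strdup(word);	/* strdup: POSIX / common extension */
	if (l->items[l->cur] == NULL)
		return -1;
	l->cur++;
	return 0;
}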
|
||||
|
||||
/*
|
||||
* Headline framework
|
||||
*/
|
||||
static void
|
||||
hladdword(HeadlineText * prs, char *buf, int4 buflen, int type)
|
||||
{
|
||||
while (prs->curwords >= prs->lenwords)
|
||||
{
|
||||
prs->lenwords *= 2;
|
||||
prs->words = (HeadlineWord *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWord));
|
||||
}
|
||||
memset(&(prs->words[prs->curwords]), 0, sizeof(HeadlineWord));
|
||||
prs->words[prs->curwords].type = (uint8) type;
|
||||
prs->words[prs->curwords].len = buflen;
|
||||
prs->words[prs->curwords].word = palloc(buflen);
|
||||
memcpy(prs->words[prs->curwords].word, buf, buflen);
|
||||
prs->curwords++;
|
||||
}
|
||||
|
||||
static void
|
||||
hlfinditem(HeadlineText * prs, TSQuery query, char *buf, int buflen)
|
||||
{
|
||||
int i;
|
||||
QueryItem *item = GETQUERY(query);
|
||||
HeadlineWord *word;
|
||||
|
||||
while (prs->curwords + query->size >= prs->lenwords)
|
||||
{
|
||||
prs->lenwords *= 2;
|
||||
prs->words = (HeadlineWord *) repalloc((void *) prs->words, prs->lenwords * sizeof(HeadlineWord));
|
||||
}
|
||||
|
||||
word = &(prs->words[prs->curwords - 1]);
|
||||
for (i = 0; i < query->size; i++)
|
||||
{
|
||||
if (item->type == VAL && item->length == buflen && strncmp(GETOPERAND(query) + item->distance, buf, buflen) == 0)
|
||||
{
|
||||
if (word->item)
|
||||
{
|
||||
memcpy(&(prs->words[prs->curwords]), word, sizeof(HeadlineWord));
|
||||
prs->words[prs->curwords].item = item;
|
||||
prs->words[prs->curwords].repeated = 1;
|
||||
prs->curwords++;
|
||||
}
|
||||
else
|
||||
word->item = item;
|
||||
}
|
||||
item++;
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
addHLParsedLex(HeadlineText * prs, TSQuery query, ParsedLex * lexs, TSLexeme * norms)
|
||||
{
|
||||
ParsedLex *tmplexs;
|
||||
TSLexeme *ptr;
|
||||
|
||||
while (lexs)
|
||||
{
|
||||
|
||||
if (lexs->type > 0)
|
||||
hladdword(prs, lexs->lemm, lexs->lenlemm, lexs->type);
|
||||
|
||||
ptr = norms;
|
||||
while (ptr && ptr->lexeme)
|
||||
{
|
||||
hlfinditem(prs, query, ptr->lexeme, strlen(ptr->lexeme));
|
||||
ptr++;
|
||||
}
|
||||
|
||||
tmplexs = lexs->next;
|
||||
pfree(lexs);
|
||||
lexs = tmplexs;
|
||||
}
|
||||
|
||||
if (norms)
|
||||
{
|
||||
ptr = norms;
|
||||
while (ptr->lexeme)
|
||||
{
|
||||
pfree(ptr->lexeme);
|
||||
ptr++;
|
||||
}
|
||||
pfree(norms);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
hlparsetext(Oid cfgId, HeadlineText * prs, TSQuery query, char *buf, int4 buflen)
|
||||
{
|
||||
int type,
|
||||
lenlemm;
|
||||
char *lemm = NULL;
|
||||
LexizeData ldata;
|
||||
TSLexeme *norms;
|
||||
ParsedLex *lexs;
|
||||
TSConfigCacheEntry *cfg;
|
||||
TSParserCacheEntry *prsobj;
|
||||
void *prsdata;
|
||||
|
||||
cfg = lookup_ts_config_cache(cfgId);
|
||||
prsobj = lookup_ts_parser_cache(cfg->prsId);
|
||||
|
||||
prsdata = (void *) DatumGetPointer(FunctionCall2(&(prsobj->prsstart),
|
||||
PointerGetDatum(buf),
|
||||
Int32GetDatum(buflen)));
|
||||
|
||||
LexizeInit(&ldata, cfg);
|
||||
|
||||
do
|
||||
{
|
||||
type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
|
||||
PointerGetDatum(prsdata),
|
||||
PointerGetDatum(&lemm),
|
||||
PointerGetDatum(&lenlemm)));
|
||||
|
||||
if (type > 0 && lenlemm >= MAXSTRLEN)
|
||||
{
|
||||
#ifdef IGNORE_LONGLEXEME
|
||||
ereport(NOTICE,
|
||||
(errcode(ERRCODE_SYNTAX_ERROR),
|
||||
errmsg("word is too long to be indexed"),
|
||||
errdetail("Words longer than %d characters are ignored.",
|
||||
MAXSTRLEN)));
|
||||
continue;
|
||||
#else
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_SYNTAX_ERROR),
|
||||
errmsg("word is too long to be indexed")));
|
||||
#endif
|
||||
}
|
||||
|
||||
LexizeAddLemm(&ldata, type, lemm, lenlemm);
|
||||
|
||||
do
|
||||
{
|
||||
if ((norms = LexizeExec(&ldata, &lexs)) != NULL)
|
||||
addHLParsedLex(prs, query, lexs, norms);
|
||||
else
|
||||
addHLParsedLex(prs, query, lexs, NULL);
|
||||
} while (norms);
|
||||
|
||||
} while (type > 0);
|
||||
|
||||
FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
|
||||
}
|
||||
|
||||
text *
|
||||
generatHeadline(HeadlineText * prs)
|
||||
{
|
||||
text *out;
|
||||
int len = 128;
|
||||
char *ptr;
|
||||
HeadlineWord *wrd = prs->words;
|
||||
|
||||
out = (text *) palloc(len);
|
||||
ptr = ((char *) out) + VARHDRSZ;
|
||||
|
||||
while (wrd - prs->words < prs->curwords)
|
||||
{
|
||||
while (wrd->len + prs->stopsellen + prs->startsellen + (ptr - ((char *) out)) >= len)
|
||||
{
|
||||
int dist = ptr - ((char *) out);
|
||||
|
||||
len *= 2;
|
||||
out = (text *) repalloc(out, len);
|
||||
ptr = ((char *) out) + dist;
|
||||
}
|
||||
|
||||
if (wrd->in && !wrd->repeated)
|
||||
{
|
||||
if (wrd->replace)
|
||||
{
|
||||
*ptr = ' ';
|
||||
ptr++;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (wrd->selected)
|
||||
{
|
||||
memcpy(ptr, prs->startsel, prs->startsellen);
|
||||
ptr += prs->startsellen;
|
||||
}
|
||||
memcpy(ptr, wrd->word, wrd->len);
|
||||
ptr += wrd->len;
|
||||
if (wrd->selected)
|
||||
{
|
||||
memcpy(ptr, prs->stopsel, prs->stopsellen);
|
||||
ptr += prs->stopsellen;
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (!wrd->repeated)
|
||||
pfree(wrd->word);
|
||||
|
||||
wrd++;
|
||||
}
|
||||
|
||||
SET_VARSIZE(out, ptr - ((char *) out));
|
||||
return out;
|
||||
}
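
generatHeadline keeps a write cursor into a buffer that it grows with repalloc, so the cursor offset has to be saved and re-applied after each reallocation (the buffer may move). A rough standalone sketch of that pattern with realloc follows; the function and parameter names are hypothetical, not part of the patch.

#include <stdlib.h>
#include <string.h>

/*
 * Append 'src' to a growable buffer, keeping '*ptr' as the write cursor.
 * Caller initializes: buf = malloc(n); *len = n; *ptr = buf;
 * Returns the (possibly moved) buffer, or NULL on out-of-memory.
 */
static char *
append_grow(char *buf, size_t *len, char **ptr, const char *src)
{
	size_t		need = strlen(src);

	while ((size_t) (*ptr - buf) + need + 1 > *len)
	{
		size_t		dist = *ptr - buf;	/* save offset: buf may move */
		char	   *tmp;

		*len *= 2;
		tmp = realloc(buf, *len);
		if (tmp == NULL)
			return NULL;
		buf = tmp;
		*ptr = buf + dist;		/* recompute cursor in the new buffer */
	}
	memcpy(*ptr, src, need);
	*ptr += need;
	**ptr = '\0';
	return buf;
}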
|
330
src/backend/tsearch/ts_utils.c
Normal file
@@ -0,0 +1,330 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* ts_utils.c
|
||||
* various support functions
|
||||
*
|
||||
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/tsearch/ts_utils.c,v 1.1 2007/08/21 01:11:18 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
#include "postgres.h"
|
||||
|
||||
#include <ctype.h>
|
||||
|
||||
#include "miscadmin.h"
|
||||
#include "storage/fd.h"
|
||||
#include "tsearch/ts_locale.h"
|
||||
#include "tsearch/ts_public.h"
|
||||
#include "tsearch/ts_utils.h"
|
||||
#include "utils/builtins.h"
|
||||
|
||||
|
||||
#define CS_WAITKEY 0
|
||||
#define CS_INKEY 1
|
||||
#define CS_WAITEQ 2
|
||||
#define CS_WAITVALUE 3
|
||||
#define CS_INVALUE 4
|
||||
#define CS_IN2VALUE 5
|
||||
#define CS_WAITDELIM 6
|
||||
#define CS_INESC 7
|
||||
#define CS_IN2ESC 8
|
||||
|
||||
static char *
|
||||
nstrdup(char *ptr, int len)
|
||||
{
|
||||
char *res = palloc(len + 1),
|
||||
*cptr;
|
||||
|
||||
memcpy(res, ptr, len);
|
||||
res[len] = '\0';
|
||||
cptr = ptr = res;
|
||||
while (*ptr)
|
||||
{
|
||||
if (t_iseq(ptr, '\\'))
|
||||
ptr++;
|
||||
COPYCHAR(cptr, ptr);
|
||||
cptr += pg_mblen(ptr);
|
||||
ptr += pg_mblen(ptr);
|
||||
}
|
||||
*cptr = '\0';
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
/*
|
||||
* Parse a parameter string consisting of key = value clauses
|
||||
*/
|
||||
void
|
||||
parse_keyvalpairs(text *in, Map ** m)
|
||||
{
|
||||
Map *mptr;
|
||||
char *ptr = VARDATA(in),
|
||||
*begin = NULL;
|
||||
char num = 0;
|
||||
int state = CS_WAITKEY;
|
||||
|
||||
while (ptr - VARDATA(in) < VARSIZE(in) - VARHDRSZ)
|
||||
{
|
||||
if (t_iseq(ptr, ','))
|
||||
num++;
|
||||
ptr += pg_mblen(ptr);
|
||||
}
|
||||
|
||||
*m = mptr = (Map *) palloc(sizeof(Map) * (num + 2));
|
||||
memset(mptr, 0, sizeof(Map) * (num + 2));
|
||||
ptr = VARDATA(in);
|
||||
while (ptr - VARDATA(in) < VARSIZE(in) - VARHDRSZ)
|
||||
{
|
||||
if (state == CS_WAITKEY)
|
||||
{
|
||||
if (t_isalpha(ptr))
|
||||
{
|
||||
begin = ptr;
|
||||
state = CS_INKEY;
|
||||
}
|
||||
else if (!t_isspace(ptr))
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_SYNTAX_ERROR),
|
||||
errmsg("invalid parameter list format: \"%s\"",
|
||||
TextPGetCString(in))));
|
||||
}
|
||||
else if (state == CS_INKEY)
|
||||
{
|
||||
if (t_isspace(ptr))
|
||||
{
|
||||
mptr->key = nstrdup(begin, ptr - begin);
|
||||
state = CS_WAITEQ;
|
||||
}
|
||||
else if (t_iseq(ptr, '='))
|
||||
{
|
||||
mptr->key = nstrdup(begin, ptr - begin);
|
||||
state = CS_WAITVALUE;
|
||||
}
|
||||
else if (!t_isalpha(ptr))
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_SYNTAX_ERROR),
|
||||
errmsg("invalid parameter list format: \"%s\"",
|
||||
TextPGetCString(in))));
|
||||
}
|
||||
else if (state == CS_WAITEQ)
|
||||
{
|
||||
if (t_iseq(ptr, '='))
|
||||
state = CS_WAITVALUE;
|
||||
else if (!t_isspace(ptr))
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_SYNTAX_ERROR),
|
||||
errmsg("invalid parameter list format: \"%s\"",
|
||||
TextPGetCString(in))));
|
||||
}
|
||||
else if (state == CS_WAITVALUE)
|
||||
{
|
||||
if (t_iseq(ptr, '"'))
|
||||
{
|
||||
begin = ptr + 1;
|
||||
state = CS_INVALUE;
|
||||
}
|
||||
else if (!t_isspace(ptr))
|
||||
{
|
||||
begin = ptr;
|
||||
state = CS_IN2VALUE;
|
||||
}
|
||||
}
|
||||
else if (state == CS_INVALUE)
|
||||
{
|
||||
if (t_iseq(ptr, '"'))
|
||||
{
|
||||
mptr->value = nstrdup(begin, ptr - begin);
|
||||
mptr++;
|
||||
state = CS_WAITDELIM;
|
||||
}
|
||||
else if (t_iseq(ptr, '\\'))
|
||||
state = CS_INESC;
|
||||
}
|
||||
else if (state == CS_IN2VALUE)
|
||||
{
|
||||
if (t_isspace(ptr) || t_iseq(ptr, ','))
|
||||
{
|
||||
mptr->value = nstrdup(begin, ptr - begin);
|
||||
mptr++;
|
||||
state = (t_iseq(ptr, ',')) ? CS_WAITKEY : CS_WAITDELIM;
|
||||
}
|
||||
else if (t_iseq(ptr, '\\'))
|
||||
state = CS_INESC;
|
||||
}
|
||||
else if (state == CS_WAITDELIM)
|
||||
{
|
||||
if (t_iseq(ptr, ','))
|
||||
state = CS_WAITKEY;
|
||||
else if (!t_isspace(ptr))
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_SYNTAX_ERROR),
|
||||
errmsg("invalid parameter list format: \"%s\"",
|
||||
TextPGetCString(in))));
|
||||
}
|
||||
else if (state == CS_INESC)
|
||||
state = CS_INVALUE;
|
||||
else if (state == CS_IN2ESC)
|
||||
state = CS_IN2VALUE;
|
||||
else
|
||||
elog(ERROR, "unrecognized parse_keyvalpairs state: %d", state);
|
||||
ptr += pg_mblen(ptr);
|
||||
}
|
||||
|
||||
if (state == CS_IN2VALUE)
|
||||
{
|
||||
mptr->value = nstrdup(begin, ptr - begin);
|
||||
mptr++;
|
||||
}
|
||||
else if (!(state == CS_WAITDELIM || state == CS_WAITKEY))
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_SYNTAX_ERROR),
|
||||
errmsg("invalid parameter list format: \"%s\"",
|
||||
TextPGetCString(in))));
|
||||
}
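
parse_keyvalpairs above is a per-character state machine. As a rough illustration only (not the patch's code), here is a much-simplified ASCII-only sketch of the same idea, without quoting, backslash escapes, a separate delimiter state, or value trimming; all names are hypothetical.

#include <ctype.h>
#include <stdio.h>

/*
 * Hypothetical, much-simplified scanner for "key = value, key = value"
 * lists: ASCII only, no quotes, no escapes; a value runs until the next
 * ',' or the end of the string.
 */
static int
parse_pairs(const char *s)
{
	enum {WAITKEY, INKEY, WAITEQ, WAITVALUE, INVALUE} state = WAITKEY;
	const char *begin = NULL;

	for (;; s++)
	{
		char		c = *s;

		if (state == WAITKEY)
		{
			if (isalpha((unsigned char) c))
			{
				begin = s;
				state = INKEY;
			}
			else if (c != '\0' && !isspace((unsigned char) c))
				return -1;		/* unexpected character */
		}
		else if (state == INKEY)
		{
			if (c == '=' || c == '\0' || isspace((unsigned char) c))
			{
				printf("key: %.*s\n", (int) (s - begin), begin);
				state = (c == '=') ? WAITVALUE : WAITEQ;
			}
		}
		else if (state == WAITEQ)
		{
			if (c == '=')
				state = WAITVALUE;
			else if (c != '\0' && !isspace((unsigned char) c))
				return -1;
		}
		else if (state == WAITVALUE)
		{
			if (c != '\0' && !isspace((unsigned char) c))
			{
				begin = s;
				state = INVALUE;
			}
		}
		else					/* INVALUE */
		{
			if (c == ',' || c == '\0')
			{
				printf("value: %.*s\n", (int) (s - begin), begin);
				state = WAITKEY;
			}
		}

		if (c == '\0')
			break;
	}
	return (state == WAITKEY) ? 0 : -1;	/* trailing key without value is an error */
}

int
main(void)
{
	return (parse_pairs("keyA = value1, keyB = value2") == 0) ? 0 : 1;
}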
|
||||
|
||||
/*
|
||||
* Given the base name and extension of a tsearch config file, return
|
||||
* its full path name. The base name is assumed to be user-supplied,
|
||||
* and is checked to prevent pathname attacks. The extension is assumed
|
||||
* to be safe.
|
||||
*
|
||||
* The result is a palloc'd string.
|
||||
*/
|
||||
char *
|
||||
get_tsearch_config_filename(const char *basename,
|
||||
const char *extension)
|
||||
{
|
||||
char sharepath[MAXPGPATH];
|
||||
char *result;
|
||||
const char *p;
|
||||
|
||||
/*
|
||||
* We enforce that the basename is all alpha characters. This may be
|
||||
* overly restrictive, but we don't want to allow access to anything
|
||||
* outside the tsearch_data directory, so for instance '/' *must* be
|
||||
* rejected. This is the same test used for timezonesets names.
|
||||
*/
|
||||
for (p = basename; *p; p++)
|
||||
{
|
||||
if (!isalpha((unsigned char) *p))
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
||||
errmsg("invalid text search configuration file name \"%s\"",
|
||||
basename)));
|
||||
}
|
||||
|
||||
get_share_path(my_exec_path, sharepath);
|
||||
result = palloc(MAXPGPATH);
|
||||
snprintf(result, MAXPGPATH, "%s/tsearch_data/%s.%s",
|
||||
sharepath, basename, extension);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
#define STOPBUFLEN 4096
|
||||
|
||||
void
|
||||
readstoplist(char *in, StopList * s)
|
||||
{
|
||||
char **stop = NULL;
|
||||
|
||||
s->len = 0;
|
||||
if (in && *in)
|
||||
{
|
||||
char *filename = get_tsearch_config_filename(in, "stop");
|
||||
FILE *hin;
|
||||
char buf[STOPBUFLEN];
|
||||
int reallen = 0;
|
||||
int line = 0;
|
||||
|
||||
if ((hin = AllocateFile(filename, "r")) == NULL)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_CONFIG_FILE_ERROR),
|
||||
errmsg("could not open stopword file \"%s\": %m",
|
||||
filename)));
|
||||
|
||||
while (fgets(buf, STOPBUFLEN, hin))
|
||||
{
|
||||
char *pbuf = buf;
|
||||
|
||||
line++;
|
||||
while (*pbuf && !isspace((unsigned char) *pbuf))
|
||||
pbuf++;
|
||||
*pbuf = '\0';
|
||||
|
||||
if (*buf == '\0')
|
||||
continue;
|
||||
|
||||
if (!pg_verifymbstr(buf, strlen(buf), true))
|
||||
{
|
||||
FreeFile(hin);
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
|
||||
errmsg("invalid multibyte encoding at line %d in file \"%s\"",
|
||||
line, filename)));
|
||||
}
|
||||
|
||||
if (s->len >= reallen)
|
||||
{
|
||||
if (reallen == 0)
|
||||
{
|
||||
reallen = 16;
|
||||
stop = (char **) palloc(sizeof(char *) * reallen);
|
||||
}
|
||||
else
|
||||
{
|
||||
reallen *= 2;
|
||||
stop = (char **) repalloc((void *) stop, sizeof(char *) * reallen);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (s->wordop)
|
||||
stop[s->len] = s->wordop(buf);
|
||||
else
|
||||
stop[s->len] = pstrdup(buf);
|
||||
|
||||
(s->len)++;
|
||||
}
|
||||
FreeFile(hin);
|
||||
pfree(filename);
|
||||
}
|
||||
|
||||
s->stop = stop;
|
||||
}
|
||||
|
||||
static int
|
||||
comparestr(const void *a, const void *b)
|
||||
{
|
||||
return strcmp(*(char **) a, *(char **) b);
|
||||
}
|
||||
|
||||
void
|
||||
sortstoplist(StopList * s)
|
||||
{
|
||||
if (s->stop && s->len > 0)
|
||||
qsort(s->stop, s->len, sizeof(char *), comparestr);
|
||||
}
|
||||
|
||||
bool
|
||||
searchstoplist(StopList * s, char *key)
|
||||
{
|
||||
return (s->stop && s->len > 0 &&
|
||||
bsearch(&key, s->stop, s->len,
|
||||
sizeof(char *), comparestr)) ? true : false;
|
||||
}
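
The stop-word lookup relies on sorting the array once and then probing it with bsearch, using a comparator that dereferences the char ** elements exactly as comparestr does above. A minimal standalone illustration (hypothetical word list, not part of the patch):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int
cmpstr(const void *a, const void *b)
{
	return strcmp(*(const char *const *) a, *(const char *const *) b);
}

int
main(void)
{
	/* hypothetical stop-word list; any array of strings works the same way */
	const char *words[] = {"the", "a", "of", "and", "in"};
	size_t		n = sizeof(words) / sizeof(words[0]);
	const char *key = "of";
	const char **found;

	qsort(words, n, sizeof(const char *), cmpstr);
	found = bsearch(&key, words, n, sizeof(const char *), cmpstr);
	printf("%s is %sa stop word\n", key, found ? "" : "not ");
	return 0;
}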
|
||||
|
||||
char *
|
||||
pnstrdup(const char *in, int len)
|
||||
{
|
||||
char *out = palloc(len + 1);
|
||||
|
||||
memcpy(out, in, len);
|
||||
out[len] = '\0';
|
||||
return out;
|
||||
}
|
360
src/backend/tsearch/wparser.c
Normal file
@@ -0,0 +1,360 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* wparser.c
|
||||
* Standard interface to word parser
|
||||
*
|
||||
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/tsearch/wparser.c,v 1.1 2007/08/21 01:11:18 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#include "postgres.h"
|
||||
|
||||
#include "funcapi.h"
|
||||
#include "access/genam.h"
|
||||
#include "access/heapam.h"
|
||||
#include "access/skey.h"
|
||||
#include "catalog/indexing.h"
|
||||
#include "catalog/namespace.h"
|
||||
#include "catalog/pg_ts_parser.h"
|
||||
#include "catalog/pg_type.h"
|
||||
#include "tsearch/ts_cache.h"
|
||||
#include "tsearch/ts_public.h"
|
||||
#include "tsearch/ts_utils.h"
|
||||
#include "utils/builtins.h"
|
||||
#include "utils/fmgroids.h"
|
||||
#include "utils/rel.h"
|
||||
#include "utils/syscache.h"
|
||||
|
||||
|
||||
/******sql-level interface******/
|
||||
|
||||
typedef struct
|
||||
{
|
||||
int cur;
|
||||
LexDescr *list;
|
||||
} TSTokenTypeStorage;
|
||||
|
||||
static void
|
||||
tt_setup_firstcall(FuncCallContext *funcctx, Oid prsid)
|
||||
{
|
||||
TupleDesc tupdesc;
|
||||
MemoryContext oldcontext;
|
||||
TSTokenTypeStorage *st;
|
||||
TSParserCacheEntry *prs = lookup_ts_parser_cache(prsid);
|
||||
|
||||
if (!OidIsValid(prs->lextypeOid))
|
||||
elog(ERROR, "method lextype isn't defined for text search parser %u",
|
||||
prsid);
|
||||
|
||||
oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
|
||||
|
||||
st = (TSTokenTypeStorage *) palloc(sizeof(TSTokenTypeStorage));
|
||||
st->cur = 0;
|
||||
/* OidFunctionCall0 is absent */
|
||||
st->list = (LexDescr *) DatumGetPointer(OidFunctionCall1(prs->lextypeOid,
|
||||
(Datum) 0));
|
||||
funcctx->user_fctx = (void *) st;
|
||||
|
||||
tupdesc = CreateTemplateTupleDesc(3, false);
|
||||
TupleDescInitEntry(tupdesc, (AttrNumber) 1, "tokid",
|
||||
INT4OID, -1, 0);
|
||||
TupleDescInitEntry(tupdesc, (AttrNumber) 2, "alias",
|
||||
TEXTOID, -1, 0);
|
||||
TupleDescInitEntry(tupdesc, (AttrNumber) 3, "description",
|
||||
TEXTOID, -1, 0);
|
||||
|
||||
funcctx->attinmeta = TupleDescGetAttInMetadata(tupdesc);
|
||||
MemoryContextSwitchTo(oldcontext);
|
||||
}
|
||||
|
||||
static Datum
|
||||
tt_process_call(FuncCallContext *funcctx)
|
||||
{
|
||||
TSTokenTypeStorage *st;
|
||||
|
||||
st = (TSTokenTypeStorage *) funcctx->user_fctx;
|
||||
if (st->list && st->list[st->cur].lexid)
|
||||
{
|
||||
Datum result;
|
||||
char *values[3];
|
||||
char txtid[16];
|
||||
HeapTuple tuple;
|
||||
|
||||
sprintf(txtid, "%d", st->list[st->cur].lexid);
|
||||
values[0] = txtid;
|
||||
values[1] = st->list[st->cur].alias;
|
||||
values[2] = st->list[st->cur].descr;
|
||||
|
||||
tuple = BuildTupleFromCStrings(funcctx->attinmeta, values);
|
||||
result = HeapTupleGetDatum(tuple);
|
||||
|
||||
pfree(values[1]);
|
||||
pfree(values[2]);
|
||||
st->cur++;
|
||||
return result;
|
||||
}
|
||||
if (st->list)
|
||||
pfree(st->list);
|
||||
pfree(st);
|
||||
return (Datum) 0;
|
||||
}
|
||||
|
||||
Datum
|
||||
ts_token_type_byid(PG_FUNCTION_ARGS)
|
||||
{
|
||||
FuncCallContext *funcctx;
|
||||
Datum result;
|
||||
|
||||
if (SRF_IS_FIRSTCALL())
|
||||
{
|
||||
funcctx = SRF_FIRSTCALL_INIT();
|
||||
tt_setup_firstcall(funcctx, PG_GETARG_OID(0));
|
||||
}
|
||||
|
||||
funcctx = SRF_PERCALL_SETUP();
|
||||
|
||||
if ((result = tt_process_call(funcctx)) != (Datum) 0)
|
||||
SRF_RETURN_NEXT(funcctx, result);
|
||||
SRF_RETURN_DONE(funcctx);
|
||||
}
|
||||
|
||||
Datum
|
||||
ts_token_type_byname(PG_FUNCTION_ARGS)
|
||||
{
|
||||
FuncCallContext *funcctx;
|
||||
Datum result;
|
||||
|
||||
if (SRF_IS_FIRSTCALL())
|
||||
{
|
||||
text *prsname = PG_GETARG_TEXT_P(0);
|
||||
Oid prsId;
|
||||
|
||||
funcctx = SRF_FIRSTCALL_INIT();
|
||||
prsId = TSParserGetPrsid(textToQualifiedNameList(prsname), false);
|
||||
tt_setup_firstcall(funcctx, prsId);
|
||||
}
|
||||
|
||||
funcctx = SRF_PERCALL_SETUP();
|
||||
|
||||
if ((result = tt_process_call(funcctx)) != (Datum) 0)
|
||||
SRF_RETURN_NEXT(funcctx, result);
|
||||
SRF_RETURN_DONE(funcctx);
|
||||
}
|
||||
|
||||
typedef struct
|
||||
{
|
||||
int type;
|
||||
char *lexeme;
|
||||
} LexemeEntry;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
int cur;
|
||||
int len;
|
||||
LexemeEntry *list;
|
||||
} PrsStorage;
|
||||
|
||||
|
||||
static void
|
||||
prs_setup_firstcall(FuncCallContext *funcctx, Oid prsid, text *txt)
|
||||
{
|
||||
TupleDesc tupdesc;
|
||||
MemoryContext oldcontext;
|
||||
PrsStorage *st;
|
||||
TSParserCacheEntry *prs = lookup_ts_parser_cache(prsid);
|
||||
char *lex = NULL;
|
||||
int llen = 0,
|
||||
type = 0;
|
||||
void *prsdata;
|
||||
|
||||
oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
|
||||
|
||||
st = (PrsStorage *) palloc(sizeof(PrsStorage));
|
||||
st->cur = 0;
|
||||
st->len = 16;
|
||||
st->list = (LexemeEntry *) palloc(sizeof(LexemeEntry) * st->len);
|
||||
|
||||
prsdata = (void *) DatumGetPointer(FunctionCall2(&prs->prsstart,
|
||||
PointerGetDatum(VARDATA(txt)),
|
||||
Int32GetDatum(VARSIZE(txt) - VARHDRSZ)));
|
||||
|
||||
while ((type = DatumGetInt32(FunctionCall3(&prs->prstoken,
|
||||
PointerGetDatum(prsdata),
|
||||
PointerGetDatum(&lex),
|
||||
PointerGetDatum(&llen)))) != 0)
|
||||
{
|
||||
if (st->cur >= st->len)
|
||||
{
|
||||
st->len = 2 * st->len;
|
||||
st->list = (LexemeEntry *) repalloc(st->list, sizeof(LexemeEntry) * st->len);
|
||||
}
|
||||
st->list[st->cur].lexeme = palloc(llen + 1);
|
||||
memcpy(st->list[st->cur].lexeme, lex, llen);
|
||||
st->list[st->cur].lexeme[llen] = '\0';
|
||||
st->list[st->cur].type = type;
|
||||
st->cur++;
|
||||
}
|
||||
|
||||
FunctionCall1(&prs->prsend, PointerGetDatum(prsdata));
|
||||
|
||||
st->len = st->cur;
|
||||
st->cur = 0;
|
||||
|
||||
funcctx->user_fctx = (void *) st;
|
||||
tupdesc = CreateTemplateTupleDesc(2, false);
|
||||
TupleDescInitEntry(tupdesc, (AttrNumber) 1, "tokid",
|
||||
INT4OID, -1, 0);
|
||||
TupleDescInitEntry(tupdesc, (AttrNumber) 2, "token",
|
||||
TEXTOID, -1, 0);
|
||||
|
||||
funcctx->attinmeta = TupleDescGetAttInMetadata(tupdesc);
|
||||
MemoryContextSwitchTo(oldcontext);
|
||||
}
|
||||
|
||||
static Datum
|
||||
prs_process_call(FuncCallContext *funcctx)
|
||||
{
|
||||
PrsStorage *st;
|
||||
|
||||
st = (PrsStorage *) funcctx->user_fctx;
|
||||
if (st->cur < st->len)
|
||||
{
|
||||
Datum result;
|
||||
char *values[2];
|
||||
char tid[16];
|
||||
HeapTuple tuple;
|
||||
|
||||
values[0] = tid;
|
||||
sprintf(tid, "%d", st->list[st->cur].type);
|
||||
values[1] = st->list[st->cur].lexeme;
|
||||
tuple = BuildTupleFromCStrings(funcctx->attinmeta, values);
|
||||
result = HeapTupleGetDatum(tuple);
|
||||
|
||||
pfree(values[1]);
|
||||
st->cur++;
|
||||
return result;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (st->list)
|
||||
pfree(st->list);
|
||||
pfree(st);
|
||||
}
|
||||
return (Datum) 0;
|
||||
}
|
||||
|
||||
Datum
|
||||
ts_parse_byid(PG_FUNCTION_ARGS)
|
||||
{
|
||||
FuncCallContext *funcctx;
|
||||
Datum result;
|
||||
|
||||
if (SRF_IS_FIRSTCALL())
|
||||
{
|
||||
text *txt = PG_GETARG_TEXT_P(1);
|
||||
|
||||
funcctx = SRF_FIRSTCALL_INIT();
|
||||
prs_setup_firstcall(funcctx, PG_GETARG_OID(0), txt);
|
||||
PG_FREE_IF_COPY(txt, 1);
|
||||
}
|
||||
|
||||
funcctx = SRF_PERCALL_SETUP();
|
||||
|
||||
if ((result = prs_process_call(funcctx)) != (Datum) 0)
|
||||
SRF_RETURN_NEXT(funcctx, result);
|
||||
SRF_RETURN_DONE(funcctx);
|
||||
}
|
||||
|
||||
Datum
|
||||
ts_parse_byname(PG_FUNCTION_ARGS)
|
||||
{
|
||||
FuncCallContext *funcctx;
|
||||
Datum result;
|
||||
|
||||
if (SRF_IS_FIRSTCALL())
|
||||
{
|
||||
text *prsname = PG_GETARG_TEXT_P(0);
|
||||
text *txt = PG_GETARG_TEXT_P(1);
|
||||
Oid prsId;
|
||||
|
||||
funcctx = SRF_FIRSTCALL_INIT();
|
||||
prsId = TSParserGetPrsid(textToQualifiedNameList(prsname), false);
|
||||
prs_setup_firstcall(funcctx, prsId, txt);
|
||||
}
|
||||
|
||||
funcctx = SRF_PERCALL_SETUP();
|
||||
|
||||
if ((result = prs_process_call(funcctx)) != (Datum) 0)
|
||||
SRF_RETURN_NEXT(funcctx, result);
|
||||
SRF_RETURN_DONE(funcctx);
|
||||
}
|
||||
|
||||
Datum
|
||||
ts_headline_byid_opt(PG_FUNCTION_ARGS)
|
||||
{
|
||||
text *in = PG_GETARG_TEXT_P(1);
|
||||
TSQuery query = PG_GETARG_TSQUERY(2);
|
||||
text *opt = (PG_NARGS() > 3 && PG_GETARG_POINTER(3)) ? PG_GETARG_TEXT_P(3) : NULL;
|
||||
HeadlineText prs;
|
||||
text *out;
|
||||
TSConfigCacheEntry *cfg;
|
||||
TSParserCacheEntry *prsobj;
|
||||
|
||||
cfg = lookup_ts_config_cache(PG_GETARG_OID(0));
|
||||
prsobj = lookup_ts_parser_cache(cfg->prsId);
|
||||
|
||||
memset(&prs, 0, sizeof(HeadlineText));
|
||||
prs.lenwords = 32;
|
||||
prs.words = (HeadlineWord *) palloc(sizeof(HeadlineWord) * prs.lenwords);
|
||||
|
||||
hlparsetext(cfg->cfgId, &prs, query, VARDATA(in), VARSIZE(in) - VARHDRSZ);
|
||||
|
||||
FunctionCall3(&(prsobj->prsheadline),
|
||||
PointerGetDatum(&prs),
|
||||
PointerGetDatum(opt),
|
||||
PointerGetDatum(query));
|
||||
|
||||
out = generatHeadline(&prs);
|
||||
|
||||
PG_FREE_IF_COPY(in, 1);
|
||||
PG_FREE_IF_COPY(query, 2);
|
||||
if (opt)
|
||||
PG_FREE_IF_COPY(opt, 3);
|
||||
pfree(prs.words);
|
||||
pfree(prs.startsel);
|
||||
pfree(prs.stopsel);
|
||||
|
||||
PG_RETURN_POINTER(out);
|
||||
}
|
||||
|
||||
Datum
|
||||
ts_headline_byid(PG_FUNCTION_ARGS)
|
||||
{
|
||||
PG_RETURN_DATUM(DirectFunctionCall3(ts_headline_byid_opt,
|
||||
PG_GETARG_DATUM(0),
|
||||
PG_GETARG_DATUM(1),
|
||||
PG_GETARG_DATUM(2)));
|
||||
}
|
||||
|
||||
Datum
|
||||
ts_headline(PG_FUNCTION_ARGS)
|
||||
{
|
||||
PG_RETURN_DATUM(DirectFunctionCall3(ts_headline_byid_opt,
|
||||
ObjectIdGetDatum(getTSCurrentConfig(true)),
|
||||
PG_GETARG_DATUM(0),
|
||||
PG_GETARG_DATUM(1)));
|
||||
}
|
||||
|
||||
Datum
|
||||
ts_headline_opt(PG_FUNCTION_ARGS)
|
||||
{
|
||||
PG_RETURN_DATUM(DirectFunctionCall4(ts_headline_byid_opt,
|
||||
ObjectIdGetDatum(getTSCurrentConfig(true)),
|
||||
PG_GETARG_DATUM(0),
|
||||
PG_GETARG_DATUM(1),
|
||||
PG_GETARG_DATUM(2)));
|
||||
}
|
1873
src/backend/tsearch/wparser_def.c
Normal file
File diff suppressed because it is too large