1
0
mirror of https://github.com/postgres/postgres.git synced 2025-07-28 23:42:10 +03:00

Add sample text search dictionary templates and parsers, to replace the

hard-to-maintain textual examples currently in the SGML docs.  From
Sergey Karpov.
This commit is contained in:
Tom Lane
2007-10-15 21:36:50 +00:00
parent fb631dba2a
commit 5fcb079858
24 changed files with 1324 additions and 9 deletions

View File

@ -0,0 +1,38 @@
# $PostgreSQL: pgsql/contrib/dict_xsyn/Makefile,v 1.1 2007/10/15 21:36:50 tgl Exp $

# Shared library built from a single object file.
MODULE_big = dict_xsyn
OBJS = dict_xsyn.o

# Install script, uninstall script, documentation and regression test.
DATA_built = dict_xsyn.sql
DATA = uninstall_dict_xsyn.sql
DOCS = README.dict_xsyn
REGRESS = dict_xsyn

# Sample rules file(s) and the directory below $(datadir) they are copied to.
DICTDIR = tsearch_data
DICTFILES = xsyn_sample.rules

ifdef USE_PGXS
PG_CONFIG = pg_config
PGXS := $(shell $(PG_CONFIG) --pgxs)
include $(PGXS)
else
subdir = contrib/dict_xsyn
top_builddir = ../..
include $(top_builddir)/src/Makefile.global
include $(top_srcdir)/contrib/contrib-global.mk
endif

# Copy each rules file into $(datadir)/$(DICTDIR) as part of "install".
install: install-data

.PHONY: install-data
install-data: $(DICTFILES)
	for i in $(DICTFILES); \
	  do $(INSTALL_DATA) $(srcdir)/$$i '$(DESTDIR)$(datadir)/$(DICTDIR)/'$$i; \
	done

# Remove the installed rules files again.  They are regular files, so plain
# "rm -f" is sufficient; "-r" would recurse if a name ever matched a directory.
uninstall: uninstall-data

.PHONY: uninstall-data
uninstall-data:
	for i in $(DICTFILES); \
	  do rm -f '$(DESTDIR)$(datadir)/$(DICTDIR)/'$$i ; \
	done

View File

@ -0,0 +1,52 @@
Extended Synonym dictionary
===========================
This is a simple synonym dictionary. It replaces words with groups of their
synonyms, and so makes it possible to search for a word using any of its
synonyms.
* Configuration
It accepts the following options:
- KEEPORIG controls whether the original word is included, or only its
synonyms. Default is 'true'.
- RULES is the base name of the file containing the list of synonyms.
This file must be in $(prefix)/share/tsearch_data/, and its name must
end in ".rules" (which is not included in the RULES parameter).
The rules file has the following format:
- Each line represents a group of synonyms for a single word, which is
given first on the line. Synonyms are separated by whitespace:
word syn1 syn2 syn3
- The sharp ('#') sign is a comment delimiter. It may appear at the start of
any word on the line; the rest of the line is skipped.
Look at xsyn_sample.rules, which is installed in $(prefix)/share/tsearch_data/,
for an example.
* Usage
1. Compile and install
2. Load dictionary
psql mydb < dict_xsyn.sql
3. Test it
mydb=# SELECT ts_lexize('xsyn','word');
ts_lexize
----------------
{word,syn1,syn2,syn3}
4. Change the dictionary options as you wish
mydb=# ALTER TEXT SEARCH DICTIONARY xsyn (KEEPORIG=false);
ALTER TEXT SEARCH DICTIONARY
That's all.

View File

@ -0,0 +1,235 @@
/*-------------------------------------------------------------------------
*
* dict_xsyn.c
* Extended synonym dictionary
*
* Copyright (c) 2007, PostgreSQL Global Development Group
*
* IDENTIFICATION
* $PostgreSQL: pgsql/contrib/dict_xsyn/dict_xsyn.c,v 1.1 2007/10/15 21:36:50 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <ctype.h>
#include "commands/defrem.h"
#include "fmgr.h"
#include "storage/fd.h"
#include "tsearch/ts_locale.h"
#include "tsearch/ts_utils.h"
PG_MODULE_MAGIC;
/*
 * One rule loaded from the rules file: a word and the line it came from.
 * read_dictionary creates these; dxsyn_lexize looks them up by key.
 */
typedef struct
{
/* First word of the rules line; the array of Syn is sorted and searched on it */
char *key; /* Word */
/* The lower-cased rules line as read, split into words only at lookup time */
char *value; /* Unparsed list of synonyms, including the word itself */
} Syn;
/*
 * Per-dictionary state: built by dxsyn_init, consulted by dxsyn_lexize.
 */
typedef struct
{
/*
 * Number of entries in syn once loading has finished.  While read_dictionary
 * is filling the array this field temporarily holds the allocated size.
 */
int len;
/* Entries sorted by key; NULL until a rules file has been read */
Syn *syn;
/* When true, the looked-up word itself is included in the result */
bool keeporig;
} DictSyn;
/*
 * Entry points exported by this module; both are defined further down in
 * this file.
 */
PG_FUNCTION_INFO_V1(dxsyn_init);
Datum dxsyn_init(PG_FUNCTION_ARGS);
PG_FUNCTION_INFO_V1(dxsyn_lexize);
Datum dxsyn_lexize(PG_FUNCTION_ARGS);
/*
 * find_word
 *		Locate the next whitespace-delimited word in a NUL-terminated string.
 *
 * in	 where to start scanning
 * end	 output: set to the first character past the word, or to NULL when
 *		 no word is found
 *
 * Returns a pointer to the first character of the word (inside the caller's
 * buffer, nothing is copied), or NULL if only whitespace remains or the next
 * word starts with '#', which begins a comment.
 */
static char *
find_word(char *in, char **end)
{
	char	   *cursor = in;
	char	   *word;

	*end = NULL;

	/* Step over leading whitespace, advancing by pg_mblen() per character */
	for (; *cursor != '\0' && t_isspace(cursor); cursor += pg_mblen(cursor))
		;

	/* End of string or start of a comment: no further word */
	if (*cursor == '\0' || *cursor == '#')
		return NULL;

	/* The word runs up to the next whitespace character or the terminator */
	word = cursor;
	for (; *cursor != '\0' && !t_isspace(cursor); cursor += pg_mblen(cursor))
		;

	*end = cursor;
	return word;
}
static int
compare_syn(const void *a, const void *b)
{
return strcmp(((Syn *) a)->key, ((Syn *) b)->key);
}
/*
 * read_dictionary
 *		Load a synonym rules file into dictionary d.
 *
 * d		 dictionary to fill
 * filename	 base name passed to get_tsearch_config_filename() together with
 *			 the "rules" extension
 *
 * Every line containing at least one word becomes one Syn entry: key is a
 * copy of the first word, value is the whole lower-cased line.  On return
 * d->syn is sorted by key, so it can be searched with bsearch(), and d->len
 * is the number of entries.
 *
 * Reports an ERROR if the file cannot be opened.
 */
static void
read_dictionary(DictSyn *d, char *filename)
{
	char	   *real_filename = get_tsearch_config_filename(filename, "rules");
	FILE	   *fin;
	char	   *line;
	int			cur = 0;

	if ((fin = AllocateFile(real_filename, "r")) == NULL)
		ereport(ERROR,
				(errcode(ERRCODE_CONFIG_FILE_ERROR),
				 errmsg("could not open synonym file \"%s\": %m",
						real_filename)));

	while ((line = t_readline(fin)) != NULL)
	{
		char	   *value;
		char	   *key;
		char	   *end = NULL;

		if (*line == '\0')
		{
			/* Empty line: release the buffer instead of leaking it */
			pfree(line);
			continue;
		}

		value = lowerstr(line);
		pfree(line);

		key = find_word(value, &end);
		if (!key)
		{
			/* Blank or comment-only line */
			pfree(value);
			continue;
		}

		/* While loading, d->len is the allocated size of d->syn */
		if (cur == d->len)
		{
			d->len = (d->len > 0) ? 2 * d->len : 16;
			if (d->syn)
				d->syn = (Syn *) repalloc(d->syn, sizeof(Syn) * d->len);
			else
				d->syn = (Syn *) palloc(sizeof(Syn) * d->len);
		}

		/* The entry takes ownership of value; key is a separate copy */
		d->syn[cur].key = pnstrdup(key, end - key);
		d->syn[cur].value = value;

		cur++;
	}

	FreeFile(fin);

	/* From here on d->len is the number of entries actually loaded */
	d->len = cur;
	if (cur > 1)
		qsort(d->syn, d->len, sizeof(Syn), compare_syn);

	pfree(real_filename);
}
/*
 * dxsyn_init
 *		Build the dictionary state from the list of dictionary options.
 *
 * Argument 0 is a List of DefElem options.  Recognized names, matched
 * case-insensitively:
 *	  KEEPORIG	boolean, stored in keeporig (defaults to true)
 *	  RULES		name of the rules file, loaded through read_dictionary
 * Any other option name is reported as an ERROR.
 *
 * Returns a pointer to the newly allocated DictSyn.
 */
Datum
dxsyn_init(PG_FUNCTION_ARGS)
{
	List	   *dictoptions = (List *) PG_GETARG_POINTER(0);
	DictSyn    *dict = (DictSyn *) palloc0(sizeof(DictSyn));
	ListCell   *cell;

	/* Defaults: no rules loaded, original word is kept */
	dict->syn = NULL;
	dict->len = 0;
	dict->keeporig = true;

	foreach(cell, dictoptions)
	{
		DefElem    *opt = (DefElem *) lfirst(cell);

		if (pg_strcasecmp(opt->defname, "KEEPORIG") == 0)
		{
			dict->keeporig = defGetBoolean(opt);
			continue;
		}

		if (pg_strcasecmp(opt->defname, "RULES") == 0)
		{
			read_dictionary(dict, defGetString(opt));
			continue;
		}

		/* Anything else is not an option this template knows about */
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("unrecognized xsyn parameter: \"%s\"",
						opt->defname)));
	}

	PG_RETURN_POINTER(dict);
}
Datum
dxsyn_lexize(PG_FUNCTION_ARGS)
{
DictSyn *d = (DictSyn *) PG_GETARG_POINTER(0);
char *in = (char *) PG_GETARG_POINTER(1);
int length = PG_GETARG_INT32(2);
Syn word;
Syn *found;
TSLexeme *res = NULL;
if (!length || d->len == 0)
PG_RETURN_POINTER(NULL);
/* Create search pattern */
{
char *temp = pnstrdup(in, length);
word.key = lowerstr(temp);
pfree(temp);
word.value = NULL;
}
/* Look for matching syn */
found = (Syn *)bsearch(&word, d->syn, d->len, sizeof(Syn), compare_syn);
pfree(word.key);
if (!found)
PG_RETURN_POINTER(NULL);
/* Parse string of synonyms and return array of words */
{
char *value = pstrdup(found->value);
int value_length = strlen(value);
char *pos = value;
int nsyns = 0;
bool is_first = true;
res = palloc(0);
while(pos < value + value_length)
{
char *end;
char *syn = find_word(pos, &end);
if (!syn)
break;
*end = '\0';
res = repalloc(res, sizeof(TSLexeme)*(nsyns + 2));
res[nsyns].lexeme = NULL;
/* first word is added to result only if KEEPORIG flag is set */
if(d->keeporig || !is_first)
{
res[nsyns].lexeme = pstrdup(syn);
res[nsyns + 1].lexeme = NULL;
nsyns++;
}
is_first = false;
pos = end + 1;
}
pfree(value);
}
PG_RETURN_POINTER(res);
}

View File

@ -0,0 +1,29 @@
-- $PostgreSQL: pgsql/contrib/dict_xsyn/dict_xsyn.sql.in,v 1.1 2007/10/15 21:36:50 tgl Exp $
-- Installation script for the xsyn (eXtended synonym) dictionary.  It creates
-- the two C-language support functions, a text search template built on them,
-- and a dictionary named xsyn that uses the template.
-- Adjust this setting to control where the objects get created.
SET search_path = public;
-- All objects are created inside one transaction block (BEGIN ... END).
BEGIN;
-- INIT function of the template.
-- NOTE(review): MODULE_PATHNAME is presumably replaced with the library path
-- when dict_xsyn.sql is generated from this .sql.in file -- confirm.
CREATE FUNCTION dxsyn_init(internal)
RETURNS internal
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;
-- LEXIZE function of the template.
CREATE FUNCTION dxsyn_lexize(internal, internal, internal, internal)
RETURNS internal
AS 'MODULE_PATHNAME'
LANGUAGE C STRICT;
CREATE TEXT SEARCH TEMPLATE xsyn_template (
LEXIZE = dxsyn_lexize,
INIT = dxsyn_init
);
-- The dictionary is created without options, so no rules file is named here;
-- set RULES (and optionally KEEPORIG) with ALTER TEXT SEARCH DICTIONARY.
CREATE TEXT SEARCH DICTIONARY xsyn (
TEMPLATE = xsyn_template
);
COMMENT ON TEXT SEARCH DICTIONARY xsyn IS 'eXtended synonym dictionary';
END;

View File

@ -0,0 +1,22 @@
--
-- first, define the datatype. Turn off echoing so that expected file
-- does not depend on contents of this file.
--
SET client_min_messages = warning;
\set ECHO none
RESET client_min_messages;
--configuration
ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false);
--lexize
SELECT ts_lexize('xsyn', 'supernova');
ts_lexize
----------------
{sn,sne,1987a}
(1 row)
SELECT ts_lexize('xsyn', 'grb');
ts_lexize
-----------
(1 row)

View File

@ -0,0 +1,16 @@
--
-- first, define the datatype. Turn off echoing so that expected file
-- does not depend on contents of this file.
--
SET client_min_messages = warning;
\set ECHO none
-- NOTE(review): lines between "\set ECHO none" and "\set ECHO all" do not
-- show up in the expected output, so notes can be added here without touching
-- it.  Every other line of this script, including the header comment above, is
-- echoed and must stay in sync with the expected file.  The script loaded
-- below creates the xsyn template and dictionary.
\i dict_xsyn.sql
\set ECHO all
RESET client_min_messages;
--configuration
ALTER TEXT SEARCH DICTIONARY xsyn (RULES='xsyn_sample', KEEPORIG=false);
--lexize
SELECT ts_lexize('xsyn', 'supernova');
SELECT ts_lexize('xsyn', 'grb');

View File

@ -0,0 +1,9 @@
-- Removes the objects created by the xsyn installation script, in the reverse
-- of the order they were created: dictionary, then template, then functions.
SET search_path = public;
DROP TEXT SEARCH DICTIONARY xsyn;
DROP TEXT SEARCH TEMPLATE xsyn_template;
DROP FUNCTION dxsyn_init(internal);
DROP FUNCTION dxsyn_lexize(internal,internal,internal,internal);

View File

@ -0,0 +1,6 @@
# Sample rules file for eXtended Synonym (xsyn) dictionary
# format is as follows:
#
# word synonym1 synonym2 ...
#
supernova sn sne 1987a