mirror of
https://github.com/postgres/postgres.git
synced 2025-07-26 01:22:12 +03:00
Add prefix support for synonym dictionary
This commit is contained in:
@ -1,4 +1,4 @@
|
|||||||
<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.52 2009/06/17 21:58:49 tgl Exp $ -->
|
<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.53 2009/08/14 14:53:20 teodor Exp $ -->
|
||||||
|
|
||||||
<chapter id="textsearch">
|
<chapter id="textsearch">
|
||||||
<title id="textsearch-title">Full Text Search</title>
|
<title id="textsearch-title">Full Text Search</title>
|
||||||
@ -2288,6 +2288,63 @@ SELECT * FROM ts_debug('english', 'Paris');
|
|||||||
asciiword | Word, all ASCII | Paris | {my_synonym,english_stem} | my_synonym | {paris}
|
asciiword | Word, all ASCII | Paris | {my_synonym,english_stem} | my_synonym | {paris}
|
||||||
</programlisting>
|
</programlisting>
|
||||||
</para>
|
</para>
|
||||||
|
|
||||||
|
<para>
|
||||||
|
An asterisk (<literal>*</literal>) at the end of a definition word indicates
|
||||||
|
that the definition word is a prefix, and the <function>to_tsquery()</function>
|
||||||
|
function will transform that definition to the prefix search format (see
|
||||||
|
<xref linkend="textsearch-parsing-queries">).
|
||||||
|
Note that the asterisk is ignored in <function>to_tsvector()</function>.
|
||||||
|
</para>
|
||||||
|
|
||||||
|
<para>
|
||||||
|
Contents of <filename>$SHAREDIR/tsearch_data/synonym_sample.syn</>:
|
||||||
|
</para>
|
||||||
|
<programlisting>
|
||||||
|
postgres pgsql
|
||||||
|
postgresql pgsql
|
||||||
|
postgre pgsql
|
||||||
|
gogle googl
|
||||||
|
indices index*
|
||||||
|
</programlisting>
|
||||||
|
|
||||||
|
<para>
|
||||||
|
Results:
|
||||||
|
</para>
|
||||||
|
<programlisting>
|
||||||
|
=# create text search dictionary syn( template=synonym,synonyms='synonym_sample');
|
||||||
|
=# select ts_lexize('syn','indices');
|
||||||
|
ts_lexize
|
||||||
|
-----------
|
||||||
|
{index}
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
=# create text search configuration tst ( copy=simple);
|
||||||
|
=# alter text search configuration tst alter mapping for asciiword with syn;
|
||||||
|
=# select to_tsquery('tst','indices');
|
||||||
|
to_tsquery
|
||||||
|
------------
|
||||||
|
'index':*
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
=# select 'indexes are very useful'::tsvector;
|
||||||
|
tsvector
|
||||||
|
---------------------------------
|
||||||
|
'are' 'indexes' 'useful' 'very'
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
=# select 'indexes are very useful'::tsvector @@ to_tsquery('tst','indices');
|
||||||
|
?column?
|
||||||
|
----------
|
||||||
|
t
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
=# select to_tsvector('tst','indices');
|
||||||
|
to_tsvector
|
||||||
|
-------------
|
||||||
|
'index':1
|
||||||
|
(1 row)
|
||||||
|
</programlisting>
|
||||||
|
|
||||||
<para>
|
<para>
|
||||||
The only parameter required by the <literal>synonym</> template is
|
The only parameter required by the <literal>synonym</> template is
|
||||||
|
@ -7,7 +7,7 @@
|
|||||||
*
|
*
|
||||||
*
|
*
|
||||||
* IDENTIFICATION
|
* IDENTIFICATION
|
||||||
* $PostgreSQL: pgsql/src/backend/tsearch/dict_synonym.c,v 1.10 2009/01/01 17:23:48 momjian Exp $
|
* $PostgreSQL: pgsql/src/backend/tsearch/dict_synonym.c,v 1.11 2009/08/14 14:53:20 teodor Exp $
|
||||||
*
|
*
|
||||||
*-------------------------------------------------------------------------
|
*-------------------------------------------------------------------------
|
||||||
*/
|
*/
|
||||||
@ -23,6 +23,8 @@ typedef struct
|
|||||||
{
|
{
|
||||||
char *in;
|
char *in;
|
||||||
char *out;
|
char *out;
|
||||||
|
int outlen;
|
||||||
|
uint16 flags;
|
||||||
} Syn;
|
} Syn;
|
||||||
|
|
||||||
typedef struct
|
typedef struct
|
||||||
@ -36,11 +38,14 @@ typedef struct
|
|||||||
* Finds the next whitespace-delimited word within the 'in' string.
|
* Finds the next whitespace-delimited word within the 'in' string.
|
||||||
* Returns a pointer to the first character of the word, and a pointer
|
* Returns a pointer to the first character of the word, and a pointer
|
||||||
* to the next byte after the last character in the word (in *end).
|
* to the next byte after the last character in the word (in *end).
|
||||||
|
* Character '*' at the end of word will not be treated as word
|
||||||
|
* character if flags is not null.
|
||||||
*/
|
*/
|
||||||
static char *
|
static char *
|
||||||
findwrd(char *in, char **end)
|
findwrd(char *in, char **end, uint16 *flags)
|
||||||
{
|
{
|
||||||
char *start;
|
char *start;
|
||||||
|
char *lastchar;
|
||||||
|
|
||||||
/* Skip leading spaces */
|
/* Skip leading spaces */
|
||||||
while (*in && t_isspace(in))
|
while (*in && t_isspace(in))
|
||||||
@ -53,13 +58,27 @@ findwrd(char *in, char **end)
|
|||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
start = in;
|
lastchar = start = in;
|
||||||
|
|
||||||
/* Find end of word */
|
/* Find end of word */
|
||||||
while (*in && !t_isspace(in))
|
while (*in && !t_isspace(in))
|
||||||
|
{
|
||||||
|
lastchar = in;
|
||||||
in += pg_mblen(in);
|
in += pg_mblen(in);
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( in - lastchar == 1 && t_iseq(lastchar, '*') && flags )
|
||||||
|
{
|
||||||
|
*flags = TSL_PREFIX;
|
||||||
|
*end = lastchar;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (flags)
|
||||||
|
*flags = 0;
|
||||||
|
*end = in;
|
||||||
|
}
|
||||||
|
|
||||||
*end = in;
|
|
||||||
return start;
|
return start;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -84,6 +103,7 @@ dsynonym_init(PG_FUNCTION_ARGS)
|
|||||||
*end = NULL;
|
*end = NULL;
|
||||||
int cur = 0;
|
int cur = 0;
|
||||||
char *line = NULL;
|
char *line = NULL;
|
||||||
|
uint16 flags = 0;
|
||||||
|
|
||||||
foreach(l, dictoptions)
|
foreach(l, dictoptions)
|
||||||
{
|
{
|
||||||
@ -117,7 +137,7 @@ dsynonym_init(PG_FUNCTION_ARGS)
|
|||||||
|
|
||||||
while ((line = tsearch_readline(&trst)) != NULL)
|
while ((line = tsearch_readline(&trst)) != NULL)
|
||||||
{
|
{
|
||||||
starti = findwrd(line, &end);
|
starti = findwrd(line, &end, NULL);
|
||||||
if (!starti)
|
if (!starti)
|
||||||
{
|
{
|
||||||
/* Empty line */
|
/* Empty line */
|
||||||
@ -130,7 +150,7 @@ dsynonym_init(PG_FUNCTION_ARGS)
|
|||||||
}
|
}
|
||||||
*end = '\0';
|
*end = '\0';
|
||||||
|
|
||||||
starto = findwrd(end + 1, &end);
|
starto = findwrd(end + 1, &end, &flags);
|
||||||
if (!starto)
|
if (!starto)
|
||||||
{
|
{
|
||||||
/* A line with only one word (+whitespace). Ignore silently. */
|
/* A line with only one word (+whitespace). Ignore silently. */
|
||||||
@ -168,6 +188,9 @@ dsynonym_init(PG_FUNCTION_ARGS)
|
|||||||
d->syn[cur].out = lowerstr(starto);
|
d->syn[cur].out = lowerstr(starto);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
d->syn[cur].outlen = strlen(starto);
|
||||||
|
d->syn[cur].flags = flags;
|
||||||
|
|
||||||
cur++;
|
cur++;
|
||||||
|
|
||||||
skipline:
|
skipline:
|
||||||
@ -212,7 +235,8 @@ dsynonym_lexize(PG_FUNCTION_ARGS)
|
|||||||
PG_RETURN_POINTER(NULL);
|
PG_RETURN_POINTER(NULL);
|
||||||
|
|
||||||
res = palloc0(sizeof(TSLexeme) * 2);
|
res = palloc0(sizeof(TSLexeme) * 2);
|
||||||
res[0].lexeme = pstrdup(found->out);
|
res[0].lexeme = pnstrdup(found->out, found->outlen);
|
||||||
|
res[0].flags = found->flags;
|
||||||
|
|
||||||
PG_RETURN_POINTER(res);
|
PG_RETURN_POINTER(res);
|
||||||
}
|
}
|
||||||
|
@ -2,3 +2,4 @@ postgres pgsql
|
|||||||
postgresql pgsql
|
postgresql pgsql
|
||||||
postgre pgsql
|
postgre pgsql
|
||||||
gogle googl
|
gogle googl
|
||||||
|
indices index*
|
||||||
|
@ -208,6 +208,12 @@ SELECT ts_lexize('synonym', 'Gogle');
|
|||||||
{googl}
|
{googl}
|
||||||
(1 row)
|
(1 row)
|
||||||
|
|
||||||
|
SELECT ts_lexize('synonym', 'indices');
|
||||||
|
ts_lexize
|
||||||
|
-----------
|
||||||
|
{index}
|
||||||
|
(1 row)
|
||||||
|
|
||||||
-- Create and simple test thesaurus dictionary
|
-- Create and simple test thesaurus dictionary
|
||||||
-- More tests in configuration checks because ts_lexize()
|
-- More tests in configuration checks because ts_lexize()
|
||||||
-- cannot pass more than one word to thesaurus.
|
-- cannot pass more than one word to thesaurus.
|
||||||
@ -290,6 +296,18 @@ SELECT to_tsvector('synonym_tst', 'Most common mistake is to write Gogle instead
|
|||||||
'common':2 'googl':7,10 'instead':8 'mistak':3 'write':6
|
'common':2 'googl':7,10 'instead':8 'mistak':3 'write':6
|
||||||
(1 row)
|
(1 row)
|
||||||
|
|
||||||
|
SELECT to_tsvector('synonym_tst', 'Indexes or indices - Which is right plural form of index?');
|
||||||
|
to_tsvector
|
||||||
|
----------------------------------------------
|
||||||
|
'form':8 'index':1,3,10 'plural':7 'right':6
|
||||||
|
(1 row)
|
||||||
|
|
||||||
|
SELECT to_tsquery('synonym_tst', 'Index & indices');
|
||||||
|
to_tsquery
|
||||||
|
---------------------
|
||||||
|
'index' & 'index':*
|
||||||
|
(1 row)
|
||||||
|
|
||||||
-- test thesaurus in configuration
|
-- test thesaurus in configuration
|
||||||
-- see thesaurus_sample.ths to understand 'odd' resulting tsvector
|
-- see thesaurus_sample.ths to understand 'odd' resulting tsvector
|
||||||
CREATE TEXT SEARCH CONFIGURATION thesaurus_tst (
|
CREATE TEXT SEARCH CONFIGURATION thesaurus_tst (
|
||||||
|
@ -56,6 +56,7 @@ CREATE TEXT SEARCH DICTIONARY synonym (
|
|||||||
|
|
||||||
SELECT ts_lexize('synonym', 'PoStGrEs');
|
SELECT ts_lexize('synonym', 'PoStGrEs');
|
||||||
SELECT ts_lexize('synonym', 'Gogle');
|
SELECT ts_lexize('synonym', 'Gogle');
|
||||||
|
SELECT ts_lexize('synonym', 'indices');
|
||||||
|
|
||||||
-- Create and simple test thesaurus dictionary
|
-- Create and simple test thesaurus dictionary
|
||||||
-- More tests in configuration checks because ts_lexize()
|
-- More tests in configuration checks because ts_lexize()
|
||||||
@ -104,6 +105,8 @@ ALTER TEXT SEARCH CONFIGURATION synonym_tst ALTER MAPPING FOR
|
|||||||
|
|
||||||
SELECT to_tsvector('synonym_tst', 'Postgresql is often called as postgres or pgsql and pronounced as postgre');
|
SELECT to_tsvector('synonym_tst', 'Postgresql is often called as postgres or pgsql and pronounced as postgre');
|
||||||
SELECT to_tsvector('synonym_tst', 'Most common mistake is to write Gogle instead of Google');
|
SELECT to_tsvector('synonym_tst', 'Most common mistake is to write Gogle instead of Google');
|
||||||
|
SELECT to_tsvector('synonym_tst', 'Indexes or indices - Which is right plural form of index?');
|
||||||
|
SELECT to_tsquery('synonym_tst', 'Index & indices');
|
||||||
|
|
||||||
-- test thesaurus in configuration
|
-- test thesaurus in configuration
|
||||||
-- see thesaurus_sample.ths to understand 'odd' resulting tsvector
|
-- see thesaurus_sample.ths to understand 'odd' resulting tsvector
|
||||||
|
Reference in New Issue
Block a user