Add prefix support for synonym dictionary

2025-07-26 01:22:12 +03:00 · 2009-08-14 14:53:20 +00:00
parent 0c738084fb
commit abd8c94ff9
5 changed files with 111 additions and 8 deletions
--- a/doc/src/sgml/textsearch.sgml
+++ b/doc/src/sgml/textsearch.sgml
@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.52 2009/06/17 21:58:49 tgl Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.53 2009/08/14 14:53:20 teodor Exp $ -->
 <chapter id="textsearch">
 <title id="textsearch-title">Full Text Search</title>
@ -2288,6 +2288,63 @@ SELECT * FROM ts_debug('english', 'Paris');
 asciiword | Word, all ASCII | Paris | {my_synonym,english_stem} | my_synonym | {paris}
 </programlisting>
   </para>
   <para>
    An asterisk (<literal>*</literal>) at the end of a definition word
    indicates that the definition word is a prefix; the
    <function>to_tsquery()</function> function will transform such a
    definition into the prefix-search format (see
    <xref linkend="textsearch-parsing-queries">).
    Note that the asterisk is ignored by <function>to_tsvector()</function>.
   </para>
   <para>
    Contents of <filename>$SHAREDIR/tsearch_data/synonym_sample.syn</>:
   </para>
 <programlisting>
 postgres        pgsql
 postgresql      pgsql
 postgre pgsql
 gogle   googl
 indices index*
 </programlisting>
   <para>
    Results:
   </para>
 <programlisting>
 =# create text search dictionary syn( template=synonym,synonyms='synonym_sample');
 =# select ts_lexize('syn','indices');
 ts_lexize
 -----------
 {index}
 (1 row)
 =# create text search configuration tst ( copy=simple);
 =# alter text search configuration tst alter mapping for asciiword with syn;
 =# select to_tsquery('tst','indices');
 to_tsquery
 ------------
 'index':*
 (1 row)
 =# select 'indexes are very useful'::tsvector;
            tsvector             
 ---------------------------------
 'are' 'indexes' 'useful' 'very'
 (1 row)
 =# select 'indexes are very useful'::tsvector @@ to_tsquery('tst','indices');
 ?column?
 ----------
 t
 (1 row)
 =# select to_tsvector('tst','indices');
 to_tsvector
 -------------
 'index':1
 (1 row)
 </programlisting>
   <para>
    The only parameter required by the <literal>synonym</> template is
--- a/src/backend/tsearch/dict_synonym.c
+++ b/src/backend/tsearch/dict_synonym.c
@ -7,7 +7,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/tsearch/dict_synonym.c,v 1.10 2009/01/01 17:23:48 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/tsearch/dict_synonym.c,v 1.11 2009/08/14 14:53:20 teodor Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -23,6 +23,8 @@ typedef struct
 {
 	char	   *in;
 	char	   *out;
 	int			outlen;
 	uint16		flags;
 } Syn;
 typedef struct
@ -36,11 +38,14 @@ typedef struct
 * Finds the next whitespace-delimited word within the 'in' string.
 * Returns a pointer to the first character of the word, and a pointer
 * to the next byte after the last character in the word (in *end).
 * Character '*' at the end of the word will not be treated as a word
 * character if flags is not null.
 */
 static char *
-findwrd(char *in, char **end)
+findwrd(char *in, char **end, uint16 *flags)
 {
 	char	   *start;
 	char	   *lastchar;
 	/* Skip leading spaces */
 	while (*in && t_isspace(in))
@ -53,13 +58,27 @@ findwrd(char *in, char **end)
 		return NULL;
 	}
-	start = in;
+	lastchar = start = in;
 	/* Find end of word */
 	while (*in && !t_isspace(in))
 	{
 		lastchar = in;
 		in += pg_mblen(in);
 	}
 	if ( in - lastchar == 1 && t_iseq(lastchar, '*') && flags )
 	{
 		*flags = TSL_PREFIX;
 		*end = lastchar;
 	}
 	else
 	{
 		if (flags)
 				*flags = 0;
 		*end = in;
 	}
 	*end = in;
 	return start;
 }
@ -84,6 +103,7 @@ dsynonym_init(PG_FUNCTION_ARGS)
 			   *end = NULL;
 	int			cur = 0;
 	char	   *line = NULL;
 	uint16		flags = 0;
 	foreach(l, dictoptions)
 	{
@ -117,7 +137,7 @@ dsynonym_init(PG_FUNCTION_ARGS)
 	while ((line = tsearch_readline(&trst)) != NULL)
 	{
-		starti = findwrd(line, &end);
+		starti = findwrd(line, &end, NULL);
 		if (!starti)
 		{
 			/* Empty line */
@ -130,7 +150,7 @@ dsynonym_init(PG_FUNCTION_ARGS)
 		}
 		*end = '\0';
-		starto = findwrd(end + 1, &end);
+		starto = findwrd(end + 1, &end, &flags);
 		if (!starto)
 		{
 			/* A line with only one word (+whitespace). Ignore silently. */
@ -168,6 +188,9 @@ dsynonym_init(PG_FUNCTION_ARGS)
 			d->syn[cur].out = lowerstr(starto);
 		}
 		d->syn[cur].outlen = strlen(starto);
 		d->syn[cur].flags = flags; 
 		cur++;
 skipline:
@ -212,7 +235,8 @@ dsynonym_lexize(PG_FUNCTION_ARGS)
 		PG_RETURN_POINTER(NULL);
 	res = palloc0(sizeof(TSLexeme) * 2);
-	res[0].lexeme = pstrdup(found->out);
+	res[0].lexeme = pnstrdup(found->out, found->outlen);
 	res[0].flags = found->flags;
 	PG_RETURN_POINTER(res);
 }
--- a/src/backend/tsearch/synonym_sample.syn
+++ b/src/backend/tsearch/synonym_sample.syn
@ -2,3 +2,4 @@ postgres	pgsql
 postgresql	pgsql
 postgre	pgsql
 gogle	googl
 indices	index*
--- a/src/test/regress/expected/tsdicts.out
+++ b/src/test/regress/expected/tsdicts.out
@ -208,6 +208,12 @@ SELECT ts_lexize('synonym', 'Gogle');
 {googl}
 (1 row)
 SELECT ts_lexize('synonym', 'indices');
 ts_lexize 
 -----------
 {index}
 (1 row)
 -- Create and simple test thesaurus dictionary
 -- More tests in configuration checks because ts_lexize()
 -- cannot pass more than one word to thesaurus.
@ -290,6 +296,18 @@ SELECT to_tsvector('synonym_tst', 'Most common mistake is to write Gogle instead
 'common':2 'googl':7,10 'instead':8 'mistak':3 'write':6
 (1 row)
 SELECT to_tsvector('synonym_tst', 'Indexes or indices - Which is right plural form of index?');
                 to_tsvector                  
 ----------------------------------------------
 'form':8 'index':1,3,10 'plural':7 'right':6
 (1 row)
 SELECT to_tsquery('synonym_tst', 'Index & indices');
     to_tsquery      
 ---------------------
 'index' & 'index':*
 (1 row)
 -- test thesaurus in configuration
 -- see thesaurus_sample.ths to understand 'odd' resulting tsvector
 CREATE TEXT SEARCH CONFIGURATION thesaurus_tst (
--- a/src/test/regress/sql/tsdicts.sql
+++ b/src/test/regress/sql/tsdicts.sql
@ -56,6 +56,7 @@ CREATE TEXT SEARCH DICTIONARY synonym (
 SELECT ts_lexize('synonym', 'PoStGrEs');
 SELECT ts_lexize('synonym', 'Gogle');
 SELECT ts_lexize('synonym', 'indices');
 -- Create and simple test thesaurus dictionary
 -- More tests in configuration checks because ts_lexize()
@ -104,6 +105,8 @@ ALTER TEXT SEARCH CONFIGURATION synonym_tst ALTER MAPPING FOR
 SELECT to_tsvector('synonym_tst', 'Postgresql is often called as postgres or pgsql and pronounced as postgre');
 SELECT to_tsvector('synonym_tst', 'Most common mistake is to write Gogle instead of Google');
 SELECT to_tsvector('synonym_tst', 'Indexes or indices - Which is right plural form of index?');
 SELECT to_tsquery('synonym_tst', 'Index & indices');
 -- test thesaurus in configuration
 -- see thesaurus_sample.ths to understand 'odd' resulting tsvector