mirror of
https://github.com/postgres/postgres.git
synced 2025-07-30 11:03:19 +03:00
Improve support for agglutinative languages (queries with compound words).
regression=# select to_tsquery( '\'fotballklubber\''); to_tsquery ------------------------------------------------ 'fotball' & 'klubb' | 'fot' & 'ball' & 'klubb' (1 row) So, the interface to dictionaries has changed: the lexize method of a dictionary should return a pointer to an array of TSLexeme structs instead of char**. The last element should have TSLexeme->lexeme == NULL. typedef struct { /* number of the variant of the split word, for example: the word 'fotballklubber' (Norwegian) has two variants of splitting: ( fotball, klubb ) and ( fot, ball, klubb ). So, the dictionary should return: nvariant lexeme 1 fotball 1 klubb 2 fot 2 ball 2 klubb */ uint16 nvariant; /* currently unused */ uint16 flags; /* C-string */ char *lexeme; } TSLexeme;
This commit is contained in:
@ -1119,17 +1119,32 @@ SplitToVariants(IspellDict * Conf, SPNode * snode, SplitVar * orig, char *word,
|
||||
return var;
|
||||
}
|
||||
|
||||
char **
|
||||
TSLexeme *
|
||||
NINormalizeWord(IspellDict * Conf, char *word)
|
||||
{
|
||||
char **res = NormalizeSubWord(Conf, word, 0);
|
||||
TSLexeme *lcur=NULL, *lres=NULL;
|
||||
u_int16_t NVariant=1;
|
||||
|
||||
if (res) {
|
||||
char **ptr = res;
|
||||
lcur = lres = (TSLexeme*)palloc( MAX_NORM * sizeof(TSLexeme) );
|
||||
while(*ptr) {
|
||||
lcur->lexeme=*ptr;
|
||||
lcur->flags=0;
|
||||
lcur->nvariant = NVariant++;
|
||||
lcur++;
|
||||
ptr++;
|
||||
}
|
||||
lcur->lexeme=NULL;
|
||||
pfree(res);
|
||||
}
|
||||
|
||||
if (Conf->compoundcontrol != '\t')
|
||||
{
|
||||
int wordlen = strlen(word);
|
||||
SplitVar *ptr,
|
||||
*var = SplitToVariants(Conf, NULL, NULL, word, wordlen, 0, -1);
|
||||
char **cur = res;
|
||||
int i;
|
||||
|
||||
while (var)
|
||||
@ -1140,30 +1155,31 @@ NINormalizeWord(IspellDict * Conf, char *word)
|
||||
|
||||
if (subres)
|
||||
{
|
||||
char **ptr = subres;
|
||||
char **subptr = subres;
|
||||
|
||||
if (cur)
|
||||
{
|
||||
while (*cur)
|
||||
cur++;
|
||||
}
|
||||
else
|
||||
res = cur = (char **) palloc(MAX_NORM * sizeof(char *));
|
||||
if ( !lcur )
|
||||
lcur = lres = (TSLexeme*)palloc( MAX_NORM * sizeof(TSLexeme) );
|
||||
|
||||
while(*subptr) {
|
||||
for(i=0;i<var->nstem-1;i++) {
|
||||
lcur->lexeme=(subptr==subres) ? var->stem[ i ] : pstrdup(var->stem[ i ]);
|
||||
lcur->flags=0;
|
||||
lcur->nvariant = NVariant;
|
||||
lcur++;
|
||||
}
|
||||
|
||||
for (i = 0; i < var->nstem - 1; i++)
|
||||
{
|
||||
*cur = var->stem[i];
|
||||
cur++;
|
||||
}
|
||||
while (*ptr)
|
||||
{
|
||||
*cur = *ptr;
|
||||
cur++;
|
||||
ptr++;
|
||||
}
|
||||
*cur = NULL;
|
||||
lcur->lexeme=*subptr;
|
||||
lcur->flags=0;
|
||||
lcur->nvariant = NVariant;
|
||||
lcur++;
|
||||
subptr++;
|
||||
NVariant++;
|
||||
}
|
||||
|
||||
lcur->lexeme=NULL;
|
||||
pfree(subres);
|
||||
var->stem[0] = NULL;
|
||||
pfree( var->stem[ var->nstem-1 ] );
|
||||
}
|
||||
}
|
||||
|
||||
@ -1175,7 +1191,7 @@ NINormalizeWord(IspellDict * Conf, char *word)
|
||||
var = ptr;
|
||||
}
|
||||
}
|
||||
return res;
|
||||
return lres;
|
||||
}
|
||||
|
||||
|
||||
|
@ -3,10 +3,11 @@
|
||||
|
||||
#include <sys/types.h>
|
||||
#include "regex/regex.h"
|
||||
#include "regis.h"
|
||||
#include "c.h"
|
||||
|
||||
|
||||
#include "regis.h"
|
||||
#include "dict.h"
|
||||
|
||||
struct SPNode;
|
||||
|
||||
|
||||
@ -116,7 +117,7 @@ typedef struct
|
||||
|
||||
} IspellDict;
|
||||
|
||||
char **NINormalizeWord(IspellDict * Conf, char *word);
|
||||
TSLexeme *NINormalizeWord(IspellDict * Conf, char *word);
|
||||
int NIImportAffixes(IspellDict * Conf, const char *filename);
|
||||
int NIImportDictionary(IspellDict * Conf, const char *filename);
|
||||
|
||||
|
Reference in New Issue
Block a user