1
0
mirror of https://github.com/postgres/postgres.git synced 2025-07-05 07:21:24 +03:00

Improve support of Hunspell in ispell dictionary.

Now it's possible to load recent version of Hunspell for several languages.
To handle these dictionaries Hunspell patch adds support for:
* FLAG long - sets the double extended ASCII character flag type
* FLAG num - sets the decimal number flag type (from 1 to 65535)
* AF parameter - alias for flag's set

Also it moves test dictionaries into separate directory.

Author: Artur Zakirov with editorization by me
This commit is contained in:
Teodor Sigaev
2016-03-04 20:08:10 +03:00
parent 9445db925e
commit d78a7d9c7f
15 changed files with 1105 additions and 91 deletions

View File

@ -13,8 +13,11 @@ include $(top_builddir)/src/Makefile.global
DICTDIR=tsearch_data
DICTFILES=synonym_sample.syn thesaurus_sample.ths hunspell_sample.affix \
ispell_sample.affix ispell_sample.dict
DICTFILES=dicts/synonym_sample.syn dicts/thesaurus_sample.ths \
dicts/hunspell_sample.affix \
dicts/ispell_sample.affix dicts/ispell_sample.dict \
dicts/hunspell_sample_long.affix dicts/hunspell_sample_long.dict \
dicts/hunspell_sample_num.affix dicts/hunspell_sample_num.dict
OBJS = ts_locale.o ts_parse.o wparser.o wparser_def.o dict.o \
dict_simple.o dict_synonym.o dict_thesaurus.o \

View File

@ -0,0 +1,35 @@
FLAG long
AF 7
AF cZ #1
AF cL #2
AF sGsJpUsS #3
AF sSpB #4
AF cZsS #5
AF sScZs\ #6
AF sA #7
COMPOUNDFLAG cZ
ONLYINCOMPOUND cL
PFX pB Y 1
PFX pB 0 re .
PFX pU N 1
PFX pU 0 un .
SFX sJ Y 1
SFX sJ 0 INGS [^E]
SFX sG Y 1
SFX sG 0 ING [^E]
SFX sS Y 1
SFX sS 0 S [^SXZHY]
SFX sA Y 1
SFX sA Y IES [^AEIOU]Y
SFX s\ N 1
SFX s\ 0 Y/2 [^Y]

View File

@ -0,0 +1,8 @@
book/3
booking/4
footballklubber
foot/5
football/1
ball/6
klubber/1
sky/7

View File

@ -0,0 +1,26 @@
FLAG num
COMPOUNDFLAG 101
ONLYINCOMPOUND 102
PFX 201 Y 1
PFX 201 0 re .
PFX 202 N 1
PFX 202 0 un .
SFX 301 Y 1
SFX 301 0 INGS [^E]
SFX 302 Y 1
SFX 302 0 ING [^E]
SFX 303 Y 1
SFX 303 0 S [^SXZHY]
SFX 304 Y 1
SFX 304 Y IES [^AEIOU]Y
SFX 305 N 1
SFX 305 0 Y/102 [^Y]

View File

@ -0,0 +1,8 @@
book/302,301,202,303
booking/303,201
footballklubber
foot/101,303
football/101
ball/303,101,305
klubber/101
sky/304

View File

@ -5,6 +5,54 @@
*
* Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
*
* Ispell dictionary
* -----------------
*
* Rules of dictionaries are defined in two files with .affix and .dict
* extensions. They are used by spell checker programs Ispell and Hunspell.
*
* An .affix file declares morphological rules to get a basic form of words.
* The format of an .affix file has different structure for Ispell and Hunspell
* dictionaries. The Hunspell format is more complicated. But when an .affix
* file is imported and compiled, it is stored in the same structure AffixNode.
*
* A .dict file stores a list of basic forms of words with references to
* affix rules. The format of a .dict file has the same structure for Ispell
* and Hunspell dictionaries.
*
* Compilation of a dictionary
* ---------------------------
*
* A compiled dictionary is stored in the IspellDict structure. Compilation of
* a dictionary is divided into the several steps:
* - NIImportDictionary() - stores each word of a .dict file in the
* temporary Spell field.
* - NIImportAffixes() - stores affix rules of an .affix file in the
* Affix field (not temporary) if an .affix file has the Ispell format.
* -> NIImportOOAffixes() - stores affix rules if an .affix file has the
* Hunspell format. The AffixData field is initialized if AF parameter
* is defined.
* - NISortDictionary() - builds a prefix tree (Trie) from the words list
* and stores it in the Dictionary field. The words list is got from the
* Spell field. The AffixData field is initialized if AF parameter is not
* defined.
* - NISortAffixes():
* - builds a list of compond affixes from the affix list and stores it
* in the CompoundAffix.
* - builds prefix trees (Trie) from the affix list for prefixes and suffixes
* and stores them in Suffix and Prefix fields.
* The affix list is got from the Affix field.
*
* Memory management
* -----------------
*
* The IspellDict structure has the Spell field which is used only in compile
* time. The Spell field stores a words list. It can take a lot of memory.
* Therefore when a dictionary is compiled this field is cleared by
* NIFinishBuild().
*
* All resources which should cleared by NIFinishBuild() is initialized using
* tmpalloc() and tmpalloc0().
*
* IDENTIFICATION
* src/backend/tsearch/spell.c
@ -150,10 +198,12 @@ cmpspell(const void *s1, const void *s2)
{
return (strcmp((*(SPELL *const *) s1)->word, (*(SPELL *const *) s2)->word));
}
static int
cmpspellaffix(const void *s1, const void *s2)
{
return (strncmp((*(SPELL *const *) s1)->p.flag, (*(SPELL *const *) s2)->p.flag, MAXFLAGLEN));
return (strcmp((*(SPELL *const *) s1)->p.flag,
(*(SPELL *const *) s2)->p.flag));
}
static char *
@ -220,6 +270,11 @@ strbncmp(const unsigned char *s1, const unsigned char *s2, size_t count)
return 0;
}
/*
* Compares affixes.
* First compares the type of an affix. Prefixes should go before affixes.
* If types are equal then compares replaceable string.
*/
static int
cmpaffix(const void *s1, const void *s2)
{
@ -237,6 +292,176 @@ cmpaffix(const void *s1, const void *s2)
(const unsigned char *) a2->repl);
}
/*
* Gets an affix flag from string representation (a set of affixes).
*
* Several flags can be stored in a single string. Flags can be represented by:
* - 1 character (FM_CHAR).
* - 2 characters (FM_LONG).
* - numbers from 1 to 65000 (FM_NUM).
*
* Depending on the flagMode an affix string can have the following format:
* - FM_CHAR: ABCD
* Here we have 4 flags: A, B, C and D
* - FM_LONG: ABCDE*
* Here we have 3 flags: AB, CD and E*
* - FM_NUM: 200,205,50
* Here we have 3 flags: 200, 205 and 50
*
* Conf: current dictionary.
* sflag: string representation (a set of affixes) of an affix flag.
* sflagnext: returns reference to the start of a next affix flag in the sflag.
*
* Returns an integer representation of the affix flag.
*/
static uint16
DecodeFlag(IspellDict *Conf, char *sflag, char **sflagnext)
{
int32 s;
char *next;
unsigned char *usflag;
switch (Conf->flagMode)
{
case FM_LONG:
/*
* Hunspell docs says flag could contains only
* ASCII characters
*/
if (!(pg_mblen(sflag) == 1 && isascii(sflag[0]) &&
pg_mblen(sflag + 1) == 1 && isascii(sflag[1])))
ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("non-ASCII affix flag \"%s\"", sflag)));
usflag = (unsigned char *)sflag;
s = ((int)usflag[0]) << 8 | ((int)usflag[1]);
if (sflagnext)
/* Go to start of the next flag */
*sflagnext = sflag + 2;
break;
case FM_NUM:
s = strtol(sflag, &next, 10);
if (s <= 0 || s > FLAGNUM_MAXSIZE)
ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("invalid affix flag \"%s\"", sflag)));
if (sflagnext)
{
/* Go to start of the next flag */
if (next)
{
bool met_comma = false;
while (*next)
{
if (!(pg_mblen(next) == 1 && isascii(*next)))
{
ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("non-ASCII affix flag \"%s\"",
sflag)));
}
else if (isdigit(*next))
{
if (!met_comma)
ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("invalid affix flag \"%s\"",
sflag)));
break;
}
else if (*next == ',')
{
if (met_comma)
ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("invalid affix flag \"%s\"",
sflag)));
met_comma = true;
}
else if (!isspace(*next))
{
ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("invalid character in affix flag \"%s\"", sflag)));
}
next++;
}
if (*next == '\0')
next = NULL;
}
*sflagnext = next;
}
break;
default:
if (!(pg_mblen(sflag) == 1 && isascii(*sflag)))
ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("non-ASCII affix flag \"%s\"", sflag)));
s = *sflag;
if (sflagnext)
/* Go to start of the next flag */
*sflagnext = sflag + pg_mblen(sflag);
}
return (uint16)s;
}
/*
* Checks if the affix set Conf->AffixData[affix] contains affixflag.
* Conf->AffixData[affix] is the string representation of an affix flags.
* Conf->AffixData[affix] does not contain affixflag if this flag is not used
* actually by the .dict file.
*
* Conf: current dictionary.
* affix: index of the Conf->AffixData array.
* affixflag: integer representation of the affix flag.
*
* Returns true if the string Conf->AffixData[affix] contains affixflag,
* otherwise returns false.
*/
static bool
IsAffixFlagInUse(IspellDict *Conf, int affix, uint16 affixflag)
{
char *flagcur;
char *flagnext = NULL;
if (affixflag == 0)
return true;
flagcur = Conf->AffixData[affix];
while (*flagcur)
{
/* Compare first affix flag in flagcur with affixflag */
if (DecodeFlag(Conf, flagcur, &flagnext) == affixflag)
return true;
/* Otherwise go to next flag */
if (flagnext)
flagcur = flagnext;
/* If we have not flags anymore then exit */
else
break;
}
/* Could not find affixflag */
return false;
}
/*
* Adds the new word into the temporary array Spell.
*
* Conf: current dictionary.
* word: new word.
* flag: set of affix flags. Integer representation of flag can be got by
* DecodeFlag().
*/
static void
NIAddSpell(IspellDict *Conf, const char *word, const char *flag)
{
@ -255,14 +480,18 @@ NIAddSpell(IspellDict *Conf, const char *word, const char *flag)
}
Conf->Spell[Conf->nspell] = (SPELL *) tmpalloc(SPELLHDRSZ + strlen(word) + 1);
strcpy(Conf->Spell[Conf->nspell]->word, word);
strlcpy(Conf->Spell[Conf->nspell]->p.flag, flag, MAXFLAGLEN);
Conf->Spell[Conf->nspell]->p.flag = (*flag != '\0')
? cpstrdup(Conf, flag) : VoidString;
Conf->nspell++;
}
/*
* import dictionary
* Imports dictionary into the temporary array Spell.
*
* Note caller must already have applied get_tsearch_config_filename
* Note caller must already have applied get_tsearch_config_filename.
*
* Conf: current dictionary.
* filename: path to the .dict file.
*/
void
NIImportDictionary(IspellDict *Conf, const char *filename)
@ -280,6 +509,7 @@ NIImportDictionary(IspellDict *Conf, const char *filename)
{
char *s,
*pstr;
/* Set of affix flags */
const char *flag;
/* Extract flag from the line */
@ -324,7 +554,30 @@ NIImportDictionary(IspellDict *Conf, const char *filename)
tsearch_readline_end(&trst);
}
/*
* Searches a basic form of word in the prefix tree. This word was generated
* using an affix rule. This rule may not be presented in an affix set of
* a basic form of word.
*
* For example, we have the entry in the .dict file:
* meter/GMD
*
* The affix rule with the flag S:
* SFX S y ies [^aeiou]y
* is not presented here.
*
* The affix rule with the flag M:
* SFX M 0 's .
* is presented here.
*
* Conf: current dictionary.
* word: basic form of word.
* affixflag: integer representation of the affix flag, by which a basic form of
* word was generated.
* flag: compound flag used to compare with StopMiddle->compoundflag.
*
* Returns 1 if the word was found in the prefix tree, else returns 0.
*/
static int
FindWord(IspellDict *Conf, const char *word, int affixflag, int flag)
{
@ -349,13 +602,22 @@ FindWord(IspellDict *Conf, const char *word, int affixflag, int flag)
{
if (flag == 0)
{
/*
* The word can be formed only with another word.
* And in the flag parameter there is not a sign
* that we search compound words.
*/
if (StopMiddle->compoundflag & FF_COMPOUNDONLY)
return 0;
}
else if ((flag & StopMiddle->compoundflag) == 0)
return 0;
if ((affixflag == 0) || (strchr(Conf->AffixData[StopMiddle->affix], affixflag) != NULL))
/*
* Check if this affix rule is presented in the affix set
* with index StopMiddle->affix.
*/
if (IsAffixFlagInUse(Conf, StopMiddle->affix, affixflag))
return 1;
}
node = StopMiddle->node;
@ -373,6 +635,24 @@ FindWord(IspellDict *Conf, const char *word, int affixflag, int flag)
return 0;
}
/*
* Adds a new affix rule to the Affix field.
*
* Conf: current dictionary.
* flag: integer representation of the affix flag ('\' in the below example).
* flagflags: set of flags from the flagval field for this affix rule. This set
* is listed after '/' character in the added string (repl).
*
* For example L flag in the hunspell_sample.affix:
* SFX \ 0 Y/L [^Y]
*
* mask: condition for search ('[^Y]' in the above example).
* find: stripping characters from beginning (at prefix) or end (at suffix)
* of the word ('0' in the above example, 0 means that there is not
* stripping character).
* repl: adding string after stripping ('Y' in the above example).
* type: FF_SUFFIX or FF_PREFIX.
*/
static void
NIAddAffix(IspellDict *Conf, int flag, char flagflags, const char *mask, const char *find, const char *repl, int type)
{
@ -394,18 +674,21 @@ NIAddAffix(IspellDict *Conf, int flag, char flagflags, const char *mask, const c
Affix = Conf->Affix + Conf->naffixes;
if (strcmp(mask, ".") == 0)
/* This affix rule can be applied for words with any ending */
if (strcmp(mask, ".") == 0 || *mask == '\0')
{
Affix->issimple = 1;
Affix->isregis = 0;
}
/* This affix rule will use regis to search word ending */
else if (RS_isRegis(mask))
{
Affix->issimple = 0;
Affix->isregis = 1;
RS_compile(&(Affix->reg.regis), (type == FF_SUFFIX) ? true : false,
RS_compile(&(Affix->reg.regis), (type == FF_SUFFIX),
*mask ? mask : VoidString);
}
/* This affix rule will use regex_t to search word ending */
else
{
int masklen;
@ -457,7 +740,6 @@ NIAddAffix(IspellDict *Conf, int flag, char flagflags, const char *mask, const c
Conf->naffixes++;
}
/* Parsing states for parse_affentry() and friends */
#define PAE_WAIT_MASK 0
#define PAE_INMASK 1
@ -712,9 +994,16 @@ parse_affentry(char *str, char *mask, char *find, char *repl)
*pmask = *pfind = *prepl = '\0';
return (*mask && (*find || *repl)) ? true : false;
return (*mask && (*find || *repl));
}
/*
* Sets up a correspondence for the affix parameter with the affix flag.
*
* Conf: current dictionary.
* s: affix flag in string.
* val: affix parameter.
*/
static void
addFlagValue(IspellDict *Conf, char *s, uint32 val)
{
@ -731,12 +1020,66 @@ addFlagValue(IspellDict *Conf, char *s, uint32 val)
(errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("multibyte flag character is not allowed")));
Conf->flagval[*(unsigned char *) s] = (unsigned char) val;
Conf->flagval[DecodeFlag(Conf, s, (char **)NULL)] = (unsigned char) val;
Conf->usecompound = true;
}
/*
* Import an affix file that follows MySpell or Hunspell format
* Returns a set of affix parameters which correspondence to the set of affix
* flags s.
*/
static int
getFlagValues(IspellDict *Conf, char *s)
{
uint32 flag = 0;
char *flagcur;
char *flagnext = NULL;
flagcur = s;
while (*flagcur)
{
flag |= Conf->flagval[DecodeFlag(Conf, flagcur, &flagnext)];
if (flagnext)
flagcur = flagnext;
else
break;
}
return flag;
}
/*
* Returns a flag set using the s parameter.
*
* If Conf->useFlagAliases is true then the s parameter is index of the
* Conf->AffixData array and function returns its entry.
* Else function returns the s parameter.
*/
static char *
getFlags(IspellDict *Conf, char *s)
{
if (Conf->useFlagAliases)
{
int curaffix = strtol(s, (char **)NULL, 10);
if (curaffix && curaffix <= Conf->nAffixData)
/*
* Do not substract 1 from curaffix
* because empty string was added in NIImportOOAffixes
*/
return Conf->AffixData[curaffix];
else
return VoidString;
}
else
return s;
}
/*
* Import an affix file that follows MySpell or Hunspell format.
*
* Conf: current dictionary.
* filename: path to the .affix file.
*/
static void
NIImportOOAffixes(IspellDict *Conf, const char *filename)
@ -751,7 +1094,10 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
char repl[BUFSIZ],
*prepl;
bool isSuffix = false;
int flag = 0;
int naffix = 0,
curaffix = 0;
int flag = 0,
sflaglen = 0;
char flagflags = 0;
tsearch_readline_state trst;
char *recoded;
@ -759,6 +1105,8 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
/* read file to find any flag */
memset(Conf->flagval, 0, sizeof(Conf->flagval));
Conf->usecompound = false;
Conf->useFlagAliases = false;
Conf->flagMode = FM_CHAR;
if (!tsearch_readline_begin(&trst, filename))
ereport(ERROR,
@ -806,10 +1154,18 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
while (*s && t_isspace(s))
s += pg_mblen(s);
if (*s && STRNCMP(s, "default") != 0)
ereport(ERROR,
if (*s)
{
if (STRNCMP(s, "long") == 0)
Conf->flagMode = FM_LONG;
else if (STRNCMP(s, "num") == 0)
Conf->flagMode = FM_NUM;
else if (STRNCMP(s, "default") != 0)
ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("Ispell dictionary supports only default flag value")));
errmsg("Ispell dictionary supports only default, "
"long and num flag value")));
}
}
pfree(recoded);
@ -834,27 +1190,77 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
if (ptype)
pfree(ptype);
ptype = lowerstr_ctx(Conf, type);
/* First try to parse AF parameter (alias compression) */
if (STRNCMP(ptype, "af") == 0)
{
/* First line is the number of aliases */
if (!Conf->useFlagAliases)
{
Conf->useFlagAliases = true;
naffix = atoi(sflag);
if (naffix == 0)
ereport(ERROR,
(errcode(ERRCODE_CONFIG_FILE_ERROR),
errmsg("invalid number of flag vector aliases")));
/* Also reserve place for empty flag set */
naffix++;
Conf->AffixData = (char **) palloc0(naffix * sizeof(char *));
Conf->lenAffixData = Conf->nAffixData = naffix;
/* Add empty flag set into AffixData */
Conf->AffixData[curaffix] = VoidString;
curaffix++;
}
/* Other lines is aliases */
else
{
if (curaffix < naffix)
{
Conf->AffixData[curaffix] = cpstrdup(Conf, sflag);
curaffix++;
}
}
goto nextline;
}
/* Else try to parse prefixes and suffixes */
if (fields_read < 4 ||
(STRNCMP(ptype, "sfx") != 0 && STRNCMP(ptype, "pfx") != 0))
goto nextline;
sflaglen = strlen(sflag);
if (sflaglen == 0
|| (sflaglen > 1 && Conf->flagMode == FM_CHAR)
|| (sflaglen > 2 && Conf->flagMode == FM_LONG))
goto nextline;
/*
* Affix header. For example:
* SFX \ N 1
*/
if (fields_read == 4)
{
if (strlen(sflag) != 1)
goto nextline;
flag = *sflag;
isSuffix = (STRNCMP(ptype, "sfx") == 0) ? true : false;
/* Convert the affix flag to int */
flag = DecodeFlag(Conf, sflag, (char **)NULL);
isSuffix = (STRNCMP(ptype, "sfx") == 0);
if (t_iseq(find, 'y') || t_iseq(find, 'Y'))
flagflags = FF_CROSSPRODUCT;
else
flagflags = 0;
}
/*
* Affix fields. For example:
* SFX \ 0 Y/L [^Y]
*/
else
{
char *ptr;
int aflg = 0;
if (strlen(sflag) != 1 || flag != *sflag || flag == 0)
if (flag == 0)
goto nextline;
prepl = lowerstr_ctx(Conf, repl);
/* Find position of '/' in lowercased string "prepl" */
@ -866,11 +1272,7 @@ NIImportOOAffixes(IspellDict *Conf, const char *filename)
*/
*ptr = '\0';
ptr = repl + (ptr - prepl) + 1;
while (*ptr)
{
aflg |= Conf->flagval[*(unsigned char *) ptr];
ptr++;
}
aflg |= getFlagValues(Conf, getFlags(Conf, ptr));
}
pfind = lowerstr_ctx(Conf, find);
pmask = lowerstr_ctx(Conf, mask);
@ -928,6 +1330,8 @@ NIImportAffixes(IspellDict *Conf, const char *filename)
memset(Conf->flagval, 0, sizeof(Conf->flagval));
Conf->usecompound = false;
Conf->useFlagAliases = false;
Conf->flagMode = FM_CHAR;
while ((recoded = tsearch_readline(&trst)) != NULL)
{
@ -1044,6 +1448,12 @@ isnewformat:
NIImportOOAffixes(Conf, filename);
}
/*
* Merges two affix flag sets and stores a new affix flag set into
* Conf->AffixData.
*
* Returns index of a new affix flag set.
*/
static int
MergeAffix(IspellDict *Conf, int a1, int a2)
{
@ -1068,21 +1478,25 @@ MergeAffix(IspellDict *Conf, int a1, int a2)
return Conf->nAffixData - 1;
}
/*
* Returns a set of affix parameters which correspondence to the set of affix
* flags with the given index.
*/
static uint32
makeCompoundFlags(IspellDict *Conf, int affix)
{
uint32 flag = 0;
char *str = Conf->AffixData[affix];
while (str && *str)
{
flag |= Conf->flagval[*(unsigned char *) str];
str++;
}
return (flag & FF_DICTFLAGMASK);
char *str = Conf->AffixData[affix];
return (getFlagValues(Conf, str) & FF_DICTFLAGMASK);
}
/*
* Makes a prefix tree for the given level.
*
* Conf: current dictionary.
* low: lower index of the Conf->Spell array.
* high: upper index of the Conf->Spell array.
* level: current prefix tree level.
*/
static SPNode *
mkSPNode(IspellDict *Conf, int low, int high, int level)
{
@ -1115,6 +1529,7 @@ mkSPNode(IspellDict *Conf, int low, int high, int level)
{
if (lastchar)
{
/* Next level of the prefix tree */
data->node = mkSPNode(Conf, lownew, i, level + 1);
lownew = i;
data++;
@ -1154,6 +1569,7 @@ mkSPNode(IspellDict *Conf, int low, int high, int level)
}
}
/* Next level of the prefix tree */
data->node = mkSPNode(Conf, lownew, high, level + 1);
return rs;
@ -1172,44 +1588,83 @@ NISortDictionary(IspellDict *Conf)
/* compress affixes */
/* Count the number of different flags used in the dictionary */
qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspellaffix);
naffix = 0;
for (i = 0; i < Conf->nspell; i++)
{
if (i == 0 || strncmp(Conf->Spell[i]->p.flag, Conf->Spell[i - 1]->p.flag, MAXFLAGLEN))
naffix++;
}
/*
* Fill in Conf->AffixData with the affixes that were used in the
* dictionary. Replace textual flag-field of Conf->Spell entries with
* indexes into Conf->AffixData array.
* If we use flag aliases then we need to use Conf->AffixData filled
* in the NIImportOOAffixes().
*/
Conf->AffixData = (char **) palloc0(naffix * sizeof(char *));
curaffix = -1;
for (i = 0; i < Conf->nspell; i++)
if (Conf->useFlagAliases)
{
if (i == 0 || strncmp(Conf->Spell[i]->p.flag, Conf->AffixData[curaffix], MAXFLAGLEN))
for (i = 0; i < Conf->nspell; i++)
{
curaffix++;
Assert(curaffix < naffix);
Conf->AffixData[curaffix] = cpstrdup(Conf, Conf->Spell[i]->p.flag);
curaffix = strtol(Conf->Spell[i]->p.flag, (char **)NULL, 10);
if (curaffix && curaffix <= Conf->nAffixData)
Conf->Spell[i]->p.d.affix = curaffix;
else
/*
* If Conf->Spell[i]->p.flag is empty, then get empty value of
* Conf->AffixData (0 index).
*/
Conf->Spell[i]->p.d.affix = 0;
Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word);
}
}
/* Otherwise fill Conf->AffixData here */
else
{
/* Count the number of different flags used in the dictionary */
qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *),
cmpspellaffix);
naffix = 0;
for (i = 0; i < Conf->nspell; i++)
{
if (i == 0
|| strcmp(Conf->Spell[i]->p.flag, Conf->Spell[i - 1]->p.flag))
naffix++;
}
Conf->Spell[i]->p.d.affix = curaffix;
Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word);
/*
* Fill in Conf->AffixData with the affixes that were used in the
* dictionary. Replace textual flag-field of Conf->Spell entries with
* indexes into Conf->AffixData array.
*/
Conf->AffixData = (char **) palloc0(naffix * sizeof(char *));
curaffix = -1;
for (i = 0; i < Conf->nspell; i++)
{
if (i == 0
|| strcmp(Conf->Spell[i]->p.flag, Conf->AffixData[curaffix]))
{
curaffix++;
Assert(curaffix < naffix);
Conf->AffixData[curaffix] = cpstrdup(Conf,
Conf->Spell[i]->p.flag);
}
Conf->Spell[i]->p.d.affix = curaffix;
Conf->Spell[i]->p.d.len = strlen(Conf->Spell[i]->word);
}
Conf->lenAffixData = Conf->nAffixData = naffix;
}
Conf->lenAffixData = Conf->nAffixData = naffix;
/* Start build a prefix tree */
qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL *), cmpspell);
Conf->Dictionary = mkSPNode(Conf, 0, Conf->nspell, 0);
}
/*
* Makes a prefix tree for the given level using the repl string of an affix
* rule. Affixes with empty replace string do not include in the prefix tree.
* This affixes are included by mkVoidAffix().
*
* Conf: current dictionary.
* low: lower index of the Conf->Affix array.
* high: upper index of the Conf->Affix array.
* level: current prefix tree level.
* type: FF_SUFFIX or FF_PREFIX.
*/
static AffixNode *
mkANode(IspellDict *Conf, int low, int high, int level, int type)
{
@ -1247,6 +1702,7 @@ mkANode(IspellDict *Conf, int low, int high, int level, int type)
{
if (lastchar)
{
/* Next level of the prefix tree */
data->node = mkANode(Conf, lownew, i, level + 1, type);
if (naff)
{
@ -1267,6 +1723,7 @@ mkANode(IspellDict *Conf, int low, int high, int level, int type)
}
}
/* Next level of the prefix tree */
data->node = mkANode(Conf, lownew, high, level + 1, type);
if (naff)
{
@ -1281,6 +1738,10 @@ mkANode(IspellDict *Conf, int low, int high, int level, int type)
return rs;
}
/*
* Makes the root void node in the prefix tree. The root void node is created
* for affixes which have empty replace string ("repl" field).
*/
static void
mkVoidAffix(IspellDict *Conf, bool issuffix, int startsuffix)
{
@ -1304,11 +1765,12 @@ mkVoidAffix(IspellDict *Conf, bool issuffix, int startsuffix)
Conf->Prefix = Affix;
}
/* Count affixes with empty replace string */
for (i = start; i < end; i++)
if (Conf->Affix[i].replen == 0)
cnt++;
/* There is not affixes with empty replace string */
if (cnt == 0)
return;
@ -1324,18 +1786,31 @@ mkVoidAffix(IspellDict *Conf, bool issuffix, int startsuffix)
}
}
/*
* Checks if the affixflag is used by dictionary. Conf->AffixData does not
* contain affixflag if this flag is not used actually by the .dict file.
*
* Conf: current dictionary.
* affixflag: integer representation of the affix flag.
*
* Returns true if the Conf->AffixData array contains affixflag, otherwise
* returns false.
*/
static bool
isAffixInUse(IspellDict *Conf, char flag)
isAffixInUse(IspellDict *Conf, uint16 affixflag)
{
int i;
for (i = 0; i < Conf->nAffixData; i++)
if (strchr(Conf->AffixData[i], flag) != NULL)
if (IsAffixFlagInUse(Conf, i, affixflag))
return true;
return false;
}
/*
* Builds Conf->Prefix and Conf->Suffix trees from the imported affixes.
*/
void
NISortAffixes(IspellDict *Conf)
{
@ -1347,6 +1822,7 @@ NISortAffixes(IspellDict *Conf)
if (Conf->naffixes == 0)
return;
/* Store compound affixes in the Conf->CompoundAffix array */
if (Conf->naffixes > 1)
qsort((void *) Conf->Affix, Conf->naffixes, sizeof(AFFIX), cmpaffix);
Conf->CompoundAffix = ptr = (CMPDAffix *) palloc(sizeof(CMPDAffix) * Conf->naffixes);
@ -1359,7 +1835,7 @@ NISortAffixes(IspellDict *Conf)
firstsuffix = i;
if ((Affix->flagflags & FF_COMPOUNDFLAG) && Affix->replen > 0 &&
isAffixInUse(Conf, (char) Affix->flag))
isAffixInUse(Conf, Affix->flag))
{
if (ptr == Conf->CompoundAffix ||
ptr->issuffix != (ptr - 1)->issuffix ||
@ -1370,7 +1846,7 @@ NISortAffixes(IspellDict *Conf)
/* leave only unique and minimals suffixes */
ptr->affix = Affix->repl;
ptr->len = Affix->replen;
ptr->issuffix = (Affix->type == FF_SUFFIX) ? true : false;
ptr->issuffix = (Affix->type == FF_SUFFIX);
ptr++;
}
}
@ -1378,6 +1854,7 @@ NISortAffixes(IspellDict *Conf)
ptr->affix = NULL;
Conf->CompoundAffix = (CMPDAffix *) repalloc(Conf->CompoundAffix, sizeof(CMPDAffix) * (ptr - Conf->CompoundAffix + 1));
/* Start build a prefix tree */
Conf->Prefix = mkANode(Conf, 0, firstsuffix, 0, FF_PREFIX);
Conf->Suffix = mkANode(Conf, firstsuffix, Conf->naffixes, 0, FF_SUFFIX);
mkVoidAffix(Conf, true, firstsuffix);
@ -1825,7 +2302,7 @@ SplitToVariants(IspellDict *Conf, SPNode *snode, SplitVar *orig, char *word, int
if (StopLow < StopHigh)
{
if (level == FF_COMPOUNDBEGIN)
if (startpos == 0)
compoundflag = FF_COMPOUNDBEGIN;
else if (level == wordlen - 1)
compoundflag = FF_COMPOUNDLAST;