1
0
mirror of https://github.com/postgres/postgres.git synced 2025-07-21 16:02:15 +03:00

Rename and slightly redefine the default text search parser's "word"
categories, as per discussion.  asciiword (formerly lword) is still
ASCII-letters-only, and numword (formerly word) is still the most general
mixed-alpha-and-digits case.  But word (formerly nlword) is now
any-group-of-letters-with-at-least-one-non-ASCII, rather than all-non-ASCII as
before.  This is no worse than before for parsing mixed Russian/English text,
which seems to have been the design center for the original coding; and it
should simplify matters for parsing most European languages.  In particular
it will not be necessary for any language to accept strings containing digits
as being regular "words".  The hyphenated-word categories are adjusted
similarly.
This commit is contained in:
Tom Lane
2007-10-23 20:46:12 +00:00
parent 344d0cae64
commit dbaec70c15
10 changed files with 464 additions and 447 deletions

View File

@ -2,7 +2,7 @@
#
# Makefile for src/backend/snowball
#
# $PostgreSQL: pgsql/src/backend/snowball/Makefile,v 1.3 2007/08/27 10:29:49 mha Exp $
# $PostgreSQL: pgsql/src/backend/snowball/Makefile,v 1.4 2007/10/23 20:46:12 tgl Exp $
#
#-------------------------------------------------------------------------
@ -46,8 +46,9 @@ OBJS= dict_snowball.o api.o utilities.o \
stem_UTF_8_swedish.o \
stem_UTF_8_turkish.o
# second column is name of latin dictionary, if different
# Note order dependency: use of some other language as latin dictionary
# first column is language name and also name of dictionary for not-all-ASCII
# words, second is name of dictionary for all-ASCII words
# Note order dependency: use of some other language as ASCII dictionary
# must come after creation of that language
LANGUAGES= \
danish danish \
@ -95,8 +96,8 @@ ifeq ($(enable_shared), yes)
while [ "$$#" -gt 0 ] ; \
do \
lang=$$1; shift; \
nonlatdictname=$$lang; \
latdictname=$$1; shift; \
nonascdictname=$$lang; \
ascdictname=$$1; shift; \
if [ -s $(srcdir)/stopwords/$${lang}.stop ] ; then \
stop=", StopWords=$${lang}" ; \
else \
@ -106,8 +107,8 @@ ifeq ($(enable_shared), yes)
sed -e "s#_LANGNAME_#$$lang#g" | \
sed -e "s#_DICTNAME_#$${lang}_stem#g" | \
sed -e "s#_CFGNAME_#$$lang#g" | \
sed -e "s#_LATDICTNAME_#$${latdictname}_stem#g" | \
sed -e "s#_NONLATDICTNAME_#$${nonlatdictname}_stem#g" | \
sed -e "s#_ASCDICTNAME_#$${ascdictname}_stem#g" | \
sed -e "s#_NONASCDICTNAME_#$${nonascdictname}_stem#g" | \
sed -e "s#_STOPWORDS_#$$stop#g" ; \
done >> $@
else

View File

@ -1,4 +1,4 @@
-- $PostgreSQL: pgsql/src/backend/snowball/snowball.sql.in,v 1.4 2007/09/03 02:30:43 tgl Exp $$
-- $PostgreSQL: pgsql/src/backend/snowball/snowball.sql.in,v 1.5 2007/10/23 20:46:12 tgl Exp $$
-- text search configuration for _LANGNAME_ language
CREATE TEXT SEARCH DICTIONARY _DICTNAME_
@ -12,14 +12,15 @@ CREATE TEXT SEARCH CONFIGURATION _CFGNAME_
COMMENT ON TEXT SEARCH CONFIGURATION _CFGNAME_ IS 'configuration for _LANGNAME_ language';
ALTER TEXT SEARCH CONFIGURATION _CFGNAME_ ADD MAPPING
FOR email, url, host, sfloat, version, uri, file, float, int, uint
FOR email, url, host, sfloat, version, uri, file, float, int, uint,
numword, hword_numpart, numhword
WITH simple;
ALTER TEXT SEARCH CONFIGURATION _CFGNAME_ ADD MAPPING
FOR lhword, lpart_hword, lword
WITH _LATDICTNAME_;
FOR asciiword, hword_asciipart, asciihword
WITH _ASCDICTNAME_;
ALTER TEXT SEARCH CONFIGURATION _CFGNAME_ ADD MAPPING
FOR hword, nlhword, nlpart_hword, nlword, word, part_hword
WITH _NONLATDICTNAME_;
FOR word, hword_part, hword
WITH _NONASCDICTNAME_;

View File

@ -1,13 +1,13 @@
/*-------------------------------------------------------------------------
*
* wparser_def.c
* Standard word parser
* Default text search parser
*
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.3 2007/09/07 15:09:55 teodor Exp $
* $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.4 2007/10/23 20:46:12 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -22,79 +22,53 @@
#include "utils/builtins.h"
/* rememder !!!! */
#define LASTNUM 23
/* Output token categories */
#define LATWORD 1
#define CYRWORD 2
#define UWORD 3
#define EMAIL 4
#define FURL 5
#define HOST 6
#define SCIENTIFIC 7
#define ASCIIWORD 1
#define WORD_T 2
#define NUMWORD 3
#define EMAIL 4
#define URL_T 5
#define HOST 6
#define SCIENTIFIC 7
#define VERSIONNUMBER 8
#define PARTHYPHENWORD 9
#define CYRPARTHYPHENWORD 10
#define LATPARTHYPHENWORD 11
#define SPACE 12
#define TAG 13
#define NUMPARTHWORD 9
#define PARTHWORD 10
#define ASCIIPARTHWORD 11
#define SPACE 12
#define TAG_T 13
#define PROTOCOL 14
#define HYPHENWORD 15
#define LATHYPHENWORD 16
#define CYRHYPHENWORD 17
#define URI 18
#define FILEPATH 19
#define DECIMAL 20
#define SIGNEDINT 21
#define UNSIGNEDINT 22
#define HTMLENTITY 23
#define NUMHWORD 15
#define ASCIIHWORD 16
#define HWORD 17
#define URI 18
#define FILEPATH 19
#define DECIMAL 20
#define SIGNEDINT 21
#define UNSIGNEDINT 22
#define HTMLENTITY 23
static const char *lex_descr[] = {
"",
"Latin word",
"Non-latin word",
"Word",
"Email",
"URL",
"Host",
"Scientific notation",
"VERSION",
"Part of hyphenated word",
"Non-latin part of hyphenated word",
"Latin part of hyphenated word",
"Space symbols",
"HTML Tag",
"Protocol head",
"Hyphenated word",
"Latin hyphenated word",
"Non-latin hyphenated word",
"URI",
"File or path name",
"Decimal notation",
"Signed integer",
"Unsigned integer",
"HTML Entity"
};
#define LASTNUM 23
static const char *tok_alias[] = {
static const char * const tok_alias[] = {
"",
"lword",
"nlword",
"asciiword",
"word",
"numword",
"email",
"url",
"host",
"sfloat",
"version",
"part_hword",
"nlpart_hword",
"lpart_hword",
"hword_numpart",
"hword_part",
"hword_asciipart",
"blank",
"tag",
"protocol",
"numhword",
"asciihword",
"hword",
"lhword",
"nlhword",
"uri",
"file",
"float",
@ -103,12 +77,42 @@ static const char *tok_alias[] = {
"entity"
};
static const char * const lex_descr[] = {
"",
"Word, all ASCII",
"Word, all letters",
"Word, letters and digits",
"Email address",
"URL",
"Host",
"Scientific notation",
"Version number",
"Hyphenated word part, letters and digits",
"Hyphenated word part, all letters",
"Hyphenated word part, all ASCII",
"Space symbols",
"HTML tag",
"Protocol head",
"Hyphenated word, letters and digits",
"Hyphenated word, all ASCII",
"Hyphenated word, all letters",
"URI",
"File or path name",
"Decimal notation",
"Signed integer",
"Unsigned integer",
"HTML entity"
};
/* Parser states */
typedef enum
{
TPS_Base = 0,
TPS_InUWord,
TPS_InLatWord,
TPS_InCyrWord,
TPS_InNumWord,
TPS_InAsciiWord,
TPS_InWord,
TPS_InUnsignedInt,
TPS_InSignedIntFirst,
TPS_InSignedInt,
@ -167,20 +171,20 @@ typedef enum
TPS_InProtocolFirst,
TPS_InProtocolSecond,
TPS_InProtocolEnd,
TPS_InHyphenLatWordFirst,
TPS_InHyphenLatWord,
TPS_InHyphenCyrWordFirst,
TPS_InHyphenCyrWord,
TPS_InHyphenUWordFirst,
TPS_InHyphenUWord,
TPS_InHyphenAsciiWordFirst,
TPS_InHyphenAsciiWord,
TPS_InHyphenWordFirst,
TPS_InHyphenWord,
TPS_InHyphenNumWordFirst,
TPS_InHyphenNumWord,
TPS_InHyphenValueFirst,
TPS_InHyphenValue,
TPS_InHyphenValueExact,
TPS_InParseHyphen,
TPS_InParseHyphenHyphen,
TPS_InHyphenCyrWordPart,
TPS_InHyphenLatWordPart,
TPS_InHyphenUWordPart,
TPS_InHyphenWordPart,
TPS_InHyphenAsciiWordPart,
TPS_InHyphenNumWordPart,
TPS_InHyphenUnsignedInt,
TPS_InHDecimalPartFirst,
TPS_InHDecimalPart,
@ -192,7 +196,6 @@ typedef enum
/* forward declaration */
struct TParser;
typedef int (*TParserCharTest) (struct TParser *); /* any p_is* functions
* except p_iseq */
typedef void (*TParserSpecial) (struct TParser *); /* special handler for
@ -208,6 +211,16 @@ typedef struct
TParserSpecial special;
} TParserStateActionItem;
/* Flag bits in TParserStateActionItem.flags */
#define A_NEXT 0x0000
#define A_BINGO 0x0001
#define A_POP 0x0002
#define A_PUSH 0x0004
#define A_RERUN 0x0008
#define A_CLEAR 0x0010
#define A_MERGE 0x0020
#define A_CLRALL 0x0040
typedef struct
{
TParserState state;
@ -255,6 +268,11 @@ typedef struct TParser
} TParser;
/* forward decls here */
static bool TParserGet(TParser * prs);
static TParserPosition *
newTParserPosition(TParserPosition * prev)
{
@ -303,8 +321,6 @@ TParserInit(char *str, int len)
return prs;
}
static bool TParserGet(TParser * prs);
static void
TParserClose(TParser * prs)
{
@ -325,10 +341,10 @@ TParserClose(TParser * prs)
}
/*
* defining support function, equvalent is* macroses, but
* Character-type support functions, equivalent to is* macros, but
* working with any possible encodings and locales. Note,
* that with multibyte encoding and C-locale isw* function may fail
* or give wrong result. Note 2: multibyte encoding and C-local
* or give wrong result. Note 2: multibyte encoding and C-locale
* often are used for Asian languages
*/
@ -487,17 +503,13 @@ p_isascii(TParser * prs)
}
static int
p_islatin(TParser * prs)
p_isasclet(TParser * prs)
{
return (p_isalpha(prs) && p_isascii(prs)) ? 1 : 0;
return (p_isascii(prs) && p_isalpha(prs)) ? 1 : 0;
}
static int
p_isnonlatin(TParser * prs)
{
return (p_isalpha(prs) && !p_isascii(prs)) ? 1 : 0;
}
/* deliberately suppress unused-function complaints for the above */
void _make_compiler_happy(void);
void
_make_compiler_happy(void)
@ -638,21 +650,12 @@ p_isURI(TParser * prs)
* Table of state/action of parser
*/
#define A_NEXT 0x0000
#define A_BINGO 0x0001
#define A_POP 0x0002
#define A_PUSH 0x0004
#define A_RERUN 0x0008
#define A_CLEAR 0x0010
#define A_MERGE 0x0020
#define A_CLRALL 0x0040
static TParserStateActionItem actionTPS_Base[] = {
{p_isEOF, 0, A_NEXT, TPS_Null, 0, NULL},
{p_iseqC, '<', A_PUSH, TPS_InTagFirst, 0, NULL},
{p_isignore, 0, A_NEXT, TPS_InSpace, 0, NULL},
{p_islatin, 0, A_NEXT, TPS_InLatWord, 0, NULL},
{p_isnonlatin, 0, A_NEXT, TPS_InCyrWord, 0, NULL},
{p_isasclet, 0, A_NEXT, TPS_InAsciiWord, 0, NULL},
{p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InUnsignedInt, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
{p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
@ -664,37 +667,38 @@ static TParserStateActionItem actionTPS_Base[] = {
};
static TParserStateActionItem actionTPS_InUWord[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, UWORD, NULL},
{p_isalnum, 0, A_NEXT, TPS_InUWord, 0, NULL},
static TParserStateActionItem actionTPS_InNumWord[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, NUMWORD, NULL},
{p_isalnum, 0, A_NEXT, TPS_InNumWord, 0, NULL},
{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHyphenUWordFirst, 0, NULL},
{NULL, 0, A_BINGO, TPS_Base, UWORD, NULL}
{p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
{NULL, 0, A_BINGO, TPS_Base, NUMWORD, NULL}
};
static TParserStateActionItem actionTPS_InLatWord[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, LATWORD, NULL},
{p_islatin, 0, A_NEXT, TPS_Null, 0, NULL},
static TParserStateActionItem actionTPS_InAsciiWord[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL},
{p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHyphenLatWordFirst, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
{p_iseqC, ':', A_PUSH, TPS_InProtocolFirst, 0, NULL},
{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
{p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
{p_isalnum, 0, A_NEXT, TPS_InUWord, 0, NULL},
{NULL, 0, A_BINGO, TPS_Base, LATWORD, NULL}
{p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
{p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
{NULL, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL}
};
static TParserStateActionItem actionTPS_InCyrWord[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, CYRWORD, NULL},
{p_isnonlatin, 0, A_NEXT, TPS_Null, 0, NULL},
{p_isalnum, 0, A_NEXT, TPS_InUWord, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHyphenCyrWordFirst, 0, NULL},
{NULL, 0, A_BINGO, TPS_Base, CYRWORD, NULL}
static TParserStateActionItem actionTPS_InWord[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, WORD_T, NULL},
{p_isalpha, 0, A_NEXT, TPS_Null, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
{NULL, 0, A_BINGO, TPS_Base, WORD_T, NULL}
};
static TParserStateActionItem actionTPS_InUnsignedInt[] = {
@ -704,8 +708,8 @@ static TParserStateActionItem actionTPS_InUnsignedInt[] = {
{p_iseqC, '.', A_PUSH, TPS_InUDecimalFirst, 0, NULL},
{p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
{p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
{p_islatin, 0, A_PUSH, TPS_InHost, 0, NULL},
{p_isalpha, 0, A_NEXT, TPS_InUWord, 0, NULL},
{p_isasclet, 0, A_PUSH, TPS_InHost, 0, NULL},
{p_isalpha, 0, A_NEXT, TPS_InNumWord, 0, NULL},
{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
{NULL, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL}
};
@ -816,13 +820,13 @@ static TParserStateActionItem actionTPS_InMantissa[] = {
static TParserStateActionItem actionTPS_InHTMLEntityFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_iseqC, '#', A_NEXT, TPS_InHTMLEntityNumFirst, 0, NULL},
{p_islatin, 0, A_NEXT, TPS_InHTMLEntity, 0, NULL},
{p_isasclet, 0, A_NEXT, TPS_InHTMLEntity, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
static TParserStateActionItem actionTPS_InHTMLEntity[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_islatin, 0, A_NEXT, TPS_InHTMLEntity, 0, NULL},
{p_isasclet, 0, A_NEXT, TPS_InHTMLEntity, 0, NULL},
{p_iseqC, ';', A_NEXT, TPS_InHTMLEntityEnd, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
@ -849,7 +853,7 @@ static TParserStateActionItem actionTPS_InTagFirst[] = {
{p_iseqC, '/', A_PUSH, TPS_InTagCloseFirst, 0, NULL},
{p_iseqC, '!', A_PUSH, TPS_InCommentFirst, 0, NULL},
{p_iseqC, '?', A_PUSH, TPS_InXMLBegin, 0, NULL},
{p_islatin, 0, A_PUSH, TPS_InTagName, 0, NULL},
{p_isasclet, 0, A_PUSH, TPS_InTagName, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
@ -863,7 +867,7 @@ static TParserStateActionItem actionTPS_InXMLBegin[] = {
static TParserStateActionItem actionTPS_InTagCloseFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_islatin, 0, A_NEXT, TPS_InTagName, 0, NULL},
{p_isasclet, 0, A_NEXT, TPS_InTagName, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
@ -873,7 +877,7 @@ static TParserStateActionItem actionTPS_InTagName[] = {
{p_iseqC, '/', A_NEXT, TPS_InTagBeginEnd, 0, NULL},
{p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
{p_isspace, 0, A_NEXT, TPS_InTag, 0, SpecialTags},
{p_islatin, 0, A_NEXT, TPS_Null, 0, NULL},
{p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
@ -888,7 +892,7 @@ static TParserStateActionItem actionTPS_InTag[] = {
{p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
{p_iseqC, '\'', A_NEXT, TPS_InTagEscapeK, 0, NULL},
{p_iseqC, '"', A_NEXT, TPS_InTagEscapeKK, 0, NULL},
{p_islatin, 0, A_NEXT, TPS_Null, 0, NULL},
{p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
{p_iseqC, '=', A_NEXT, TPS_Null, 0, NULL},
{p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
@ -924,7 +928,7 @@ static TParserStateActionItem actionTPS_InTagBackSleshed[] = {
};
static TParserStateActionItem actionTPS_InTagEnd[] = {
{NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG, NULL}
{NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
};
static TParserStateActionItem actionTPS_InCommentFirst[] = {
@ -962,19 +966,19 @@ static TParserStateActionItem actionTPS_InCloseCommentLast[] = {
};
static TParserStateActionItem actionTPS_InCommentEnd[] = {
{NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG, NULL}
{NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
};
static TParserStateActionItem actionTPS_InHostFirstDomain[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_islatin, 0, A_NEXT, TPS_InHostDomainSecond, 0, NULL},
{p_isasclet, 0, A_NEXT, TPS_InHostDomainSecond, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
static TParserStateActionItem actionTPS_InHostDomainSecond[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_islatin, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
{p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
{p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
@ -984,7 +988,7 @@ static TParserStateActionItem actionTPS_InHostDomainSecond[] = {
static TParserStateActionItem actionTPS_InHostDomain[] = {
{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
{p_islatin, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
{p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
{p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
{p_iseqC, ':', A_PUSH, TPS_InPortFirst, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
@ -1013,14 +1017,14 @@ static TParserStateActionItem actionTPS_InPort[] = {
static TParserStateActionItem actionTPS_InHostFirstAN[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
{p_islatin, 0, A_NEXT, TPS_InHost, 0, NULL},
{p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
static TParserStateActionItem actionTPS_InHost[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
{p_islatin, 0, A_NEXT, TPS_InHost, 0, NULL},
{p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
@ -1034,7 +1038,7 @@ static TParserStateActionItem actionTPS_InEmail[] = {
static TParserStateActionItem actionTPS_InFileFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_islatin, 0, A_NEXT, TPS_InFile, 0, NULL},
{p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
{p_iseqC, '.', A_NEXT, TPS_InPathFirst, 0, NULL},
{p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
@ -1045,7 +1049,7 @@ static TParserStateActionItem actionTPS_InFileFirst[] = {
static TParserStateActionItem actionTPS_InFileTwiddle[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_islatin, 0, A_NEXT, TPS_InFile, 0, NULL},
{p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
{p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
{p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
@ -1054,7 +1058,7 @@ static TParserStateActionItem actionTPS_InFileTwiddle[] = {
static TParserStateActionItem actionTPS_InPathFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_islatin, 0, A_NEXT, TPS_InFile, 0, NULL},
{p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
{p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
{p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
@ -1079,7 +1083,7 @@ static TParserStateActionItem actionTPS_InPathSecond[] = {
static TParserStateActionItem actionTPS_InFile[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, FILEPATH, NULL},
{p_islatin, 0, A_NEXT, TPS_InFile, 0, NULL},
{p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
{p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
@ -1091,7 +1095,7 @@ static TParserStateActionItem actionTPS_InFile[] = {
static TParserStateActionItem actionTPS_InFileNext[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_islatin, 0, A_CLEAR, TPS_InFile, 0, NULL},
{p_isasclet, 0, A_CLEAR, TPS_InFile, 0, NULL},
{p_isdigit, 0, A_CLEAR, TPS_InFile, 0, NULL},
{p_iseqC, '_', A_CLEAR, TPS_InFile, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
@ -1119,7 +1123,7 @@ static TParserStateActionItem actionTPS_InURI[] = {
static TParserStateActionItem actionTPS_InFURL[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isURI, 0, A_BINGO | A_CLRALL, TPS_Base, FURL, SpecialFURL},
{p_isURI, 0, A_BINGO | A_CLRALL, TPS_Base, URL_T, SpecialFURL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
@ -1139,54 +1143,52 @@ static TParserStateActionItem actionTPS_InProtocolEnd[] = {
{NULL, 0, A_BINGO | A_CLRALL, TPS_Base, PROTOCOL, NULL}
};
static TParserStateActionItem actionTPS_InHyphenLatWordFirst[] = {
static TParserStateActionItem actionTPS_InHyphenAsciiWordFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_islatin, 0, A_NEXT, TPS_InHyphenLatWord, 0, NULL},
{p_isnonlatin, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
{p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
{p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
static TParserStateActionItem actionTPS_InHyphenLatWord[] = {
{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, LATHYPHENWORD, SpecialHyphen},
{p_islatin, 0, A_NEXT, TPS_InHyphenLatWord, 0, NULL},
{p_isnonlatin, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHyphenLatWordFirst, 0, NULL},
{NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, LATHYPHENWORD, SpecialHyphen}
static TParserStateActionItem actionTPS_InHyphenAsciiWord[] = {
{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen},
{p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
{p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
{NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen}
};
static TParserStateActionItem actionTPS_InHyphenCyrWordFirst[] = {
static TParserStateActionItem actionTPS_InHyphenWordFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isnonlatin, 0, A_NEXT, TPS_InHyphenCyrWord, 0, NULL},
{p_islatin, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
{p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
static TParserStateActionItem actionTPS_InHyphenCyrWord[] = {
{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, CYRHYPHENWORD, SpecialHyphen},
{p_isnonlatin, 0, A_NEXT, TPS_InHyphenCyrWord, 0, NULL},
{p_islatin, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHyphenCyrWordFirst, 0, NULL},
{NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, CYRHYPHENWORD, SpecialHyphen}
static TParserStateActionItem actionTPS_InHyphenWord[] = {
{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen},
{p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
{NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen}
};
static TParserStateActionItem actionTPS_InHyphenUWordFirst[] = {
static TParserStateActionItem actionTPS_InHyphenNumWordFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL},
{p_isalnum, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
{p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
static TParserStateActionItem actionTPS_InHyphenUWord[] = {
{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HYPHENWORD, SpecialHyphen},
{p_isalnum, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHyphenUWordFirst, 0, NULL},
{NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HYPHENWORD, SpecialHyphen}
static TParserStateActionItem actionTPS_InHyphenNumWord[] = {
{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen},
{p_isalnum, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
{NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
};
static TParserStateActionItem actionTPS_InHyphenValueFirst[] = {
@ -1196,26 +1198,26 @@ static TParserStateActionItem actionTPS_InHyphenValueFirst[] = {
};
static TParserStateActionItem actionTPS_InHyphenValue[] = {
{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HYPHENWORD, SpecialHyphen},
{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen},
{p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHyphenValueFirst, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHyphenUWordFirst, 0, NULL},
{p_isalpha, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
{NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HYPHENWORD, SpecialHyphen}
{p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
{p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
{NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
};
static TParserStateActionItem actionTPS_InHyphenValueExact[] = {
{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HYPHENWORD, SpecialHyphen},
{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen},
{p_isdigit, 0, A_NEXT, TPS_InHyphenValueExact, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHyphenValueFirst, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHyphenUWordFirst, 0, NULL},
{NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HYPHENWORD, SpecialHyphen}
{p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
{NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
};
static TParserStateActionItem actionTPS_InParseHyphen[] = {
{p_isEOF, 0, A_RERUN, TPS_Base, 0, NULL},
{p_islatin, 0, A_NEXT, TPS_InHyphenLatWordPart, 0, NULL},
{p_isnonlatin, 0, A_NEXT, TPS_InHyphenCyrWordPart, 0, NULL},
{p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
{p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenUnsignedInt, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InParseHyphenHyphen, 0, NULL},
{NULL, 0, A_RERUN, TPS_Base, 0, NULL}
@ -1227,32 +1229,31 @@ static TParserStateActionItem actionTPS_InParseHyphenHyphen[] = {
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
static TParserStateActionItem actionTPS_InHyphenCyrWordPart[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, CYRPARTHYPHENWORD, NULL},
{p_isnonlatin, 0, A_NEXT, TPS_InHyphenCyrWordPart, 0, NULL},
{p_islatin, 0, A_NEXT, TPS_InHyphenUWordPart, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenUWordPart, 0, NULL},
{NULL, 0, A_BINGO, TPS_InParseHyphen, CYRPARTHYPHENWORD, NULL}
static TParserStateActionItem actionTPS_InHyphenWordPart[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, PARTHWORD, NULL},
{p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
{NULL, 0, A_BINGO, TPS_InParseHyphen, PARTHWORD, NULL}
};
static TParserStateActionItem actionTPS_InHyphenLatWordPart[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, LATPARTHYPHENWORD, NULL},
{p_islatin, 0, A_NEXT, TPS_InHyphenLatWordPart, 0, NULL},
{p_isnonlatin, 0, A_NEXT, TPS_InHyphenUWordPart, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenUWordPart, 0, NULL},
{NULL, 0, A_BINGO, TPS_InParseHyphen, LATPARTHYPHENWORD, NULL}
static TParserStateActionItem actionTPS_InHyphenAsciiWordPart[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, ASCIIPARTHWORD, NULL},
{p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
{p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
{NULL, 0, A_BINGO, TPS_InParseHyphen, ASCIIPARTHWORD, NULL}
};
static TParserStateActionItem actionTPS_InHyphenUWordPart[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, PARTHYPHENWORD, NULL},
{p_isalnum, 0, A_NEXT, TPS_InHyphenUWordPart, 0, NULL},
{NULL, 0, A_BINGO, TPS_InParseHyphen, PARTHYPHENWORD, NULL}
static TParserStateActionItem actionTPS_InHyphenNumWordPart[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, NUMPARTHWORD, NULL},
{p_isalnum, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
{NULL, 0, A_BINGO, TPS_InParseHyphen, NUMPARTHWORD, NULL}
};
static TParserStateActionItem actionTPS_InHyphenUnsignedInt[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenUnsignedInt, 0, NULL},
{p_isalpha, 0, A_NEXT, TPS_InHyphenUWordPart, 0, NULL},
{p_isalpha, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHDecimalPartFirst, 0, NULL},
{NULL, 0, A_BINGO, TPS_InParseHyphen, UNSIGNEDINT, NULL}
};
@ -1284,14 +1285,14 @@ static TParserStateActionItem actionTPS_InHVersionPart[] = {
};
/*
* order should be the same as in typedef enum {} TParserState!!
* order must be the same as in typedef enum {} TParserState!!
*/
static const TParserStateAction Actions[] = {
{TPS_Base, actionTPS_Base},
{TPS_InUWord, actionTPS_InUWord},
{TPS_InLatWord, actionTPS_InLatWord},
{TPS_InCyrWord, actionTPS_InCyrWord},
{TPS_InNumWord, actionTPS_InNumWord},
{TPS_InAsciiWord, actionTPS_InAsciiWord},
{TPS_InWord, actionTPS_InWord},
{TPS_InUnsignedInt, actionTPS_InUnsignedInt},
{TPS_InSignedIntFirst, actionTPS_InSignedIntFirst},
{TPS_InSignedInt, actionTPS_InSignedInt},
@ -1350,20 +1351,20 @@ static const TParserStateAction Actions[] = {
{TPS_InProtocolFirst, actionTPS_InProtocolFirst},
{TPS_InProtocolSecond, actionTPS_InProtocolSecond},
{TPS_InProtocolEnd, actionTPS_InProtocolEnd},
{TPS_InHyphenLatWordFirst, actionTPS_InHyphenLatWordFirst},
{TPS_InHyphenLatWord, actionTPS_InHyphenLatWord},
{TPS_InHyphenCyrWordFirst, actionTPS_InHyphenCyrWordFirst},
{TPS_InHyphenCyrWord, actionTPS_InHyphenCyrWord},
{TPS_InHyphenUWordFirst, actionTPS_InHyphenUWordFirst},
{TPS_InHyphenUWord, actionTPS_InHyphenUWord},
{TPS_InHyphenAsciiWordFirst, actionTPS_InHyphenAsciiWordFirst},
{TPS_InHyphenAsciiWord, actionTPS_InHyphenAsciiWord},
{TPS_InHyphenWordFirst, actionTPS_InHyphenWordFirst},
{TPS_InHyphenWord, actionTPS_InHyphenWord},
{TPS_InHyphenNumWordFirst, actionTPS_InHyphenNumWordFirst},
{TPS_InHyphenNumWord, actionTPS_InHyphenNumWord},
{TPS_InHyphenValueFirst, actionTPS_InHyphenValueFirst},
{TPS_InHyphenValue, actionTPS_InHyphenValue},
{TPS_InHyphenValueExact, actionTPS_InHyphenValueExact},
{TPS_InParseHyphen, actionTPS_InParseHyphen},
{TPS_InParseHyphenHyphen, actionTPS_InParseHyphenHyphen},
{TPS_InHyphenCyrWordPart, actionTPS_InHyphenCyrWordPart},
{TPS_InHyphenLatWordPart, actionTPS_InHyphenLatWordPart},
{TPS_InHyphenUWordPart, actionTPS_InHyphenUWordPart},
{TPS_InHyphenWordPart, actionTPS_InHyphenWordPart},
{TPS_InHyphenAsciiWordPart, actionTPS_InHyphenAsciiWordPart},
{TPS_InHyphenNumWordPart, actionTPS_InHyphenNumWordPart},
{TPS_InHyphenUnsignedInt, actionTPS_InHyphenUnsignedInt},
{TPS_InHDecimalPartFirst, actionTPS_InHDecimalPartFirst},
{TPS_InHDecimalPart, actionTPS_InHDecimalPart},
@ -1378,10 +1379,11 @@ TParserGet(TParser * prs)
{
TParserStateActionItem *item = NULL;
Assert(prs->state);
if (prs->state->posbyte >= prs->lenstr)
return false;
Assert(prs->state);
prs->lexeme = prs->str + prs->state->posbyte;
prs->state->pushedAtAction = NULL;
@ -1488,10 +1490,12 @@ TParserGet(TParser * prs)
prs->state->state = item->tostate;
/* check for go away */
if ((item->flags & A_BINGO) || (prs->state->posbyte >= prs->lenstr && (item->flags & A_RERUN) == 0))
if ((item->flags & A_BINGO) ||
(prs->state->posbyte >= prs->lenstr &&
(item->flags & A_RERUN) == 0))
break;
/* go to begining of loop if we should rerun or we just restore state */
/* go to beginning of loop if we should rerun or we just restore state */
if (item->flags & (A_RERUN | A_POP))
continue;
@ -1557,16 +1561,15 @@ prsd_end(PG_FUNCTION_ARGS)
PG_RETURN_VOID();
}
/*
 * Token-type classification macros.
 *
 * Every macro in this run appears twice: first in its pre-change form, which
 * compares against raw token-type numbers, and then in its replacement form,
 * which compares against symbolic token names.  Judging by the paired
 * old/new forms, the numbers correspond to the names as follows (and match
 * the tokid column of ts_token_type('default')):
 *	 5 = URL_T        7 = SCIENTIFIC    8 = VERSIONNUMBER
 *	12 = SPACE       13 = TAG_T        14 = PROTOCOL
 *	15 = NUMHWORD    16 = ASCIIHWORD   17 = HWORD
 *	20 = DECIMAL     21 = SIGNEDINT    22 = UNSIGNEDINT
 *	23 = HTMLENTITY
 */

/* Pre-change forms (raw token numbers). */
#define LEAVETOKEN(x) ( (x)==12 )
#define COMPLEXTOKEN(x) ( (x)==5 || (x)==15 || (x)==16 || (x)==17 )
#define ENDPUNCTOKEN(x) ( (x)==12 )
/*
 * Replacement forms (symbolic names):
 *	LEAVETOKEN / ENDPUNCTOKEN - true only for the whitespace token type.
 *	COMPLEXTOKEN - true for a URL or any of the three hyphenated-word types.
 */
#define LEAVETOKEN(x) ( (x)==SPACE )
#define COMPLEXTOKEN(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
#define ENDPUNCTOKEN(x) ( (x)==SPACE )
/* Pre-change forms (raw token numbers). */
#define TS_IDIGNORE(x) ( (x)==13 || (x)==14 || (x)==12 || (x)==23 )
#define HLIDIGNORE(x) ( (x)==5 || (x)==13 || (x)==15 || (x)==16 || (x)==17 )
#define HTMLHLIDIGNORE(x) ( (x)==5 || (x)==15 || (x)==16 || (x)==17 )
#define NONWORDTOKEN(x) ( (x)==12 || HLIDIGNORE(x) )
#define NOENDTOKEN(x) ( NONWORDTOKEN(x) || (x)==7 || (x)==8 || (x)==20 || (x)==21 || (x)==22 || TS_IDIGNORE(x) )
/*
 * Replacement forms (symbolic names):
 *	TS_IDIGNORE - tag, protocol, whitespace, or HTML entity.
 *	HLIDIGNORE - URL, tag, or any hyphenated-word type.
 *	HTMLHLIDIGNORE - same set as HLIDIGNORE minus the tag type (i.e. the
 *		same set COMPLEXTOKEN tests).
 *	NONWORDTOKEN - whitespace, or anything HLIDIGNORE accepts.
 *	NOENDTOKEN - anything NONWORDTOKEN or TS_IDIGNORE accepts, plus the
 *		numeric types (scientific, version number, decimal, signed and
 *		unsigned integer).
 */
#define TS_IDIGNORE(x) ( (x)==TAG_T || (x)==PROTOCOL || (x)==SPACE || (x)==HTMLENTITY )
#define HLIDIGNORE(x) ( (x)==URL_T || (x)==TAG_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
#define HTMLHLIDIGNORE(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
#define NONWORDTOKEN(x) ( (x)==SPACE || HLIDIGNORE(x) )
#define NOENDTOKEN(x) ( NONWORDTOKEN(x) || (x)==SCIENTIFIC || (x)==VERSIONNUMBER || (x)==DECIMAL || (x)==SIGNEDINT || (x)==UNSIGNEDINT || TS_IDIGNORE(x) )
typedef struct
{

View File

@ -37,7 +37,7 @@
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.435 2007/10/22 20:13:37 tgl Exp $
* $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.436 2007/10/23 20:46:12 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -53,6 +53,6 @@
*/
/* yyyymmddN */
#define CATALOG_VERSION_NO 200710221
#define CATALOG_VERSION_NO 200710231
#endif

View File

@ -209,8 +209,8 @@ SELECT ts_lexize('synonym', 'Gogle');
(1 row)
-- Create and simple test thesaurus dictionary
-- More test in configuration checks because of ts_lexize
-- can not give more tat one word as it may wish thesaurus.
-- More tests in configuration checks because ts_lexize()
-- cannot pass more than one word to thesaurus.
CREATE TEXT SEARCH DICTIONARY thesaurus (
Template=thesaurus,
DictFile=thesaurus_sample,
@ -227,7 +227,7 @@ CREATE TEXT SEARCH CONFIGURATION ispell_tst (
COPY=english
);
ALTER TEXT SEARCH CONFIGURATION ispell_tst ALTER MAPPING FOR
hword, lhword, lpart_hword, lword, nlhword, nlpart_hword, nlword, part_hword, word
word, numword, asciiword, hword, numhword, asciihword, hword_part, hword_numpart, hword_asciipart
WITH ispell, english_stem;
SELECT to_tsvector('ispell_tst', 'Booking the skies after rebookings for footballklubber from a foot');
to_tsvector
@ -276,7 +276,7 @@ CREATE TEXT SEARCH CONFIGURATION synonym_tst (
COPY=english
);
ALTER TEXT SEARCH CONFIGURATION synonym_tst ALTER MAPPING FOR
lword, lpart_hword, lhword
asciiword, hword_asciipart, asciihword
WITH synonym, english_stem;
SELECT to_tsvector('synonym_tst', 'Postgresql is often called as postgres or pgsql and pronounced as postgre');
to_tsvector
@ -296,7 +296,7 @@ CREATE TEXT SEARCH CONFIGURATION thesaurus_tst (
COPY=synonym_tst
);
ALTER TEXT SEARCH CONFIGURATION thesaurus_tst ALTER MAPPING FOR
lword, lpart_hword, lhword
asciiword, hword_asciipart, asciihword
WITH synonym, thesaurus, english_stem;
SELECT to_tsvector('thesaurus_tst', 'one postgres one two one two three one');
to_tsvector

View File

@ -208,31 +208,31 @@ SELECT ts_lexize('english_stem', 'identity');
(1 row)
SELECT * FROM ts_token_type('default');
tokid | alias | description
-------+--------------+-----------------------------------
1 | lword | Latin word
2 | nlword | Non-latin word
3 | word | Word
4 | email | Email
5 | url | URL
6 | host | Host
7 | sfloat | Scientific notation
8 | version | VERSION
9 | part_hword | Part of hyphenated word
10 | nlpart_hword | Non-latin part of hyphenated word
11 | lpart_hword | Latin part of hyphenated word
12 | blank | Space symbols
13 | tag | HTML Tag
14 | protocol | Protocol head
15 | hword | Hyphenated word
16 | lhword | Latin hyphenated word
17 | nlhword | Non-latin hyphenated word
18 | uri | URI
19 | file | File or path name
20 | float | Decimal notation
21 | int | Signed integer
22 | uint | Unsigned integer
23 | entity | HTML Entity
tokid | alias | description
-------+-----------------+------------------------------------------
1 | asciiword | Word, all ASCII
2 | word | Word, all letters
3 | numword | Word, letters and digits
4 | email | Email address
5 | url | URL
6 | host | Host
7 | sfloat | Scientific notation
8 | version | Version number
9 | hword_numpart | Hyphenated word part, letters and digits
10 | hword_part | Hyphenated word part, all letters
11 | hword_asciipart | Hyphenated word part, all ASCII
12 | blank | Space symbols
13 | tag | HTML tag
14 | protocol | Protocol head
15 | numhword | Hyphenated word, letters and digits
16 | asciihword | Hyphenated word, all ASCII
17 | hword | Hyphenated word, all letters
18 | uri | URI
19 | file | File or path name
20 | float | Decimal notation
21 | int | Signed integer
22 | uint | Unsigned integer
23 | entity | HTML entity
(23 rows)
SELECT * FROM ts_parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">

View File

@ -58,8 +58,8 @@ SELECT ts_lexize('synonym', 'PoStGrEs');
SELECT ts_lexize('synonym', 'Gogle');
-- Create and simple test thesaurus dictionary
-- More test in configuration checks because of ts_lexize
-- can not give more tat one word as it may wish thesaurus.
-- More tests in configuration checks because ts_lexize()
-- cannot pass more than one word to thesaurus.
CREATE TEXT SEARCH DICTIONARY thesaurus (
Template=thesaurus,
DictFile=thesaurus_sample,
@ -74,7 +74,7 @@ CREATE TEXT SEARCH CONFIGURATION ispell_tst (
);
ALTER TEXT SEARCH CONFIGURATION ispell_tst ALTER MAPPING FOR
hword, lhword, lpart_hword, lword, nlhword, nlpart_hword, nlword, part_hword, word
word, numword, asciiword, hword, numhword, asciihword, hword_part, hword_numpart, hword_asciipart
WITH ispell, english_stem;
SELECT to_tsvector('ispell_tst', 'Booking the skies after rebookings for footballklubber from a foot');
@ -99,7 +99,7 @@ CREATE TEXT SEARCH CONFIGURATION synonym_tst (
);
ALTER TEXT SEARCH CONFIGURATION synonym_tst ALTER MAPPING FOR
lword, lpart_hword, lhword
asciiword, hword_asciipart, asciihword
WITH synonym, english_stem;
SELECT to_tsvector('synonym_tst', 'Postgresql is often called as postgres or pgsql and pronounced as postgre');
@ -112,10 +112,9 @@ CREATE TEXT SEARCH CONFIGURATION thesaurus_tst (
);
ALTER TEXT SEARCH CONFIGURATION thesaurus_tst ALTER MAPPING FOR
lword, lpart_hword, lhword
asciiword, hword_asciipart, asciihword
WITH synonym, thesaurus, english_stem;
SELECT to_tsvector('thesaurus_tst', 'one postgres one two one two three one');
SELECT to_tsvector('thesaurus_tst', 'Supernovae star is very new star and usually called supernovae (abbrevation SN)');
SELECT to_tsvector('thesaurus_tst', 'Booking tickets is looking like a booking a tickets');

View File

@ -3,7 +3,7 @@ package Install;
#
# Package that provides 'make install' functionality for msvc builds
#
# $PostgreSQL: pgsql/src/tools/msvc/Install.pm,v 1.24 2007/10/16 16:00:00 tgl Exp $
# $PostgreSQL: pgsql/src/tools/msvc/Install.pm,v 1.25 2007/10/23 20:46:12 tgl Exp $
#
use strict;
use warnings;
@ -258,7 +258,7 @@ sub GenerateTsearchFiles
while ($#pieces > 0)
{
my $lang = shift @pieces || last;
my $latlang = shift @pieces || last;
my $asclang = shift @pieces || last;
my $txt = $tmpl;
my $stop = '';
@ -269,8 +269,8 @@ sub GenerateTsearchFiles
$txt =~ s#_LANGNAME_#${lang}#gs;
$txt =~ s#_DICTNAME_#${lang}_stem#gs;
$txt =~ s#_CFGNAME_#${lang}#gs;
$txt =~ s#_LATDICTNAME_#${latlang}_stem#gs;
$txt =~ s#_NONLATDICTNAME_#${lang}_stem#gs;
$txt =~ s#_ASCDICTNAME_#${asclang}_stem#gs;
$txt =~ s#_NONASCDICTNAME_#${lang}_stem#gs;
$txt =~ s#_STOPWORDS_#$stop#gs;
print $F $txt;
print ".";