mirror of
https://github.com/postgres/postgres.git
synced 2025-08-08 06:02:22 +03:00
Text parser rewritten:
- supports multibyte encodings - stricter rules for lexemes - flex isn't used Add: - tsquery plainto_tsquery(text) Function makes a tsquery from plain text. - &&, ||, !! operations for tsquery, for combining a tsquery from its parts: 'foo & bar' || 'asd' => 'foo & bar | asd'
This commit is contained in:
@@ -1,8 +1,8 @@
|
||||
# $PostgreSQL: pgsql/contrib/tsearch2/wordparser/Makefile,v 1.8 2005/10/18 01:30:49 tgl Exp $
|
||||
# $PostgreSQL: pgsql/contrib/tsearch2/wordparser/Makefile,v 1.9 2005/11/21 12:27:57 teodor Exp $
|
||||
|
||||
SUBOBJS = parser.o deflex.o
|
||||
|
||||
EXTRA_CLEAN = SUBSYS.o $(SUBOBJS) parser.c
|
||||
EXTRA_CLEAN = SUBSYS.o $(SUBOBJS)
|
||||
|
||||
PG_CPPFLAGS = -I$(srcdir)/..
|
||||
|
||||
@@ -20,13 +20,6 @@ override CFLAGS += $(CFLAGS_SL)
|
||||
|
||||
all: SUBSYS.o
|
||||
|
||||
parser.c: parser.l
|
||||
ifdef FLEX
|
||||
$(FLEX) $(FLEXFLAGS) -8 -Ptsearch2_yy -o'$@' $<
|
||||
else
|
||||
@$(missing) flex $< $@
|
||||
endif
|
||||
|
||||
SUBSYS.o: $(SUBOBJS)
|
||||
$(LD) $(LDREL) $(LDOUT) $@ $^
|
||||
|
||||
|
@@ -15,7 +15,7 @@ const char *lex_descr[] = {
|
||||
"Latin part of hyphenated word",
|
||||
"Space symbols",
|
||||
"HTML Tag",
|
||||
"HTTP head",
|
||||
"Protocol head",
|
||||
"Hyphenated word",
|
||||
"Latin hyphenated word",
|
||||
"Non-latin hyphenated word",
|
||||
@@ -42,7 +42,7 @@ const char *tok_alias[] = {
|
||||
"lpart_hword",
|
||||
"blank",
|
||||
"tag",
|
||||
"http",
|
||||
"protocol",
|
||||
"hword",
|
||||
"lhword",
|
||||
"nlhword",
|
||||
|
@@ -17,7 +17,7 @@
|
||||
#define LATPARTHYPHENWORD 11
|
||||
#define SPACE 12
|
||||
#define TAG 13
|
||||
#define HTTP 14
|
||||
#define PROTOCOL 14
|
||||
#define HYPHENWORD 15
|
||||
#define LATHYPHENWORD 16
|
||||
#define CYRHYPHENWORD 17
|
||||
|
1028
contrib/tsearch2/wordparser/parser.c
Normal file
1028
contrib/tsearch2/wordparser/parser.c
Normal file
File diff suppressed because it is too large
Load Diff
@@ -1,10 +1,147 @@
|
||||
#ifndef __PARSER_H__
|
||||
#define __PARSER_H__
|
||||
|
||||
extern char *token;
|
||||
extern int tokenlen;
|
||||
int tsearch2_yylex(void);
|
||||
void tsearch2_start_parse_str(char *, int);
|
||||
void tsearch2_end_parse(void);
|
||||
#include <ctype.h>
|
||||
#include <limits.h>
|
||||
#include "ts_locale.h"
|
||||
|
||||
/* States of the text parser. TPS_Base is the first state (value 0); TPS_Null is a fake value that marks the end of the list. Enumerator order defines the numeric values, so do not reorder. */
typedef enum {
|
||||
TPS_Base = 0,
|
||||
TPS_InUWord,
|
||||
TPS_InLatWord,
|
||||
TPS_InCyrWord,
|
||||
TPS_InUnsignedInt,
|
||||
TPS_InSignedIntFirst,
|
||||
TPS_InSignedInt,
|
||||
TPS_InSpace,
|
||||
TPS_InUDecimalFirst,
|
||||
TPS_InUDecimal,
|
||||
TPS_InDecimalFirst,
|
||||
TPS_InDecimal,
|
||||
TPS_InVersionFirst,
|
||||
TPS_InVersion,
|
||||
TPS_InMantissaFirst,
|
||||
TPS_InMantissaSign,
|
||||
TPS_InMantissa,
|
||||
TPS_InHTMLEntityFirst,
|
||||
TPS_InHTMLEntity,
|
||||
TPS_InHTMLEntityNumFirst,
|
||||
TPS_InHTMLEntityNum,
|
||||
TPS_InHTMLEntityEnd,
|
||||
TPS_InTagFirst,
|
||||
TPS_InTagCloseFirst,
|
||||
TPS_InTag,
|
||||
TPS_InTagEscapeK,
|
||||
TPS_InTagEscapeKK,
|
||||
TPS_InTagBackSleshed, /* sic ("BackSlashed"); spelling is part of the identifier */
|
||||
TPS_InTagEnd,
|
||||
TPS_InCommentFirst,
|
||||
TPS_InCommentLast,
|
||||
TPS_InComment,
|
||||
TPS_InCloseCommentFirst,
|
||||
TPS_InCloseCommentLast,
|
||||
TPS_InCommentEnd,
|
||||
TPS_InHostFirstDomen,
|
||||
TPS_InHostDomenSecond,
|
||||
TPS_InHostDomen,
|
||||
TPS_InPortFirst,
|
||||
TPS_InPort,
|
||||
TPS_InHostFirstAN,
|
||||
TPS_InHost,
|
||||
TPS_InEmail,
|
||||
TPS_InFileFirst,
|
||||
TPS_InFile,
|
||||
TPS_InFileNext,
|
||||
TPS_InURIFirst,
|
||||
TPS_InURIStart,
|
||||
TPS_InURI,
|
||||
TPS_InFURL,
|
||||
TPS_InProtocolFirst,
|
||||
TPS_InProtocolSecond,
|
||||
TPS_InProtocolEnd,
|
||||
TPS_InHyphenLatWordFirst,
|
||||
TPS_InHyphenLatWord,
|
||||
TPS_InHyphenCyrWordFirst,
|
||||
TPS_InHyphenCyrWord,
|
||||
TPS_InHyphenUWordFirst,
|
||||
TPS_InHyphenUWord,
|
||||
TPS_InHyphenValueFirst,
|
||||
TPS_InHyphenValue,
|
||||
TPS_InHyphenValueExact,
|
||||
TPS_InParseHyphen,
|
||||
TPS_InParseHyphenHyphen,
|
||||
TPS_InHyphenCyrWordPart,
|
||||
TPS_InHyphenLatWordPart,
|
||||
TPS_InHyphenUWordPart,
|
||||
TPS_InHyphenUnsignedInt,
|
||||
TPS_InHDecimalPartFirst,
|
||||
TPS_InHDecimalPart,
|
||||
TPS_InHVersionPartFirst,
|
||||
TPS_InHVersionPart,
|
||||
TPS_Null /* last state (fake value) */
|
||||
} TParserState;
|
||||
|
||||
/* forward declaration */
|
||||
struct TParser;
|
||||
|
||||
|
||||
typedef int (*TParserCharTest)(struct TParser*); /* any p_is* functions except p_iseq */
|
||||
typedef void (*TParserSpecial)(struct TParser*); /* special handler for special cases... */
|
||||
|
||||
/* One action entry, referenced through TParserStateAction.action. NOTE(review): the hedged field notes below are inferred from names and types only -- confirm against parser.c. */
typedef struct {
|
||||
TParserCharTest isclass; /* character test callback: int (*)(struct TParser*) */
|
||||
char c; /* presumably the character compared when the test is p_iseq -- TODO confirm */
|
||||
uint16 flags; /* action flags; the flag values are not defined in the part of the header shown here */
|
||||
TParserState tostate; /* presumably the state to move to when the test matches -- TODO confirm */
|
||||
int type; /* presumably the lexeme type to report -- TODO confirm */
|
||||
TParserSpecial special; /* special-case handler: void (*)(struct TParser*) */
|
||||
} TParserStateActionItem;
|
||||
|
||||
/* Pairs a parser state with a pointer to its TParserStateActionItem entries. */
typedef struct {
|
||||
TParserState state; /* the state this entry describes */
|
||||
TParserStateActionItem *action; /* presumably an array of items for `state`; how it is terminated is not visible here -- TODO confirm */
|
||||
} TParserStateAction;
|
||||
|
||||
/* A parser position; entries link to another TParserPosition through `prev`. */
typedef struct TParserPosition {
|
||||
int posbyte; /* position of parser in bytes */
|
||||
int poschar; /* position of parser in characters */
|
||||
int charlen; /* length of current char */
|
||||
int lenbytelexeme; /* presumably lexeme length so far, in bytes -- TODO confirm */
|
||||
int lencharlexeme; /* presumably lexeme length so far, in characters -- TODO confirm */
|
||||
TParserState state; /* parser state at this position */
|
||||
struct TParserPosition *prev; /* link to another position; presumably the one saved before this -- TODO confirm */
|
||||
int flags;
|
||||
TParserStateActionItem *pushedAtAction; /* NOTE(review): looks like the action item active when this position was pushed -- confirm */
|
||||
} TParserPosition;
|
||||
|
||||
/* Parser context: the input string (multibyte and wide forms), the parse state, and the output lexeme fields. */
typedef struct TParser {
|
||||
/* string and position information */
|
||||
char *str; /* multibyte string */
|
||||
int lenstr; /* length of mbstring */
|
||||
wchar_t *wstr; /* wide character string */
|
||||
int lenwstr; /* length of wstring */
|
||||
|
||||
/* State of parse */
|
||||
int charmaxlen;
|
||||
bool usewide;
|
||||
TParserPosition *state; /* current position record (see TParserPosition) */
|
||||
bool ignore;
|
||||
bool wanthost;
|
||||
|
||||
/* silly char */
|
||||
char c;
|
||||
|
||||
/* out */
|
||||
char *lexeme;
|
||||
int lenbytelexeme;
|
||||
int lencharlexeme;
|
||||
int type;
|
||||
|
||||
} TParser;
|
||||
|
||||
|
||||
/* Parser entry points. NOTE(review): presumably Init takes the input string and its length, Get fetches the next lexeme into the TParser "out" fields, and Close releases the parser -- confirm against parser.c. */
TParser* TParserInit( char *, int );
|
||||
bool TParserGet( TParser* );
|
||||
void TParserClose( TParser* );
|
||||
|
||||
#endif
|
||||
|
@@ -1,346 +0,0 @@
|
||||
%{
|
||||
#include "postgres.h"
|
||||
|
||||
#include "deflex.h"
|
||||
#include "parser.h"
|
||||
#include "common.h"
|
||||
|
||||
/* Avoid exit() on fatal scanner errors */
|
||||
#undef fprintf
|
||||
#define fprintf(file, fmt, msg) ts_error(ERROR, fmt, msg)
|
||||
|
||||
char *token = NULL; /* pointer to the current token; set by the lexer rules before they return */
|
||||
int tokenlen; /* length of the current token (taken from tsearch2_yyleng or ts.clen) */
|
||||
static char *s = NULL; /* strdup'ed copy used to return the WHOLE hyphenated word or URL */
|
||||
|
||||
YY_BUFFER_STATE buf = NULL; /* buffer to parse; needed for parsing from a string */
|
||||
|
||||
/* Growable, NUL-terminated byte buffer in which startTag()/addTag() accumulate the text of a tag. */
typedef struct {
|
||||
int tlen; /* total allocated size of str, in bytes */
|
||||
int clen; /* current length of the accumulated text (excluding the terminator) */
|
||||
char *str; /* malloc'ed buffer; NULL until startTag() first allocates it */
|
||||
} TagStorage;
|
||||
|
||||
static TagStorage ts={0,0,NULL}; /* single file-static accumulator shared by all tag rules */
|
||||
|
||||
static void
|
||||
addTag(void)
|
||||
{
|
||||
while( ts.clen+tsearch2_yyleng+1 > ts.tlen ) {
|
||||
ts.tlen*=2;
|
||||
ts.str=realloc(ts.str,ts.tlen);
|
||||
if (!ts.str)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_OUT_OF_MEMORY),
|
||||
errmsg("out of memory")));
|
||||
}
|
||||
memcpy(ts.str+ts.clen,tsearch2_yytext,tsearch2_yyleng);
|
||||
ts.clen+=tsearch2_yyleng;
|
||||
ts.str[ts.clen]='\0';
|
||||
}
|
||||
|
||||
static void
|
||||
startTag(void)
|
||||
{
|
||||
if ( ts.str==NULL ) {
|
||||
ts.tlen=tsearch2_yyleng+1;
|
||||
ts.str=malloc(ts.tlen);
|
||||
if (!ts.str)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_OUT_OF_MEMORY),
|
||||
errmsg("out of memory")));
|
||||
}
|
||||
ts.clen=0;
|
||||
ts.str[0]='\0';
|
||||
addTag();
|
||||
}
|
||||
|
||||
%}
|
||||
|
||||
%option 8bit
|
||||
%option never-interactive
|
||||
%option nodefault
|
||||
%option nounput
|
||||
%option noyywrap
|
||||
|
||||
/* parser's state for parsing hyphenated-word */
|
||||
%x DELIM
|
||||
/* parser's state for parsing URL*/
|
||||
%x URL
|
||||
%x SERVER
|
||||
|
||||
/* parser's state for parsing TAGS */
|
||||
%x INTAG
|
||||
%x QINTAG
|
||||
%x INCOMMENT
|
||||
%x INSCRIPT
|
||||
|
||||
/* cyrillic koi8 char */
|
||||
CYRALNUM [0-9\200-\377]
|
||||
CYRALPHA [\200-\377]
|
||||
ALPHA [a-zA-Z\200-\377]
|
||||
ALNUM [0-9a-zA-Z\200-\377]
|
||||
|
||||
|
||||
HOSTNAME ([-_[:alnum:]]+\.)+[[:alpha:]]+
|
||||
URI [-_[:alnum:]/%,\.;=&?#]+
|
||||
|
||||
%%
|
||||
|
||||
"<"[Ss][Cc][Rr][Ii][Pp][Tt] { BEGIN INSCRIPT; startTag(); }
|
||||
|
||||
<INSCRIPT>"</"[Ss][Cc][Rr][Ii][Pp][Tt]">" {
|
||||
BEGIN INITIAL;
|
||||
addTag();
|
||||
token = ts.str;
|
||||
tokenlen = ts.clen;
|
||||
return TAG;
|
||||
}
|
||||
|
||||
"<!--" { BEGIN INCOMMENT; startTag(); }
|
||||
|
||||
<INCOMMENT>"-->" {
|
||||
BEGIN INITIAL;
|
||||
addTag();
|
||||
token = ts.str;
|
||||
tokenlen = ts.clen;
|
||||
return TAG;
|
||||
}
|
||||
|
||||
|
||||
"<"[\![:alpha:]] { BEGIN INTAG; startTag(); }
|
||||
|
||||
"</"[[:alpha:]] { BEGIN INTAG; startTag(); }
|
||||
|
||||
<INTAG>"\"" { BEGIN QINTAG; addTag(); }
|
||||
|
||||
<QINTAG>"\\\"" { addTag(); }
|
||||
|
||||
<QINTAG>"\"" { BEGIN INTAG; addTag(); }
|
||||
|
||||
<INTAG>">" {
|
||||
BEGIN INITIAL;
|
||||
addTag();
|
||||
token = ts.str;
|
||||
tokenlen = ts.clen;
|
||||
return TAG;
|
||||
}
|
||||
|
||||
<QINTAG,INTAG,INCOMMENT,INSCRIPT>.|\n { addTag(); }
|
||||
|
||||
\&(quot|amp|nbsp|lt|gt)\; {
|
||||
token = tsearch2_yytext;
|
||||
tokenlen = tsearch2_yyleng;
|
||||
return HTMLENTITY;
|
||||
}
|
||||
|
||||
\&\#[0-9][0-9]?[0-9]?\; {
|
||||
token = tsearch2_yytext;
|
||||
tokenlen = tsearch2_yyleng;
|
||||
return HTMLENTITY;
|
||||
}
|
||||
|
||||
[-_\.[:alnum:]]+@{HOSTNAME} /* Emails */ {
|
||||
token = tsearch2_yytext;
|
||||
tokenlen = tsearch2_yyleng;
|
||||
return EMAIL;
|
||||
}
|
||||
|
||||
[+-]?[0-9]+(\.[0-9]+)?[eEdD][+-]?[0-9]+ /* float */ {
|
||||
token = tsearch2_yytext;
|
||||
tokenlen = tsearch2_yyleng;
|
||||
return SCIENTIFIC;
|
||||
}
|
||||
|
||||
[0-9]+\.[0-9]+\.[0-9\.]*[0-9] {
|
||||
token = tsearch2_yytext;
|
||||
tokenlen = tsearch2_yyleng;
|
||||
return VERSIONNUMBER;
|
||||
}
|
||||
|
||||
[+-]?[0-9]+\.[0-9]+ {
|
||||
token = tsearch2_yytext;
|
||||
tokenlen = tsearch2_yyleng;
|
||||
return DECIMAL;
|
||||
}
|
||||
|
||||
[+-][0-9]+ {
|
||||
token = tsearch2_yytext;
|
||||
tokenlen = tsearch2_yyleng;
|
||||
return SIGNEDINT;
|
||||
}
|
||||
|
||||
<DELIM,INITIAL>[0-9]+ {
|
||||
token = tsearch2_yytext;
|
||||
tokenlen = tsearch2_yyleng;
|
||||
return UNSIGNEDINT;
|
||||
}
|
||||
|
||||
http"://" {
|
||||
BEGIN URL;
|
||||
token = tsearch2_yytext;
|
||||
tokenlen = tsearch2_yyleng;
|
||||
return HTTP;
|
||||
}
|
||||
|
||||
ftp"://" {
|
||||
BEGIN URL;
|
||||
token = tsearch2_yytext;
|
||||
tokenlen = tsearch2_yyleng;
|
||||
return HTTP;
|
||||
}
|
||||
|
||||
<URL,INITIAL>{HOSTNAME}[/:]{URI} {
|
||||
BEGIN SERVER;
|
||||
if (s) { free(s); s=NULL; }
|
||||
s = strdup( tsearch2_yytext );
|
||||
tokenlen = tsearch2_yyleng;
|
||||
yyless( 0 );
|
||||
token = s;
|
||||
return FURL;
|
||||
}
|
||||
|
||||
<SERVER,URL,INITIAL>{HOSTNAME} {
|
||||
token = tsearch2_yytext;
|
||||
tokenlen = tsearch2_yyleng;
|
||||
return HOST;
|
||||
}
|
||||
|
||||
<SERVER>[/:]{URI} {
|
||||
token = tsearch2_yytext;
|
||||
tokenlen = tsearch2_yyleng;
|
||||
return URI;
|
||||
}
|
||||
|
||||
[[:alnum:]\./_-]+"/"[[:alnum:]\./_-]+ {
|
||||
token = tsearch2_yytext;
|
||||
tokenlen = tsearch2_yyleng;
|
||||
return FILEPATH;
|
||||
}
|
||||
|
||||
({CYRALPHA}+-)+{CYRALPHA}+ /* composite-word */ {
|
||||
BEGIN DELIM;
|
||||
if (s) { free(s); s=NULL; }
|
||||
s = strdup( tsearch2_yytext );
|
||||
tokenlen = tsearch2_yyleng;
|
||||
yyless( 0 );
|
||||
token = s;
|
||||
return CYRHYPHENWORD;
|
||||
}
|
||||
|
||||
([[:alpha:]]+-)+[[:alpha:]]+ /* composite-word */ {
|
||||
BEGIN DELIM;
|
||||
if (s) { free(s); s=NULL; }
|
||||
s = strdup( tsearch2_yytext );
|
||||
tokenlen = tsearch2_yyleng;
|
||||
yyless( 0 );
|
||||
token = s;
|
||||
return LATHYPHENWORD;
|
||||
}
|
||||
|
||||
({ALNUM}+-)+{ALNUM}+ /* composite-word */ {
|
||||
BEGIN DELIM;
|
||||
if (s) { free(s); s=NULL; }
|
||||
s = strdup( tsearch2_yytext );
|
||||
tokenlen = tsearch2_yyleng;
|
||||
yyless( 0 );
|
||||
token = s;
|
||||
return HYPHENWORD;
|
||||
}
|
||||
|
||||
<DELIM>[0-9]+\.[0-9]+\.[0-9\.]*[0-9] {
|
||||
token = tsearch2_yytext;
|
||||
tokenlen = tsearch2_yyleng;
|
||||
return VERSIONNUMBER;
|
||||
}
|
||||
|
||||
<DELIM>\+?[0-9]+\.[0-9]+ {
|
||||
token = tsearch2_yytext;
|
||||
tokenlen = tsearch2_yyleng;
|
||||
return DECIMAL;
|
||||
}
|
||||
|
||||
<DELIM>{CYRALPHA}+ /* one word in composite-word */ {
|
||||
token = tsearch2_yytext;
|
||||
tokenlen = tsearch2_yyleng;
|
||||
return CYRPARTHYPHENWORD;
|
||||
}
|
||||
|
||||
<DELIM>[[:alpha:]]+ /* one word in composite-word */ {
|
||||
token = tsearch2_yytext;
|
||||
tokenlen = tsearch2_yyleng;
|
||||
return LATPARTHYPHENWORD;
|
||||
}
|
||||
|
||||
<DELIM>{ALNUM}+ /* one word in composite-word */ {
|
||||
token = tsearch2_yytext;
|
||||
tokenlen = tsearch2_yyleng;
|
||||
return PARTHYPHENWORD;
|
||||
}
|
||||
|
||||
<DELIM>- {
|
||||
token = tsearch2_yytext;
|
||||
tokenlen = tsearch2_yyleng;
|
||||
return SPACE;
|
||||
}
|
||||
|
||||
<DELIM,SERVER,URL>.|\n /* return in basic state */ {
|
||||
BEGIN INITIAL;
|
||||
yyless( 0 );
|
||||
}
|
||||
|
||||
{CYRALPHA}+ /* normal word */ {
|
||||
token = tsearch2_yytext;
|
||||
tokenlen = tsearch2_yyleng;
|
||||
return CYRWORD;
|
||||
}
|
||||
|
||||
[[:alpha:]]+ /* normal word */ {
|
||||
token = tsearch2_yytext;
|
||||
tokenlen = tsearch2_yyleng;
|
||||
return LATWORD;
|
||||
}
|
||||
|
||||
{ALNUM}+ /* normal word */ {
|
||||
token = tsearch2_yytext;
|
||||
tokenlen = tsearch2_yyleng;
|
||||
return UWORD;
|
||||
}
|
||||
|
||||
[ \r\n\t]+ {
|
||||
token = tsearch2_yytext;
|
||||
tokenlen = tsearch2_yyleng;
|
||||
return SPACE;
|
||||
}
|
||||
|
||||
. {
|
||||
token = tsearch2_yytext;
|
||||
tokenlen = tsearch2_yyleng;
|
||||
return SPACE;
|
||||
}
|
||||
|
||||
%%
|
||||
|
||||
/* clearing after parsing from string */
|
||||
void
|
||||
tsearch2_end_parse(void)
|
||||
{
|
||||
if (s)
|
||||
{
|
||||
free(s);
|
||||
s = NULL;
|
||||
}
|
||||
tsearch2_yy_delete_buffer( buf );
|
||||
buf = NULL;
|
||||
}
|
||||
|
||||
/* start parse from string */
|
||||
void
|
||||
tsearch2_start_parse_str(char* str, int limit)
|
||||
{
|
||||
if (buf)
|
||||
tsearch2_end_parse();
|
||||
buf = tsearch2_yy_scan_bytes( str, limit );
|
||||
tsearch2_yy_switch_to_buffer( buf );
|
||||
BEGIN INITIAL;
|
||||
}
|
Reference in New Issue
Block a user