1
0
mirror of https://github.com/postgres/postgres.git synced 2025-08-08 06:02:22 +03:00

Text parser rewritten:

- supports multibyte encodings
        - stricter rules for lexemes
        - flex is no longer used
Add:
        - tsquery plainto_tsquery(text)
          Function makes tsquery from plain text.
        - &&, ||, !! operations on tsquery, for combining
          a tsquery from its parts:  'foo & bar' || 'asd' => 'foo & bar | asd'
This commit is contained in:
Teodor Sigaev
2005-11-21 12:27:57 +00:00
parent b91e6ed93e
commit c52795d18a
15 changed files with 1613 additions and 424 deletions

View File

@@ -1,8 +1,8 @@
# $PostgreSQL: pgsql/contrib/tsearch2/wordparser/Makefile,v 1.8 2005/10/18 01:30:49 tgl Exp $
# $PostgreSQL: pgsql/contrib/tsearch2/wordparser/Makefile,v 1.9 2005/11/21 12:27:57 teodor Exp $
SUBOBJS = parser.o deflex.o
EXTRA_CLEAN = SUBSYS.o $(SUBOBJS) parser.c
EXTRA_CLEAN = SUBSYS.o $(SUBOBJS)
PG_CPPFLAGS = -I$(srcdir)/..
@@ -20,13 +20,6 @@ override CFLAGS += $(CFLAGS_SL)
all: SUBSYS.o
parser.c: parser.l
ifdef FLEX
$(FLEX) $(FLEXFLAGS) -8 -Ptsearch2_yy -o'$@' $<
else
@$(missing) flex $< $@
endif
SUBSYS.o: $(SUBOBJS)
$(LD) $(LDREL) $(LDOUT) $@ $^

View File

@@ -15,7 +15,7 @@ const char *lex_descr[] = {
"Latin part of hyphenated word",
"Space symbols",
"HTML Tag",
"HTTP head",
"Protocol head",
"Hyphenated word",
"Latin hyphenated word",
"Non-latin hyphenated word",
@@ -42,7 +42,7 @@ const char *tok_alias[] = {
"lpart_hword",
"blank",
"tag",
"http",
"protocol",
"hword",
"lhword",
"nlhword",

View File

@@ -17,7 +17,7 @@
#define LATPARTHYPHENWORD 11
#define SPACE 12
#define TAG 13
#define HTTP 14
#define PROTOCOL 14
#define HYPHENWORD 15
#define LATHYPHENWORD 16
#define CYRHYPHENWORD 17

File diff suppressed because it is too large Load Diff

View File

@@ -1,10 +1,147 @@
#ifndef __PARSER_H__
#define __PARSER_H__
extern char *token;
extern int tokenlen;
int tsearch2_yylex(void);
void tsearch2_start_parse_str(char *, int);
void tsearch2_end_parse(void);
#include <ctype.h>
#include <limits.h>
#include "ts_locale.h"
/*
 * States of the text parser's state machine.
 *
 * TPS_Base is explicitly 0; the remaining enumerators take consecutive
 * values in declaration order, so entries must not be reordered or removed
 * without checking every place that depends on the numeric values.
 * TPS_Null is not a real state: it only marks the end of the list.
 *
 * NOTE(review): the per-state transition tables live outside this header
 * (presumably in parser.c) -- confirm there before adding a state.
 */
typedef enum {
TPS_Base = 0,
TPS_InUWord,
TPS_InLatWord,
TPS_InCyrWord,
TPS_InUnsignedInt,
TPS_InSignedIntFirst,
TPS_InSignedInt,
TPS_InSpace,
TPS_InUDecimalFirst,
TPS_InUDecimal,
TPS_InDecimalFirst,
TPS_InDecimal,
TPS_InVersionFirst,
TPS_InVersion,
TPS_InMantissaFirst,
TPS_InMantissaSign,
TPS_InMantissa,
TPS_InHTMLEntityFirst,
TPS_InHTMLEntity,
TPS_InHTMLEntityNumFirst,
TPS_InHTMLEntityNum,
TPS_InHTMLEntityEnd,
TPS_InTagFirst,
TPS_InTagCloseFirst,
TPS_InTag,
TPS_InTagEscapeK,
TPS_InTagEscapeKK,
TPS_InTagBackSleshed,
TPS_InTagEnd,
TPS_InCommentFirst,
TPS_InCommentLast,
TPS_InComment,
TPS_InCloseCommentFirst,
TPS_InCloseCommentLast,
TPS_InCommentEnd,
TPS_InHostFirstDomen,
TPS_InHostDomenSecond,
TPS_InHostDomen,
TPS_InPortFirst,
TPS_InPort,
TPS_InHostFirstAN,
TPS_InHost,
TPS_InEmail,
TPS_InFileFirst,
TPS_InFile,
TPS_InFileNext,
TPS_InURIFirst,
TPS_InURIStart,
TPS_InURI,
TPS_InFURL,
TPS_InProtocolFirst,
TPS_InProtocolSecond,
TPS_InProtocolEnd,
TPS_InHyphenLatWordFirst,
TPS_InHyphenLatWord,
TPS_InHyphenCyrWordFirst,
TPS_InHyphenCyrWord,
TPS_InHyphenUWordFirst,
TPS_InHyphenUWord,
TPS_InHyphenValueFirst,
TPS_InHyphenValue,
TPS_InHyphenValueExact,
TPS_InParseHyphen,
TPS_InParseHyphenHyphen,
TPS_InHyphenCyrWordPart,
TPS_InHyphenLatWordPart,
TPS_InHyphenUWordPart,
TPS_InHyphenUnsignedInt,
TPS_InHDecimalPartFirst,
TPS_InHDecimalPart,
TPS_InHVersionPartFirst,
TPS_InHVersionPart,
TPS_Null /* last state (fake value) */
} TParserState;
/*
 * Forward declaration: the callback types below take a pointer to the
 * parser, whose full definition appears later in this header.
 */
struct TParser;
/* Character-class predicate: receives the parser and returns an int result. */
typedef int (*TParserCharTest)(struct TParser*); /* any p_is* functions except p_iseq */
/* Hook that receives the parser and returns nothing. */
typedef void (*TParserSpecial)(struct TParser*); /* special handler for special cases... */
/*
 * One entry of a state's action list.
 *
 * NOTE(review): the field meanings below are inferred from names and types
 * only; the code that interprets them is not in this header -- confirm
 * against the parser implementation.
 */
typedef struct {
TParserCharTest isclass; /* character test callback for this entry */
char c; /* presumably the character used when the test is p_iseq -- confirm */
uint16 flags; /* action flags; flag values are not defined in this header */
TParserState tostate; /* presumably the state to enter when the entry matches -- confirm */
int type; /* presumably the lexeme type to report -- confirm */
TParserSpecial special; /* optional special-case handler */
} TParserStateActionItem;
/*
 * Associates a parser state with a pointer to its action items.
 *
 * NOTE(review): how the end of the `action` array is marked is not visible
 * in this header -- confirm against the parser implementation.
 */
typedef struct {
TParserState state; /* state this entry describes */
TParserStateActionItem *action; /* action items for that state */
} TParserStateAction;
/*
 * Parser position record. Byte and character offsets are tracked
 * separately, and each record links to another one through `prev`.
 *
 * NOTE(review): `prev` and `pushedAtAction` suggest these records form a
 * stack of saved positions -- confirm against the parser implementation.
 */
typedef struct TParserPosition {
int posbyte; /* position of parser in bytes */
int poschar; /* position of parser in characters */
int charlen; /* length of current char */
int lenbytelexeme; /* presumably length of the current lexeme in bytes -- confirm */
int lencharlexeme; /* presumably length of the current lexeme in characters -- confirm */
TParserState state; /* parser state associated with this position */
struct TParserPosition *prev; /* linked position record */
int flags; /* flag values are not defined in this header */
TParserStateActionItem *pushedAtAction; /* presumably the action item active when this record was pushed -- confirm */
} TParserPosition;
/*
 * Parser context: the input string (multibyte and wide forms), the current
 * parse state, and the output fields describing a lexeme.
 *
 * NOTE(review): the exact moment the "out" fields become valid (presumably
 * after a successful TParserGet) is not visible here -- confirm.
 */
typedef struct TParser {
/* string and position information */
char *str; /* multibyte string */
int lenstr; /* length of mbstring */
wchar_t *wstr; /* wide character string */
int lenwstr; /* length of wstring */
/* State of parse */
int charmaxlen; /* presumably the encoding's maximum bytes per character -- confirm */
bool usewide; /* presumably selects wstr over str for character tests -- confirm */
TParserPosition *state; /* current position record */
bool ignore;
bool wanthost;
/* silly char */
char c;
/* out */
char *lexeme; /* lexeme text */
int lenbytelexeme; /* presumably lexeme length in bytes -- confirm */
int lencharlexeme; /* presumably lexeme length in characters -- confirm */
int type; /* lexeme type */
} TParser;
/*
 * Public parser API.
 *
 * NOTE(review): parameter meanings are not named in the prototypes. The
 * (char *, int) pair is presumably the input string and its length, and
 * TParserGet presumably returns true while a lexeme was produced --
 * confirm against the implementation.
 */
TParser* TParserInit( char *, int ); /* create a parser */
bool TParserGet( TParser* ); /* presumably fetch the next lexeme into the parser's out fields -- confirm */
void TParserClose( TParser* ); /* presumably release the parser -- confirm */
#endif

View File

@@ -1,346 +0,0 @@
%{
#include "postgres.h"
#include "deflex.h"
#include "parser.h"
#include "common.h"
/* Avoid exit() on fatal scanner errors */
#undef fprintf
#define fprintf(file, fmt, msg) ts_error(ERROR, fmt, msg)
char *token = NULL; /* pointer to token */
int tokenlen;
static char *s = NULL; /* to return WHOLE hyphenated-word */
YY_BUFFER_STATE buf = NULL; /* buffer to parse; it need for parse from string */
/*
 * Growable buffer that accumulates the text of a tag across several
 * scanner matches.  It is allocated with malloc/realloc and reused from
 * one tag to the next.
 */
typedef struct {
int tlen; /* allocated size of str, in bytes */
int clen; /* bytes currently stored in str, not counting the trailing '\0' */
char *str; /* NUL-terminated accumulated text; NULL until first use */
} TagStorage;
/* Single shared accumulator; starts empty and unallocated. */
static TagStorage ts={0,0,NULL};
/*
 * Append the current scanner match (tsearch2_yytext, tsearch2_yyleng bytes)
 * to the tag accumulator `ts`, keeping it NUL-terminated.
 *
 * The buffer grows by doubling.  The new size is computed in a local and
 * committed only after realloc succeeds, so an allocation failure leaves
 * `ts` pointing at its old, still-valid buffer instead of leaking it and
 * leaving ts.str == NULL with a stale ts.tlen.
 *
 * Reports ERROR (out of memory) if the buffer cannot be grown.
 */
static void
addTag(void)
{
	int			needed = ts.clen + tsearch2_yyleng + 1;	/* +1 for '\0' */
	int			newlen = ts.tlen;

	/* Doubling from zero would never terminate; start from what is needed. */
	if (newlen <= 0)
		newlen = needed;
	while (needed > newlen)
		newlen *= 2;

	if (newlen != ts.tlen)
	{
		char	   *grown = realloc(ts.str, newlen);

		if (!grown)
			ereport(ERROR,
					(errcode(ERRCODE_OUT_OF_MEMORY),
					 errmsg("out of memory")));
		ts.str = grown;
		ts.tlen = newlen;
	}

	memcpy(ts.str + ts.clen, tsearch2_yytext, tsearch2_yyleng);
	ts.clen += tsearch2_yyleng;
	ts.str[ts.clen] = '\0';
}
/*
 * Begin accumulating a new tag: allocate the shared buffer on first use,
 * reset it to the empty string, then append the current scanner match.
 *
 * Reports ERROR (out of memory) if the initial allocation fails.
 */
static void
startTag(void)
{
	if (!ts.str)
	{
		/* first use: room for the current match plus the terminator */
		ts.tlen = tsearch2_yyleng + 1;
		ts.str = malloc(ts.tlen);
		if (ts.str == NULL)
			ereport(ERROR,
					(errcode(ERRCODE_OUT_OF_MEMORY),
					 errmsg("out of memory")));
	}

	/* discard whatever the previous tag left behind */
	ts.clen = 0;
	ts.str[0] = '\0';

	addTag();
}
%}
%option 8bit
%option never-interactive
%option nodefault
%option nounput
%option noyywrap
/* parser's state for parsing hyphenated-word */
%x DELIM
/* parser's state for parsing URL*/
%x URL
%x SERVER
/* parser's state for parsing TAGS */
%x INTAG
%x QINTAG
%x INCOMMENT
%x INSCRIPT
/* cyrillic koi8 char */
CYRALNUM [0-9\200-\377]
CYRALPHA [\200-\377]
ALPHA [a-zA-Z\200-\377]
ALNUM [0-9a-zA-Z\200-\377]
HOSTNAME ([-_[:alnum:]]+\.)+[[:alpha:]]+
URI [-_[:alnum:]/%,\.;=&?#]+
%%
"<"[Ss][Cc][Rr][Ii][Pp][Tt] { BEGIN INSCRIPT; startTag(); }
<INSCRIPT>"</"[Ss][Cc][Rr][Ii][Pp][Tt]">" {
BEGIN INITIAL;
addTag();
token = ts.str;
tokenlen = ts.clen;
return TAG;
}
"<!--" { BEGIN INCOMMENT; startTag(); }
<INCOMMENT>"-->" {
BEGIN INITIAL;
addTag();
token = ts.str;
tokenlen = ts.clen;
return TAG;
}
"<"[\![:alpha:]] { BEGIN INTAG; startTag(); }
"</"[[:alpha:]] { BEGIN INTAG; startTag(); }
<INTAG>"\"" { BEGIN QINTAG; addTag(); }
<QINTAG>"\\\"" { addTag(); }
<QINTAG>"\"" { BEGIN INTAG; addTag(); }
<INTAG>">" {
BEGIN INITIAL;
addTag();
token = ts.str;
tokenlen = ts.clen;
return TAG;
}
<QINTAG,INTAG,INCOMMENT,INSCRIPT>.|\n { addTag(); }
\&(quot|amp|nbsp|lt|gt)\; {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return HTMLENTITY;
}
\&\#[0-9][0-9]?[0-9]?\; {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return HTMLENTITY;
}
[-_\.[:alnum:]]+@{HOSTNAME} /* Emails */ {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return EMAIL;
}
[+-]?[0-9]+(\.[0-9]+)?[eEdD][+-]?[0-9]+ /* float */ {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return SCIENTIFIC;
}
[0-9]+\.[0-9]+\.[0-9\.]*[0-9] {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return VERSIONNUMBER;
}
[+-]?[0-9]+\.[0-9]+ {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return DECIMAL;
}
[+-][0-9]+ {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return SIGNEDINT;
}
<DELIM,INITIAL>[0-9]+ {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return UNSIGNEDINT;
}
http"://" {
BEGIN URL;
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return HTTP;
}
ftp"://" {
BEGIN URL;
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return HTTP;
}
<URL,INITIAL>{HOSTNAME}[/:]{URI} {
	/*
	 * Full URL (host followed by path).  The whole match is returned as
	 * one FURL token from a private copy in `s`, then pushed back with
	 * yyless(0) so it is scanned again in the SERVER start condition.
	 * The copy and tokenlen must be taken before yyless(0), which
	 * truncates tsearch2_yytext/tsearch2_yyleng to zero.
	 */
	BEGIN SERVER;
	free(s);	/* free(NULL) is a no-op */
	s = strdup( tsearch2_yytext );
	if (!s)
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of memory")));
	tokenlen = tsearch2_yyleng;
	yyless( 0 );
	token = s;
	return FURL;
}
<SERVER,URL,INITIAL>{HOSTNAME} {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return HOST;
}
<SERVER>[/:]{URI} {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return URI;
}
[[:alnum:]\./_-]+"/"[[:alnum:]\./_-]+ {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return FILEPATH;
}
({CYRALPHA}+-)+{CYRALPHA}+ /* composite-word */ {
	/*
	 * The whole hyphenated word is returned first, from a private copy
	 * in `s`; yyless(0) then pushes the match back so it is scanned
	 * again in the DELIM start condition.  The copy and tokenlen must be
	 * taken before yyless(0), which truncates
	 * tsearch2_yytext/tsearch2_yyleng to zero.
	 */
	BEGIN DELIM;
	free(s);	/* free(NULL) is a no-op */
	s = strdup( tsearch2_yytext );
	if (!s)
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of memory")));
	tokenlen = tsearch2_yyleng;
	yyless( 0 );
	token = s;
	return CYRHYPHENWORD;
}
([[:alpha:]]+-)+[[:alpha:]]+ /* composite-word */ {
	/*
	 * The whole hyphenated word is returned first, from a private copy
	 * in `s`; yyless(0) then pushes the match back so it is scanned
	 * again in the DELIM start condition.  The copy and tokenlen must be
	 * taken before yyless(0), which truncates
	 * tsearch2_yytext/tsearch2_yyleng to zero.
	 */
	BEGIN DELIM;
	free(s);	/* free(NULL) is a no-op */
	s = strdup( tsearch2_yytext );
	if (!s)
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of memory")));
	tokenlen = tsearch2_yyleng;
	yyless( 0 );
	token = s;
	return LATHYPHENWORD;
}
({ALNUM}+-)+{ALNUM}+ /* composite-word */ {
	/*
	 * The whole hyphenated word is returned first, from a private copy
	 * in `s`; yyless(0) then pushes the match back so it is scanned
	 * again in the DELIM start condition.  The copy and tokenlen must be
	 * taken before yyless(0), which truncates
	 * tsearch2_yytext/tsearch2_yyleng to zero.
	 */
	BEGIN DELIM;
	free(s);	/* free(NULL) is a no-op */
	s = strdup( tsearch2_yytext );
	if (!s)
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of memory")));
	tokenlen = tsearch2_yyleng;
	yyless( 0 );
	token = s;
	return HYPHENWORD;
}
<DELIM>[0-9]+\.[0-9]+\.[0-9\.]*[0-9] {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return VERSIONNUMBER;
}
<DELIM>\+?[0-9]+\.[0-9]+ {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return DECIMAL;
}
<DELIM>{CYRALPHA}+ /* one word in composite-word */ {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return CYRPARTHYPHENWORD;
}
<DELIM>[[:alpha:]]+ /* one word in composite-word */ {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return LATPARTHYPHENWORD;
}
<DELIM>{ALNUM}+ /* one word in composite-word */ {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return PARTHYPHENWORD;
}
<DELIM>- {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return SPACE;
}
<DELIM,SERVER,URL>.|\n /* return in basic state */ {
BEGIN INITIAL;
yyless( 0 );
}
{CYRALPHA}+ /* normal word */ {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return CYRWORD;
}
[[:alpha:]]+ /* normal word */ {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return LATWORD;
}
{ALNUM}+ /* normal word */ {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return UWORD;
}
[ \r\n\t]+ {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return SPACE;
}
. {
token = tsearch2_yytext;
tokenlen = tsearch2_yyleng;
return SPACE;
}
%%
/*
 * Release everything held for the current string parse: the saved copy of
 * the last whole-word/URL token and the scanner buffer.
 */
void
tsearch2_end_parse(void)
{
	/* free(NULL) is a no-op, so no guard is needed */
	free(s);
	s = NULL;

	tsearch2_yy_delete_buffer(buf);
	buf = NULL;
}
/*
 * Start scanning the first `limit` bytes of `str`.  Any parse still in
 * progress is torn down first, and the scanner is put back into its
 * INITIAL start condition.
 */
void
tsearch2_start_parse_str(char* str, int limit)
{
	if (buf != NULL)
	{
		/* a previous parse was never closed; clean it up */
		tsearch2_end_parse();
	}

	buf = tsearch2_yy_scan_bytes(str, limit);
	tsearch2_yy_switch_to_buffer(buf);

	BEGIN INITIAL;
}