Text parser rewritten:

- supports multibyte encodings - stricter rules for lexemes - flex is no longer used. Add: - tsquery plainto_tsquery(text): function that makes a tsquery from plain text. - &&, || and !! operations for tsquery, for combining a tsquery from its parts: 'foo & bar' || 'asd' => 'foo & bar | asd'
2025-08-08 06:02:22 +03:00 · 2005-11-21 12:27:57 +00:00
parent b91e6ed93e
commit c52795d18a
15 changed files with 1613 additions and 424 deletions
--- a/contrib/tsearch2/wordparser/Makefile
+++ b/contrib/tsearch2/wordparser/Makefile
@@ -1,8 +1,8 @@
-# $PostgreSQL: pgsql/contrib/tsearch2/wordparser/Makefile,v 1.8 2005/10/18 01:30:49 tgl Exp $
+# $PostgreSQL: pgsql/contrib/tsearch2/wordparser/Makefile,v 1.9 2005/11/21 12:27:57 teodor Exp $

 SUBOBJS =  parser.o deflex.o

-EXTRA_CLEAN = SUBSYS.o $(SUBOBJS) parser.c
+EXTRA_CLEAN = SUBSYS.o $(SUBOBJS)

 PG_CPPFLAGS = -I$(srcdir)/..

@@ -20,13 +20,6 @@ override CFLAGS += $(CFLAGS_SL)

 all: SUBSYS.o

-parser.c: parser.l
-ifdef FLEX
-	$(FLEX) $(FLEXFLAGS) -8 -Ptsearch2_yy -o'$@' $<
-else
-	@$(missing) flex $< $@
-endif
-
 SUBSYS.o: $(SUBOBJS)
 	$(LD) $(LDREL) $(LDOUT) $@ $^

--- a/contrib/tsearch2/wordparser/deflex.c
+++ b/contrib/tsearch2/wordparser/deflex.c
@@ -15,7 +15,7 @@ const char *lex_descr[] = {
 	"Latin part of hyphenated word",
 	"Space symbols",
 	"HTML Tag",
-	"HTTP head",
+	"Protocol head",
 	"Hyphenated word",
 	"Latin hyphenated word",
 	"Non-latin hyphenated word",
@@ -42,7 +42,7 @@ const char *tok_alias[] = {
 	"lpart_hword",
 	"blank",
 	"tag",
-	"http",
+	"protocol",
 	"hword",
 	"lhword",
 	"nlhword",
--- a/contrib/tsearch2/wordparser/deflex.h
+++ b/contrib/tsearch2/wordparser/deflex.h
@@ -17,7 +17,7 @@
 #define LATPARTHYPHENWORD	11
 #define SPACE		12
 #define TAG			13
-#define HTTP		14
+#define PROTOCOL		14
 #define HYPHENWORD	15
 #define LATHYPHENWORD	16
 #define CYRHYPHENWORD	17
--- a/contrib/tsearch2/wordparser/parser.c
+++ b/contrib/tsearch2/wordparser/parser.c
--- a/contrib/tsearch2/wordparser/parser.h
+++ b/contrib/tsearch2/wordparser/parser.h
@@ -1,10 +1,147 @@
 #ifndef __PARSER_H__
 #define __PARSER_H__

-extern char *token;
-extern int	tokenlen;
-int			tsearch2_yylex(void);
-void		tsearch2_start_parse_str(char *, int);
-void		tsearch2_end_parse(void);
+#include <ctype.h>
+#include <limits.h>
+#include "ts_locale.h"
+
+typedef enum {
+	TPS_Base = 0,
+	TPS_InUWord,
+	TPS_InLatWord,
+	TPS_InCyrWord,
+	TPS_InUnsignedInt,
+	TPS_InSignedIntFirst,
+	TPS_InSignedInt,
+	TPS_InSpace,
+	TPS_InUDecimalFirst,
+	TPS_InUDecimal,
+	TPS_InDecimalFirst,
+	TPS_InDecimal,
+	TPS_InVersionFirst,
+	TPS_InVersion,
+	TPS_InMantissaFirst,
+	TPS_InMantissaSign,
+	TPS_InMantissa,
+	TPS_InHTMLEntityFirst,
+	TPS_InHTMLEntity,
+	TPS_InHTMLEntityNumFirst,
+	TPS_InHTMLEntityNum,
+	TPS_InHTMLEntityEnd,
+	TPS_InTagFirst,
+	TPS_InTagCloseFirst,
+	TPS_InTag,
+	TPS_InTagEscapeK,
+	TPS_InTagEscapeKK,
+	TPS_InTagBackSleshed,
+	TPS_InTagEnd,
+	TPS_InCommentFirst,
+	TPS_InCommentLast,
+	TPS_InComment,
+	TPS_InCloseCommentFirst,
+	TPS_InCloseCommentLast,
+	TPS_InCommentEnd,
+	TPS_InHostFirstDomen,
+	TPS_InHostDomenSecond,
+	TPS_InHostDomen,
+	TPS_InPortFirst,
+	TPS_InPort,
+	TPS_InHostFirstAN,
+	TPS_InHost,
+	TPS_InEmail,
+	TPS_InFileFirst,
+	TPS_InFile,
+	TPS_InFileNext,
+	TPS_InURIFirst,
+	TPS_InURIStart,
+	TPS_InURI,
+	TPS_InFURL,
+	TPS_InProtocolFirst,
+	TPS_InProtocolSecond,
+	TPS_InProtocolEnd,
+	TPS_InHyphenLatWordFirst,
+	TPS_InHyphenLatWord,
+	TPS_InHyphenCyrWordFirst,
+	TPS_InHyphenCyrWord,
+	TPS_InHyphenUWordFirst,
+	TPS_InHyphenUWord,
+	TPS_InHyphenValueFirst,
+	TPS_InHyphenValue,
+	TPS_InHyphenValueExact,
+	TPS_InParseHyphen,
+	TPS_InParseHyphenHyphen,
+	TPS_InHyphenCyrWordPart,
+	TPS_InHyphenLatWordPart,
+	TPS_InHyphenUWordPart,
+	TPS_InHyphenUnsignedInt,
+	TPS_InHDecimalPartFirst,
+	TPS_InHDecimalPart,
+	TPS_InHVersionPartFirst,
+	TPS_InHVersionPart,
+	TPS_Null  /* last state (fake value) */
+} TParserState;
+
+/* forward declaration */
+struct TParser;
+
+
+typedef int (*TParserCharTest)(struct TParser*);  /* any p_is* functions except p_iseq */
+typedef void (*TParserSpecial)(struct TParser*);  /* special handler for special cases... */
+
+typedef struct {
+        TParserCharTest isclass;
+        char            c;
+        uint16          flags;
+        TParserState    tostate;
+        int             type;
+        TParserSpecial  special;
+} TParserStateActionItem;
+
+typedef struct {
+        TParserState            state;
+        TParserStateActionItem  *action;
+} TParserStateAction;
+
+typedef struct TParserPosition {
+	int		posbyte; /* position of parser in bytes */
+	int		poschar; /* position of parser in characters */
+	int		charlen; /* length of current char */
+	int 		lenbytelexeme; /* presumably lexeme length in bytes -- confirm in parser.c */
+	int 		lencharlexeme; /* presumably lexeme length in characters -- confirm in parser.c */
+	TParserState	state;
+	struct TParserPosition	*prev; /* link to another position record; presumably the previously saved one -- confirm */
+	int		flags;
+	TParserStateActionItem	*pushedAtAction;
+} TParserPosition;
+
+typedef struct TParser {
+	/* string and position information */
+	char 		*str;  /* multibyte string */
+	int		lenstr; /* length of mbstring */
+	wchar_t		*wstr;  /* wide character string */ 
+	int		lenwstr; /* length of wide string */
+
+	/* State of parse */
+	int		charmaxlen;
+	bool		usewide;
+	TParserPosition	*state;
+	bool		ignore;
+	bool		wanthost;
+
+	/* silly char */
+	char c;
+
+	/* out */
+	char	 	*lexeme;
+	int 		lenbytelexeme;
+	int 		lencharlexeme;
+	int 		type;
+	
+} TParser;
+
+
+TParser* TParserInit( char *, int );
+bool	TParserGet( TParser* );
+void	TParserClose( TParser* );

 #endif
--- a/contrib/tsearch2/wordparser/parser.l
+++ b/contrib/tsearch2/wordparser/parser.l
@@ -1,346 +0,0 @@
-%{
-#include "postgres.h"
-
-#include "deflex.h"
-#include "parser.h"
-#include "common.h"
-
-/* Avoid exit() on fatal scanner errors */
-#undef fprintf
-#define fprintf(file, fmt, msg)  ts_error(ERROR, fmt, msg)
-
-char *token = NULL;  /* pointer to token */
-int tokenlen;
-static char *s     = NULL;  /* to return WHOLE hyphenated-word */
-
-YY_BUFFER_STATE buf = NULL; /* buffer to parse; it need for parse from string */
-
-typedef struct {
-	int tlen;
-	int clen;
-	char *str;
-} TagStorage;
-
-static TagStorage ts={0,0,NULL};
-
-static void
-addTag(void)
-{
-	while( ts.clen+tsearch2_yyleng+1 > ts.tlen ) {
-		ts.tlen*=2;
-		ts.str=realloc(ts.str,ts.tlen);
-		if (!ts.str)
-                	ereport(ERROR,
-                               	(errcode(ERRCODE_OUT_OF_MEMORY),
-                               	 errmsg("out of memory")));
-        }
-        memcpy(ts.str+ts.clen,tsearch2_yytext,tsearch2_yyleng);
-        ts.clen+=tsearch2_yyleng;
-	ts.str[ts.clen]='\0';
-}
-
-static void
-startTag(void)
-{
-	if ( ts.str==NULL ) {
-		ts.tlen=tsearch2_yyleng+1;
-		ts.str=malloc(ts.tlen);
-		if (!ts.str)
-                	ereport(ERROR,
-                                (errcode(ERRCODE_OUT_OF_MEMORY),
-                                 errmsg("out of memory")));
-	}
-	ts.clen=0;
-	ts.str[0]='\0';
-	addTag();
-}
-
-%}
-
-%option 8bit
-%option never-interactive
-%option nodefault
-%option nounput
-%option noyywrap
-
-/* parser's state for parsing hyphenated-word */
-%x DELIM  
-/* parser's state for parsing URL*/
-%x URL  
-%x SERVER  
-
-/* parser's state for parsing TAGS */
-%x INTAG
-%x QINTAG
-%x INCOMMENT
-%x INSCRIPT
-
-/* cyrillic koi8 char */
-CYRALNUM	[0-9\200-\377]
-CYRALPHA	[\200-\377]
-ALPHA		[a-zA-Z\200-\377]
-ALNUM		[0-9a-zA-Z\200-\377]
-
-
-HOSTNAME	([-_[:alnum:]]+\.)+[[:alpha:]]+
-URI		[-_[:alnum:]/%,\.;=&?#]+
-
-%%
-
-"<"[Ss][Cc][Rr][Ii][Pp][Tt] { BEGIN INSCRIPT; startTag(); }
-
-<INSCRIPT>"</"[Ss][Cc][Rr][Ii][Pp][Tt]">" {
-	BEGIN INITIAL; 
-	addTag();
-	token = ts.str;
-	tokenlen = ts.clen;
-	return TAG;
-}
-
-"<!--"	{ BEGIN INCOMMENT; startTag(); }
-
-<INCOMMENT>"-->"	{ 
-	BEGIN INITIAL;
-	addTag();
-	token = ts.str;
-	tokenlen = ts.clen;
-	return TAG;
-}
-
-
-"<"[\![:alpha:]]	{ BEGIN INTAG; startTag(); }
-
-"</"[[:alpha:]]	{ BEGIN INTAG; startTag(); }
-
-<INTAG>"\""	{ BEGIN QINTAG; addTag(); }
-
-<QINTAG>"\\\""	{ addTag(); }
-
-<QINTAG>"\""	{ BEGIN INTAG; addTag(); }
-
-<INTAG>">"	{ 
-	BEGIN INITIAL;
-	addTag();
-	token = ts.str;
-	tokenlen = ts.clen;
-	return TAG;
-}
-
-<QINTAG,INTAG,INCOMMENT,INSCRIPT>.|\n { addTag(); }	
-
-\&(quot|amp|nbsp|lt|gt)\;   {
-	token = tsearch2_yytext;
-	tokenlen = tsearch2_yyleng;
-	return HTMLENTITY;
-}
-
-\&\#[0-9][0-9]?[0-9]?\; {
-	token = tsearch2_yytext;
-	tokenlen = tsearch2_yyleng;
-	return HTMLENTITY;
-}
- 
-[-_\.[:alnum:]]+@{HOSTNAME}  /* Emails */ { 
-	token = tsearch2_yytext; 
-	tokenlen = tsearch2_yyleng;
-	return EMAIL; 
-}
-
-[+-]?[0-9]+(\.[0-9]+)?[eEdD][+-]?[0-9]+  /* float */ 	{ 
-	token = tsearch2_yytext; 
-	tokenlen = tsearch2_yyleng;
-	return SCIENTIFIC; 
-}
-
-[0-9]+\.[0-9]+\.[0-9\.]*[0-9] {
-	token = tsearch2_yytext;
-	tokenlen = tsearch2_yyleng;
-	return VERSIONNUMBER;
-}
-
-[+-]?[0-9]+\.[0-9]+ {
-	token = tsearch2_yytext;
-	tokenlen = tsearch2_yyleng;
-	return DECIMAL;
-}
-
-[+-][0-9]+ { 
-	token = tsearch2_yytext; 
-	tokenlen = tsearch2_yyleng;
-	return SIGNEDINT; 
-}
-
-<DELIM,INITIAL>[0-9]+ { 
-	token = tsearch2_yytext; 
-	tokenlen = tsearch2_yyleng;
-	return UNSIGNEDINT; 
-}
-
-http"://"        { 
-	BEGIN URL; 
-	token = tsearch2_yytext;
-	tokenlen = tsearch2_yyleng;
-	return HTTP;
-}
-
-ftp"://"        { 
-	BEGIN URL; 
-	token = tsearch2_yytext;
-	tokenlen = tsearch2_yyleng;
-	return HTTP;
-}
-
-<URL,INITIAL>{HOSTNAME}[/:]{URI} { 
-	BEGIN SERVER;
-	if (s) { free(s); s=NULL; } 
-	s = strdup( tsearch2_yytext ); 
-	tokenlen = tsearch2_yyleng;
-	yyless( 0 ); 
-	token = s;
-	return FURL;
-}
-
-<SERVER,URL,INITIAL>{HOSTNAME} {
-	token = tsearch2_yytext; 
-	tokenlen = tsearch2_yyleng;
-	return HOST;
-}
-
-<SERVER>[/:]{URI} 	{
-	token = tsearch2_yytext;
-	tokenlen = tsearch2_yyleng;
-	return URI;
-}
-
-[[:alnum:]\./_-]+"/"[[:alnum:]\./_-]+ {
-	token = tsearch2_yytext;
-	tokenlen = tsearch2_yyleng;
-	return FILEPATH;
-}
-
-({CYRALPHA}+-)+{CYRALPHA}+ /* composite-word */	{
-	BEGIN DELIM;
-	if (s) { free(s); s=NULL; } 
-	s = strdup( tsearch2_yytext );
-	tokenlen = tsearch2_yyleng;
-	yyless( 0 );
-	token = s;
-	return CYRHYPHENWORD;
-}
-
-([[:alpha:]]+-)+[[:alpha:]]+ /* composite-word */	{
-	 BEGIN DELIM;
-	if (s) { free(s); s=NULL; } 
-	s = strdup( tsearch2_yytext );
-	tokenlen = tsearch2_yyleng;
-	yyless( 0 );
-	token = s;
-	return LATHYPHENWORD;
-}
-
-({ALNUM}+-)+{ALNUM}+ /* composite-word */	{
-	BEGIN DELIM;
-	if (s) { free(s); s=NULL; } 
-	s = strdup( tsearch2_yytext );
-	tokenlen = tsearch2_yyleng;
-	yyless( 0 );
-	token = s;
-	return HYPHENWORD;
-}
-
-<DELIM>[0-9]+\.[0-9]+\.[0-9\.]*[0-9] {
-	token = tsearch2_yytext;
-	tokenlen = tsearch2_yyleng;
-	return VERSIONNUMBER;
-}
-
-<DELIM>\+?[0-9]+\.[0-9]+ {
-	token = tsearch2_yytext;
-	tokenlen = tsearch2_yyleng;
-	return DECIMAL;
-}
-
-<DELIM>{CYRALPHA}+  /* one word in composite-word */	{ 
-	token = tsearch2_yytext; 
-	tokenlen = tsearch2_yyleng;
-	return CYRPARTHYPHENWORD; 
-}
-
-<DELIM>[[:alpha:]]+  /* one word in composite-word */	{ 
-	token = tsearch2_yytext; 
-	tokenlen = tsearch2_yyleng;
-	return LATPARTHYPHENWORD; 
-}
-
-<DELIM>{ALNUM}+  /* one word in composite-word */	{ 
-	token = tsearch2_yytext; 
-	tokenlen = tsearch2_yyleng;
-	return PARTHYPHENWORD; 
-}
-
-<DELIM>-  { 
-	token = tsearch2_yytext;
-	tokenlen = tsearch2_yyleng;
-	return SPACE;
-}
-
-<DELIM,SERVER,URL>.|\n /* return in basic state */	{
-	BEGIN INITIAL;
-	yyless( 0 );
-}
-
-{CYRALPHA}+ /* normal word */	{ 
-	token = tsearch2_yytext; 
-	tokenlen = tsearch2_yyleng;
-	return CYRWORD; 
-}
-
-[[:alpha:]]+ /* normal word */	{ 
-	token = tsearch2_yytext; 
-	tokenlen = tsearch2_yyleng;
-	return LATWORD; 
-}
-
-{ALNUM}+ /* normal word */	{ 
-	token = tsearch2_yytext; 
-	tokenlen = tsearch2_yyleng;
-	return UWORD; 
-}
-
-[ \r\n\t]+ {
-	token = tsearch2_yytext;
-	tokenlen = tsearch2_yyleng;
-	return SPACE;
-}
-
-. {
-	token = tsearch2_yytext;
-	tokenlen = tsearch2_yyleng;
-	return SPACE;
-} 
-
-%%
-
-/* clearing after parsing from string */
-void
-tsearch2_end_parse(void)
-{
-	if (s)
-	{
-		free(s);
-		s = NULL;
-	} 
-	tsearch2_yy_delete_buffer( buf );
-	buf = NULL;
-} 
-
-/* start parse from string */
-void
-tsearch2_start_parse_str(char* str, int limit)
-{
-	if (buf)
-		tsearch2_end_parse();
-	buf = tsearch2_yy_scan_bytes( str, limit );
-	tsearch2_yy_switch_to_buffer( buf );
-	BEGIN INITIAL;
-}