Re-refactor the core scanner's API, in order to get out from under the problem of different parsers having different YYSTYPE unions that they want to use with it. I defined a new union, core_YYSTYPE, that is just the (very short) list of semantic values returned by the core scanner. I had originally worried that this would require an extra interface layer, but actually we can have parser.c's base_yylex (formerly filtered_base_yylex) take care of that at no extra cost. Names associated with the core scanner are now "core_yy_foo", with "base_yy_foo" being used in the core Bison parser and the parser.c interface layer. This solves the last serious stumbling block to eliminating plpgsql's separate lexer. One restriction that will still be present is that plpgsql and the core will have to agree on the token numbers assigned to tokens that can be returned by the core lexer. Since Bison doesn't seem willing to accept external assignments of those numbers, we'll have to live with decreeing that the core and plpgsql grammars declare these tokens first and in the same order.
2025-10-25 13:17:41 +03:00 · 2009-11-09 18:38:48 +00:00
parent 2ace38d226
commit 10bcfa189b
5 changed files with 236 additions and 172 deletions
--- a/src/include/parser/gramparse.h
+++ b/src/include/parser/gramparse.h
@@ -11,7 +11,7 @@
 * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $PostgreSQL: pgsql/src/include/parser/gramparse.h,v 1.49 2009/11/05 23:24:26 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/parser/gramparse.h,v 1.50 2009/11/09 18:38:48 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -20,20 +20,11 @@
 #define GRAMPARSE_H

 #include "nodes/parsenodes.h"
-#include "parser/keywords.h"
+#include "parser/scanner.h"

 /*
- * We track token locations in terms of byte offsets from the start of the
- * source string, not the column number/line number representation that
- * bison uses by default.  Also, to minimize overhead we track only one
- * location (usually the first token location) for each construct, not
- * the beginning and ending locations as bison does by default.  It's
- * therefore sufficient to make YYLTYPE an int.
- */
-#define YYLTYPE  int
-
-/*
- * After defining YYLTYPE, it's safe to include gram.h.
+ * NB: include gram.h only AFTER including scanner.h, because scanner.h
+ * is what #defines YYLTYPE.
 */
 #include "parser/gram.h"

@@ -44,62 +35,24 @@
 typedef struct base_yy_extra_type
 {
 	/*
-	 * The string the lexer is physically scanning.  We keep this mainly so
-	 * that we can cheaply compute the offset of the current token (yytext).
+	 * Fields used by the core scanner.
 	 */
-	char	   *scanbuf;
-	Size		scanbuflen;
+	core_yy_extra_type core_yy_extra;

 	/*
-	 * The keyword list to use.
-	 */
-	const ScanKeyword *keywords;
-	int			num_keywords;
-
-	/*
-	 * literalbuf is used to accumulate literal values when multiple rules
-	 * are needed to parse a single literal.  Call startlit() to reset buffer
-	 * to empty, addlit() to add text.  NOTE: the string in literalbuf is
-	 * NOT necessarily null-terminated, but there always IS room to add a
-	 * trailing null at offset literallen.  We store a null only when we
-	 * need it.
-	 */
-	char	   *literalbuf;		/* palloc'd expandable buffer */
-	int			literallen;		/* actual current string length */
-	int			literalalloc;	/* current allocated buffer size */
-
-	int			xcdepth;		/* depth of nesting in slash-star comments */
-	char	   *dolqstart;		/* current $foo$ quote start string */
-
-	/* first part of UTF16 surrogate pair for Unicode escapes */
-	int32		utf16_first_part;
-
-	/* state variables for literal-lexing warnings */
-	bool		warn_on_first_escape;
-	bool		saw_non_ascii;
-
-	/*
-	 * State variables for filtered_base_yylex().
+	 * State variables for base_yylex().
 	 */
 	bool		have_lookahead;		/* is lookahead info valid? */
 	int			lookahead_token;	/* one-token lookahead */
-	YYSTYPE		lookahead_yylval;	/* yylval for lookahead token */
+	core_YYSTYPE lookahead_yylval;	/* yylval for lookahead token */
 	YYLTYPE		lookahead_yylloc;	/* yylloc for lookahead token */

 	/*
-	 * State variables that belong to the grammar, not the lexer.  It's
-	 * simpler to keep these here than to invent a separate structure.
-	 * These fields are unused/undefined if the lexer is invoked on its own.
+	 * State variables that belong to the grammar.
 	 */
-
 	List	   *parsetree;		/* final parse result is delivered here */
 } base_yy_extra_type;

-/*
- * The type of yyscanner is opaque outside scan.l.
- */
-typedef void *base_yyscan_t;
-
 /*
 * In principle we should use yyget_extra() to fetch the yyextra field
 * from a yyscanner struct.  However, flex always puts that field first,
@@ -110,22 +63,11 @@ typedef void *base_yyscan_t;


 /* from parser.c */
-extern int	filtered_base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp,
-								base_yyscan_t yyscanner);
-
-/* from scan.l */
-extern base_yyscan_t scanner_init(const char *str,
-								  base_yy_extra_type *yyext,
-								  const ScanKeyword *keywords,
-								  int num_keywords);
-extern void scanner_finish(base_yyscan_t yyscanner);
 extern int	base_yylex(YYSTYPE *lvalp, YYLTYPE *llocp,
-					   base_yyscan_t yyscanner);
-extern int	scanner_errposition(int location, base_yyscan_t yyscanner);
-extern void scanner_yyerror(const char *message, base_yyscan_t yyscanner);
+					   core_yyscan_t yyscanner);

 /* from gram.y */
 extern void parser_init(base_yy_extra_type *yyext);
-extern int	base_yyparse(base_yyscan_t yyscanner);
+extern int	base_yyparse(core_yyscan_t yyscanner);

 #endif   /* GRAMPARSE_H */