From 787514b30bb7dd0b3484d6cb717e3b9aafc06c4a Mon Sep 17 00:00:00 2001
From: Michael Paquier <michael@paquier.xyz>
Date: Wed, 26 Mar 2025 15:21:05 +0900
Subject: [PATCH] Use relation name instead of OID in query jumbling for
 RangeTblEntry

custom_query_jumble (introduced in 5ac462e2b7ac as a node field
attribute) is now assigned to the expanded reference name "eref" of
RangeTblEntry, adding in the query jumble computation the non-qualified
aliased relation name, without the list of column names.  The relation
OID is removed from the query jumbling.

The effects of this change can be seen in the tests added by
3430215fe35f, where pg_stat_statements (PGSS) entries are now grouped
using the relation name, regardless of which schema's relation
search_path resolves that name to.
For example, these two relations are different, but are now grouped in a
single PGSS entry as they are assigned the same query ID:
CREATE TABLE foo1.tab (a int);
CREATE TABLE foo2.tab (b int);
SET search_path = 'foo1';
SELECT count(*) FROM tab;
SET search_path = 'foo2';
SELECT count(*) FROM tab;
SELECT count(*) FROM foo1.tab;
SELECT count(*) FROM foo2.tab;
SELECT query, calls FROM pg_stat_statements WHERE query ~ 'FROM tab';
          query           | calls
--------------------------+-------
 SELECT count(*) FROM tab |     4
(1 row)

It is still possible to use an alias in the FROM clause to split these.
Grouping by name is useful for relations re-created with the same name,
as queries based on such relations are now grouped in the same PGSS
entry.  For permanent schemas, it should not really matter in practice.
The main benefit is for workloads that use a lot of temporary relations,
which are usually re-created with the same name continuously.  These can
be a heavy source of bloat in PGSS depending on the workload.  Such
entries can now be grouped together, improving the user experience.

The original idea from Christoph Berg used catalog lookups to find
temporary relations, something that the query jumble has never done and
that could cause some performance regressions.  The idea to use
RangeTblEntry.eref and the relation name, applying the same rules for
all relations, temporary and not temporary, has been proposed by Tom
Lane.  The documentation additions have been suggested by Sami Imseih.

Author: Michael Paquier <michael@paquier.xyz>
Co-authored-by: Sami Imseih <samimseih@gmail.com>
Reviewed-by: Christoph Berg <myon@debian.org>
Reviewed-by: Lukas Fittl <lukas@fittl.com>
Reviewed-by: Sami Imseih <samimseih@gmail.com>
Discussion: https://postgr.es/m/Z9iWXKGwkm8RAC93@msg.df7cb.de
---
 .../pg_stat_statements/expected/select.out    | 20 ++++++++-----------
 doc/src/sgml/pgstatstatements.sgml            |  9 +++++++--
 src/backend/nodes/queryjumblefuncs.c          | 19 ++++++++++++++++++
 src/include/nodes/parsenodes.h                | 11 +++++++---
 4 files changed, 42 insertions(+), 17 deletions(-)

diff --git a/contrib/pg_stat_statements/expected/select.out b/contrib/pg_stat_statements/expected/select.out
index 708c6b0e9c4..1eebc2898ab 100644
--- a/contrib/pg_stat_statements/expected/select.out
+++ b/contrib/pg_stat_statements/expected/select.out
@@ -433,11 +433,10 @@ COMMIT;
 SELECT calls, query FROM pg_stat_statements ORDER BY query COLLATE "C";
  calls |                                 query                                  
 -------+------------------------------------------------------------------------
-     1 | SELECT * FROM temp_t
-     1 | SELECT * FROM temp_t
+     2 | SELECT * FROM temp_t
      0 | SELECT calls, query FROM pg_stat_statements ORDER BY query COLLATE "C"
      1 | SELECT pg_stat_statements_reset() IS NOT NULL AS t
-(4 rows)
+(3 rows)
 
 SELECT pg_stat_statements_reset() IS NOT NULL AS t;
  t 
@@ -623,18 +622,15 @@ SELECT a AS a1 FROM pgss_schema_2.tab_search_diff_2;
 SELECT calls, query FROM pg_stat_statements ORDER BY query COLLATE "C";
  calls |                                 query                                  
 -------+------------------------------------------------------------------------
-     3 | SELECT a FROM pgss_schema_2.tab_search_diff_2 AS t1
-     9 | SELECT a FROM tab_search_diff_2 AS t1
-     1 | SELECT a, b FROM pgss_schema_1.tab_search_same
-     3 | SELECT a, b FROM tab_search_same
+     8 | SELECT a FROM tab_search_diff_2
+     4 | SELECT a FROM tab_search_diff_2 AS t1
+     4 | SELECT a, b FROM tab_search_same
      0 | SELECT calls, query FROM pg_stat_statements ORDER BY query COLLATE "C"
-     1 | SELECT count(*) FROM pgss_schema_1.tab_search_same
-     1 | SELECT count(*) FROM pgss_schema_2.tab_search_diff_1
-     3 | SELECT count(*) FROM tab_search_diff_1
+     4 | SELECT count(*) FROM tab_search_diff_1
      4 | SELECT count(*) FROM tab_search_diff_2
-     3 | SELECT count(*) FROM tab_search_same
+     4 | SELECT count(*) FROM tab_search_same
      1 | SELECT pg_stat_statements_reset() IS NOT NULL AS t
-(11 rows)
+(8 rows)
 
 DROP SCHEMA pgss_schema_1 CASCADE;
 NOTICE:  drop cascades to 3 other objects
diff --git a/doc/src/sgml/pgstatstatements.sgml b/doc/src/sgml/pgstatstatements.sgml
index f4e384e95ae..625b84ebfef 100644
--- a/doc/src/sgml/pgstatstatements.sgml
+++ b/doc/src/sgml/pgstatstatements.sgml
@@ -675,8 +675,13 @@ calls | 2
    things, the internal object identifiers appearing in this representation.
    This has some counterintuitive implications.  For example,
    <filename>pg_stat_statements</filename> will consider two apparently-identical
-   queries to be distinct, if they reference a table that was dropped
-   and recreated between the executions of the two queries.
+   queries to be distinct, if they reference, for instance, a function that
+   was dropped and recreated between the executions of the two queries.
+   Conversely, if a table is dropped and recreated between the
+   executions of queries, two apparently-identical queries may be
+   considered the same. However, if the alias for a table is different
+   for otherwise-similar queries, these queries will be considered
+   distinct.
    The hashing process is also sensitive to differences in
    machine architecture and other facets of the platform.
    Furthermore, it is not safe to assume that <structfield>queryid</structfield>
diff --git a/src/backend/nodes/queryjumblefuncs.c b/src/backend/nodes/queryjumblefuncs.c
index f8b0f91704b..62d6cfb7ac1 100644
--- a/src/backend/nodes/queryjumblefuncs.c
+++ b/src/backend/nodes/queryjumblefuncs.c
@@ -67,6 +67,9 @@ static void _jumbleElements(JumbleState *jstate, List *elements);
 static void _jumbleA_Const(JumbleState *jstate, Node *node);
 static void _jumbleList(JumbleState *jstate, Node *node);
 static void _jumbleVariableSetStmt(JumbleState *jstate, Node *node);
+static void _jumbleRangeTblEntry_eref(JumbleState *jstate,
+									  RangeTblEntry *rte,
+									  Alias *expr);
 
 /*
  * Given a possibly multi-statement source string, confine our attention to the
@@ -516,3 +519,19 @@ _jumbleVariableSetStmt(JumbleState *jstate, Node *node)
 	JUMBLE_FIELD(is_local);
 	JUMBLE_LOCATION(location);
 }
+
+/*
+ * Custom query jumble function for RangeTblEntry.eref (type and aliasname).
+ */
+static void
+_jumbleRangeTblEntry_eref(JumbleState *jstate,
+						  RangeTblEntry *rte,
+						  Alias *expr)
+{
+	JUMBLE_FIELD(type);
+
+	/*
+	 * Only the name is jumbled here; the list of column names is ignored.
+	 */
+	JUMBLE_STRING(aliasname);
+}
diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h
index 23c9e3c5abf..df331b1c0d9 100644
--- a/src/include/nodes/parsenodes.h
+++ b/src/include/nodes/parsenodes.h
@@ -1050,8 +1050,13 @@ typedef struct RangeTblEntry
 	 */
 	/* user-written alias clause, if any */
 	Alias	   *alias pg_node_attr(query_jumble_ignore);
-	/* expanded reference names */
-	Alias	   *eref pg_node_attr(query_jumble_ignore);
+
+	/*
+	 * Expanded reference names.  This uses a custom query jumble function so
+	 * that the table name is included in the computation, but not its list of
+	 * columns.
+	 */
+	Alias	   *eref pg_node_attr(custom_query_jumble);
 
 	RTEKind		rtekind;		/* see above */
 
@@ -1094,7 +1099,7 @@ typedef struct RangeTblEntry
 	 * tables to be invalidated if the underlying table is altered.
 	 */
 	/* OID of the relation */
-	Oid			relid;
+	Oid			relid pg_node_attr(query_jumble_ignore);
 	/* inheritance requested? */
 	bool		inh;
 	/* relation kind (see pg_class.relkind) */