From b81ffa13e35693af8accf00f1694bbeca6809fb6 Mon Sep 17 00:00:00 2001
From: Jeff Davis <jdavis@postgresql.org>
Date: Thu, 3 Apr 2025 10:45:38 -0700
Subject: [PATCH] pg_upgrade check for Unicode-dependent relations.

This check will not cause an upgrade failure, only a warning.

Discussion: https://postgr.es/m/ef03d678b39a64392f4b12e0f59d1495c740969e.camel%40j-davis.com
Reviewed-by: Peter Eisentraut <peter@eisentraut.org>
---
 src/bin/pg_upgrade/check.c | 180 +++++++++++++++++++++++++++++++++++++
 1 file changed, 180 insertions(+)

diff --git a/src/bin/pg_upgrade/check.c b/src/bin/pg_upgrade/check.c
index 02d9146e5ed..3189de4df96 100644
--- a/src/bin/pg_upgrade/check.c
+++ b/src/bin/pg_upgrade/check.c
@@ -13,6 +13,7 @@
 #include "catalog/pg_class_d.h"
 #include "fe_utils/string_utils.h"
 #include "pg_upgrade.h"
+#include "common/unicode_version.h"
 
 static void check_new_cluster_is_empty(void);
 static void check_is_install_user(ClusterInfo *cluster);
@@ -25,6 +26,7 @@ static void check_for_tables_with_oids(ClusterInfo *cluster);
 static void check_for_pg_role_prefix(ClusterInfo *cluster);
 static void check_for_new_tablespace_dir(void);
 static void check_for_user_defined_encoding_conversions(ClusterInfo *cluster);
+static void check_for_unicode_update(ClusterInfo *cluster);
 static void check_new_cluster_logical_replication_slots(void);
 static void check_new_cluster_subscription_configuration(void);
 static void check_old_cluster_for_valid_slots(void);
@@ -633,6 +635,12 @@ check_and_dump_old_cluster(void)
 
 	check_for_data_types_usage(&old_cluster);
 
+	/*
+	 * Unicode updates can affect some objects that use expressions with
+	 * functions dependent on Unicode.
+	 */
+	check_for_unicode_update(&old_cluster);
+
 	/*
 	 * PG 14 changed the function signature of encoding conversion functions.
 	 * Conversions from older versions cannot be upgraded automatically
@@ -1754,6 +1762,178 @@ check_for_user_defined_encoding_conversions(ClusterInfo *cluster)
 		check_ok();
 }
 
+/*
+ * UpgradeTask callback for the check_for_unicode_update() query.  Each row
+ * of the result names one flagged relation; append them all to the report
+ * file, opening that file first if it is not already open.
+ */
+static void
+process_unicode_update(DbInfo *dbinfo, PGresult *res, void *arg)
+{
+	UpgradeTaskReport *rpt = (UpgradeTaskReport *) arg;
+	const int	nrows = PQntuples(res);
+	const int	oid_col = PQfnumber(res, "reloid");
+	const int	nsp_col = PQfnumber(res, "nspname");
+	const int	rel_col = PQfnumber(res, "relname");
+
+	if (nrows == 0)
+		return;
+
+	if (rpt->file == NULL)
+	{
+		rpt->file = fopen_priv(rpt->path, "w");
+		if (rpt->file == NULL)
+			pg_fatal("could not open file \"%s\": %m", rpt->path);
+	}
+
+	fprintf(rpt->file, "In database: %s\n", dbinfo->db_name);
+	for (int r = 0; r < nrows; r++)
+		fprintf(rpt->file, "  (oid=%s) %s.%s\n", PQgetvalue(res, r, oid_col),
+				PQgetvalue(res, r, nsp_col), PQgetvalue(res, r, rel_col));
+}
+
+/*
+ * Report whether the Unicode version returned by the given cluster's server
+ * differs from PG_UNICODE_VERSION, the version known at compile time here.
+ */
+static bool
+unicode_version_changed(ClusterInfo *cluster)
+{
+	PGconn	   *conn = connectToServer(cluster, "template1");
+	PGresult   *result;
+	bool		differs;
+
+	result = executeQueryOrDie(conn, "SELECT unicode_version()");
+
+	/* compare before PQclear(), which releases the value's storage */
+	differs = strcmp(PQgetvalue(result, 0, 0), PG_UNICODE_VERSION) != 0;
+
+	PQclear(result);
+	PQfinish(conn);
+
+	return differs;
+}
+
+/*
+ * check_for_unicode_update()
+ *
+ * Check if the version of Unicode in the old server and the new server
+ * differ. If so, check for indexes, partitioned tables, or constraints that
+ * use expressions with functions dependent on Unicode behavior.
+ *
+ * Matching relations are listed in unicode_dependent_rels.txt under
+ * log_opts.basedir, and a warning (not a fatal error) is reported.
+ */
+static void
+check_for_unicode_update(ClusterInfo *cluster)
+{
+	UpgradeTaskReport report;
+	UpgradeTask *task;
+	const char *query;
+
+	/*
+	 * The builtin provider did not exist prior to version 17. While there are
+	 * still problems that could potentially be caught from earlier versions,
+	 * such as an index on NORMALIZE(), we don't check for that here.
+	 */
+	if (GET_MAJOR_VERSION(cluster->major_version) < 1700)
+		return;
+
+	prep_status("Checking for objects affected by Unicode update");
+
+	if (!unicode_version_changed(cluster))
+	{
+		check_ok();
+		return;
+	}
+
+	report.file = NULL;
+	snprintf(report.path, sizeof(report.path), "%s/%s",
+			 log_opts.basedir, "unicode_dependent_rels.txt");
+
+	query =
+	/* collations that use built-in Unicode for character semantics */
+		"WITH collations(collid) AS ( "
+		"  SELECT oid FROM pg_collation "
+		"  WHERE collprovider='b' AND colllocale IN ('C.UTF-8','PG_UNICODE_FAST') "
+	/* include default collation, if appropriate */
+		"  UNION "
+		"  SELECT 'pg_catalog.default'::regcollation FROM pg_database "
+		"  WHERE datname = current_database() AND "
+		"  datlocprovider='b' AND datlocale IN ('C.UTF-8','PG_UNICODE_FAST') "
+		"), "
+	/* functions that use built-in Unicode */
+		"functions(procid) AS ( "
+		"  SELECT proc.oid FROM pg_proc proc "
+		"  WHERE proname IN ('normalize','unicode_assigned','unicode_version','is_normalized') AND "
+		"        pronamespace='pg_catalog'::regnamespace "
+		"), "
+	/* operators that use the input collation for character semantics */
+		"coll_operators(operid, procid, collid) AS ( "
+		"  SELECT oper.oid, oper.oprcode, collid FROM pg_operator oper, collations "
+		"  WHERE oprname IN ('~', '~*', '!~', '!~*', '~~*', '!~~*') AND "
+		"        oprnamespace='pg_catalog'::regnamespace AND "
+		"        oprright='text'::regtype "
+		"), "
+	/* functions that use the input collation for character semantics */
+		"coll_functions(procid, collid) AS ( "
+		"  SELECT proc.oid, collid FROM pg_proc proc, collations "
+		"  WHERE proname IN ('lower','initcap','upper') AND "
+		"        pronamespace='pg_catalog'::regnamespace AND "
+		"        proargtypes[0] = 'text'::regtype "
+	/* include functions behind the operators listed above */
+		"  UNION "
+		"  SELECT procid, collid FROM coll_operators "
+		"), "
+
+	/* generate pg_node_tree search patterns for the above functions/operators */
+		"patterns(p) AS ( "
+		"  SELECT '{FUNCEXPR :funcid ' || procid::text || '[ }]' FROM functions "
+		"  UNION "
+		"  SELECT '{OPEXPR :opno ' || operid::text || ' (:\\w+ \\w+ )*' || "
+		"         ':inputcollid ' || collid::text || '[ }]' FROM coll_operators "
+		"  UNION "
+		"  SELECT '{FUNCEXPR :funcid ' || procid::text || ' (:\\w+ \\w+ )*' || "
+		"         ':inputcollid ' || collid::text || '[ }]' FROM coll_functions "
+		") "
+
+	/* match the patterns against expressions used for relation contents */
+		"SELECT reloid, relkind, nspname, relname "
+		"  FROM ( "
+		"    SELECT conrelid "
+		"    FROM pg_constraint, patterns WHERE conbin::text ~ p "
+		"  UNION "
+		"    SELECT indexrelid "
+		"    FROM pg_index, patterns WHERE indexprs::text ~ p OR indpred::text ~ p "
+		"  UNION "
+		"    SELECT partrelid "
+		"    FROM pg_partitioned_table, patterns WHERE partexprs::text ~ p "
+		"  ) s(reloid), pg_class c, pg_namespace n, pg_database d "
+		"  WHERE s.reloid = c.oid AND c.relnamespace = n.oid AND "
+		"        d.datname = current_database() AND "
+		"        d.encoding = pg_char_to_encoding('UTF8');";
+
+	/*
+	 * Create the task only now, past both early returns above, so that those
+	 * paths have nothing to free.
+	 */
+	task = upgrade_task_create();
+	upgrade_task_add_step(task, query, process_unicode_update, true, &report);
+	upgrade_task_run(task, cluster);
+	upgrade_task_free(task);
+
+	if (report.file)
+	{
+		fclose(report.file);
+		report_status(PG_WARNING, "warning");
+		pg_log(PG_WARNING, "Your installation contains relations that may be affected by a new version of Unicode.\n"
+			   "A list of potentially-affected relations is in the file:\n"
+			   "    %s", report.path);
+	}
+	else
+		check_ok();
+}
+
 /*
  * check_new_cluster_logical_replication_slots()
  *