Attached is an update to contrib/tablefunc. It implements a new hashed

version of crosstab. This fixes a major deficiency in real-world use of the original version. Easiest to undestand with an illustration: Data: ------------------------------------------------------------------- select * from cth; id | rowid | rowdt | attribute | val ----+-------+---------------------+----------------+--------------- 1 | test1 | 2003-03-01 00:00:00 | temperature | 42 2 | test1 | 2003-03-01 00:00:00 | test_result | PASS 3 | test1 | 2003-03-01 00:00:00 | volts | 2.6987 4 | test2 | 2003-03-02 00:00:00 | temperature | 53 5 | test2 | 2003-03-02 00:00:00 | test_result | FAIL 6 | test2 | 2003-03-02 00:00:00 | test_startdate | 01 March 2003 7 | test2 | 2003-03-02 00:00:00 | volts | 3.1234 (7 rows) Original crosstab: ------------------------------------------------------------------- SELECT * FROM crosstab( 'SELECT rowid, attribute, val FROM cth ORDER BY 1,2',4) AS c(rowid text, temperature text, test_result text, test_startdate text, volts text); rowid | temperature | test_result | test_startdate | volts -------+-------------+-------------+----------------+-------- test1 | 42 | PASS | 2.6987 | test2 | 53 | FAIL | 01 March 2003 | 3.1234 (2 rows) Hashed crosstab: ------------------------------------------------------------------- SELECT * FROM crosstab( 'SELECT rowid, attribute, val FROM cth ORDER BY 1', 'SELECT DISTINCT attribute FROM cth ORDER BY 1') AS c(rowid text, temperature int4, test_result text, test_startdate timestamp, volts float8); rowid | temperature | test_result | test_startdate | volts -------+-------------+-------------+---------------------+-------- test1 | 42 | PASS | | 2.6987 test2 | 53 | FAIL | 2003-03-01 00:00:00 | 3.1234 (2 rows) Notice that the original crosstab slides data over to the left in the result tuple when it encounters missing data. In order to work around this you have to be make your source sql do all sorts of contortions (cartesian join of distinct rowid with distinct attribute; left join that back to the real source data). The new version avoids this by building a hash table using a second distinct attribute query. The new version also allows for "extra" columns (see the README) and allows the result columns to be coerced into differing datatypes if they are suitable (as shown above). In testing a "real-world" data set (69 distinct rowid's, 27 distinct categories/attributes, multiple missing data points) I saw about a 5-fold improvement in execution time (from about 2200 ms old, to 440 ms new). I left the original version intact because: 1) BC, 2) it is probably slightly faster if you know that you have no missing attributes. README and regression test adjustments included. If there are no objections, please apply. Joe Conway
2025-07-27 12:41:57 +03:00 · 2003-03-20 06:46:30 +00:00
parent add932ee91
commit 64d0b8b05f
6 changed files with 655 additions and 1 deletions
--- a/contrib/tablefunc/tablefunc.c
+++ b/contrib/tablefunc/tablefunc.c
@ -39,6 +39,11 @@

 #include "tablefunc.h"

+static int load_categories_hash(char *cats_sql, MemoryContext per_query_ctx);
+static Tuplestorestate *get_crosstab_tuplestore(char *sql,
+												int num_categories,
+												TupleDesc tupdesc,
+												MemoryContext per_query_ctx);
 static void validateConnectbyTupleDesc(TupleDesc tupdesc, bool show_branch);
 static bool compatCrosstabTupleDescs(TupleDesc tupdesc1, TupleDesc tupdesc2);
 static bool compatConnectbyTupleDescs(TupleDesc tupdesc1, TupleDesc tupdesc2);
@ -95,6 +100,67 @@ typedef struct
 /* sign, 10 digits, '\0' */
 #define INT32_STRLEN	12

+/* hash table support */
+static HTAB *crosstab_HashTable;
+
+/* The information we cache about loaded procedures */
+typedef struct crosstab_cat_desc
+{
+	char	   *catname;
+	int			attidx;	/* zero based */
+}	crosstab_cat_desc;
+
+#define MAX_CATNAME_LEN			NAMEDATALEN
+#define INIT_CATS				64
+
+#define crosstab_HashTableLookup(CATNAME, CATDESC) \
+do { \
+	crosstab_HashEnt *hentry; char key[MAX_CATNAME_LEN]; \
+	\
+	MemSet(key, 0, MAX_CATNAME_LEN); \
+	snprintf(key, MAX_CATNAME_LEN - 1, "%s", CATNAME); \
+	hentry = (crosstab_HashEnt*) hash_search(crosstab_HashTable, \
+										 key, HASH_FIND, NULL); \
+	if (hentry) \
+		CATDESC = hentry->catdesc; \
+	else \
+		CATDESC = NULL; \
+} while(0)
+
+#define crosstab_HashTableInsert(CATDESC) \
+do { \
+	crosstab_HashEnt *hentry; bool found; char key[MAX_CATNAME_LEN]; \
+	\
+	MemSet(key, 0, MAX_CATNAME_LEN); \
+	snprintf(key, MAX_CATNAME_LEN - 1, "%s", CATDESC->catname); \
+	hentry = (crosstab_HashEnt*) hash_search(crosstab_HashTable, \
+										 key, HASH_ENTER, &found); \
+	if (hentry == NULL) \
+		elog(ERROR, "out of memory in crosstab_HashTable"); \
+	if (found) \
+		elog(ERROR, "trying to use a category name more than once"); \
+	hentry->catdesc = CATDESC; \
+} while(0)
+
+#define crosstab_HashTableDelete(CATNAME) \
+do { \
+	crosstab_HashEnt *hentry; char key[MAX_CATNAME_LEN]; \
+	\
+	MemSet(key, 0, MAX_CATNAME_LEN); \
+	snprintf(key, MAX_CATNAME_LEN - 1, "%s", CATNAME); \
+	hentry = (crosstab_HashEnt*) hash_search(crosstab_HashTable, \
+										 key, HASH_REMOVE, NULL); \
+	if (hentry == NULL) \
+		elog(WARNING, "trying to delete function name that does not exist."); \
+} while(0)
+
+/* hash table */
+typedef struct crosstab_hashent
+{
+	char				internal_catname[MAX_CATNAME_LEN];
+	crosstab_cat_desc  *catdesc;
+} crosstab_HashEnt;
+
 /*
 * normal_rand - return requested number of random values
 * with a Gaussian (Normal) distribution.
@ -592,6 +658,341 @@ crosstab(PG_FUNCTION_ARGS)
 	}
 }

+/*
+ * crosstab_hash - reimplement crosstab as materialized function and 
+ * properly deal with missing values (i.e. don't pack remaining
+ * values to the left)
+ * 
+ * crosstab - create a crosstab of rowids and values columns from a
+ * SQL statement returning one rowid column, one category column,
+ * and one value column.
+ *
+ * e.g. given sql which produces:
+ *
+ *			rowid	cat		value
+ *			------+-------+-------
+ *			row1	cat1	val1
+ *			row1	cat2	val2
+ *			row1	cat4	val4
+ *			row2	cat1	val5
+ *			row2	cat2	val6
+ *			row2	cat3	val7
+ *			row2	cat4	val8
+ *
+ * crosstab returns:
+ *					<===== values columns =====>
+ *			rowid	cat1	cat2	cat3	cat4
+ *			------+-------+-------+-------+-------
+ *			row1	val1	val2	null	val4
+ *			row2	val5	val6	val7	val8
+ *
+ * NOTES:
+ * 1. SQL result must be ordered by 1.
+ * 2. The number of values columns depends on the tuple description
+ *	  of the function's declared return type.
+ * 2. Missing values (i.e. missing category) are filled in with nulls.
+ * 3. Extra values (i.e. not in category results) are skipped.
+ */
+PG_FUNCTION_INFO_V1(crosstab_hash);
+Datum
+crosstab_hash(PG_FUNCTION_ARGS)
+{
+	char			   *sql = GET_STR(PG_GETARG_TEXT_P(0));
+	char			   *cats_sql = GET_STR(PG_GETARG_TEXT_P(1));
+	ReturnSetInfo	   *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+	TupleDesc			tupdesc;
+	MemoryContext		per_query_ctx;
+	MemoryContext		oldcontext;
+	int					num_categories;
+
+	/* check to see if caller supports us returning a tuplestore */
+	if (!rsinfo || !(rsinfo->allowedModes & SFRM_Materialize))
+		elog(ERROR, "crosstab: materialize mode required, but it is not "
+			 "allowed in this context");
+
+	per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
+	oldcontext = MemoryContextSwitchTo(per_query_ctx);
+
+	/* get the requested return tuple description */
+	tupdesc = CreateTupleDescCopy(rsinfo->expectedDesc);
+
+	/*
+	 * Check to make sure we have a reasonable tuple descriptor
+	 *
+	 * Note we will attempt to coerce the values into whatever
+	 * the return attribute type is and depend on the "in"
+	 * function to complain if needed.
+	 */
+	if (tupdesc->natts < 2)
+		elog(ERROR, "crosstab: query-specified return tuple and " \
+					"crosstab function are not compatible");
+
+	/* load up the categories hash table */
+	num_categories = load_categories_hash(cats_sql, per_query_ctx);
+
+	/* let the caller know we're sending back a tuplestore */
+	rsinfo->returnMode = SFRM_Materialize;
+
+	/* now go build it */
+	rsinfo->setResult = get_crosstab_tuplestore(sql,
+												num_categories,
+												tupdesc,
+												per_query_ctx);
+
+	/*
+	 * SFRM_Materialize mode expects us to return a NULL Datum. The actual
+	 * tuples are in our tuplestore and passed back through
+	 * rsinfo->setResult. rsinfo->setDesc is set to the tuple description
+	 * that we actually used to build our tuples with, so the caller can
+	 * verify we did what it was expecting.
+	 */
+	rsinfo->setDesc = tupdesc;
+	MemoryContextSwitchTo(oldcontext);
+
+	return (Datum) 0;
+}
+
+/*
+ * load up the categories hash table
+ */
+static int
+load_categories_hash(char *cats_sql, MemoryContext per_query_ctx)
+{
+	HASHCTL			ctl;
+	int				ret;
+	int				proc;
+	MemoryContext	SPIcontext;
+	int				num_categories = 0;
+
+	/* initialize the category hash table */
+	ctl.keysize = MAX_CATNAME_LEN;
+	ctl.entrysize = sizeof(crosstab_HashEnt);
+
+	/*
+	 * use INIT_CATS, defined above as a guess of how
+	 * many hash table entries to create, initially
+	 */
+	crosstab_HashTable = hash_create("crosstab hash", INIT_CATS, &ctl, HASH_ELEM);
+
+	/* Connect to SPI manager */
+	if ((ret = SPI_connect()) < 0)
+		elog(ERROR, "load_categories_hash: SPI_connect returned %d", ret);
+
+	/* Retrieve the category name rows */
+	ret = SPI_exec(cats_sql, 0);
+	num_categories = proc = SPI_processed;
+
+	/* Check for qualifying tuples */
+	if ((ret == SPI_OK_SELECT) && (proc > 0))
+	{
+		SPITupleTable  *spi_tuptable = SPI_tuptable;
+		TupleDesc		spi_tupdesc = spi_tuptable->tupdesc;
+		int				i;
+
+		/*
+		 * The provided categories SQL query must always return one column:
+		 * category - the label or identifier for each column
+		 */
+		if (spi_tupdesc->natts != 1)
+			elog(ERROR, "load_categories_hash: provided categories SQL must " \
+						"return 1 column of at least one row");
+
+		for (i = 0; i < proc; i++)
+		{
+			crosstab_cat_desc  *catdesc;
+			char			   *catname;
+			HeapTuple			spi_tuple;
+
+			/* get the next sql result tuple */
+			spi_tuple = spi_tuptable->vals[i];
+
+			/* get the category from the current sql result tuple */
+			catname = SPI_getvalue(spi_tuple, spi_tupdesc, 1);
+
+			SPIcontext = MemoryContextSwitchTo(per_query_ctx);
+
+			catdesc = (crosstab_cat_desc *) palloc(sizeof(crosstab_cat_desc));
+			catdesc->catname = catname;
+			catdesc->attidx = i;
+
+			/* Add the proc description block to the hashtable */
+			crosstab_HashTableInsert(catdesc);
+
+			MemoryContextSwitchTo(SPIcontext);
+		}
+	}
+	else
+	{
+		/* no qualifying tuples */
+		SPI_finish();
+		elog(ERROR, "load_categories_hash: provided categories SQL must " \
+					"return 1 column of at least one row");
+	}
+
+	if (SPI_finish() != SPI_OK_FINISH)
+		elog(ERROR, "load_categories_hash: SPI_finish() failed");
+
+	return num_categories;
+}
+
+/*
+ * create and populate the crosstab tuplestore using the provided source query
+ */
+static Tuplestorestate *
+get_crosstab_tuplestore(char *sql,
+						int num_categories,
+						TupleDesc tupdesc,
+						MemoryContext per_query_ctx)
+{
+	Tuplestorestate	   *tupstore;
+	AttInMetadata	   *attinmeta = TupleDescGetAttInMetadata(tupdesc);
+	char			  **values;
+	HeapTuple			tuple;
+	int					ret;
+	int					proc;
+	MemoryContext		SPIcontext;
+
+	/* initialize our tuplestore */
+	tupstore = tuplestore_begin_heap(true, SortMem);
+
+	/* Connect to SPI manager */
+	if ((ret = SPI_connect()) < 0)
+		elog(ERROR, "get_crosstab_tuplestore: SPI_connect returned %d", ret);
+
+	/* Now retrieve the crosstab source rows */
+	ret = SPI_exec(sql, 0);
+	proc = SPI_processed;
+
+	/* Check for qualifying tuples */
+	if ((ret == SPI_OK_SELECT) && (proc > 0))
+	{
+		SPITupleTable  *spi_tuptable = SPI_tuptable;
+		TupleDesc		spi_tupdesc = spi_tuptable->tupdesc;
+		int				ncols = spi_tupdesc->natts;
+		char		   *rowid;
+		char		   *lastrowid = NULL;
+		int				i, j;
+		int				result_ncols;
+
+		/*
+		 * The provided SQL query must always return at least three columns:
+		 *
+		 * 1. rowname	the label for each row - column 1 in the final result
+		 * 2. category  the label for each value-column in the final result
+		 * 3. value		the values used to populate the value-columns
+		 * 
+		 * If there are more than three columns, the last two are taken as
+		 * "category" and "values". The first column is taken as "rowname".
+		 * Additional columns (2 thru N-2) are assumed the same for the same
+		 * "rowname", and are copied into the result tuple from the first
+		 * time we encounter a particular rowname.
+		 */
+		if (ncols < 3)
+			elog(ERROR, "get_crosstab_tuplestore: provided source SQL must " \
+						"return at least 3 columns; a rowid, a category, " \
+						"and a values column");
+
+		result_ncols = (ncols - 2) + num_categories;
+
+		/* Recheck to make sure we tuple descriptor still looks reasonable */
+		if (tupdesc->natts != result_ncols)
+			elog(ERROR, "get_crosstab_tuplestore: query-specified return " \
+						"tuple has %d columns but crosstab returns %d",
+						 tupdesc->natts, result_ncols);
+
+		/* allocate space */
+		values = (char **) palloc(result_ncols * sizeof(char *));
+
+		/* and make sure it's clear */
+		memset(values, '\0', result_ncols * sizeof(char *));
+
+		for (i = 0; i < proc; i++)
+		{
+			HeapTuple			spi_tuple;
+			crosstab_cat_desc  *catdesc;
+			char			   *catname;
+
+			/* get the next sql result tuple */
+			spi_tuple = spi_tuptable->vals[i];
+
+			/* get the rowid from the current sql result tuple */
+			rowid = SPI_getvalue(spi_tuple, spi_tupdesc, 1);
+
+			/* if rowid is null, skip this tuple entirely */
+			if (rowid == NULL)
+				continue;
+
+			/*
+			 * if we're on a new output row, grab the column values up to
+			 * column N-2 now
+			 */
+			if ((lastrowid == NULL) || (strcmp(rowid, lastrowid) != 0))
+			{
+				/*
+				 * a new row means we need to flush the old one first,
+				 * unless we're on the very first row
+				 */
+				if (lastrowid != NULL)
+				{
+					/* switch to appropriate context while storing the tuple */
+					SPIcontext = MemoryContextSwitchTo(per_query_ctx);
+
+					/* rowid changed, flush the previous output row */
+					tuple = BuildTupleFromCStrings(attinmeta, values);
+					tuplestore_puttuple(tupstore, tuple);
+					for (j = 0; j < result_ncols; j++)
+						xpfree(values[j]);
+
+					/* now reset the context */
+					MemoryContextSwitchTo(SPIcontext);
+				}
+
+				values[0] = rowid;
+				for (j = 1; j < ncols - 2; j++)
+					values[j] = SPI_getvalue(spi_tuple, spi_tupdesc, j + 1);
+			}
+
+			/* look up the category and fill in the appropriate column */
+			catname = SPI_getvalue(spi_tuple, spi_tupdesc, ncols - 1);
+
+			if (catname != NULL)
+			{
+				crosstab_HashTableLookup(catname, catdesc);
+
+				if (catdesc)
+					values[catdesc->attidx + ncols - 2] = 
+						SPI_getvalue(spi_tuple, spi_tupdesc, ncols);
+			}
+
+			xpfree(lastrowid);
+			lastrowid = pstrdup(rowid);
+		}
+
+		/* switch to appropriate context while storing the tuple */
+		SPIcontext = MemoryContextSwitchTo(per_query_ctx);
+
+		/* flush the last output row */
+		tuple = BuildTupleFromCStrings(attinmeta, values);
+		tuplestore_puttuple(tupstore, tuple);
+
+		/* now reset the context */
+		MemoryContextSwitchTo(SPIcontext);
+
+	}
+	else
+	{
+		/* no qualifying tuples */
+		SPI_finish();
+	}
+
+	if (SPI_finish() != SPI_OK_FINISH)
+		elog(ERROR, "get_crosstab_tuplestore: SPI_finish() failed");
+
+	tuplestore_donestoring(tupstore);
+
+	return tupstore;
+}
+
 /*
 * connectby_text - produce a result set from a hierarchical (parent/child)
 * table.
@ -668,7 +1069,7 @@ connectby_text(PG_FUNCTION_ARGS)
 	attinmeta = TupleDescGetAttInMetadata(tupdesc);

 	/* check to see if caller supports us returning a tuplestore */
-	if (!rsinfo->allowedModes & SFRM_Materialize)
+	if (!rsinfo || !(rsinfo->allowedModes & SFRM_Materialize))
 		elog(ERROR, "connectby requires Materialize mode, but it is not "
 			 "allowed in this context");