Support XMLTABLE query expression

XMLTABLE is defined by the SQL/XML standard as a feature that allows turning XML-formatted data into relational form, so that it can be used as a <table primary> in the FROM clause of a query. This new construct provides significant simplicity and performance benefit for XML data processing; what in a client-side custom implementation was reported to take 20 minutes can be executed in 400ms using XMLTABLE. (The same functionality was said to take 10 seconds using nested PostgreSQL XPath function calls, and 5 seconds using XMLReader under PL/Python). The implemented syntax deviates slightly from what the standard requires. First, the standard indicates that the PASSING clause is optional and that multiple XML input documents may be given to it; we make it mandatory and accept a single document only. Second, we don't currently support a default namespace to be specified. This implementation relies on a new executor node based on a hardcoded method table. (Because the grammar is fixed, there is no extensibility in the current approach; further constructs can be implemented on top of this such as JSON_TABLE, but they require changes to core code.) Author: Pavel Stehule, Álvaro Herrera Extensively reviewed by: Craig Ringer Discussion: https://postgr.es/m/CAFj8pRAgfzMD-LoSmnMGybD0WsEznLHWap8DO79+-GTRAPR4qA@mail.gmail.com
2025-07-15 19:21:59 +03:00 · 2017-03-08 12:39:37 -03:00
parent 270d7dd8a5
commit fcec6caafa
52 changed files with 4606 additions and 50 deletions
--- a/src/backend/optimizer/path/allpaths.c
+++ b/src/backend/optimizer/path/allpaths.c
@ -106,6 +106,8 @@ static void set_function_pathlist(PlannerInfo *root, RelOptInfo *rel,
 					  RangeTblEntry *rte);
 static void set_values_pathlist(PlannerInfo *root, RelOptInfo *rel,
 					RangeTblEntry *rte);
+static void set_tablefunc_pathlist(PlannerInfo *root, RelOptInfo *rel,
+					   RangeTblEntry *rte);
 static void set_cte_pathlist(PlannerInfo *root, RelOptInfo *rel,
 				 RangeTblEntry *rte);
 static void set_worktable_pathlist(PlannerInfo *root, RelOptInfo *rel,
@ -365,6 +367,9 @@ set_rel_size(PlannerInfo *root, RelOptInfo *rel,
 			case RTE_FUNCTION:
 				set_function_size_estimates(root, rel);
 				break;
+			case RTE_TABLEFUNC:
+				set_tablefunc_size_estimates(root, rel);
+				break;
 			case RTE_VALUES:
 				set_values_size_estimates(root, rel);
 				break;
@ -437,6 +442,10 @@ set_rel_pathlist(PlannerInfo *root, RelOptInfo *rel,
 				/* RangeFunction */
 				set_function_pathlist(root, rel, rte);
 				break;
+			case RTE_TABLEFUNC:
+				/* Table Function */
+				set_tablefunc_pathlist(root, rel, rte);
+				break;
 			case RTE_VALUES:
 				/* Values list */
 				set_values_pathlist(root, rel, rte);
@ -599,6 +608,10 @@ set_rel_consider_parallel(PlannerInfo *root, RelOptInfo *rel,
 				return;
 			break;

+		case RTE_TABLEFUNC:
+			/* not parallel safe */
+			return;
+
 		case RTE_VALUES:
 			/* Check for parallel-restricted functions. */
 			if (!is_parallel_safe(root, (Node *) rte->values_lists))
@ -1932,6 +1945,27 @@ set_values_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte)
 	add_path(rel, create_valuesscan_path(root, rel, required_outer));
 }

+/*
+ * set_tablefunc_pathlist
+ *		Build the (single) access path for a table function (RTE_TABLEFUNC) RTE
+ */
+static void
+set_tablefunc_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte)
+{
+	Relids		required_outer;
+
+	/*
+	 * We don't support pushing join clauses into the quals of a tablefunc
+	 * scan, but it could still have required parameterization due to LATERAL
+	 * refs in the table function's expressions; take that from the rel.
+	 */
+	required_outer = rel->lateral_relids;
+
+	/* A tablefunc scan path is the only path added for this rel here */
+	add_path(rel, create_tablefuncscan_path(root, rel,
+											required_outer));
+}
+
 /*
 * set_cte_pathlist
 *		Build the (single) access path for a non-self-reference CTE RTE
@ -3032,6 +3066,9 @@ print_path(PlannerInfo *root, Path *path, int indent)
 				case T_FunctionScan:
 					ptype = "FunctionScan";
 					break;
+				case T_TableFuncScan:
+					ptype = "TableFuncScan";
+					break;
 				case T_ValuesScan:
 					ptype = "ValuesScan";
 					break;
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@ -1277,6 +1277,62 @@ cost_functionscan(Path *path, PlannerInfo *root,
 	path->total_cost = startup_cost + run_cost;
 }

+/*
+ * cost_tablefuncscan
+ *	  Determines the cost of scanning a table function, storing the row
+ *	  estimate and the startup/total costs into 'path'.
+ * 'baserel' is the relation to be scanned
+ * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL
+ */
+void
+cost_tablefuncscan(Path *path, PlannerInfo *root,
+				   RelOptInfo *baserel, ParamPathInfo *param_info)
+{
+	Cost		startup_cost = 0;
+	Cost		run_cost = 0;
+	QualCost	qpqual_cost;
+	Cost		cpu_per_tuple;
+	RangeTblEntry *rte;
+	QualCost	exprcost;
+
+	/* Should only be applied to base relations that are table functions */
+	Assert(baserel->relid > 0);
+	rte = planner_rt_fetch(baserel->relid, root);
+	Assert(rte->rtekind == RTE_TABLEFUNC);
+
+	/* Mark the path with the correct row estimate */
+	if (param_info)
+		path->rows = param_info->ppi_rows;
+	else
+		path->rows = baserel->rows;
+
+	/*
+	 * Estimate costs of executing the table func expression(s).  Both the
+	 * startup and per-tuple components are charged as startup cost below.
+	 * XXX in principle we ought to charge tuplestore spill costs if the
+	 * number of rows is large.  However, given how phony our rowcount
+	 * estimates for tablefuncs tend to be, there's not a lot of point in that
+	 * refinement right now.
+	 */
+	cost_qual_eval_node(&exprcost, (Node *) rte->tablefunc, root);
+
+	startup_cost += exprcost.startup + exprcost.per_tuple;
+
+	/* Add scanning CPU costs */
+	get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost);
+
+	startup_cost += qpqual_cost.startup;
+	cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
+	run_cost += cpu_per_tuple * baserel->tuples;
+
+	/* tlist eval costs are paid per output row, not per tuple scanned */
+	startup_cost += path->pathtarget->cost.startup;
+	run_cost += path->pathtarget->cost.per_tuple * path->rows;
+
+	path->startup_cost = startup_cost;
+	path->total_cost = startup_cost + run_cost;
+}
+
 /*
 * cost_valuesscan
 *	  Determines and returns the cost of scanning a VALUES RTE.
@ -4421,6 +4477,31 @@ set_function_size_estimates(PlannerInfo *root, RelOptInfo *rel)
 	set_baserel_size_estimates(root, rel);
 }

+/*
+ * set_tablefunc_size_estimates
+ *		Set the size estimates for a base relation that is a table function.
+ *
+ * The rel's targetlist and restrictinfo list must have been constructed
+ * already.
+ * rel->tuples is set to a hardwired guess of 100; set_baserel_size_estimates
+ * is then called to estimate the number of output rows, etc.
+ */
+void
+set_tablefunc_size_estimates(PlannerInfo *root, RelOptInfo *rel)
+{
+	RangeTblEntry *rte;
+
+	/* Should only be applied to base relations that are table functions */
+	Assert(rel->relid > 0);
+	rte = planner_rt_fetch(rel->relid, root);
+	Assert(rte->rtekind == RTE_TABLEFUNC);
+
+	rel->tuples = 100;
+
+	/* Now estimate number of output rows, etc */
+	set_baserel_size_estimates(root, rel);
+}
+
 /*
 * set_values_size_estimates
 *		Set the size estimates for a base relation that is a values list.