mirror of
https://github.com/postgres/postgres.git
synced 2025-07-27 12:41:57 +03:00
Sample postgres_fdw tables remotely during ANALYZE
When collecting an ANALYZE sample for foreign tables, postgres_fdw fetched all rows and performed the sampling locally. For large tables this means transferring and immediately discarding large amounts of data. This commit allows the sampling to be performed on the remote server, transferring only the much smaller sample. The sampling is performed using the built-in TABLESAMPLE methods (system, bernoulli) or the random() function, depending on the remote server version. Remote sampling can be enabled by the analyze_sampling option on the foreign server and/or foreign table, with supported values 'off', 'auto', 'system', 'bernoulli' and 'random'. The default value is 'auto', which uses either 'bernoulli' (TABLESAMPLE method) or 'random' (for remote servers without TABLESAMPLE support).
This commit is contained in:
@ -2367,14 +2367,57 @@ deparseAnalyzeSizeSql(StringInfo buf, Relation rel)
|
||||
appendStringInfo(buf, "::pg_catalog.regclass) / %d", BLCKSZ);
|
||||
}
|
||||
|
||||
/*
|
||||
* Construct SELECT statement to acquire the number of rows of a relation.
|
||||
*
|
||||
* Note: we just return the remote server's reltuples value, which might
|
||||
* be off a good deal, but it doesn't seem worth working harder. See
|
||||
* comments in postgresAcquireSampleRowsFunc.
|
||||
*/
|
||||
void
|
||||
deparseAnalyzeTuplesSql(StringInfo buf, Relation rel)
|
||||
{
|
||||
StringInfoData relname;
|
||||
|
||||
/* We'll need the remote relation name as a literal. */
|
||||
initStringInfo(&relname);
|
||||
deparseRelation(&relname, rel);
|
||||
|
||||
appendStringInfoString(buf, "SELECT reltuples FROM pg_catalog.pg_class WHERE oid = ");
|
||||
deparseStringLiteral(buf, relname.data);
|
||||
appendStringInfoString(buf, "::pg_catalog.regclass");
|
||||
}
|
||||
|
||||
/*
|
||||
* Construct SELECT statement to acquire sample rows of given relation.
|
||||
*
|
||||
* SELECT command is appended to buf, and list of columns retrieved
|
||||
* is returned to *retrieved_attrs.
|
||||
*
|
||||
* We only support sampling methods we can decide based on server version.
|
||||
* Allowing custom TSM modules (like tsm_system_rows) might be useful, but it
|
||||
* would require detecting which extensions are installed, to allow automatic
|
||||
* fall-back. Moreover, the methods may use different parameters like number
|
||||
* of rows (and not sampling rate). So we leave this for future improvements.
|
||||
*
|
||||
* Using random() to sample rows on the remote server has the advantage that
|
||||
* this works on all PostgreSQL versions (unlike TABLESAMPLE), and that it
|
||||
* does the sampling on the remote side (without transferring everything and
|
||||
* then discarding most rows).
|
||||
*
|
||||
* The disadvantage is that we still have to read all rows and evaluate the
|
||||
* random(), while TABLESAMPLE (at least with the "system" method) may skip.
|
||||
* It's not that different from the "bernoulli" method, though.
|
||||
*
|
||||
* We could also do "ORDER BY random() LIMIT x", which would always pick
|
||||
* the expected number of rows, but it requires sorting so it may be much
|
||||
* more expensive (particularly on large tables, which is what the
|
||||
* remote sampling is meant to improve).
|
||||
*/
|
||||
void
|
||||
deparseAnalyzeSql(StringInfo buf, Relation rel, List **retrieved_attrs)
|
||||
deparseAnalyzeSql(StringInfo buf, Relation rel,
|
||||
PgFdwSamplingMethod sample_method, double sample_frac,
|
||||
List **retrieved_attrs)
|
||||
{
|
||||
Oid relid = RelationGetRelid(rel);
|
||||
TupleDesc tupdesc = RelationGetDescr(rel);
|
||||
@ -2422,10 +2465,35 @@ deparseAnalyzeSql(StringInfo buf, Relation rel, List **retrieved_attrs)
|
||||
appendStringInfoString(buf, "NULL");
|
||||
|
||||
/*
|
||||
* Construct FROM clause
|
||||
* Construct FROM clause, and perhaps WHERE clause too, depending on the
|
||||
* selected sampling method.
|
||||
*/
|
||||
appendStringInfoString(buf, " FROM ");
|
||||
deparseRelation(buf, rel);
|
||||
|
||||
switch (sample_method)
|
||||
{
|
||||
case ANALYZE_SAMPLE_OFF:
|
||||
/* nothing to do here */
|
||||
break;
|
||||
|
||||
case ANALYZE_SAMPLE_RANDOM:
|
||||
appendStringInfo(buf, " WHERE pg_catalog.random() < %f", sample_frac);
|
||||
break;
|
||||
|
||||
case ANALYZE_SAMPLE_SYSTEM:
|
||||
appendStringInfo(buf, " TABLESAMPLE SYSTEM(%f)", (100.0 * sample_frac));
|
||||
break;
|
||||
|
||||
case ANALYZE_SAMPLE_BERNOULLI:
|
||||
appendStringInfo(buf, " TABLESAMPLE BERNOULLI(%f)", (100.0 * sample_frac));
|
||||
break;
|
||||
|
||||
case ANALYZE_SAMPLE_AUTO:
|
||||
/* should have been resolved into actual method */
|
||||
elog(ERROR, "unexpected sampling method");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
|
Reference in New Issue
Block a user