diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c index 5d3bea09b1b..137b242ea97 100644 --- a/contrib/pg_stat_statements/pg_stat_statements.c +++ b/contrib/pg_stat_statements/pg_stat_statements.c @@ -3,6 +3,29 @@ * pg_stat_statements.c * Track statement execution times across a whole database cluster. * + * Execution costs are totalled for each distinct source query, and kept in + * a shared hashtable. (We track only as many distinct queries as will fit + * in the designated amount of shared memory.) + * + * As of Postgres 9.2, this module normalizes query entries. Normalization + * is a process whereby similar queries, typically differing only in their + * constants (though the exact rules are somewhat more subtle than that) are + * recognized as equivalent, and are tracked as a single entry. This is + * particularly useful for non-prepared queries. + * + * Normalization is implemented by fingerprinting queries, selectively + * serializing those fields of each query tree's nodes that are judged to be + * essential to the query. This is referred to as a query jumble. This is + * distinct from a regular serialization in that various extraneous + * information is ignored as irrelevant or not essential to the query, such + * as the collations of Vars and, most notably, the values of constants. + * + * This jumble is acquired at the end of parse analysis of each query, and + * a 32-bit hash of it is stored into the query's Query.queryId field. + * The server then copies this value around, making it available in plan + * tree(s) generated from the query. The executor can then use this value + * to blame query costs on the proper queryId. + * * Note about locking issues: to create or delete an entry in the shared * hashtable, one must hold pgss->lock exclusively. Modifying any field * in an entry except the counters requires the same. 
To look up an entry, @@ -27,6 +50,9 @@ #include "funcapi.h" #include "mb/pg_wchar.h" #include "miscadmin.h" +#include "parser/analyze.h" +#include "parser/parsetree.h" +#include "parser/scanner.h" #include "pgstat.h" #include "storage/fd.h" #include "storage/ipc.h" @@ -41,18 +67,20 @@ PG_MODULE_MAGIC; #define PGSS_DUMP_FILE "global/pg_stat_statements.stat" /* This constant defines the magic number in the stats file header */ -static const uint32 PGSS_FILE_HEADER = 0x20100108; +static const uint32 PGSS_FILE_HEADER = 0x20120328; /* XXX: Should USAGE_EXEC reflect execution time and/or buffer usage? */ #define USAGE_EXEC(duration) (1.0) #define USAGE_INIT (1.0) /* including initial planning */ +#define USAGE_NON_EXEC_STICK (3.0) /* to make new entries sticky */ #define USAGE_DECREASE_FACTOR (0.99) /* decreased every entry_dealloc */ #define USAGE_DEALLOC_PERCENT 5 /* free this % of entries at once */ +#define JUMBLE_SIZE 1024 /* query serialization buffer size */ + /* - * Hashtable key that defines the identity of a hashtable entry. The - * hash comparators do not assume that the query string is null-terminated; - * this lets us search for an mbcliplen'd string without copying it first. + * Hashtable key that defines the identity of a hashtable entry. We separate + * queries by user and by database even if they are otherwise identical. * * Presently, the query encoding is fully determined by the source database * and so we don't really need it to be in the key. 
But that might not always
@@ -63,8 +91,7 @@ typedef struct pgssHashKey
 	Oid userid; /* user OID */
 	Oid dbid; /* database OID */
 	int encoding; /* query encoding */
-	int query_len; /* # of valid bytes in query string */
-	const char *query_ptr; /* query string proper */
+	uint32 queryid; /* query identifier: hash of the query jumble, or of the raw text for utility statements */
 } pgssHashKey;
 
 /*
@@ -99,6 +126,7 @@ typedef struct pgssEntry
 {
 	pgssHashKey key; /* hash key of entry - MUST BE FIRST */
 	Counters counters; /* the statistics for this query */
+	int query_len; /* # of valid bytes in query[] (the stored text is also NUL-terminated by entry_alloc) */
 	slock_t mutex; /* protects the counters only */
 	char query[1]; /* VARIABLE LENGTH ARRAY - MUST BE LAST */
 	/* Note: the allocated length of query[] is actually pgss->query_size */
@@ -113,6 +141,37 @@ typedef struct pgssSharedState
 	int query_size; /* max query length in bytes */
 } pgssSharedState;
 
+/*
+ * Struct for tracking locations/lengths of constants during normalization
+ */
+typedef struct pgssLocationLen
+{
+	int location; /* start offset in query text (byte offset from the start of the string) */
+	int length; /* length in bytes, or -1 to ignore (the initial value, and what duplicates keep) */
+} pgssLocationLen;
+
+/*
+ * Working state for computing a query jumble and producing a normalized
+ * query string
+ */
+typedef struct pgssJumbleState
+{
+	/* Jumble of current query tree (a palloc'd buffer of JUMBLE_SIZE bytes) */
+	unsigned char *jumble;
+
+	/* Number of bytes used in jumble[] (never exceeds JUMBLE_SIZE) */
+	Size jumble_len;
+
+	/* Array of locations of constants that should be removed */
+	pgssLocationLen *clocations;
+
+	/* Allocated length of clocations array (doubled by RecordConstLocation when full) */
+	int clocations_buf_size;
+
+	/* Current number of valid entries in clocations array */
+	int clocations_count;
+} pgssJumbleState;
+
 /*---- Local variables ----*/
 
 /* Current nesting depth of ExecutorRun calls */
@@ -120,6 +179,7 @@ static int nested_level = 0;
 
 /* Saved hook values in case of unload */
 static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
+static post_parse_analyze_hook_type prev_post_parse_analyze_hook = NULL;
 static ExecutorStart_hook_type prev_ExecutorStart = NULL;
 static
ExecutorRun_hook_type prev_ExecutorRun = NULL; static ExecutorFinish_hook_type prev_ExecutorFinish = NULL; @@ -170,6 +230,7 @@ PG_FUNCTION_INFO_V1(pg_stat_statements); static void pgss_shmem_startup(void); static void pgss_shmem_shutdown(int code, Datum arg); +static void pgss_post_parse_analyze(ParseState *pstate, Query *query); static void pgss_ExecutorStart(QueryDesc *queryDesc, int eflags); static void pgss_ExecutorRun(QueryDesc *queryDesc, ScanDirection direction, @@ -181,12 +242,25 @@ static void pgss_ProcessUtility(Node *parsetree, DestReceiver *dest, char *completionTag); static uint32 pgss_hash_fn(const void *key, Size keysize); static int pgss_match_fn(const void *key1, const void *key2, Size keysize); -static void pgss_store(const char *query, double total_time, uint64 rows, - const BufferUsage *bufusage); +static uint32 pgss_hash_string(const char *str); +static void pgss_store(const char *query, uint32 queryId, + double total_time, uint64 rows, + const BufferUsage *bufusage, + pgssJumbleState * jstate); static Size pgss_memsize(void); -static pgssEntry *entry_alloc(pgssHashKey *key); +static pgssEntry *entry_alloc(pgssHashKey *key, const char *query, int query_len); static void entry_dealloc(void); static void entry_reset(void); +static void AppendJumble(pgssJumbleState * jstate, + const unsigned char *item, Size size); +static void JumbleQuery(pgssJumbleState * jstate, Query *query); +static void JumbleRangeTable(pgssJumbleState * jstate, List *rtable); +static void JumbleExpr(pgssJumbleState * jstate, Node *node); +static void RecordConstLocation(pgssJumbleState * jstate, int location); +static char *generate_normalized_query(pgssJumbleState * jstate, const char *query, + int *query_len_p, int encoding); +static void fill_in_constant_lengths(pgssJumbleState * jstate, const char *query); +static int comp_location(const void *a, const void *b); /* @@ -271,6 +345,8 @@ _PG_init(void) */ prev_shmem_startup_hook = shmem_startup_hook; shmem_startup_hook = 
pgss_shmem_startup; + prev_post_parse_analyze_hook = post_parse_analyze_hook; + post_parse_analyze_hook = pgss_post_parse_analyze; prev_ExecutorStart = ExecutorStart_hook; ExecutorStart_hook = pgss_ExecutorStart; prev_ExecutorRun = ExecutorRun_hook; @@ -291,6 +367,7 @@ _PG_fini(void) { /* Uninstall hooks. */ shmem_startup_hook = prev_shmem_startup_hook; + post_parse_analyze_hook = prev_post_parse_analyze_hook; ExecutorStart_hook = prev_ExecutorStart; ExecutorRun_hook = prev_ExecutorRun; ExecutorFinish_hook = prev_ExecutorFinish; @@ -400,26 +477,29 @@ pgss_shmem_startup(void) goto error; /* Previous incarnation might have had a larger query_size */ - if (temp.key.query_len >= buffer_size) + if (temp.query_len >= buffer_size) { - buffer = (char *) repalloc(buffer, temp.key.query_len + 1); - buffer_size = temp.key.query_len + 1; + buffer = (char *) repalloc(buffer, temp.query_len + 1); + buffer_size = temp.query_len + 1; } - if (fread(buffer, 1, temp.key.query_len, file) != temp.key.query_len) + if (fread(buffer, 1, temp.query_len, file) != temp.query_len) goto error; - buffer[temp.key.query_len] = '\0'; + buffer[temp.query_len] = '\0'; + + /* Skip loading "sticky" entries */ + if (temp.counters.calls == 0) + continue; /* Clip to available length if needed */ - if (temp.key.query_len >= query_size) - temp.key.query_len = pg_encoding_mbcliplen(temp.key.encoding, - buffer, - temp.key.query_len, - query_size - 1); - temp.key.query_ptr = buffer; + if (temp.query_len >= query_size) + temp.query_len = pg_encoding_mbcliplen(temp.key.encoding, + buffer, + temp.query_len, + query_size - 1); /* make the hashtable entry (discards old entries if too many) */ - entry = entry_alloc(&temp.key); + entry = entry_alloc(&temp.key, buffer, temp.query_len); /* copy in the actual stats */ entry->counters = temp.counters; @@ -481,7 +561,7 @@ pgss_shmem_shutdown(int code, Datum arg) hash_seq_init(&hash_seq, pgss_hash); while ((entry = hash_seq_search(&hash_seq)) != NULL) { - int len = 
entry->key.query_len;
+		int len = entry->query_len;
 
 		if (fwrite(entry, offsetof(pgssEntry, mutex), 1, file) != 1 ||
 			fwrite(entry->query, 1, len, file) != len)
@@ -506,6 +586,58 @@ error:
 	unlink(PGSS_DUMP_FILE);
 }
 
+/*
+ * Post-parse-analysis hook: mark query with a queryId (hash of its jumble); if constants were found, also pre-create a "sticky" entry holding the normalized text
+ */
+static void
+pgss_post_parse_analyze(ParseState *pstate, Query *query)
+{
+	pgssJumbleState jstate;
+	BufferUsage bufusage;
+
+	/* Assert we didn't do this already */
+	Assert(query->queryId == 0);
+
+	/* Safety check... (shared state may not have been set up) */
+	if (!pgss || !pgss_hash)
+		return;
+
+	/* We do nothing with utility statements at this stage; pgss_ProcessUtility hashes their text instead */
+	if (query->utilityStmt)
+		return;
+
+	/* Set up workspace for query jumbling */
+	jstate.jumble = (unsigned char *) palloc(JUMBLE_SIZE);
+	jstate.jumble_len = 0;
+	jstate.clocations_buf_size = 32; /* initial capacity; RecordConstLocation doubles it on demand */
+	jstate.clocations = (pgssLocationLen *)
+		palloc(jstate.clocations_buf_size * sizeof(pgssLocationLen));
+	jstate.clocations_count = 0;
+
+	/* Compute query ID and mark the Query node with it; hash_any() yields a Datum, so unwrap it explicitly */
+	JumbleQuery(&jstate, query);
+	query->queryId = DatumGetUInt32(hash_any(jstate.jumble, jstate.jumble_len));
+
+	/*
+	 * If we were able to identify any ignorable constants, we immediately
+	 * create a hash table entry for the query, so that we can record the
+	 * normalized form of the query string. If there were no such constants,
+	 * the normalized string would be the same as the query text anyway, so
+	 * there's no need for an early entry.
+	 */
+	if (jstate.clocations_count > 0)
+	{
+		memset(&bufusage, 0, sizeof(bufusage)); /* nothing has executed yet, so all buffer counters are zero */
+
+		pgss_store(pstate->p_sourcetext,
+				   query->queryId,
+				   0, /* total_time */
+				   0, /* rows */
+				   &bufusage,
+				   &jstate); /* non-NULL jstate: only record the normalized string */
+	}
+}
+
 /*
  * ExecutorStart hook: start up tracking if needed
  */
@@ -589,6 +721,11 @@ pgss_ExecutorEnd(QueryDesc *queryDesc)
 {
 	if (queryDesc->totaltime && pgss_enabled())
 	{
+		uint32 queryId;
+
+		/* Query's ID should have been filled in by post-analyze hook */
+		queryId = queryDesc->plannedstmt->queryId;
+
 		/*
 		 * Make sure stats accumulation is done.
(Note: it's okay if several * levels of hook all do this.) @@ -596,9 +733,11 @@ pgss_ExecutorEnd(QueryDesc *queryDesc) InstrEndLoop(queryDesc->totaltime); pgss_store(queryDesc->sourceText, + queryId, queryDesc->totaltime->total, queryDesc->estate->es_processed, - &queryDesc->totaltime->bufusage); + &queryDesc->totaltime->bufusage, + NULL); } if (prev_ExecutorEnd) @@ -620,7 +759,9 @@ pgss_ProcessUtility(Node *parsetree, const char *queryString, instr_time start; instr_time duration; uint64 rows = 0; - BufferUsage bufusage_start, bufusage; + BufferUsage bufusage_start, + bufusage; + uint32 queryId; bufusage_start = pgBufferUsage; INSTR_TIME_SET_CURRENT(start); @@ -677,8 +818,15 @@ pgss_ProcessUtility(Node *parsetree, const char *queryString, bufusage.time_write = pgBufferUsage.time_write; INSTR_TIME_SUBTRACT(bufusage.time_write, bufusage_start.time_write); - pgss_store(queryString, INSTR_TIME_GET_DOUBLE(duration), rows, - &bufusage); + /* For utility statements, we just hash the query string directly */ + queryId = pgss_hash_string(queryString); + + pgss_store(queryString, + queryId, + INSTR_TIME_GET_DOUBLE(duration), + rows, + &bufusage, + NULL); } else { @@ -702,8 +850,7 @@ pgss_hash_fn(const void *key, Size keysize) /* we don't bother to include encoding in the hash */ return hash_uint32((uint32) k->userid) ^ hash_uint32((uint32) k->dbid) ^ - DatumGetUInt32(hash_any((const unsigned char *) k->query_ptr, - k->query_len)); + hash_uint32((uint32) k->queryid); } /* @@ -718,23 +865,40 @@ pgss_match_fn(const void *key1, const void *key2, Size keysize) if (k1->userid == k2->userid && k1->dbid == k2->dbid && k1->encoding == k2->encoding && - k1->query_len == k2->query_len && - memcmp(k1->query_ptr, k2->query_ptr, k1->query_len) == 0) + k1->queryid == k2->queryid) return 0; else return 1; } +/* + * Given an arbitrarily long query string, produce a hash for the purposes of + * identifying the query, without normalizing constants. 
Used when hashing
+ * utility statements, or for legacy compatibility mode. str must be NUL-terminated (its length is taken with strlen).
+ */
+static uint32
+pgss_hash_string(const char *str)
+{
+	return DatumGetUInt32(hash_any((const unsigned char *) str, strlen(str)));
+}
+
 /*
  * Store some statistics for a statement.
+ *
+ * If jstate is not NULL then we're trying to create an entry for which
+ * we have no statistics as yet; we just want to record the normalized
+ * query string while we can. (In that case "calls" is left untouched.)
  */
 static void
-pgss_store(const char *query, double total_time, uint64 rows,
-		   const BufferUsage *bufusage)
+pgss_store(const char *query, uint32 queryId,
+		   double total_time, uint64 rows,
+		   const BufferUsage *bufusage,
+		   pgssJumbleState * jstate)
 {
 	pgssHashKey key;
 	double usage;
 	pgssEntry *entry;
+	char *norm_query = NULL; /* set only when a normalized string is generated; pfree'd at the end */
 
 	Assert(query != NULL);
 
@@ -746,34 +910,89 @@ pgss_store(const char *query, double total_time, uint64 rows,
 	key.userid = GetUserId();
 	key.dbid = MyDatabaseId;
 	key.encoding = GetDatabaseEncoding();
-	key.query_len = strlen(query);
-	if (key.query_len >= pgss->query_size)
-		key.query_len = pg_encoding_mbcliplen(key.encoding,
-											  query,
-											  key.query_len,
-											  pgss->query_size - 1);
-	key.query_ptr = query;
-
-	usage = USAGE_EXEC(duration);
+	key.queryid = queryId;
 
 	/* Lookup the hash table entry with shared lock. */
 	LWLockAcquire(pgss->lock, LW_SHARED);
 
 	entry = (pgssEntry *) hash_search(pgss_hash, &key, HASH_FIND, NULL);
+
+	/*
+	 * When creating an entry just to store the normalized string, make it
+	 * artificially sticky so that it will probably still be there when
+	 * executed. Strictly speaking, query strings are normalized on a best
+	 * effort basis, though it would be difficult to demonstrate this even
+	 * under artificial conditions.
+	 */
+	if (jstate && !entry)
+		usage = USAGE_NON_EXEC_STICK; /* larger than USAGE_INIT, so the entry outlives early deallocation passes */
+	else
+		usage = USAGE_EXEC(duration); /* the macro ignores its argument */
+
 	if (!entry)
 	{
-		/* Must acquire exclusive lock to add a new entry. */
+		int query_len;
+
+		/*
+		 * We'll need exclusive lock to make a new entry.
There is no point + * in holding shared lock while we normalize the string, though. + */ LWLockRelease(pgss->lock); - LWLockAcquire(pgss->lock, LW_EXCLUSIVE); - entry = entry_alloc(&key); + + query_len = strlen(query); + + if (jstate) + { + /* Normalize the string if enabled */ + norm_query = generate_normalized_query(jstate, query, + &query_len, + key.encoding); + + /* Acquire exclusive lock as required by entry_alloc() */ + LWLockAcquire(pgss->lock, LW_EXCLUSIVE); + + entry = entry_alloc(&key, norm_query, query_len); + } + else + { + /* + * We're just going to store the query string as-is; but we have + * to truncate it if over-length. + */ + if (query_len >= pgss->query_size) + query_len = pg_encoding_mbcliplen(key.encoding, + query, + query_len, + pgss->query_size - 1); + + /* Acquire exclusive lock as required by entry_alloc() */ + LWLockAcquire(pgss->lock, LW_EXCLUSIVE); + + entry = entry_alloc(&key, query, query_len); + } } - /* Grab the spinlock while updating the counters. */ + /* + * Grab the spinlock while updating the counters (see comment about + * locking rules at the head of the file) + */ { volatile pgssEntry *e = (volatile pgssEntry *) entry; SpinLockAcquire(&e->mutex); - e->counters.calls += 1; + + /* + * If we're entering real data, "unstick" entry if it was previously + * sticky, and then increment calls. 
+		 */
+		if (!jstate)
+		{
+			if (e->counters.calls == 0)
+				e->counters.usage = USAGE_INIT; /* drop the sticky bonus now that real data arrives */
+
+			e->counters.calls += 1;
+		}
+
 		e->counters.total_time += total_time;
 		e->counters.rows += rows;
 		e->counters.shared_blks_hit += bufusage->shared_blks_hit;
@@ -786,13 +1005,18 @@ pgss_store(const char *query, double total_time, uint64 rows,
 		e->counters.local_blks_written += bufusage->local_blks_written;
 		e->counters.temp_blks_read += bufusage->temp_blks_read;
 		e->counters.temp_blks_written += bufusage->temp_blks_written;
-		e->counters.time_read += INSTR_TIME_GET_DOUBLE(bufusage->time_read);
+		e->counters.time_read += INSTR_TIME_GET_DOUBLE(bufusage->time_read);
 		e->counters.time_write += INSTR_TIME_GET_DOUBLE(bufusage->time_write);
 		e->counters.usage += usage;
+
 		SpinLockRelease(&e->mutex);
 	}
 
 	LWLockRelease(pgss->lock);
+
+	/* We postpone this pfree until we're out of the lock */
+	if (norm_query)
+		pfree(norm_query);
 }
 
 /*
@@ -883,7 +1107,7 @@ pg_stat_statements(PG_FUNCTION_ARGS)
 
 			qstr = (char *)
 				pg_do_encoding_conversion((unsigned char *) entry->query,
-										  entry->key.query_len,
+										  entry->query_len,
 										  entry->key.encoding,
 										  GetDatabaseEncoding());
 			values[i++] = CStringGetTextDatum(qstr);
@@ -902,6 +1126,10 @@ pg_stat_statements(PG_FUNCTION_ARGS)
 			SpinLockRelease(&e->mutex);
 		}
 
+		/* Skip entry if unexecuted (ie, it's a pending "sticky" entry) */
+		if (tmp.calls == 0)
+			continue;
+
 		values[i++] = Int64GetDatumFast(tmp.calls);
 		values[i++] = Float8GetDatumFast(tmp.total_time);
 		values[i++] = Int64GetDatumFast(tmp.rows);
@@ -923,8 +1151,8 @@ pg_stat_statements(PG_FUNCTION_ARGS)
 			values[i++] = Float8GetDatumFast(tmp.time_write);
 		}
 
-		Assert(i == sql_supports_v1_1_counters ? \
-			   PG_STAT_STATEMENTS_COLS : PG_STAT_STATEMENTS_COLS_V1_0);
+		Assert(i == (sql_supports_v1_1_counters ? /* parenthesized: "==" binds tighter than "?:", which made the old check always true */
+					 PG_STAT_STATEMENTS_COLS : PG_STAT_STATEMENTS_COLS_V1_0));
 
 		tuplestore_putvalues(tupstore, tupdesc, values, nulls);
 	}
@@ -957,20 +1185,19 @@ pgss_memsize(void)
  * Allocate a new hashtable entry.
* caller must hold an exclusive lock on pgss->lock * + * "query" need not be null-terminated; we rely on query_len instead + * * Note: despite needing exclusive lock, it's not an error for the target * entry to already exist. This is because pgss_store releases and * reacquires lock after failing to find a match; so someone else could * have made the entry while we waited to get exclusive lock. */ static pgssEntry * -entry_alloc(pgssHashKey *key) +entry_alloc(pgssHashKey *key, const char *query, int query_len) { pgssEntry *entry; bool found; - /* Caller must have clipped query properly */ - Assert(key->query_len < pgss->query_size); - /* Make space if needed */ while (hash_get_num_entries(pgss_hash) >= pgss_max) entry_dealloc(); @@ -982,16 +1209,16 @@ entry_alloc(pgssHashKey *key) { /* New entry, initialize it */ - /* dynahash tried to copy the key for us, but must fix query_ptr */ - entry->key.query_ptr = entry->query; /* reset the statistics */ memset(&entry->counters, 0, sizeof(Counters)); entry->counters.usage = USAGE_INIT; /* re-initialize the mutex each time ... we assume no one using it */ SpinLockInit(&entry->mutex); /* ... 
and don't forget the query text */
-		memcpy(entry->query, key->query_ptr, key->query_len);
-		entry->query[key->query_len] = '\0';
+		Assert(query_len >= 0 && query_len < pgss->query_size);
+		entry->query_len = query_len;
+		memcpy(entry->query, query, query_len);
+		entry->query[query_len] = '\0';
 	}
 
 	return entry;
@@ -1003,8 +1230,8 @@ entry_alloc(pgssHashKey *key)
 static int
 entry_cmp(const void *lhs, const void *rhs)
 {
-	double l_usage = (*(pgssEntry * const *) lhs)->counters.usage;
-	double r_usage = (*(pgssEntry * const *) rhs)->counters.usage;
+	double l_usage = (*(pgssEntry *const *) lhs)->counters.usage;
+	double r_usage = (*(pgssEntry *const *) rhs)->counters.usage;
 
 	if (l_usage < r_usage)
 		return -1;
@@ -1070,3 +1297,772 @@ entry_reset(void)
 
 	LWLockRelease(pgss->lock);
 }
+
+/*
+ * AppendJumble: Append a value that is substantive in a given query to
+ * the current jumble. "item" points at "size" raw bytes to be appended.
+ */
+static void
+AppendJumble(pgssJumbleState * jstate, const unsigned char *item, Size size)
+{
+	unsigned char *jumble = jstate->jumble;
+	Size jumble_len = jstate->jumble_len; /* worked on locally, written back on exit */
+
+	/*
+	 * Whenever the jumble buffer is full, we hash the current contents and
+	 * reset the buffer to contain just that hash value, thus relying on the
+	 * hash to summarize everything so far.
+	 */
+	while (size > 0)
+	{
+		Size part_size;
+
+		if (jumble_len >= JUMBLE_SIZE)
+		{
+			uint32 start_hash = DatumGetUInt32(hash_any(jumble, JUMBLE_SIZE));
+
+			memcpy(jumble, &start_hash, sizeof(start_hash));
+			jumble_len = sizeof(start_hash);
+		}
+		part_size = Min(size, JUMBLE_SIZE - jumble_len); /* copy only what fits in the space left */
+		memcpy(jumble + jumble_len, item, part_size);
+		jumble_len += part_size;
+		item += part_size;
+		size -= part_size; /* any remainder is handled by the next iteration */
+	}
+	jstate->jumble_len = jumble_len;
+}
+
+/*
+ * Wrappers around AppendJumble to encapsulate details of serialization
+ * of individual local variable elements.
+ */
+#define APP_JUMB(item) \
+	AppendJumble(jstate, (const unsigned char *) &(item), sizeof(item))
+#define APP_JUMB_STRING(str) \
+	AppendJumble(jstate, (const unsigned char *) (str), strlen(str) + 1) /* +1 so the terminating NUL is jumbled too */
+
+/*
+ * JumbleQuery: Selectively serialize the query tree, appending significant
+ * data to the "query jumble" while ignoring nonsignificant data.
+ *
+ * Rule of thumb for what to include is that we should ignore anything not
+ * semantically significant (such as alias names) as well as anything that can
+ * be deduced from child nodes (else we'd just be double-hashing that piece
+ * of information).
+ */
+static void
+JumbleQuery(pgssJumbleState * jstate, Query *query)
+{
+	Assert(IsA(query, Query));
+	Assert(query->utilityStmt == NULL); /* utility statements are never jumbled; the caller filters them out */
+
+	APP_JUMB(query->commandType);
+	/* resultRelation is usually predictable from commandType */
+	JumbleExpr(jstate, (Node *) query->cteList);
+	JumbleRangeTable(jstate, query->rtable); /* RTEs get their own walker rather than going through JumbleExpr */
+	JumbleExpr(jstate, (Node *) query->jointree);
+	JumbleExpr(jstate, (Node *) query->targetList);
+	JumbleExpr(jstate, (Node *) query->returningList);
+	JumbleExpr(jstate, (Node *) query->groupClause);
+	JumbleExpr(jstate, query->havingQual);
+	JumbleExpr(jstate, (Node *) query->windowClause);
+	JumbleExpr(jstate, (Node *) query->distinctClause);
+	JumbleExpr(jstate, (Node *) query->sortClause);
+	JumbleExpr(jstate, query->limitOffset);
+	JumbleExpr(jstate, query->limitCount);
+	/* we ignore rowMarks */
+	JumbleExpr(jstate, query->setOperations);
+}
+
+/*
+ * Jumble a range table: for each entry, emit its rtekind plus the field(s) that identify what it refers to
+ */
+static void
+JumbleRangeTable(pgssJumbleState * jstate, List *rtable)
+{
+	ListCell *lc;
+
+	foreach(lc, rtable)
+	{
+		RangeTblEntry *rte = (RangeTblEntry *) lfirst(lc);
+
+		Assert(IsA(rte, RangeTblEntry));
+		APP_JUMB(rte->rtekind);
+		switch (rte->rtekind)
+		{
+			case RTE_RELATION:
+				APP_JUMB(rte->relid);
+				break;
+			case RTE_SUBQUERY:
+				JumbleQuery(jstate, rte->subquery); /* recurse into the subquery's own Query tree */
+				break;
+			case RTE_JOIN:
+				APP_JUMB(rte->jointype);
+				break;
+			case RTE_FUNCTION:
+				JumbleExpr(jstate, rte->funcexpr);
+				break;
+			case RTE_VALUES:
+				JumbleExpr(jstate, (Node *) rte->values_lists);
+				break;
+			case RTE_CTE:
+
+				/*
+				 * Depending on the CTE name here isn't ideal, but it's the
+				 * only info we have to identify the referenced WITH item.
+				 */
+				APP_JUMB_STRING(rte->ctename);
+				APP_JUMB(rte->ctelevelsup);
+				break;
+			default:
+				elog(ERROR, "unrecognized RTE kind: %d", (int) rte->rtekind);
+				break;
+		}
+	}
+}
+
+/*
+ * Jumble an expression tree
+ *
+ * In general this function should handle all the same node types that
+ * expression_tree_walker() does, and therefore it's coded to be as parallel
+ * to that function as possible. However, since we are only invoked on
+ * queries immediately post-parse-analysis, we need not handle node types
+ * that only appear in planning.
+ *
+ * Note: the reason we don't simply use expression_tree_walker() is that the
+ * point of that function is to support tree walkers that don't care about
+ * most tree node types, but here we care about all types. We should complain
+ * about any unrecognized node type.
+ */
+static void
+JumbleExpr(pgssJumbleState * jstate, Node *node)
+{
+	ListCell *temp;
+
+	if (node == NULL)
+		return; /* a NULL subtree contributes nothing to the jumble */
+
+	/* Guard against stack overflow due to overly complex expressions */
+	check_stack_depth();
+
+	/*
+	 * We always emit the node's NodeTag, then any additional fields that are
+	 * considered significant, and then we recurse to any child nodes.
+	 */
+	APP_JUMB(node->type);
+
+	switch (nodeTag(node))
+	{
+		case T_Var:
+			{
+				Var *var = (Var *) node;
+
+				APP_JUMB(var->varno);
+				APP_JUMB(var->varattno);
+				APP_JUMB(var->varlevelsup);
+			}
+			break;
+		case T_Const:
+			{
+				Const *c = (Const *) node;
+
+				/* We jumble only the constant's type, not its value */
+				APP_JUMB(c->consttype);
+				/* Also, record its parse location for query normalization */
+				RecordConstLocation(jstate, c->location);
+			}
+			break;
+		case T_Param:
+			{
+				Param *p = (Param *) node;
+
+				APP_JUMB(p->paramkind);
+				APP_JUMB(p->paramid);
+				APP_JUMB(p->paramtype);
+			}
+			break;
+		case T_Aggref:
+			{
+				Aggref *expr = (Aggref *) node;
+
+				APP_JUMB(expr->aggfnoid);
+				JumbleExpr(jstate, (Node *) expr->args);
+				JumbleExpr(jstate, (Node *) expr->aggorder);
+				JumbleExpr(jstate, (Node *) expr->aggdistinct);
+			}
+			break;
+		case T_WindowFunc:
+			{
+				WindowFunc *expr = (WindowFunc *) node;
+
+				APP_JUMB(expr->winfnoid);
+				APP_JUMB(expr->winref);
+				JumbleExpr(jstate, (Node *) expr->args);
+			}
+			break;
+		case T_ArrayRef:
+			{
+				ArrayRef *aref = (ArrayRef *) node;
+
+				JumbleExpr(jstate, (Node *) aref->refupperindexpr);
+				JumbleExpr(jstate, (Node *) aref->reflowerindexpr);
+				JumbleExpr(jstate, (Node *) aref->refexpr);
+				JumbleExpr(jstate, (Node *) aref->refassgnexpr);
+			}
+			break;
+		case T_FuncExpr:
+			{
+				FuncExpr *expr = (FuncExpr *) node;
+
+				APP_JUMB(expr->funcid);
+				JumbleExpr(jstate, (Node *) expr->args);
+			}
+			break;
+		case T_NamedArgExpr:
+			{
+				NamedArgExpr *nae = (NamedArgExpr *) node;
+
+				APP_JUMB(nae->argnumber);
+				JumbleExpr(jstate, (Node *) nae->arg);
+			}
+			break;
+		case T_OpExpr:
+		case T_DistinctExpr: /* struct-equivalent to OpExpr */
+		case T_NullIfExpr: /* struct-equivalent to OpExpr */
+			{
+				OpExpr *expr = (OpExpr *) node;
+
+				APP_JUMB(expr->opno);
+				JumbleExpr(jstate, (Node *) expr->args);
+			}
+			break;
+		case T_ScalarArrayOpExpr:
+			{
+				ScalarArrayOpExpr *expr = (ScalarArrayOpExpr *) node;
+
+				APP_JUMB(expr->opno);
+				APP_JUMB(expr->useOr);
+				JumbleExpr(jstate, (Node *) expr->args);
+			}
+			break;
+		case T_BoolExpr:
+			{
+				BoolExpr *expr = (BoolExpr *) node;
+
+				APP_JUMB(expr->boolop);
+				JumbleExpr(jstate, (Node *) expr->args);
+			}
+			break;
+		case T_SubLink:
+			{
+				SubLink *sublink = (SubLink *) node;
+
+				APP_JUMB(sublink->subLinkType);
+				JumbleExpr(jstate, (Node *) sublink->testexpr);
+				JumbleQuery(jstate, (Query *) sublink->subselect); /* the subselect is treated as a whole Query, hence JumbleQuery */
+			}
+			break;
+		case T_FieldSelect:
+			{
+				FieldSelect *fs = (FieldSelect *) node;
+
+				APP_JUMB(fs->fieldnum);
+				JumbleExpr(jstate, (Node *) fs->arg);
+			}
+			break;
+		case T_FieldStore:
+			{
+				FieldStore *fstore = (FieldStore *) node;
+
+				JumbleExpr(jstate, (Node *) fstore->arg);
+				JumbleExpr(jstate, (Node *) fstore->newvals);
+			}
+			break;
+		case T_RelabelType:
+			{
+				RelabelType *rt = (RelabelType *) node;
+
+				APP_JUMB(rt->resulttype);
+				JumbleExpr(jstate, (Node *) rt->arg);
+			}
+			break;
+		case T_CoerceViaIO:
+			{
+				CoerceViaIO *cio = (CoerceViaIO *) node;
+
+				APP_JUMB(cio->resulttype);
+				JumbleExpr(jstate, (Node *) cio->arg);
+			}
+			break;
+		case T_ArrayCoerceExpr:
+			{
+				ArrayCoerceExpr *acexpr = (ArrayCoerceExpr *) node;
+
+				APP_JUMB(acexpr->resulttype);
+				JumbleExpr(jstate, (Node *) acexpr->arg);
+			}
+			break;
+		case T_ConvertRowtypeExpr:
+			{
+				ConvertRowtypeExpr *crexpr = (ConvertRowtypeExpr *) node;
+
+				APP_JUMB(crexpr->resulttype);
+				JumbleExpr(jstate, (Node *) crexpr->arg);
+			}
+			break;
+		case T_CollateExpr:
+			{
+				CollateExpr *ce = (CollateExpr *) node;
+
+				APP_JUMB(ce->collOid);
+				JumbleExpr(jstate, (Node *) ce->arg);
+			}
+			break;
+		case T_CaseExpr:
+			{
+				CaseExpr *caseexpr = (CaseExpr *) node;
+
+				JumbleExpr(jstate, (Node *) caseexpr->arg);
+				foreach(temp, caseexpr->args)
+				{
+					CaseWhen *when = (CaseWhen *) lfirst(temp);
+
+					Assert(IsA(when, CaseWhen)); /* CaseWhen is unpacked here; no node tag is emitted for it */
+					JumbleExpr(jstate, (Node *) when->expr);
+					JumbleExpr(jstate, (Node *) when->result);
+				}
+				JumbleExpr(jstate, (Node *) caseexpr->defresult);
+			}
+			break;
+		case T_CaseTestExpr:
+			{
+				CaseTestExpr *ct = (CaseTestExpr *) node;
+
+				APP_JUMB(ct->typeId);
+			}
+			break;
+		case T_ArrayExpr:
+			JumbleExpr(jstate, (Node *) ((ArrayExpr *) node)->elements);
+			break;
+		case T_RowExpr:
+			JumbleExpr(jstate, (Node *) ((RowExpr *) node)->args);
+			break;
+		case T_RowCompareExpr:
+			{
+				RowCompareExpr *rcexpr = (RowCompareExpr *) node;
+
+				APP_JUMB(rcexpr->rctype);
+				JumbleExpr(jstate, (Node *) rcexpr->largs);
+				JumbleExpr(jstate, (Node *) rcexpr->rargs);
+			}
+			break;
+		case T_CoalesceExpr:
+			JumbleExpr(jstate, (Node *) ((CoalesceExpr *) node)->args);
+			break;
+		case T_MinMaxExpr:
+			{
+				MinMaxExpr *mmexpr = (MinMaxExpr *) node;
+
+				APP_JUMB(mmexpr->op);
+				JumbleExpr(jstate, (Node *) mmexpr->args);
+			}
+			break;
+		case T_XmlExpr:
+			{
+				XmlExpr *xexpr = (XmlExpr *) node;
+
+				APP_JUMB(xexpr->op);
+				JumbleExpr(jstate, (Node *) xexpr->named_args);
+				JumbleExpr(jstate, (Node *) xexpr->args);
+			}
+			break;
+		case T_NullTest:
+			{
+				NullTest *nt = (NullTest *) node;
+
+				APP_JUMB(nt->nulltesttype);
+				JumbleExpr(jstate, (Node *) nt->arg);
+			}
+			break;
+		case T_BooleanTest:
+			{
+				BooleanTest *bt = (BooleanTest *) node;
+
+				APP_JUMB(bt->booltesttype);
+				JumbleExpr(jstate, (Node *) bt->arg);
+			}
+			break;
+		case T_CoerceToDomain:
+			{
+				CoerceToDomain *cd = (CoerceToDomain *) node;
+
+				APP_JUMB(cd->resulttype);
+				JumbleExpr(jstate, (Node *) cd->arg);
+			}
+			break;
+		case T_CoerceToDomainValue:
+			{
+				CoerceToDomainValue *cdv = (CoerceToDomainValue *) node;
+
+				APP_JUMB(cdv->typeId);
+			}
+			break;
+		case T_SetToDefault:
+			{
+				SetToDefault *sd = (SetToDefault *) node;
+
+				APP_JUMB(sd->typeId);
+			}
+			break;
+		case T_CurrentOfExpr:
+			{
+				CurrentOfExpr *ce = (CurrentOfExpr *) node;
+
+				APP_JUMB(ce->cvarno);
+				if (ce->cursor_name) /* the name may be NULL; jumble it only when present */
+					APP_JUMB_STRING(ce->cursor_name);
+				APP_JUMB(ce->cursor_param);
+			}
+			break;
+		case T_TargetEntry:
+			{
+				TargetEntry *tle = (TargetEntry *) node;
+
+				APP_JUMB(tle->resno);
+				APP_JUMB(tle->ressortgroupref);
+				JumbleExpr(jstate, (Node *) tle->expr);
+			}
+			break;
+		case T_RangeTblRef:
+			{
+				RangeTblRef *rtr = (RangeTblRef *) node;
+
+				APP_JUMB(rtr->rtindex);
+			}
+			break;
+		case T_JoinExpr:
+			{
+				JoinExpr *join = (JoinExpr *) node;
+
+				APP_JUMB(join->jointype);
+				APP_JUMB(join->isNatural);
+				APP_JUMB(join->rtindex);
+				JumbleExpr(jstate, join->larg);
+				JumbleExpr(jstate, join->rarg);
+				JumbleExpr(jstate, join->quals);
+			}
+			break;
+		case T_FromExpr:
+			{
+				FromExpr *from = (FromExpr *) node;
+
+				JumbleExpr(jstate, (Node *) from->fromlist);
+				JumbleExpr(jstate, from->quals);
+			}
+			break;
+		case T_List:
+			foreach(temp, (List *) node)
+			{
+				JumbleExpr(jstate, (Node *) lfirst(temp)); /* jumble each list member in order */
+			}
+			break;
+		case T_SortGroupClause:
+			{
+				SortGroupClause *sgc = (SortGroupClause *) node;
+
+				APP_JUMB(sgc->tleSortGroupRef);
+				APP_JUMB(sgc->eqop);
+				APP_JUMB(sgc->sortop);
+				APP_JUMB(sgc->nulls_first);
+			}
+			break;
+		case T_WindowClause:
+			{
+				WindowClause *wc = (WindowClause *) node;
+
+				APP_JUMB(wc->winref);
+				APP_JUMB(wc->frameOptions);
+				JumbleExpr(jstate, (Node *) wc->partitionClause);
+				JumbleExpr(jstate, (Node *) wc->orderClause);
+				JumbleExpr(jstate, wc->startOffset);
+				JumbleExpr(jstate, wc->endOffset);
+			}
+			break;
+		case T_CommonTableExpr:
+			{
+				CommonTableExpr *cte = (CommonTableExpr *) node;
+
+				JumbleQuery(jstate, (Query *) cte->ctequery); /* only the CTE's query is jumbled; no other CTE field is */
+			}
+			break;
+		case T_SetOperationStmt:
+			{
+				SetOperationStmt *setop = (SetOperationStmt *) node;
+
+				APP_JUMB(setop->op);
+				APP_JUMB(setop->all);
+				JumbleExpr(jstate, setop->larg);
+				JumbleExpr(jstate, setop->rarg);
+			}
+			break;
+		default:
+			/* Only a warning, since we can stumble along anyway (the node's tag was already jumbled above) */
+			elog(WARNING, "unrecognized node type: %d",
+				 (int) nodeTag(node));
+			break;
+	}
+}
+
+/*
+ * Record location of constant within query string of query tree
+ * that is currently being walked.
+ */ +static void +RecordConstLocation(pgssJumbleState * jstate, int location) +{ + /* a negative location (such as -1) means unknown or undefined: record nothing */ + if (location >= 0) + { + /* array is full: double its capacity before appending */ + if (jstate->clocations_count >= jstate->clocations_buf_size) + { + jstate->clocations_buf_size *= 2; + jstate->clocations = (pgssLocationLen *) + repalloc(jstate->clocations, + jstate->clocations_buf_size * + sizeof(pgssLocationLen)); + } + jstate->clocations[jstate->clocations_count].location = location; + /* length starts out as -1 (not yet known); fill_in_constant_lengths relies on this */ + jstate->clocations[jstate->clocations_count].length = -1; + jstate->clocations_count++; + } +} + +/* + * Generate a normalized version of the query string that will be used to + * represent all similar queries. + * + * Note that the normalized representation may well vary depending on + * just which "equivalent" query is used to create the hashtable entry. + * We assume this is OK. + * + * jstate supplies the recorded constant locations (jstate->clocations). + * Each recorded constant whose length could be determined is replaced in + * the output by a single ? character; the text around the constants is + * copied from query unchanged, or as much of it as will fit. + * + * *query_len_p contains the input string length, and is updated with + * the result string length (which cannot be longer) on exit. The result + * is additionally limited to pgss->query_size - 1 bytes. + * + * encoding is passed to pg_encoding_mbcliplen once the output buffer has + * been filled, so that the result does not end in an incomplete character. + * + * Returns a palloc'd string, which is not necessarily null-terminated. + */ +static char * +generate_normalized_query(pgssJumbleState * jstate, const char *query, + int *query_len_p, int encoding) +{ + char *norm_query; + int query_len = *query_len_p; + int max_output_len; /* Size (in bytes) of the result buffer */ + int i, + len_to_wrt, /* Length (in bytes) to write */ + quer_loc = 0, /* Source query byte location */ + n_quer_loc = 0, /* Normalized query byte location */ + last_off = 0, /* Offset from start for previous tok */ + last_tok_len = 0; /* Length (in bytes) of that tok */ + + /* + * Get constants' lengths (core system only gives us locations). Note + * this also ensures the items are sorted by location.
+ */ + fill_in_constant_lengths(jstate, query); + + /* + * Allocate result buffer, ensuring we limit result to allowed size. + * No byte is reserved for a terminator; the result is length-counted. + */ + max_output_len = Min(query_len, pgss->query_size - 1); + norm_query = palloc(max_output_len); + + for (i = 0; i < jstate->clocations_count; i++) + { + int off, /* Offset from start for cur tok */ + tok_len; /* Length (in bytes) of that tok */ + + off = jstate->clocations[i].location; + tok_len = jstate->clocations[i].length; + + if (tok_len < 0) + continue; /* length was never filled in (e.g. a duplicate): skip */ + + /* + * Copy next chunk (the text between the end of the previous constant + * and the start of this one), or as much as will fit. + */ + len_to_wrt = off - last_off; + len_to_wrt -= last_tok_len; + len_to_wrt = Min(len_to_wrt, max_output_len - n_quer_loc); + + Assert(len_to_wrt >= 0); + memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt); + n_quer_loc += len_to_wrt; + + if (n_quer_loc < max_output_len) + norm_query[n_quer_loc++] = '?'; /* placeholder written in place of the constant */ + + quer_loc = off + tok_len; /* resume reading just past the constant */ + last_off = off; + last_tok_len = tok_len; + + /* If we run out of space, might as well stop iterating */ + if (n_quer_loc >= max_output_len) + break; + } + + /* + * We've copied up until the last ignorable constant. Copy over the + * remaining bytes of the original query string, or at least as much as + * will fit. + */ + len_to_wrt = query_len - quer_loc; + len_to_wrt = Min(len_to_wrt, max_output_len - n_quer_loc); + + Assert(len_to_wrt >= 0); + memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt); + n_quer_loc += len_to_wrt; + + /* + * If we ran out of space, we need to do an encoding-aware truncation, + * just to make sure we don't have an incomplete character at the end. + * The clipped length is what gets reported back through *query_len_p. + */ + if (n_quer_loc >= max_output_len) + query_len = pg_encoding_mbcliplen(encoding, + norm_query, + n_quer_loc, + pgss->query_size - 1); + else + query_len = n_quer_loc; + + *query_len_p = query_len; + return norm_query; +} + +/* + * Given a valid SQL string and an array of constant-location records, + * fill in the textual lengths of those constants.
+ * + * The constants may use any allowed constant syntax, such as float literals, + * bit-strings, single-quoted strings and dollar-quoted strings. This is + * accomplished by using the public API for the core scanner. + * + * It is the caller's job to ensure that the string is a valid SQL statement + * with constants at the indicated locations. Since in practice the string + * has already been parsed, and the locations that the caller provides will + * have originated from within the authoritative parser, this should not be + * a problem. + * + * Duplicate constant pointers are possible, and will have their lengths + * marked as '-1', so that they are later ignored. (Actually, we assume the + * lengths were initialized as -1 to start with, and don't change them here.) + * + * N.B. There is an assumption that a '-' character at a Const location begins + * a negative numeric constant. This precludes there ever being another + * reason for a constant to start with a '-'. + */ +static void +fill_in_constant_lengths(pgssJumbleState * jstate, const char *query) +{ + pgssLocationLen *locs; + core_yyscan_t yyscanner; + core_yy_extra_type yyextra; + core_YYSTYPE yylval; + YYLTYPE yylloc; /* position of the token most recently lexed */ + int last_loc = -1; /* location of the last record handled; -1 before the first */ + int i; + + /* + * Sort the records by location so that we can process them in order while + * scanning the query text. The sort is skipped when there is at most one + * record.
+ */ + if (jstate->clocations_count > 1) + qsort(jstate->clocations, jstate->clocations_count, + sizeof(pgssLocationLen), comp_location); + locs = jstate->clocations; + + /* initialize the flex scanner --- should match raw_parser() */ + yyscanner = scanner_init(query, + &yyextra, + ScanKeywords, + NumScanKeywords); + + /* Search for each constant, in sequence */ + for (i = 0; i < jstate->clocations_count; i++) + { + int loc = locs[i].location; + int tok; + + Assert(loc >= 0); /* RecordConstLocation only records non-negative locations */ + + if (loc <= last_loc) + continue; /* Duplicate constant, ignore (its length stays -1) */ + + /* Lex tokens until we find the desired constant */ + for (;;) + { + tok = core_yylex(&yylval, &yylloc, yyscanner); + + /* We should not hit end-of-string, but if we do, behave sanely */ + if (tok == 0) + break; /* out of inner for-loop */ + + /* + * We should find the token position exactly, but if we somehow + * run past it, work with that. + */ + if (yylloc >= loc) + { + if (query[loc] == '-') + { + /* + * It's a negative value - this is the one and only case + * where we replace more than a single token. + * + * Do not compensate for the core system's special-case + * adjustment of location to that of the leading '-' + * operator in the event of a negative constant. It is + * also useful for our purposes to start from the minus + * symbol. In this way, queries like "select * from foo + * where bar = 1" and "select * from foo where bar = -2" + * will have identical normalized query strings. + */ + tok = core_yylex(&yylval, &yylloc, yyscanner); /* consume one further token, so the measured text extends beyond the minus sign */ + if (tok == 0) + break; /* out of inner for-loop */ + } + + /* + * We now rely on the assumption that flex has placed a zero + * byte after the text of the current token in scanbuf. Given + * that, strlen from loc stops at the end of the current token.
+ */ + locs[i].length = strlen(yyextra.scanbuf + loc); + break; /* out of inner for-loop */ + } + } + + /* If we hit end-of-string, give up, leaving remaining lengths -1 */ + if (tok == 0) + break; + + last_loc = loc; /* lets a later record with the same location be skipped as a duplicate */ + } + + scanner_finish(yyscanner); +} + +/* + * comp_location: comparator for qsorting pgssLocationLen structs by location + * + * a and b point to pgssLocationLen structs. Returns -1, 0 or +1 according + * to whether the location of a is less than, equal to or greater than the + * location of b. + */ +static int +comp_location(const void *a, const void *b) +{ + int l = ((const pgssLocationLen *) a)->location; + int r = ((const pgssLocationLen *) b)->location; + + if (l < r) + return -1; + else if (l > r) + return +1; + else + return 0; +} diff --git a/doc/src/sgml/pgstatstatements.sgml b/doc/src/sgml/pgstatstatements.sgml index ca7bd442741..00a0e5e1308 100644 --- a/doc/src/sgml/pgstatstatements.sgml +++ b/doc/src/sgml/pgstatstatements.sgml @@ -25,7 +25,7 @@ The statistics gathered by the module are made available via a system view named pg_stat_statements. This view contains one row for - each distinct query text, database ID, and user ID (up to the maximum + each distinct query, database ID, and user ID (up to the maximum number of distinct statements that the module can track). The columns of the view are shown in . @@ -61,7 +61,7 @@ query text - Text of the statement (up to bytes) + Text of a representative statement (up to bytes) @@ -195,10 +195,38 @@ - Note that statements are considered the same if they have the same text, - regardless of the values of any out-of-line parameters used in the - statement. Using out-of-line parameters will help to group statements - together and may make the statistics more useful. + Plannable queries (that is, SELECT, INSERT, + UPDATE, and DELETE) are combined into a single + pg_stat_statements entry whenever they have identical query + structures according to an internal hash calculation. Typically, two + queries will be considered the same for this purpose if they are + semantically equivalent except for the values of literal constants + appearing in the query.
Utility commands (that is, all other commands) + are compared strictly on the basis of their textual query strings, however. + + + + When a constant's value has been ignored for purposes of matching the + query to other queries, the constant is replaced by ? + in the pg_stat_statements display. The rest of the query + text is that of the first query that had the particular hash value + associated with the pg_stat_statements entry. + + + + In some cases, queries with visibly different texts might get merged into a + single pg_stat_statements entry. Normally this will happen + only for semantically equivalent queries, but there is a small chance of + hash collisions causing unrelated queries to be merged into one entry. + (This cannot happen for queries belonging to different users or databases, + however.) + + + + Since the hash value is computed on the post-parse-analysis representation + of the queries, the opposite is also possible: queries with identical texts + might appear as separate entries, if they have different meanings as a + result of factors such as different search_path settings. @@ -329,20 +357,20 @@ pg_stat_statements.track = all bench=# SELECT pg_stat_statements_reset(); $ pgbench -i bench -$ pgbench -c10 -t300 -M prepared bench +$ pgbench -c10 -t300 bench bench=# \x bench=# SELECT query, calls, total_time, rows, 100.0 * shared_blks_hit / nullif(shared_blks_hit + shared_blks_read, 0) AS hit_percent FROM pg_stat_statements ORDER BY total_time DESC LIMIT 5; -[ RECORD 1 ]--------------------------------------------------------------------- -query | UPDATE pgbench_branches SET bbalance = bbalance + $1 WHERE bid = $2; +query | UPDATE pgbench_branches SET bbalance = bbalance + ? 
WHERE bid = ?; calls | 3000 total_time | 9.60900100000002 rows | 2836 hit_percent | 99.9778970000200936 -[ RECORD 2 ]--------------------------------------------------------------------- -query | UPDATE pgbench_tellers SET tbalance = tbalance + $1 WHERE tid = $2; +query | UPDATE pgbench_tellers SET tbalance = tbalance + ? WHERE tid = ?; calls | 3000 total_time | 8.015156 rows | 2990 @@ -354,7 +382,7 @@ total_time | 0.310624 rows | 100000 hit_percent | 0.30395136778115501520 -[ RECORD 4 ]--------------------------------------------------------------------- -query | UPDATE pgbench_accounts SET abalance = abalance + $1 WHERE aid = $2; +query | UPDATE pgbench_accounts SET abalance = abalance + ? WHERE aid = ?; calls | 3000 total_time | 0.271741999999997 rows | 3000