1
0
mirror of https://github.com/postgres/postgres.git synced 2025-07-26 01:22:12 +03:00

Reduce memory usage of tsvector type analyze function.

compute_tsvector_stats() detoasted every tsvector value in the sample and
kept it in memory, which can add up to a lot of memory. The original bug
report described a case using over 10 gigabytes, with a statistics target
of 10000 (the maximum).

To fix, allocate a separate copy of just the lexemes that we keep around,
and free the detoasted tsvector values as we go. This adds some palloc/pfree
overhead when there are a lot of distinct lexemes in the sample, but it's
better than running out of memory.

Fixes the bug reported by James C. Reviewed by Tom Lane. Backport to
all supported versions.

Discussion: https://www.postgresql.org/message-id/20170514200602.1451.46797@wrigleys.postgresql.org
This commit is contained in:
Heikki Linnakangas
2017-07-12 22:03:38 +03:00
parent 832d3dce5a
commit bbeec3c749

@ -232,9 +232,7 @@ compute_tsvector_stats(VacAttrStats *stats,
/*
* We loop through the lexemes in the tsvector and add them to our
* tracking hashtable. Note: the hashtable entries will point into
* the (detoasted) tsvector value, therefore we cannot free that
* storage until we're done.
* tracking hashtable.
*/
lexemesptr = STRPTR(vector);
curentryptr = ARRPTR(vector);
@ -242,7 +240,12 @@ compute_tsvector_stats(VacAttrStats *stats,
{
bool found;
/* Construct a hash key */
/*
* Construct a hash key. The key points into the (detoasted)
* tsvector value at this point, but if a new entry is created, we
* make a copy of it. This way we can free the tsvector value
* once we've processed all its lexemes.
*/
hash_key.lexeme = lexemesptr + curentryptr->pos;
hash_key.length = curentryptr->len;
@ -261,6 +264,9 @@ compute_tsvector_stats(VacAttrStats *stats,
/* Initialize new tracking list element */
item->frequency = 1;
item->delta = b_current - 1;
item->key.lexeme = palloc(hash_key.length);
memcpy(item->key.lexeme, hash_key.lexeme, hash_key.length);
}
/* lexeme_no is the number of elements processed (ie N) */
@ -276,6 +282,10 @@ compute_tsvector_stats(VacAttrStats *stats,
/* Advance to the next WordEntry in the tsvector */
curentryptr++;
}
/* If the vector was toasted, free the detoasted copy. */
if (TSVectorGetDatum(vector) != value)
pfree(vector);
}
/* We can only compute real stats if we found some non-null values. */
@ -447,9 +457,12 @@ prune_lexemes_hashtable(HTAB *lexemes_tab, int b_current)
{
if (item->frequency + item->delta <= b_current)
{
char *lexeme = item->key.lexeme;
if (hash_search(lexemes_tab, (const void *) &item->key,
HASH_REMOVE, NULL) == NULL)
elog(ERROR, "hash table corrupted");
pfree(lexeme);
}
}
}