
Revert "Improve accounting for memory used by shared hash tables"

This reverts commit f5930f9a98ea65d659d41600a138e608988ad122.

This broke the expansion of private hash tables, which reallocates the
directory. But that's impossible when it's allocated together with the
other fields, and dir_realloc() failed with BogusFree. Clearly, this
needs rethinking.
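
As a standalone illustration of the failure mode (a minimal sketch with invented sizes, not PostgreSQL code): once the directory is carved out of the same allocation as the header, its address is an interior pointer, so freeing or reallocating it on its own is as invalid as calling free()/realloc() on an interior pointer, which is effectively what dir_realloc() did when it handed the old directory to pfree().

#include <stdlib.h>
#include <string.h>

int
main(void)
{
    /* one combined allocation: header followed by the directory */
    size_t  hdr_size = 64;
    size_t  dir_size = 256 * sizeof(void *);
    char   *chunk = malloc(hdr_size + dir_size);
    void  **dir;

    if (chunk == NULL)
        return 1;
    dir = (void **) (chunk + hdr_size); /* interior pointer, not a chunk */

    /*
     * realloc(dir, 2 * dir_size) would be undefined behavior here, just as
     * pfree() of this pointer trips the memory-context checks. Growing the
     * directory requires a fresh allocation plus a copy instead.
     */
    void  **newdir = malloc(2 * dir_size);

    if (newdir == NULL)
        return 1;
    memcpy(newdir, dir, dir_size);
    memset((char *) newdir + dir_size, 0, dir_size);    /* zero the new half */

    free(newdir);
    free(chunk);
    return 0;
}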

Discussion: https://postgr.es/m/CAApHDvriCiNkm=v521AP6PKPfyWkJ++jqZ9eqX4cXnhxLv8w-A@mail.gmail.com
Author: Tomas Vondra
Date:   2025-04-04 04:28:59 +02:00
Parent: 88f55bc976
Commit: 1aff1dc8df

3 changed files with 69 additions and 225 deletions

src/backend/storage/ipc/shmem.c

@@ -73,7 +73,6 @@
 #include "storage/shmem.h"
 #include "storage/spin.h"
 #include "utils/builtins.h"
-#include "utils/dynahash.h"
 
 static void *ShmemAllocRaw(Size size, Size *allocated_size);
@@ -347,8 +346,7 @@ ShmemInitHash(const char *name, /* table string name for shmem index */
     /* look it up in the shmem index */
     location = ShmemInitStruct(name,
-                               hash_get_size(infoP, hash_flags,
-                                             init_size, true),
+                               hash_get_shared_size(infoP, hash_flags),
                                &found);
 
     /*

src/backend/utils/hash/dynahash.c

@@ -260,39 +260,12 @@ static long hash_accesses,
             hash_expansions;
 #endif
 
-/* access to parts of the hash table, allocated as a single chunk */
-#define HASH_DIRECTORY_PTR(hashp) \
-    (((char *) (hashp)->hctl) + sizeof(HASHHDR))
-
-#define HASH_SEGMENT_OFFSET(hctl, idx) \
-    (sizeof(HASHHDR) + \
-     ((hctl)->dsize * sizeof(HASHSEGMENT)) + \
-     ((hctl)->ssize * (idx) * sizeof(HASHBUCKET)))
-
-#define HASH_SEGMENT_PTR(hashp, idx) \
-    ((char *) (hashp)->hctl + HASH_SEGMENT_OFFSET((hashp)->hctl, (idx)))
-
-#define HASH_SEGMENT_SIZE(hashp) \
-    ((hashp)->ssize * sizeof(HASHBUCKET))
-
-#define HASH_ELEMENTS_PTR(hashp, nsegs) \
-    ((char *) (hashp)->hctl + HASH_SEGMENT_OFFSET((hashp)->hctl, nsegs))
-
-/* Each element has a HASHELEMENT header plus user data. */
-#define HASH_ELEMENT_SIZE(hctl) \
-    (MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN((hctl)->entrysize))
-
-#define HASH_ELEMENT_NEXT(hctl, num, ptr) \
-    ((char *) (ptr) + ((num) * HASH_ELEMENT_SIZE(hctl)))
-
 /*
  * Private function prototypes
  */
 static void *DynaHashAlloc(Size size);
 static HASHSEGMENT seg_alloc(HTAB *hashp);
-static HASHELEMENT *element_alloc(HTAB *hashp, int nelem);
-static void element_add(HTAB *hashp, HASHELEMENT *firstElement,
-                        int nelem, int freelist_idx);
+static bool element_alloc(HTAB *hashp, int nelem, int freelist_idx);
 static bool dir_realloc(HTAB *hashp);
 static bool expand_table(HTAB *hashp);
 static HASHBUCKET get_hash_entry(HTAB *hashp, int freelist_idx);
@@ -307,9 +280,6 @@ static int next_pow2_int(long num);
 static void register_seq_scan(HTAB *hashp);
 static void deregister_seq_scan(HTAB *hashp);
 static bool has_seq_scans(HTAB *hashp);
-static void compute_buckets_and_segs(long nelem, long num_partitions,
-                                     long ssize,
-                                     int *nbuckets, int *nsegments);
 
 
 /*
@@ -383,8 +353,6 @@ hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
 {
     HTAB       *hashp;
     HASHHDR    *hctl;
-    int         nelem_batch;
-    bool        prealloc;
 
     /*
      * Hash tables now allocate space for key and data, but you have to say
@@ -539,31 +507,9 @@ hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
         hashp->isshared = false;
     }
 
-    /* Choose the number of entries to allocate at a time. */
-    nelem_batch = choose_nelem_alloc(info->entrysize);
-
-    /*
-     * Decide whether to pre-allocate elements.
-     *
-     * For a private hash table, preallocate the requested number of elements
-     * if it's less than our chosen nelem_alloc. This avoids wasting space if
-     * the caller correctly estimates a small table size.
-     *
-     * For a shared hash table, preallocate the requested number of elements.
-     * This reduces problems with run-time out-of-shared-memory conditions.
-     */
-    prealloc = (flags & HASH_SHARED_MEM) || (nelem < nelem_batch);
-
-    /*
-     * Allocate the memory needed for hash header, directory, segments and
-     * elements together. Use pointer arithmetic to arrive at the start of
-     * each of these structures later.
-     */
     if (!hashp->hctl)
     {
-        Size        size = hash_get_size(info, flags, nelem, prealloc);
-
-        hashp->hctl = (HASHHDR *) hashp->alloc(size);
+        hashp->hctl = (HASHHDR *) hashp->alloc(sizeof(HASHHDR));
         if (!hashp->hctl)
             ereport(ERROR,
                     (errcode(ERRCODE_OUT_OF_MEMORY),
@@ -612,9 +558,6 @@ hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
     hctl->keysize = info->keysize;
     hctl->entrysize = info->entrysize;
 
-    /* remember how many elements to allocate at once */
-    hctl->nelem_alloc = nelem_batch;
-
     /* make local copies of heavily-used constant fields */
     hashp->keysize = hctl->keysize;
     hashp->ssize = hctl->ssize;
@@ -624,6 +567,14 @@ hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
     if (!init_htab(hashp, nelem))
         elog(ERROR, "failed to initialize hash table \"%s\"", hashp->tabname);
 
+    /*
+     * For a shared hash table, preallocate the requested number of elements.
+     * This reduces problems with run-time out-of-shared-memory conditions.
+     *
+     * For a non-shared hash table, preallocate the requested number of
+     * elements if it's less than our chosen nelem_alloc. This avoids wasting
+     * space if the caller correctly estimates a small table size.
+     */
     if ((flags & HASH_SHARED_MEM) ||
         nelem < hctl->nelem_alloc)
     {
@@ -631,7 +582,6 @@ hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
                 freelist_partitions,
                 nelem_alloc,
                 nelem_alloc_first;
-        void       *ptr = NULL;
 
         /*
          * If hash table is partitioned, give each freelist an equal share of
@@ -656,28 +606,14 @@ hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
         else
             nelem_alloc_first = nelem_alloc;
 
-        /*
-         * Calculate the offset at which to find the first partition of
-         * elements. We have to skip space for the header, segments and
-         * buckets.
-         */
-        ptr = HASH_ELEMENTS_PTR(hashp, hctl->nsegs);
-
-        /*
-         * Assign the correct location of each partition within a pre-allocated
-         * buffer.
-         *
-         * Actual memory allocation happens in ShmemInitHash for shared hash
-         * tables or earlier in this function for non-shared hash tables.
-         *
-         * We just need to split that allocation into per-batch freelists.
-         */
         for (i = 0; i < freelist_partitions; i++)
         {
             int         temp = (i == 0) ? nelem_alloc_first : nelem_alloc;
 
-            element_add(hashp, (HASHELEMENT *) ptr, temp, i);
-            ptr = HASH_ELEMENT_NEXT(hctl, temp, ptr);
+            if (!element_alloc(hashp, temp, i))
+                ereport(ERROR,
+                        (errcode(ERRCODE_OUT_OF_MEMORY),
+                         errmsg("out of memory")));
         }
     }
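
For context on the loop above: when the table is partitioned, each of the NUM_FREELISTS freelists gets an equal share of nelem, and the first freelist absorbs any remainder. A hypothetical standalone sketch of that split (input values invented; NUM_FREELISTS is 32 in dynahash.c):

#include <stdio.h>

#define NUM_FREELISTS 32    /* matches dynahash.c */

int
main(void)
{
    long    nelem = 1000;   /* invented request */
    int     freelist_partitions = NUM_FREELISTS;    /* partitioned table */
    long    nelem_alloc = nelem / freelist_partitions;  /* 31 */
    long    nelem_alloc_first;

    /* first freelist absorbs the remainder so the shares sum to nelem */
    if (nelem_alloc * freelist_partitions < nelem)
        nelem_alloc_first = nelem - nelem_alloc * (freelist_partitions - 1);
    else
        nelem_alloc_first = nelem_alloc;

    printf("first=%ld, others=%ld, total=%ld\n",
           nelem_alloc_first, nelem_alloc,
           nelem_alloc_first + nelem_alloc * (freelist_partitions - 1));
    /* prints: first=39, others=31, total=1000 */
    return 0;
}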
@@ -766,16 +702,29 @@ init_htab(HTAB *hashp, long nelem)
             SpinLockInit(&(hctl->freeList[i].mutex));
 
     /*
-     * We've already calculated these parameters when we calculated how much
-     * space to allocate in hash_get_size(). Be careful to keep these two
-     * places in sync, so that we get the same parameters.
+     * Allocate space for the next greater power of two number of buckets,
+     * assuming a desired maximum load factor of 1.
      */
-    compute_buckets_and_segs(nelem, hctl->num_partitions, hctl->ssize,
-                             &nbuckets, &nsegs);
+    nbuckets = next_pow2_int(nelem);
+
+    /*
+     * In a partitioned table, nbuckets must be at least equal to
+     * num_partitions; were it less, keys with apparently different partition
+     * numbers would map to the same bucket, breaking partition independence.
+     * (Normally nbuckets will be much bigger; this is just a safety check.)
+     */
+    while (nbuckets < hctl->num_partitions)
+        nbuckets <<= 1;
 
     hctl->max_bucket = hctl->low_mask = nbuckets - 1;
     hctl->high_mask = (nbuckets << 1) - 1;
 
+    /*
+     * Figure number of directory segments needed, round up to a power of 2
+     */
+    nsegs = (nbuckets - 1) / hctl->ssize + 1;
+    nsegs = next_pow2_int(nsegs);
+
     /*
      * Make sure directory is big enough. If pre-allocated directory is too
      * small, choke (caller screwed up).
@@ -788,25 +737,26 @@ init_htab(HTAB *hashp, long nelem)
         return false;
     }
 
-    /*
-     * Assign a directory by making it point to the correct location in the
-     * pre-allocated buffer.
-     */
+    /* Allocate a directory */
     if (!(hashp->dir))
     {
         CurrentDynaHashCxt = hashp->hcxt;
-        hashp->dir = (HASHSEGMENT *) HASH_DIRECTORY_PTR(hashp);
+        hashp->dir = (HASHSEGMENT *)
+            hashp->alloc(hctl->dsize * sizeof(HASHSEGMENT));
+        if (!hashp->dir)
+            return false;
     }
 
-    /* Assign initial segments, which are also pre-allocated */
-    i = 0;
+    /* Allocate initial segments */
     for (segp = hashp->dir; hctl->nsegs < nsegs; hctl->nsegs++, segp++)
     {
-        *segp = (HASHSEGMENT) HASH_SEGMENT_PTR(hashp, i++);
-        MemSet(*segp, 0, HASH_SEGMENT_SIZE(hashp));
+        *segp = seg_alloc(hashp);
+        if (*segp == NULL)
+            return false;
     }
 
-    Assert(i == nsegs);
+    /* Choose number of entries to allocate at a time */
+    hctl->nelem_alloc = choose_nelem_alloc(hctl->entrysize);
 
 #ifdef HASH_DEBUG
     fprintf(stderr, "init_htab:\n%s%p\n%s%ld\n%s%ld\n%s%d\n%s%ld\n%s%u\n%s%x\n%s%x\n%s%ld\n",
@@ -896,75 +846,16 @@ hash_select_dirsize(long num_entries)
 }
 
 /*
- * hash_get_size -- determine memory needed for a new dynamic hash table
- *
- * info: hash table parameters
- * flags: bitmask indicating which parameters to take from *info
- * nelem: maximum number of elements expected
- * prealloc: request preallocation of elements
- *
- * Compute the required initial memory allocation for a hashtable with the given
- * parameters. We need space for the HASHHDR, for the directory, segments and
- * preallocated elements.
- *
- * The hash table may be private or shared. For shared hash tables the directory
- * size is non-expansible, and we preallocate all elements (nelem). For private
- * hash tables, we preallocate elements only if the expected number of elements
- * is small (less than nelem_alloc).
+ * Compute the required initial memory allocation for a shared-memory
+ * hashtable with the given parameters. We need space for the HASHHDR
+ * and for the (non expansible) directory.
  */
 Size
-hash_get_size(const HASHCTL *info, int flags, long nelem, bool prealloc)
+hash_get_shared_size(HASHCTL *info, int flags)
 {
-    int         nbuckets;
-    int         nsegs;
-    int         num_partitions;
-    long        ssize;
-    long        dsize;
-    Size        elementSize = HASH_ELEMENT_SIZE(info);
-    long        nelem_prealloc = 0;
-
-#ifdef USE_ASSERT_CHECKING
-    /* shared hash tables have a non-expansible directory */
-    if (flags & HASH_SHARED_MEM)
-    {
-        Assert(flags & HASH_DIRSIZE);
-        Assert(info->dsize == info->max_dsize);
-        Assert(prealloc);
-    }
-#endif
-
-    /* Non-shared hash tables may not specify dir size */
-    if (flags & HASH_DIRSIZE)
-        dsize = info->dsize;
-    else
-        dsize = DEF_DIRSIZE;
-
-    if (flags & HASH_SEGMENT)
-        ssize = info->ssize;
-    else
-        ssize = DEF_SEGSIZE;
-
-    if (flags & HASH_PARTITION)
-    {
-        num_partitions = info->num_partitions;
-
-        /* Number of entries should be at least equal to the freelists */
-        if (nelem < NUM_FREELISTS)
-            nelem = NUM_FREELISTS;
-    }
-    else
-        num_partitions = 0;
-
-    compute_buckets_and_segs(nelem, num_partitions, ssize,
-                             &nbuckets, &nsegs);
-
-    /* when prealloc is false, no elements are to be pre-allocated */
-    if (prealloc)
-        nelem_prealloc = nelem;
-
-    return sizeof(HASHHDR) + dsize * sizeof(HASHSEGMENT)
-        + sizeof(HASHBUCKET) * ssize * nsegs
-        + nelem_prealloc * elementSize;
+    Assert(flags & HASH_DIRSIZE);
+    Assert(info->dsize == info->max_dsize);
+    return sizeof(HASHHDR) + info->dsize * sizeof(HASHSEGMENT);
 }
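
The accounting difference between the two sides of this hunk, as a toy calculation (a standalone sketch with invented parameter values, not PostgreSQL code): the reverted hash_get_size() charged buckets and preallocated elements on top of the header-plus-directory that the restored hash_get_shared_size() charges.

#include <stdio.h>

int
main(void)
{
    /* invented example parameters */
    size_t  hashhdr = 4096;     /* stand-in for sizeof(HASHHDR) */
    long    dsize = 256;        /* directory entries */
    long    ssize = 256;        /* buckets per segment */
    long    nsegs = 4;
    long    nelem = 1000;
    size_t  element_size = 72;  /* MAXALIGNed header + entry */

    /* restored hash_get_shared_size(): header + directory only */
    size_t  shared = hashhdr + dsize * sizeof(void *);

    /* reverted hash_get_size(): buckets and elements were counted too */
    size_t  combined = shared + sizeof(void *) * ssize * nsegs
        + nelem * element_size;

    printf("header+dir: %zu bytes, full chunk: %zu bytes\n",
           shared, combined);
    return 0;
}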
@@ -1394,7 +1285,7 @@ get_hash_entry(HTAB *hashp, int freelist_idx)
          * Failing because the needed element is in a different freelist is
          * not acceptable.
          */
-        if ((newElement = element_alloc(hashp, hctl->nelem_alloc)) == NULL)
+        if (!element_alloc(hashp, hctl->nelem_alloc, freelist_idx))
         {
             int         borrow_from_idx;
@@ -1431,7 +1322,6 @@ get_hash_entry(HTAB *hashp, int freelist_idx)
             /* no elements available to borrow either, so out of memory */
             return NULL;
         }
-
-        element_add(hashp, newElement, hctl->nelem_alloc, freelist_idx);
     }
 
     /* remove entry from freelist, bump nentries */
@@ -1810,43 +1700,29 @@ seg_alloc(HTAB *hashp)
 }
 
 /*
- * allocate some new elements
+ * allocate some new elements and link them into the indicated free list
  */
-static HASHELEMENT *
-element_alloc(HTAB *hashp, int nelem)
-{
-    HASHHDR    *hctl = hashp->hctl;
-    Size        elementSize;
-    HASHELEMENT *firstElement = NULL;
-
-    if (hashp->isfixed)
-        return NULL;
-
-    /* Each element has a HASHELEMENT header plus user data. */
-    elementSize = HASH_ELEMENT_SIZE(hctl);
-
-    CurrentDynaHashCxt = hashp->hcxt;
-    firstElement = (HASHELEMENT *) hashp->alloc(nelem * elementSize);
-
-    if (!firstElement)
-        return NULL;
-
-    return firstElement;
-}
-
-/*
- * link the elements allocated by element_alloc into the indicated free list
- */
-static void
-element_add(HTAB *hashp, HASHELEMENT *firstElement, int nelem, int freelist_idx)
+static bool
+element_alloc(HTAB *hashp, int nelem, int freelist_idx)
 {
     HASHHDR    *hctl = hashp->hctl;
     Size        elementSize;
+    HASHELEMENT *firstElement;
     HASHELEMENT *tmpElement;
     HASHELEMENT *prevElement;
     int         i;
 
+    if (hashp->isfixed)
+        return false;
+
     /* Each element has a HASHELEMENT header plus user data. */
-    elementSize = HASH_ELEMENT_SIZE(hctl);
+    elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(hctl->entrysize);
+
+    CurrentDynaHashCxt = hashp->hcxt;
+    firstElement = (HASHELEMENT *) hashp->alloc(nelem * elementSize);
+
+    if (!firstElement)
+        return false;
 
     /* prepare to link all the new entries into the freelist */
     prevElement = NULL;
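
To make the restored elementSize line concrete: both the element header and the user entry are rounded up to MAXALIGN boundaries before being summed. A standalone sketch (a stub struct standing in for HASHELEMENT, 8-byte MAXIMUM_ALIGNOF assumed, entry size invented):

#include <stdio.h>

#define MAXIMUM_ALIGNOF 8   /* typical 64-bit build */
#define MAXALIGN(LEN) \
    (((size_t) (LEN) + (MAXIMUM_ALIGNOF - 1)) & ~((size_t) (MAXIMUM_ALIGNOF - 1)))

struct element_stub             /* stand-in for HASHELEMENT */
{
    void       *link;
    unsigned int hashvalue;
};

int
main(void)
{
    size_t  entrysize = 52;     /* invented entry size */
    size_t  elementSize = MAXALIGN(sizeof(struct element_stub))
        + MAXALIGN(entrysize);

    printf("elementSize = %zu\n", elementSize); /* 16 + 56 = 72 */
    return 0;
}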
@@ -1868,6 +1744,8 @@ element_add(HTAB *hashp, HASHELEMENT *firstElement, int nelem, int freelist_idx)
     if (IS_PARTITIONED(hctl))
         SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
 
+    return true;
 }
 
 /*
@@ -2079,34 +1957,3 @@ AtEOSubXact_HashTables(bool isCommit, int nestDepth)
         }
     }
 }
-
-/*
- * Calculate the number of buckets and segments to store the given
- * number of elements in a hash table. Segments contain buckets which
- * in turn contain elements.
- */
-static void
-compute_buckets_and_segs(long nelem, long num_partitions, long ssize,
-                         int *nbuckets, int *nsegments)
-{
-    /*
-     * Allocate space for the next greater power of two number of buckets,
-     * assuming a desired maximum load factor of 1.
-     */
-    *nbuckets = next_pow2_int(nelem);
-
-    /*
-     * In a partitioned table, nbuckets must be at least equal to
-     * num_partitions; were it less, keys with apparently different partition
-     * numbers would map to the same bucket, breaking partition independence.
-     * (Normally nbuckets will be much bigger; this is just a safety check.)
-     */
-    while ((*nbuckets) < num_partitions)
-        (*nbuckets) <<= 1;
-
-    /*
-     * Figure number of directory segments needed, round up to a power of 2
-     */
-    *nsegments = ((*nbuckets) - 1) / ssize + 1;
-    *nsegments = next_pow2_int(*nsegments);
-}
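
The removed helper simply mirrored the computation that this revert moves back inline into init_htab(). A runnable sketch of that sizing logic with invented inputs (next_pow2_int simplified here; the real one guards against overflow):

#include <stdio.h>

static int
next_pow2_int(long num)
{
    int     i = 1;

    while (i < num)
        i <<= 1;
    return i;
}

int
main(void)
{
    long    nelem = 1000;       /* invented table size */
    long    ssize = 256;        /* segment size, as with DEF_SEGSIZE */
    long    num_partitions = 16;    /* invented partition count */

    int     nbuckets = next_pow2_int(nelem);    /* 1024 */

    while (nbuckets < num_partitions)           /* safety check */
        nbuckets <<= 1;

    /* number of directory segments, rounded up to a power of 2 */
    int     nsegs = next_pow2_int((nbuckets - 1) / ssize + 1);  /* 4 */

    printf("nbuckets=%d, nsegs=%d\n", nbuckets, nsegs);
    return 0;
}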

src/include/utils/hsearch.h

@@ -151,8 +151,7 @@ extern void hash_seq_term(HASH_SEQ_STATUS *status);
 extern void hash_freeze(HTAB *hashp);
 extern Size hash_estimate_size(long num_entries, Size entrysize);
 extern long hash_select_dirsize(long num_entries);
-extern Size hash_get_size(const HASHCTL *info, int flags,
-                          long nelem, bool prealloc);
+extern Size hash_get_shared_size(HASHCTL *info, int flags);
 extern void AtEOXact_HashTables(bool isCommit);
 extern void AtEOSubXact_HashTables(bool isCommit, int nestDepth);