mirror of
https://github.com/postgres/postgres.git
synced 2025-05-09 18:21:05 +03:00
Improve comments about partitioned hash table freelists.
While I couldn't find any live bugs in commit 44ca4022f, the comments seemed pretty far from adequate; in particular it was not made plain that "borrowing" entries from other freelists is critical for correctness. Try to improve the commentary. Also make a couple of very minor code style tweaks. Discussion: https://postgr.es/m/10593.1500670709@sss.pgh.pa.us
This commit is contained in:
parent
991c8b04fc
commit
ab2324fd46
@ -15,7 +15,8 @@
|
|||||||
* to hash_create. This prevents any attempt to split buckets on-the-fly.
|
* to hash_create. This prevents any attempt to split buckets on-the-fly.
|
||||||
* Therefore, each hash bucket chain operates independently, and no fields
|
* Therefore, each hash bucket chain operates independently, and no fields
|
||||||
* of the hash header change after init except nentries and freeList.
|
* of the hash header change after init except nentries and freeList.
|
||||||
* A partitioned table uses spinlocks to guard changes of those fields.
|
* (A partitioned table uses multiple copies of those fields, guarded by
|
||||||
|
* spinlocks, for additional concurrency.)
|
||||||
* This lets any subset of the hash buckets be treated as a separately
|
* This lets any subset of the hash buckets be treated as a separately
|
||||||
* lockable partition. We expect callers to use the low-order bits of a
|
* lockable partition. We expect callers to use the low-order bits of a
|
||||||
* lookup key's hash value as a partition number --- this will work because
|
* lookup key's hash value as a partition number --- this will work because
|
||||||
@ -121,15 +122,27 @@ typedef HASHELEMENT *HASHBUCKET;
|
|||||||
typedef HASHBUCKET *HASHSEGMENT;
|
typedef HASHBUCKET *HASHSEGMENT;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Using array of FreeListData instead of separate arrays of mutexes, nentries
|
* Per-freelist data.
|
||||||
* and freeLists prevents, at least partially, sharing one cache line between
|
*
|
||||||
* different mutexes (see below).
|
* In a partitioned hash table, each freelist is associated with a specific
|
||||||
|
* set of hashcodes, as determined by the FREELIST_IDX() macro below.
|
||||||
|
* nentries tracks the number of live hashtable entries having those hashcodes
|
||||||
|
* (NOT the number of entries in the freelist, as you might expect).
|
||||||
|
*
|
||||||
|
* The coverage of a freelist might be more or less than one partition, so it
|
||||||
|
* needs its own lock rather than relying on caller locking. Relying on that
|
||||||
|
* wouldn't work even if the coverage was the same, because of the occasional
|
||||||
|
* need to "borrow" entries from another freelist; see get_hash_entry().
|
||||||
|
*
|
||||||
|
* Using an array of FreeListData instead of separate arrays of mutexes,
|
||||||
|
* nentries and freeLists helps to reduce sharing of cache lines between
|
||||||
|
* different mutexes.
|
||||||
*/
|
*/
|
||||||
typedef struct
|
typedef struct
|
||||||
{
|
{
|
||||||
slock_t mutex; /* spinlock */
|
slock_t mutex; /* spinlock for this freelist */
|
||||||
long nentries; /* number of entries */
|
long nentries; /* number of entries in associated buckets */
|
||||||
HASHELEMENT *freeList; /* list of free elements */
|
HASHELEMENT *freeList; /* chain of free elements */
|
||||||
} FreeListData;
|
} FreeListData;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -143,12 +156,14 @@ typedef struct
|
|||||||
struct HASHHDR
|
struct HASHHDR
|
||||||
{
|
{
|
||||||
/*
|
/*
|
||||||
* The freelist can become a point of contention on high-concurrency hash
|
* The freelist can become a point of contention in high-concurrency hash
|
||||||
* tables, so we use an array of freelist, each with its own mutex and
|
* tables, so we use an array of freelists, each with its own mutex and
|
||||||
* nentries count, instead of just a single one.
|
* nentries count, instead of just a single one. Although the freelists
|
||||||
|
* normally operate independently, we will scavenge entries from freelists
|
||||||
|
* other than a hashcode's default freelist when necessary.
|
||||||
*
|
*
|
||||||
* If hash table is not partitioned only freeList[0] is used and spinlocks
|
* If the hash table is not partitioned, only freeList[0] is used and its
|
||||||
* are not used at all.
|
* spinlock is not used at all; callers' locking is assumed sufficient.
|
||||||
*/
|
*/
|
||||||
FreeListData freeList[NUM_FREELISTS];
|
FreeListData freeList[NUM_FREELISTS];
|
||||||
|
|
||||||
@ -184,7 +199,7 @@ struct HASHHDR
|
|||||||
#define IS_PARTITIONED(hctl) ((hctl)->num_partitions != 0)
|
#define IS_PARTITIONED(hctl) ((hctl)->num_partitions != 0)
|
||||||
|
|
||||||
#define FREELIST_IDX(hctl, hashcode) \
|
#define FREELIST_IDX(hctl, hashcode) \
|
||||||
(IS_PARTITIONED(hctl) ? hashcode % NUM_FREELISTS : 0)
|
(IS_PARTITIONED(hctl) ? (hashcode) % NUM_FREELISTS : 0)
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Top control structure for a hashtable --- in a shared table, each backend
|
* Top control structure for a hashtable --- in a shared table, each backend
|
||||||
@ -506,8 +521,8 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
|
|||||||
nelem_alloc_first;
|
nelem_alloc_first;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If hash table is partitioned all freeLists have equal number of
|
* If the hash table is partitioned, give each freelist an equal share of
|
||||||
* elements. Otherwise only freeList[0] is used.
|
* the initial allocation. Otherwise only freeList[0] is used.
|
||||||
*/
|
*/
|
||||||
if (IS_PARTITIONED(hashp->hctl))
|
if (IS_PARTITIONED(hashp->hctl))
|
||||||
freelist_partitions = NUM_FREELISTS;
|
freelist_partitions = NUM_FREELISTS;
|
||||||
@ -515,10 +530,13 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
|
|||||||
freelist_partitions = 1;
|
freelist_partitions = 1;
|
||||||
|
|
||||||
nelem_alloc = nelem / freelist_partitions;
|
nelem_alloc = nelem / freelist_partitions;
|
||||||
if (nelem_alloc == 0)
|
if (nelem_alloc <= 0)
|
||||||
nelem_alloc = 1;
|
nelem_alloc = 1;
|
||||||
|
|
||||||
/* Make sure all memory will be used */
|
/*
|
||||||
|
* Make sure we'll allocate all the requested elements; freeList[0]
|
||||||
|
* gets the excess if the request isn't divisible by NUM_FREELISTS.
|
||||||
|
*/
|
||||||
if (nelem_alloc * freelist_partitions < nelem)
|
if (nelem_alloc * freelist_partitions < nelem)
|
||||||
nelem_alloc_first =
|
nelem_alloc_first =
|
||||||
nelem - nelem_alloc * (freelist_partitions - 1);
|
nelem - nelem_alloc * (freelist_partitions - 1);
|
||||||
@ -620,7 +638,7 @@ init_htab(HTAB *hashp, long nelem)
|
|||||||
int i;
|
int i;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* initialize mutex if it's a partitioned table
|
* initialize mutexes if it's a partitioned table
|
||||||
*/
|
*/
|
||||||
if (IS_PARTITIONED(hctl))
|
if (IS_PARTITIONED(hctl))
|
||||||
for (i = 0; i < NUM_FREELISTS; i++)
|
for (i = 0; i < NUM_FREELISTS; i++)
|
||||||
@ -902,6 +920,7 @@ hash_search_with_hash_value(HTAB *hashp,
|
|||||||
bool *foundPtr)
|
bool *foundPtr)
|
||||||
{
|
{
|
||||||
HASHHDR *hctl = hashp->hctl;
|
HASHHDR *hctl = hashp->hctl;
|
||||||
|
int freelist_idx = FREELIST_IDX(hctl, hashvalue);
|
||||||
Size keysize;
|
Size keysize;
|
||||||
uint32 bucket;
|
uint32 bucket;
|
||||||
long segment_num;
|
long segment_num;
|
||||||
@ -910,7 +929,6 @@ hash_search_with_hash_value(HTAB *hashp,
|
|||||||
HASHBUCKET currBucket;
|
HASHBUCKET currBucket;
|
||||||
HASHBUCKET *prevBucketPtr;
|
HASHBUCKET *prevBucketPtr;
|
||||||
HashCompareFunc match;
|
HashCompareFunc match;
|
||||||
int freelist_idx = FREELIST_IDX(hctl, hashvalue);
|
|
||||||
|
|
||||||
#if HASH_STATISTICS
|
#if HASH_STATISTICS
|
||||||
hash_accesses++;
|
hash_accesses++;
|
||||||
@ -993,13 +1011,14 @@ hash_search_with_hash_value(HTAB *hashp,
|
|||||||
if (IS_PARTITIONED(hctl))
|
if (IS_PARTITIONED(hctl))
|
||||||
SpinLockAcquire(&(hctl->freeList[freelist_idx].mutex));
|
SpinLockAcquire(&(hctl->freeList[freelist_idx].mutex));
|
||||||
|
|
||||||
|
/* decrement the appropriate nentries counter for the deleted record. */
|
||||||
Assert(hctl->freeList[freelist_idx].nentries > 0);
|
Assert(hctl->freeList[freelist_idx].nentries > 0);
|
||||||
hctl->freeList[freelist_idx].nentries--;
|
hctl->freeList[freelist_idx].nentries--;
|
||||||
|
|
||||||
/* remove record from hash bucket's chain. */
|
/* remove record from hash bucket's chain. */
|
||||||
*prevBucketPtr = currBucket->link;
|
*prevBucketPtr = currBucket->link;
|
||||||
|
|
||||||
/* add the record to the freelist for this table. */
|
/* add the record to the appropriate freelist. */
|
||||||
currBucket->link = hctl->freeList[freelist_idx].freeList;
|
currBucket->link = hctl->freeList[freelist_idx].freeList;
|
||||||
hctl->freeList[freelist_idx].freeList = currBucket;
|
hctl->freeList[freelist_idx].freeList = currBucket;
|
||||||
|
|
||||||
@ -1220,14 +1239,15 @@ hash_update_hash_key(HTAB *hashp,
|
|||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* create a new entry if possible
|
* Allocate a new hashtable entry if possible; return NULL if out of memory.
|
||||||
|
* (Or, if the underlying space allocator throws error for out-of-memory,
|
||||||
|
* we won't return at all.)
|
||||||
*/
|
*/
|
||||||
static HASHBUCKET
|
static HASHBUCKET
|
||||||
get_hash_entry(HTAB *hashp, int freelist_idx)
|
get_hash_entry(HTAB *hashp, int freelist_idx)
|
||||||
{
|
{
|
||||||
HASHHDR *hctl = hashp->hctl;
|
HASHHDR *hctl = hashp->hctl;
|
||||||
HASHBUCKET newElement;
|
HASHBUCKET newElement;
|
||||||
int borrow_from_idx;
|
|
||||||
|
|
||||||
for (;;)
|
for (;;)
|
||||||
{
|
{
|
||||||
@ -1244,19 +1264,32 @@ get_hash_entry(HTAB *hashp, int freelist_idx)
|
|||||||
if (IS_PARTITIONED(hctl))
|
if (IS_PARTITIONED(hctl))
|
||||||
SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
|
SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
|
||||||
|
|
||||||
/* no free elements. allocate another chunk of buckets */
|
/*
|
||||||
|
* No free elements in this freelist. In a partitioned table, there
|
||||||
|
* might be entries in other freelists, but to reduce contention we
|
||||||
|
* prefer to first try to get another chunk of buckets from the main
|
||||||
|
* shmem allocator. If that fails, though, we *MUST* root through all
|
||||||
|
* the other freelists before giving up. There are multiple callers
|
||||||
|
* that assume that they can allocate every element in the initially
|
||||||
|
* requested table size, or that deleting an element guarantees they
|
||||||
|
* can insert a new element, even if shared memory is entirely full.
|
||||||
|
* Failing because the needed element is in a different freelist is
|
||||||
|
* not acceptable.
|
||||||
|
*/
|
||||||
if (!element_alloc(hashp, hctl->nelem_alloc, freelist_idx))
|
if (!element_alloc(hashp, hctl->nelem_alloc, freelist_idx))
|
||||||
{
|
{
|
||||||
|
int borrow_from_idx;
|
||||||
|
|
||||||
if (!IS_PARTITIONED(hctl))
|
if (!IS_PARTITIONED(hctl))
|
||||||
return NULL; /* out of memory */
|
return NULL; /* out of memory */
|
||||||
|
|
||||||
/* try to borrow element from another partition */
|
/* try to borrow element from another freelist */
|
||||||
borrow_from_idx = freelist_idx;
|
borrow_from_idx = freelist_idx;
|
||||||
for (;;)
|
for (;;)
|
||||||
{
|
{
|
||||||
borrow_from_idx = (borrow_from_idx + 1) % NUM_FREELISTS;
|
borrow_from_idx = (borrow_from_idx + 1) % NUM_FREELISTS;
|
||||||
if (borrow_from_idx == freelist_idx)
|
if (borrow_from_idx == freelist_idx)
|
||||||
break;
|
break; /* examined all freelists, fail */
|
||||||
|
|
||||||
SpinLockAcquire(&(hctl->freeList[borrow_from_idx].mutex));
|
SpinLockAcquire(&(hctl->freeList[borrow_from_idx].mutex));
|
||||||
newElement = hctl->freeList[borrow_from_idx].freeList;
|
newElement = hctl->freeList[borrow_from_idx].freeList;
|
||||||
@ -1266,17 +1299,19 @@ get_hash_entry(HTAB *hashp, int freelist_idx)
|
|||||||
hctl->freeList[borrow_from_idx].freeList = newElement->link;
|
hctl->freeList[borrow_from_idx].freeList = newElement->link;
|
||||||
SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
|
SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
|
||||||
|
|
||||||
|
/* careful: count the new element in its proper freelist */
|
||||||
SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
|
SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
|
||||||
hctl->freeList[freelist_idx].nentries++;
|
hctl->freeList[freelist_idx].nentries++;
|
||||||
SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
|
SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
|
||||||
|
|
||||||
break;
|
return newElement;
|
||||||
}
|
}
|
||||||
|
|
||||||
SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
|
SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
|
||||||
}
|
}
|
||||||
|
|
||||||
return newElement;
|
/* no elements available to borrow either, so out of memory */
|
||||||
|
return NULL;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1300,15 +1335,15 @@ hash_get_num_entries(HTAB *hashp)
|
|||||||
long sum = hashp->hctl->freeList[0].nentries;
|
long sum = hashp->hctl->freeList[0].nentries;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* We currently don't bother with the mutex; it's only sensible to call
|
* We currently don't bother with acquiring the mutexes; it's only
|
||||||
* this function if you've got lock on all partitions of the table.
|
* sensible to call this function if you've got lock on all partitions of
|
||||||
|
* the table.
|
||||||
*/
|
*/
|
||||||
|
if (IS_PARTITIONED(hashp->hctl))
|
||||||
if (!IS_PARTITIONED(hashp->hctl))
|
{
|
||||||
return sum;
|
|
||||||
|
|
||||||
for (i = 1; i < NUM_FREELISTS; i++)
|
for (i = 1; i < NUM_FREELISTS; i++)
|
||||||
sum += hashp->hctl->freeList[i].nentries;
|
sum += hashp->hctl->freeList[i].nentries;
|
||||||
|
}
|
||||||
|
|
||||||
return sum;
|
return sum;
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user