diff --git a/src/backend/utils/hash/dynahash.c b/src/backend/utils/hash/dynahash.c
index ebd55da997e..a58a479ae56 100644
--- a/src/backend/utils/hash/dynahash.c
+++ b/src/backend/utils/hash/dynahash.c
@@ -15,7 +15,8 @@
  * to hash_create. This prevents any attempt to split buckets on-the-fly.
  * Therefore, each hash bucket chain operates independently, and no fields
  * of the hash header change after init except nentries and freeList.
- * A partitioned table uses spinlocks to guard changes of those fields.
+ * (A partitioned table uses multiple copies of those fields, guarded by
+ * spinlocks, for additional concurrency.)
  * This lets any subset of the hash buckets be treated as a separately
  * lockable partition. We expect callers to use the low-order bits of a
  * lookup key's hash value as a partition number --- this will work because
@@ -121,15 +122,27 @@ typedef HASHELEMENT *HASHBUCKET;
 typedef HASHBUCKET *HASHSEGMENT;
 
 /*
- * Using array of FreeListData instead of separate arrays of mutexes, nentries
- * and freeLists prevents, at least partially, sharing one cache line between
- * different mutexes (see below).
+ * Per-freelist data.
+ *
+ * In a partitioned hash table, each freelist is associated with a specific
+ * set of hashcodes, as determined by the FREELIST_IDX() macro below.
+ * nentries tracks the number of live hashtable entries having those hashcodes
+ * (NOT the number of entries in the freelist, as you might expect).
+ *
+ * The coverage of a freelist might be more or less than one partition, so it
+ * needs its own lock rather than relying on caller locking. Relying on that
+ * wouldn't work even if the coverage was the same, because of the occasional
+ * need to "borrow" entries from another freelist; see get_hash_entry().
+ *
+ * Using an array of FreeListData instead of separate arrays of mutexes,
+ * nentries and freeLists helps to reduce sharing of cache lines between
+ * different mutexes.
  */
 typedef struct
 {
-	slock_t		mutex;			/* spinlock */
-	long		nentries;		/* number of entries */
-	HASHELEMENT *freeList;		/* list of free elements */
+	slock_t		mutex;			/* spinlock for this freelist */
+	long		nentries;		/* number of entries in associated buckets */
+	HASHELEMENT *freeList;		/* chain of free elements */
 } FreeListData;
 
 /*
@@ -143,12 +156,14 @@ typedef struct
 struct HASHHDR
 {
 	/*
-	 * The freelist can become a point of contention on high-concurrency hash
-	 * tables, so we use an array of freelist, each with its own mutex and
-	 * nentries count, instead of just a single one.
+	 * The freelist can become a point of contention in high-concurrency hash
+	 * tables, so we use an array of freelists, each with its own mutex and
+	 * nentries count, instead of just a single one. Although the freelists
+	 * normally operate independently, we will scavenge entries from freelists
+	 * other than a hashcode's default freelist when necessary.
 	 *
-	 * If hash table is not partitioned only freeList[0] is used and spinlocks
-	 * are not used at all.
+	 * If the hash table is not partitioned, only freeList[0] is used and its
+	 * spinlock is not used at all; callers' locking is assumed sufficient.
 	 */
 	FreeListData freeList[NUM_FREELISTS];
 
@@ -184,7 +199,7 @@ struct HASHHDR
 #define IS_PARTITIONED(hctl) ((hctl)->num_partitions != 0)
 
 #define FREELIST_IDX(hctl, hashcode) \
-	(IS_PARTITIONED(hctl) ? hashcode % NUM_FREELISTS : 0)
+	(IS_PARTITIONED(hctl) ? (hashcode) % NUM_FREELISTS : 0)
 
 /*
  * Top control structure for a hashtable --- in a shared table, each backend
@@ -506,8 +521,8 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
 					nelem_alloc_first;
 
 		/*
-		 * If hash table is partitioned all freeLists have equal number of
-		 * elements. Otherwise only freeList[0] is used.
+		 * If hash table is partitioned, give each freelist an equal share of
+		 * the initial allocation. Otherwise only freeList[0] is used.
 		 */
 		if (IS_PARTITIONED(hashp->hctl))
 			freelist_partitions = NUM_FREELISTS;
@@ -515,10 +530,13 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
 			freelist_partitions = 1;
 
 		nelem_alloc = nelem / freelist_partitions;
-		if (nelem_alloc == 0)
+		if (nelem_alloc <= 0)
 			nelem_alloc = 1;
 
-		/* Make sure all memory will be used */
+		/*
+		 * Make sure we'll allocate all the requested elements; freeList[0]
+		 * gets the excess if the request isn't divisible by NUM_FREELISTS.
+		 */
 		if (nelem_alloc * freelist_partitions < nelem)
 			nelem_alloc_first =
 				nelem - nelem_alloc * (freelist_partitions - 1);
@@ -620,7 +638,7 @@ init_htab(HTAB *hashp, long nelem)
 	int			i;
 
 	/*
-	 * initialize mutex if it's a partitioned table
+	 * initialize mutexes if it's a partitioned table
 	 */
 	if (IS_PARTITIONED(hctl))
 		for (i = 0; i < NUM_FREELISTS; i++)
@@ -902,6 +920,7 @@ hash_search_with_hash_value(HTAB *hashp,
 							bool *foundPtr)
 {
 	HASHHDR    *hctl = hashp->hctl;
+	int			freelist_idx = FREELIST_IDX(hctl, hashvalue);
 	Size		keysize;
 	uint32		bucket;
 	long		segment_num;
@@ -910,7 +929,6 @@ hash_search_with_hash_value(HTAB *hashp,
 	HASHBUCKET	currBucket;
 	HASHBUCKET *prevBucketPtr;
 	HashCompareFunc match;
-	int			freelist_idx = FREELIST_IDX(hctl, hashvalue);
 
 #if HASH_STATISTICS
 	hash_accesses++;
@@ -993,13 +1011,14 @@ hash_search_with_hash_value(HTAB *hashp,
 			if (IS_PARTITIONED(hctl))
 				SpinLockAcquire(&(hctl->freeList[freelist_idx].mutex));
 
+			/* delete the record from the appropriate nentries counter. */
 			Assert(hctl->freeList[freelist_idx].nentries > 0);
 			hctl->freeList[freelist_idx].nentries--;
 
 			/* remove record from hash bucket's chain. */
 			*prevBucketPtr = currBucket->link;
 
-			/* add the record to the freelist for this table. */
+			/* add the record to the appropriate freelist. */
 			currBucket->link = hctl->freeList[freelist_idx].freeList;
 			hctl->freeList[freelist_idx].freeList = currBucket;
 
@@ -1220,14 +1239,15 @@ hash_update_hash_key(HTAB *hashp,
 }
 
 /*
- * create a new entry if possible
+ * Allocate a new hashtable entry if possible; return NULL if out of memory.
+ * (Or, if the underlying space allocator throws error for out-of-memory,
+ * we won't return at all.)
  */
static HASHBUCKET
 get_hash_entry(HTAB *hashp, int freelist_idx)
 {
 	HASHHDR    *hctl = hashp->hctl;
 	HASHBUCKET	newElement;
-	int			borrow_from_idx;
 
 	for (;;)
 	{
@@ -1244,19 +1264,32 @@ get_hash_entry(HTAB *hashp, int freelist_idx)
 		if (IS_PARTITIONED(hctl))
 			SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
 
-		/* no free elements. allocate another chunk of buckets */
+		/*
+		 * No free elements in this freelist. In a partitioned table, there
+		 * might be entries in other freelists, but to reduce contention we
+		 * prefer to first try to get another chunk of buckets from the main
+		 * shmem allocator. If that fails, though, we *MUST* root through all
+		 * the other freelists before giving up. There are multiple callers
+		 * that assume that they can allocate every element in the initially
+		 * requested table size, or that deleting an element guarantees they
+		 * can insert a new element, even if shared memory is entirely full.
+		 * Failing because the needed element is in a different freelist is
+		 * not acceptable.
+		 */
 		if (!element_alloc(hashp, hctl->nelem_alloc, freelist_idx))
 		{
+			int			borrow_from_idx;
+
 			if (!IS_PARTITIONED(hctl))
 				return NULL;	/* out of memory */
 
-			/* try to borrow element from another partition */
+			/* try to borrow element from another freelist */
 			borrow_from_idx = freelist_idx;
 			for (;;)
 			{
 				borrow_from_idx = (borrow_from_idx + 1) % NUM_FREELISTS;
 				if (borrow_from_idx == freelist_idx)
-					break;
+					break;		/* examined all freelists, fail */
 
 				SpinLockAcquire(&(hctl->freeList[borrow_from_idx].mutex));
 				newElement = hctl->freeList[borrow_from_idx].freeList;
@@ -1266,17 +1299,19 @@ get_hash_entry(HTAB *hashp, int freelist_idx)
 					hctl->freeList[borrow_from_idx].freeList = newElement->link;
 					SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
 
+					/* careful: count the new element in its proper freelist */
 					SpinLockAcquire(&hctl->freeList[freelist_idx].mutex);
 					hctl->freeList[freelist_idx].nentries++;
 					SpinLockRelease(&hctl->freeList[freelist_idx].mutex);
 
-					break;
+					return newElement;
 				}
 
 				SpinLockRelease(&(hctl->freeList[borrow_from_idx].mutex));
 			}
 
-			return newElement;
+			/* no elements available to borrow either, so out of memory */
+			return NULL;
 		}
 	}
 
@@ -1300,15 +1335,15 @@ hash_get_num_entries(HTAB *hashp)
 	long		sum = hashp->hctl->freeList[0].nentries;
 
 	/*
-	 * We currently don't bother with the mutex; it's only sensible to call
-	 * this function if you've got lock on all partitions of the table.
+	 * We currently don't bother with acquiring the mutexes; it's only
+	 * sensible to call this function if you've got lock on all partitions of
+	 * the table.
 	 */
-
-	if (!IS_PARTITIONED(hashp->hctl))
-		return sum;
-
-	for (i = 1; i < NUM_FREELISTS; i++)
-		sum += hashp->hctl->freeList[i].nentries;
+	if (IS_PARTITIONED(hashp->hctl))
+	{
+		for (i = 1; i < NUM_FREELISTS; i++)
+			sum += hashp->hctl->freeList[i].nentries;
+	}
 
 	return sum;
 }
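
Note (not part of the patch): the "coverage of a freelist might be more or
less than one partition" point is easiest to see numerically. Below is a
minimal standalone C sketch; NUM_FREELISTS = 32 matches dynahash.c, while
partition_of()/freelist_of() and the 16-partition table are illustrative
stand-ins, not the real macros.

#include <stdio.h>
#include <stdint.h>

#define NUM_FREELISTS 32		/* as in dynahash.c */

/* callers use the low-order bits of the hash as the partition number,
 * so num_partitions must be a power of 2 */
static uint32
partition_of(uint32 hashcode, uint32 num_partitions)
{
	return hashcode & (num_partitions - 1);
}

/* the patched FREELIST_IDX() computes (hashcode) % NUM_FREELISTS */
static uint32
freelist_of(uint32 hashcode)
{
	return hashcode % NUM_FREELISTS;
}

int
main(void)
{
	uint32		h;

	/*
	 * With 16 partitions and 32 freelists, each freelist's hashcodes fall
	 * inside one partition but every partition is served by two freelists;
	 * with 64 partitions each freelist instead spans two partitions.
	 * Either way no single partition lock covers exactly one freelist,
	 * which is why each freelist carries its own spinlock.
	 */
	for (h = 0; h < 4; h++)
		printf("hash %u -> partition %u, freelist %u\n",
			   h, partition_of(h, 16), freelist_of(h));
	return 0;
}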
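
Note (not part of the patch): the *MUST* in the new get_hash_entry() comment
is the key behavioral change. A toy single-threaded model of the borrowing
rule follows; spinlocks and element_alloc() are elided and all names are
illustrative, but the scan order matches the patched loop.

#include <stdio.h>

#define NUM_FREELISTS 32

typedef struct Element
{
	struct Element *link;
} Element;

static Element *freelist[NUM_FREELISTS];

static Element *
get_entry(int idx)
{
	Element    *e;
	int			i;

	/* fast path: pop from the hashcode's own freelist */
	if ((e = freelist[idx]) != NULL)
	{
		freelist[idx] = e->link;
		return e;
	}

	/*
	 * Pretend element_alloc() failed: shared memory is full.  We must now
	 * root through every other freelist before giving up, in the same
	 * wrap-around order as the patch.  (The real code also counts the
	 * borrowed element in freelist[idx]'s nentries, under its spinlock.)
	 */
	for (i = (idx + 1) % NUM_FREELISTS; i != idx; i = (i + 1) % NUM_FREELISTS)
	{
		if ((e = freelist[i]) != NULL)
		{
			freelist[i] = e->link;
			return e;			/* borrowed from freelist i */
		}
	}
	return NULL;				/* genuinely out of memory */
}

int
main(void)
{
	static Element e = {NULL};

	/* a deleted entry went back to freelist 7; an insert whose hashcode
	 * maps to freelist 3 must still succeed by borrowing it */
	freelist[7] = &e;
	printf("%s\n", get_entry(3) ? "borrowed OK" : "out of memory");
	return 0;
}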
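
Note (not part of the patch): the "careful: count the new element in its
proper freelist" comment and the hash_get_num_entries() rewrite rest on the
same invariant: each nentries counter tracks live entries for its hashcodes,
so the sum over all freelists is the table's entry count. A minimal model of
that bookkeeping, with illustrative names throughout:

#include <stdio.h>

#define NUM_FREELISTS 32

/* one live-entry counter per freelist, like HASHHDR->freeList[i].nentries */
static long nentries[NUM_FREELISTS];

/* an insert always counts in the hashcode's own freelist, even when the
 * element itself was borrowed from another freelist */
static void
count_insert(unsigned hashcode)
{
	nentries[hashcode % NUM_FREELISTS]++;
}

static void
count_delete(unsigned hashcode)
{
	nentries[hashcode % NUM_FREELISTS]--;
}

/* analogue of hash_get_num_entries(); in the real table the caller is
 * expected to hold locks on all partitions rather than take the mutexes */
static long
num_entries(void)
{
	long		sum = 0;
	int			i;

	for (i = 0; i < NUM_FREELISTS; i++)
		sum += nentries[i];
	return sum;
}

int
main(void)
{
	count_insert(3);
	count_insert(35);			/* 35 % 32 == 3: same freelist as 3 */
	count_delete(3);
	printf("live entries: %ld\n", num_entries());	/* prints 1 */
	return 0;
}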