Fujii Masao 60369db86f Fix calculation of how much shared memory is required to store a TOC.
Commit ac883ac453 refactored shm_toc_estimate() but changed its calculation
of the shared memory size for a TOC incorrectly, so that too much memory
could be allocated.

Back-patch to v11 where the bug was introduced.

Author: Takayuki Tsunakawa
Discussion: https://postgr.es/m/TYAPR01MB2990BFB73170E2C4921E2C4DFEA80@TYAPR01MB2990.jpnprd01.prod.outlook.com
2021-01-15 12:45:25 +09:00
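
As a worked example of the corrected computation (figures assume a typical
64-bit layout, where offsetof(shm_toc, toc_entry) is 40 bytes,
sizeof(shm_toc_entry) is 16 bytes, and buffer alignment is 32 bytes): an
estimator holding 4 keys and 1024 bytes of chunk space yields
sz = 40 + 4 * 16 + 1024 = 1128 bytes, which shm_toc_estimate() then rounds
up once with BUFFERALIGN to 1152 bytes.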


/*-------------------------------------------------------------------------
 *
 * shm_toc.c
 *    shared memory segment table of contents
 *
 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/backend/storage/ipc/shm_toc.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "port/atomics.h"
#include "storage/shm_toc.h"
#include "storage/spin.h"

typedef struct shm_toc_entry
{
    uint64      key;            /* Arbitrary identifier */
    Size        offset;         /* Offset, in bytes, from TOC start */
} shm_toc_entry;

struct shm_toc
{
    uint64      toc_magic;      /* Magic number identifying this TOC */
    slock_t     toc_mutex;      /* Spinlock for mutual exclusion */
    Size        toc_total_bytes;    /* Bytes managed by this TOC */
    Size        toc_allocated_bytes;    /* Bytes allocated of those managed */
    uint32      toc_nentry;     /* Number of entries in TOC */
    shm_toc_entry toc_entry[FLEXIBLE_ARRAY_MEMBER];
};

/*
 * Initialize a region of shared memory with a table of contents.
 */
shm_toc *
shm_toc_create(uint64 magic, void *address, Size nbytes)
{
    shm_toc    *toc = (shm_toc *) address;

    Assert(nbytes > offsetof(shm_toc, toc_entry));
    toc->toc_magic = magic;
    SpinLockInit(&toc->toc_mutex);

    /*
     * The alignment code in shm_toc_allocate() assumes that the starting
     * value is buffer-aligned.
     */
    toc->toc_total_bytes = BUFFERALIGN_DOWN(nbytes);
    toc->toc_allocated_bytes = 0;
    toc->toc_nentry = 0;

    return toc;
}

/*
 * Attach to an existing table of contents.  If the magic number found at
 * the target address doesn't match our expectations, return NULL.
 */
shm_toc *
shm_toc_attach(uint64 magic, void *address)
{
    shm_toc    *toc = (shm_toc *) address;

    if (toc->toc_magic != magic)
        return NULL;

    Assert(toc->toc_total_bytes >= toc->toc_allocated_bytes);
    Assert(toc->toc_total_bytes > offsetof(shm_toc, toc_entry));

    return toc;
}

/*
 * Allocate shared memory from a segment managed by a table of contents.
 *
 * This is not a full-blown allocator; there's no way to free memory.  It's
 * just a way of dividing a single physical shared memory segment into logical
 * chunks that may be used for different purposes.
 *
 * We allocate backwards from the end of the segment, so that the TOC entries
 * can grow forward from the start of the segment.
 */
void *
shm_toc_allocate(shm_toc *toc, Size nbytes)
{
    volatile shm_toc *vtoc = toc;
    Size        total_bytes;
    Size        allocated_bytes;
    Size        nentry;
    Size        toc_bytes;

    /*
     * Make sure request is well-aligned.  XXX: MAXALIGN is not enough,
     * because atomic ops might need a wider alignment.  We don't have a
     * proper definition for the minimum to make atomic ops safe, but
     * BUFFERALIGN ought to be enough.
     */
    nbytes = BUFFERALIGN(nbytes);

    SpinLockAcquire(&toc->toc_mutex);

    total_bytes = vtoc->toc_total_bytes;
    allocated_bytes = vtoc->toc_allocated_bytes;
    nentry = vtoc->toc_nentry;
    toc_bytes = offsetof(shm_toc, toc_entry) + nentry * sizeof(shm_toc_entry)
        + allocated_bytes;

    /* Check for memory exhaustion and overflow. */
    if (toc_bytes + nbytes > total_bytes || toc_bytes + nbytes < toc_bytes)
    {
        SpinLockRelease(&toc->toc_mutex);
        ereport(ERROR,
                (errcode(ERRCODE_OUT_OF_MEMORY),
                 errmsg("out of shared memory")));
    }

    vtoc->toc_allocated_bytes += nbytes;
    SpinLockRelease(&toc->toc_mutex);

    return ((char *) toc) + (total_bytes - allocated_bytes - nbytes);
}
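
/*
 * To illustrate the comment above (a sketch, not part of the original file):
 * TOC entries grow forward from the header while chunks are carved off the
 * end, so the remaining free space always sits in the middle:
 *
 *   +--------+-------------------+ . . . free . . . +---------+---------+
 *   | header | toc_entry[0..n-1] |                  | chunk 1 | chunk 0 |
 *   +--------+-------------------+ . . . . . . . . .+---------+---------+
 *   ^ toc                                           toc + toc_total_bytes ^
 */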

/*
 * Return the number of bytes that can still be allocated.
 */
Size
shm_toc_freespace(shm_toc *toc)
{
    volatile shm_toc *vtoc = toc;
    Size        total_bytes;
    Size        allocated_bytes;
    Size        nentry;
    Size        toc_bytes;

    SpinLockAcquire(&toc->toc_mutex);
    total_bytes = vtoc->toc_total_bytes;
    allocated_bytes = vtoc->toc_allocated_bytes;
    nentry = vtoc->toc_nentry;
    SpinLockRelease(&toc->toc_mutex);

    toc_bytes = offsetof(shm_toc, toc_entry) + nentry * sizeof(shm_toc_entry);
    Assert(allocated_bytes + BUFFERALIGN(toc_bytes) <= total_bytes);
    return total_bytes - (allocated_bytes + BUFFERALIGN(toc_bytes));
}

/*
 * Insert a TOC entry.
 *
 * The idea here is that the process setting up the shared memory segment will
 * register the addresses of data structures within the segment using this
 * function.  Each data structure will be identified using a 64-bit key, which
 * is assumed to be a well-known or discoverable integer.  Other processes
 * accessing the shared memory segment can pass the same key to
 * shm_toc_lookup() to discover the addresses of those data structures.
 *
 * Since the shared memory segment may be mapped at different addresses within
 * different backends, we store relative rather than absolute pointers.
 *
 * This won't scale well to a large number of keys.  Hopefully, that isn't
 * necessary; if it proves to be, we might need to provide a more sophisticated
 * data structure here.  But the real idea here is just to give someone mapping
 * a dynamic shared memory segment the ability to find the bare minimum number
 * of pointers that they need to bootstrap.  If you're storing a lot of stuff
 * in the TOC, you're doing it wrong.
 */
void
shm_toc_insert(shm_toc *toc, uint64 key, void *address)
{
    volatile shm_toc *vtoc = toc;
    Size        total_bytes;
    Size        allocated_bytes;
    Size        nentry;
    Size        toc_bytes;
    Size        offset;

    /* Relativize pointer. */
    Assert(address > (void *) toc);
    offset = ((char *) address) - (char *) toc;

    SpinLockAcquire(&toc->toc_mutex);

    total_bytes = vtoc->toc_total_bytes;
    allocated_bytes = vtoc->toc_allocated_bytes;
    nentry = vtoc->toc_nentry;
    toc_bytes = offsetof(shm_toc, toc_entry) + nentry * sizeof(shm_toc_entry)
        + allocated_bytes;

    /* Check for memory exhaustion and overflow. */
    if (toc_bytes + sizeof(shm_toc_entry) > total_bytes ||
        toc_bytes + sizeof(shm_toc_entry) < toc_bytes ||
        nentry >= PG_UINT32_MAX)
    {
        SpinLockRelease(&toc->toc_mutex);
        ereport(ERROR,
                (errcode(ERRCODE_OUT_OF_MEMORY),
                 errmsg("out of shared memory")));
    }

    Assert(offset < total_bytes);
    vtoc->toc_entry[nentry].key = key;
    vtoc->toc_entry[nentry].offset = offset;

    /*
     * By placing a write barrier after filling in the entry and before
     * updating the number of entries, we make it safe to read the TOC
     * unlocked.
     */
    pg_write_barrier();
    vtoc->toc_nentry++;

    SpinLockRelease(&toc->toc_mutex);
}

/*
 * Look up a TOC entry.
 *
 * If the key is not found, returns NULL if noError is true, otherwise
 * throws elog(ERROR).
 *
 * Unlike the other functions in this file, this operation acquires no lock;
 * it uses only barriers.  It probably wouldn't hurt concurrency very much even
 * if it did get a lock, but since it's reasonably likely that a group of
 * worker processes could each read a series of entries from the same TOC
 * right around the same time, there seems to be some value in avoiding it.
 */
void *
shm_toc_lookup(shm_toc *toc, uint64 key, bool noError)
{
    uint32      nentry;
    uint32      i;

    /*
     * Read the number of entries before we examine any entry.  We assume
     * that reading a uint32 is atomic.
     */
    nentry = toc->toc_nentry;
    pg_read_barrier();

    /* Now search for a matching entry. */
    for (i = 0; i < nentry; ++i)
    {
        if (toc->toc_entry[i].key == key)
            return ((char *) toc) + toc->toc_entry[i].offset;
    }

    /* No matching entry was found. */
    if (!noError)
        elog(ERROR, "could not find key " UINT64_FORMAT " in shm TOC at %p",
             key, toc);
    return NULL;
}

/*
 * Estimate how much shared memory will be required to store a TOC and its
 * dependent data structures.
 */
Size
shm_toc_estimate(shm_toc_estimator *e)
{
    Size        sz;

    sz = offsetof(shm_toc, toc_entry);
    sz = add_size(sz, mul_size(e->number_of_keys, sizeof(shm_toc_entry)));
    sz = add_size(sz, e->space_for_chunks);

    return BUFFERALIGN(sz);
}