In order to have the option to use O_DIRECT/FILE_FLAG_NO_BUFFERING in a
later commit, we need the addresses of user space buffers to be well
aligned. The exact requirements vary by OS and file system (typically
sectors and/or memory pages). The address alignment size is set to
4096, which is enough for currently known systems: it matches modern
sectors and common memory page size. There is no standard governing
O_DIRECT's requirements so we might eventually have to reconsider this
with more information from the field or future systems.
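In code terms the chosen alignment is a single constant; a minimal
sketch, assuming only the PG_IO_ALIGN_SIZE name used later in this
commit and the 4096-byte value described above:

    /*
     * Alignment requirement for direct I/O buffers: large enough for
     * modern sectors and common memory page sizes.
     */
    #define PG_IO_ALIGN_SIZE 4096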
Aligning I/O buffers on memory pages is also known to improve regular
buffered I/O performance.
Three classes of I/O buffers for regular data pages are adjusted:
(1) Heap buffers are now allocated with the new palloc_aligned() or
MemoryContextAllocAligned() functions introduced by commit 439f6175.
(2) Stack buffers now use a new struct PGIOAlignedBlock to respect
PG_IO_ALIGN_SIZE, if possible with this compiler. (3) The buffer pool
is also aligned in shared memory. (A sketch of the first two classes
follows.)
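A sketch of the first two classes: palloc_aligned() and
PG_IO_ALIGN_SIZE appear in the code below, while the PGIOAlignedBlock
layout is illustrative only, assuming the pg_attribute_aligned()
macro is available on this compiler:

    /* (1) Heap allocation: request a suitably aligned buffer. */
    Page	page = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);

    /* (2) Stack buffer: the data member carries an alignment attribute
     * where the compiler supports one. */
    typedef union PGIOAlignedBlock
    {
    #ifdef pg_attribute_aligned
        pg_attribute_aligned(PG_IO_ALIGN_SIZE)
    #endif
        char	data[BLCKSZ];
    } PGIOAlignedBlock;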
WAL buffers were already aligned on XLOG_BLCKSZ. It's possible for
XLOG_BLCKSZ to be configured smaller than PG_IO_ALIGN_SIZE and thus
for O_DIRECT WAL writes to fail to be well aligned, but that's a
pre-existing condition and will be addressed by a later commit.
BufFiles are not yet addressed (there's no current plan to use O_DIRECT
for those, but they could potentially get some incidental speedup even
in plain buffered I/O operations through better alignment).
If we can't align stack objects suitably using the compiler extensions
we know about, we disable the use of O_DIRECT by setting PG_O_DIRECT to
0. This avoids the need to consider systems that have O_DIRECT but
can't align stack objects the way we want; such systems could in theory
be supported with more work but we don't currently know of any such
machines, so it's easier to pretend there is no O_DIRECT support
instead. That's an existing and tested class of system.
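Expressed as a preprocessor guard, the fallback might look like this
(a sketch: the real test for "can align stack objects" covers
whatever compiler extensions we know about, here abbreviated to
pg_attribute_aligned()):

    /* Without stack alignment support, pretend O_DIRECT doesn't exist. */
    #if defined(O_DIRECT) && defined(pg_attribute_aligned)
    #define PG_O_DIRECT O_DIRECT
    #else
    #define PG_O_DIRECT 0
    #endif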
Add assertions that all buffers passed into smgrread(), smgrwrite() and
smgrextend() are correctly aligned, unless PG_O_DIRECT is 0 (= stack
alignment tricks may be unavailable) or the block size has been set too
small to allow arrays of buffers to be all aligned.
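A sketch of such an assertion for a hypothetical buffer argument,
reusing the existing TYPEALIGN() macro (an address is aligned exactly
when rounding it up to the next PG_IO_ALIGN_SIZE boundary leaves it
unchanged); the #if approximates the two exceptions just listed:

    #if PG_O_DIRECT != 0 && BLCKSZ >= PG_IO_ALIGN_SIZE
    	Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer));
    #endif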
Author: Thomas Munro <thomas.munro@gmail.com>
Author: Andres Freund <andres@anarazel.de>
Reviewed-by: Justin Pryzby <pryzby@telsasoft.com>
Discussion: https://postgr.es/m/CA+hUKGK1X532hYqJ_MzFWt0n1zt8trz980D79WbjwnT-yYLZpg@mail.gmail.com
/*-------------------------------------------------------------------------
 *
 * spginsert.c
 *	  Externally visible index creation/insertion routines
 *
 * All the actual insertion logic is in spgdoinsert.c.
 *
 * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *			src/backend/access/spgist/spginsert.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"
|
|
|
|
#include "access/genam.h"
|
|
#include "access/spgist_private.h"
|
|
#include "access/spgxlog.h"
|
|
#include "access/tableam.h"
|
|
#include "access/xlog.h"
|
|
#include "access/xloginsert.h"
|
|
#include "catalog/index.h"
|
|
#include "miscadmin.h"
|
|
#include "storage/bufmgr.h"
|
|
#include "storage/smgr.h"
|
|
#include "utils/memutils.h"
|
|
#include "utils/rel.h"
|
|
|
|
|
|
typedef struct
{
	SpGistState spgstate;		/* SPGiST's working state */
	int64		indtuples;		/* total number of tuples indexed */
	MemoryContext tmpCtx;		/* per-tuple temporary context */
} SpGistBuildState;

/* Callback to process one heap tuple during table_index_build_scan */
static void
spgistBuildCallback(Relation index, ItemPointer tid, Datum *values,
					bool *isnull, bool tupleIsAlive, void *state)
{
	SpGistBuildState *buildstate = (SpGistBuildState *) state;
	MemoryContext oldCtx;

	/* Work in temp context, and reset it after each tuple */
	oldCtx = MemoryContextSwitchTo(buildstate->tmpCtx);

	/*
	 * Even though no concurrent insertions can be happening, we still might
	 * get a buffer-locking failure due to bgwriter or checkpointer taking a
	 * lock on some buffer.  So we need to be willing to retry.  We can flush
	 * any temp data when retrying.
	 */
	while (!spgdoinsert(index, &buildstate->spgstate, tid,
						values, isnull))
	{
		MemoryContextReset(buildstate->tmpCtx);
	}

	/* Update total tuple count */
	buildstate->indtuples += 1;

	MemoryContextSwitchTo(oldCtx);
	MemoryContextReset(buildstate->tmpCtx);
}

/*
 * Build an SP-GiST index.
 */
IndexBuildResult *
spgbuild(Relation heap, Relation index, IndexInfo *indexInfo)
{
	IndexBuildResult *result;
	double		reltuples;
	SpGistBuildState buildstate;
	Buffer		metabuffer,
				rootbuffer,
				nullbuffer;

	if (RelationGetNumberOfBlocks(index) != 0)
		elog(ERROR, "index \"%s\" already contains data",
			 RelationGetRelationName(index));

	/*
	 * Initialize the meta page and root pages
	 */
	metabuffer = SpGistNewBuffer(index);
	rootbuffer = SpGistNewBuffer(index);
	nullbuffer = SpGistNewBuffer(index);

	Assert(BufferGetBlockNumber(metabuffer) == SPGIST_METAPAGE_BLKNO);
	Assert(BufferGetBlockNumber(rootbuffer) == SPGIST_ROOT_BLKNO);
	Assert(BufferGetBlockNumber(nullbuffer) == SPGIST_NULL_BLKNO);

	START_CRIT_SECTION();

	SpGistInitMetapage(BufferGetPage(metabuffer));
	MarkBufferDirty(metabuffer);
	SpGistInitBuffer(rootbuffer, SPGIST_LEAF);
	MarkBufferDirty(rootbuffer);
	SpGistInitBuffer(nullbuffer, SPGIST_LEAF | SPGIST_NULLS);
	MarkBufferDirty(nullbuffer);

	END_CRIT_SECTION();

	UnlockReleaseBuffer(metabuffer);
	UnlockReleaseBuffer(rootbuffer);
	UnlockReleaseBuffer(nullbuffer);

	/*
	 * Now insert all the heap data into the index
	 */
	initSpGistState(&buildstate.spgstate, index);
	buildstate.spgstate.isBuild = true;
	buildstate.indtuples = 0;

	buildstate.tmpCtx = AllocSetContextCreate(CurrentMemoryContext,
											  "SP-GiST build temporary context",
											  ALLOCSET_DEFAULT_SIZES);

	reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
									   spgistBuildCallback, (void *) &buildstate,
									   NULL);

	MemoryContextDelete(buildstate.tmpCtx);

	SpGistUpdateMetaPage(index);

	/*
	 * We didn't write WAL records as we built the index, so if WAL-logging
	 * is required, write all pages to the WAL now.
	 */
	if (RelationNeedsWAL(index))
	{
		log_newpage_range(index, MAIN_FORKNUM,
						  0, RelationGetNumberOfBlocks(index),
						  true);
	}

	result = (IndexBuildResult *) palloc0(sizeof(IndexBuildResult));
	result->heap_tuples = reltuples;
	result->index_tuples = buildstate.indtuples;

	return result;
}

/*
 * Build an empty SPGiST index in the initialization fork
 */
void
spgbuildempty(Relation index)
{
	Page		page;

	/* Construct metapage. */
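	/*
	 * palloc_aligned() returns a buffer aligned on PG_IO_ALIGN_SIZE, as
	 * the smgrwrite() calls below assert and as direct I/O requires.
	 */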
	page = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
	SpGistInitMetapage(page);

	/*
	 * Write the page and log it unconditionally.  This is important
	 * particularly for indexes created on tablespaces and databases whose
	 * creation happened after the last redo pointer as recovery removes any
	 * of their existing content when the corresponding create records are
	 * replayed.
	 */
	PageSetChecksumInplace(page, SPGIST_METAPAGE_BLKNO);
	smgrwrite(RelationGetSmgr(index), INIT_FORKNUM, SPGIST_METAPAGE_BLKNO,
			  page, true);
	log_newpage(&(RelationGetSmgr(index))->smgr_rlocator.locator, INIT_FORKNUM,
				SPGIST_METAPAGE_BLKNO, page, true);

	/* Likewise for the root page. */
	SpGistInitPage(page, SPGIST_LEAF);

	PageSetChecksumInplace(page, SPGIST_ROOT_BLKNO);
	smgrwrite(RelationGetSmgr(index), INIT_FORKNUM, SPGIST_ROOT_BLKNO,
			  page, true);
	log_newpage(&(RelationGetSmgr(index))->smgr_rlocator.locator, INIT_FORKNUM,
				SPGIST_ROOT_BLKNO, page, true);

	/* Likewise for the null-tuples root page. */
	SpGistInitPage(page, SPGIST_LEAF | SPGIST_NULLS);

	PageSetChecksumInplace(page, SPGIST_NULL_BLKNO);
	smgrwrite(RelationGetSmgr(index), INIT_FORKNUM, SPGIST_NULL_BLKNO,
			  page, true);
	log_newpage(&(RelationGetSmgr(index))->smgr_rlocator.locator, INIT_FORKNUM,
				SPGIST_NULL_BLKNO, page, true);

	/*
	 * An immediate sync is required even if we xlog'd the pages, because the
	 * writes did not go through shared buffers and therefore a concurrent
	 * checkpoint may have moved the redo pointer past our xlog record.
	 */
	smgrimmedsync(RelationGetSmgr(index), INIT_FORKNUM);
}

/*
 * Insert one new tuple into an SPGiST index.
 */
bool
spginsert(Relation index, Datum *values, bool *isnull,
		  ItemPointer ht_ctid, Relation heapRel,
		  IndexUniqueCheck checkUnique,
		  bool indexUnchanged,
		  IndexInfo *indexInfo)
{
	SpGistState spgstate;
	MemoryContext oldCtx;
	MemoryContext insertCtx;

	insertCtx = AllocSetContextCreate(CurrentMemoryContext,
									  "SP-GiST insert temporary context",
									  ALLOCSET_DEFAULT_SIZES);
	oldCtx = MemoryContextSwitchTo(insertCtx);

	initSpGistState(&spgstate, index);

	/*
	 * We might have to repeat spgdoinsert() multiple times, if conflicts
	 * occur with concurrent insertions.  If so, reset the insertCtx each
	 * time to avoid cumulative memory consumption.  That means we also have
	 * to redo initSpGistState(), but it's cheap enough not to matter.
	 */
	while (!spgdoinsert(index, &spgstate, ht_ctid, values, isnull))
	{
		MemoryContextReset(insertCtx);
		initSpGistState(&spgstate, index);
	}

	SpGistUpdateMetaPage(index);

	MemoryContextSwitchTo(oldCtx);
	MemoryContextDelete(insertCtx);

	/* return false since we've not done any unique check */
	return false;
}