
Adjust btree index build to not use shared buffers, thereby avoiding the
locking conflict against concurrent CHECKPOINT that was discussed a few
weeks ago.  Also, if not using WAL archiving (which is always true ATM
but won't be if PITR makes it into this release), there's no need to
WAL-log the index build process; it's sufficient to force-fsync the
completed index before commit.  This seems to gain about a factor of 2
in my tests, which is consistent with writing half as much data.  I did
not try it with WAL on a separate drive though --- probably the gain would
be a lot less in that scenario.
Author: Tom Lane
Date:   2004-06-02 17:28:18 +00:00
Parent: 4d0e47d5a9
Commit: 2095206de1

8 changed files with 304 additions and 214 deletions
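In outline, the build path the message describes works like this (a minimal
sketch, not the committed nbtsort.c code): pages are assembled in private
memory and written straight through the storage manager, bypassing shared
buffers, and when WAL archiving is off the per-page XLogInsert() is skipped
in favor of one fsync of the finished file before commit.  The smgrwrite()
and smgrimmedsync() calls follow the smgr API of this era but their exact
signatures should be treated as assumptions here, and log_page_image() is a
hypothetical stand-in for the per-page WAL logging.

    #include "postgres.h"
    #include "storage/bufpage.h"
    #include "storage/smgr.h"
    #include "utils/rel.h"

    /*
     * Sketch only: write one finished index page straight to disk through
     * the storage manager, bypassing shared buffers (and hence any
     * buffer-lock conflict with a concurrent CHECKPOINT).
     */
    static void
    build_write_page(Relation index, Page page, BlockNumber blkno, bool use_wal)
    {
        if (use_wal)
            log_page_image(index, blkno, page); /* hypothetical WAL helper */

        smgrwrite(index->rd_smgr, blkno, (char *) page, index->rd_istemp);
    }

    /*
     * Sketch only: if the pages were never WAL-logged, crash safety comes
     * from forcing the completed file to disk before the creating
     * transaction commits; a crash before commit leaves only an invisible,
     * doomed file behind.
     */
    static void
    build_finish(Relation index, bool use_wal)
    {
        if (!use_wal && !index->rd_istemp)
            smgrimmedsync(index->rd_smgr);
    }

The factor-of-2 claim in the message follows directly: with WAL logging,
every page is written twice (once into WAL, once into the index file), so
dropping the WAL copy halves the I/O --- unless WAL sits on a separate
drive, as the message notes.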

--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -9,7 +9,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.75 2004/04/21 18:24:25 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.76 2004/06/02 17:28:17 tgl Exp $
  *
  * NOTES
  *	  Postgres btree pages look like ordinary relation pages.  The opaque
@@ -31,8 +31,9 @@
 /*
  *	_bt_metapinit() -- Initialize the metadata page of a new btree.
  *
- * If markvalid is true, the index is immediately marked valid, else it
- * will be invalid until _bt_metaproot() is called.
+ * Note: this is actually not used for standard btree index building;
+ * nbtsort.c prefers not to make the metadata page valid until completion
+ * of build.
  *
  * Note: there's no real need for any locking here.  Since the transaction
  * creating the index hasn't committed yet, no one else can even see the index
@@ -40,12 +41,11 @@
  * not true, but we assume the caller holds sufficient locks on the index.)
  */
 void
-_bt_metapinit(Relation rel, bool markvalid)
+_bt_metapinit(Relation rel)
 {
 	Buffer		buf;
 	Page		pg;
 	BTMetaPageData *metad;
-	BTPageOpaque op;
 
 	if (RelationGetNumberOfBlocks(rel) != 0)
 		elog(ERROR, "cannot initialize non-empty btree index \"%s\"",
@@ -55,22 +55,12 @@ _bt_metapinit(Relation rel, bool markvalid)
 	Assert(BufferGetBlockNumber(buf) == BTREE_METAPAGE);
 	pg = BufferGetPage(buf);
+	_bt_initmetapage(pg, P_NONE, 0);
+	metad = BTPageGetMeta(pg);
 
 	/* NO ELOG(ERROR) from here till newmeta op is logged */
 	START_CRIT_SECTION();
 
-	_bt_pageinit(pg, BufferGetPageSize(buf));
-
-	metad = BTPageGetMeta(pg);
-	metad->btm_magic = markvalid ? BTREE_MAGIC : 0;
-	metad->btm_version = BTREE_VERSION;
-	metad->btm_root = P_NONE;
-	metad->btm_level = 0;
-	metad->btm_fastroot = P_NONE;
-	metad->btm_fastlevel = 0;
-
-	op = (BTPageOpaque) PageGetSpecialPointer(pg);
-	op->btpo_flags = BTP_META;
-
 	/* XLOG stuff */
 	if (!rel->rd_istemp)
 	{
@@ -90,7 +80,7 @@ _bt_metapinit(Relation rel, bool markvalid)
 		rdata[0].next = NULL;
 
 		recptr = XLogInsert(RM_BTREE_ID,
-							markvalid ? XLOG_BTREE_NEWMETA : XLOG_BTREE_INVALIDMETA,
+							XLOG_BTREE_NEWMETA,
 							rdata);
 
 		PageSetLSN(pg, recptr);
@@ -102,6 +92,29 @@ _bt_metapinit(Relation rel, bool markvalid)
 	WriteBuffer(buf);
 }
 
+/*
+ *	_bt_initmetapage() -- Fill a page buffer with a correct metapage image
+ */
+void
+_bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level)
+{
+	BTMetaPageData *metad;
+	BTPageOpaque metaopaque;
+
+	_bt_pageinit(page, BLCKSZ);
+
+	metad = BTPageGetMeta(page);
+	metad->btm_magic = BTREE_MAGIC;
+	metad->btm_version = BTREE_VERSION;
+	metad->btm_root = rootbknum;
+	metad->btm_level = level;
+	metad->btm_fastroot = rootbknum;
+	metad->btm_fastlevel = level;
+
+	metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);
+	metaopaque->btpo_flags = BTP_META;
+}
+
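Factoring the metapage-filling code out into _bt_initmetapage() is what lets
the sort-based build stamp a complete metapage image in private memory and
write it only once the tree below it is finished, so the index stays invalid
until the very end of the build.  A minimal usage sketch, reusing the
hypothetical build_write_page() helper from the sketch above (the helper
name and signature are assumptions, not the committed nbtsort.c code):

    #include "access/nbtree.h"

    /*
     * Sketch only: make the index valid as the last step of the build by
     * writing a freshly filled metapage image to block 0 (BTREE_METAPAGE).
     */
    static void
    build_finish_metapage(Relation index, BlockNumber rootblkno,
                          uint32 rootlevel, bool use_wal)
    {
        Page    metapage = (Page) palloc(BLCKSZ);

        /* btm_magic = BTREE_MAGIC is stamped here, and only now */
        _bt_initmetapage(metapage, rootblkno, rootlevel);
        build_write_page(index, metapage, BTREE_METAPAGE, use_wal);
        pfree(metapage);
    }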
 /*
  *	_bt_getroot() -- Get the root page of the btree.
  *
@@ -609,76 +622,6 @@ _bt_page_recyclable(Page page)
 	return false;
 }
 
-/*
- *	_bt_metaproot() -- Change the root page of the btree.
- *
- *		Lehman and Yao require that the root page move around in order to
- *		guarantee deadlock-free short-term, fine-granularity locking.  When
- *		we split the root page, we record the new parent in the metadata page
- *		for the relation.  This routine does the work.
- *
- *		No direct preconditions, but if you don't have the write lock on
- *		at least the old root page when you call this, you're making a big
- *		mistake.  On exit, metapage data is correct and we no longer have
- *		a pin or lock on the metapage.
- *
- * Actually this is not used for splitting on-the-fly anymore.  It's only used
- * in nbtsort.c at the completion of btree building, where we know we have
- * sole access to the index anyway.
- */
-void
-_bt_metaproot(Relation rel, BlockNumber rootbknum, uint32 level)
-{
-	Buffer		metabuf;
-	Page		metap;
-	BTPageOpaque metaopaque;
-	BTMetaPageData *metad;
-
-	metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
-	metap = BufferGetPage(metabuf);
-	metaopaque = (BTPageOpaque) PageGetSpecialPointer(metap);
-	Assert(metaopaque->btpo_flags & BTP_META);
-
-	/* NO ELOG(ERROR) from here till newmeta op is logged */
-	START_CRIT_SECTION();
-
-	metad = BTPageGetMeta(metap);
-	Assert(metad->btm_magic == BTREE_MAGIC || metad->btm_magic == 0);
-	metad->btm_magic = BTREE_MAGIC;		/* it's valid now for sure */
-	metad->btm_root = rootbknum;
-	metad->btm_level = level;
-	metad->btm_fastroot = rootbknum;
-	metad->btm_fastlevel = level;
-
-	/* XLOG stuff */
-	if (!rel->rd_istemp)
-	{
-		xl_btree_newmeta xlrec;
-		XLogRecPtr	recptr;
-		XLogRecData rdata[1];
-
-		xlrec.node = rel->rd_node;
-		xlrec.meta.root = metad->btm_root;
-		xlrec.meta.level = metad->btm_level;
-		xlrec.meta.fastroot = metad->btm_fastroot;
-		xlrec.meta.fastlevel = metad->btm_fastlevel;
-
-		rdata[0].buffer = InvalidBuffer;
-		rdata[0].data = (char *) &xlrec;
-		rdata[0].len = SizeOfBtreeNewmeta;
-		rdata[0].next = NULL;
-
-		recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWMETA, rdata);
-
-		PageSetLSN(metap, recptr);
-		PageSetSUI(metap, ThisStartUpID);
-	}
-
-	END_CRIT_SECTION();
-
-	_bt_wrtbuf(rel, metabuf);
-}
-
 /*
  * Delete item(s) from a btree page.
  *
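As background for the Lehman & Yao comment in the deleted function: readers
never assume a fixed root location; they consult the metapage, which is why
"moving the root" only ever means updating block 0.  A simplified
reader-side sketch of that lookup (locking refinements and the empty-index
case are omitted; the real logic lives in _bt_getroot()):

    /*
     * Sketch only: locate the current "fast root" by reading the metapage.
     * Because the root can move upward on splits, block 0 is the single
     * authoritative pointer to it.
     */
    static BlockNumber
    peek_fast_root(Relation index)
    {
        Buffer      metabuf = _bt_getbuf(index, BTREE_METAPAGE, BT_READ);
        BTMetaPageData *metad = BTPageGetMeta(BufferGetPage(metabuf));
        BlockNumber fastroot;

        Assert(metad->btm_magic == BTREE_MAGIC);    /* index must be valid */
        fastroot = metad->btm_fastroot;
        _bt_relbuf(index, metabuf);
        return fastroot;
    }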