mirror of
https://github.com/postgres/postgres.git
synced 2025-08-31 17:02:12 +03:00
Postgres95 1.01 Distribution - Virgin Sources
This commit is contained in:
15
src/backend/access/nbtree/Makefile.inc
Normal file
15
src/backend/access/nbtree/Makefile.inc
Normal file
@@ -0,0 +1,15 @@
|
||||
#-------------------------------------------------------------------------
|
||||
#
|
||||
# Makefile.inc--
|
||||
# Makefile for access/nbtree (btree access methods)
|
||||
#
|
||||
# Copyright (c) 1994, Regents of the University of California
|
||||
#
|
||||
#
|
||||
# IDENTIFICATION
|
||||
# $Header: /cvsroot/pgsql/src/backend/access/nbtree/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:11 scrappy Exp $
|
||||
#
|
||||
#-------------------------------------------------------------------------
|
||||
|
||||
SUBSRCS+= nbtcompare.c nbtinsert.c nbtpage.c nbtree.c nbtscan.c nbtsearch.c \
|
||||
nbtstrat.c nbtutils.c nbtsort.c
|
68
src/backend/access/nbtree/README
Normal file
68
src/backend/access/nbtree/README
Normal file
@@ -0,0 +1,68 @@
|
||||
$Header: /cvsroot/pgsql/src/backend/access/nbtree/README,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
|
||||
|
||||
This directory contains a correct implementation of Lehman and Yao's
|
||||
btree management algorithm that supports concurrent access for Postgres.
|
||||
We have made the following changes in order to incorporate their algorithm
|
||||
into Postgres:
|
||||
|
||||
+ The requirement that all btree keys be unique is too onerous,
|
||||
but the algorithm won't work correctly without it. As a result,
|
||||
this implementation adds an OID (guaranteed to be unique) to
|
||||
every key in the index. This guarantees uniqueness within a set
|
||||
of duplicates. Space overhead is four bytes.
|
||||
|
||||
For this reason, when we're passed an index tuple to store by the
|
||||
common access method code, we allocate a larger one and copy the
|
||||
supplied tuple into it. No Postgres code outside of the btree
|
||||
access method knows about this xid or sequence number.
|
||||
|
||||
+ Lehman and Yao don't require read locks, but assume that in-
|
||||
memory copies of tree nodes are unshared. Postgres shares
|
||||
in-memory buffers among backends. As a result, we do page-
|
||||
level read locking on btree nodes in order to guarantee that
|
||||
no record is modified while we are examining it. This reduces
|
||||
concurrency but guarantees correct behavior.
|
||||
|
||||
+ Read locks on a page are held for as long as a scan has a pointer
|
||||
to the page. However, locks are always surrendered before the
|
||||
sibling page lock is acquired (for readers), so we remain deadlock-
|
||||
free. I will do a formal proof if I get bored anytime soon.
|
||||
|
||||
In addition, the following things are handy to know:
|
||||
|
||||
+ Page zero of every btree is a meta-data page. This page stores
|
||||
the location of the root page, a pointer to a list of free
|
||||
pages, and other stuff that's handy to know.
|
||||
|
||||
+ This algorithm doesn't really work, since it requires ordered
|
||||
writes, and UNIX doesn't support ordered writes.
|
||||
|
||||
+ There's one other case where we may screw up in this
|
||||
implementation. When we start a scan, we descend the tree
|
||||
to the key nearest the one in the qual, and once we get there,
|
||||
position ourselves correctly for the qual type (eg, <, >=, etc).
|
||||
If we happen to step off a page, decide we want to get back to
|
||||
it, and fetch the page again, and if some bad person has split
|
||||
the page and moved the last tuple we saw off of it, then the
|
||||
code complains about botched concurrency in an elog(WARN, ...)
|
||||
and gives up the ghost. This is the ONLY violation of Lehman
|
||||
and Yao's guarantee of correct behavior that I am aware of in
|
||||
this code.
|
||||
|
||||
Notes to operator class implementors:
|
||||
|
||||
With this implementation, we require the user to supply us with
|
||||
a procedure for pg_amproc. This procedure should take two keys
|
||||
A and B and return < 0, 0, or > 0 if A < B, A = B, or A > B,
|
||||
respectively. See the contents of that relation for the btree
|
||||
access method for some samples.
|
||||
|
||||
Notes to mao for implementation document:
|
||||
|
||||
On deletions, we need to adjust the position of active scans on
|
||||
the index. The code in nbtscan.c handles this. We don't need to
|
||||
do this for splits because of the way splits are handled; if they
|
||||
happen behind us, we'll automatically go to the next page, and if
|
||||
they happen in front of us, we're not affected by them. For
|
||||
insertions, if we inserted a tuple behind the current scan location
|
||||
on the current scan page, we move one space ahead.
|
173
src/backend/access/nbtree/nbtcompare.c
Normal file
173
src/backend/access/nbtree/nbtcompare.c
Normal file
@@ -0,0 +1,173 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* nbtcompare.c--
|
||||
* Comparison functions for btree access method.
|
||||
*
|
||||
* Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtcompare.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
|
||||
*
|
||||
* NOTES
|
||||
* These functions are stored in pg_amproc. For each operator class
|
||||
* defined on btrees, they compute
|
||||
*
|
||||
* compare(a, b):
|
||||
* < 0 if a < b,
|
||||
* = 0 if a == b,
|
||||
* > 0 if a > b.
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#include <string.h>
|
||||
#include "postgres.h"
|
||||
#include "utils/nabstime.h"
|
||||
|
||||
int32
|
||||
btint2cmp(int16 a, int16 b)
|
||||
{
|
||||
return ((int32) (a - b));
|
||||
}
|
||||
|
||||
int32
|
||||
btint4cmp(int32 a, int32 b)
|
||||
{
|
||||
return (a - b);
|
||||
}
|
||||
|
||||
int32
|
||||
btint24cmp(int16 a, int32 b)
|
||||
{
|
||||
return (((int32) a) - b);
|
||||
}
|
||||
|
||||
int32
|
||||
btint42cmp(int32 a, int16 b)
|
||||
{
|
||||
return (a - ((int32) b));
|
||||
}
|
||||
|
||||
int32
|
||||
btfloat4cmp(float32 a, float32 b)
|
||||
{
|
||||
if (*a > *b)
|
||||
return (1);
|
||||
else if (*a == *b)
|
||||
return (0);
|
||||
else
|
||||
return (-1);
|
||||
}
|
||||
|
||||
int32
|
||||
btfloat8cmp(float64 a, float64 b)
|
||||
{
|
||||
if (*a > *b)
|
||||
return (1);
|
||||
else if (*a == *b)
|
||||
return (0);
|
||||
else
|
||||
return (-1);
|
||||
}
|
||||
|
||||
int32
|
||||
btoidcmp(Oid a, Oid b)
|
||||
{
|
||||
if (a > b)
|
||||
return (1);
|
||||
else if (a == b)
|
||||
return (0);
|
||||
else
|
||||
return (-1);
|
||||
}
|
||||
|
||||
int32
|
||||
btabstimecmp(AbsoluteTime a, AbsoluteTime b)
|
||||
{
|
||||
if (AbsoluteTimeIsBefore(a, b))
|
||||
return (1);
|
||||
else if (AbsoluteTimeIsBefore(b, a))
|
||||
return (-1);
|
||||
else
|
||||
return (0);
|
||||
}
|
||||
|
||||
int32
|
||||
btcharcmp(char a, char b)
|
||||
{
|
||||
return ((int32) (a - b));
|
||||
}
|
||||
|
||||
int32
|
||||
btchar2cmp(uint16 a, uint16 b)
|
||||
{
|
||||
return (strncmp((char *) &a, (char *) &b, 2));
|
||||
}
|
||||
|
||||
int32
|
||||
btchar4cmp(uint32 a, uint32 b)
|
||||
{
|
||||
return (strncmp((char *) &a, (char *) &b, 4));
|
||||
}
|
||||
|
||||
int32
|
||||
btchar8cmp(char *a, char *b)
|
||||
{
|
||||
return (strncmp(a, b, 8));
|
||||
}
|
||||
|
||||
int32
|
||||
btchar16cmp(char *a, char *b)
|
||||
{
|
||||
return (strncmp(a, b, 16));
|
||||
}
|
||||
|
||||
int32
|
||||
btnamecmp(NameData *a, NameData *b)
|
||||
{
|
||||
return (strncmp(a->data, b->data, NAMEDATALEN));
|
||||
}
|
||||
|
||||
int32
|
||||
bttextcmp(struct varlena *a, struct varlena *b)
|
||||
{
|
||||
char *ap, *bp;
|
||||
int len;
|
||||
int res;
|
||||
|
||||
ap = VARDATA(a);
|
||||
bp = VARDATA(b);
|
||||
|
||||
/* len is the length of the shorter of the two strings */
|
||||
if ((len = VARSIZE(a)) > VARSIZE(b))
|
||||
len = VARSIZE(b);
|
||||
|
||||
/* len includes the four bytes in which string length is stored */
|
||||
len -= sizeof(VARSIZE(a));
|
||||
|
||||
/*
|
||||
* If the two strings differ in the first len bytes, or if they're
|
||||
* the same in the first len bytes and they're both len bytes long,
|
||||
* we're done.
|
||||
*/
|
||||
|
||||
res = 0;
|
||||
if (len > 0) {
|
||||
do {
|
||||
res = (int) (*ap++ - *bp++);
|
||||
len--;
|
||||
} while (res == 0 && len != 0);
|
||||
}
|
||||
|
||||
if (res != 0 || VARSIZE(a) == VARSIZE(b))
|
||||
return (res);
|
||||
|
||||
/*
|
||||
* The two strings are the same in the first len bytes, and they
|
||||
* are of different lengths.
|
||||
*/
|
||||
|
||||
if (VARSIZE(a) < VARSIZE(b))
|
||||
return (-1);
|
||||
else
|
||||
return (1);
|
||||
}
|
831
src/backend/access/nbtree/nbtinsert.c
Normal file
831
src/backend/access/nbtree/nbtinsert.c
Normal file
@@ -0,0 +1,831 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* nbtinsert.c--
|
||||
* Item insertion in Lehman and Yao btrees for Postgres.
|
||||
*
|
||||
* Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#include "postgres.h"
|
||||
|
||||
#include "storage/bufmgr.h"
|
||||
#include "storage/bufpage.h"
|
||||
|
||||
#include "utils/elog.h"
|
||||
#include "utils/palloc.h"
|
||||
#include "utils/rel.h"
|
||||
#include "utils/excid.h"
|
||||
|
||||
#include "access/heapam.h"
|
||||
#include "access/genam.h"
|
||||
#include "access/nbtree.h"
|
||||
|
||||
static InsertIndexResult _bt_insertonpg(Relation rel, Buffer buf, BTStack stack, int keysz, ScanKey scankey, BTItem btitem, BTItem afteritem);
|
||||
static Buffer _bt_split(Relation rel, Buffer buf);
|
||||
static OffsetNumber _bt_findsplitloc(Relation rel, Page page, OffsetNumber start, OffsetNumber maxoff, Size llimit);
|
||||
static void _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf);
|
||||
static OffsetNumber _bt_pgaddtup(Relation rel, Buffer buf, int keysz, ScanKey itup_scankey, Size itemsize, BTItem btitem, BTItem afteritem);
|
||||
static bool _bt_goesonpg(Relation rel, Buffer buf, Size keysz, ScanKey scankey, BTItem afteritem);
|
||||
static void _bt_updateitem(Relation rel, Size keysz, Buffer buf, Oid bti_oid, BTItem newItem);
|
||||
|
||||
/*
 *	_bt_doinsert() -- Handle insertion of a single btitem in the tree.
 *
 *	This routine is called by the public interface routines, btbuild
 *	and btinsert.  By here, btitem is filled in, and has a unique
 *	(xid, seqno) pair.
 *
 *	Returns an InsertIndexResult (palloc'd; caller frees) whose pointer
 *	records where the tuple landed.
 */
InsertIndexResult
_bt_doinsert(Relation rel, BTItem btitem)
{
    ScanKey itup_scankey;
    IndexTuple itup;
    BTStack stack;
    Buffer buf;
    BlockNumber blkno;
    int natts;
    InsertIndexResult res;

    itup = &(btitem->bti_itup);

    /* we need a scan key to do our search, so build one */
    itup_scankey = _bt_mkscankey(rel, itup);
    natts = rel->rd_rel->relnatts;

    /* find the page containing this key (returned buf is read-locked) */
    stack = _bt_search(rel, natts, itup_scankey, &buf);
    blkno = BufferGetBlockNumber(buf);

    /*
     * Trade in our read lock for a write lock.  The lock is dropped
     * entirely before the write lock is acquired, so a concurrent
     * splitter may change the page in between -- hence the moveright
     * below.
     */
    _bt_relbuf(rel, buf, BT_READ);
    buf = _bt_getbuf(rel, blkno, BT_WRITE);

    /*
     * If the page was split between the time that we surrendered our
     * read lock and acquired our write lock, then this page may no
     * longer be the right place for the key we want to insert.  In this
     * case, we need to move right in the tree.  See Lehman and Yao for
     * an excruciatingly precise description.
     */
    buf = _bt_moveright(rel, buf, natts, itup_scankey, BT_WRITE);

    /* do the insertion; _bt_insertonpg drops the pin and write lock */
    res = _bt_insertonpg(rel, buf, stack, natts, itup_scankey,
			 btitem, (BTItem) NULL);

    /* be tidy */
    _bt_freestack(stack);
    _bt_freeskey(itup_scankey);

    return (res);
}
|
||||
|
||||
/*
 *	_bt_insertonpg() -- Insert a tuple on a particular page in the index.
 *
 *	This recursive procedure does the following things:
 *
 *		+  if necessary, splits the target page.
 *		+  finds the right place to insert the tuple (taking into
 *		   account any changes induced by a split).
 *		+  inserts the tuple.
 *		+  if the page was split, pops the parent stack, and finds the
 *		   right place to insert the new child pointer (by walking
 *		   right using information stored in the parent stack).
 *		+  invoking itself with the appropriate tuple for the right
 *		   child page on the parent.
 *
 *	On entry, we must have the right buffer on which to do the
 *	insertion, and the buffer must be pinned and locked.  On return,
 *	we will have dropped both the pin and the write lock on the buffer.
 *
 *	The locking interactions in this code are critical.  You should
 *	grok Lehman and Yao's paper before making any changes.  In addition,
 *	you need to understand how we disambiguate duplicate keys in this
 *	implementation, in order to be able to find our location using
 *	L&Y "move right" operations.  Since we may insert duplicate user
 *	keys, and since these dups may propagate up the tree, we use the
 *	'afteritem' parameter to position ourselves correctly for the
 *	insertion on internal pages.
 */
static InsertIndexResult
_bt_insertonpg(Relation rel,
	       Buffer buf,
	       BTStack stack,
	       int keysz,
	       ScanKey scankey,
	       BTItem btitem,
	       BTItem afteritem)
{
    InsertIndexResult res;
    Page page;
    Buffer rbuf;
    Buffer pbuf;
    Page rpage;
    ScanKey newskey;
    BTItem ritem;
    BTPageOpaque rpageop;
    BlockNumber rbknum, itup_blkno;
    OffsetNumber itup_off;
    int itemsz;
    InsertIndexResult newres;
    BTItem new_item = (BTItem) NULL;
    BTItem lowLeftItem;

    page = BufferGetPage(buf);

    /* on-page size of the item: index tuple plus BTItem header overhead */
    itemsz = IndexTupleDSize(btitem->bti_itup)
	+ (sizeof(BTItemData) - sizeof(IndexTupleData));

    itemsz = DOUBLEALIGN(itemsz);	/* be safe, PageAddItem will do this
					   but we need to be consistent */

    if (PageGetFreeSpace(page) < itemsz) {

	/* split the buffer into left and right halves */
	rbuf = _bt_split(rel, buf);

	/* which new page (left half or right half) gets the tuple? */
	if (_bt_goesonpg(rel, buf, keysz, scankey, afteritem)) {
	    /* left page */
	    itup_off = _bt_pgaddtup(rel, buf, keysz, scankey,
				    itemsz, btitem, afteritem);
	    itup_blkno = BufferGetBlockNumber(buf);
	} else {
	    /* right page */
	    itup_off = _bt_pgaddtup(rel, rbuf, keysz, scankey,
				    itemsz, btitem, afteritem);
	    itup_blkno = BufferGetBlockNumber(rbuf);
	}

	/*
	 * By here,
	 *
	 *	+  our target page has been split;
	 *	+  the original tuple has been inserted;
	 *	+  we have write locks on both the old (left half) and new
	 *	   (right half) buffers, after the split; and
	 *	+  we have the key we want to insert into the parent.
	 *
	 * Do the parent insertion.  We need to hold onto the locks for
	 * the child pages until we locate the parent, but we can release
	 * them before doing the actual insertion (see Lehman and Yao for
	 * the reasoning).
	 */

	if (stack == (BTStack) NULL) {

	    /* create a new root node and release the split buffers */
	    _bt_newroot(rel, buf, rbuf);
	    _bt_relbuf(rel, buf, BT_WRITE);
	    _bt_relbuf(rel, rbuf, BT_WRITE);

	} else {

	    /* form a index tuple that points at the new right page */
	    rbknum = BufferGetBlockNumber(rbuf);
	    rpage = BufferGetPage(rbuf);
	    rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage);

	    /*
	     * By convention, the first entry (0) on every
	     * non-rightmost page is the high key for that page.  In
	     * order to get the lowest key on the new right page, we
	     * actually look at its second (1) entry.
	     */

	    if (! P_RIGHTMOST(rpageop)) {
		ritem = (BTItem) PageGetItem(rpage,
					     PageGetItemId(rpage, P_FIRSTKEY));
	    } else {
		ritem = (BTItem) PageGetItem(rpage,
					     PageGetItemId(rpage, P_HIKEY));
	    }

	    /* get a unique btitem for this key */
	    new_item = _bt_formitem(&(ritem->bti_itup));

	    ItemPointerSet(&(new_item->bti_itup.t_tid), rbknum, P_HIKEY);

	    /* find the parent buffer */
	    pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);

	    /*
	     * If the key of new_item is < than the key of the item
	     * in the parent page pointing to the left page
	     * (stack->bts_btitem), we have to update the latter key;
	     * otherwise the keys on the parent page wouldn't be
	     * monotonically increasing after we inserted the new
	     * pointer to the right page (new_item).  This only
	     * happens if our left page is the leftmost page and a
	     * new minimum key had been inserted before, which is not
	     * reflected in the parent page but didn't matter so
	     * far.  If there are duplicate keys and this new minimum
	     * key spills over to our new right page, we get an
	     * inconsistency if we don't update the left key in the
	     * parent page.
	     */

	    if (_bt_itemcmp(rel, keysz, stack->bts_btitem, new_item,
			    BTGreaterStrategyNumber)) {
		lowLeftItem =
		    (BTItem) PageGetItem(page,
					 PageGetItemId(page, P_FIRSTKEY));
		/* page must have right pointer after split */
		_bt_updateitem(rel, keysz, pbuf, stack->bts_btitem->bti_oid,
			       lowLeftItem);
	    }

	    /* don't need the children anymore */
	    _bt_relbuf(rel, buf, BT_WRITE);
	    _bt_relbuf(rel, rbuf, BT_WRITE);

	    /* recurse: insert the pointer to the new right page into
	     * the parent level, positioning after the old child item */
	    newskey = _bt_mkscankey(rel, &(new_item->bti_itup));
	    newres = _bt_insertonpg(rel, pbuf, stack->bts_parent,
				    keysz, newskey, new_item,
				    stack->bts_btitem);

	    /* be tidy */
	    pfree(newres);
	    pfree(newskey);
	    pfree(new_item);
	}
    } else {
	/* no split needed: just add the tuple and release the buffer */
	itup_off = _bt_pgaddtup(rel, buf, keysz, scankey,
				itemsz, btitem, afteritem);
	itup_blkno = BufferGetBlockNumber(buf);

	_bt_relbuf(rel, buf, BT_WRITE);
    }

    /* by here, the new tuple is inserted */
    res = (InsertIndexResult) palloc(sizeof(InsertIndexResultData));
    ItemPointerSet(&(res->pointerData), itup_blkno, itup_off);

    return (res);
}
|
||||
|
||||
/*
 *	_bt_split() -- split a page in the btree.
 *
 *	On entry, buf is the page to split, and is write-locked and pinned.
 *	Returns the new right sibling of buf, pinned and write-locked.  The
 *	pin and lock on buf are maintained.
 */
static Buffer
_bt_split(Relation rel, Buffer buf)
{
    Buffer rbuf;
    Page origpage;
    Page leftpage, rightpage;
    BTPageOpaque ropaque, lopaque, oopaque;
    Buffer sbuf;
    Page spage;
    BTPageOpaque sopaque;
    Size itemsz;
    ItemId itemid;
    BTItem item;
    OffsetNumber leftoff, rightoff;
    OffsetNumber start;
    OffsetNumber maxoff;
    OffsetNumber firstright;
    OffsetNumber i;
    Size llimit;

    /* new right sibling page; left half is rebuilt in a temp page */
    rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
    origpage = BufferGetPage(buf);
    leftpage = PageGetTempPage(origpage, sizeof(BTPageOpaqueData));
    rightpage = BufferGetPage(rbuf);

    _bt_pageinit(rightpage, BufferGetPageSize(rbuf));
    _bt_pageinit(leftpage, BufferGetPageSize(buf));

    /* init btree private data */
    oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage);
    lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage);
    ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage);

    /* if we're splitting this page, it won't be the root when we're done */
    oopaque->btpo_flags &= ~BTP_ROOT;
    lopaque->btpo_flags = ropaque->btpo_flags = oopaque->btpo_flags;
    /* link the two halves into the sibling chain at this level */
    lopaque->btpo_prev = oopaque->btpo_prev;
    ropaque->btpo_prev = BufferGetBlockNumber(buf);
    lopaque->btpo_next = BufferGetBlockNumber(rbuf);
    ropaque->btpo_next = oopaque->btpo_next;

    /*
     * If the page we're splitting is not the rightmost page at its
     * level in the tree, then the first (0) entry on the page is the
     * high key for the page.  We need to copy that to the right
     * half.  Otherwise (meaning the rightmost page case), we should
     * treat the line pointers beginning at zero as user data.
     *
     * We leave a blank space at the start of the line table for the
     * left page.  We'll come back later and fill it in with the high
     * key item we get from the right key.
     */

    leftoff = P_FIRSTKEY;
    /* NOTE(review): this btpo_next assignment repeats the one above;
     * it is redundant but harmless */
    ropaque->btpo_next = oopaque->btpo_next;
    if (! P_RIGHTMOST(oopaque)) {
	/* splitting a non-rightmost page, start at the first data item */
	start = P_FIRSTKEY;

	/* copy the original high key to the new page */
	itemid = PageGetItemId(origpage, P_HIKEY);
	itemsz = ItemIdGetLength(itemid);
	item = (BTItem) PageGetItem(origpage, itemid);
	(void) PageAddItem(rightpage, (Item) item, itemsz, P_HIKEY, LP_USED);
	rightoff = P_FIRSTKEY;
    } else {
	/* splitting a rightmost page, "high key" is the first data item */
	start = P_HIKEY;

	/* the new rightmost page will not have a high key */
	rightoff = P_HIKEY;
    }
    maxoff = PageGetMaxOffsetNumber(origpage);
    /* aim to fill the left half to roughly 50% */
    llimit = PageGetFreeSpace(leftpage) / 2;
    firstright = _bt_findsplitloc(rel, origpage, start, maxoff, llimit);

    /* distribute the original items: before firstright -> left half,
     * firstright and after -> right half */
    for (i = start; i <= maxoff; i = OffsetNumberNext(i)) {
	itemid = PageGetItemId(origpage, i);
	itemsz = ItemIdGetLength(itemid);
	item = (BTItem) PageGetItem(origpage, itemid);

	/* decide which page to put it on */
	if (i < firstright) {
	    (void) PageAddItem(leftpage, (Item) item, itemsz, leftoff,
			       LP_USED);
	    leftoff = OffsetNumberNext(leftoff);
	} else {
	    (void) PageAddItem(rightpage, (Item) item, itemsz, rightoff,
			       LP_USED);
	    rightoff = OffsetNumberNext(rightoff);
	}
    }

    /*
     * Okay, page has been split, high key on right page is correct.  Now
     * set the high key on the left page to be the min key on the right
     * page.
     */

    if (P_RIGHTMOST(ropaque)) {
	itemid = PageGetItemId(rightpage, P_HIKEY);
    } else {
	itemid = PageGetItemId(rightpage, P_FIRSTKEY);
    }
    itemsz = ItemIdGetLength(itemid);
    item = (BTItem) PageGetItem(rightpage, itemid);

    /*
     * We left a hole for the high key on the left page; fill it.  The
     * modal crap is to tell the page manager to put the new item on the
     * page and not screw around with anything else.  Whoever designed
     * this interface has presumably crawled back into the dung heap they
     * came from.  No one here will admit to it.
     */

    PageManagerModeSet(OverwritePageManagerMode);
    (void) PageAddItem(leftpage, (Item) item, itemsz, P_HIKEY, LP_USED);
    PageManagerModeSet(ShufflePageManagerMode);

    /*
     * By here, the original data page has been split into two new halves,
     * and these are correct.  The algorithm requires that the left page
     * never move during a split, so we copy the new left page back on top
     * of the original.  Note that this is not a waste of time, since we
     * also require (in the page management code) that the center of a
     * page always be clean, and the most efficient way to guarantee this
     * is just to compact the data by reinserting it into a new left page.
     */

    PageRestoreTempPage(leftpage, origpage);

    /* write these guys out (locks and pins are retained) */
    _bt_wrtnorelbuf(rel, rbuf);
    _bt_wrtnorelbuf(rel, buf);

    /*
     * Finally, we need to grab the right sibling (if any) and fix the
     * prev pointer there.  We are guaranteed that this is deadlock-free
     * since no other writer will be moving holding a lock on that page
     * and trying to move left, and all readers release locks on a page
     * before trying to fetch its neighbors.
     */

    if (! P_RIGHTMOST(ropaque)) {
	sbuf = _bt_getbuf(rel, ropaque->btpo_next, BT_WRITE);
	spage = BufferGetPage(sbuf);
	sopaque = (BTPageOpaque) PageGetSpecialPointer(spage);
	sopaque->btpo_prev = BufferGetBlockNumber(rbuf);

	/* write and release the old right sibling */
	_bt_wrtbuf(rel, sbuf);
    }

    /* split's done */
    return (rbuf);
}
|
||||
|
||||
/*
 *	_bt_findsplitloc() -- find a safe place to split a page.
 *
 *	In order to guarantee the proper handling of searches for duplicate
 *	keys, the first duplicate in the chain must either be the first
 *	item on the page after the split, or the entire chain must be on
 *	one of the two pages.  That is,
 *		[1 2 2 2 3 4 5]
 *	must become
 *		[1] [2 2 2 3 4 5]
 *	or
 *		[1 2 2 2] [3 4 5]
 *	but not
 *		[1 2 2] [2 3 4 5].
 *	However,
 *		[2 2 2 2 2 3 4]
 *	may be split as
 *		[2 2 2 2] [2 3 4].
 *
 *	Returns the offset of the first item that should go to the right
 *	half; scans items from 'start' accumulating their on-page size
 *	until 'llimit' bytes would be kept on the left.
 */
static OffsetNumber
_bt_findsplitloc(Relation rel,
		 Page page,
		 OffsetNumber start,
		 OffsetNumber maxoff,
		 Size llimit)
{
    OffsetNumber i;
    OffsetNumber saferight;
    ItemId nxtitemid, safeitemid;
    BTItem safeitem, nxtitem;
    IndexTuple safetup, nxttup;
    Size nbytes;
    TupleDesc itupdesc;
    int natts;
    int attno;
    Datum attsafe;
    Datum attnext;
    bool null;

    itupdesc = RelationGetTupleDescriptor(rel);
    natts = rel->rd_rel->relnatts;

    /* the first item is trivially a "safe" split point */
    saferight = start;
    safeitemid = PageGetItemId(page, saferight);
    nbytes = ItemIdGetLength(safeitemid) + sizeof(ItemIdData);
    safeitem = (BTItem) PageGetItem(page, safeitemid);
    safetup = &(safeitem->bti_itup);

    i = OffsetNumberNext(start);

    /*
     * NOTE(review): the loop is bounded only by the accumulated size
     * reaching llimit, not by maxoff; this relies on the page being
     * overfull (a split was required), so the total item size always
     * exceeds llimit before i passes maxoff -- confirm if reused.
     */
    while (nbytes < llimit) {

	/* check the next item on the page */
	nxtitemid = PageGetItemId(page, i);
	nbytes += (ItemIdGetLength(nxtitemid) + sizeof(ItemIdData));
	nxtitem = (BTItem) PageGetItem(page, nxtitemid);
	nxttup = &(nxtitem->bti_itup);

	/* test against last known safe item */
	for (attno = 1; attno <= natts; attno++) {
	    attsafe = index_getattr(safetup, attno, itupdesc, &null);
	    attnext = index_getattr(nxttup, attno, itupdesc, &null);

	    /*
	     * If the tuple we're looking at isn't equal to the last safe one
	     * we saw, then it's our new safe tuple.
	     */

	    if (!_bt_invokestrat(rel, attno, BTEqualStrategyNumber,
				 attsafe, attnext)) {
		safetup = nxttup;
		saferight = i;

		/* break is for the attno for loop */
		break;
	    }
	}
	i = OffsetNumberNext(i);
    }

    /*
     * If the chain of dups starts at the beginning of the page and extends
     * past the halfway mark, we can split it in the middle.
     */

    if (saferight == start)
	saferight = i;

    return (saferight);
}
|
||||
|
||||
/*
 *	_bt_newroot() -- Create a new root page for the index.
 *
 *	We've just split the old root page and need to create a new one.
 *	In order to do this, we add a new root page to the file, then lock
 *	the metadata page and update it.  This is guaranteed to be deadlock-
 *	free, because all readers release their locks on the metadata page
 *	before trying to lock the root, and all writers lock the root before
 *	trying to lock the metadata page.  We have a write lock on the old
 *	root page, so we have not introduced any cycles into the waits-for
 *	graph.
 *
 *	On entry, lbuf (the old root) and rbuf (its new peer) are write-
 *	locked.  We don't drop the locks in this routine; that's done by
 *	the caller.  On exit, a new root page exists with entries for the
 *	two new children.  The new root page is neither pinned nor locked.
 */
static void
_bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
{
    Buffer rootbuf;
    Page lpage, rpage, rootpage;
    BlockNumber lbkno, rbkno;
    BlockNumber rootbknum;
    BTPageOpaque rootopaque;
    ItemId itemid;
    BTItem item;
    Size itemsz;
    BTItem new_item;

    /* get a new root page */
    rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
    rootpage = BufferGetPage(rootbuf);
    _bt_pageinit(rootpage, BufferGetPageSize(rootbuf));

    /* set btree special data: the new root has no siblings */
    rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
    rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
    rootopaque->btpo_flags |= BTP_ROOT;

    /*
     * Insert the internal tuple pointers.
     */

    lbkno = BufferGetBlockNumber(lbuf);
    rbkno = BufferGetBlockNumber(rbuf);
    lpage = BufferGetPage(lbuf);
    rpage = BufferGetPage(rbuf);

    /*
     * step over the high key on the left page while building the
     * left page pointer.
     */
    itemid = PageGetItemId(lpage, P_FIRSTKEY);
    itemsz = ItemIdGetLength(itemid);
    item = (BTItem) PageGetItem(lpage, itemid);
    new_item = _bt_formitem(&(item->bti_itup));
    ItemPointerSet(&(new_item->bti_itup.t_tid), lbkno, P_FIRSTKEY);

    /*
     * insert the left page pointer into the new root page.  the root
     * page is the rightmost page on its level so the "high key" item
     * is the first data item.
     */
    (void) PageAddItem(rootpage, (Item) new_item, itemsz, P_HIKEY, LP_USED);
    pfree(new_item);

    /*
     * the right page is the rightmost page on the second level, so
     * the "high key" item is the first data item on that page as well.
     */
    itemid = PageGetItemId(rpage, P_HIKEY);
    itemsz = ItemIdGetLength(itemid);
    item = (BTItem) PageGetItem(rpage, itemid);
    new_item = _bt_formitem(&(item->bti_itup));
    ItemPointerSet(&(new_item->bti_itup.t_tid), rbkno, P_HIKEY);

    /*
     * insert the right page pointer into the new root page.
     */
    (void) PageAddItem(rootpage, (Item) new_item, itemsz, P_FIRSTKEY, LP_USED);
    pfree(new_item);

    /* write and let go of the root buffer */
    rootbknum = BufferGetBlockNumber(rootbuf);
    _bt_wrtbuf(rel, rootbuf);

    /* update metadata page with new root block number */
    _bt_metaproot(rel, rootbknum);
}
|
||||
|
||||
/*
 *	_bt_pgaddtup() -- add a tuple to a particular page in the index.
 *
 *	This routine adds the tuple to the page as requested, and keeps the
 *	write lock and reference associated with the page's buffer.  It is
 *	an error to call pgaddtup() without a write lock and reference.  If
 *	afteritem is non-null, it's the item that we expect our new item
 *	to follow.  Otherwise, we do a binary search for the correct place
 *	and insert the new item there.
 *
 *	Returns the offset at which the new item was placed.
 */
static OffsetNumber
_bt_pgaddtup(Relation rel,
	     Buffer buf,
	     int keysz,
	     ScanKey itup_scankey,
	     Size itemsize,
	     BTItem btitem,
	     BTItem afteritem)
{
    OffsetNumber itup_off;
    OffsetNumber first;
    Page page;
    BTPageOpaque opaque;
    BTItem chkitem;
    Oid afteroid;

    page = BufferGetPage(buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    /* first data item: rightmost pages have no high key at offset 0 */
    first = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;

    if (afteritem == (BTItem) NULL) {
	/* no position hint: binary-search for the insertion point */
	itup_off = _bt_binsrch(rel, buf, keysz, itup_scankey, BT_INSERTION);
    } else {
	/*
	 * Position hint: place the new item immediately after the item
	 * whose OID matches afteritem's.  NOTE(review): this scan assumes
	 * the afteroid item is present on this page; if it is not, the
	 * loop walks past the end of the line table -- confirm callers
	 * guarantee presence.
	 */
	afteroid = afteritem->bti_oid;
	itup_off = first;

	do {
	    chkitem =
		(BTItem) PageGetItem(page, PageGetItemId(page, itup_off));
	    itup_off = OffsetNumberNext(itup_off);
	} while (chkitem->bti_oid != afteroid);
    }

    (void) PageAddItem(page, (Item) btitem, itemsize, itup_off, LP_USED);

    /* write the buffer, but hold our lock */
    _bt_wrtnorelbuf(rel, buf);

    return (itup_off);
}
|
||||
|
||||
/*
|
||||
* _bt_goesonpg() -- Does a new tuple belong on this page?
|
||||
*
|
||||
* This is part of the complexity introduced by allowing duplicate
|
||||
* keys into the index. The tuple belongs on this page if:
|
||||
*
|
||||
* + there is no page to the right of this one; or
|
||||
* + it is less than the high key on the page; or
|
||||
* + the item it is to follow ("afteritem") appears on this
|
||||
* page.
|
||||
*/
|
||||
static bool
|
||||
_bt_goesonpg(Relation rel,
|
||||
Buffer buf,
|
||||
Size keysz,
|
||||
ScanKey scankey,
|
||||
BTItem afteritem)
|
||||
{
|
||||
Page page;
|
||||
ItemId hikey;
|
||||
BTPageOpaque opaque;
|
||||
BTItem chkitem;
|
||||
OffsetNumber offnum, maxoff;
|
||||
Oid afteroid;
|
||||
bool found;
|
||||
|
||||
page = BufferGetPage(buf);
|
||||
|
||||
/* no right neighbor? */
|
||||
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
||||
if (P_RIGHTMOST(opaque))
|
||||
return (true);
|
||||
|
||||
/*
|
||||
* this is a non-rightmost page, so it must have a high key item.
|
||||
*
|
||||
* If the scan key is < the high key (the min key on the next page),
|
||||
* then it for sure belongs here.
|
||||
*/
|
||||
hikey = PageGetItemId(page, P_HIKEY);
|
||||
if (_bt_skeycmp(rel, keysz, scankey, page, hikey, BTLessStrategyNumber))
|
||||
return (true);
|
||||
|
||||
/*
|
||||
* If the scan key is > the high key, then it for sure doesn't belong
|
||||
* here.
|
||||
*/
|
||||
|
||||
if (_bt_skeycmp(rel, keysz, scankey, page, hikey, BTGreaterStrategyNumber))
|
||||
return (false);
|
||||
|
||||
/*
|
||||
* If we have no adjacency information, and the item is equal to the
|
||||
* high key on the page (by here it is), then the item does not belong
|
||||
* on this page.
|
||||
*/
|
||||
|
||||
if (afteritem == (BTItem) NULL)
|
||||
return (false);
|
||||
|
||||
/* damn, have to work for it. i hate that. */
|
||||
afteroid = afteritem->bti_oid;
|
||||
maxoff = PageGetMaxOffsetNumber(page);
|
||||
|
||||
/*
|
||||
* Search the entire page for the afteroid. We need to do this, rather
|
||||
* than doing a binary search and starting from there, because if the
|
||||
* key we're searching for is the leftmost key in the tree at this
|
||||
* level, then a binary search will do the wrong thing. Splits are
|
||||
* pretty infrequent, so the cost isn't as bad as it could be.
|
||||
*/
|
||||
|
||||
found = false;
|
||||
for (offnum = P_FIRSTKEY;
|
||||
offnum <= maxoff;
|
||||
offnum = OffsetNumberNext(offnum)) {
|
||||
chkitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
|
||||
if (chkitem->bti_oid == afteroid) {
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return (found);
|
||||
}
|
||||
|
||||
/*
|
||||
* _bt_itemcmp() -- compare item1 to item2 using a requested
|
||||
* strategy (<, <=, =, >=, >)
|
||||
*
|
||||
*/
|
||||
bool
|
||||
_bt_itemcmp(Relation rel,
|
||||
Size keysz,
|
||||
BTItem item1,
|
||||
BTItem item2,
|
||||
StrategyNumber strat)
|
||||
{
|
||||
TupleDesc tupDes;
|
||||
IndexTuple indexTuple1, indexTuple2;
|
||||
Datum attrDatum1, attrDatum2;
|
||||
int i;
|
||||
bool isNull;
|
||||
bool compare;
|
||||
|
||||
tupDes = RelationGetTupleDescriptor(rel);
|
||||
indexTuple1 = &(item1->bti_itup);
|
||||
indexTuple2 = &(item2->bti_itup);
|
||||
|
||||
for (i = 1; i <= keysz; i++) {
|
||||
attrDatum1 = index_getattr(indexTuple1, i, tupDes, &isNull);
|
||||
attrDatum2 = index_getattr(indexTuple2, i, tupDes, &isNull);
|
||||
compare = _bt_invokestrat(rel, i, strat, attrDatum1, attrDatum2);
|
||||
if (!compare) {
|
||||
return (false);
|
||||
}
|
||||
}
|
||||
return (true);
|
||||
}
|
||||
|
||||
/*
 *	_bt_updateitem() -- updates the key of the item identified by the
 *			    oid with the key of newItem (done in place)
 *
 *	The caller must hold a write lock on buf.  Only the key data of
 *	the located item is replaced; its original item pointer (the
 *	downlink it carries) is preserved across the copy.
 *
 *	NOTE(review): the replacement is done in place via CopyIndexTuple,
 *	which appears to assume the new tuple is no larger than the old
 *	one -- confirm with callers before changing key representations.
 */
static void
_bt_updateitem(Relation rel,
	       Size keysz,
	       Buffer buf,
	       Oid bti_oid,
	       BTItem newItem)
{
    Page page;
    OffsetNumber maxoff;
    OffsetNumber i;
    ItemPointerData itemPtrData;
    BTItem item;
    IndexTuple oldIndexTuple, newIndexTuple;

    page = BufferGetPage(buf);
    maxoff = PageGetMaxOffsetNumber(page);

    /* locate item on the page by scanning for a matching OID */
    i = P_HIKEY;
    do {
	item = (BTItem) PageGetItem(page, PageGetItemId(page, i));
	i = OffsetNumberNext(i);
    } while (i <= maxoff && item->bti_oid != bti_oid);

    /* this should never happen (in theory) */
    if (item->bti_oid != bti_oid) {
	elog(FATAL, "_bt_getstackbuf was lying!!");
    }

    oldIndexTuple = &(item->bti_itup);
    newIndexTuple = &(newItem->bti_itup);

    /* keep the original item pointer; only the key bytes change */
    ItemPointerCopy(&(oldIndexTuple->t_tid), &itemPtrData);
    CopyIndexTuple(newIndexTuple, &oldIndexTuple);
    ItemPointerCopy(&itemPtrData, &(oldIndexTuple->t_tid));
}
|
523
src/backend/access/nbtree/nbtpage.c
Normal file
523
src/backend/access/nbtree/nbtpage.c
Normal file
@@ -0,0 +1,523 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
 *	nbtpage.c--
|
||||
* BTree-specific page management code for the Postgres btree access
|
||||
* method.
|
||||
*
|
||||
* Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
|
||||
*
|
||||
* NOTES
|
||||
* Postgres btree pages look like ordinary relation pages. The opaque
|
||||
* data at high addresses includes pointers to left and right siblings
|
||||
* and flag data describing page state. The first page in a btree, page
|
||||
* zero, is special -- it stores meta-information describing the tree.
|
||||
* Pages one and higher store the actual tree data.
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#include "postgres.h"
|
||||
|
||||
#include "storage/bufmgr.h"
|
||||
#include "storage/bufpage.h"
|
||||
|
||||
#include "utils/elog.h"
|
||||
#include "utils/rel.h"
|
||||
#include "utils/excid.h"
|
||||
|
||||
#include "access/genam.h"
|
||||
#include "access/nbtree.h"
|
||||
|
||||
/* block number of the metadata page; always block zero of the index */
#define BTREE_METAPAGE	0
/* magic number identifying a heap page as a btree metapage */
#define BTREE_MAGIC	0x053162
/* on-disk format version stamped into (and checked against) the metapage */
#define BTREE_VERSION	0

/*
 * Contents of the btree metapage.  Because Lehman & Yao root splits
 * move the root page around, its current location is recorded here.
 */
typedef struct BTMetaPageData {
    uint32	btm_magic;	/* should contain BTREE_MAGIC */
    uint32	btm_version;	/* should contain BTREE_VERSION */
    BlockNumber btm_root;	/* current root block, or P_NONE if none */
} BTMetaPageData;

/* the metadata lives where an ordinary page keeps its line pointers */
#define BTPageGetMeta(p) \
    ((BTMetaPageData *) &((PageHeader) p)->pd_linp[0])

extern bool BuildingBtree;	/* set in nbtree.c while btbuild() runs */

/*
 *  We use high-concurrency locking on btrees.  There are two cases in
 *  which we don't do locking.  One is when we're building the btree.
 *  Since the creating transaction has not committed, no one can see
 *  the index, and there's no reason to share locks.  The second case
 *  is when we're just starting up the database system.  We use some
 *  special-purpose initialization code in the relation cache manager
 *  (see utils/cache/relcache.c) to allow us to do indexed scans on
 *  the system catalogs before we'd normally be able to.  This happens
 *  before the lock table is fully initialized, so we can't use it.
 *  Strictly speaking, this violates 2pl, but we don't do 2pl on the
 *  system catalogs anyway, so I declare this to be okay.
 */

#define USELOCKING	(!BuildingBtree && !IsInitProcessingMode())
|
||||
|
||||
/*
|
||||
* _bt_metapinit() -- Initialize the metadata page of a btree.
|
||||
*/
|
||||
void
|
||||
_bt_metapinit(Relation rel)
|
||||
{
|
||||
Buffer buf;
|
||||
Page pg;
|
||||
int nblocks;
|
||||
BTMetaPageData metad;
|
||||
BTPageOpaque op;
|
||||
|
||||
/* can't be sharing this with anyone, now... */
|
||||
if (USELOCKING)
|
||||
RelationSetLockForWrite(rel);
|
||||
|
||||
if ((nblocks = RelationGetNumberOfBlocks(rel)) != 0) {
|
||||
elog(WARN, "Cannot initialize non-empty btree %s",
|
||||
RelationGetRelationName(rel));
|
||||
}
|
||||
|
||||
buf = ReadBuffer(rel, P_NEW);
|
||||
pg = BufferGetPage(buf);
|
||||
_bt_pageinit(pg, BufferGetPageSize(buf));
|
||||
|
||||
metad.btm_magic = BTREE_MAGIC;
|
||||
metad.btm_version = BTREE_VERSION;
|
||||
metad.btm_root = P_NONE;
|
||||
memmove((char *) BTPageGetMeta(pg), (char *) &metad, sizeof(metad));
|
||||
|
||||
op = (BTPageOpaque) PageGetSpecialPointer(pg);
|
||||
op->btpo_flags = BTP_META;
|
||||
|
||||
WriteBuffer(buf);
|
||||
|
||||
/* all done */
|
||||
if (USELOCKING)
|
||||
RelationUnsetLockForWrite(rel);
|
||||
}
|
||||
|
||||
/*
|
||||
* _bt_checkmeta() -- Verify that the metadata stored in a btree are
|
||||
* reasonable.
|
||||
*/
|
||||
void
|
||||
_bt_checkmeta(Relation rel)
|
||||
{
|
||||
Buffer metabuf;
|
||||
Page metap;
|
||||
BTMetaPageData *metad;
|
||||
BTPageOpaque op;
|
||||
int nblocks;
|
||||
|
||||
/* if the relation is empty, this is init time; don't complain */
|
||||
if ((nblocks = RelationGetNumberOfBlocks(rel)) == 0)
|
||||
return;
|
||||
|
||||
metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
|
||||
metap = BufferGetPage(metabuf);
|
||||
op = (BTPageOpaque) PageGetSpecialPointer(metap);
|
||||
if (!(op->btpo_flags & BTP_META)) {
|
||||
elog(WARN, "Invalid metapage for index %s",
|
||||
RelationGetRelationName(rel));
|
||||
}
|
||||
metad = BTPageGetMeta(metap);
|
||||
|
||||
if (metad->btm_magic != BTREE_MAGIC) {
|
||||
elog(WARN, "Index %s is not a btree",
|
||||
RelationGetRelationName(rel));
|
||||
}
|
||||
|
||||
if (metad->btm_version != BTREE_VERSION) {
|
||||
elog(WARN, "Version mismatch on %s: version %d file, version %d code",
|
||||
RelationGetRelationName(rel),
|
||||
metad->btm_version, BTREE_VERSION);
|
||||
}
|
||||
|
||||
_bt_relbuf(rel, metabuf, BT_READ);
|
||||
}
|
||||
|
||||
/*
 *	_bt_getroot() -- Get the root page of the btree.
 *
 *	Since the root page can move around the btree file, we have to read
 *	its location from the metadata page, and then read the root page
 *	itself.  If no root page exists yet, we have to create one.  The
 *	standard class of race conditions exists here; I think I covered
 *	them all in the Hopi Indian rain dance of lock requests below.
 *
 *	We pass in the access type (BT_READ or BT_WRITE), and return the
 *	root page's buffer with the appropriate lock type set.  Reference
 *	count on the root page gets bumped by ReadBuffer.  The metadata
 *	page is unlocked and unreferenced by this process when this routine
 *	returns.
 */
Buffer
_bt_getroot(Relation rel, int access)
{
    Buffer metabuf;
    Page metapg;
    BTPageOpaque metaopaque;
    Buffer rootbuf;
    Page rootpg;
    BTPageOpaque rootopaque;
    BlockNumber rootblkno;
    BTMetaPageData *metad;

    /* read-lock the metapage and find the current root pointer */
    metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
    metapg = BufferGetPage(metabuf);
    metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
    Assert(metaopaque->btpo_flags & BTP_META);
    metad = BTPageGetMeta(metapg);

    /* if no root page initialized yet, do it */
    if (metad->btm_root == P_NONE) {

	/* turn our read lock in for a write lock */
	_bt_relbuf(rel, metabuf, BT_READ);
	metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
	metapg = BufferGetPage(metabuf);
	metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
	Assert(metaopaque->btpo_flags & BTP_META);
	metad = BTPageGetMeta(metapg);

	/*
	 * Race condition:  if someone else initialized the metadata between
	 * the time we released the read lock and acquired the write lock,
	 * above, we want to avoid doing it again.
	 */

	if (metad->btm_root == P_NONE) {

	    /*
	     * Get, initialize, write, and leave a lock of the appropriate
	     * type on the new root page.  Since this is the first page in
	     * the tree, it's a leaf.
	     */

	    rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
	    rootblkno = BufferGetBlockNumber(rootbuf);
	    rootpg = BufferGetPage(rootbuf);
	    metad->btm_root = rootblkno;
	    _bt_pageinit(rootpg, BufferGetPageSize(rootbuf));
	    rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpg);
	    rootopaque->btpo_flags |= (BTP_LEAF | BTP_ROOT);
	    /* write the new root but keep our pin and write lock on it */
	    _bt_wrtnorelbuf(rel, rootbuf);

	    /* swap write lock for read lock, if appropriate */
	    if (access != BT_WRITE) {
		/* acquire the read lock before dropping the write lock */
		_bt_setpagelock(rel, rootblkno, BT_READ);
		_bt_unsetpagelock(rel, rootblkno, BT_WRITE);
	    }

	    /* okay, metadata is correct */
	    _bt_wrtbuf(rel, metabuf);
	} else {

	    /*
	     * Metadata initialized by someone else.  In order to guarantee
	     * no deadlocks, we have to release the metadata page and start
	     * all over again.
	     */

	    _bt_relbuf(rel, metabuf, BT_WRITE);
	    return (_bt_getroot(rel, access));
	}
    } else {
	/* a root exists; lock it with the caller's requested access */
	rootbuf = _bt_getbuf(rel, metad->btm_root, access);

	/* done with the meta page */
	_bt_relbuf(rel, metabuf, BT_READ);
    }

    /*
     * Race condition:  If the root page split between the time we looked
     * at the metadata page and got the root buffer, then we got the wrong
     * buffer.
     */

    rootpg = BufferGetPage(rootbuf);
    rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpg);
    if (!(rootopaque->btpo_flags & BTP_ROOT)) {

	/* it happened, try again */
	_bt_relbuf(rel, rootbuf, access);
	return (_bt_getroot(rel, access));
    }

    /*
     * By here, we have a correct lock on the root block, its reference
     * count is correct, and we have no lock set on the metadata page.
     * Return the root block.
     */

    return (rootbuf);
}
|
||||
|
||||
/*
|
||||
* _bt_getbuf() -- Get a buffer by block number for read or write.
|
||||
*
|
||||
* When this routine returns, the appropriate lock is set on the
|
||||
* requested buffer its reference count is correct.
|
||||
*/
|
||||
Buffer
|
||||
_bt_getbuf(Relation rel, BlockNumber blkno, int access)
|
||||
{
|
||||
Buffer buf;
|
||||
Page page;
|
||||
|
||||
/*
|
||||
* If we want a new block, we can't set a lock of the appropriate type
|
||||
* until we've instantiated the buffer.
|
||||
*/
|
||||
|
||||
if (blkno != P_NEW) {
|
||||
if (access == BT_WRITE)
|
||||
_bt_setpagelock(rel, blkno, BT_WRITE);
|
||||
else
|
||||
_bt_setpagelock(rel, blkno, BT_READ);
|
||||
|
||||
buf = ReadBuffer(rel, blkno);
|
||||
} else {
|
||||
buf = ReadBuffer(rel, blkno);
|
||||
blkno = BufferGetBlockNumber(buf);
|
||||
page = BufferGetPage(buf);
|
||||
_bt_pageinit(page, BufferGetPageSize(buf));
|
||||
|
||||
if (access == BT_WRITE)
|
||||
_bt_setpagelock(rel, blkno, BT_WRITE);
|
||||
else
|
||||
_bt_setpagelock(rel, blkno, BT_READ);
|
||||
}
|
||||
|
||||
/* ref count and lock type are correct */
|
||||
return (buf);
|
||||
}
|
||||
|
||||
/*
|
||||
* _bt_relbuf() -- release a locked buffer.
|
||||
*/
|
||||
void
|
||||
_bt_relbuf(Relation rel, Buffer buf, int access)
|
||||
{
|
||||
BlockNumber blkno;
|
||||
|
||||
blkno = BufferGetBlockNumber(buf);
|
||||
|
||||
/* access had better be one of read or write */
|
||||
if (access == BT_WRITE)
|
||||
_bt_unsetpagelock(rel, blkno, BT_WRITE);
|
||||
else
|
||||
_bt_unsetpagelock(rel, blkno, BT_READ);
|
||||
|
||||
ReleaseBuffer(buf);
|
||||
}
|
||||
|
||||
/*
|
||||
* _bt_wrtbuf() -- write a btree page to disk.
|
||||
*
|
||||
* This routine releases the lock held on the buffer and our reference
|
||||
* to it. It is an error to call _bt_wrtbuf() without a write lock
|
||||
* or a reference to the buffer.
|
||||
*/
|
||||
void
|
||||
_bt_wrtbuf(Relation rel, Buffer buf)
|
||||
{
|
||||
BlockNumber blkno;
|
||||
|
||||
blkno = BufferGetBlockNumber(buf);
|
||||
WriteBuffer(buf);
|
||||
_bt_unsetpagelock(rel, blkno, BT_WRITE);
|
||||
}
|
||||
|
||||
/*
|
||||
* _bt_wrtnorelbuf() -- write a btree page to disk, but do not release
|
||||
* our reference or lock.
|
||||
*
|
||||
* It is an error to call _bt_wrtnorelbuf() without a write lock
|
||||
* or a reference to the buffer.
|
||||
*/
|
||||
void
|
||||
_bt_wrtnorelbuf(Relation rel, Buffer buf)
|
||||
{
|
||||
BlockNumber blkno;
|
||||
|
||||
blkno = BufferGetBlockNumber(buf);
|
||||
WriteNoReleaseBuffer(buf);
|
||||
}
|
||||
|
||||
/*
 *	_bt_pageinit() -- Initialize a new page.
 *
 *	Zeroes the full page and then lays out the standard page header
 *	plus room for the btree-specific opaque data at the high end.
 */
void
_bt_pageinit(Page page, Size size)
{
    /*
     * Cargo-cult programming -- don't really need this to be zero, but
     * creating new pages is an infrequent occurrence and it makes me feel
     * good when I know they're empty.
     */

    memset(page, 0, size);

    /* reserve special space for BTPageOpaqueData (sibling links, flags) */
    PageInit(page, size, sizeof(BTPageOpaqueData));
}
|
||||
|
||||
/*
 *	_bt_metaproot() -- Change the root page of the btree.
 *
 *	Lehman and Yao require that the root page move around in order to
 *	guarantee deadlock-free short-term, fine-granularity locking.  When
 *	we split the root page, we record the new parent in the metadata page
 *	for the relation.  This routine does the work.
 *
 *	No direct preconditions, but if you don't have the a write lock on
 *	at least the old root page when you call this, you're making a big
 *	mistake.  On exit, metapage data is correct and we no longer have
 *	a reference to or lock on the metapage.
 */
void
_bt_metaproot(Relation rel, BlockNumber rootbknum)
{
    Buffer metabuf;
    Page metap;
    BTPageOpaque metaopaque;
    BTMetaPageData *metad;

    /* write-lock the metapage, stamp in the new root, write it back */
    metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
    metap = BufferGetPage(metabuf);
    metaopaque = (BTPageOpaque) PageGetSpecialPointer(metap);
    Assert(metaopaque->btpo_flags & BTP_META);
    metad = BTPageGetMeta(metap);
    metad->btm_root = rootbknum;
    /* _bt_wrtbuf releases both our lock and our reference */
    _bt_wrtbuf(rel, metabuf);
}
|
||||
|
||||
/*
 *	_bt_getstackbuf() -- Walk back up the tree one step, and find the item
 *			     we last looked at in the parent.
 *
 *	This is possible because we save a bit image of the last item
 *	we looked at in the parent, and the update algorithm guarantees
 *	that if items above us in the tree move, they only move right.
 *
 *	Returns the buffer holding the page on which the saved item now
 *	lives, locked with the requested access type.  If the item cannot
 *	be found before running off the rightmost page of the level, we
 *	die with elog(FATAL) -- the L&Y invariant says that cannot happen.
 */
Buffer
_bt_getstackbuf(Relation rel, BTStack stack, int access)
{
    Buffer buf;
    BlockNumber blkno;
    OffsetNumber start, offnum, maxoff;
    OffsetNumber i;
    Page page;
    ItemId itemid;
    BTItem item;
    BTPageOpaque opaque;

    /* start on the page where we last saw the item */
    blkno = stack->bts_blkno;
    buf = _bt_getbuf(rel, blkno, access);
    page = BufferGetPage(buf);
    opaque = (BTPageOpaque) PageGetSpecialPointer(page);
    maxoff = PageGetMaxOffsetNumber(page);

    if (maxoff >= stack->bts_offset) {
	itemid = PageGetItemId(page, stack->bts_offset);
	item = (BTItem) PageGetItem(page, itemid);

	/* if the item is where we left it, we're done */
	if (item->bti_oid == stack->bts_btitem->bti_oid)
	    return (buf);

	/* if the item has just moved right on this page, we're done */
	for (i = OffsetNumberNext(stack->bts_offset);
	     i <= maxoff;
	     i = OffsetNumberNext(i)) {
	    itemid = PageGetItemId(page, i);
	    item = (BTItem) PageGetItem(page, itemid);

	    /* if the item is where we left it, we're done */
	    if (item->bti_oid == stack->bts_btitem->bti_oid)
		return (buf);
	}
    }

    /* by here, the item we're looking for moved right at least one page */
    for (;;) {
	blkno = opaque->btpo_next;
	if (P_RIGHTMOST(opaque))
	    elog(FATAL, "my bits moved right off the end of the world!");

	/* release this page and move right to its sibling */
	_bt_relbuf(rel, buf, access);
	buf = _bt_getbuf(rel, blkno, access);
	page = BufferGetPage(buf);
	maxoff = PageGetMaxOffsetNumber(page);
	opaque = (BTPageOpaque) PageGetSpecialPointer(page);

	/*
	 * Non-rightmost pages keep their high key at offset P_HIKEY, so
	 * real data starts at P_FIRSTKEY; a rightmost page has no high
	 * key and its data starts at P_HIKEY.
	 */
	start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;

	/* see if it's on this page */
	for (offnum = start;
	     offnum <= maxoff;
	     offnum = OffsetNumberNext(offnum)) {
	    itemid = PageGetItemId(page, offnum);
	    item = (BTItem) PageGetItem(page, itemid);
	    if (item->bti_oid == stack->bts_btitem->bti_oid)
		return (buf);
	}
    }
}
|
||||
|
||||
void
|
||||
_bt_setpagelock(Relation rel, BlockNumber blkno, int access)
|
||||
{
|
||||
ItemPointerData iptr;
|
||||
|
||||
if (USELOCKING) {
|
||||
ItemPointerSet(&iptr, blkno, P_HIKEY);
|
||||
|
||||
if (access == BT_WRITE)
|
||||
RelationSetSingleWLockPage(rel, &iptr);
|
||||
else
|
||||
RelationSetSingleRLockPage(rel, &iptr);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
_bt_unsetpagelock(Relation rel, BlockNumber blkno, int access)
|
||||
{
|
||||
ItemPointerData iptr;
|
||||
|
||||
if (USELOCKING) {
|
||||
ItemPointerSet(&iptr, blkno, P_HIKEY);
|
||||
|
||||
if (access == BT_WRITE)
|
||||
RelationUnsetSingleWLockPage(rel, &iptr);
|
||||
else
|
||||
RelationUnsetSingleRLockPage(rel, &iptr);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
_bt_pagedel(Relation rel, ItemPointer tid)
|
||||
{
|
||||
Buffer buf;
|
||||
Page page;
|
||||
BlockNumber blkno;
|
||||
OffsetNumber offno;
|
||||
|
||||
blkno = ItemPointerGetBlockNumber(tid);
|
||||
offno = ItemPointerGetOffsetNumber(tid);
|
||||
|
||||
buf = _bt_getbuf(rel, blkno, BT_WRITE);
|
||||
page = BufferGetPage(buf);
|
||||
|
||||
PageIndexTupleDelete(page, offno);
|
||||
|
||||
/* write the buffer and release the lock */
|
||||
_bt_wrtbuf(rel, buf);
|
||||
}
|
516
src/backend/access/nbtree/nbtree.c
Normal file
516
src/backend/access/nbtree/nbtree.c
Normal file
@@ -0,0 +1,516 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
 *	nbtree.c--
|
||||
* Implementation of Lehman and Yao's btree management algorithm for
|
||||
* Postgres.
|
||||
*
|
||||
* Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
|
||||
*
|
||||
* NOTES
|
||||
* This file contains only the public interface routines.
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#include "postgres.h"
|
||||
|
||||
#include "storage/bufmgr.h"
|
||||
#include "storage/bufpage.h"
|
||||
|
||||
#include "utils/elog.h"
|
||||
#include "utils/palloc.h"
|
||||
#include "utils/rel.h"
|
||||
#include "utils/excid.h"
|
||||
|
||||
#include "access/heapam.h"
|
||||
#include "access/genam.h"
|
||||
#include "access/sdir.h"
|
||||
#include "access/nbtree.h"
|
||||
#include "access/funcindex.h"
|
||||
|
||||
#include "nodes/execnodes.h"
|
||||
#include "nodes/plannodes.h"
|
||||
|
||||
#include "executor/executor.h"
|
||||
#include "executor/tuptable.h"
|
||||
|
||||
#include "catalog/index.h"
|
||||
|
||||
/* true while btbuild() runs; checked by USELOCKING to skip page locks */
bool	BuildingBtree = false;
/* enables the bottom-up (spool/sort) build path in btbuild() */
bool	FastBuild = false; /* turn this on to make bulk builds work */
|
||||
|
||||
/*
 *	btbuild() -- build a new btree index.
 *
 *	We use a global variable to record the fact that we're creating
 *	a new index.  This is used to avoid high-concurrency locking,
 *	since the index won't be visible until this transaction commits
 *	and since building is guaranteed to be single-threaded.
 *
 *	Scans every heap tuple, forms an index tuple for each one that
 *	passes the (optional) partial-index predicate and has a non-null
 *	key, and either inserts it directly (_bt_doinsert) or spools it
 *	for a bottom-up build (FastBuild path).  Afterwards, pg_class
 *	statistics for both relations are refreshed.
 */
void
btbuild(Relation heap,
	Relation index,
	int natts,
	AttrNumber *attnum,
	IndexStrategy istrat,
	uint16 pcount,
	Datum *params,
	FuncIndexInfo *finfo,
	PredInfo *predInfo)
{
    HeapScanDesc hscan;
    Buffer buffer;
    HeapTuple htup;
    IndexTuple itup;
    TupleDesc htupdesc, itupdesc;
    Datum *attdata;
    bool *nulls;
    InsertIndexResult res;
    int nhtups, nitups;
    int i;
    BTItem btitem;
    ExprContext *econtext;
    TupleTable tupleTable;
    TupleTableSlot *slot;
    Oid hrelid, irelid;
    Node *pred, *oldPred;
    void *spool;	/* opaque spool handle; only used when FastBuild */

    /* note that this is a new btree */
    BuildingBtree = true;

    pred = predInfo->pred;
    oldPred = predInfo->oldPred;

    /* initialize the btree index metadata page (if this is a new index) */
    if (oldPred == NULL)
	_bt_metapinit(index);

    /* get tuple descriptors for heap and index relations */
    htupdesc = RelationGetTupleDescriptor(heap);
    itupdesc = RelationGetTupleDescriptor(index);

    /* get space for data items that'll appear in the index tuple */
    attdata = (Datum *) palloc(natts * sizeof(Datum));
    nulls = (bool *) palloc(natts * sizeof(bool));

    /*
     * If this is a predicate (partial) index, we will need to evaluate the
     * predicate using ExecQual, which requires the current tuple to be in a
     * slot of a TupleTable.  In addition, ExecQual must have an ExprContext
     * referring to that slot.  Here, we initialize dummy TupleTable and
     * ExprContext objects for this purpose. --Nels, Feb '92
     */
#ifndef OMIT_PARTIAL_INDEX
    if (pred != NULL || oldPred != NULL) {
	tupleTable = ExecCreateTupleTable(1);
	slot = ExecAllocTableSlot(tupleTable);
	econtext = makeNode(ExprContext);
	FillDummyExprContext(econtext, slot, htupdesc, InvalidBuffer);
    }
#endif /* OMIT_PARTIAL_INDEX */

    /* start a heap scan */
    hscan = heap_beginscan(heap, 0, NowTimeQual, 0, (ScanKey) NULL);
    htup = heap_getnext(hscan, 0, &buffer);

    /* build the index */
    nhtups = nitups = 0;

    if (FastBuild) {
	/* 7 is the spool's page count; res stays NULL on this path */
	spool = _bt_spoolinit(index, 7);
	res = (InsertIndexResult) NULL;
    }

    for (; HeapTupleIsValid(htup); htup = heap_getnext(hscan, 0, &buffer)) {

	nhtups++;

	/*
	 * If oldPred != NULL, this is an EXTEND INDEX command, so skip
	 * this tuple if it was already in the existing partial index
	 * (it still counts toward nitups, since it is in the index).
	 */
	if (oldPred != NULL) {
#ifndef OMIT_PARTIAL_INDEX

	    /*SetSlotContents(slot, htup);*/
	    slot->val = htup;
	    if (ExecQual((List*)oldPred, econtext) == true) {
		nitups++;
		continue;
	    }
#endif /* OMIT_PARTIAL_INDEX */
	}

	/* Skip this tuple if it doesn't satisfy the partial-index predicate */
	if (pred != NULL) {
#ifndef OMIT_PARTIAL_INDEX
	    /* SetSlotContents(slot, htup); */
	    slot->val = htup;
	    if (ExecQual((List*)pred, econtext) == false)
		continue;
#endif /* OMIT_PARTIAL_INDEX */
	}

	nitups++;

	/*
	 * For the current heap tuple, extract all the attributes
	 * we use in this index, and note which are null.
	 */

	for (i = 1; i <= natts; i++) {
	    int attoff;
	    bool attnull;

	    /*
	     * Offsets are from the start of the tuple, and are
	     * zero-based; indices are one-based.  The next call
	     * returns i - 1.  That's data hiding for you.
	     */

	    attoff = AttrNumberGetAttrOffset(i);
	    attdata[attoff] = GetIndexValue(htup,
					    htupdesc,
					    attoff,
					    attnum,
					    finfo,
					    &attnull,
					    buffer);
	    /* index_formtuple expects 'n' / ' ' markers, not booleans */
	    nulls[attoff] = (attnull ? 'n' : ' ');
	}

	/* form an index tuple and point it at the heap tuple */
	itup = index_formtuple(itupdesc, attdata, nulls);

	/*
	 * If the single index key is null, we don't insert it into
	 * the index.  Btrees support scans on <, <=, =, >=, and >.
	 * Relational algebra says that A op B (where op is one of the
	 * operators above) returns null if either A or B is null.  This
	 * means that no qualification used in an index scan could ever
	 * return true on a null attribute.  It also means that indices
	 * can't be used by ISNULL or NOTNULL scans, but that's an
	 * artifact of the strategy map architecture chosen in 1986, not
	 * of the way nulls are handled here.
	 */

	if (itup->t_info & INDEX_NULL_MASK) {
	    pfree(itup);
	    continue;
	}

	itup->t_tid = htup->t_ctid;
	btitem = _bt_formitem(itup);

	/*
	 * if we are doing bottom-up btree build, we insert the index
	 * into a spool page for subsequent processing.  otherwise, we
	 * insert into the btree.
	 */
	if (FastBuild) {
	    _bt_spool(index, btitem, spool);
	} else {
	    res = _bt_doinsert(index, btitem);
	}

	pfree(btitem);
	pfree(itup);
	/* res is NULL on the FastBuild path, set by _bt_doinsert otherwise */
	if (res) {
	    pfree(res);
	}
    }

    /* okay, all heap tuples are indexed */
    heap_endscan(hscan);

    if (pred != NULL || oldPred != NULL) {
#ifndef OMIT_PARTIAL_INDEX
	ExecDestroyTupleTable(tupleTable, true);
	pfree(econtext);
#endif /* OMIT_PARTIAL_INDEX */
    }

    /*
     * if we are doing bottom-up btree build, we now have a bunch of
     * sorted runs in the spool pages.  finish the build by (1)
     * merging the runs, (2) inserting the sorted tuples into btree
     * pages and (3) building the upper levels.
     */
    if (FastBuild) {
	_bt_spool(index, (BTItem) NULL, spool);	/* flush spool */
	_bt_leafbuild(index, spool);
	_bt_spooldestroy(spool);
    }

    /*
     * Since we just counted the tuples in the heap, we update its
     * stats in pg_class to guarantee that the planner takes advantage
     * of the index we just created.  Finally, only update statistics
     * during normal index definitions, not for indices on system catalogs
     * created during bootstrap processing.  We must close the relations
     * before updatings statistics to guarantee that the relcache entries
     * are flushed when we increment the command counter in UpdateStats().
     */
    if (IsNormalProcessingMode())
    {
	hrelid = heap->rd_id;
	irelid = index->rd_id;
	heap_close(heap);
	index_close(index);
	UpdateStats(hrelid, nhtups, true);
	UpdateStats(irelid, nitups, false);
	if (oldPred != NULL) {
	    /* a full-coverage extension makes the index unconditional */
	    if (nitups == nhtups) pred = NULL;
	    UpdateIndexPredicate(irelid, oldPred, pred);
	}
    }

    /* be tidy */
    pfree(nulls);
    pfree(attdata);

    /* all done */
    BuildingBtree = false;
}
|
||||
|
||||
/*
|
||||
* btinsert() -- insert an index tuple into a btree.
|
||||
*
|
||||
* Descend the tree recursively, find the appropriate location for our
|
||||
* new tuple, put it there, set its unique OID as appropriate, and
|
||||
* return an InsertIndexResult to the caller.
|
||||
*/
|
||||
InsertIndexResult
|
||||
btinsert(Relation rel, IndexTuple itup)
|
||||
{
|
||||
BTItem btitem;
|
||||
InsertIndexResult res;
|
||||
|
||||
if (itup->t_info & INDEX_NULL_MASK)
|
||||
return ((InsertIndexResult) NULL);
|
||||
|
||||
btitem = _bt_formitem(itup);
|
||||
|
||||
res = _bt_doinsert(rel, btitem);
|
||||
pfree(btitem);
|
||||
|
||||
return (res);
|
||||
}
|
||||
|
||||
/*
|
||||
* btgettuple() -- Get the next tuple in the scan.
|
||||
*/
|
||||
char *
|
||||
btgettuple(IndexScanDesc scan, ScanDirection dir)
|
||||
{
|
||||
RetrieveIndexResult res;
|
||||
|
||||
/*
|
||||
* If we've already initialized this scan, we can just advance it
|
||||
* in the appropriate direction. If we haven't done so yet, we
|
||||
* call a routine to get the first item in the scan.
|
||||
*/
|
||||
|
||||
if (ItemPointerIsValid(&(scan->currentItemData)))
|
||||
res = _bt_next(scan, dir);
|
||||
else
|
||||
res = _bt_first(scan, dir);
|
||||
|
||||
return ((char *) res);
|
||||
}
|
||||
|
||||
/*
|
||||
* btbeginscan() -- start a scan on a btree index
|
||||
*/
|
||||
char *
|
||||
btbeginscan(Relation rel, bool fromEnd, uint16 keysz, ScanKey scankey)
|
||||
{
|
||||
IndexScanDesc scan;
|
||||
StrategyNumber strat;
|
||||
BTScanOpaque so;
|
||||
|
||||
/* first order the keys in the qualification */
|
||||
if (keysz > 1)
|
||||
_bt_orderkeys(rel, &keysz, scankey);
|
||||
|
||||
/* now get the scan */
|
||||
scan = RelationGetIndexScan(rel, fromEnd, keysz, scankey);
|
||||
so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
|
||||
so->btso_curbuf = so->btso_mrkbuf = InvalidBuffer;
|
||||
scan->opaque = so;
|
||||
|
||||
/* finally, be sure that the scan exploits the tree order */
|
||||
scan->scanFromEnd = false;
|
||||
scan->flags = 0x0;
|
||||
if (keysz > 0) {
|
||||
strat = _bt_getstrat(scan->relation, 1 /* XXX */,
|
||||
scankey[0].sk_procedure);
|
||||
|
||||
if (strat == BTLessStrategyNumber
|
||||
|| strat == BTLessEqualStrategyNumber)
|
||||
scan->scanFromEnd = true;
|
||||
} else {
|
||||
scan->scanFromEnd = true;
|
||||
}
|
||||
|
||||
/* register scan in case we change pages it's using */
|
||||
_bt_regscan(scan);
|
||||
|
||||
return ((char *) scan);
|
||||
}
|
||||
|
||||
/*
|
||||
* btrescan() -- rescan an index relation
|
||||
*/
|
||||
void
|
||||
btrescan(IndexScanDesc scan, bool fromEnd, ScanKey scankey)
|
||||
{
|
||||
ItemPointer iptr;
|
||||
BTScanOpaque so;
|
||||
|
||||
so = (BTScanOpaque) scan->opaque;
|
||||
|
||||
/* we hold a read lock on the current page in the scan */
|
||||
if (ItemPointerIsValid(iptr = &(scan->currentItemData))) {
|
||||
_bt_relbuf(scan->relation, so->btso_curbuf, BT_READ);
|
||||
so->btso_curbuf = InvalidBuffer;
|
||||
ItemPointerSetInvalid(iptr);
|
||||
}
|
||||
|
||||
/* and we hold a read lock on the last marked item in the scan */
|
||||
if (ItemPointerIsValid(iptr = &(scan->currentMarkData))) {
|
||||
_bt_relbuf(scan->relation, so->btso_mrkbuf, BT_READ);
|
||||
so->btso_mrkbuf = InvalidBuffer;
|
||||
ItemPointerSetInvalid(iptr);
|
||||
}
|
||||
|
||||
/* reset the scan key */
|
||||
if (scan->numberOfKeys > 0) {
|
||||
memmove(scan->keyData,
|
||||
scankey,
|
||||
scan->numberOfKeys * sizeof(ScanKeyData));
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
btmovescan(IndexScanDesc scan, Datum v)
|
||||
{
|
||||
ItemPointer iptr;
|
||||
BTScanOpaque so;
|
||||
|
||||
so = (BTScanOpaque) scan->opaque;
|
||||
|
||||
/* release any locks we still hold */
|
||||
if (ItemPointerIsValid(iptr = &(scan->currentItemData))) {
|
||||
_bt_relbuf(scan->relation, so->btso_curbuf, BT_READ);
|
||||
so->btso_curbuf = InvalidBuffer;
|
||||
ItemPointerSetInvalid(iptr);
|
||||
}
|
||||
|
||||
scan->keyData[0].sk_argument = v;
|
||||
}
|
||||
|
||||
/*
|
||||
* btendscan() -- close down a scan
|
||||
*/
|
||||
void
|
||||
btendscan(IndexScanDesc scan)
|
||||
{
|
||||
ItemPointer iptr;
|
||||
BTScanOpaque so;
|
||||
|
||||
so = (BTScanOpaque) scan->opaque;
|
||||
|
||||
/* release any locks we still hold */
|
||||
if (ItemPointerIsValid(iptr = &(scan->currentItemData))) {
|
||||
if (BufferIsValid(so->btso_curbuf))
|
||||
_bt_relbuf(scan->relation, so->btso_curbuf, BT_READ);
|
||||
so->btso_curbuf = InvalidBuffer;
|
||||
ItemPointerSetInvalid(iptr);
|
||||
}
|
||||
|
||||
if (ItemPointerIsValid(iptr = &(scan->currentMarkData))) {
|
||||
if (BufferIsValid(so->btso_mrkbuf))
|
||||
_bt_relbuf(scan->relation, so->btso_mrkbuf, BT_READ);
|
||||
so->btso_mrkbuf = InvalidBuffer;
|
||||
ItemPointerSetInvalid(iptr);
|
||||
}
|
||||
|
||||
/* don't need scan registered anymore */
|
||||
_bt_dropscan(scan);
|
||||
|
||||
/* be tidy */
|
||||
#ifdef PERFECT_MMGR
|
||||
pfree (scan->opaque);
|
||||
#endif /* PERFECT_MMGR */
|
||||
}
|
||||
|
||||
/*
|
||||
* btmarkpos() -- save current scan position
|
||||
*/
|
||||
void
|
||||
btmarkpos(IndexScanDesc scan)
|
||||
{
|
||||
ItemPointer iptr;
|
||||
BTScanOpaque so;
|
||||
|
||||
so = (BTScanOpaque) scan->opaque;
|
||||
|
||||
/* release lock on old marked data, if any */
|
||||
if (ItemPointerIsValid(iptr = &(scan->currentMarkData))) {
|
||||
_bt_relbuf(scan->relation, so->btso_mrkbuf, BT_READ);
|
||||
so->btso_mrkbuf = InvalidBuffer;
|
||||
ItemPointerSetInvalid(iptr);
|
||||
}
|
||||
|
||||
/* bump lock on currentItemData and copy to currentMarkData */
|
||||
if (ItemPointerIsValid(&(scan->currentItemData))) {
|
||||
so->btso_mrkbuf = _bt_getbuf(scan->relation,
|
||||
BufferGetBlockNumber(so->btso_curbuf),
|
||||
BT_READ);
|
||||
scan->currentMarkData = scan->currentItemData;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* btrestrpos() -- restore scan to last saved position
|
||||
*/
|
||||
void
|
||||
btrestrpos(IndexScanDesc scan)
|
||||
{
|
||||
ItemPointer iptr;
|
||||
BTScanOpaque so;
|
||||
|
||||
so = (BTScanOpaque) scan->opaque;
|
||||
|
||||
/* release lock on current data, if any */
|
||||
if (ItemPointerIsValid(iptr = &(scan->currentItemData))) {
|
||||
_bt_relbuf(scan->relation, so->btso_curbuf, BT_READ);
|
||||
so->btso_curbuf = InvalidBuffer;
|
||||
ItemPointerSetInvalid(iptr);
|
||||
}
|
||||
|
||||
/* bump lock on currentMarkData and copy to currentItemData */
|
||||
if (ItemPointerIsValid(&(scan->currentMarkData))) {
|
||||
so->btso_curbuf = _bt_getbuf(scan->relation,
|
||||
BufferGetBlockNumber(so->btso_mrkbuf),
|
||||
BT_READ);
|
||||
|
||||
scan->currentItemData = scan->currentMarkData;
|
||||
}
|
||||
}
|
||||
|
||||
/* stubs */
|
||||
void
|
||||
btdelete(Relation rel, ItemPointer tid)
|
||||
{
|
||||
/* adjust any active scans that will be affected by this deletion */
|
||||
_bt_adjscans(rel, tid);
|
||||
|
||||
/* delete the data from the page */
|
||||
_bt_pagedel(rel, tid);
|
||||
}
|
164
src/backend/access/nbtree/nbtscan.c
Normal file
164
src/backend/access/nbtree/nbtscan.c
Normal file
@@ -0,0 +1,164 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* btscan.c--
|
||||
* manage scans on btrees.
|
||||
*
|
||||
* Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/Attic/nbtscan.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
|
||||
*
|
||||
*
|
||||
* NOTES
|
||||
* Because we can be doing an index scan on a relation while we update
|
||||
* it, we need to avoid missing data that moves around in the index.
|
||||
* The routines and global variables in this file guarantee that all
|
||||
* scans in the local address space stay correctly positioned. This
|
||||
* is all we need to worry about, since write locking guarantees that
|
||||
* no one else will be on the same page at the same time as we are.
|
||||
*
|
||||
* The scheme is to manage a list of active scans in the current backend.
|
||||
* Whenever we add or remove records from an index, or whenever we
|
||||
* split a leaf page, we check the list of active scans to see if any
|
||||
* has been affected. A scan is affected only if it is on the same
|
||||
* relation, and the same page, as the update.
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#include "postgres.h"
|
||||
|
||||
#include "storage/bufmgr.h"
|
||||
#include "storage/bufpage.h"
|
||||
|
||||
#include "utils/elog.h"
|
||||
#include "utils/palloc.h"
|
||||
#include "utils/rel.h"
|
||||
#include "utils/excid.h"
|
||||
|
||||
#include "access/heapam.h"
|
||||
#include "access/genam.h"
|
||||
#include "access/sdir.h"
|
||||
#include "access/nbtree.h"
|
||||
|
||||
/*
 *  A singly-linked list of the btree scans active in this backend.
 *  Update operations consult this list (via _bt_adjscans) so that open
 *  scans stay correctly positioned when index items move.
 */
typedef struct BTScanListData {
    IndexScanDesc btsl_scan;		/* the registered scan */
    struct BTScanListData *btsl_next;	/* next entry, or NULL at list end */
} BTScanListData;

typedef BTScanListData *BTScanList;

/* head of the active-scan list; private to this backend */
static BTScanList BTScans = (BTScanList) NULL;
|
||||
|
||||
/*
|
||||
* _bt_regscan() -- register a new scan.
|
||||
*/
|
||||
void
|
||||
_bt_regscan(IndexScanDesc scan)
|
||||
{
|
||||
BTScanList new_el;
|
||||
|
||||
new_el = (BTScanList) palloc(sizeof(BTScanListData));
|
||||
new_el->btsl_scan = scan;
|
||||
new_el->btsl_next = BTScans;
|
||||
BTScans = new_el;
|
||||
}
|
||||
|
||||
/*
|
||||
* _bt_dropscan() -- drop a scan from the scan list
|
||||
*/
|
||||
void
|
||||
_bt_dropscan(IndexScanDesc scan)
|
||||
{
|
||||
BTScanList chk, last;
|
||||
|
||||
last = (BTScanList) NULL;
|
||||
for (chk = BTScans;
|
||||
chk != (BTScanList) NULL && chk->btsl_scan != scan;
|
||||
chk = chk->btsl_next) {
|
||||
last = chk;
|
||||
}
|
||||
|
||||
if (chk == (BTScanList) NULL)
|
||||
elog(WARN, "btree scan list trashed; can't find 0x%lx", scan);
|
||||
|
||||
if (last == (BTScanList) NULL)
|
||||
BTScans = chk->btsl_next;
|
||||
else
|
||||
last->btsl_next = chk->btsl_next;
|
||||
|
||||
#ifdef PERFECT_MEM
|
||||
pfree (chk);
|
||||
#endif /* PERFECT_MEM */
|
||||
}
|
||||
|
||||
void
|
||||
_bt_adjscans(Relation rel, ItemPointer tid)
|
||||
{
|
||||
BTScanList l;
|
||||
Oid relid;
|
||||
|
||||
relid = rel->rd_id;
|
||||
for (l = BTScans; l != (BTScanList) NULL; l = l->btsl_next) {
|
||||
if (relid == l->btsl_scan->relation->rd_id)
|
||||
_bt_scandel(l->btsl_scan, ItemPointerGetBlockNumber(tid),
|
||||
ItemPointerGetOffsetNumber(tid));
|
||||
}
|
||||
}
|
||||
|
||||
/*
 *  _bt_scandel() -- adjust one scan to compensate for the removal of
 *	the item at (blkno, offno).
 *
 *  If the scan's current position is on the affected page at or past
 *  the removed offset, step it backward one slot so the next forward
 *  step lands on the right item.  The same adjustment is applied to
 *  the marked position.
 */
void
_bt_scandel(IndexScanDesc scan, BlockNumber blkno, OffsetNumber offno)
{
    ItemPointer current;
    Buffer buf;
    BTScanOpaque so;

    /* fast exit if neither position is on the affected page/offset */
    if (!_bt_scantouched(scan, blkno, offno))
	return;

    so = (BTScanOpaque) scan->opaque;
    buf = so->btso_curbuf;

    current = &(scan->currentItemData);
    if (ItemPointerIsValid(current)
	&& ItemPointerGetBlockNumber(current) == blkno
	&& ItemPointerGetOffsetNumber(current) >= offno) {
	_bt_step(scan, &buf, BackwardScanDirection);
	so->btso_curbuf = buf;
    }

    current = &(scan->currentMarkData);
    if (ItemPointerIsValid(current)
	&& ItemPointerGetBlockNumber(current) == blkno
	&& ItemPointerGetOffsetNumber(current) >= offno) {
	ItemPointerData tmp;

	/*
	 *  _bt_step operates on currentItemData, so temporarily swap
	 *  the mark into that slot, step it backward, then swap back.
	 *  The order of these statements is load-bearing.
	 */
	tmp = *current;
	*current = scan->currentItemData;
	scan->currentItemData = tmp;
	_bt_step(scan, &buf, BackwardScanDirection);
	so->btso_mrkbuf = buf;
	tmp = *current;
	*current = scan->currentItemData;
	scan->currentItemData = tmp;
    }
}
|
||||
|
||||
bool
|
||||
_bt_scantouched(IndexScanDesc scan, BlockNumber blkno, OffsetNumber offno)
|
||||
{
|
||||
ItemPointer current;
|
||||
|
||||
current = &(scan->currentItemData);
|
||||
if (ItemPointerIsValid(current)
|
||||
&& ItemPointerGetBlockNumber(current) == blkno
|
||||
&& ItemPointerGetOffsetNumber(current) >= offno)
|
||||
return (true);
|
||||
|
||||
current = &(scan->currentMarkData);
|
||||
if (ItemPointerIsValid(current)
|
||||
&& ItemPointerGetBlockNumber(current) == blkno
|
||||
&& ItemPointerGetOffsetNumber(current) >= offno)
|
||||
return (true);
|
||||
|
||||
return (false);
|
||||
}
|
1133
src/backend/access/nbtree/nbtsearch.c
Normal file
1133
src/backend/access/nbtree/nbtsearch.c
Normal file
File diff suppressed because it is too large
Load Diff
1196
src/backend/access/nbtree/nbtsort.c
Normal file
1196
src/backend/access/nbtree/nbtsort.c
Normal file
File diff suppressed because it is too large
Load Diff
134
src/backend/access/nbtree/nbtstrat.c
Normal file
134
src/backend/access/nbtree/nbtstrat.c
Normal file
@@ -0,0 +1,134 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* btstrat.c--
|
||||
* Srategy map entries for the btree indexed access method
|
||||
*
|
||||
* Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/Attic/nbtstrat.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#include "postgres.h"
|
||||
|
||||
#include "storage/bufpage.h"
|
||||
|
||||
#include "utils/elog.h"
|
||||
#include "utils/rel.h"
|
||||
#include "utils/excid.h"
|
||||
|
||||
#include "access/genam.h"
|
||||
#include "access/nbtree.h"
|
||||
|
||||
/*
 * Note:
 *	StrategyNegate, StrategyCommute, and StrategyNegateCommute
 *	assume <, <=, ==, >=, > ordering.
 */

/* strategy obtained by negating each strategy, in the order above */
static StrategyNumber BTNegate[5] = {
    BTGreaterEqualStrategyNumber,
    BTGreaterStrategyNumber,
    InvalidStrategy,
    BTLessStrategyNumber,
    BTLessEqualStrategyNumber
};

/* strategy obtained by commuting (swapping the operands of) each strategy */
static StrategyNumber BTCommute[5] = {
    BTGreaterStrategyNumber,
    BTGreaterEqualStrategyNumber,
    InvalidStrategy,
    BTLessEqualStrategyNumber,
    BTLessStrategyNumber
};

/* strategy obtained by negating and commuting each strategy */
static StrategyNumber BTNegateCommute[5] = {
    BTLessEqualStrategyNumber,
    BTLessStrategyNumber,
    InvalidStrategy,
    BTGreaterStrategyNumber,
    BTGreaterEqualStrategyNumber
};

/*
 *  Term data below: first element is the term count, followed by
 *  (strategy number, flags) pairs.
 */
static uint16 BTLessTermData[] = {	/* XXX type clash */
    2,
    BTLessStrategyNumber,
    SK_NEGATE,
    BTLessStrategyNumber,
    SK_NEGATE | SK_COMMUTE
};

static uint16 BTLessEqualTermData[] = {	/* XXX type clash */
    2,
    BTLessEqualStrategyNumber,
    0x0,
    BTLessEqualStrategyNumber,
    SK_COMMUTE
};

static uint16 BTGreaterEqualTermData[] = {	/* XXX type clash */
    2,
    BTGreaterEqualStrategyNumber,
    0x0,
    BTGreaterEqualStrategyNumber,
    SK_COMMUTE
};

static uint16 BTGreaterTermData[] = {	/* XXX type clash */
    2,
    BTGreaterStrategyNumber,
    SK_NEGATE,
    BTGreaterStrategyNumber,
    SK_NEGATE | SK_COMMUTE
};

/* NULL-terminated list of the term expressions for equality */
static StrategyTerm BTEqualExpressionData[] = {
    (StrategyTerm)BTLessTermData,	/* XXX */
    (StrategyTerm)BTLessEqualTermData,	/* XXX */
    (StrategyTerm)BTGreaterEqualTermData,	/* XXX */
    (StrategyTerm)BTGreaterTermData,	/* XXX */
    NULL
};

/*
 *  The evaluation data handed to the generic strategy-map machinery by
 *  _bt_getstrat() and _bt_invokestrat() below.
 */
static StrategyEvaluationData BTEvaluationData = {
    /* XXX static for simplicity */

    BTMaxStrategyNumber,
    (StrategyTransformMap)BTNegate,	/* XXX */
    (StrategyTransformMap)BTCommute,	/* XXX */
    (StrategyTransformMap)BTNegateCommute,	/* XXX */

    { NULL, NULL, (StrategyExpression)BTEqualExpressionData, NULL, NULL,
      NULL,NULL,NULL,NULL,NULL,NULL,NULL}
};
|
||||
|
||||
/* ----------------------------------------------------------------
|
||||
* RelationGetBTStrategy
|
||||
* ----------------------------------------------------------------
|
||||
*/
|
||||
|
||||
StrategyNumber
|
||||
_bt_getstrat(Relation rel,
|
||||
AttrNumber attno,
|
||||
RegProcedure proc)
|
||||
{
|
||||
StrategyNumber strat;
|
||||
|
||||
strat = RelationGetStrategy(rel, attno, &BTEvaluationData, proc);
|
||||
|
||||
Assert(StrategyNumberIsValid(strat));
|
||||
|
||||
return (strat);
|
||||
}
|
||||
|
||||
bool
|
||||
_bt_invokestrat(Relation rel,
|
||||
AttrNumber attno,
|
||||
StrategyNumber strat,
|
||||
Datum left,
|
||||
Datum right)
|
||||
{
|
||||
return (RelationInvokeStrategy(rel, &BTEvaluationData, attno, strat,
|
||||
left, right));
|
||||
}
|
239
src/backend/access/nbtree/nbtutils.c
Normal file
239
src/backend/access/nbtree/nbtutils.c
Normal file
@@ -0,0 +1,239 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* btutils.c--
|
||||
* Utility code for Postgres btree implementation.
|
||||
*
|
||||
* Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtutils.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include "postgres.h"
|
||||
|
||||
#include "storage/bufmgr.h"
|
||||
#include "storage/bufpage.h"
|
||||
|
||||
#include "fmgr.h"
|
||||
#include "utils/elog.h"
|
||||
#include "utils/palloc.h"
|
||||
#include "utils/rel.h"
|
||||
#include "utils/excid.h"
|
||||
#include "utils/datum.h"
|
||||
|
||||
#include "access/heapam.h"
|
||||
#include "access/genam.h"
|
||||
#include "access/iqual.h"
|
||||
#include "access/nbtree.h"
|
||||
|
||||
ScanKey
|
||||
_bt_mkscankey(Relation rel, IndexTuple itup)
|
||||
{
|
||||
ScanKey skey;
|
||||
TupleDesc itupdesc;
|
||||
int natts;
|
||||
int i;
|
||||
Datum arg;
|
||||
RegProcedure proc;
|
||||
bool null;
|
||||
|
||||
natts = rel->rd_rel->relnatts;
|
||||
itupdesc = RelationGetTupleDescriptor(rel);
|
||||
|
||||
skey = (ScanKey) palloc(natts * sizeof(ScanKeyData));
|
||||
|
||||
for (i = 0; i < natts; i++) {
|
||||
arg = index_getattr(itup, i + 1, itupdesc, &null);
|
||||
proc = index_getprocid(rel, i + 1, BTORDER_PROC);
|
||||
ScanKeyEntryInitialize(&skey[i],
|
||||
0x0, (AttrNumber) (i + 1), proc, arg);
|
||||
}
|
||||
|
||||
return (skey);
|
||||
}
|
||||
|
||||
void
|
||||
_bt_freeskey(ScanKey skey)
|
||||
{
|
||||
pfree(skey);
|
||||
}
|
||||
|
||||
void
|
||||
_bt_freestack(BTStack stack)
|
||||
{
|
||||
BTStack ostack;
|
||||
|
||||
while (stack != (BTStack) NULL) {
|
||||
ostack = stack;
|
||||
stack = stack->bts_parent;
|
||||
pfree(ostack->bts_btitem);
|
||||
pfree(ostack);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* _bt_orderkeys() -- Put keys in a sensible order for conjunctive quals.
|
||||
*
|
||||
* The order of the keys in the qual match the ordering imposed by
|
||||
* the index. This routine only needs to be called if there are
|
||||
* more than one qual clauses using this index.
|
||||
*/
|
||||
void
|
||||
_bt_orderkeys(Relation relation, uint16 *numberOfKeys, ScanKey key)
|
||||
{
|
||||
ScanKey xform;
|
||||
ScanKeyData *cur;
|
||||
StrategyMap map;
|
||||
int nbytes;
|
||||
long test;
|
||||
int i, j;
|
||||
int init[BTMaxStrategyNumber+1];
|
||||
|
||||
/* haven't looked at any strategies yet */
|
||||
for (i = 0; i <= BTMaxStrategyNumber; i++)
|
||||
init[i] = 0;
|
||||
|
||||
/* get space for the modified array of keys */
|
||||
nbytes = BTMaxStrategyNumber * sizeof(ScanKeyData);
|
||||
xform = (ScanKey) palloc(nbytes);
|
||||
memset(xform, 0, nbytes);
|
||||
|
||||
|
||||
/* get the strategy map for this index/attribute pair */
|
||||
/*
|
||||
* XXX
|
||||
* When we support multiple keys in a single index, this is what
|
||||
* we'll want to do. At present, the planner is hosed, so we
|
||||
* hard-wire the attribute number below. Postgres only does single-
|
||||
* key indices...
|
||||
* map = IndexStrategyGetStrategyMap(RelationGetIndexStrategy(relation),
|
||||
* BTMaxStrategyNumber,
|
||||
* key->data[0].attributeNumber);
|
||||
*/
|
||||
map = IndexStrategyGetStrategyMap(RelationGetIndexStrategy(relation),
|
||||
BTMaxStrategyNumber,
|
||||
1 /* XXX */ );
|
||||
|
||||
/* check each key passed in */
|
||||
for (i = *numberOfKeys; --i >= 0; ) {
|
||||
cur = &key[i];
|
||||
for (j = BTMaxStrategyNumber; --j >= 0; ) {
|
||||
if (cur->sk_procedure == map->entry[j].sk_procedure)
|
||||
break;
|
||||
}
|
||||
|
||||
/* have we seen one of these before? */
|
||||
if (init[j]) {
|
||||
/* yup, use the appropriate value */
|
||||
test =
|
||||
(long) FMGR_PTR2(cur->sk_func, cur->sk_procedure,
|
||||
cur->sk_argument, xform[j].sk_argument);
|
||||
if (test)
|
||||
xform[j].sk_argument = cur->sk_argument;
|
||||
} else {
|
||||
/* nope, use this value */
|
||||
memmove(&xform[j], cur, sizeof(*cur));
|
||||
|
||||
init[j] = 1;
|
||||
}
|
||||
}
|
||||
|
||||
/* if = has been specified, no other key will be used */
|
||||
if (init[BTEqualStrategyNumber - 1]) {
|
||||
init[BTLessStrategyNumber - 1] = 0;
|
||||
init[BTLessEqualStrategyNumber - 1] = 0;
|
||||
init[BTGreaterEqualStrategyNumber - 1] = 0;
|
||||
init[BTGreaterStrategyNumber - 1] = 0;
|
||||
}
|
||||
|
||||
/* only one of <, <= */
|
||||
if (init[BTLessStrategyNumber - 1]
|
||||
&& init[BTLessEqualStrategyNumber - 1]) {
|
||||
|
||||
ScanKeyData *lt, *le;
|
||||
|
||||
lt = &xform[BTLessStrategyNumber - 1];
|
||||
le = &xform[BTLessEqualStrategyNumber - 1];
|
||||
|
||||
/*
|
||||
* DO NOT use the cached function stuff here -- this is key
|
||||
* ordering, happens only when the user expresses a hokey
|
||||
* qualification, and gets executed only once, anyway. The
|
||||
* transform maps are hard-coded, and can't be initialized
|
||||
* in the correct way.
|
||||
*/
|
||||
|
||||
test = (long) fmgr(le->sk_procedure, le->sk_argument, lt->sk_argument);
|
||||
|
||||
if (test)
|
||||
init[BTLessEqualStrategyNumber - 1] = 0;
|
||||
else
|
||||
init[BTLessStrategyNumber - 1] = 0;
|
||||
}
|
||||
|
||||
/* only one of >, >= */
|
||||
if (init[BTGreaterStrategyNumber - 1]
|
||||
&& init[BTGreaterEqualStrategyNumber - 1]) {
|
||||
|
||||
ScanKeyData *gt, *ge;
|
||||
|
||||
gt = &xform[BTGreaterStrategyNumber - 1];
|
||||
ge = &xform[BTGreaterEqualStrategyNumber - 1];
|
||||
|
||||
/* see note above on function cache */
|
||||
test = (long) fmgr(ge->sk_procedure, gt->sk_argument, gt->sk_argument);
|
||||
|
||||
if (test)
|
||||
init[BTGreaterStrategyNumber - 1] = 0;
|
||||
else
|
||||
init[BTGreaterEqualStrategyNumber - 1] = 0;
|
||||
}
|
||||
|
||||
/* okay, reorder and count */
|
||||
j = 0;
|
||||
|
||||
for (i = BTMaxStrategyNumber; --i >= 0; )
|
||||
if (init[i])
|
||||
key[j++] = xform[i];
|
||||
|
||||
*numberOfKeys = j;
|
||||
|
||||
pfree(xform);
|
||||
}
|
||||
|
||||
bool
|
||||
_bt_checkqual(IndexScanDesc scan, IndexTuple itup)
|
||||
{
|
||||
if (scan->numberOfKeys > 0)
|
||||
return (index_keytest(itup, RelationGetTupleDescriptor(scan->relation),
|
||||
scan->numberOfKeys, scan->keyData));
|
||||
else
|
||||
return (true);
|
||||
}
|
||||
|
||||
BTItem
|
||||
_bt_formitem(IndexTuple itup)
|
||||
{
|
||||
int nbytes_btitem;
|
||||
BTItem btitem;
|
||||
Size tuplen;
|
||||
extern Oid newoid();
|
||||
|
||||
/* disallow nulls in btree keys */
|
||||
if (itup->t_info & INDEX_NULL_MASK)
|
||||
elog(WARN, "btree indices cannot include null keys");
|
||||
|
||||
/* make a copy of the index tuple with room for the sequence number */
|
||||
tuplen = IndexTupleSize(itup);
|
||||
nbytes_btitem = tuplen +
|
||||
(sizeof(BTItemData) - sizeof(IndexTupleData));
|
||||
|
||||
btitem = (BTItem) palloc(nbytes_btitem);
|
||||
memmove((char *) &(btitem->bti_itup), (char *) itup, tuplen);
|
||||
|
||||
btitem->bti_oid = newoid();
|
||||
return (btitem);
|
||||
}
|
Reference in New Issue
Block a user