
Postgres95 1.01 Distribution - Virgin Sources

Marc G. Fournier
1996-07-09 06:22:35 +00:00
commit d31084e9d1
868 changed files with 242656 additions and 0 deletions

@@ -0,0 +1,15 @@
#-------------------------------------------------------------------------
#
# Makefile.inc--
# Makefile for access/nbtree (btree access methods)
#
# Copyright (c) 1994, Regents of the University of California
#
#
# IDENTIFICATION
# $Header: /cvsroot/pgsql/src/backend/access/nbtree/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:11 scrappy Exp $
#
#-------------------------------------------------------------------------
SUBSRCS+= nbtcompare.c nbtinsert.c nbtpage.c nbtree.c nbtscan.c nbtsearch.c \
nbtstrat.c nbtutils.c nbtsort.c

@@ -0,0 +1,68 @@
$Header: /cvsroot/pgsql/src/backend/access/nbtree/README,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
This directory contains a correct implementation of Lehman and Yao's
btree management algorithm that supports concurrent access for Postgres.
We have made the following changes in order to incorporate their algorithm
into Postgres:
+ The requirement that all btree keys be unique is too onerous,
but the algorithm won't work correctly without it. As a result,
this implementation adds an OID (guaranteed to be unique) to
every key in the index. This guarantees uniqueness within a set
of duplicates. Space overhead is four bytes.
For this reason, when we're passed an index tuple to store by the
common access method code, we allocate a larger one and copy the
supplied tuple into it. No Postgres code outside of the btree
access method knows about this extra OID. (A sketch of the item
layout follows this list.)
+ Lehman and Yao don't require read locks, but assume that in-
memory copies of tree nodes are unshared. Postgres shares
in-memory buffers among backends. As a result, we do page-
level read locking on btree nodes in order to guarantee that
no record is modified while we are examining it. This reduces
concurrency but guarantees correct behavior.
+ Read locks on a page are held for as long as a scan has a pointer
to the page. However, locks are always surrendered before the
sibling page lock is acquired (for readers), so we remain deadlock-
free. I will do a formal proof if I get bored anytime soon.
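As a sketch, the item layout implementing that extra OID looks like
this (field names match the bti_oid and bti_itup references in the C
files below; the authoritative declaration, including any padding,
lives in access/nbtree.h, which this view does not show):

	typedef struct BTItemData {
		Oid		bti_oid;	/* disambiguating OID, unique per item */
		IndexTupleData	bti_itup;	/* the index tuple proper */
	} BTItemData;

	typedef BTItemData *BTItem;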
In addition, the following things are handy to know:
+ Page zero of every btree is a meta-data page. This page stores
the location of the root page, a pointer to a list of free
pages, and other stuff that's handy to know.
+ This algorithm doesn't really work, since it requires ordered
writes, and UNIX doesn't support ordered writes.
+ There's one other case where we may screw up in this
implementation. When we start a scan, we descend the tree
to the key nearest the one in the qual, and once we get there,
position ourselves correctly for the qual type (eg, <, >=, etc).
If we happen to step off a page, decide we want to get back to
it, and fetch the page again, and if some bad person has split
the page and moved the last tuple we saw off of it, then the
code complains about botched concurrency in an elog(WARN, ...)
and gives up the ghost. This is the ONLY violation of Lehman
and Yao's guarantee of correct behavior that I am aware of in
this code.
Notes to operator class implementors:
With this implementation, we require the user to supply us with
a procedure for pg_amproc. This procedure should take two keys
A and B and return < 0, 0, or > 0 if A < B, A = B, or A > B,
respectively. See the contents of that relation for the btree
access method for some samples.
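For instance, a comparator for a plain 32-bit integer key could be
written as follows (my_int4cmp is an illustrative name, not part of
this commit; nbtcompare.c, below, contains the actual comparators
registered in pg_amproc):

	int32
	my_int4cmp(int32 a, int32 b)
	{
		if (a < b)
			return (-1);
		else if (a > b)
			return (1);
		else
			return (0);
	}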
Notes to mao for implementation document:
On deletions, we need to adjust the position of active scans on
the index. The code in nbtscan.c handles this. We don't need to
do this for splits because of the way splits are handled; if they
happen behind us, we'll automatically go to the next page, and if
they happen in front of us, we're not affected by them. For
insertions, if we inserted a tuple behind the current scan location
on the current scan page, we move one space ahead.

@@ -0,0 +1,173 @@
/*-------------------------------------------------------------------------
*
* btcompare.c--
* Comparison functions for btree access method.
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtcompare.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
*
* NOTES
* These functions are stored in pg_amproc. For each operator class
* defined on btrees, they compute
*
* compare(a, b):
* < 0 if a < b,
* = 0 if a == b,
* > 0 if a > b.
*-------------------------------------------------------------------------
*/
#include <string.h>
#include "postgres.h"
#include "utils/nabstime.h"
int32
btint2cmp(int16 a, int16 b)
{
return ((int32) (a - b));
}
int32
btint4cmp(int32 a, int32 b)
{
return (a - b);
}
int32
btint24cmp(int16 a, int32 b)
{
return (((int32) a) - b);
}
int32
btint42cmp(int32 a, int16 b)
{
return (a - ((int32) b));
}
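/*
 * A caveat not spelled out above: the subtraction trick is only safe
 * when the difference cannot overflow. That holds for btint2cmp
 * (int16 differences always fit in an int32), but in btint4cmp,
 * btint24cmp, and btint42cmp the subtraction can wrap around for
 * operands near opposite ends of the int32 range, yielding a result
 * with the wrong sign. Explicit comparisons, as in the float and oid
 * cases below, avoid that hazard.
 */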
int32
btfloat4cmp(float32 a, float32 b)
{
if (*a > *b)
return (1);
else if (*a == *b)
return (0);
else
return (-1);
}
int32
btfloat8cmp(float64 a, float64 b)
{
if (*a > *b)
return (1);
else if (*a == *b)
return (0);
else
return (-1);
}
int32
btoidcmp(Oid a, Oid b)
{
if (a > b)
return (1);
else if (a == b)
return (0);
else
return (-1);
}
int32
btabstimecmp(AbsoluteTime a, AbsoluteTime b)
{
if (AbsoluteTimeIsBefore(a, b))
return (1);
else if (AbsoluteTimeIsBefore(b, a))
return (-1);
else
return (0);
}
int32
btcharcmp(char a, char b)
{
return ((int32) (a - b));
}
int32
btchar2cmp(uint16 a, uint16 b)
{
return (strncmp((char *) &a, (char *) &b, 2));
}
int32
btchar4cmp(uint32 a, uint32 b)
{
return (strncmp((char *) &a, (char *) &b, 4));
}
int32
btchar8cmp(char *a, char *b)
{
return (strncmp(a, b, 8));
}
int32
btchar16cmp(char *a, char *b)
{
return (strncmp(a, b, 16));
}
int32
btnamecmp(NameData *a, NameData *b)
{
return (strncmp(a->data, b->data, NAMEDATALEN));
}
int32
bttextcmp(struct varlena *a, struct varlena *b)
{
char *ap, *bp;
int len;
int res;
ap = VARDATA(a);
bp = VARDATA(b);
/* len is the length of the shorter of the two strings */
if ((len = VARSIZE(a)) > VARSIZE(b))
len = VARSIZE(b);
/* len includes the four bytes in which string length is stored */
len -= sizeof(VARSIZE(a));
/*
* If the two strings differ in the first len bytes, or if they're
* the same in the first len bytes and they're both len bytes long,
* we're done.
*/
res = 0;
if (len > 0) {
do {
res = (int) (*ap++ - *bp++);
len--;
} while (res == 0 && len != 0);
}
if (res != 0 || VARSIZE(a) == VARSIZE(b))
return (res);
/*
* The two strings are the same in the first len bytes, and they
* are of different lengths.
*/
if (VARSIZE(a) < VARSIZE(b))
return (-1);
else
return (1);
}
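A worked example of bttextcmp()'s prefix rule: for a = "abc" (VARSIZE 7,
a 4-byte length word plus 3 data bytes) and b = "abcd" (VARSIZE 8), len
comes out to 3 after subtracting the length word, the byte loop finds no
difference, the sizes differ, and since VARSIZE(a) < VARSIZE(b) the
function returns -1: a proper prefix sorts low.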

@@ -0,0 +1,831 @@
/*-------------------------------------------------------------------------
*
* btinsert.c--
* Item insertion in Lehman and Yao btrees for Postgres.
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "storage/bufmgr.h"
#include "storage/bufpage.h"
#include "utils/elog.h"
#include "utils/palloc.h"
#include "utils/rel.h"
#include "utils/excid.h"
#include "access/heapam.h"
#include "access/genam.h"
#include "access/nbtree.h"
static InsertIndexResult _bt_insertonpg(Relation rel, Buffer buf, BTStack stack, int keysz, ScanKey scankey, BTItem btitem, BTItem afteritem);
static Buffer _bt_split(Relation rel, Buffer buf);
static OffsetNumber _bt_findsplitloc(Relation rel, Page page, OffsetNumber start, OffsetNumber maxoff, Size llimit);
static void _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf);
static OffsetNumber _bt_pgaddtup(Relation rel, Buffer buf, int keysz, ScanKey itup_scankey, Size itemsize, BTItem btitem, BTItem afteritem);
static bool _bt_goesonpg(Relation rel, Buffer buf, Size keysz, ScanKey scankey, BTItem afteritem);
static void _bt_updateitem(Relation rel, Size keysz, Buffer buf, Oid bti_oid, BTItem newItem);
/*
* _bt_doinsert() -- Handle insertion of a single btitem in the tree.
*
* This routine is called by the public interface routines, btbuild
* and btinsert. By here, btitem is filled in, and has a unique
* OID (the duplicate-key disambiguator described in the README).
*/
InsertIndexResult
_bt_doinsert(Relation rel, BTItem btitem)
{
ScanKey itup_scankey;
IndexTuple itup;
BTStack stack;
Buffer buf;
BlockNumber blkno;
int natts;
InsertIndexResult res;
itup = &(btitem->bti_itup);
/* we need a scan key to do our search, so build one */
itup_scankey = _bt_mkscankey(rel, itup);
natts = rel->rd_rel->relnatts;
/* find the page containing this key */
stack = _bt_search(rel, natts, itup_scankey, &buf);
blkno = BufferGetBlockNumber(buf);
/* trade in our read lock for a write lock */
_bt_relbuf(rel, buf, BT_READ);
buf = _bt_getbuf(rel, blkno, BT_WRITE);
/*
* If the page was split between the time that we surrendered our
* read lock and acquired our write lock, then this page may no
* longer be the right place for the key we want to insert. In this
* case, we need to move right in the tree. See Lehman and Yao for
* an excruciatingly precise description.
*/
buf = _bt_moveright(rel, buf, natts, itup_scankey, BT_WRITE);
/* do the insertion */
res = _bt_insertonpg(rel, buf, stack, natts, itup_scankey,
btitem, (BTItem) NULL);
/* be tidy */
_bt_freestack(stack);
_bt_freeskey(itup_scankey);
return (res);
}
/*
* _bt_insertonpg() -- Insert a tuple on a particular page in the index.
*
* This recursive procedure does the following things:
*
* + if necessary, splits the target page.
* + finds the right place to insert the tuple (taking into
* account any changes induced by a split).
* + inserts the tuple.
* + if the page was split, pops the parent stack, and finds the
* right place to insert the new child pointer (by walking
* right using information stored in the parent stack).
* + invokes itself with the appropriate tuple for the right
* child page on the parent.
*
* On entry, we must have the right buffer on which to do the
* insertion, and the buffer must be pinned and locked. On return,
* we will have dropped both the pin and the write lock on the buffer.
*
* The locking interactions in this code are critical. You should
* grok Lehman and Yao's paper before making any changes. In addition,
* you need to understand how we disambiguate duplicate keys in this
* implementation, in order to be able to find our location using
* L&Y "move right" operations. Since we may insert duplicate user
* keys, and since these dups may propagate up the tree, we use the
* 'afteritem' parameter to position ourselves correctly for the
* insertion on internal pages.
*/
static InsertIndexResult
_bt_insertonpg(Relation rel,
Buffer buf,
BTStack stack,
int keysz,
ScanKey scankey,
BTItem btitem,
BTItem afteritem)
{
InsertIndexResult res;
Page page;
Buffer rbuf;
Buffer pbuf;
Page rpage;
ScanKey newskey;
BTItem ritem;
BTPageOpaque rpageop;
BlockNumber rbknum, itup_blkno;
OffsetNumber itup_off;
int itemsz;
InsertIndexResult newres;
BTItem new_item = (BTItem) NULL;
BTItem lowLeftItem;
page = BufferGetPage(buf);
itemsz = IndexTupleDSize(btitem->bti_itup)
+ (sizeof(BTItemData) - sizeof(IndexTupleData));
itemsz = DOUBLEALIGN(itemsz); /* be safe, PageAddItem will do this
but we need to be consistent */
if (PageGetFreeSpace(page) < itemsz) {
/* split the buffer into left and right halves */
rbuf = _bt_split(rel, buf);
/* which new page (left half or right half) gets the tuple? */
if (_bt_goesonpg(rel, buf, keysz, scankey, afteritem)) {
/* left page */
itup_off = _bt_pgaddtup(rel, buf, keysz, scankey,
itemsz, btitem, afteritem);
itup_blkno = BufferGetBlockNumber(buf);
} else {
/* right page */
itup_off = _bt_pgaddtup(rel, rbuf, keysz, scankey,
itemsz, btitem, afteritem);
itup_blkno = BufferGetBlockNumber(rbuf);
}
/*
* By here,
*
* + our target page has been split;
* + the original tuple has been inserted;
* + we have write locks on both the old (left half) and new
* (right half) buffers, after the split; and
* + we have the key we want to insert into the parent.
*
* Do the parent insertion. We need to hold onto the locks for
* the child pages until we locate the parent, but we can release
* them before doing the actual insertion (see Lehman and Yao for
* the reasoning).
*/
if (stack == (BTStack) NULL) {
/* create a new root node and release the split buffers */
_bt_newroot(rel, buf, rbuf);
_bt_relbuf(rel, buf, BT_WRITE);
_bt_relbuf(rel, rbuf, BT_WRITE);
} else {
/* form an index tuple that points at the new right page */
rbknum = BufferGetBlockNumber(rbuf);
rpage = BufferGetPage(rbuf);
rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage);
/*
* By convention, the first entry (0) on every
* non-rightmost page is the high key for that page. In
* order to get the lowest key on the new right page, we
* actually look at its second (1) entry.
*/
if (! P_RIGHTMOST(rpageop)) {
ritem = (BTItem) PageGetItem(rpage,
PageGetItemId(rpage, P_FIRSTKEY));
} else {
ritem = (BTItem) PageGetItem(rpage,
PageGetItemId(rpage, P_HIKEY));
}
/* get a unique btitem for this key */
new_item = _bt_formitem(&(ritem->bti_itup));
ItemPointerSet(&(new_item->bti_itup.t_tid), rbknum, P_HIKEY);
/* find the parent buffer */
pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
/*
* If the key of new_item is less than the key of the item
* in the parent page pointing to the left page
* (stack->bts_btitem), we have to update the latter key;
* otherwise the keys on the parent page wouldn't be
* monotonically increasing after we inserted the new
* pointer to the right page (new_item). This only
* happens if our left page is the leftmost page and a
* new minimum key had been inserted before, which is not
* reflected in the parent page but didn't matter so
* far. If there are duplicate keys and this new minimum
* key spills over to our new right page, we get an
* inconsistency if we don't update the left key in the
* parent page.
*/
if (_bt_itemcmp(rel, keysz, stack->bts_btitem, new_item,
BTGreaterStrategyNumber)) {
lowLeftItem =
(BTItem) PageGetItem(page,
PageGetItemId(page, P_FIRSTKEY));
/* page must have right pointer after split */
_bt_updateitem(rel, keysz, pbuf, stack->bts_btitem->bti_oid,
lowLeftItem);
}
/* don't need the children anymore */
_bt_relbuf(rel, buf, BT_WRITE);
_bt_relbuf(rel, rbuf, BT_WRITE);
newskey = _bt_mkscankey(rel, &(new_item->bti_itup));
newres = _bt_insertonpg(rel, pbuf, stack->bts_parent,
keysz, newskey, new_item,
stack->bts_btitem);
/* be tidy */
pfree(newres);
pfree(newskey);
pfree(new_item);
}
} else {
itup_off = _bt_pgaddtup(rel, buf, keysz, scankey,
itemsz, btitem, afteritem);
itup_blkno = BufferGetBlockNumber(buf);
_bt_relbuf(rel, buf, BT_WRITE);
}
/* by here, the new tuple is inserted */
res = (InsertIndexResult) palloc(sizeof(InsertIndexResultData));
ItemPointerSet(&(res->pointerData), itup_blkno, itup_off);
return (res);
}
/*
* _bt_split() -- split a page in the btree.
*
* On entry, buf is the page to split, and is write-locked and pinned.
* Returns the new right sibling of buf, pinned and write-locked. The
* pin and lock on buf are maintained.
*/
static Buffer
_bt_split(Relation rel, Buffer buf)
{
Buffer rbuf;
Page origpage;
Page leftpage, rightpage;
BTPageOpaque ropaque, lopaque, oopaque;
Buffer sbuf;
Page spage;
BTPageOpaque sopaque;
Size itemsz;
ItemId itemid;
BTItem item;
OffsetNumber leftoff, rightoff;
OffsetNumber start;
OffsetNumber maxoff;
OffsetNumber firstright;
OffsetNumber i;
Size llimit;
rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
origpage = BufferGetPage(buf);
leftpage = PageGetTempPage(origpage, sizeof(BTPageOpaqueData));
rightpage = BufferGetPage(rbuf);
_bt_pageinit(rightpage, BufferGetPageSize(rbuf));
_bt_pageinit(leftpage, BufferGetPageSize(buf));
/* init btree private data */
oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage);
lopaque = (BTPageOpaque) PageGetSpecialPointer(leftpage);
ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage);
/* if we're splitting this page, it won't be the root when we're done */
oopaque->btpo_flags &= ~BTP_ROOT;
lopaque->btpo_flags = ropaque->btpo_flags = oopaque->btpo_flags;
lopaque->btpo_prev = oopaque->btpo_prev;
ropaque->btpo_prev = BufferGetBlockNumber(buf);
lopaque->btpo_next = BufferGetBlockNumber(rbuf);
ropaque->btpo_next = oopaque->btpo_next;
/*
* If the page we're splitting is not the rightmost page at its
* level in the tree, then the first (0) entry on the page is the
* high key for the page. We need to copy that to the right
* half. Otherwise (meaning the rightmost page case), we should
* treat the line pointers beginning at zero as user data.
*
* We leave a blank space at the start of the line table for the
* left page. We'll come back later and fill it in with the high
* key item we get from the right page.
*/
leftoff = P_FIRSTKEY;
ropaque->btpo_next = oopaque->btpo_next;
if (! P_RIGHTMOST(oopaque)) {
/* splitting a non-rightmost page, start at the first data item */
start = P_FIRSTKEY;
/* copy the original high key to the new page */
itemid = PageGetItemId(origpage, P_HIKEY);
itemsz = ItemIdGetLength(itemid);
item = (BTItem) PageGetItem(origpage, itemid);
(void) PageAddItem(rightpage, (Item) item, itemsz, P_HIKEY, LP_USED);
rightoff = P_FIRSTKEY;
} else {
/* splitting a rightmost page, "high key" is the first data item */
start = P_HIKEY;
/* the new rightmost page will not have a high key */
rightoff = P_HIKEY;
}
maxoff = PageGetMaxOffsetNumber(origpage);
llimit = PageGetFreeSpace(leftpage) / 2;
firstright = _bt_findsplitloc(rel, origpage, start, maxoff, llimit);
for (i = start; i <= maxoff; i = OffsetNumberNext(i)) {
itemid = PageGetItemId(origpage, i);
itemsz = ItemIdGetLength(itemid);
item = (BTItem) PageGetItem(origpage, itemid);
/* decide which page to put it on */
if (i < firstright) {
(void) PageAddItem(leftpage, (Item) item, itemsz, leftoff,
LP_USED);
leftoff = OffsetNumberNext(leftoff);
} else {
(void) PageAddItem(rightpage, (Item) item, itemsz, rightoff,
LP_USED);
rightoff = OffsetNumberNext(rightoff);
}
}
/*
* Okay, page has been split, high key on right page is correct. Now
* set the high key on the left page to be the min key on the right
* page.
*/
if (P_RIGHTMOST(ropaque)) {
itemid = PageGetItemId(rightpage, P_HIKEY);
} else {
itemid = PageGetItemId(rightpage, P_FIRSTKEY);
}
itemsz = ItemIdGetLength(itemid);
item = (BTItem) PageGetItem(rightpage, itemid);
/*
* We left a hole for the high key on the left page; fill it. The
* modal crap is to tell the page manager to put the new item on the
* page and not screw around with anything else. Whoever designed
* this interface has presumably crawled back into the dung heap they
* came from. No one here will admit to it.
*/
PageManagerModeSet(OverwritePageManagerMode);
(void) PageAddItem(leftpage, (Item) item, itemsz, P_HIKEY, LP_USED);
PageManagerModeSet(ShufflePageManagerMode);
/*
* By here, the original data page has been split into two new halves,
* and these are correct. The algorithm requires that the left page
* never move during a split, so we copy the new left page back on top
* of the original. Note that this is not a waste of time, since we
* also require (in the page management code) that the center of a
* page always be clean, and the most efficient way to guarantee this
* is just to compact the data by reinserting it into a new left page.
*/
PageRestoreTempPage(leftpage, origpage);
/* write these guys out */
_bt_wrtnorelbuf(rel, rbuf);
_bt_wrtnorelbuf(rel, buf);
/*
* Finally, we need to grab the right sibling (if any) and fix the
* prev pointer there. We are guaranteed that this is deadlock-free
* since no other writer will be holding a lock on that page
* and trying to move left, and all readers release locks on a page
* before trying to fetch its neighbors.
*/
if (! P_RIGHTMOST(ropaque)) {
sbuf = _bt_getbuf(rel, ropaque->btpo_next, BT_WRITE);
spage = BufferGetPage(sbuf);
sopaque = (BTPageOpaque) PageGetSpecialPointer(spage);
sopaque->btpo_prev = BufferGetBlockNumber(rbuf);
/* write and release the old right sibling */
_bt_wrtbuf(rel, sbuf);
}
/* split's done */
return (rbuf);
}
/*
* _bt_findsplitloc() -- find a safe place to split a page.
*
* In order to guarantee the proper handling of searches for duplicate
* keys, the first duplicate in the chain must either be the first
* item on the page after the split, or the entire chain must be on
* one of the two pages. That is,
* [1 2 2 2 3 4 5]
* must become
* [1] [2 2 2 3 4 5]
* or
* [1 2 2 2] [3 4 5]
* but not
* [1 2 2] [2 3 4 5].
* However,
* [2 2 2 2 2 3 4]
* may be split as
* [2 2 2 2] [2 3 4].
*/
static OffsetNumber
_bt_findsplitloc(Relation rel,
Page page,
OffsetNumber start,
OffsetNumber maxoff,
Size llimit)
{
OffsetNumber i;
OffsetNumber saferight;
ItemId nxtitemid, safeitemid;
BTItem safeitem, nxtitem;
IndexTuple safetup, nxttup;
Size nbytes;
TupleDesc itupdesc;
int natts;
int attno;
Datum attsafe;
Datum attnext;
bool null;
itupdesc = RelationGetTupleDescriptor(rel);
natts = rel->rd_rel->relnatts;
saferight = start;
safeitemid = PageGetItemId(page, saferight);
nbytes = ItemIdGetLength(safeitemid) + sizeof(ItemIdData);
safeitem = (BTItem) PageGetItem(page, safeitemid);
safetup = &(safeitem->bti_itup);
i = OffsetNumberNext(start);
while (nbytes < llimit) {
/* check the next item on the page */
nxtitemid = PageGetItemId(page, i);
nbytes += (ItemIdGetLength(nxtitemid) + sizeof(ItemIdData));
nxtitem = (BTItem) PageGetItem(page, nxtitemid);
nxttup = &(nxtitem->bti_itup);
/* test against last known safe item */
for (attno = 1; attno <= natts; attno++) {
attsafe = index_getattr(safetup, attno, itupdesc, &null);
attnext = index_getattr(nxttup, attno, itupdesc, &null);
/*
* If the tuple we're looking at isn't equal to the last safe one
* we saw, then it's our new safe tuple.
*/
if (!_bt_invokestrat(rel, attno, BTEqualStrategyNumber,
attsafe, attnext)) {
safetup = nxttup;
saferight = i;
/* break is for the attno for loop */
break;
}
}
i = OffsetNumberNext(i);
}
/*
* If the chain of dups starts at the beginning of the page and extends
* past the halfway mark, we can split it in the middle.
*/
if (saferight == start)
saferight = i;
return (saferight);
}
/*
* _bt_newroot() -- Create a new root page for the index.
*
* We've just split the old root page and need to create a new one.
* In order to do this, we add a new root page to the file, then lock
* the metadata page and update it. This is guaranteed to be deadlock-
* free, because all readers release their locks on the metadata page
* before trying to lock the root, and all writers lock the root before
* trying to lock the metadata page. We have a write lock on the old
* root page, so we have not introduced any cycles into the waits-for
* graph.
*
* On entry, lbuf (the old root) and rbuf (its new peer) are write-
* locked. We don't drop the locks in this routine; that's done by
* the caller. On exit, a new root page exists with entries for the
* two new children. The new root page is neither pinned nor locked.
*/
static void
_bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
{
Buffer rootbuf;
Page lpage, rpage, rootpage;
BlockNumber lbkno, rbkno;
BlockNumber rootbknum;
BTPageOpaque rootopaque;
ItemId itemid;
BTItem item;
Size itemsz;
BTItem new_item;
/* get a new root page */
rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
rootpage = BufferGetPage(rootbuf);
_bt_pageinit(rootpage, BufferGetPageSize(rootbuf));
/* set btree special data */
rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpage);
rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
rootopaque->btpo_flags |= BTP_ROOT;
/*
* Insert the internal tuple pointers.
*/
lbkno = BufferGetBlockNumber(lbuf);
rbkno = BufferGetBlockNumber(rbuf);
lpage = BufferGetPage(lbuf);
rpage = BufferGetPage(rbuf);
/*
* step over the high key on the left page while building the
* left page pointer.
*/
itemid = PageGetItemId(lpage, P_FIRSTKEY);
itemsz = ItemIdGetLength(itemid);
item = (BTItem) PageGetItem(lpage, itemid);
new_item = _bt_formitem(&(item->bti_itup));
ItemPointerSet(&(new_item->bti_itup.t_tid), lbkno, P_FIRSTKEY);
/*
* insert the left page pointer into the new root page. the root
* page is the rightmost page on its level so the "high key" item
* is the first data item.
*/
(void) PageAddItem(rootpage, (Item) new_item, itemsz, P_HIKEY, LP_USED);
pfree(new_item);
/*
* the right page is the rightmost page on the second level, so
* the "high key" item is the first data item on that page as well.
*/
itemid = PageGetItemId(rpage, P_HIKEY);
itemsz = ItemIdGetLength(itemid);
item = (BTItem) PageGetItem(rpage, itemid);
new_item = _bt_formitem(&(item->bti_itup));
ItemPointerSet(&(new_item->bti_itup.t_tid), rbkno, P_HIKEY);
/*
* insert the right page pointer into the new root page.
*/
(void) PageAddItem(rootpage, (Item) new_item, itemsz, P_FIRSTKEY, LP_USED);
pfree(new_item);
/* write and let go of the root buffer */
rootbknum = BufferGetBlockNumber(rootbuf);
_bt_wrtbuf(rel, rootbuf);
/* update metadata page with new root block number */
_bt_metaproot(rel, rootbknum);
}
/*
* _bt_pgaddtup() -- add a tuple to a particular page in the index.
*
* This routine adds the tuple to the page as requested, and keeps the
* write lock and reference associated with the page's buffer. It is
* an error to call pgaddtup() without a write lock and reference. If
* afteritem is non-null, it's the item that we expect our new item
* to follow. Otherwise, we do a binary search for the correct place
* and insert the new item there.
*/
static OffsetNumber
_bt_pgaddtup(Relation rel,
Buffer buf,
int keysz,
ScanKey itup_scankey,
Size itemsize,
BTItem btitem,
BTItem afteritem)
{
OffsetNumber itup_off;
OffsetNumber first;
Page page;
BTPageOpaque opaque;
BTItem chkitem;
Oid afteroid;
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
first = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
if (afteritem == (BTItem) NULL) {
itup_off = _bt_binsrch(rel, buf, keysz, itup_scankey, BT_INSERTION);
} else {
afteroid = afteritem->bti_oid;
itup_off = first;
do {
chkitem =
(BTItem) PageGetItem(page, PageGetItemId(page, itup_off));
itup_off = OffsetNumberNext(itup_off);
} while (chkitem->bti_oid != afteroid);
}
(void) PageAddItem(page, (Item) btitem, itemsize, itup_off, LP_USED);
/* write the buffer, but hold our lock */
_bt_wrtnorelbuf(rel, buf);
return (itup_off);
}
/*
* _bt_goesonpg() -- Does a new tuple belong on this page?
*
* This is part of the complexity introduced by allowing duplicate
* keys into the index. The tuple belongs on this page if:
*
* + there is no page to the right of this one; or
* + it is less than the high key on the page; or
* + the item it is to follow ("afteritem") appears on this
* page.
*/
static bool
_bt_goesonpg(Relation rel,
Buffer buf,
Size keysz,
ScanKey scankey,
BTItem afteritem)
{
Page page;
ItemId hikey;
BTPageOpaque opaque;
BTItem chkitem;
OffsetNumber offnum, maxoff;
Oid afteroid;
bool found;
page = BufferGetPage(buf);
/* no right neighbor? */
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
if (P_RIGHTMOST(opaque))
return (true);
/*
* this is a non-rightmost page, so it must have a high key item.
*
* If the scan key is < the high key (the min key on the next page),
* then it for sure belongs here.
*/
hikey = PageGetItemId(page, P_HIKEY);
if (_bt_skeycmp(rel, keysz, scankey, page, hikey, BTLessStrategyNumber))
return (true);
/*
* If the scan key is > the high key, then it for sure doesn't belong
* here.
*/
if (_bt_skeycmp(rel, keysz, scankey, page, hikey, BTGreaterStrategyNumber))
return (false);
/*
* If we have no adjacency information, and the item is equal to the
* high key on the page (by here it is), then the item does not belong
* on this page.
*/
if (afteritem == (BTItem) NULL)
return (false);
/* damn, have to work for it. i hate that. */
afteroid = afteritem->bti_oid;
maxoff = PageGetMaxOffsetNumber(page);
/*
* Search the entire page for the afteroid. We need to do this, rather
* than doing a binary search and starting from there, because if the
* key we're searching for is the leftmost key in the tree at this
* level, then a binary search will do the wrong thing. Splits are
* pretty infrequent, so the cost isn't as bad as it could be.
*/
found = false;
for (offnum = P_FIRSTKEY;
offnum <= maxoff;
offnum = OffsetNumberNext(offnum)) {
chkitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
if (chkitem->bti_oid == afteroid) {
found = true;
break;
}
}
return (found);
}
/*
* _bt_itemcmp() -- compare item1 to item2 using a requested
* strategy (<, <=, =, >=, >)
*
*/
bool
_bt_itemcmp(Relation rel,
Size keysz,
BTItem item1,
BTItem item2,
StrategyNumber strat)
{
TupleDesc tupDes;
IndexTuple indexTuple1, indexTuple2;
Datum attrDatum1, attrDatum2;
int i;
bool isNull;
bool compare;
tupDes = RelationGetTupleDescriptor(rel);
indexTuple1 = &(item1->bti_itup);
indexTuple2 = &(item2->bti_itup);
for (i = 1; i <= keysz; i++) {
attrDatum1 = index_getattr(indexTuple1, i, tupDes, &isNull);
attrDatum2 = index_getattr(indexTuple2, i, tupDes, &isNull);
compare = _bt_invokestrat(rel, i, strat, attrDatum1, attrDatum2);
if (!compare) {
return (false);
}
}
return (true);
}
/*
* _bt_updateitem() -- updates the key of the item identified by the
* oid with the key of newItem (done in place)
*
*/
static void
_bt_updateitem(Relation rel,
Size keysz,
Buffer buf,
Oid bti_oid,
BTItem newItem)
{
Page page;
OffsetNumber maxoff;
OffsetNumber i;
ItemPointerData itemPtrData;
BTItem item;
IndexTuple oldIndexTuple, newIndexTuple;
page = BufferGetPage(buf);
maxoff = PageGetMaxOffsetNumber(page);
/* locate item on the page */
i = P_HIKEY;
do {
item = (BTItem) PageGetItem(page, PageGetItemId(page, i));
i = OffsetNumberNext(i);
} while (i <= maxoff && item->bti_oid != bti_oid);
/* this should never happen (in theory) */
if (item->bti_oid != bti_oid) {
elog(FATAL, "_bt_getstackbuf was lying!!");
}
oldIndexTuple = &(item->bti_itup);
newIndexTuple = &(newItem->bti_itup);
/* keep the original item pointer */
ItemPointerCopy(&(oldIndexTuple->t_tid), &itemPtrData);
CopyIndexTuple(newIndexTuple, &oldIndexTuple);
ItemPointerCopy(&itemPtrData, &(oldIndexTuple->t_tid));
}

@@ -0,0 +1,523 @@
/*-------------------------------------------------------------------------
*
* btpage.c--
* BTree-specific page management code for the Postgres btree access
* method.
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtpage.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
*
* NOTES
* Postgres btree pages look like ordinary relation pages. The opaque
* data at high addresses includes pointers to left and right siblings
* and flag data describing page state. The first page in a btree, page
* zero, is special -- it stores meta-information describing the tree.
* Pages one and higher store the actual tree data.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "storage/bufmgr.h"
#include "storage/bufpage.h"
#include "utils/elog.h"
#include "utils/rel.h"
#include "utils/excid.h"
#include "access/genam.h"
#include "access/nbtree.h"
#define BTREE_METAPAGE 0
#define BTREE_MAGIC 0x053162
#define BTREE_VERSION 0
typedef struct BTMetaPageData {
uint32 btm_magic;
uint32 btm_version;
BlockNumber btm_root;
} BTMetaPageData;
#define BTPageGetMeta(p) \
((BTMetaPageData *) &((PageHeader) p)->pd_linp[0])
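/*
 * Typical access pattern, as used by the routines below:
 *
 *	metad = BTPageGetMeta(BufferGetPage(metabuf));
 *	rootblkno = metad->btm_root;
 */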
extern bool BuildingBtree;
/*
* We use high-concurrency locking on btrees. There are two cases in
* which we don't do locking. One is when we're building the btree.
* Since the creating transaction has not committed, no one can see
* the index, and there's no reason to share locks. The second case
* is when we're just starting up the database system. We use some
* special-purpose initialization code in the relation cache manager
* (see utils/cache/relcache.c) to allow us to do indexed scans on
* the system catalogs before we'd normally be able to. This happens
* before the lock table is fully initialized, so we can't use it.
* Strictly speaking, this violates 2pl, but we don't do 2pl on the
* system catalogs anyway, so I declare this to be okay.
*/
#define USELOCKING (!BuildingBtree && !IsInitProcessingMode())
/*
* _bt_metapinit() -- Initialize the metadata page of a btree.
*/
void
_bt_metapinit(Relation rel)
{
Buffer buf;
Page pg;
int nblocks;
BTMetaPageData metad;
BTPageOpaque op;
/* can't be sharing this with anyone, now... */
if (USELOCKING)
RelationSetLockForWrite(rel);
if ((nblocks = RelationGetNumberOfBlocks(rel)) != 0) {
elog(WARN, "Cannot initialize non-empty btree %s",
RelationGetRelationName(rel));
}
buf = ReadBuffer(rel, P_NEW);
pg = BufferGetPage(buf);
_bt_pageinit(pg, BufferGetPageSize(buf));
metad.btm_magic = BTREE_MAGIC;
metad.btm_version = BTREE_VERSION;
metad.btm_root = P_NONE;
memmove((char *) BTPageGetMeta(pg), (char *) &metad, sizeof(metad));
op = (BTPageOpaque) PageGetSpecialPointer(pg);
op->btpo_flags = BTP_META;
WriteBuffer(buf);
/* all done */
if (USELOCKING)
RelationUnsetLockForWrite(rel);
}
/*
* _bt_checkmeta() -- Verify that the metadata stored in a btree are
* reasonable.
*/
void
_bt_checkmeta(Relation rel)
{
Buffer metabuf;
Page metap;
BTMetaPageData *metad;
BTPageOpaque op;
int nblocks;
/* if the relation is empty, this is init time; don't complain */
if ((nblocks = RelationGetNumberOfBlocks(rel)) == 0)
return;
metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
metap = BufferGetPage(metabuf);
op = (BTPageOpaque) PageGetSpecialPointer(metap);
if (!(op->btpo_flags & BTP_META)) {
elog(WARN, "Invalid metapage for index %s",
RelationGetRelationName(rel));
}
metad = BTPageGetMeta(metap);
if (metad->btm_magic != BTREE_MAGIC) {
elog(WARN, "Index %s is not a btree",
RelationGetRelationName(rel));
}
if (metad->btm_version != BTREE_VERSION) {
elog(WARN, "Version mismatch on %s: version %d file, version %d code",
RelationGetRelationName(rel),
metad->btm_version, BTREE_VERSION);
}
_bt_relbuf(rel, metabuf, BT_READ);
}
/*
* _bt_getroot() -- Get the root page of the btree.
*
* Since the root page can move around the btree file, we have to read
* its location from the metadata page, and then read the root page
* itself. If no root page exists yet, we have to create one. The
* standard class of race conditions exists here; I think I covered
* them all in the Hopi Indian rain dance of lock requests below.
*
* We pass in the access type (BT_READ or BT_WRITE), and return the
* root page's buffer with the appropriate lock type set. Reference
* count on the root page gets bumped by ReadBuffer. The metadata
* page is unlocked and unreferenced by this process when this routine
* returns.
*/
Buffer
_bt_getroot(Relation rel, int access)
{
Buffer metabuf;
Page metapg;
BTPageOpaque metaopaque;
Buffer rootbuf;
Page rootpg;
BTPageOpaque rootopaque;
BlockNumber rootblkno;
BTMetaPageData *metad;
metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);
metapg = BufferGetPage(metabuf);
metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
Assert(metaopaque->btpo_flags & BTP_META);
metad = BTPageGetMeta(metapg);
/* if no root page initialized yet, do it */
if (metad->btm_root == P_NONE) {
/* turn our read lock in for a write lock */
_bt_relbuf(rel, metabuf, BT_READ);
metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
metapg = BufferGetPage(metabuf);
metaopaque = (BTPageOpaque) PageGetSpecialPointer(metapg);
Assert(metaopaque->btpo_flags & BTP_META);
metad = BTPageGetMeta(metapg);
/*
* Race condition: if someone else initialized the metadata between
* the time we released the read lock and acquired the write lock,
* above, we want to avoid doing it again.
*/
if (metad->btm_root == P_NONE) {
/*
* Get, initialize, write, and leave a lock of the appropriate
* type on the new root page. Since this is the first page in
* the tree, it's a leaf.
*/
rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
rootblkno = BufferGetBlockNumber(rootbuf);
rootpg = BufferGetPage(rootbuf);
metad->btm_root = rootblkno;
_bt_pageinit(rootpg, BufferGetPageSize(rootbuf));
rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpg);
rootopaque->btpo_flags |= (BTP_LEAF | BTP_ROOT);
_bt_wrtnorelbuf(rel, rootbuf);
/* swap write lock for read lock, if appropriate */
if (access != BT_WRITE) {
_bt_setpagelock(rel, rootblkno, BT_READ);
_bt_unsetpagelock(rel, rootblkno, BT_WRITE);
}
/* okay, metadata is correct */
_bt_wrtbuf(rel, metabuf);
} else {
/*
* Metadata initialized by someone else. In order to guarantee
* no deadlocks, we have to release the metadata page and start
* all over again.
*/
_bt_relbuf(rel, metabuf, BT_WRITE);
return (_bt_getroot(rel, access));
}
} else {
rootbuf = _bt_getbuf(rel, metad->btm_root, access);
/* done with the meta page */
_bt_relbuf(rel, metabuf, BT_READ);
}
/*
* Race condition: If the root page split between the time we looked
* at the metadata page and got the root buffer, then we got the wrong
* buffer.
*/
rootpg = BufferGetPage(rootbuf);
rootopaque = (BTPageOpaque) PageGetSpecialPointer(rootpg);
if (!(rootopaque->btpo_flags & BTP_ROOT)) {
/* it happened, try again */
_bt_relbuf(rel, rootbuf, access);
return (_bt_getroot(rel, access));
}
/*
* By here, we have a correct lock on the root block, its reference
* count is correct, and we have no lock set on the metadata page.
* Return the root block.
*/
return (rootbuf);
}
/*
* _bt_getbuf() -- Get a buffer by block number for read or write.
*
* When this routine returns, the appropriate lock is set on the
* requested buffer and its reference count is correct.
*/
Buffer
_bt_getbuf(Relation rel, BlockNumber blkno, int access)
{
Buffer buf;
Page page;
/*
* If we want a new block, we can't set a lock of the appropriate type
* until we've instantiated the buffer.
*/
if (blkno != P_NEW) {
if (access == BT_WRITE)
_bt_setpagelock(rel, blkno, BT_WRITE);
else
_bt_setpagelock(rel, blkno, BT_READ);
buf = ReadBuffer(rel, blkno);
} else {
buf = ReadBuffer(rel, blkno);
blkno = BufferGetBlockNumber(buf);
page = BufferGetPage(buf);
_bt_pageinit(page, BufferGetPageSize(buf));
if (access == BT_WRITE)
_bt_setpagelock(rel, blkno, BT_WRITE);
else
_bt_setpagelock(rel, blkno, BT_READ);
}
/* ref count and lock type are correct */
return (buf);
}
/*
* _bt_relbuf() -- release a locked buffer.
*/
void
_bt_relbuf(Relation rel, Buffer buf, int access)
{
BlockNumber blkno;
blkno = BufferGetBlockNumber(buf);
/* access had better be one of read or write */
if (access == BT_WRITE)
_bt_unsetpagelock(rel, blkno, BT_WRITE);
else
_bt_unsetpagelock(rel, blkno, BT_READ);
ReleaseBuffer(buf);
}
/*
* _bt_wrtbuf() -- write a btree page to disk.
*
* This routine releases the lock held on the buffer and our reference
* to it. It is an error to call _bt_wrtbuf() without a write lock
* or a reference to the buffer.
*/
void
_bt_wrtbuf(Relation rel, Buffer buf)
{
BlockNumber blkno;
blkno = BufferGetBlockNumber(buf);
WriteBuffer(buf);
_bt_unsetpagelock(rel, blkno, BT_WRITE);
}
/*
* _bt_wrtnorelbuf() -- write a btree page to disk, but do not release
* our reference or lock.
*
* It is an error to call _bt_wrtnorelbuf() without a write lock
* or a reference to the buffer.
*/
void
_bt_wrtnorelbuf(Relation rel, Buffer buf)
{
BlockNumber blkno;
blkno = BufferGetBlockNumber(buf);
WriteNoReleaseBuffer(buf);
}
/*
* _bt_pageinit() -- Initialize a new page.
*/
void
_bt_pageinit(Page page, Size size)
{
/*
* Cargo-cult programming -- don't really need this to be zero, but
* creating new pages is an infrequent occurrence and it makes me feel
* good when I know they're empty.
*/
memset(page, 0, size);
PageInit(page, size, sizeof(BTPageOpaqueData));
}
/*
* _bt_metaproot() -- Change the root page of the btree.
*
* Lehman and Yao require that the root page move around in order to
* guarantee deadlock-free short-term, fine-granularity locking. When
* we split the root page, we record the new parent in the metadata page
* for the relation. This routine does the work.
*
* No direct preconditions, but if you don't have a write lock on
* at least the old root page when you call this, you're making a big
* mistake. On exit, metapage data is correct and we no longer have
* a reference to or lock on the metapage.
*/
void
_bt_metaproot(Relation rel, BlockNumber rootbknum)
{
Buffer metabuf;
Page metap;
BTPageOpaque metaopaque;
BTMetaPageData *metad;
metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
metap = BufferGetPage(metabuf);
metaopaque = (BTPageOpaque) PageGetSpecialPointer(metap);
Assert(metaopaque->btpo_flags & BTP_META);
metad = BTPageGetMeta(metap);
metad->btm_root = rootbknum;
_bt_wrtbuf(rel, metabuf);
}
/*
* _bt_getstackbuf() -- Walk back up the tree one step, and find the item
* we last looked at in the parent.
*
* This is possible because we save a bit image of the last item
* we looked at in the parent, and the update algorithm guarantees
* that if items above us in the tree move, they only move right.
*/
Buffer
_bt_getstackbuf(Relation rel, BTStack stack, int access)
{
Buffer buf;
BlockNumber blkno;
OffsetNumber start, offnum, maxoff;
OffsetNumber i;
Page page;
ItemId itemid;
BTItem item;
BTPageOpaque opaque;
blkno = stack->bts_blkno;
buf = _bt_getbuf(rel, blkno, access);
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
maxoff = PageGetMaxOffsetNumber(page);
if (maxoff >= stack->bts_offset) {
itemid = PageGetItemId(page, stack->bts_offset);
item = (BTItem) PageGetItem(page, itemid);
/* if the item is where we left it, we're done */
if (item->bti_oid == stack->bts_btitem->bti_oid)
return (buf);
/* if the item has just moved right on this page, we're done */
for (i = OffsetNumberNext(stack->bts_offset);
i <= maxoff;
i = OffsetNumberNext(i)) {
itemid = PageGetItemId(page, i);
item = (BTItem) PageGetItem(page, itemid);
/* if the item is where we left it, we're done */
if (item->bti_oid == stack->bts_btitem->bti_oid)
return (buf);
}
}
/* by here, the item we're looking for moved right at least one page */
for (;;) {
blkno = opaque->btpo_next;
if (P_RIGHTMOST(opaque))
elog(FATAL, "my bits moved right off the end of the world!");
_bt_relbuf(rel, buf, access);
buf = _bt_getbuf(rel, blkno, access);
page = BufferGetPage(buf);
maxoff = PageGetMaxOffsetNumber(page);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
/* if we have a right sibling, step over the high key */
start = P_RIGHTMOST(opaque) ? P_HIKEY : P_FIRSTKEY;
/* see if it's on this page */
for (offnum = start;
offnum <= maxoff;
offnum = OffsetNumberNext(offnum)) {
itemid = PageGetItemId(page, offnum);
item = (BTItem) PageGetItem(page, itemid);
if (item->bti_oid == stack->bts_btitem->bti_oid)
return (buf);
}
}
}
void
_bt_setpagelock(Relation rel, BlockNumber blkno, int access)
{
ItemPointerData iptr;
if (USELOCKING) {
ItemPointerSet(&iptr, blkno, P_HIKEY);
if (access == BT_WRITE)
RelationSetSingleWLockPage(rel, &iptr);
else
RelationSetSingleRLockPage(rel, &iptr);
}
}
void
_bt_unsetpagelock(Relation rel, BlockNumber blkno, int access)
{
ItemPointerData iptr;
if (USELOCKING) {
ItemPointerSet(&iptr, blkno, P_HIKEY);
if (access == BT_WRITE)
RelationUnsetSingleWLockPage(rel, &iptr);
else
RelationUnsetSingleRLockPage(rel, &iptr);
}
}
void
_bt_pagedel(Relation rel, ItemPointer tid)
{
Buffer buf;
Page page;
BlockNumber blkno;
OffsetNumber offno;
blkno = ItemPointerGetBlockNumber(tid);
offno = ItemPointerGetOffsetNumber(tid);
buf = _bt_getbuf(rel, blkno, BT_WRITE);
page = BufferGetPage(buf);
PageIndexTupleDelete(page, offno);
/* write the buffer and release the lock */
_bt_wrtbuf(rel, buf);
}
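Taken together, the buffer routines above follow a strict pairing: each
_bt_getbuf() is matched by exactly one of _bt_relbuf() (done with the
page, no changes) or _bt_wrtbuf() (page dirtied; write, unlock, unpin),
with _bt_wrtnorelbuf() for intermediate writes that keep the lock. A
minimal sketch of the write path under those rules (touch_page is a
hypothetical helper, not part of this commit):

	static void
	touch_page(Relation rel, BlockNumber blkno)
	{
		Buffer buf;
		Page page;

		buf = _bt_getbuf(rel, blkno, BT_WRITE);	/* lock and pin */
		page = BufferGetPage(buf);
		/* ... modify the page here ... */
		_bt_wrtbuf(rel, buf);			/* write, unlock, unpin */
	}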

@@ -0,0 +1,516 @@
/*-------------------------------------------------------------------------
*
* btree.c--
* Implementation of Lehman and Yao's btree management algorithm for
* Postgres.
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
*
* NOTES
* This file contains only the public interface routines.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "storage/bufmgr.h"
#include "storage/bufpage.h"
#include "utils/elog.h"
#include "utils/palloc.h"
#include "utils/rel.h"
#include "utils/excid.h"
#include "access/heapam.h"
#include "access/genam.h"
#include "access/sdir.h"
#include "access/nbtree.h"
#include "access/funcindex.h"
#include "nodes/execnodes.h"
#include "nodes/plannodes.h"
#include "executor/executor.h"
#include "executor/tuptable.h"
#include "catalog/index.h"
bool BuildingBtree = false;
bool FastBuild = false; /* turn this on to make bulk builds work*/
/*
* btbuild() -- build a new btree index.
*
* We use a global variable to record the fact that we're creating
* a new index. This is used to avoid high-concurrency locking,
* since the index won't be visible until this transaction commits
* and since building is guaranteed to be single-threaded.
*/
void
btbuild(Relation heap,
Relation index,
int natts,
AttrNumber *attnum,
IndexStrategy istrat,
uint16 pcount,
Datum *params,
FuncIndexInfo *finfo,
PredInfo *predInfo)
{
HeapScanDesc hscan;
Buffer buffer;
HeapTuple htup;
IndexTuple itup;
TupleDesc htupdesc, itupdesc;
Datum *attdata;
bool *nulls;
InsertIndexResult res;
int nhtups, nitups;
int i;
BTItem btitem;
ExprContext *econtext;
TupleTable tupleTable;
TupleTableSlot *slot;
Oid hrelid, irelid;
Node *pred, *oldPred;
void *spool;
/* note that this is a new btree */
BuildingBtree = true;
pred = predInfo->pred;
oldPred = predInfo->oldPred;
/* initialize the btree index metadata page (if this is a new index) */
if (oldPred == NULL)
_bt_metapinit(index);
/* get tuple descriptors for heap and index relations */
htupdesc = RelationGetTupleDescriptor(heap);
itupdesc = RelationGetTupleDescriptor(index);
/* get space for data items that'll appear in the index tuple */
attdata = (Datum *) palloc(natts * sizeof(Datum));
nulls = (bool *) palloc(natts * sizeof(bool));
/*
* If this is a predicate (partial) index, we will need to evaluate the
* predicate using ExecQual, which requires the current tuple to be in a
* slot of a TupleTable. In addition, ExecQual must have an ExprContext
* referring to that slot. Here, we initialize dummy TupleTable and
* ExprContext objects for this purpose. --Nels, Feb '92
*/
#ifndef OMIT_PARTIAL_INDEX
if (pred != NULL || oldPred != NULL) {
tupleTable = ExecCreateTupleTable(1);
slot = ExecAllocTableSlot(tupleTable);
econtext = makeNode(ExprContext);
FillDummyExprContext(econtext, slot, htupdesc, InvalidBuffer);
}
#endif /* OMIT_PARTIAL_INDEX */
/* start a heap scan */
hscan = heap_beginscan(heap, 0, NowTimeQual, 0, (ScanKey) NULL);
htup = heap_getnext(hscan, 0, &buffer);
/* build the index */
nhtups = nitups = 0;
if (FastBuild) {
spool = _bt_spoolinit(index, 7);
res = (InsertIndexResult) NULL;
}
for (; HeapTupleIsValid(htup); htup = heap_getnext(hscan, 0, &buffer)) {
nhtups++;
/*
* If oldPred != NULL, this is an EXTEND INDEX command, so skip
* this tuple if it was already in the existing partial index
*/
if (oldPred != NULL) {
#ifndef OMIT_PARTIAL_INDEX
/*SetSlotContents(slot, htup);*/
slot->val = htup;
if (ExecQual((List*)oldPred, econtext) == true) {
nitups++;
continue;
}
#endif /* OMIT_PARTIAL_INDEX */
}
/* Skip this tuple if it doesn't satisfy the partial-index predicate */
if (pred != NULL) {
#ifndef OMIT_PARTIAL_INDEX
/* SetSlotContents(slot, htup); */
slot->val = htup;
if (ExecQual((List*)pred, econtext) == false)
continue;
#endif /* OMIT_PARTIAL_INDEX */
}
nitups++;
/*
* For the current heap tuple, extract all the attributes
* we use in this index, and note which are null.
*/
for (i = 1; i <= natts; i++) {
int attoff;
bool attnull;
/*
* Offsets are from the start of the tuple, and are
* zero-based; indices are one-based. The next call
* returns i - 1. That's data hiding for you.
*/
attoff = AttrNumberGetAttrOffset(i);
attdata[attoff] = GetIndexValue(htup,
htupdesc,
attoff,
attnum,
finfo,
&attnull,
buffer);
nulls[attoff] = (attnull ? 'n' : ' ');
}
/* form an index tuple and point it at the heap tuple */
itup = index_formtuple(itupdesc, attdata, nulls);
/*
* If the single index key is null, we don't insert it into
* the index. Btrees support scans on <, <=, =, >=, and >.
* Relational algebra says that A op B (where op is one of the
* operators above) returns null if either A or B is null. This
* means that no qualification used in an index scan could ever
* return true on a null attribute. It also means that indices
* can't be used by ISNULL or NOTNULL scans, but that's an
* artifact of the strategy map architecture chosen in 1986, not
* of the way nulls are handled here.
*/
if (itup->t_info & INDEX_NULL_MASK) {
pfree(itup);
continue;
}
itup->t_tid = htup->t_ctid;
btitem = _bt_formitem(itup);
/*
* if we are doing bottom-up btree build, we insert the index
* into a spool page for subsequent processing. otherwise, we
* insert into the btree.
*/
if (FastBuild) {
_bt_spool(index, btitem, spool);
} else {
res = _bt_doinsert(index, btitem);
}
pfree(btitem);
pfree(itup);
if (res) {
pfree(res);
}
}
/* okay, all heap tuples are indexed */
heap_endscan(hscan);
if (pred != NULL || oldPred != NULL) {
#ifndef OMIT_PARTIAL_INDEX
ExecDestroyTupleTable(tupleTable, true);
pfree(econtext);
#endif /* OMIT_PARTIAL_INDEX */
}
/*
* if we are doing bottom-up btree build, we now have a bunch of
* sorted runs in the spool pages. finish the build by (1)
* merging the runs, (2) inserting the sorted tuples into btree
* pages and (3) building the upper levels.
*/
if (FastBuild) {
_bt_spool(index, (BTItem) NULL, spool); /* flush spool */
_bt_leafbuild(index, spool);
_bt_spooldestroy(spool);
}
/*
* Since we just counted the tuples in the heap, we update its
* stats in pg_class to guarantee that the planner takes advantage
* of the index we just created. Finally, only update statistics
* during normal index definitions, not for indices on system catalogs
* created during bootstrap processing. We must close the relations
* before updating statistics to guarantee that the relcache entries
* are flushed when we increment the command counter in UpdateStats().
*/
if (IsNormalProcessingMode())
{
hrelid = heap->rd_id;
irelid = index->rd_id;
heap_close(heap);
index_close(index);
UpdateStats(hrelid, nhtups, true);
UpdateStats(irelid, nitups, false);
if (oldPred != NULL) {
if (nitups == nhtups) pred = NULL;
UpdateIndexPredicate(irelid, oldPred, pred);
}
}
/* be tidy */
pfree(nulls);
pfree(attdata);
/* all done */
BuildingBtree = false;
}
/*
* btinsert() -- insert an index tuple into a btree.
*
* Descend the tree recursively, find the appropriate location for our
* new tuple, put it there, set its unique OID as appropriate, and
* return an InsertIndexResult to the caller.
*/
InsertIndexResult
btinsert(Relation rel, IndexTuple itup)
{
BTItem btitem;
InsertIndexResult res;
if (itup->t_info & INDEX_NULL_MASK)
return ((InsertIndexResult) NULL);
btitem = _bt_formitem(itup);
res = _bt_doinsert(rel, btitem);
pfree(btitem);
return (res);
}
/*
* btgettuple() -- Get the next tuple in the scan.
*/
char *
btgettuple(IndexScanDesc scan, ScanDirection dir)
{
RetrieveIndexResult res;
/*
* If we've already initialized this scan, we can just advance it
* in the appropriate direction. If we haven't done so yet, we
* call a routine to get the first item in the scan.
*/
if (ItemPointerIsValid(&(scan->currentItemData)))
res = _bt_next(scan, dir);
else
res = _bt_first(scan, dir);
return ((char *) res);
}
/*
* btbeginscan() -- start a scan on a btree index
*/
char *
btbeginscan(Relation rel, bool fromEnd, uint16 keysz, ScanKey scankey)
{
IndexScanDesc scan;
StrategyNumber strat;
BTScanOpaque so;
/* first order the keys in the qualification */
if (keysz > 1)
_bt_orderkeys(rel, &keysz, scankey);
/* now get the scan */
scan = RelationGetIndexScan(rel, fromEnd, keysz, scankey);
so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData));
so->btso_curbuf = so->btso_mrkbuf = InvalidBuffer;
scan->opaque = so;
/* finally, be sure that the scan exploits the tree order */
scan->scanFromEnd = false;
scan->flags = 0x0;
if (keysz > 0) {
strat = _bt_getstrat(scan->relation, 1 /* XXX */,
scankey[0].sk_procedure);
if (strat == BTLessStrategyNumber
|| strat == BTLessEqualStrategyNumber)
scan->scanFromEnd = true;
} else {
scan->scanFromEnd = true;
}
/* register scan in case we change pages it's using */
_bt_regscan(scan);
return ((char *) scan);
}
/*
* btrescan() -- rescan an index relation
*/
void
btrescan(IndexScanDesc scan, bool fromEnd, ScanKey scankey)
{
ItemPointer iptr;
BTScanOpaque so;
so = (BTScanOpaque) scan->opaque;
/* we hold a read lock on the current page in the scan */
if (ItemPointerIsValid(iptr = &(scan->currentItemData))) {
_bt_relbuf(scan->relation, so->btso_curbuf, BT_READ);
so->btso_curbuf = InvalidBuffer;
ItemPointerSetInvalid(iptr);
}
/* and we hold a read lock on the last marked item in the scan */
if (ItemPointerIsValid(iptr = &(scan->currentMarkData))) {
_bt_relbuf(scan->relation, so->btso_mrkbuf, BT_READ);
so->btso_mrkbuf = InvalidBuffer;
ItemPointerSetInvalid(iptr);
}
/* reset the scan key */
if (scan->numberOfKeys > 0) {
memmove(scan->keyData,
scankey,
scan->numberOfKeys * sizeof(ScanKeyData));
}
}
void
btmovescan(IndexScanDesc scan, Datum v)
{
ItemPointer iptr;
BTScanOpaque so;
so = (BTScanOpaque) scan->opaque;
/* release any locks we still hold */
if (ItemPointerIsValid(iptr = &(scan->currentItemData))) {
_bt_relbuf(scan->relation, so->btso_curbuf, BT_READ);
so->btso_curbuf = InvalidBuffer;
ItemPointerSetInvalid(iptr);
}
scan->keyData[0].sk_argument = v;
}
/*
* btendscan() -- close down a scan
*/
void
btendscan(IndexScanDesc scan)
{
ItemPointer iptr;
BTScanOpaque so;
so = (BTScanOpaque) scan->opaque;
/* release any locks we still hold */
if (ItemPointerIsValid(iptr = &(scan->currentItemData))) {
if (BufferIsValid(so->btso_curbuf))
_bt_relbuf(scan->relation, so->btso_curbuf, BT_READ);
so->btso_curbuf = InvalidBuffer;
ItemPointerSetInvalid(iptr);
}
if (ItemPointerIsValid(iptr = &(scan->currentMarkData))) {
if (BufferIsValid(so->btso_mrkbuf))
_bt_relbuf(scan->relation, so->btso_mrkbuf, BT_READ);
so->btso_mrkbuf = InvalidBuffer;
ItemPointerSetInvalid(iptr);
}
/* don't need scan registered anymore */
_bt_dropscan(scan);
/* be tidy */
#ifdef PERFECT_MMGR
pfree (scan->opaque);
#endif /* PERFECT_MMGR */
}
/*
* btmarkpos() -- save current scan position
*/
void
btmarkpos(IndexScanDesc scan)
{
ItemPointer iptr;
BTScanOpaque so;
so = (BTScanOpaque) scan->opaque;
/* release lock on old marked data, if any */
if (ItemPointerIsValid(iptr = &(scan->currentMarkData))) {
_bt_relbuf(scan->relation, so->btso_mrkbuf, BT_READ);
so->btso_mrkbuf = InvalidBuffer;
ItemPointerSetInvalid(iptr);
}
/* bump lock on currentItemData and copy to currentMarkData */
if (ItemPointerIsValid(&(scan->currentItemData))) {
so->btso_mrkbuf = _bt_getbuf(scan->relation,
BufferGetBlockNumber(so->btso_curbuf),
BT_READ);
scan->currentMarkData = scan->currentItemData;
}
}
/*
* btrestrpos() -- restore scan to last saved position
*/
void
btrestrpos(IndexScanDesc scan)
{
ItemPointer iptr;
BTScanOpaque so;
so = (BTScanOpaque) scan->opaque;
/* release lock on current data, if any */
if (ItemPointerIsValid(iptr = &(scan->currentItemData))) {
_bt_relbuf(scan->relation, so->btso_curbuf, BT_READ);
so->btso_curbuf = InvalidBuffer;
ItemPointerSetInvalid(iptr);
}
/* bump lock on currentMarkData and copy to currentItemData */
if (ItemPointerIsValid(&(scan->currentMarkData))) {
so->btso_curbuf = _bt_getbuf(scan->relation,
BufferGetBlockNumber(so->btso_mrkbuf),
BT_READ);
scan->currentItemData = scan->currentMarkData;
}
}
/* stubs */
void
btdelete(Relation rel, ItemPointer tid)
{
/* adjust any active scans that will be affected by this deletion */
_bt_adjscans(rel, tid);
/* delete the data from the page */
_bt_pagedel(rel, tid);
}

@@ -0,0 +1,164 @@
/*-------------------------------------------------------------------------
*
* btscan.c--
* manage scans on btrees.
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/Attic/nbtscan.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
*
*
* NOTES
* Because we can be doing an index scan on a relation while we update
* it, we need to avoid missing data that moves around in the index.
* The routines and global variables in this file guarantee that all
* scans in the local address space stay correctly positioned. This
* is all we need to worry about, since write locking guarantees that
* no one else will be on the same page at the same time as we are.
*
* The scheme is to manage a list of active scans in the current backend.
* Whenever we add or remove records from an index, or whenever we
* split a leaf page, we check the list of active scans to see if any
* has been affected. A scan is affected only if it is on the same
* relation, and the same page, as the update.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "storage/bufmgr.h"
#include "storage/bufpage.h"
#include "utils/elog.h"
#include "utils/palloc.h"
#include "utils/rel.h"
#include "utils/excid.h"
#include "access/heapam.h"
#include "access/genam.h"
#include "access/sdir.h"
#include "access/nbtree.h"
typedef struct BTScanListData {
IndexScanDesc btsl_scan;
struct BTScanListData *btsl_next;
} BTScanListData;
typedef BTScanListData *BTScanList;
static BTScanList BTScans = (BTScanList) NULL;
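/*
 * An illustrative sketch (not part of the original source): BTScans is
 * a simple singly-linked list, and the routines below all walk it the
 * same way.  A hypothetical helper that counts the active scans would
 * look like this; the #ifdef keeps it out of the build.
 */
#ifdef NOT_USED
static int
_bt_nscans(void)
{
	BTScanList l;
	int nscans = 0;

	for (l = BTScans; l != (BTScanList) NULL; l = l->btsl_next)
		nscans++;
	return (nscans);
}
#endif /* NOT_USED */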
/*
* _bt_regscan() -- register a new scan.
*/
void
_bt_regscan(IndexScanDesc scan)
{
BTScanList new_el;
new_el = (BTScanList) palloc(sizeof(BTScanListData));
new_el->btsl_scan = scan;
new_el->btsl_next = BTScans;
BTScans = new_el;
}
/*
* _bt_dropscan() -- drop a scan from the scan list
*/
void
_bt_dropscan(IndexScanDesc scan)
{
BTScanList chk, last;
last = (BTScanList) NULL;
for (chk = BTScans;
chk != (BTScanList) NULL && chk->btsl_scan != scan;
chk = chk->btsl_next) {
last = chk;
}
if (chk == (BTScanList) NULL)
elog(WARN, "btree scan list trashed; can't find 0x%lx", (long) scan);
if (last == (BTScanList) NULL)
BTScans = chk->btsl_next;
else
last->btsl_next = chk->btsl_next;
#ifdef PERFECT_MEM
pfree (chk);
#endif /* PERFECT_MEM */
}
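/*
 * _bt_adjscans() -- adjust every active scan on the given relation to
 *		     compensate for the deletion of the item at "tid".
 */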
void
_bt_adjscans(Relation rel, ItemPointer tid)
{
BTScanList l;
Oid relid;
relid = rel->rd_id;
for (l = BTScans; l != (BTScanList) NULL; l = l->btsl_next) {
if (relid == l->btsl_scan->relation->rd_id)
_bt_scandel(l->btsl_scan, ItemPointerGetBlockNumber(tid),
ItemPointerGetOffsetNumber(tid));
}
}
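/*
 * _bt_scandel() -- if this scan's current or marked position is at or
 *		    beyond the deleted item, back it up one step so the
 *		    scan neither skips nor revisits entries.
 */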
void
_bt_scandel(IndexScanDesc scan, BlockNumber blkno, OffsetNumber offno)
{
ItemPointer current;
Buffer buf;
BTScanOpaque so;
if (!_bt_scantouched(scan, blkno, offno))
return;
so = (BTScanOpaque) scan->opaque;
buf = so->btso_curbuf;
current = &(scan->currentItemData);
if (ItemPointerIsValid(current)
&& ItemPointerGetBlockNumber(current) == blkno
&& ItemPointerGetOffsetNumber(current) >= offno) {
_bt_step(scan, &buf, BackwardScanDirection);
so->btso_curbuf = buf;
}
current = &(scan->currentMarkData);
if (ItemPointerIsValid(current)
&& ItemPointerGetBlockNumber(current) == blkno
&& ItemPointerGetOffsetNumber(current) >= offno) {
ItemPointerData tmp;
tmp = *current;
*current = scan->currentItemData;
scan->currentItemData = tmp;
_bt_step(scan, &buf, BackwardScanDirection);
so->btso_mrkbuf = buf;
tmp = *current;
*current = scan->currentItemData;
scan->currentItemData = tmp;
}
}
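/*
 * _bt_scantouched() -- is this scan's current or marked position on
 *			the affected page, at or beyond the affected
 *			offset?
 */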
bool
_bt_scantouched(IndexScanDesc scan, BlockNumber blkno, OffsetNumber offno)
{
ItemPointer current;
current = &(scan->currentItemData);
if (ItemPointerIsValid(current)
&& ItemPointerGetBlockNumber(current) == blkno
&& ItemPointerGetOffsetNumber(current) >= offno)
return (true);
current = &(scan->currentMarkData);
if (ItemPointerIsValid(current)
&& ItemPointerGetBlockNumber(current) == blkno
&& ItemPointerGetOffsetNumber(current) >= offno)
return (true);
return (false);
}

File diff suppressed because it is too large.

File diff suppressed because it is too large.


@@ -0,0 +1,134 @@
/*-------------------------------------------------------------------------
*
* btstrat.c--
 *	  Strategy map entries for the btree indexed access method
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/Attic/nbtstrat.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "storage/bufpage.h"
#include "utils/elog.h"
#include "utils/rel.h"
#include "utils/excid.h"
#include "access/genam.h"
#include "access/nbtree.h"
/*
* Note:
* StrategyNegate, StrategyCommute, and StrategyNegateCommute
* assume <, <=, ==, >=, > ordering.
*/
static StrategyNumber BTNegate[5] = {
BTGreaterEqualStrategyNumber,
BTGreaterStrategyNumber,
InvalidStrategy,
BTLessStrategyNumber,
BTLessEqualStrategyNumber
};
static StrategyNumber BTCommute[5] = {
BTGreaterStrategyNumber,
BTGreaterEqualStrategyNumber,
InvalidStrategy,
BTLessEqualStrategyNumber,
BTLessStrategyNumber
};
static StrategyNumber BTNegateCommute[5] = {
BTLessEqualStrategyNumber,
BTLessStrategyNumber,
InvalidStrategy,
BTGreaterStrategyNumber,
BTGreaterEqualStrategyNumber
};
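/*
 * For example (illustrative, not part of the original source):
 * negating "a < b" yields "a >= b", so
 * BTNegate[BTLessStrategyNumber - 1] is BTGreaterEqualStrategyNumber,
 * and commuting "a < b" into "b > a" makes
 * BTCommute[BTLessStrategyNumber - 1] BTGreaterStrategyNumber.  The
 * middle (equality) slot of BTNegate is InvalidStrategy because the
 * negation of "=" is not a btree strategy.
 */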
static uint16 BTLessTermData[] = { /* XXX type clash */
2,
BTLessStrategyNumber,
SK_NEGATE,
BTLessStrategyNumber,
SK_NEGATE | SK_COMMUTE
};
static uint16 BTLessEqualTermData[] = { /* XXX type clash */
2,
BTLessEqualStrategyNumber,
0x0,
BTLessEqualStrategyNumber,
SK_COMMUTE
};
static uint16 BTGreaterEqualTermData[] = { /* XXX type clash */
2,
BTGreaterEqualStrategyNumber,
0x0,
BTGreaterEqualStrategyNumber,
SK_COMMUTE
};
static uint16 BTGreaterTermData[] = { /* XXX type clash */
2,
BTGreaterStrategyNumber,
SK_NEGATE,
BTGreaterStrategyNumber,
SK_NEGATE | SK_COMMUTE
};
static StrategyTerm BTEqualExpressionData[] = {
(StrategyTerm)BTLessTermData, /* XXX */
(StrategyTerm)BTLessEqualTermData, /* XXX */
(StrategyTerm)BTGreaterEqualTermData, /* XXX */
(StrategyTerm)BTGreaterTermData, /* XXX */
NULL
};
static StrategyEvaluationData BTEvaluationData = {
/* XXX static for simplicity */
BTMaxStrategyNumber,
(StrategyTransformMap)BTNegate, /* XXX */
(StrategyTransformMap)BTCommute, /* XXX */
(StrategyTransformMap)BTNegateCommute, /* XXX */
{ NULL, NULL, (StrategyExpression)BTEqualExpressionData, NULL, NULL,
NULL,NULL,NULL,NULL,NULL,NULL,NULL}
};
/* ----------------------------------------------------------------
* RelationGetBTStrategy
* ----------------------------------------------------------------
*/
StrategyNumber
_bt_getstrat(Relation rel,
AttrNumber attno,
RegProcedure proc)
{
StrategyNumber strat;
strat = RelationGetStrategy(rel, attno, &BTEvaluationData, proc);
Assert(StrategyNumberIsValid(strat));
return (strat);
}
bool
_bt_invokestrat(Relation rel,
AttrNumber attno,
StrategyNumber strat,
Datum left,
Datum right)
{
return (RelationInvokeStrategy(rel, &BTEvaluationData, attno, strat,
left, right));
}


@@ -0,0 +1,239 @@
/*-------------------------------------------------------------------------
*
* btutils.c--
* Utility code for Postgres btree implementation.
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtutils.c,v 1.1.1.1 1996/07/09 06:21:12 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#include <stdio.h>
#include "postgres.h"
#include "storage/bufmgr.h"
#include "storage/bufpage.h"
#include "fmgr.h"
#include "utils/elog.h"
#include "utils/palloc.h"
#include "utils/rel.h"
#include "utils/excid.h"
#include "utils/datum.h"
#include "access/heapam.h"
#include "access/genam.h"
#include "access/iqual.h"
#include "access/nbtree.h"
ScanKey
_bt_mkscankey(Relation rel, IndexTuple itup)
{
ScanKey skey;
TupleDesc itupdesc;
int natts;
int i;
Datum arg;
RegProcedure proc;
bool null;
natts = rel->rd_rel->relnatts;
itupdesc = RelationGetTupleDescriptor(rel);
skey = (ScanKey) palloc(natts * sizeof(ScanKeyData));
for (i = 0; i < natts; i++) {
arg = index_getattr(itup, i + 1, itupdesc, &null);
proc = index_getprocid(rel, i + 1, BTORDER_PROC);
ScanKeyEntryInitialize(&skey[i],
0x0, (AttrNumber) (i + 1), proc, arg);
}
return (skey);
}
void
_bt_freeskey(ScanKey skey)
{
pfree(skey);
}
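/*
 * An illustrative sketch (not part of the original source): a scan key
 * built by _bt_mkscankey() is typically used to descend the tree and
 * must be freed by the caller.  This assumes the Postgres95 signature
 * of _bt_search(); the #ifdef keeps the sketch out of the build.
 */
#ifdef NOT_USED
static void
_bt_skey_sketch(Relation rel, IndexTuple itup)
{
	ScanKey skey;
	BTStack stack;
	Buffer buf;

	skey = _bt_mkscankey(rel, itup);
	stack = _bt_search(rel, rel->rd_rel->relnatts, skey, &buf);

	/* ... examine the leaf page in "buf", then let everything go ... */

	_bt_relbuf(rel, buf, BT_READ);
	_bt_freestack(stack);
	_bt_freeskey(skey);
}
#endif /* NOT_USED */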
void
_bt_freestack(BTStack stack)
{
BTStack ostack;
while (stack != (BTStack) NULL) {
ostack = stack;
stack = stack->bts_parent;
pfree(ostack->bts_btitem);
pfree(ostack);
}
}
/*
* _bt_orderkeys() -- Put keys in a sensible order for conjunctive quals.
*
 *	This routine reorders the keys to match the ordering imposed by
 *	the index, collapsing redundant clauses as it goes.  It only
 *	needs to be called if more than one qual clause uses this index.
*/
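/*
 * For example (illustrative, not part of the original source): given
 * the conjunctive qual "x < 5 AND x <= 3 AND x > 1", the two
 * upper-bound clauses are collapsed into one, the lower bound is kept,
 * and the surviving keys are emitted in the strategy order the index
 * expects.  Had an "=" clause been present, it alone would survive.
 */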
void
_bt_orderkeys(Relation relation, uint16 *numberOfKeys, ScanKey key)
{
ScanKey xform;
ScanKeyData *cur;
StrategyMap map;
int nbytes;
long test;
int i, j;
int init[BTMaxStrategyNumber+1];
/* haven't looked at any strategies yet */
for (i = 0; i <= BTMaxStrategyNumber; i++)
init[i] = 0;
/* get space for the modified array of keys */
nbytes = BTMaxStrategyNumber * sizeof(ScanKeyData);
xform = (ScanKey) palloc(nbytes);
memset(xform, 0, nbytes);
/* get the strategy map for this index/attribute pair */
/*
* XXX
* When we support multiple keys in a single index, this is what
* we'll want to do. At present, the planner is hosed, so we
* hard-wire the attribute number below. Postgres only does single-
* key indices...
* map = IndexStrategyGetStrategyMap(RelationGetIndexStrategy(relation),
* BTMaxStrategyNumber,
* key->data[0].attributeNumber);
*/
map = IndexStrategyGetStrategyMap(RelationGetIndexStrategy(relation),
BTMaxStrategyNumber,
1 /* XXX */ );
/* check each key passed in */
for (i = *numberOfKeys; --i >= 0; ) {
cur = &key[i];
for (j = BTMaxStrategyNumber; --j >= 0; ) {
if (cur->sk_procedure == map->entry[j].sk_procedure)
break;
}
/* have we seen one of these before? */
if (init[j]) {
/* yup, use the appropriate value */
test =
(long) FMGR_PTR2(cur->sk_func, cur->sk_procedure,
cur->sk_argument, xform[j].sk_argument);
if (test)
xform[j].sk_argument = cur->sk_argument;
} else {
/* nope, use this value */
memmove(&xform[j], cur, sizeof(*cur));
init[j] = 1;
}
}
/* if = has been specified, no other key will be used */
if (init[BTEqualStrategyNumber - 1]) {
init[BTLessStrategyNumber - 1] = 0;
init[BTLessEqualStrategyNumber - 1] = 0;
init[BTGreaterEqualStrategyNumber - 1] = 0;
init[BTGreaterStrategyNumber - 1] = 0;
}
/* only one of <, <= */
if (init[BTLessStrategyNumber - 1]
&& init[BTLessEqualStrategyNumber - 1]) {
ScanKeyData *lt, *le;
lt = &xform[BTLessStrategyNumber - 1];
le = &xform[BTLessEqualStrategyNumber - 1];
/*
* DO NOT use the cached function stuff here -- this is key
* ordering, happens only when the user expresses a hokey
* qualification, and gets executed only once, anyway. The
* transform maps are hard-coded, and can't be initialized
* in the correct way.
*/
test = (long) fmgr(le->sk_procedure, le->sk_argument, lt->sk_argument);
if (test)
init[BTLessEqualStrategyNumber - 1] = 0;
else
init[BTLessStrategyNumber - 1] = 0;
}
/* only one of >, >= */
if (init[BTGreaterStrategyNumber - 1]
&& init[BTGreaterEqualStrategyNumber - 1]) {
ScanKeyData *gt, *ge;
gt = &xform[BTGreaterStrategyNumber - 1];
ge = &xform[BTGreaterEqualStrategyNumber - 1];
/* see note above on function cache */
	test = (long) fmgr(ge->sk_procedure, ge->sk_argument, gt->sk_argument);
if (test)
init[BTGreaterStrategyNumber - 1] = 0;
else
init[BTGreaterEqualStrategyNumber - 1] = 0;
}
/* okay, reorder and count */
j = 0;
for (i = BTMaxStrategyNumber; --i >= 0; )
if (init[i])
key[j++] = xform[i];
*numberOfKeys = j;
pfree(xform);
}
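/*
 * _bt_checkqual() -- does the given index tuple satisfy the scan's
 *		      key qualification?
 */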
bool
_bt_checkqual(IndexScanDesc scan, IndexTuple itup)
{
if (scan->numberOfKeys > 0)
return (index_keytest(itup, RelationGetTupleDescriptor(scan->relation),
scan->numberOfKeys, scan->keyData));
else
return (true);
}
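/*
 * _bt_formitem() -- wrap an index tuple in a BTItem, adding the
 *		     unique OID that distinguishes duplicate keys.
 */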
BTItem
_bt_formitem(IndexTuple itup)
{
int nbytes_btitem;
BTItem btitem;
Size tuplen;
extern Oid newoid();
/* disallow nulls in btree keys */
if (itup->t_info & INDEX_NULL_MASK)
elog(WARN, "btree indices cannot include null keys");
	/* make a copy of the index tuple with room for the unique OID */
tuplen = IndexTupleSize(itup);
nbytes_btitem = tuplen +
(sizeof(BTItemData) - sizeof(IndexTupleData));
btitem = (BTItem) palloc(nbytes_btitem);
memmove((char *) &(btitem->bti_itup), (char *) itup, tuplen);
btitem->bti_oid = newoid();
return (btitem);
}