Postgres95 1.01 Distribution - Virgin Sources

2025-11-13 16:22:44 +03:00 · 1996-07-09 06:22:35 +00:00
commit d31084e9d1
868 changed files with 242656 additions and 0 deletions
--- a/src/backend/access/hash/Makefile.inc
+++ b/src/backend/access/hash/Makefile.inc
@@ -0,0 +1,18 @@
+#-------------------------------------------------------------------------
+#
+# Makefile.inc--
+#    Makefile for access/hash (hash access method)
+#
+# Copyright (c) 1994, Regents of the University of California
+#
+#
+# IDENTIFICATION
+#    $Header: /cvsroot/pgsql/src/backend/access/hash/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:10 scrappy Exp $
+#
+#-------------------------------------------------------------------------
+
+SUBSRCS+= hash.c hashfunc.c hashinsert.c hashovfl.c hashpage.c hashscan.c \
+	hashsearch.c hashstrat.c hashutil.c
+
+
+
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -0,0 +1,467 @@
+/*-------------------------------------------------------------------------
+ *
+ * hash.c--
+ *    Implementation of Margo Seltzer's Hashing package for postgres. 
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *    $Header: /cvsroot/pgsql/src/backend/access/hash/hash.c,v 1.1.1.1 1996/07/09 06:21:10 scrappy Exp $
+ *
+ * NOTES
+ *    This file contains only the public interface routines.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+#include "utils/elog.h"
+#include "utils/palloc.h"
+#include "utils/rel.h"
+#include "utils/excid.h"
+#include "access/heapam.h"
+#include "access/genam.h"
+#include "access/sdir.h"
+#include "access/hash.h"
+#include "access/funcindex.h"
+#include "nodes/execnodes.h"
+#include "nodes/plannodes.h"
+#include "executor/executor.h"
+#include "executor/tuptable.h"
+#include "catalog/index.h"
+
+
+bool	BuildingHash = false;
+
+/*
+ *  hashbuild() -- build a new hash index.
+ *
+ *	We use a global variable to record the fact that we're creating
+ *	a new index.  This is used to avoid high-concurrency locking,
+ *	since the index won't be visible until this transaction commits
+ *	and since building is guaranteed to be single-threaded.
+ */
+void
+hashbuild(Relation heap,
+	  Relation index,
+	  int natts,
+	  AttrNumber *attnum,
+	  IndexStrategy istrat,
+	  uint16 pcount,
+	  Datum *params,
+	  FuncIndexInfo *finfo,
+	  PredInfo *predInfo)
+{
+    HeapScanDesc hscan;
+    Buffer buffer;
+    HeapTuple htup;
+    IndexTuple itup;
+    TupleDesc htupdesc, itupdesc;
+    Datum *attdata;
+    bool *nulls;
+    InsertIndexResult res;
+    int nhtups, nitups;
+    int i;
+    HashItem hitem;
+    ExprContext *econtext;
+    TupleTable tupleTable;
+    TupleTableSlot *slot;
+    Oid hrelid, irelid;
+    Node *pred, *oldPred;
+    
+    /* note that this is a new btree */
+    BuildingHash = true;
+    
+    pred = predInfo->pred;
+    oldPred = predInfo->oldPred;
+    
+    /*  initialize the hash index metadata page (if this is a new index) */
+    if (oldPred == NULL)
+	_hash_metapinit(index);
+    
+    /* get tuple descriptors for heap and index relations */
+    htupdesc = RelationGetTupleDescriptor(heap);
+    itupdesc = RelationGetTupleDescriptor(index);
+    
+    /* get space for data items that'll appear in the index tuple */
+    attdata = (Datum *) palloc(natts * sizeof(Datum));
+    nulls = (bool *) palloc(natts * sizeof(bool));
+    
+    /*
+     * If this is a predicate (partial) index, we will need to evaluate the
+     * predicate using ExecQual, which requires the current tuple to be in a
+     * slot of a TupleTable.  In addition, ExecQual must have an ExprContext
+     * referring to that slot.  Here, we initialize dummy TupleTable and
+     * ExprContext objects for this purpose. --Nels, Feb '92
+     */
+#ifndef OMIT_PARTIAL_INDEX
+    if (pred != NULL || oldPred != NULL) {
+	tupleTable = ExecCreateTupleTable(1);
+	slot = ExecAllocTableSlot(tupleTable);
+	econtext = makeNode(ExprContext);
+	FillDummyExprContext(econtext, slot, htupdesc, buffer);
+    }
+#endif /* OMIT_PARTIAL_INDEX */
+    
+    /* start a heap scan */
+    hscan = heap_beginscan(heap, 0, NowTimeQual, 0, (ScanKey) NULL);
+    htup = heap_getnext(hscan, 0, &buffer);
+    
+    /* build the index */
+    nhtups = nitups = 0;
+    
+    for (; HeapTupleIsValid(htup); htup = heap_getnext(hscan, 0, &buffer)) {
+	
+	nhtups++;
+	
+	/*
+	 * If oldPred != NULL, this is an EXTEND INDEX command, so skip
+	 * this tuple if it was already in the existing partial index
+	 */
+	if (oldPred != NULL) {
+	    /*SetSlotContents(slot, htup); */
+#ifndef OMIT_PARTIAL_INDEX
+	    slot->val = htup;
+	    if (ExecQual((List*)oldPred, econtext) == true) {
+		nitups++;
+		continue;
+	    }
+#endif /* OMIT_PARTIAL_INDEX */    
+	}
+	
+	/* Skip this tuple if it doesn't satisfy the partial-index predicate */
+	if (pred != NULL) {
+#ifndef OMIT_PARTIAL_INDEX
+	    /*SetSlotContents(slot, htup); */
+	    slot->val = htup;
+	    if (ExecQual((List*)pred, econtext) == false)
+		continue;
+#endif /* OMIT_PARTIAL_INDEX */    	
+}
+	
+	nitups++;
+	
+	/*
+	 *  For the current heap tuple, extract all the attributes
+	 *  we use in this index, and note which are null.
+	 */
+	for (i = 1; i <= natts; i++) {
+	    int attoff;
+	    bool attnull;
+	    
+	    /*
+	     *  Offsets are from the start of the tuple, and are
+	     *  zero-based; indices are one-based.  The next call
+	     *  returns i - 1.  That's data hiding for you.
+	     */
+	    
+	    /* attoff = i - 1 */
+	    attoff = AttrNumberGetAttrOffset(i);
+	    
+	    /* below, attdata[attoff] set to equal some datum &
+	     * attnull is changed to indicate whether or not the attribute 
+	     * is null for this tuple
+	     */
+	    attdata[attoff] = GetIndexValue(htup, 
+					    htupdesc,
+					    attoff, 
+					    attnum, 
+					    finfo, 
+					    &attnull,
+					    buffer);
+	    nulls[attoff] = (attnull ? 'n' : ' ');
+	}
+	
+	/* form an index tuple and point it at the heap tuple */
+	itup = index_formtuple(itupdesc, attdata, nulls);
+	
+	/*
+	 *  If the single index key is null, we don't insert it into
+	 *  the index.  Hash tables support scans on '='.
+	 *  Relational algebra says that A = B
+	 *  returns null if either A or B is null.  This
+	 *  means that no qualification used in an index scan could ever
+	 *  return true on a null attribute.  It also means that indices
+	 *  can't be used by ISNULL or NOTNULL scans, but that's an
+	 *  artifact of the strategy map architecture chosen in 1986, not
+	 *  of the way nulls are handled here.
+	 */
+	
+	if (itup->t_info & INDEX_NULL_MASK) {
+	    pfree(itup);
+	    continue;
+	}
+	
+	itup->t_tid = htup->t_ctid;
+	hitem = _hash_formitem(itup);
+	res = _hash_doinsert(index, hitem);
+	pfree(hitem);
+	pfree(itup);
+	pfree(res);
+    }
+    
+    /* okay, all heap tuples are indexed */
+    heap_endscan(hscan);
+    
+    if (pred != NULL || oldPred != NULL) {
+#ifndef OMIT_PARTIAL_INDEX
+	ExecDestroyTupleTable(tupleTable, true);
+	pfree(econtext);
+#endif /* OMIT_PARTIAL_INDEX */    	
+    }
+    
+    /*
+     *  Since we just counted the tuples in the heap, we update its
+     *  stats in pg_class to guarantee that the planner takes advantage
+     *  of the index we just created. Finally, only update statistics
+     *  during normal index definitions, not for indices on system catalogs
+     *  created during bootstrap processing.  We must close the relations
+     *  before updatings statistics to guarantee that the relcache entries
+     *  are flushed when we increment the command counter in UpdateStats().
+     */
+    if (IsNormalProcessingMode())
+	{
+	    hrelid = heap->rd_id;
+	    irelid = index->rd_id;
+	    heap_close(heap);
+	    index_close(index);
+	    UpdateStats(hrelid, nhtups, true);
+	    UpdateStats(irelid, nitups, false);
+	    if (oldPred != NULL) {
+		if (nitups == nhtups) pred = NULL;
+		UpdateIndexPredicate(irelid, oldPred, pred);
+	    }  
+	}
+    
+    /* be tidy */
+    pfree(nulls);
+    pfree(attdata);
+    
+    /* all done */
+    BuildingHash = false;
+}
+
+/*
+ *  hashinsert() -- insert an index tuple into a hash table. 
+ *
+ *  Hash on the index tuple's key, find the appropriate location 
+ *  for the new tuple, put it there, and return an InsertIndexResult
+ *  to the caller. 
+ */
+InsertIndexResult
+hashinsert(Relation rel, IndexTuple itup)
+{
+    HashItem hitem;
+    InsertIndexResult res;
+    
+    if (itup->t_info & INDEX_NULL_MASK)
+	return ((InsertIndexResult) NULL);
+    
+    hitem = _hash_formitem(itup);
+    
+    res = _hash_doinsert(rel, hitem);
+    
+    pfree(hitem);
+    
+    return (res);
+}
+
+
+/*
+ *  hashgettuple() -- Get the next tuple in the scan.
+ */
+char *
+hashgettuple(IndexScanDesc scan, ScanDirection dir)
+{
+    RetrieveIndexResult res;
+    
+    /*
+     *  If we've already initialized this scan, we can just advance it
+     *  in the appropriate direction.  If we haven't done so yet, we
+     *  call a routine to get the first item in the scan.
+     */
+    
+    if (ItemPointerIsValid(&(scan->currentItemData)))
+	res = _hash_next(scan, dir);
+    else
+	res = _hash_first(scan, dir);
+    
+    return ((char *) res);
+}
+
+
+/*
+ *  hashbeginscan() -- start a scan on a hash index
+ */
+char *
+hashbeginscan(Relation rel,
+	      bool fromEnd,
+	      uint16 keysz,
+	      ScanKey scankey)
+{
+    IndexScanDesc scan;
+    HashScanOpaque so;
+    
+    scan = RelationGetIndexScan(rel, fromEnd, keysz, scankey);
+    so = (HashScanOpaque) palloc(sizeof(HashScanOpaqueData)); 
+    so->hashso_curbuf = so->hashso_mrkbuf = InvalidBuffer;
+    scan->opaque = so; 
+    scan->flags = 0x0;
+    
+    /* register scan in case we change pages it's using */
+    _hash_regscan(scan);
+    
+    return ((char *) scan);
+}
+
+/*
+ *  hashrescan() -- rescan an index relation
+ */
+void
+hashrescan(IndexScanDesc scan, bool fromEnd, ScanKey scankey)
+{
+    ItemPointer iptr;
+    HashScanOpaque so;
+    
+    so = (HashScanOpaque) scan->opaque;
+    
+    /* we hold a read lock on the current page in the scan */
+    if (ItemPointerIsValid(iptr = &(scan->currentItemData))) {
+	_hash_relbuf(scan->relation, so->hashso_curbuf, HASH_READ);
+	so->hashso_curbuf = InvalidBuffer;
+	ItemPointerSetInvalid(iptr);
+    }
+    if (ItemPointerIsValid(iptr = &(scan->currentMarkData))) {
+	_hash_relbuf(scan->relation, so->hashso_mrkbuf, HASH_READ);
+	so->hashso_mrkbuf = InvalidBuffer;
+	ItemPointerSetInvalid(iptr);
+    }
+    
+    /* reset the scan key */
+    if (scan->numberOfKeys > 0) {
+	memmove(scan->keyData,
+		scankey,
+		scan->numberOfKeys * sizeof(ScanKeyData));
+    }
+}
+
+/*
+ *  hashendscan() -- close down a scan
+ */
+void
+hashendscan(IndexScanDesc scan)
+{
+    
+    ItemPointer iptr;
+    HashScanOpaque so;
+    
+    so = (HashScanOpaque) scan->opaque;
+    
+    /* release any locks we still hold */
+    if (ItemPointerIsValid(iptr = &(scan->currentItemData))) {
+	_hash_relbuf(scan->relation, so->hashso_curbuf, HASH_READ);
+	so->hashso_curbuf = InvalidBuffer;
+	ItemPointerSetInvalid(iptr);
+    }
+    
+    if (ItemPointerIsValid(iptr = &(scan->currentMarkData))) {
+	if (BufferIsValid(so->hashso_mrkbuf))
+	    _hash_relbuf(scan->relation, so->hashso_mrkbuf, HASH_READ);
+	so->hashso_mrkbuf = InvalidBuffer;
+	ItemPointerSetInvalid(iptr);
+    }
+    
+    /* don't need scan registered anymore */
+    _hash_dropscan(scan);
+    
+    /* be tidy */
+#ifdef PERFECT_MMGR
+    pfree (scan->opaque);
+#endif /* PERFECT_MMGR */
+}
+
+/*
+ *  hashmarkpos() -- save current scan position
+ *
+ */
+void
+hashmarkpos(IndexScanDesc scan)
+{
+    ItemPointer iptr;
+    HashScanOpaque so;
+    
+    /*  see if we ever call this code. if we do, then so_mrkbuf a
+     *  useful element in the scan->opaque structure. if this procedure
+     *  is never called, so_mrkbuf should be removed from the scan->opaque
+     *  structure. 
+     */
+    elog(NOTICE, "Hashmarkpos() called.");
+    
+    so = (HashScanOpaque) scan->opaque;
+    
+    /* release lock on old marked data, if any */
+    if (ItemPointerIsValid(iptr = &(scan->currentMarkData))) {
+	_hash_relbuf(scan->relation, so->hashso_mrkbuf, HASH_READ);
+	so->hashso_mrkbuf = InvalidBuffer;
+	ItemPointerSetInvalid(iptr);
+    }
+    
+    /* bump lock on currentItemData and copy to currentMarkData */
+    if (ItemPointerIsValid(&(scan->currentItemData))) {
+	so->hashso_mrkbuf = _hash_getbuf(scan->relation,
+					 BufferGetBlockNumber(so->hashso_curbuf),
+					 HASH_READ);
+	scan->currentMarkData = scan->currentItemData;
+    }
+}
+
+/*
+ *  hashrestrpos() -- restore scan to last saved position
+ */
+void
+hashrestrpos(IndexScanDesc scan)
+{
+    ItemPointer iptr;
+    HashScanOpaque so;
+    
+    /*  see if we ever call this code. if we do, then so_mrkbuf a
+     *  useful element in the scan->opaque structure. if this procedure
+     *  is never called, so_mrkbuf should be removed from the scan->opaque
+     *  structure. 
+     */
+    elog(NOTICE, "Hashrestrpos() called.");
+    
+    so = (HashScanOpaque) scan->opaque;
+    
+    /* release lock on current data, if any */
+    if (ItemPointerIsValid(iptr = &(scan->currentItemData))) {
+	_hash_relbuf(scan->relation, so->hashso_curbuf, HASH_READ);
+	so->hashso_curbuf = InvalidBuffer;
+	ItemPointerSetInvalid(iptr);
+    }
+    
+    /* bump lock on currentMarkData and copy to currentItemData */
+    if (ItemPointerIsValid(&(scan->currentMarkData))) {
+	so->hashso_curbuf =
+	    _hash_getbuf(scan->relation,
+			 BufferGetBlockNumber(so->hashso_mrkbuf),
+			 HASH_READ);
+	
+	scan->currentItemData = scan->currentMarkData;
+    }
+}
+
+/* stubs */
+void
+hashdelete(Relation rel, ItemPointer tid)
+{
+    /* adjust any active scans that will be affected by this deletion */
+    _hash_adjscans(rel, tid);
+    
+    /* delete the data from the page */
+    _hash_pagedel(rel, tid);
+}
+
--- a/src/backend/access/hash/hashfunc.c
+++ b/src/backend/access/hash/hashfunc.c
@@ -0,0 +1,276 @@
+/*-------------------------------------------------------------------------
+ *
+ * hashfunc.c--
+ *    Comparison functions for hash access method.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *    $Header: /cvsroot/pgsql/src/backend/access/hash/hashfunc.c,v 1.1.1.1 1996/07/09 06:21:10 scrappy Exp $
+ *
+ * NOTES
+ *    These functions are stored in pg_amproc.  For each operator class
+ *    defined on hash tables, they compute the hash value of the argument.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+#include "utils/nabstime.h"
+
+uint32 hashint2(int16 key)
+{
+    return ((uint32) ~key);
+}
+
+uint32 hashint4(uint32 key)
+{
+    return (~key);
+}
+
+/* Hash function from Chris Torek. */
+uint32 hashfloat4(float32 keyp)
+{
+    int len;
+    int loop;
+    uint32 h;
+    char *kp = (char *) keyp;
+
+    len = sizeof(float32data);
+
+#define HASH4a   h = (h << 5) - h + *kp++;
+#define HASH4b   h = (h << 5) + h + *kp++;
+#define HASH4 HASH4b
+
+
+    h = 0;
+    if (len > 0) {
+	loop = (len + 8 - 1) >> 3;
+	
+	switch (len & (8 - 1)) {
+	case 0:
+	    do {	/* All fall throughs */
+		HASH4;
+	    case 7:
+		HASH4;
+	    case 6:
+		HASH4;
+	    case 5:
+		HASH4;
+	    case 4:
+		HASH4;
+	    case 3:
+		HASH4;
+	    case 2:
+		HASH4;
+	    case 1:
+		HASH4;
+	    } while (--loop);
+	}
+    }
+    return (h);
+}	
+
+
+uint32 hashfloat8(float64 keyp)
+{
+    int len;
+    int loop;
+    uint32 h;
+    char *kp = (char *) keyp;
+
+    len = sizeof(float64data);
+
+#define HASH4a   h = (h << 5) - h + *kp++;
+#define HASH4b   h = (h << 5) + h + *kp++;
+#define HASH4 HASH4b
+
+
+    h = 0;
+    if (len > 0) {
+	loop = (len + 8 - 1) >> 3;
+	
+	switch (len & (8 - 1)) {
+	case 0:
+	    do {	/* All fall throughs */
+		HASH4;
+	    case 7:
+		HASH4;
+	    case 6:
+		HASH4;
+	    case 5:
+		HASH4;
+	    case 4:
+		HASH4;
+	    case 3:
+		HASH4;
+	    case 2:
+		HASH4;
+	    case 1:
+		HASH4;
+	    } while (--loop);
+	}
+    }
+    return (h);
+}	
+
+
+uint32 hashoid(Oid key)
+{
+    return ((uint32) ~key);
+}
+
+
+uint32 hashchar(char key)
+{
+    int len;
+    uint32 h;
+
+    len = sizeof(char);
+
+#define PRIME1		37
+#define PRIME2		1048583
+
+    h = 0;
+    /* Convert char to integer */
+    h = h * PRIME1 ^ (key - ' ');
+    h %= PRIME2;
+    
+    return (h);
+}
+
+uint32 hashchar2(uint16 intkey)
+{
+    uint32 h;
+    int len;
+    char *key = (char *) &intkey;
+ 
+    h = 0;
+    len = sizeof(uint16);
+    /* Convert string to integer */
+    while (len--)
+	h = h * PRIME1 ^ (*key++ - ' ');
+    h %= PRIME2;
+	
+    return (h);
+}
+
+uint32 hashchar4(uint32 intkey)
+{
+    uint32 h;
+    int len;
+    char *key = (char *) &intkey;
+ 
+    h = 0;
+    len = sizeof(uint32);
+    /* Convert string to integer */
+    while (len--)
+	h = h * PRIME1 ^ (*key++ - ' ');
+    h %= PRIME2;
+	
+    return (h);
+}
+
+uint32 hashchar8(char *key)
+{
+    uint32 h;
+    int len;
+ 
+    h = 0;
+    len = sizeof(char8);
+    /* Convert string to integer */
+    while (len--)
+	h = h * PRIME1 ^ (*key++ - ' ');
+    h %= PRIME2;
+	
+    return (h);
+}
+
+uint32 hashname(NameData *n)
+{
+    uint32 h;
+    int len;
+    char *key;
+
+    key = n->data;
+ 
+    h = 0;
+    len = NAMEDATALEN;
+    /* Convert string to integer */
+    while (len--)
+	h = h * PRIME1 ^ (*key++ - ' ');
+    h %= PRIME2;
+	
+    return (h);
+}
+
+
+uint32 hashchar16(char *key)
+{
+    uint32 h;
+    int len;
+ 
+    h = 0;
+    len = sizeof(char16);
+    /* Convert string to integer */
+    while (len--)
+	h = h * PRIME1 ^ (*key++ - ' ');
+    h %= PRIME2;
+	
+    return (h);
+}
+
+
+/*
+ * (Comment from the original db3 hashing code: )
+ *
+ * "This is INCREDIBLY ugly, but fast.  We break the string up into 8 byte
+ * units.  On the first time through the loop we get the 'leftover bytes'
+ * (strlen % 8).  On every other iteration, we perform 8 HASHC's so we handle
+ * all 8 bytes.  Essentially, this saves us 7 cmp & branch instructions.  If
+ * this routine is heavily used enough, it's worth the ugly coding.
+ *
+ * "OZ's original sdbm hash"
+ */
+uint32 hashtext(struct varlena *key)
+{
+    int keylen;
+    char *keydata;
+    uint32 n;
+    int loop;
+
+    keydata = VARDATA(key);
+    keylen = VARSIZE(key);
+
+    /* keylen includes the four bytes in which string keylength is stored */
+    keylen -= sizeof(VARSIZE(key));
+
+#define HASHC   n = *keydata++ + 65599 * n
+
+    n = 0;
+    if (keylen > 0) {
+	loop = (keylen + 8 - 1) >> 3;
+	
+	switch (keylen & (8 - 1)) {
+	case 0:
+	    do {	/* All fall throughs */
+		HASHC;
+	    case 7:
+		HASHC;
+	    case 6:
+		HASHC;
+	    case 5:
+		HASHC;
+	    case 4:
+		HASHC;
+	    case 3:
+		HASHC;
+	    case 2:
+		HASHC;
+	    case 1:
+		HASHC;
+	    } while (--loop);
+	}
+    }
+    return (n);
+}	
--- a/src/backend/access/hash/hashinsert.c
+++ b/src/backend/access/hash/hashinsert.c
@@ -0,0 +1,239 @@
+/*-------------------------------------------------------------------------
+ *
+ * hashinsert.c--
+ *    Item insertion in hash tables for Postgres.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *    $Header: /cvsroot/pgsql/src/backend/access/hash/hashinsert.c,v 1.1.1.1 1996/07/09 06:21:10 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+
+#include "utils/elog.h"
+#include "utils/palloc.h"
+#include "utils/rel.h"
+#include "utils/excid.h"
+
+#include "access/heapam.h"
+#include "access/genam.h"
+#include "access/hash.h"
+
+static InsertIndexResult _hash_insertonpg(Relation rel, Buffer buf, int keysz, ScanKey scankey, HashItem hitem, Buffer metabuf);
+static OffsetNumber _hash_pgaddtup(Relation rel, Buffer buf, int keysz, ScanKey itup_scankey, Size itemsize, HashItem hitem);
+
+/*
+ *  _hash_doinsert() -- Handle insertion of a single HashItem in the table.
+ *
+ *	This routine is called by the public interface routines, hashbuild
+ *	and hashinsert.  By here, hashitem is filled in, and has a unique
+ *	(xid, seqno) pair. The datum to be used as a "key" is in the
+ * 	hashitem. 
+ */
+InsertIndexResult
+_hash_doinsert(Relation rel, HashItem hitem)
+{
+    Buffer buf;
+    Buffer metabuf;
+    BlockNumber blkno;
+    HashMetaPage metap;
+    IndexTuple itup;
+    InsertIndexResult res;
+    ScanKey itup_scankey;
+    int natts;
+    Page page;
+    
+    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ);
+    metap = (HashMetaPage) BufferGetPage(metabuf);
+    _hash_checkpage((Page) metap, LH_META_PAGE);
+    
+    /* we need a scan key to do our search, so build one */
+    itup = &(hitem->hash_itup);
+    if ((natts = rel->rd_rel->relnatts) != 1)
+	elog(WARN, "Hash indices valid for only one index key.");
+    itup_scankey = _hash_mkscankey(rel, itup, metap);
+    
+    /* 
+     * find the first page in the bucket chain containing this key and
+     * place it in buf.  _hash_search obtains a read lock for us.
+     */
+    _hash_search(rel, natts, itup_scankey, &buf, metap);
+    page = BufferGetPage(buf);
+    _hash_checkpage(page, LH_BUCKET_PAGE);
+
+    /*
+     * trade in our read lock for a write lock so that we can do the
+     * insertion.
+     */
+    blkno = BufferGetBlockNumber(buf);
+    _hash_relbuf(rel, buf, HASH_READ);
+    buf = _hash_getbuf(rel, blkno, HASH_WRITE);
+    
+    
+    /*
+     * XXX btree comment (haven't decided what to do in hash): don't
+     * think the bucket can be split while we're reading the metapage.
+     *
+     * If the page was split between the time that we surrendered our
+     * read lock and acquired our write lock, then this page may no
+     * longer be the right place for the key we want to insert.
+     */
+    
+    /* do the insertion */
+    res = _hash_insertonpg(rel, buf, natts, itup_scankey,
+			   hitem, metabuf);
+    
+    /* be tidy */
+    _hash_freeskey(itup_scankey);
+    
+    return (res);
+}
+
+/*
+ *  _hash_insertonpg() -- Insert a tuple on a particular page in the table.
+ *
+ *	This recursive procedure does the following things:
+ *
+ *	    +  if necessary, splits the target page.  
+ *	    +  inserts the tuple.
+ *
+ *	On entry, we must have the right buffer on which to do the
+ *	insertion, and the buffer must be pinned and locked.  On return,
+ *	we will have dropped both the pin and the write lock on the buffer.
+ *
+ */
+static InsertIndexResult
+_hash_insertonpg(Relation rel,
+		 Buffer buf,
+		 int keysz,
+		 ScanKey scankey,
+		 HashItem hitem,
+		 Buffer metabuf)
+{
+    InsertIndexResult res; 
+    Page page;
+    BlockNumber itup_blkno;
+    OffsetNumber itup_off;
+    int itemsz;
+    HashPageOpaque pageopaque;
+    bool do_expand = false;	 
+    Buffer ovflbuf;
+    HashMetaPage metap;
+    Bucket bucket;
+    
+    metap = (HashMetaPage) BufferGetPage(metabuf);
+    _hash_checkpage((Page) metap, LH_META_PAGE);
+    
+    page = BufferGetPage(buf);
+    _hash_checkpage(page, LH_BUCKET_PAGE|LH_OVERFLOW_PAGE);
+    pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
+    bucket = pageopaque->hasho_bucket;
+
+    itemsz = IndexTupleDSize(hitem->hash_itup)
+	+ (sizeof(HashItemData) - sizeof(IndexTupleData));
+    itemsz = DOUBLEALIGN(itemsz);
+    
+    while (PageGetFreeSpace(page) < itemsz) {
+	/* 
+         * no space on this page; check for an overflow page 
+	 */
+	if (BlockNumberIsValid(pageopaque->hasho_nextblkno)) {
+	    /* 
+	     * ovfl page exists; go get it.  if it doesn't have room,
+	     * we'll find out next pass through the loop test above.
+	     */
+	    ovflbuf = _hash_getbuf(rel, pageopaque->hasho_nextblkno,
+				   HASH_WRITE);
+	    _hash_relbuf(rel, buf, HASH_WRITE);
+	    buf = ovflbuf;
+	    page = BufferGetPage(buf);
+	} else {
+	    /* 
+	     * we're at the end of the bucket chain and we haven't
+	     * found a page with enough room.  allocate a new overflow
+	     * page.
+	     */
+	    do_expand = true;
+	    ovflbuf = _hash_addovflpage(rel, &metabuf, buf);
+	    _hash_relbuf(rel, buf, HASH_WRITE);
+	    buf = ovflbuf;
+	    page = BufferGetPage(buf);
+
+	    if (PageGetFreeSpace(page) < itemsz) {
+		/* it doesn't fit on an empty page -- give up */
+		elog(WARN, "hash item too large");
+	    }
+	}
+	_hash_checkpage(page, LH_OVERFLOW_PAGE);
+	pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
+	Assert(pageopaque->hasho_bucket == bucket);
+    }
+
+    itup_off = _hash_pgaddtup(rel, buf, keysz, scankey, itemsz, hitem);
+    itup_blkno = BufferGetBlockNumber(buf);
+    
+    /* by here, the new tuple is inserted */
+    res = (InsertIndexResult) palloc(sizeof(InsertIndexResultData));
+    
+    ItemPointerSet(&(res->pointerData), itup_blkno, itup_off);
+    
+    if (res != NULL) {
+	/* 
+	 * Increment the number of keys in the table.
+	 * We switch lock access type just for a moment
+	 * to allow greater accessibility to the metapage. 
+	 */
+	metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf,
+						  HASH_READ, HASH_WRITE);
+	metap->hashm_nkeys += 1;
+	metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf,
+						  HASH_WRITE, HASH_READ);
+	
+    }
+    
+    _hash_wrtbuf(rel, buf);
+    
+    if (do_expand || 
+	(metap->hashm_nkeys / (metap->hashm_maxbucket + 1))
+	> metap->hashm_ffactor) {
+	_hash_expandtable(rel, metabuf);
+    }
+    _hash_relbuf(rel, metabuf, HASH_READ);
+    return (res);
+}	
+
+/*
+ *  _hash_pgaddtup() -- add a tuple to a particular page in the index.
+ *
+ *	This routine adds the tuple to the page as requested, and keeps the
+ *	write lock and reference associated with the page's buffer.  It is
+ *	an error to call pgaddtup() without a write lock and reference.
+ */
+static OffsetNumber
+_hash_pgaddtup(Relation rel,
+	       Buffer buf,
+	       int keysz,
+	       ScanKey itup_scankey,
+	       Size itemsize,
+	       HashItem hitem)
+{
+    OffsetNumber itup_off;
+    Page page;
+    
+    page = BufferGetPage(buf);
+    _hash_checkpage(page, LH_BUCKET_PAGE|LH_OVERFLOW_PAGE);
+
+    itup_off = OffsetNumberNext(PageGetMaxOffsetNumber(page));
+    (void) PageAddItem(page, (Item) hitem, itemsize, itup_off, LP_USED);
+    
+    /* write the buffer, but hold our lock */
+    _hash_wrtnorelbuf(rel, buf);
+    
+    return (itup_off);
+}
--- a/src/backend/access/hash/hashovfl.c
+++ b/src/backend/access/hash/hashovfl.c
@@ -0,0 +1,614 @@
+/*-------------------------------------------------------------------------
+ *
+ * hashovfl.c--
+ *    Overflow page management code for the Postgres hash access method
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *    $Header: /cvsroot/pgsql/src/backend/access/hash/hashovfl.c,v 1.1.1.1 1996/07/09 06:21:10 scrappy Exp $
+ *
+ * NOTES
+ *    Overflow pages look like ordinary relation pages.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+
+#include "utils/elog.h"
+#include "utils/rel.h"
+#include "utils/excid.h"
+
+#include "access/genam.h"
+#include "access/hash.h"
+
+static OverflowPageAddress _hash_getovfladdr(Relation rel, Buffer *metabufp);
+static uint32 _hash_firstfreebit(uint32 map);
+
+/*
+ *  _hash_addovflpage
+ *
+ *  Add an overflow page to the page currently pointed to by the buffer 
+ *  argument 'buf'. 
+ *
+ *  *Metabufp has a read lock upon entering the function; buf has a 
+ *  write lock. 
+ *  
+ */
+Buffer
+_hash_addovflpage(Relation rel, Buffer *metabufp, Buffer buf)
+{
+    
+    OverflowPageAddress oaddr;
+    BlockNumber ovflblkno;
+    Buffer ovflbuf;
+    HashMetaPage metap;
+    HashPageOpaque ovflopaque;
+    HashPageOpaque pageopaque;
+    Page page;
+    Page ovflpage;
+    
+    /* this had better be the last page in a bucket chain */
+    page = BufferGetPage(buf);
+    _hash_checkpage(page, LH_BUCKET_PAGE|LH_OVERFLOW_PAGE);
+    pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
+    Assert(!BlockNumberIsValid(pageopaque->hasho_nextblkno));
+    
+    metap = (HashMetaPage) BufferGetPage(*metabufp);
+    _hash_checkpage((Page) metap, LH_META_PAGE);
+
+    /* allocate an empty overflow page */
+    oaddr = _hash_getovfladdr(rel, metabufp);
+    if (oaddr == InvalidOvflAddress) {
+	elog(WARN, "_hash_addovflpage: problem with _hash_getovfladdr.");
+    }
+    ovflblkno = OADDR_TO_BLKNO(OADDR_OF(SPLITNUM(oaddr), OPAGENUM(oaddr)));
+    Assert(BlockNumberIsValid(ovflblkno));
+    ovflbuf = _hash_getbuf(rel, ovflblkno, HASH_WRITE);
+    Assert(BufferIsValid(ovflbuf));
+    ovflpage = BufferGetPage(ovflbuf);
+
+    /* initialize the new overflow page */
+    _hash_pageinit(ovflpage, BufferGetPageSize(ovflbuf));
+    ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
+    ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf);
+    ovflopaque->hasho_nextblkno = InvalidBlockNumber;
+    ovflopaque->hasho_flag = LH_OVERFLOW_PAGE;
+    ovflopaque->hasho_oaddr = oaddr;
+    ovflopaque->hasho_bucket = pageopaque->hasho_bucket;
+    _hash_wrtnorelbuf(rel, ovflbuf);
+    
+    /* logically chain overflow page to previous page */
+    pageopaque->hasho_nextblkno = ovflblkno;
+    _hash_wrtnorelbuf(rel, buf);
+    return (ovflbuf);
+}
+
+/*
+ *  _hash_getovfladdr()
+ *
+ *  Find an available overflow page and return its address. 
+ *
+ *  When we enter this function, we have a read lock on *metabufp which
+ *  we change to a write lock immediately. Before exiting, the write lock
+ *  is exchanged for a read lock. 
+ *
+ */
+static OverflowPageAddress
+_hash_getovfladdr(Relation rel, Buffer *metabufp)
+{
+    HashMetaPage metap;
+    Buffer mapbuf;
+    BlockNumber blkno;
+    PageOffset offset;
+    OverflowPageAddress oaddr;
+    SplitNumber splitnum;
+    uint32 *freep;
+    uint32 max_free; 
+    uint32 bit;
+    uint32 first_page; 
+    uint32 free_bit; 
+    uint32 free_page; 
+    uint32 in_use_bits;
+    uint32 i, j;
+    
+    metap = (HashMetaPage) _hash_chgbufaccess(rel, metabufp, HASH_READ, HASH_WRITE);
+    
+    splitnum = metap->OVFL_POINT;
+    max_free = metap->SPARES[splitnum];
+    
+    free_page = (max_free - 1) >> (metap->BSHIFT + BYTE_TO_BIT);
+    free_bit = (max_free - 1) & (BMPGSZ_BIT(metap) - 1);
+    
+    /* Look through all the free maps to find the first free block */
+    first_page = metap->LAST_FREED >> (metap->BSHIFT + BYTE_TO_BIT);
+    for ( i = first_page; i <= free_page; i++ ) {
+	Page mappage;
+
+	blkno = metap->hashm_mapp[i];
+	mapbuf = _hash_getbuf(rel, blkno, HASH_WRITE);
+	mappage = BufferGetPage(mapbuf);
+	_hash_checkpage(mappage, LH_BITMAP_PAGE);
+	freep = HashPageGetBitmap(mappage);
+	Assert(freep);
+	
+	if (i == free_page)
+	    in_use_bits = free_bit;
+	else
+	    in_use_bits = BMPGSZ_BIT(metap) - 1;
+	
+	if (i == first_page) {
+	    bit = metap->LAST_FREED & (BMPGSZ_BIT(metap) - 1);
+	    j = bit / BITS_PER_MAP;
+	    bit = bit & ~(BITS_PER_MAP - 1);
+	} else {
+	    bit = 0;
+	    j = 0;
+	}
+	for (; bit <= in_use_bits; j++, bit += BITS_PER_MAP)
+	    if (freep[j] != ALL_SET)
+		goto found;
+    }
+    
+    /* No Free Page Found - have to allocate a new page */
+    metap->LAST_FREED = metap->SPARES[splitnum];
+    metap->SPARES[splitnum]++;
+    offset = metap->SPARES[splitnum] -
+	(splitnum ? metap->SPARES[splitnum - 1] : 0);
+    
+#define	OVMSG	"HASH: Out of overflow pages.  Out of luck.\n"
+    
+    if (offset > SPLITMASK) {
+	if (++splitnum >= NCACHED) {
+	    elog(WARN, OVMSG);
+	}
+	metap->OVFL_POINT = splitnum;
+	metap->SPARES[splitnum] = metap->SPARES[splitnum-1];
+	metap->SPARES[splitnum-1]--;
+	offset = 0;
+    }
+    
+    /* Check if we need to allocate a new bitmap page */
+    if (free_bit == BMPGSZ_BIT(metap) - 1) {
+	/* won't be needing old map page */
+
+	_hash_relbuf(rel, mapbuf, HASH_WRITE);
+
+	free_page++;
+	if (free_page >= NCACHED) {
+	    elog(WARN, OVMSG);
+	}
+	
+	/*
+	 * This is tricky.  The 1 indicates that you want the new page
+	 * allocated with 1 clear bit.  Actually, you are going to
+	 * allocate 2 pages from this map.  The first is going to be
+	 * the map page, the second is the overflow page we were
+	 * looking for.  The init_bitmap routine automatically, sets
+	 * the first bit of itself to indicate that the bitmap itself
+	 * is in use.  We would explicitly set the second bit, but
+	 * don't have to if we tell init_bitmap not to leave it clear
+	 * in the first place.
+	 */
+	if (_hash_initbitmap(rel, metap, OADDR_OF(splitnum, offset),
+			     1, free_page)) {
+	    elog(WARN, "overflow_page: problem with _hash_initbitmap.");
+	}
+	metap->SPARES[splitnum]++;
+	offset++;
+	if (offset > SPLITMASK) {
+	    if (++splitnum >= NCACHED) {
+		elog(WARN, OVMSG);
+	    }
+	    metap->OVFL_POINT = splitnum;
+	    metap->SPARES[splitnum] = metap->SPARES[splitnum-1];
+	    metap->SPARES[splitnum-1]--;
+	    offset = 0;
+	}
+    } else {
+	
+	/*
+	 * Free_bit addresses the last used bit.  Bump it to address
+	 * the first available bit.
+	 */
+	free_bit++;
+	SETBIT(freep, free_bit);
+	_hash_wrtbuf(rel, mapbuf);
+    }
+    
+    /* Calculate address of the new overflow page */
+    oaddr = OADDR_OF(splitnum, offset);
+    _hash_chgbufaccess(rel, metabufp, HASH_WRITE, HASH_READ);
+    return (oaddr);
+    
+ found:
+    bit = bit + _hash_firstfreebit(freep[j]);
+    SETBIT(freep, bit);
+    _hash_wrtbuf(rel, mapbuf);
+    
+    /*
+     * Bits are addressed starting with 0, but overflow pages are addressed
+     * beginning at 1. Bit is a bit addressnumber, so we need to increment
+     * it to convert it to a page number.
+     */
+    
+    bit = 1 + bit + (i * BMPGSZ_BIT(metap));
+    if (bit >= metap->LAST_FREED) {
+	metap->LAST_FREED = bit - 1;
+    }
+    
+    /* Calculate the split number for this page */
+    for (i = 0; (i < splitnum) && (bit > metap->SPARES[i]); i++)
+	;
+    offset = (i ? bit - metap->SPARES[i - 1] : bit);
+    if (offset >= SPLITMASK) {
+	elog(WARN, OVMSG);
+    }
+    
+    /* initialize this page */
+    oaddr = OADDR_OF(i, offset);
+    _hash_chgbufaccess(rel, metabufp, HASH_WRITE, HASH_READ);
+    return (oaddr);
+}
+
+/*
+ *  _hash_firstfreebit()
+ *
+ *  Return the first bit that is not set in the argument 'map'. This
+ *  function is used to find an available overflow page within a
+ *  splitnumber. 
+ * 
+ */
+static uint32
+_hash_firstfreebit(uint32 map)
+{
+    uint32 i, mask;
+    
+    mask = 0x1;
+    for (i = 0; i < BITS_PER_MAP; i++) {
+	if (!(mask & map))
+	    return (i);
+	mask = mask << 1;
+    }
+    return (i);
+}
+
+/*
+ *  _hash_freeovflpage() - 
+ *
+ *  Mark this overflow page as free and return a buffer with 
+ *  the page that follows it (which may be defined as
+ *  InvalidBuffer). 
+ *
+ */
+Buffer
+_hash_freeovflpage(Relation rel, Buffer ovflbuf)
+{
+    HashMetaPage metap;
+    Buffer metabuf;
+    Buffer mapbuf;
+    BlockNumber prevblkno;
+    BlockNumber blkno;
+    BlockNumber nextblkno;
+    HashPageOpaque ovflopaque;
+    Page ovflpage;
+    Page mappage;
+    OverflowPageAddress addr;
+    SplitNumber splitnum;
+    uint32 *freep;
+    uint32 ovflpgno;
+    int32 bitmappage, bitmapbit;
+    Bucket bucket;
+    
+    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE);
+    metap = (HashMetaPage) BufferGetPage(metabuf);
+    _hash_checkpage((Page) metap, LH_META_PAGE);
+    
+    ovflpage = BufferGetPage(ovflbuf);
+    _hash_checkpage(ovflpage, LH_OVERFLOW_PAGE);
+    ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage);
+    addr = ovflopaque->hasho_oaddr;
+    nextblkno = ovflopaque->hasho_nextblkno;
+    prevblkno = ovflopaque->hasho_prevblkno;
+    bucket = ovflopaque->hasho_bucket;
+    (void) memset(ovflpage, 0, BufferGetPageSize(ovflbuf));
+    _hash_wrtbuf(rel, ovflbuf);
+    
+    /* 
+     * fix up the bucket chain.  this is a doubly-linked list, so we
+     * must fix up the bucket chain members behind and ahead of the
+     * overflow page being deleted.
+     *
+     * XXX this should look like:
+     * - lock prev/next
+     * - modify/write prev/next (how to do write ordering with a
+     * doubly-linked list???)
+     * - unlock prev/next
+     */
+    if (BlockNumberIsValid(prevblkno)) {
+	Buffer prevbuf = _hash_getbuf(rel, prevblkno, HASH_WRITE);
+	Page prevpage = BufferGetPage(prevbuf);
+	HashPageOpaque prevopaque =
+	    (HashPageOpaque) PageGetSpecialPointer(prevpage);
+
+	_hash_checkpage(prevpage, LH_BUCKET_PAGE|LH_OVERFLOW_PAGE);
+	Assert(prevopaque->hasho_bucket == bucket);
+	prevopaque->hasho_nextblkno = nextblkno;
+	_hash_wrtbuf(rel, prevbuf);
+    }
+    if (BlockNumberIsValid(nextblkno)) {
+	Buffer nextbuf = _hash_getbuf(rel, nextblkno, HASH_WRITE);
+	Page nextpage = BufferGetPage(nextbuf);
+	HashPageOpaque nextopaque =
+	    (HashPageOpaque) PageGetSpecialPointer(nextpage);
+	
+	_hash_checkpage(nextpage, LH_OVERFLOW_PAGE);
+	Assert(nextopaque->hasho_bucket == bucket);
+	nextopaque->hasho_prevblkno = prevblkno;
+	_hash_wrtbuf(rel, nextbuf);
+    }
+    
+    /* 
+     * Fix up the overflow page bitmap that tracks this particular
+     * overflow page. The bitmap can be found in the MetaPageData
+     * array element hashm_mapp[bitmappage].
+     */
+    splitnum = (addr >> SPLITSHIFT);
+    ovflpgno =
+	(splitnum ? metap->SPARES[splitnum - 1] : 0) + (addr & SPLITMASK) - 1;
+    
+    if (ovflpgno < metap->LAST_FREED) {
+	metap->LAST_FREED = ovflpgno;
+    }
+    
+    bitmappage = (ovflpgno >> (metap->BSHIFT + BYTE_TO_BIT));
+    bitmapbit = ovflpgno & (BMPGSZ_BIT(metap) - 1);
+    
+    blkno = metap->hashm_mapp[bitmappage];
+    mapbuf = _hash_getbuf(rel, blkno, HASH_WRITE);
+    mappage = BufferGetPage(mapbuf);
+    _hash_checkpage(mappage, LH_BITMAP_PAGE);
+    freep = HashPageGetBitmap(mappage);
+    CLRBIT(freep, bitmapbit);
+    _hash_wrtbuf(rel, mapbuf);
+    
+    _hash_relbuf(rel, metabuf, HASH_WRITE);
+    
+    /* 
+     * now instantiate the page that replaced this one, 
+     * if it exists, and return that buffer with a write lock.
+     */
+    if (BlockNumberIsValid(nextblkno)) {
+	return (_hash_getbuf(rel, nextblkno, HASH_WRITE));
+    } else {
+	return (InvalidBuffer);
+    }
+}
+
+
+/*
+ *  _hash_initbitmap()
+ *  
+ *   Initialize a new bitmap page.  The metapage has a write-lock upon
+ *   entering the function.
+ *
+ * 'pnum' is the OverflowPageAddress of the new bitmap page.
+ * 'nbits' is how many bits to clear (i.e., make available) in the new
+ * bitmap page.  the remainder of the bits (as well as the first bit,
+ * representing the bitmap page itself) will be set.
+ * 'ndx' is the 0-based offset of the new bitmap page within the
+ * metapage's array of bitmap page OverflowPageAddresses.
+ */
+
+#define INT_MASK	((1 << INT_TO_BIT) -1)
+
+int32
+_hash_initbitmap(Relation rel,
+		 HashMetaPage metap,
+		 int32 pnum,
+		 int32 nbits,
+		 int32 ndx)
+{
+    Buffer buf;
+    BlockNumber blkno;
+    Page pg;
+    HashPageOpaque op;
+    uint32 *freep;
+    int clearbytes, clearints;
+    
+    blkno = OADDR_TO_BLKNO(pnum);
+    buf = _hash_getbuf(rel, blkno, HASH_WRITE);
+    pg = BufferGetPage(buf);
+    _hash_pageinit(pg, BufferGetPageSize(buf));
+    op = (HashPageOpaque) PageGetSpecialPointer(pg);
+    op->hasho_oaddr = InvalidOvflAddress;
+    op->hasho_prevblkno = InvalidBlockNumber;
+    op->hasho_nextblkno = InvalidBlockNumber;
+    op->hasho_flag = LH_BITMAP_PAGE;
+    op->hasho_bucket = -1;
+
+    freep = HashPageGetBitmap(pg);
+
+    /* set all of the bits above 'nbits' to 1 */
+    clearints = ((nbits - 1) >> INT_TO_BIT) + 1;
+    clearbytes = clearints << INT_TO_BYTE;
+    (void) memset((char *) freep, 0, clearbytes);
+    (void) memset(((char *) freep) + clearbytes, 0xFF,
+		  BMPGSZ_BYTE(metap) - clearbytes);
+    freep[clearints - 1] = ALL_SET << (nbits & INT_MASK);
+
+    /* bit 0 represents the new bitmap page */
+    SETBIT(freep, 0);
+        
+    /* metapage already has a write lock */
+    metap->hashm_nmaps++;
+    metap->hashm_mapp[ndx] = blkno;
+    
+    /* write out the new bitmap page (releasing its locks) */
+    _hash_wrtbuf(rel, buf);
+
+    return (0);
+}
+
+
+/*
+ *  _hash_squeezebucket(rel, bucket)
+ *
+ *  Try to squeeze the tuples onto pages occuring earlier in the
+ *  bucket chain in an attempt to free overflow pages. When we start
+ *  the "squeezing", the page from which we start taking tuples (the
+ *  "read" page) is the last bucket in the bucket chain and the page
+ *  onto which we start squeezing tuples (the "write" page) is the
+ *  first page in the bucket chain.  The read page works backward and
+ *  the write page works forward; the procedure terminates when the
+ *  read page and write page are the same page.
+ */
+void
+_hash_squeezebucket(Relation rel,
+		    HashMetaPage metap, 
+		    Bucket bucket)
+{
+    Buffer wbuf;
+    Buffer rbuf;
+    BlockNumber wblkno;		
+    BlockNumber rblkno;		
+    Page wpage;
+    Page rpage;
+    HashPageOpaque wopaque;
+    HashPageOpaque ropaque;
+    OffsetNumber woffnum;
+    OffsetNumber roffnum;
+    HashItem hitem;
+    int itemsz;
+    
+/*    elog(DEBUG, "_hash_squeezebucket: squeezing bucket %d", bucket); */
+
+    /*
+     * start squeezing into the base bucket page.
+     */
+    wblkno = BUCKET_TO_BLKNO(bucket);
+    wbuf = _hash_getbuf(rel, wblkno, HASH_WRITE);
+    wpage = BufferGetPage(wbuf);
+    _hash_checkpage(wpage, LH_BUCKET_PAGE);
+    wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);
+    
+    /*
+     * if there aren't any overflow pages, there's nothing to squeeze.
+     */
+    if (!BlockNumberIsValid(wopaque->hasho_nextblkno)) {
+	_hash_relbuf(rel, wbuf, HASH_WRITE);
+	return;
+    }
+    
+    /*
+     * find the last page in the bucket chain by starting at the base
+     * bucket page and working forward.
+     *
+     * XXX if chains tend to be long, we should probably move forward
+     * using HASH_READ and then _hash_chgbufaccess to HASH_WRITE when
+     * we reach the end.  if they are short we probably don't care
+     * very much.  if the hash function is working at all, they had
+     * better be short..
+     */
+    ropaque = wopaque;
+    do {
+	rblkno = ropaque->hasho_nextblkno;
+	if (ropaque != wopaque) {
+	    _hash_relbuf(rel, rbuf, HASH_WRITE);
+	}
+	rbuf = _hash_getbuf(rel, rblkno, HASH_WRITE);
+	rpage = BufferGetPage(rbuf);
+	_hash_checkpage(rpage, LH_OVERFLOW_PAGE);
+	Assert(!PageIsEmpty(rpage));
+	ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
+	Assert(ropaque->hasho_bucket == bucket);
+    } while (BlockNumberIsValid(ropaque->hasho_nextblkno));
+
+    /*
+     * squeeze the tuples.
+     */
+    roffnum = FirstOffsetNumber;
+    for(;;) {
+	hitem = (HashItem) PageGetItem(rpage, PageGetItemId(rpage, roffnum));
+	itemsz = IndexTupleDSize(hitem->hash_itup) 
+	    + (sizeof(HashItemData) - sizeof(IndexTupleData));
+	itemsz = DOUBLEALIGN(itemsz);
+	
+	/*
+	 * walk up the bucket chain, looking for a page big enough for
+	 * this item.
+	 */
+	while (PageGetFreeSpace(wpage) < itemsz) {
+	    wblkno = wopaque->hasho_nextblkno;
+
+	    _hash_wrtbuf(rel, wbuf);
+
+	    if (!BlockNumberIsValid(wblkno) || (rblkno == wblkno)) {
+		_hash_wrtbuf(rel, rbuf);
+		/* wbuf is already released */
+		return;
+	    }
+	    
+	    wbuf = _hash_getbuf(rel, wblkno, HASH_WRITE);
+	    wpage = BufferGetPage(wbuf);
+	    _hash_checkpage(wpage, LH_OVERFLOW_PAGE);
+	    Assert(!PageIsEmpty(wpage));
+	    wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage);
+	    Assert(wopaque->hasho_bucket == bucket);
+	}
+	
+	/* 
+	 * if we're here, we have found room so insert on the "write"
+	 * page.
+	 */
+	woffnum = OffsetNumberNext(PageGetMaxOffsetNumber(wpage));
+	(void) PageAddItem(wpage, (Item) hitem, itemsz, woffnum, LP_USED);
+	
+	/* 
+	 * delete the tuple from the "read" page.
+	 * PageIndexTupleDelete repacks the ItemId array, so 'roffnum'
+	 * will be "advanced" to the "next" ItemId.
+	 */
+	PageIndexTupleDelete(rpage, roffnum);
+	_hash_wrtnorelbuf(rel, rbuf);
+	
+	/*
+	 * if the "read" page is now empty because of the deletion,
+	 * free it.
+	 */
+	if (PageIsEmpty(rpage) && (ropaque->hasho_flag & LH_OVERFLOW_PAGE)) {
+	    rblkno = ropaque->hasho_prevblkno;
+	    Assert(BlockNumberIsValid(rblkno));
+
+	    /*
+	     * free this overflow page.  the extra _hash_relbuf is
+	     * because _hash_freeovflpage gratuitously returns the
+	     * next page (we want the previous page and will get it
+	     * ourselves later).
+	     */
+	    rbuf = _hash_freeovflpage(rel, rbuf);
+	    if (BufferIsValid(rbuf)) {
+		_hash_relbuf(rel, rbuf, HASH_WRITE);
+	    }
+	    
+	    if (rblkno == wblkno) {
+		/* rbuf is already released */
+		_hash_wrtbuf(rel, wbuf);
+		return;
+	    }
+	    
+	    rbuf = _hash_getbuf(rel, rblkno, HASH_WRITE);
+	    rpage = BufferGetPage(rbuf);
+	    _hash_checkpage(rpage, LH_OVERFLOW_PAGE);
+	    Assert(!PageIsEmpty(rpage));
+	    ropaque = (HashPageOpaque) PageGetSpecialPointer(rpage);
+	    Assert(ropaque->hasho_bucket == bucket);
+
+	    roffnum = FirstOffsetNumber;
+	}
+    }
+}
--- a/src/backend/access/hash/hashpage.c
+++ b/src/backend/access/hash/hashpage.c
@@ -0,0 +1,669 @@
+/*-------------------------------------------------------------------------
+ *
+ * hashpage.c--
+ *    Hash table page management code for the Postgres hash access method
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *    $Header: /cvsroot/pgsql/src/backend/access/hash/hashpage.c,v 1.1.1.1 1996/07/09 06:21:10 scrappy Exp $
+ *
+ * NOTES
+ *    Postgres hash pages look like ordinary relation pages.  The opaque
+ *    data at high addresses includes information about the page including
+ *    whether a page is an overflow page or a true bucket, the block 
+ *    numbers of the preceding and following pages, and the overflow
+ *    address of the page if it is an overflow page.
+ *
+ *    The first page in a hash relation, page zero, is special -- it stores
+ *    information describing the hash table; it is referred to as teh
+ *    "meta page." Pages one and higher store the actual data. 
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+
+#include "utils/elog.h"
+#include "utils/rel.h"
+#include "utils/excid.h"
+
+#include "access/genam.h"
+#include "access/hash.h"
+
+static void _hash_setpagelock(Relation rel, BlockNumber blkno, int access);
+static void _hash_unsetpagelock(Relation rel, BlockNumber blkno, int access);
+static void _hash_splitpage(Relation rel, Buffer metabuf, Bucket obucket, Bucket nbucket);
+
+/*  
+ *  We use high-concurrency locking on hash indices.  There are two cases in
+ *  which we don't do locking.  One is when we're building the index.
+ *  Since the creating transaction has not committed, no one can see
+ *  the index, and there's no reason to share locks.  The second case
+ *  is when we're just starting up the database system.  We use some
+ *  special-purpose initialization code in the relation cache manager
+ *  (see utils/cache/relcache.c) to allow us to do indexed scans on
+ *  the system catalogs before we'd normally be able to.  This happens
+ *  before the lock table is fully initialized, so we can't use it.
+ *  Strictly speaking, this violates 2pl, but we don't do 2pl on the
+ *  system catalogs anyway.
+ */
+
+
+#define USELOCKING	(!BuildingHash && !IsInitProcessingMode())
+
+
+/*
+ *  _hash_metapinit() -- Initialize the metadata page of a hash index,
+ *		the two buckets that we begin with and the initial
+ *		bitmap page.
+ */
+void
+_hash_metapinit(Relation rel)
+{
+    HashMetaPage metap;
+    HashPageOpaque pageopaque;
+    Buffer metabuf;
+    Buffer buf;
+    Page pg;
+    int nbuckets;
+    uint32 nelem;			/* number elements */
+    uint32 lg2nelem;			/* _hash_log2(nelem)   */
+    uint32 nblocks;
+    uint16 i;
+    
+    /* can't be sharing this with anyone, now... */
+    if (USELOCKING)
+	RelationSetLockForWrite(rel);
+    
+    if ((nblocks = RelationGetNumberOfBlocks(rel)) != 0) {
+	elog(WARN, "Cannot initialize non-empty hash table %s",
+	     RelationGetRelationName(rel));
+    }
+    
+    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE);
+    pg = BufferGetPage(metabuf);
+    metap = (HashMetaPage) pg;
+    _hash_pageinit(pg, BufferGetPageSize(metabuf));
+    
+    metap->hashm_magic 		= HASH_MAGIC;
+    metap->hashm_version 	= HASH_VERSION;
+    metap->hashm_nkeys 		= 0;
+    metap->hashm_nmaps 		= 0;
+    metap->hashm_ffactor 	= DEFAULT_FFACTOR;
+    metap->hashm_bsize 		= BufferGetPageSize(metabuf);
+    metap->hashm_bshift		= _hash_log2(metap->hashm_bsize);
+    for (i = metap->hashm_bshift; i > 0; --i) {
+	if ((1 << i) < (metap->hashm_bsize -
+			(DOUBLEALIGN(sizeof(PageHeaderData)) +
+			 DOUBLEALIGN(sizeof(HashPageOpaqueData))))) {
+	    break;
+	}
+    }
+    Assert(i);
+    metap->hashm_bmsize		= 1 << i;
+    metap->hashm_procid		= index_getprocid(rel, 1, HASHPROC);
+    
+    /* 
+     * Make nelem = 2 rather than 0 so that we end up allocating space 
+     * for the next greater power of two number of buckets. 
+     */
+    nelem = 2;
+    lg2nelem = 1; 	 	/*_hash_log2(MAX(nelem, 2)) */
+    nbuckets = 2; 		/*1 << lg2nelem */
+    
+    memset((char *) metap->hashm_spares, 0, sizeof(metap->hashm_spares));
+    memset((char *) metap->hashm_mapp, 0, sizeof(metap->hashm_mapp));
+    
+    metap->hashm_spares[lg2nelem]     = 2;	/* lg2nelem + 1 */
+    metap->hashm_spares[lg2nelem + 1] = 2;	/* lg2nelem + 1 */
+    metap->hashm_ovflpoint            = 1;	/* lg2nelem */
+    metap->hashm_lastfreed            = 2;
+    
+    metap->hashm_maxbucket = metap->hashm_lowmask = 1; 	/* nbuckets - 1 */
+    metap->hashm_highmask  = 3;			 /* (nbuckets << 1) - 1 */
+    
+    pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
+    pageopaque->hasho_oaddr = InvalidOvflAddress;
+    pageopaque->hasho_prevblkno = InvalidBlockNumber;
+    pageopaque->hasho_nextblkno = InvalidBlockNumber;
+    pageopaque->hasho_flag = LH_META_PAGE;
+    pageopaque->hasho_bucket = -1;
+
+    /* 
+     * First bitmap page is at: splitpoint lg2nelem page offset 1 which
+     * turns out to be page 3. Couldn't initialize page 3  until we created
+     * the first two buckets above. 
+     */
+    if (_hash_initbitmap(rel, metap, OADDR_OF(lg2nelem, 1), lg2nelem + 1, 0))
+	elog(WARN, "Problem with _hash_initbitmap.");
+
+    /* all done */
+    _hash_wrtnorelbuf(rel, metabuf);
+    
+    /* 
+     * initialize the first two buckets 
+     */
+    for (i = 0; i <= 1; i++) {
+	buf = _hash_getbuf(rel, BUCKET_TO_BLKNO(i), HASH_WRITE);
+	pg = BufferGetPage(buf);
+	_hash_pageinit(pg, BufferGetPageSize(buf));
+	pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
+	pageopaque->hasho_oaddr = InvalidOvflAddress;
+	pageopaque->hasho_prevblkno = InvalidBlockNumber;
+	pageopaque->hasho_nextblkno = InvalidBlockNumber;
+	pageopaque->hasho_flag = LH_BUCKET_PAGE;
+	pageopaque->hasho_bucket = i;
+	_hash_wrtbuf(rel, buf);
+    }
+    
+    _hash_relbuf(rel, metabuf, HASH_WRITE);
+    
+    if (USELOCKING)
+	RelationUnsetLockForWrite(rel);
+}
+
+/*
+ *  _hash_getbuf() -- Get a buffer by block number for read or write.
+ *
+ *	When this routine returns, the appropriate lock is set on the
+ *	requested buffer its reference count is correct.
+ *
+ *	XXX P_NEW is not used because, unlike the tree structures, we
+ *	need the bucket blocks to be at certain block numbers.  we must
+ *	depend on the caller to call _hash_pageinit on the block if it
+ *	knows that this is a new block.
+ */
+Buffer
+_hash_getbuf(Relation rel, BlockNumber blkno, int access)
+{
+    Buffer buf;
+    
+    if (blkno == P_NEW) {
+	elog(WARN, "_hash_getbuf: internal error: hash AM does not use P_NEW");
+    }
+    switch (access) {
+    case HASH_WRITE:
+    case HASH_READ:
+	_hash_setpagelock(rel, blkno, access);
+	break;
+    default:
+	elog(WARN, "_hash_getbuf: invalid access (%d) on new blk: %.*s",
+	     access, NAMEDATALEN, RelationGetRelationName(rel));
+	break;
+    }
+    buf = ReadBuffer(rel, blkno);
+    
+    /* ref count and lock type are correct */
+    return (buf);
+}
+
+/*
+ *  _hash_relbuf() -- release a locked buffer.
+ */
+void
+_hash_relbuf(Relation rel, Buffer buf, int access)
+{
+    BlockNumber blkno;
+    
+    blkno = BufferGetBlockNumber(buf);
+    
+    switch (access) {
+    case HASH_WRITE:
+    case HASH_READ:
+	_hash_unsetpagelock(rel, blkno, access);
+	break;
+    default:
+	elog(WARN, "_hash_relbuf: invalid access (%d) on blk %x: %.*s",
+	     access, blkno, NAMEDATALEN, RelationGetRelationName(rel));
+    }
+    
+    ReleaseBuffer(buf);
+}
+
+/*
+ *  _hash_wrtbuf() -- write a hash page to disk.
+ *
+ *	This routine releases the lock held on the buffer and our reference
+ *	to it.  It is an error to call _hash_wrtbuf() without a write lock
+ *	or a reference to the buffer.
+ */
+void
+_hash_wrtbuf(Relation rel, Buffer buf)
+{
+    BlockNumber blkno;
+    
+    blkno = BufferGetBlockNumber(buf);
+    WriteBuffer(buf);
+    _hash_unsetpagelock(rel, blkno, HASH_WRITE);
+}
+
+/*
+ *  _hash_wrtnorelbuf() -- write a hash page to disk, but do not release
+ *			 our reference or lock.
+ *
+ *	It is an error to call _hash_wrtnorelbuf() without a write lock
+ *	or a reference to the buffer.
+ */
+void
+_hash_wrtnorelbuf(Relation rel, Buffer buf)
+{
+    BlockNumber blkno;
+    
+    blkno = BufferGetBlockNumber(buf);
+    WriteNoReleaseBuffer(buf);
+}
+
+Page
+_hash_chgbufaccess(Relation rel,
+		   Buffer *bufp,
+		   int from_access,
+		   int to_access)
+{
+    BlockNumber blkno;
+    
+    blkno = BufferGetBlockNumber(*bufp);
+    
+    switch (from_access) {
+    case HASH_WRITE:
+	_hash_wrtbuf(rel, *bufp);
+	break;
+    case HASH_READ:
+	_hash_relbuf(rel, *bufp, from_access);
+	break;
+    default:
+	elog(WARN, "_hash_chgbufaccess: invalid access (%d) on blk %x: %.*s",
+	     from_access, blkno, NAMEDATALEN, RelationGetRelationName(rel));
+	break;
+    }
+    *bufp = _hash_getbuf(rel, blkno, to_access);
+    return (BufferGetPage(*bufp));
+}
+
+/*
+ *  _hash_pageinit() -- Initialize a new page.
+ */
+void
+_hash_pageinit(Page page, Size size)
+{
+    Assert(((PageHeader) page)->pd_lower == 0);
+    Assert(((PageHeader) page)->pd_upper == 0);
+    Assert(((PageHeader) page)->pd_special == 0);
+
+    /*
+     *  Cargo-cult programming -- don't really need this to be zero, but
+     *  creating new pages is an infrequent occurrence and it makes me feel
+     *  good when I know they're empty.
+     */
+    memset(page, 0, size);
+    
+    PageInit(page, size, sizeof(HashPageOpaqueData));
+}
+
+static void
+_hash_setpagelock(Relation rel,
+		  BlockNumber blkno,
+		  int access)
+{
+    ItemPointerData iptr;
+    
+    if (USELOCKING) {
+	ItemPointerSet(&iptr, blkno, 1);
+	
+	switch (access) {
+	case HASH_WRITE:
+	    RelationSetSingleWLockPage(rel, &iptr);
+	    break;
+	case HASH_READ:
+	    RelationSetSingleRLockPage(rel, &iptr);
+	    break;
+	default:
+	    elog(WARN, "_hash_setpagelock: invalid access (%d) on blk %x: %.*s",
+		 access, blkno, NAMEDATALEN, RelationGetRelationName(rel));
+	    break;
+	}
+    }
+}
+
+static void
+_hash_unsetpagelock(Relation rel,
+		    BlockNumber blkno,
+		    int access)
+{
+    ItemPointerData iptr;
+    
+    if (USELOCKING) {
+	ItemPointerSet(&iptr, blkno, 1);
+	
+	switch (access) {
+	case HASH_WRITE:
+	    RelationUnsetSingleWLockPage(rel, &iptr);
+	    break;
+	case HASH_READ:
+	    RelationUnsetSingleRLockPage(rel, &iptr);
+	    break;
+	default:
+	    elog(WARN, "_hash_unsetpagelock: invalid access (%d) on blk %x: %.*s",
+		 access, blkno, NAMEDATALEN, RelationGetRelationName(rel));
+	    break;
+	}
+    }
+}
+
+void
+_hash_pagedel(Relation rel, ItemPointer tid)
+{
+    Buffer buf;
+    Buffer metabuf;
+    Page page;
+    BlockNumber blkno;
+    OffsetNumber offno;
+    HashMetaPage metap;
+    HashPageOpaque opaque;
+    
+    blkno = ItemPointerGetBlockNumber(tid);
+    offno = ItemPointerGetOffsetNumber(tid);
+    
+    buf = _hash_getbuf(rel, blkno, HASH_WRITE);
+    page = BufferGetPage(buf);
+    _hash_checkpage(page, LH_BUCKET_PAGE|LH_OVERFLOW_PAGE);
+    opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+    
+    PageIndexTupleDelete(page, offno);
+    _hash_wrtnorelbuf(rel, buf);
+    
+    if (PageIsEmpty(page) && (opaque->hasho_flag & LH_OVERFLOW_PAGE)) {
+	buf = _hash_freeovflpage(rel, buf);
+	if (BufferIsValid(buf)) {
+	    _hash_relbuf(rel, buf, HASH_WRITE);
+	}
+    } else {
+	_hash_relbuf(rel, buf, HASH_WRITE);
+    }
+    
+    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE);
+    metap = (HashMetaPage) BufferGetPage(metabuf);
+    _hash_checkpage((Page) metap, LH_META_PAGE);
+    ++metap->hashm_nkeys;
+    _hash_wrtbuf(rel, metabuf);
+}
+
+void
+_hash_expandtable(Relation rel, Buffer metabuf)
+{
+    HashMetaPage metap;
+    Bucket old_bucket;
+    Bucket new_bucket;
+    uint32 spare_ndx;
+    
+/*    elog(DEBUG, "_hash_expandtable: expanding..."); */
+
+    metap = (HashMetaPage) BufferGetPage(metabuf);
+    _hash_checkpage((Page) metap, LH_META_PAGE);
+    
+    metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_READ, HASH_WRITE);	
+    new_bucket = ++metap->MAX_BUCKET;
+    metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_WRITE, HASH_READ);	
+    old_bucket = (metap->MAX_BUCKET & metap->LOW_MASK);
+    
+    /*
+     * If the split point is increasing (MAX_BUCKET's log base 2
+     * * increases), we need to copy the current contents of the spare
+     * split bucket to the next bucket.
+     */
+    spare_ndx = _hash_log2(metap->MAX_BUCKET + 1);
+    if (spare_ndx > metap->OVFL_POINT) {
+	
+	metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_READ, HASH_WRITE);	
+	metap->SPARES[spare_ndx] = metap->SPARES[metap->OVFL_POINT];
+	metap->OVFL_POINT = spare_ndx;
+	metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_WRITE, HASH_READ);	
+    }
+    
+    if (new_bucket > metap->HIGH_MASK) {
+	
+	/* Starting a new doubling */
+	metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_READ, HASH_WRITE);	
+	metap->LOW_MASK = metap->HIGH_MASK;
+	metap->HIGH_MASK = new_bucket | metap->LOW_MASK;
+	metap = (HashMetaPage) _hash_chgbufaccess(rel, &metabuf, HASH_WRITE, HASH_READ);	
+	
+    }
+    /* Relocate records to the new bucket */
+    _hash_splitpage(rel, metabuf, old_bucket, new_bucket);
+}
+
+
+/*
+ * _hash_splitpage -- split 'obucket' into 'obucket' and 'nbucket'
+ *
+ * this routine is actually misnamed -- we are splitting a bucket that
+ * consists of a base bucket page and zero or more overflow (bucket
+ * chain) pages.
+ */
+static void
+_hash_splitpage(Relation rel,
+		Buffer metabuf,
+		Bucket obucket,
+		Bucket nbucket)
+{
+    Bucket bucket;
+    Buffer obuf;
+    Buffer nbuf;
+    Buffer ovflbuf;
+    BlockNumber oblkno;
+    BlockNumber nblkno;
+    bool null;
+    Datum datum;
+    HashItem hitem;
+    HashPageOpaque oopaque;
+    HashPageOpaque nopaque;
+    HashMetaPage metap;
+    IndexTuple itup;
+    int itemsz;
+    OffsetNumber ooffnum;
+    OffsetNumber noffnum;
+    OffsetNumber omaxoffnum;
+    Page opage;
+    Page npage;
+    TupleDesc itupdesc;
+    
+/*    elog(DEBUG, "_hash_splitpage: splitting %d into %d,%d",
+	 obucket, obucket, nbucket);
+*/
+    metap = (HashMetaPage) BufferGetPage(metabuf);
+    _hash_checkpage((Page) metap, LH_META_PAGE);
+    
+    /* get the buffers & pages */
+    oblkno = BUCKET_TO_BLKNO(obucket);
+    nblkno = BUCKET_TO_BLKNO(nbucket);
+    obuf = _hash_getbuf(rel, oblkno, HASH_WRITE);
+    nbuf = _hash_getbuf(rel, nblkno, HASH_WRITE);
+    opage = BufferGetPage(obuf);
+    npage = BufferGetPage(nbuf);
+
+    /* initialize the new bucket */
+    _hash_pageinit(npage, BufferGetPageSize(nbuf));
+    nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
+    nopaque->hasho_prevblkno = InvalidBlockNumber;
+    nopaque->hasho_nextblkno = InvalidBlockNumber;
+    nopaque->hasho_flag = LH_BUCKET_PAGE;
+    nopaque->hasho_oaddr = InvalidOvflAddress;
+    nopaque->hasho_bucket = nbucket;
+    _hash_wrtnorelbuf(rel, nbuf);
+    
+    /*
+     * make sure the old bucket isn't empty.  advance 'opage' and
+     * friends through the overflow bucket chain until we find a
+     * non-empty page.
+     *
+     * XXX we should only need this once, if we are careful to
+     * preserve the invariant that overflow pages are never empty.
+     */
+    _hash_checkpage(opage, LH_BUCKET_PAGE);
+    oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
+    if (PageIsEmpty(opage)) {
+	oblkno = oopaque->hasho_nextblkno;
+	_hash_relbuf(rel, obuf, HASH_WRITE);
+	if (!BlockNumberIsValid(oblkno)) {
+	    /*
+	     * the old bucket is completely empty; of course, the new
+	     * bucket will be as well, but since it's a base bucket
+	     * page we don't care.
+	     */
+	    _hash_relbuf(rel, nbuf, HASH_WRITE);
+	    return;
+	}
+	obuf = _hash_getbuf(rel, oblkno, HASH_WRITE);
+	opage = BufferGetPage(obuf);
+	_hash_checkpage(opage, LH_OVERFLOW_PAGE);
+	if (PageIsEmpty(opage)) {
+	    elog(WARN, "_hash_splitpage: empty overflow page %d", oblkno);
+	}
+	oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
+    }
+
+    /*
+     * we are now guaranteed that 'opage' is not empty.  partition the
+     * tuples in the old bucket between the old bucket and the new
+     * bucket, advancing along their respective overflow bucket chains
+     * and adding overflow pages as needed.
+     */
+    ooffnum = FirstOffsetNumber;
+    omaxoffnum = PageGetMaxOffsetNumber(opage); 
+    for (;;) {
+	/*
+	 * at each iteration through this loop, each of these variables
+	 * should be up-to-date: obuf opage oopaque ooffnum omaxoffnum
+	 */
+
+	/* check if we're at the end of the page */
+	if (ooffnum > omaxoffnum) {
+	    /* at end of page, but check for overflow page */
+	    oblkno = oopaque->hasho_nextblkno;		
+	    if (BlockNumberIsValid(oblkno)) {
+		/*
+		 * we ran out of tuples on this particular page, but
+		 * we have more overflow pages; re-init values.
+		 */
+		_hash_wrtbuf(rel, obuf);
+		obuf = _hash_getbuf(rel, oblkno, HASH_WRITE);
+		opage = BufferGetPage(obuf);
+		_hash_checkpage(opage, LH_OVERFLOW_PAGE);
+		oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
+		
+		/* we're guaranteed that an ovfl page has at least 1 tuple */
+		if (PageIsEmpty(opage)) {
+		    elog(WARN, "_hash_splitpage: empty ovfl page %d!",
+			 oblkno);
+		}
+		ooffnum = FirstOffsetNumber;
+		omaxoffnum = PageGetMaxOffsetNumber(opage);
+	    } else {
+		/*
+		 * we're at the end of the bucket chain, so now we're
+		 * really done with everything.  before quitting, call
+		 * _hash_squeezebucket to ensure the tuples in the
+		 * bucket (including the overflow pages) are packed as
+		 * tightly as possible.
+		 */
+		_hash_wrtbuf(rel, obuf);
+		_hash_wrtbuf(rel, nbuf);
+		_hash_squeezebucket(rel, metap, obucket);
+		return;
+	    }
+	}
+	
+	/* hash on the tuple */
+	hitem = (HashItem) PageGetItem(opage, PageGetItemId(opage, ooffnum));
+	itup = &(hitem->hash_itup);
+	itupdesc = RelationGetTupleDescriptor(rel);
+	datum = index_getattr(itup, 1, itupdesc, &null);
+	bucket = _hash_call(rel, metap, datum);
+	
+	if (bucket == nbucket) {
+	    /*
+	     * insert the tuple into the new bucket.  if it doesn't
+	     * fit on the current page in the new bucket, we must
+	     * allocate a new overflow page and place the tuple on
+	     * that page instead.
+	     */
+	    itemsz = IndexTupleDSize(hitem->hash_itup) 
+		+ (sizeof(HashItemData) - sizeof(IndexTupleData));
+
+	    itemsz = DOUBLEALIGN(itemsz);
+	    
+	    if (PageGetFreeSpace(npage) < itemsz) {
+		ovflbuf = _hash_addovflpage(rel, &metabuf, nbuf);
+		_hash_wrtbuf(rel, nbuf);
+		nbuf = ovflbuf;
+		npage = BufferGetPage(nbuf);
+		_hash_checkpage(npage, LH_BUCKET_PAGE|LH_OVERFLOW_PAGE);
+	    }
+	    
+	    noffnum = OffsetNumberNext(PageGetMaxOffsetNumber(npage));
+	    (void) PageAddItem(npage, (Item) hitem, itemsz, noffnum, LP_USED);
+	    _hash_wrtnorelbuf(rel, nbuf);
+	    
+	    /*
+	     * now delete the tuple from the old bucket.  after this
+	     * section of code, 'ooffnum' will actually point to the
+	     * ItemId to which we would point if we had advanced it
+	     * before the deletion (PageIndexTupleDelete repacks the
+	     * ItemId array).  this also means that 'omaxoffnum' is
+	     * exactly one less than it used to be, so we really can
+	     * just decrement it instead of calling
+	     * PageGetMaxOffsetNumber.
+	     */
+	    PageIndexTupleDelete(opage, ooffnum);
+	    _hash_wrtnorelbuf(rel, obuf);
+	    omaxoffnum = OffsetNumberPrev(omaxoffnum);
+	    
+	    /*
+	     * tidy up.  if the old page was an overflow page and it
+	     * is now empty, we must free it (we want to preserve the
+	     * invariant that overflow pages cannot be empty).
+	     */
+	    if (PageIsEmpty(opage) &&
+		(oopaque->hasho_flag & LH_OVERFLOW_PAGE)) {
+		obuf = _hash_freeovflpage(rel, obuf);
+		
+		/* check that we're not through the bucket chain */
+		if (BufferIsInvalid(obuf)) {
+		    _hash_wrtbuf(rel, nbuf);
+		    _hash_squeezebucket(rel, metap, obucket);
+		    return;
+		}
+		
+		/* 
+		 * re-init. again, we're guaranteed that an ovfl page
+		 * has at least one tuple.
+		 */
+		opage = BufferGetPage(obuf);
+		_hash_checkpage(opage, LH_OVERFLOW_PAGE);
+		oblkno = BufferGetBlockNumber(obuf);
+		oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
+		if (PageIsEmpty(opage)) {
+		    elog(WARN, "_hash_splitpage: empty overflow page %d",
+			 oblkno);
+		}
+		ooffnum = FirstOffsetNumber;
+		omaxoffnum = PageGetMaxOffsetNumber(opage);
+	    }
+	} else {
+	    /*
+	     * the tuple stays on this page.  we didn't move anything,
+	     * so we didn't delete anything and therefore we don't
+	     * have to change 'omaxoffnum'.
+	     *
+	     * XXX any hash value from [0, nbucket-1] will map to this
+	     * bucket, which doesn't make sense to me.
+	     */
+	    ooffnum = OffsetNumberNext(ooffnum);
+	}
+    }
+    /*NOTREACHED*/
+}
--- a/src/backend/access/hash/hashscan.c
+++ b/src/backend/access/hash/hashscan.c
@@ -0,0 +1,172 @@
+/*-------------------------------------------------------------------------
+ *
+ * hashscan.c--
+ *    manage scans on hash tables
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *    $Header: /cvsroot/pgsql/src/backend/access/hash/hashscan.c,v 1.1.1.1 1996/07/09 06:21:10 scrappy Exp $
+ *
+ * NOTES
+ *    Because we can be doing an index scan on a relation while we
+ *    update it, we need to avoid missing data that moves around in
+ *    the index.  The routines and global variables in this file
+ *    guarantee that all scans in the local address space stay
+ *    correctly positioned.  This is all we need to worry about, since
+ *    write locking guarantees that no one else will be on the same
+ *    page at the same time as we are.
+ *
+ *    The scheme is to manage a list of active scans in the current
+ *    backend.  Whenever we add or remove records from an index, we
+ *    check the list of active scans to see if any has been affected.
+ *    A scan is affected only if it is on the same relation, and the
+ *    same page, as the update.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+
+#include "utils/elog.h"
+#include "utils/palloc.h"
+#include "utils/rel.h"
+#include "utils/excid.h"
+
+#include "access/heapam.h"
+#include "access/genam.h"
+#include "access/sdir.h"
+#include "access/hash.h"
+
+static void _hash_scandel(IndexScanDesc scan, BlockNumber blkno, OffsetNumber offno);
+static bool _hash_scantouched(IndexScanDesc scan, BlockNumber blkno, OffsetNumber offno);
+
+typedef struct HashScanListData {
+    IndexScanDesc		hashsl_scan;
+    struct HashScanListData	*hashsl_next;
+} HashScanListData;
+
+typedef HashScanListData	*HashScanList;
+
+static HashScanList	HashScans = (HashScanList) NULL;
+
+/*
+ *  _Hash_regscan() -- register a new scan.
+ */
+void
+_hash_regscan(IndexScanDesc scan)
+{
+    HashScanList new_el;
+    
+    new_el = (HashScanList) palloc(sizeof(HashScanListData));
+    new_el->hashsl_scan = scan;
+    new_el->hashsl_next = HashScans;
+    HashScans = new_el;
+}
+
+/*
+ *  _hash_dropscan() -- drop a scan from the scan list
+ */
+void
+_hash_dropscan(IndexScanDesc scan)
+{
+    HashScanList chk, last;
+    
+    last = (HashScanList) NULL;
+    for (chk = HashScans;
+	 chk != (HashScanList) NULL && chk->hashsl_scan != scan;
+	 chk = chk->hashsl_next) {
+	last = chk;
+    }
+    
+    if (chk == (HashScanList) NULL)
+	elog(WARN, "hash scan list trashed; can't find 0x%lx", scan);
+    
+    if (last == (HashScanList) NULL)
+	HashScans = chk->hashsl_next;
+    else
+	last->hashsl_next = chk->hashsl_next;
+    
+#ifdef PERFECT_MEM
+    pfree (chk);
+#endif /* PERFECT_MEM */
+}
+
+void
+_hash_adjscans(Relation rel, ItemPointer tid)
+{
+    HashScanList l;
+    Oid relid;
+    
+    relid = rel->rd_id;
+    for (l = HashScans; l != (HashScanList) NULL; l = l->hashsl_next) {
+	if (relid == l->hashsl_scan->relation->rd_id)
+	    _hash_scandel(l->hashsl_scan, ItemPointerGetBlockNumber(tid),
+			  ItemPointerGetOffsetNumber(tid));
+    }
+}
+
+static void
+_hash_scandel(IndexScanDesc scan, BlockNumber blkno, OffsetNumber offno)
+{
+    ItemPointer current;
+    Buffer buf;
+    Buffer metabuf;
+    HashScanOpaque so;
+    
+    if (!_hash_scantouched(scan, blkno, offno))
+	return;
+    
+    metabuf = _hash_getbuf(scan->relation, HASH_METAPAGE, HASH_READ);
+    
+    so = (HashScanOpaque) scan->opaque;
+    buf = so->hashso_curbuf;
+    
+    current = &(scan->currentItemData);
+    if (ItemPointerIsValid(current)
+	&& ItemPointerGetBlockNumber(current) == blkno
+	&& ItemPointerGetOffsetNumber(current) >= offno) {
+	_hash_step(scan, &buf, BackwardScanDirection, metabuf);
+	so->hashso_curbuf = buf;
+    }
+    
+    current = &(scan->currentMarkData);
+    if (ItemPointerIsValid(current)
+	&& ItemPointerGetBlockNumber(current) == blkno
+	&& ItemPointerGetOffsetNumber(current) >= offno) {
+	ItemPointerData tmp;
+	tmp = *current;
+	*current = scan->currentItemData;
+	scan->currentItemData = tmp;
+	_hash_step(scan, &buf, BackwardScanDirection, metabuf);
+	so->hashso_mrkbuf = buf;
+	tmp = *current;
+	*current = scan->currentItemData;
+	scan->currentItemData = tmp;
+    }
+}
+
+static bool
+_hash_scantouched(IndexScanDesc scan,
+		  BlockNumber blkno,
+		  OffsetNumber offno)
+{
+    ItemPointer current;
+    
+    current = &(scan->currentItemData);
+    if (ItemPointerIsValid(current)
+	&& ItemPointerGetBlockNumber(current) == blkno
+	&& ItemPointerGetOffsetNumber(current) >= offno)
+	return (true);
+    
+    current = &(scan->currentMarkData);
+    if (ItemPointerIsValid(current)
+	&& ItemPointerGetBlockNumber(current) == blkno
+	&& ItemPointerGetOffsetNumber(current) >= offno)
+	return (true);
+    
+    return (false);
+}
--- a/src/backend/access/hash/hashsearch.c
+++ b/src/backend/access/hash/hashsearch.c
@@ -0,0 +1,425 @@
+/*-------------------------------------------------------------------------
+ *
+ * hashsearch.c--
+ *    search code for postgres hash tables
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *    $Header: /cvsroot/pgsql/src/backend/access/hash/hashsearch.c,v 1.1.1.1 1996/07/09 06:21:10 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+
+#include "utils/elog.h"
+#include "utils/palloc.h"
+#include "utils/rel.h"
+#include "utils/excid.h"
+
+#include "fmgr.h"
+
+#include "access/heapam.h"
+#include "access/genam.h"
+#include "access/skey.h"
+#include "access/sdir.h"
+#include "access/hash.h"
+
+/*
+ *  _hash_search() -- Finds the page/bucket that the contains the
+ *  scankey and loads it into *bufP.  the buffer has a read lock.
+ */
+void
+_hash_search(Relation rel,
+	     int keysz,
+	     ScanKey scankey,
+	     Buffer *bufP,
+	     HashMetaPage metap)
+{
+    BlockNumber blkno;
+    Datum keyDatum;
+    Bucket bucket;
+
+    if (scankey == (ScanKey) NULL ||
+	(keyDatum = scankey[0].sk_argument) == (Datum) NULL) {
+	/* 
+	 * If the scankey argument is NULL, all tuples will satisfy
+	 * the scan so we start the scan at the first bucket (bucket
+	 * 0).
+	 */
+	bucket = 0;
+    } else {
+	bucket = _hash_call(rel, metap, keyDatum);
+    }
+
+    blkno = BUCKET_TO_BLKNO(bucket);
+    
+    *bufP = _hash_getbuf(rel, blkno, HASH_READ);
+}
+
+/*
+ *  _hash_next() -- Get the next item in a scan.
+ *
+ *	On entry, we have a valid currentItemData in the scan, and a
+ *	read lock on the page that contains that item.  We do not have
+ *	the page pinned.  We return the next item in the scan.  On
+ *	exit, we have the page containing the next item locked but not
+ *	pinned.
+ */
+RetrieveIndexResult
+_hash_next(IndexScanDesc scan, ScanDirection dir)
+{
+    Relation rel;
+    Buffer buf;
+    Buffer metabuf;
+    Page page;
+    OffsetNumber offnum;
+    RetrieveIndexResult res;
+    ItemPointer current;
+    ItemPointer iptr;
+    HashItem hitem;
+    IndexTuple itup;
+    HashScanOpaque so;
+
+    rel = scan->relation;
+    so = (HashScanOpaque) scan->opaque; 
+    current = &(scan->currentItemData);
+
+    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ);
+
+    /*
+     *  XXX 10 may 91:  somewhere there's a bug in our management of the
+     *  cached buffer for this scan.  wei discovered it.  the following
+     *  is a workaround so he can work until i figure out what's going on.
+     */
+
+    if (!BufferIsValid(so->hashso_curbuf)) {
+	so->hashso_curbuf = _hash_getbuf(rel,
+					 ItemPointerGetBlockNumber(current),
+					 HASH_READ);
+    }
+
+    /* we still have the buffer pinned and locked */
+    buf = so->hashso_curbuf;
+
+    /*
+     * step to next valid tuple.  note that _hash_step releases our
+     * lock on 'metabuf'; if we switch to a new 'buf' while looking
+     * for the next tuple, we come back with a lock on that buffer.
+     */
+    if (!_hash_step(scan, &buf, dir, metabuf)) {
+	return ((RetrieveIndexResult) NULL);
+    }
+
+    /* if we're here, _hash_step found a valid tuple */
+    current = &(scan->currentItemData);
+    offnum = ItemPointerGetOffsetNumber(current);
+    page = BufferGetPage(buf);
+    _hash_checkpage(page, LH_BUCKET_PAGE|LH_OVERFLOW_PAGE);
+    hitem = (HashItem) PageGetItem(page, PageGetItemId(page, offnum));
+    itup = &hitem->hash_itup;
+    iptr = (ItemPointer) palloc(sizeof(ItemPointerData));
+    memmove((char *) iptr, (char *) &(itup->t_tid),  sizeof(ItemPointerData));
+    res = FormRetrieveIndexResult(current, iptr);
+
+    return (res);
+}
+
+static void
+_hash_readnext(Relation rel,
+	       Buffer *bufp, Page *pagep, HashPageOpaque *opaquep)
+{
+    BlockNumber blkno;
+
+    blkno = (*opaquep)->hasho_nextblkno;
+    _hash_relbuf(rel, *bufp, HASH_READ);
+    *bufp = InvalidBuffer;
+    if (BlockNumberIsValid(blkno)) {
+	*bufp = _hash_getbuf(rel, blkno, HASH_READ);
+	*pagep = BufferGetPage(*bufp);
+	_hash_checkpage(*pagep, LH_OVERFLOW_PAGE);
+	*opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);
+	Assert(!PageIsEmpty(*pagep));
+    }
+}
+
+static void
+_hash_readprev(Relation rel,
+	       Buffer *bufp, Page *pagep, HashPageOpaque *opaquep)
+{
+    BlockNumber blkno;
+
+    blkno = (*opaquep)->hasho_prevblkno;
+    _hash_relbuf(rel, *bufp, HASH_READ);
+    *bufp = InvalidBuffer;
+    if (BlockNumberIsValid(blkno)) {
+	*bufp = _hash_getbuf(rel, blkno, HASH_READ);
+	*pagep = BufferGetPage(*bufp);
+	_hash_checkpage(*pagep, LH_BUCKET_PAGE|LH_OVERFLOW_PAGE);
+	*opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep);
+	if (PageIsEmpty(*pagep)) {
+	    Assert((*opaquep)->hasho_flag & LH_BUCKET_PAGE);
+	    _hash_relbuf(rel, *bufp, HASH_READ);
+	    *bufp = InvalidBuffer;
+	}
+    }
+}
+
+/*
+ *  _hash_first() -- Find the first item in a scan.
+ *
+ *	Return the RetrieveIndexResult of the first item in the tree that
+ *	satisfies the qualificatin associated with the scan descriptor. On
+ * 	exit, the page containing the current index tuple is read locked
+ * 	and pinned, and the scan's opaque data entry is updated to 
+ *	include the buffer.  
+ */
+RetrieveIndexResult
+_hash_first(IndexScanDesc scan, ScanDirection dir)
+{
+    Relation rel;
+    Buffer buf;
+    Buffer metabuf;
+    Page page;
+    HashPageOpaque opaque;
+    HashMetaPage metap;
+    HashItem hitem;
+    IndexTuple itup;
+    ItemPointer current;
+    ItemPointer iptr;
+    OffsetNumber offnum;
+    RetrieveIndexResult res;
+    HashScanOpaque so;
+
+    rel = scan->relation;
+    so = (HashScanOpaque) scan->opaque;
+    current = &(scan->currentItemData);
+
+    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ);
+    metap = (HashMetaPage) BufferGetPage(metabuf);
+    _hash_checkpage((Page) metap, LH_META_PAGE);
+
+    /*
+     *  XXX -- The attribute number stored in the scan key is the attno
+     *	       in the heap relation.  We need to transmogrify this into
+     *         the index relation attno here.  For the moment, we have
+     *	       hardwired attno == 1.
+     */
+
+    /* find the correct bucket page and load it into buf */
+    _hash_search(rel, 1, scan->keyData, &buf, metap);
+    page = BufferGetPage(buf);
+    _hash_checkpage(page, LH_BUCKET_PAGE);
+    opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+
+    /*
+     * if we are scanning forward, we need to find the first non-empty
+     * page (if any) in the bucket chain.  since overflow pages are
+     * never empty, this had better be either the bucket page or the
+     * first overflow page.
+     *
+     * if we are scanning backward, we always go all the way to the
+     * end of the bucket chain.
+     */
+    if (PageIsEmpty(page)) {
+	if (BlockNumberIsValid(opaque->hasho_nextblkno)) {
+	    _hash_readnext(rel, &buf, &page, &opaque);
+	} else {
+	    ItemPointerSetInvalid(current);
+	    so->hashso_curbuf = InvalidBuffer;
+	    return ((RetrieveIndexResult) NULL);
+	}
+    }
+    if (ScanDirectionIsBackward(dir)) {
+	while (BlockNumberIsValid(opaque->hasho_nextblkno)) {
+	    _hash_readnext(rel, &buf, &page, &opaque);
+	}
+    }
+
+    if (!_hash_step(scan, &buf, dir, metabuf)) {
+	return ((RetrieveIndexResult) NULL);
+    }
+
+    /* if we're here, _hash_step found a valid tuple */
+    current = &(scan->currentItemData);
+    offnum = ItemPointerGetOffsetNumber(current);
+    page = BufferGetPage(buf);
+    _hash_checkpage(page, LH_BUCKET_PAGE|LH_OVERFLOW_PAGE);
+    hitem = (HashItem) PageGetItem(page, PageGetItemId(page, offnum));
+    itup = &hitem->hash_itup;
+    iptr = (ItemPointer) palloc(sizeof(ItemPointerData));
+    memmove((char *) iptr, (char *) &(itup->t_tid), sizeof(ItemPointerData));
+    res = FormRetrieveIndexResult(current, iptr);
+
+    return (res);
+}
+
+/*
+ *  _hash_step() -- step to the next valid item in a scan in the bucket.
+ *
+ *	If no valid record exists in the requested direction, return
+ *	false.  Else, return true and set the CurrentItemData for the
+ *	scan to the right thing.
+ * 
+ *	'bufP' points to the buffer which contains the current page
+ *	that we'll step through.
+ *
+ *	'metabuf' is released when this returns.
+ */
+bool
+_hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir, Buffer metabuf)
+{
+    Relation rel;
+    ItemPointer current;
+    HashScanOpaque so;
+    int allbuckets;
+    HashMetaPage metap;
+    Buffer buf;
+    Page page;
+    HashPageOpaque opaque;
+    OffsetNumber maxoff;
+    OffsetNumber offnum;
+    Bucket bucket;
+    BlockNumber blkno;
+    HashItem hitem;
+    IndexTuple itup;
+
+    rel = scan->relation;
+    current = &(scan->currentItemData);
+    so = (HashScanOpaque) scan->opaque;
+    allbuckets = (scan->numberOfKeys < 1);
+
+    metap = (HashMetaPage) BufferGetPage(metabuf);
+    _hash_checkpage((Page) metap, LH_META_PAGE);
+
+    buf = *bufP;
+    page = BufferGetPage(buf);
+    _hash_checkpage(page, LH_BUCKET_PAGE|LH_OVERFLOW_PAGE);
+    opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+
+    /*
+     * If _hash_step is called from _hash_first, current will not be
+     * valid, so we can't dereference it.  However, in that case, we
+     * presumably want to start at the beginning/end of the page...
+     */
+    maxoff = PageGetMaxOffsetNumber(page);
+    if (ItemPointerIsValid(current)) {
+	offnum = ItemPointerGetOffsetNumber(current);
+    } else {
+	offnum = InvalidOffsetNumber;
+    }
+
+    /*
+     * 'offnum' now points to the last tuple we have seen (if any).
+     *
+     * continue to step through tuples until:
+     * 	  1) we get to the end of the bucket chain or
+     * 	  2) we find a valid tuple.
+     */
+    do {
+	bucket = opaque->hasho_bucket;
+
+	switch (dir) {
+	case ForwardScanDirection:
+	    if (offnum != InvalidOffsetNumber) {
+		offnum = OffsetNumberNext(offnum);	/* move forward */
+	    } else {
+		offnum = FirstOffsetNumber;		/* new page */
+	    }
+	    while (offnum > maxoff) {
+		/*
+		 * either this page is empty (maxoff ==
+		 * InvalidOffsetNumber) or we ran off the end.
+		 */
+		_hash_readnext(rel, &buf, &page, &opaque);
+		if (BufferIsInvalid(buf)) {	/* end of chain */
+		    if (allbuckets && bucket < metap->hashm_maxbucket) {
+			++bucket;
+			blkno = BUCKET_TO_BLKNO(bucket);
+			buf = _hash_getbuf(rel, blkno, HASH_READ);
+			page = BufferGetPage(buf);
+			_hash_checkpage(page, LH_BUCKET_PAGE);
+			opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+			Assert(opaque->hasho_bucket == bucket);
+			while (PageIsEmpty(page) &&
+			       BlockNumberIsValid(opaque->hasho_nextblkno)) {
+			    _hash_readnext(rel, &buf, &page, &opaque);
+			}
+			maxoff = PageGetMaxOffsetNumber(page);
+			offnum = FirstOffsetNumber;
+		    } else {
+			maxoff = offnum = InvalidOffsetNumber;
+			break;	/* while */
+		    }
+		} else {
+		    /* _hash_readnext never returns an empty page */
+		    maxoff = PageGetMaxOffsetNumber(page);
+		    offnum = FirstOffsetNumber;
+		}
+	    }
+	    break;
+	case BackwardScanDirection:
+	    if (offnum != InvalidOffsetNumber) {
+		offnum = OffsetNumberPrev(offnum);	/* move back */
+	    } else {
+		offnum = maxoff;			/* new page */
+	    }
+	    while (offnum < FirstOffsetNumber) {
+		/*
+		 * either this page is empty (offnum ==
+		 * InvalidOffsetNumber) or we ran off the end.
+		 */
+		_hash_readprev(rel, &buf, &page, &opaque);
+		if (BufferIsInvalid(buf)) {	/* end of chain */
+		    if (allbuckets && bucket > 0) {
+			--bucket;
+			blkno = BUCKET_TO_BLKNO(bucket);
+			buf = _hash_getbuf(rel, blkno, HASH_READ);
+			page = BufferGetPage(buf);
+			_hash_checkpage(page, LH_BUCKET_PAGE);
+			opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+			Assert(opaque->hasho_bucket == bucket);
+			while (BlockNumberIsValid(opaque->hasho_nextblkno)) {
+			    _hash_readnext(rel, &buf, &page, &opaque);
+			}
+			maxoff = offnum = PageGetMaxOffsetNumber(page);
+		    } else {
+			maxoff = offnum = InvalidOffsetNumber;
+			break;	/* while */
+		    }
+		} else {
+		    /* _hash_readprev never returns an empty page */
+		    maxoff = offnum = PageGetMaxOffsetNumber(page);
+		}
+	    }
+	    break;
+	default:
+	    /* NoMovementScanDirection */
+	    /* this should not be reached */
+	    break;
+	}
+
+	/* we ran off the end of the world without finding a match */
+	if (offnum == InvalidOffsetNumber) {
+	    _hash_relbuf(rel, metabuf, HASH_READ);
+	    *bufP = so->hashso_curbuf = InvalidBuffer;
+	    ItemPointerSetInvalid(current);
+	    return(false);
+	}
+	
+	/* get ready to check this tuple */
+	hitem = (HashItem) PageGetItem(page, PageGetItemId(page, offnum));
+	itup = &hitem->hash_itup;
+    } while (!_hash_checkqual(scan, itup));
+   
+    /* if we made it to here, we've found a valid tuple */
+    _hash_relbuf(rel, metabuf, HASH_READ);
+    blkno = BufferGetBlockNumber(buf);
+    *bufP = so->hashso_curbuf = buf;
+    ItemPointerSet(current, blkno, offnum);
+    return(true);
+}
--- a/src/backend/access/hash/hashstrat.c
+++ b/src/backend/access/hash/hashstrat.c
@@ -0,0 +1,104 @@
+/*-------------------------------------------------------------------------
+ *
+ * btstrat.c--
+ *    Srategy map entries for the btree indexed access method
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *    $Header: /cvsroot/pgsql/src/backend/access/hash/Attic/hashstrat.c,v 1.1.1.1 1996/07/09 06:21:10 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/bufpage.h"
+
+#include "utils/elog.h"
+#include "utils/rel.h"
+#include "utils/excid.h"
+
+#include "access/genam.h"
+#include "access/hash.h"
+
+/* 
+ *  only one valid strategy for hash tables: equality. 
+ */
+
+static StrategyNumber	HTNegate[1] = {
+    InvalidStrategy
+};
+
+static StrategyNumber	HTCommute[1] = {
+    HTEqualStrategyNumber
+};
+
+static StrategyNumber	HTNegateCommute[1] = {
+    InvalidStrategy
+};
+
+static StrategyEvaluationData	HTEvaluationData = {
+    /* XXX static for simplicity */
+
+    HTMaxStrategyNumber,
+    (StrategyTransformMap)HTNegate,
+    (StrategyTransformMap)HTCommute,
+    (StrategyTransformMap)HTNegateCommute,
+    {NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL}
+};
+
+/* ----------------------------------------------------------------
+ *	RelationGetHashStrategy
+ * ----------------------------------------------------------------
+ */
+
+StrategyNumber
+_hash_getstrat(Relation rel,
+	       AttrNumber attno,
+	       RegProcedure proc)
+{
+    StrategyNumber	strat;
+
+    strat = RelationGetStrategy(rel, attno, &HTEvaluationData, proc);
+
+    Assert(StrategyNumberIsValid(strat));
+
+    return (strat);
+}
+
+bool
+_hash_invokestrat(Relation rel,
+		  AttrNumber attno,
+		  StrategyNumber strat,
+		  Datum left,
+		  Datum right)
+{
+    return (RelationInvokeStrategy(rel, &HTEvaluationData, attno, strat, 
+				   left, right));
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
--- a/src/backend/access/hash/hashutil.c
+++ b/src/backend/access/hash/hashutil.c
@@ -0,0 +1,147 @@
+/*-------------------------------------------------------------------------
+ *
+ * btutils.c--
+ *    Utility code for Postgres btree implementation.
+ *
+ * Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *    $Header: /cvsroot/pgsql/src/backend/access/hash/hashutil.c,v 1.1.1.1 1996/07/09 06:21:10 scrappy Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/bufmgr.h"
+#include "storage/bufpage.h"
+
+#include "fmgr.h"
+#include "utils/elog.h"
+#include "utils/palloc.h"
+#include "utils/rel.h"
+#include "utils/excid.h"
+
+#include "access/heapam.h"
+#include "access/genam.h"
+#include "access/iqual.h"
+#include "access/hash.h"
+
+ScanKey
+_hash_mkscankey(Relation rel, IndexTuple itup, HashMetaPage metap)
+{
+    ScanKey skey;
+    TupleDesc itupdesc;
+    int natts;
+    AttrNumber i;
+    Datum arg;
+    RegProcedure proc;
+    bool null;
+    
+    natts = rel->rd_rel->relnatts;
+    itupdesc = RelationGetTupleDescriptor(rel);
+    
+    skey = (ScanKey) palloc(natts * sizeof(ScanKeyData));
+    
+    for (i = 0; i < natts; i++) {
+	arg = index_getattr(itup, i + 1, itupdesc, &null);
+	proc = metap->hashm_procid;
+	ScanKeyEntryInitialize(&skey[i],
+			       0x0, (AttrNumber) (i + 1), proc, arg);
+    }
+    
+    return (skey);
+}	
+
+void
+_hash_freeskey(ScanKey skey)
+{
+    pfree(skey);
+}
+
+
+bool
+_hash_checkqual(IndexScanDesc scan, IndexTuple itup)
+{
+    if (scan->numberOfKeys > 0)
+	return (index_keytest(itup, 
+			      RelationGetTupleDescriptor(scan->relation),
+			      scan->numberOfKeys, scan->keyData));
+    else
+	return (true);
+}
+
+HashItem
+_hash_formitem(IndexTuple itup)
+{
+    int nbytes_hitem;
+    HashItem hitem;
+    Size tuplen;
+    
+    /* disallow nulls in hash keys */
+    if (itup->t_info & INDEX_NULL_MASK)
+	elog(WARN, "hash indices cannot include null keys");
+    
+    /* make a copy of the index tuple with room for the sequence number */
+    tuplen = IndexTupleSize(itup);
+    nbytes_hitem = tuplen +
+	(sizeof(HashItemData) - sizeof(IndexTupleData));
+    
+    hitem = (HashItem) palloc(nbytes_hitem);
+    memmove((char *) &(hitem->hash_itup), (char *) itup, tuplen);
+    
+    return (hitem);
+}
+
+Bucket
+_hash_call(Relation rel, HashMetaPage metap, Datum key)
+{
+    uint32 n;
+    Bucket bucket;
+    RegProcedure proc;
+    
+    proc = metap->hashm_procid;
+    n = (uint32) fmgr(proc, key);
+    bucket = n & metap->hashm_highmask;
+    if (bucket > metap->hashm_maxbucket)
+	bucket = bucket & metap->hashm_lowmask;
+    return (bucket);
+}
+
+/*
+ * _hash_log2 -- returns ceil(lg2(num))
+ */
+uint32
+_hash_log2(uint32 num)
+{
+    uint32 i, limit;
+    
+    limit = 1;
+    for (i = 0; limit < num; limit = limit << 1, i++)
+	;
+    return (i);
+}
+
+/*
+ * _hash_checkpage -- sanity checks on the format of all hash pages
+ */
+void
+_hash_checkpage(Page page, int flags)
+{
+    PageHeader ph = (PageHeader) page;
+    HashPageOpaque opaque;
+
+    Assert(page);
+    Assert(ph->pd_lower >= (sizeof(PageHeaderData) - sizeof(ItemIdData)));
+#if 1
+    Assert(ph->pd_upper <=
+	   (BLCKSZ - DOUBLEALIGN(sizeof(HashPageOpaqueData))));
+    Assert(ph->pd_special ==
+	   (BLCKSZ - DOUBLEALIGN(sizeof(HashPageOpaqueData))));
+    Assert(ph->pd_opaque.od_pagesize == BLCKSZ);
+#endif
+    if (flags) {
+	opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+	Assert(opaque->hasho_flag & flags);
+    }
+}