
Postgres95 1.01 Distribution - Virgin Sources

Author: Marc G. Fournier
Date: 1996-07-09 06:22:35 +00:00
commit d31084e9d1
868 changed files with 242656 additions and 0 deletions

src/backend/storage/Makefile.inc

@@ -0,0 +1,31 @@
#-------------------------------------------------------------------------
#
# Makefile.inc--
# Makefile for the storage modules
#
# Copyright (c) 1994, Regents of the University of California
#
#
# IDENTIFICATION
# $Header: /cvsroot/pgsql/src/backend/storage/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $
#
#-------------------------------------------------------------------------
stordir= $(CURDIR)/storage
VPATH:= $(VPATH):$(stordir):$(stordir)/buffer:$(stordir)/file:$(stordir)/ipc:\
$(stordir)/large_object:$(stordir)/lmgr:$(stordir)/page:$(stordir)/smgr
SUBSRCS=
include $(stordir)/buffer/Makefile.inc
include $(stordir)/file/Makefile.inc
include $(stordir)/ipc/Makefile.inc
include $(stordir)/large_object/Makefile.inc
include $(stordir)/lmgr/Makefile.inc
include $(stordir)/page/Makefile.inc
include $(stordir)/smgr/Makefile.inc
SRCS_STORAGE:= $(SUBSRCS)
HEADERS+= backendid.h block.h buf.h buf_internals.h bufmgr.h bufpage.h \
fd.h ipc.h item.h itemid.h itempos.h \
itemptr.h large_object.h lmgr.h lock.h multilev.h off.h page.h \
pagenum.h pos.h proc.h shmem.h sinval.h sinvaladt.h smgr.h spin.h

src/backend/storage/backendid.h

@@ -0,0 +1,32 @@
/*-------------------------------------------------------------------------
*
* backendid.h--
* POSTGRES backend id communication definitions
*
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: backendid.h,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef BACKENDID_H
#define BACKENDID_H
/* ----------------
* pulled out of sinval.h to temporarily reduce #include nesting.
* -cim 8/17/90
* ----------------
*/
typedef int16 BackendId; /* unique currently active backend identifier */
#define InvalidBackendId (-1)
typedef int32 BackendTag; /* unique backend identifier */
#define InvalidBackendTag (-1)
extern BackendId MyBackendId; /* backend id of this backend */
extern BackendTag MyBackendTag; /* backend tag of this backend */
#endif /* BACKENDID_H */

src/backend/storage/block.h

@@ -0,0 +1,114 @@
/*-------------------------------------------------------------------------
*
* block.h--
* POSTGRES disk block definitions.
*
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: block.h,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef BLOCK_H
#define BLOCK_H
#include "c.h"
/*
* BlockNumber:
*
* each data file (heap or index) is divided into postgres disk blocks
* (which may be thought of as the unit of i/o -- a postgres buffer
* contains exactly one disk block). the blocks are numbered
* sequentially, 0 to 0xFFFFFFFE.
*
* InvalidBlockNumber is the same thing as P_NEW in bufmgr.h.
*
* the access methods, the buffer manager and the storage manager are
* more or less the only pieces of code that should be accessing disk
* blocks directly.
*/
typedef uint32 BlockNumber;
#define InvalidBlockNumber ((BlockNumber) 0xFFFFFFFF)
/*
* BlockId:
*
* this is a storage type for BlockNumber. in other words, this type
* is used for on-disk structures (e.g., in HeapTupleData) whereas
* BlockNumber is the type on which calculations are performed (e.g.,
* in access method code).
*
* there doesn't appear to be any reason to have separate types except
* for the fact that BlockIds can be SHORTALIGN'd (and therefore any
* structures that contain them, such as ItemPointerData, can also be
* SHORTALIGN'd). this is an important consideration for reducing the
* space requirements of the line pointer (ItemIdData) array on each
* page and the header of each heap or index tuple, so it doesn't seem
* wise to change this without good reason.
*/
typedef struct BlockIdData {
uint16 bi_hi;
uint16 bi_lo;
} BlockIdData;
typedef BlockIdData *BlockId; /* block identifier */
/* ----------------
* support macros
* ----------------
*/
/*
* BlockNumberIsValid --
* True iff blockNumber is valid.
*/
#define BlockNumberIsValid(blockNumber) \
((bool) ((int32) (blockNumber) != InvalidBlockNumber))
/*
* BlockIdIsValid --
* True iff the block identifier is valid.
*/
#define BlockIdIsValid(blockId) \
((bool) PointerIsValid(blockId))
/*
* BlockIdSet --
* Sets a block identifier to the specified value.
*/
#define BlockIdSet(blockId, blockNumber) \
{ \
	Assert(PointerIsValid(blockId)); \
	(blockId)->bi_hi = (blockNumber) >> 16; \
	(blockId)->bi_lo = (blockNumber) & 0xffff; \
}
/*
* BlockIdCopy --
* Copy a block identifier.
*/
#define BlockIdCopy(toBlockId, fromBlockId) \
{ \
	Assert(PointerIsValid(toBlockId)); \
	Assert(PointerIsValid(fromBlockId)); \
	(toBlockId)->bi_hi = (fromBlockId)->bi_hi; \
	(toBlockId)->bi_lo = (fromBlockId)->bi_lo; \
}
/*
* BlockIdEquals --
* Check for block number equality.
*/
#define BlockIdEquals(blockId1, blockId2) \
((blockId1)->bi_hi == (blockId2)->bi_hi && \
(blockId1)->bi_lo == (blockId2)->bi_lo)
/*
* BlockIdGetBlockNumber --
* Retrieve the block number from a block identifier.
*/
#define BlockIdGetBlockNumber(blockId) \
(AssertMacro(BlockIdIsValid(blockId)) ? \
(BlockNumber) (((blockId)->bi_hi << 16) | ((uint16) (blockId)->bi_lo)) : \
(BlockNumber) InvalidBlockNumber)
#endif /* BLOCK_H */
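
To make the round-trip concrete, here is a minimal standalone sketch; the typedefs stand in for the ones in c.h, and the two macros above are inlined:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef uint16_t uint16;    /* stand-ins for the typedefs in c.h */
typedef uint32_t uint32;

typedef struct BlockIdData {
    uint16 bi_hi;
    uint16 bi_lo;
} BlockIdData;

int main(void)
{
    uint32 blockNum = 0x0001FFFEu;  /* any value in 0..0xFFFFFFFE */
    BlockIdData id;

    /* BlockIdSet: split into the two SHORTALIGN'able halves */
    id.bi_hi = blockNum >> 16;
    id.bi_lo = blockNum & 0xffff;

    /* BlockIdGetBlockNumber: reassemble */
    uint32 out = ((uint32) id.bi_hi << 16) | id.bi_lo;

    assert(out == blockNum);
    printf("bi_hi=%u bi_lo=%u -> %u\n",
           (unsigned) id.bi_hi, (unsigned) id.bi_lo, (unsigned) out);
    return 0;
}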

src/backend/storage/buf.h

@@ -0,0 +1,47 @@
/*-------------------------------------------------------------------------
*
* buf.h--
* Basic buffer manager data types.
*
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: buf.h,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef BUF_H
#define BUF_H
#define InvalidBuffer (0)
#define UnknownBuffer (-99999)
typedef long Buffer;
/*
* BufferIsInvalid --
* True iff the buffer is invalid.
*/
#define BufferIsInvalid(buffer) ((buffer) == InvalidBuffer)
/*
* BufferIsUnknown --
* True iff the buffer is unknown.
*/
#define BufferIsUnknown(buffer) ((buffer) == UnknownBuffer)
/*
* BufferIsLocal --
* True iff the buffer is local (not visible to other servers).
*/
#define BufferIsLocal(buffer) ((buffer) < 0)
/*
* If NO_BUFFERISVALID is defined, all error checking using BufferIsValid()
* is suppressed. Decision-making using BufferIsValid is not affected.
* This should be set only if one is sure there will be no errors.
* - plai 9/10/90
*/
#undef NO_BUFFERISVALID
#endif /* BUF_H */
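
The sign conventions can be exercised in isolation. A minimal standalone sketch with the definitions above copied out (the sample values are arbitrary):

#include <stdio.h>

typedef long Buffer;

#define InvalidBuffer   (0)
#define BufferIsInvalid(buffer) ((buffer) == InvalidBuffer)
#define BufferIsLocal(buffer)   ((buffer) < 0)

int main(void)
{
    Buffer shared = 7;   /* shared buffers are numbered 1..NBuffers */
    Buffer local = -1;   /* local buffers are numbered -1, -2, ... */

    printf("shared: invalid=%d local=%d\n",
           BufferIsInvalid(shared), BufferIsLocal(shared));
    printf("local:  invalid=%d local=%d\n",
           BufferIsInvalid(local), BufferIsLocal(local));
    return 0;
}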

src/backend/storage/buf_internals.h

@@ -0,0 +1,220 @@
/*-------------------------------------------------------------------------
*
* buf_internals.h--
* Internal definitions.
*
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: buf_internals.h,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $
*
* NOTE
* If BUFFERPAGE0 is defined, then 0 will be used as a
* valid buffer page number.
*
*-------------------------------------------------------------------------
*/
#ifndef BUFMGR_INTERNALS_H
#define BUFMGR_INTERNALS_H
#include "postgres.h"
#include "storage/buf.h"
#include "storage/ipc.h"
#include "storage/shmem.h"
#include "miscadmin.h"
#include "storage/lmgr.h"
#include "utils/rel.h"
#include "utils/relcache.h"
/* Buf Mgr constants */
/* in bufmgr.c */
extern int NBuffers;
extern int Data_Descriptors;
extern int Free_List_Descriptor;
extern int Lookup_List_Descriptor;
extern int Num_Descriptors;
/*
* Flags for buffer descriptors
*/
#define BM_DIRTY (1 << 0)
#define BM_PRIVATE (1 << 1)
#define BM_VALID (1 << 2)
#define BM_DELETED (1 << 3)
#define BM_FREE (1 << 4)
#define BM_IO_IN_PROGRESS (1 << 5)
#define BM_IO_ERROR (1 << 6)
typedef bits16 BufFlags;
typedef struct sbufdesc BufferDesc;
typedef struct sbufdesc BufferHdr;
typedef struct buftag BufferTag;
/* long * so alignment will be correct */
typedef long **BufferBlock;
struct buftag{
LRelId relId;
BlockNumber blockNum; /* blknum relative to begin of reln */
};
#define CLEAR_BUFFERTAG(a) \
{ \
	(a)->relId.dbId = InvalidOid; \
	(a)->relId.relId = InvalidOid; \
	(a)->blockNum = InvalidBlockNumber; \
}
#define INIT_BUFFERTAG(a,xx_reln,xx_blockNum) \
{ \
(a)->blockNum = xx_blockNum;\
(a)->relId = RelationGetLRelId(xx_reln); \
}
#define COPY_BUFFERTAG(a,b)\
{ \
(a)->blockNum = (b)->blockNum;\
LRelIdAssign(*(a),*(b));\
}
#define EQUAL_BUFFERTAG(a,b) \
(((a)->blockNum == (b)->blockNum) &&\
(OID_Equal((a)->relId.relId,(b)->relId.relId)))
#define BAD_BUFFER_ID(bid) (((bid) < 1) || ((bid) > NBuffers))
#define INVALID_DESCRIPTOR (-3)
/*
* bletch hack -- anyplace that we declare space for relation or
* database names, we just use '16', not a symbolic constant, to
* specify their lengths. BM_NAMESIZE is the length of these names,
* and is used in the buffer manager code. somebody with lots of
* spare time should do this for all the other modules, too.
*/
#define BM_NAMESIZE 16
/*
* struct sbufdesc -- shared buffer cache metadata for a single
* shared buffer descriptor.
*
* We keep the name of the database and relation in which this
* buffer appears in order to avoid a catalog lookup on cache
* flush if we don't have the reldesc in the cache. It is also
* possible that the relation to which this buffer belongs is
* not visible to all backends at the time that it gets flushed.
* Dbname, relname, dbid, and relid are enough to determine where
* to put the buffer, for all storage managers.
*/
struct sbufdesc {
Buffer freeNext; /* link for freelist chain */
Buffer freePrev;
SHMEM_OFFSET data; /* pointer to data in buf pool */
/* tag and id must be together for table lookup to work */
BufferTag tag; /* file/block identifier */
int buf_id; /* maps global desc to local desc */
BufFlags flags; /* described below */
int16 bufsmgr; /* storage manager id for buffer */
unsigned refcount; /* # of times buffer is pinned */
char *sb_dbname; /* name of db in which buf belongs */
char *sb_relname; /* name of reln */
#ifdef HAS_TEST_AND_SET
/* can afford a dedicated lock if test-and-set locks are available */
slock_t io_in_progress_lock;
#endif /* HAS_TEST_AND_SET */
/*
* I padded this structure to a power of 2 (128 bytes on a MIPS) because
* BufferDescriptorGetBuffer is called a billion times and it does a
* C pointer subtraction (i.e., "x - y" -> array index of x relative
* to y, which is calculated using division by struct size). Integer
* ".div" hits you for 35 cycles, as opposed to a 1-cycle "sra" ...
* this hack cut 10% off of the time to create the Wisconsin database!
* It eats up more shared memory, of course, but we're (allegedly)
* going to make some of these types bigger soon anyway... -pma 1/2/93
*/
#if defined(PORTNAME_ultrix4)
char sb_pad[60]; /* no slock_t */
#endif /* mips */
#if defined(PORTNAME_sparc) || defined(PORTNAME_sparc_solaris) || defined(PORTNAME_irix5)
char sb_pad[56]; /* has slock_t */
#endif /* sparc || irix5 */
#if defined(PORTNAME_hpux)
char sb_pad[44]; /* has slock_t */
#endif /* hpux */
#if defined(PORTNAME_alpha)
char sb_pad[40]; /* has slock_t */
#endif /* alpha */
};
/*
* mao tracing buffer allocation
*/
/*#define BMTRACE*/
#ifdef BMTRACE
typedef struct _bmtrace {
int bmt_pid;
long bmt_buf;
long bmt_dbid;
long bmt_relid;
int bmt_blkno;
int bmt_op;
#define BMT_NOTUSED 0
#define BMT_ALLOCFND 1
#define BMT_ALLOCNOTFND 2
#define BMT_DEALLOC 3
} bmtrace;
#endif /* BMTRACE */
/*
* Bufmgr Interface:
*/
/* Internal routines: only called by buf.c */
/*freelist.c*/
extern void AddBufferToFreelist(BufferDesc *bf);
extern void PinBuffer(BufferDesc *buf);
extern void PinBuffer_Debug(char *file, int line, BufferDesc *buf);
extern void UnpinBuffer(BufferDesc *buf);
extern void UnpinBuffer_Debug(char *file, int line, BufferDesc *buf);
extern BufferDesc *GetFreeBuffer(void);
extern void InitFreeList(bool init);
extern void DBG_FreeListCheck(int nfree);
/* buf_table.c */
extern void InitBufTable(void);
extern BufferDesc *BufTableLookup(BufferTag *tagPtr);
extern bool BufTableDelete(BufferDesc *buf);
extern bool BufTableInsert(BufferDesc *buf);
extern void DBG_LookupListCheck(int nlookup);
/* bufmgr.c */
extern BufferDesc *BufferDescriptors;
extern BufferBlock BufferBlocks;
extern long *PrivateRefCount;
extern long *LastRefCount;
extern SPINLOCK BufMgrLock;
/* localbuf.c */
extern long *LocalRefCount;
extern BufferDesc *LocalBufferDescriptors;
extern int NLocBuffer;
extern BufferDesc *LocalBufferAlloc(Relation reln, BlockNumber blockNum,
bool *foundPtr);
extern int WriteLocalBuffer(Buffer buffer, bool release);
extern int FlushLocalBuffer(Buffer buffer);
extern void InitLocalBuffer();
extern void LocalBufferSync();
extern void ResetLocalBufferPool();
#endif /* BUFMGR_INTERNALS_H */
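
The power-of-two padding rationale in sbufdesc can be seen directly: subtracting two pointers into an array divides by the element size, and the compiler can turn that division into a shift only when the size is a power of two. A standalone sketch with a hypothetical 128-byte descriptor:

#include <stdio.h>

/* hypothetical stand-in, padded so sizeof(Desc) is exactly 128 */
typedef struct Desc {
    long freeNext;
    long freePrev;
    int  buf_id;
    char pad[128 - 2 * sizeof(long) - sizeof(int)];
} Desc;

static Desc descriptors[16];

/* analogous to BufferDescriptorGetBuffer: recover the array index */
static long DescGetIndex(Desc *d)
{
    return (long) (d - descriptors);  /* a shift, not a div, at 2^k size */
}

int main(void)
{
    printf("sizeof(Desc) = %zu\n", sizeof(Desc));
    printf("index of &descriptors[5] = %ld\n", DescGetIndex(&descriptors[5]));
    return 0;
}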

src/backend/storage/buffer/Makefile.inc

@@ -0,0 +1,16 @@
#-------------------------------------------------------------------------
#
# Makefile.inc--
# Makefile for storage/buffer
#
# Copyright (c) 1994, Regents of the University of California
#
#
# IDENTIFICATION
# $Header: /cvsroot/pgsql/src/backend/storage/buffer/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
#
#-------------------------------------------------------------------------
SUBSRCS+= buf_table.c buf_init.c bufmgr.c freelist.c localbuf.c
SRCS_SITEMGR+= buf_table.c buf_init.c freelist.c

src/backend/storage/buffer/buf_init.c

@@ -0,0 +1,280 @@
/*-------------------------------------------------------------------------
*
* buf_init.c--
* buffer manager initialization routines
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/buffer/buf_init.c,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#include <sys/file.h>
#include <stdio.h>
#include <math.h>
#include <signal.h>
/* declarations split between these three files */
#include "storage/buf.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/shmem.h"
#include "storage/spin.h"
#include "storage/smgr.h"
#include "storage/lmgr.h"
#include "miscadmin.h"
#include "utils/builtins.h"
#include "utils/hsearch.h"
#include "utils/elog.h"
#include "utils/memutils.h"
#include "executor/execdebug.h" /* for NDirectFileRead */
#include "catalog/catalog.h"
/*
* if BMTRACE is defined, we trace the last 200 buffer allocations and
* deallocations in a circular buffer in shared memory.
*/
#ifdef BMTRACE
bmtrace *TraceBuf;
long *CurTraceBuf;
#define BMT_LIMIT 200
#endif /* BMTRACE */
int ShowPinTrace = 0;
int NBuffers = NDBUFS; /* NDBUFS defined in miscadmin.h */
int Data_Descriptors;
int Free_List_Descriptor;
int Lookup_List_Descriptor;
int Num_Descriptors;
BufferDesc *BufferDescriptors;
BufferBlock BufferBlocks;
#ifndef HAS_TEST_AND_SET
long *NWaitIOBackendP;
#endif
extern IpcSemaphoreId WaitIOSemId;
long *PrivateRefCount; /* also used in freelist.c */
long *LastRefCount; /* refcounts of last ExecMain level */
/*
* Data Structures:
* buffers live in a freelist and a lookup data structure.
*
*
* Buffer Lookup:
* Two important notes. First, the buffer has to be
* available for lookup BEFORE an IO begins. Otherwise
* a second process trying to read the buffer will
* allocate its own copy and the buffer pool will
* become inconsistent.
*
* Buffer Replacement:
* see freelist.c. A buffer cannot be replaced while in
* use either by data manager or during IO.
*
* WriteBufferBack:
* currently, a buffer is only written back at the time
* it is selected for replacement. It should
* be done sooner if possible to reduce latency of
* BufferAlloc(). Maybe there should be a daemon process.
*
* Synchronization/Locking:
*
* BufMgrLock lock -- must be acquired before manipulating the
* buffer queues (lookup/freelist). Must be released
* before exit and before doing any IO.
*
* IO_IN_PROGRESS -- this is a flag in the buffer descriptor.
* It must be set when an IO is initiated and cleared at
* the end of the IO. It is there to make sure that one
* process doesn't start to use a buffer while another is
* faulting it in. see IOWait/IOSignal.
*
* refcount -- A buffer is pinned during IO and immediately
* after a BufferAlloc(). A buffer is always either pinned
* or on the freelist but never both. The buffer must be
* released, written, or flushed before the end of
* transaction.
*
* PrivateRefCount -- Each buffer also has a private refcount that keeps
* track of the number of times the buffer is pinned in the current
* process. This is used for two purposes: first, if we pin a buffer
* more than once, we only need to change the shared refcount once,
* and thus lock the buffer pool only once; second, when a transaction
* aborts, it should unpin the buffers exactly the number of times it
* has pinned them, so that it will not blow away buffers of another
* backend. (A backend that pinned the same buffer three times, for
* example, has bumped the shared refcount once and its private count
* to three.)
*
*/
SPINLOCK BufMgrLock;
/* delayed write: TRUE on, FALSE off */
int LateWrite = TRUE;
int ReadBufferCount;
int BufferHitCount;
int BufferFlushCount;
/*
* Initialize module:
*
* should calculate size of pool dynamically based on the
* amount of available memory.
*/
void
InitBufferPool(IPCKey key)
{
bool foundBufs,foundDescs;
int i;
Data_Descriptors = NBuffers;
Free_List_Descriptor = Data_Descriptors;
Lookup_List_Descriptor = Data_Descriptors + 1;
Num_Descriptors = Data_Descriptors + 1;
SpinAcquire(BufMgrLock);
#ifdef BMTRACE
CurTraceBuf = (long *) ShmemInitStruct("Buffer trace",
(BMT_LIMIT * sizeof(bmtrace)) + sizeof(long),
&foundDescs);
if (!foundDescs)
memset(CurTraceBuf, 0, (BMT_LIMIT * sizeof(bmtrace)) + sizeof(long));
TraceBuf = (bmtrace *) &(CurTraceBuf[1]);
#endif
BufferDescriptors = (BufferDesc *)
ShmemInitStruct("Buffer Descriptors",
Num_Descriptors*sizeof(BufferDesc),&foundDescs);
BufferBlocks = (BufferBlock)
ShmemInitStruct("Buffer Blocks",
NBuffers*BLCKSZ,&foundBufs);
#ifndef HAS_TEST_AND_SET
{
bool foundNWaitIO;
NWaitIOBackendP = (long *)ShmemInitStruct("#Backends Waiting IO",
sizeof(long),
&foundNWaitIO);
if (!foundNWaitIO)
*NWaitIOBackendP = 0;
}
#endif
if (foundDescs || foundBufs) {
/* both should be present or neither */
Assert(foundDescs && foundBufs);
} else {
BufferDesc *buf;
unsigned long block;
buf = BufferDescriptors;
block = (unsigned long) BufferBlocks;
/*
* link the buffers into a circular, doubly-linked list to
* initialize free list. Still don't know anything about
* replacement strategy in this file.
*/
for (i = 0; i < Data_Descriptors; block+=BLCKSZ,buf++,i++) {
Assert(ShmemIsValid((unsigned long)block));
buf->freeNext = i+1;
buf->freePrev = i-1;
CLEAR_BUFFERTAG(&(buf->tag));
buf->data = MAKE_OFFSET(block);
buf->flags = (BM_DELETED | BM_FREE | BM_VALID);
buf->refcount = 0;
buf->buf_id = i;
#ifdef HAS_TEST_AND_SET
S_INIT_LOCK(&(buf->io_in_progress_lock));
#endif
}
/* close the circular queue */
BufferDescriptors[0].freePrev = Data_Descriptors-1;
BufferDescriptors[Data_Descriptors-1].freeNext = 0;
}
/* Init the rest of the module */
InitBufTable();
InitFreeList(!foundDescs);
SpinRelease(BufMgrLock);
#ifndef HAS_TEST_AND_SET
{
int status;
WaitIOSemId = IpcSemaphoreCreate(IPCKeyGetWaitIOSemaphoreKey(key),
1, IPCProtection, 0, 1, &status);
}
#endif
PrivateRefCount = (long *) calloc(NBuffers, sizeof(long));
LastRefCount = (long *) calloc(NBuffers, sizeof(long));
}
/* -----------------------------------------------------
* BufferShmemSize
*
* compute the size of shared memory for the buffer pool including
* data pages, buffer descriptors, hash tables, etc.
* ----------------------------------------------------
*/
int
BufferShmemSize()
{
int size = 0;
int nbuckets;
int nsegs;
int tmp;
nbuckets = 1 << (int)my_log2((NBuffers - 1) / DEF_FFACTOR + 1);
nsegs = 1 << (int)my_log2((nbuckets - 1) / DEF_SEGSIZE + 1);
/* size of shmem binding table */
size += MAXALIGN(my_log2(BTABLE_SIZE) * sizeof(void *)); /* HTAB->dir */
size += MAXALIGN(sizeof(HHDR)); /* HTAB->hctl */
size += MAXALIGN(DEF_SEGSIZE * sizeof(SEGMENT));
size += BUCKET_ALLOC_INCR *
(MAXALIGN(sizeof(BUCKET_INDEX)) +
MAXALIGN(BTABLE_KEYSIZE) +
MAXALIGN(BTABLE_DATASIZE));
/* size of buffer descriptors */
size += MAXALIGN((NBuffers + 1) * sizeof(BufferDesc));
/* size of data pages */
size += NBuffers * MAXALIGN(BLCKSZ);
/* size of buffer hash table */
size += MAXALIGN(my_log2(NBuffers) * sizeof(void *)); /* HTAB->dir */
size += MAXALIGN(sizeof(HHDR)); /* HTAB->hctl */
size += nsegs * MAXALIGN(DEF_SEGSIZE * sizeof(SEGMENT));
tmp = (int)ceil((double)NBuffers/BUCKET_ALLOC_INCR);
size += tmp * BUCKET_ALLOC_INCR *
(MAXALIGN(sizeof(BUCKET_INDEX)) +
MAXALIGN(sizeof(BufferTag)) +
MAXALIGN(sizeof(Buffer)));
#ifdef BMTRACE
size += (BMT_LIMIT * sizeof(bmtrace)) + sizeof(long);
#endif
return size;
}
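
InitBufferPool leans on the found flag from ShmemInitStruct so that whichever process attaches first initializes each structure exactly once. A process-local model of that protocol (get_or_create is a hypothetical stand-in for the shared-memory allocator, nothing more):

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* toy stand-in: return the named region, reporting whether it existed */
static void *
get_or_create(const char *name, size_t size, bool *foundPtr)
{
    static struct { const char *name; void *ptr; } registry[8];
    for (int i = 0; i < 8; i++) {
        if (registry[i].name && strcmp(registry[i].name, name) == 0) {
            *foundPtr = true;
            return registry[i].ptr;
        }
        if (registry[i].name == NULL) {
            registry[i].name = name;
            registry[i].ptr = calloc(1, size);
            *foundPtr = false;
            return registry[i].ptr;
        }
    }
    return NULL;
}

int main(void)
{
    bool found;
    long *val = get_or_create("Buffer Descriptors", sizeof(long), &found);
    if (!found)
        *val = 42;    /* the first comer initializes, as InitBufferPool does */
    val = get_or_create("Buffer Descriptors", sizeof(long), &found);
    printf("second lookup: found=%d value=%ld\n", found, *val);
    return 0;
}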

src/backend/storage/buffer/buf_table.c

@@ -0,0 +1,162 @@
/*-------------------------------------------------------------------------
*
* buf_table.c--
* routines for finding buffers in the buffer pool.
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/buffer/buf_table.c,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
/*
* OLD COMMENTS
*
* Data Structures:
*
* Buffers are identified by their BufferTag (buf_internals.h). This
* file contains routines for allocating a shmem hash table to
* map buffer tags to buffer descriptors.
*
* Synchronization:
*
* All routines in this file assume buffer manager spinlock is
* held by their caller.
*/
#include "storage/bufmgr.h"
#include "storage/buf_internals.h" /* where the declarations go */
#include "storage/shmem.h"
#include "storage/spin.h"
#include "utils/hsearch.h"
#include "utils/elog.h"
static HTAB *SharedBufHash;
extern HTAB *ShmemInitHash();
typedef struct lookup {
BufferTag key;
Buffer id;
} LookupEnt;
/*
* Initialize shmem hash table for mapping buffers
*/
void
InitBufTable()
{
HASHCTL info;
int hash_flags;
/* assume lock is held */
/* BufferTag maps to Buffer */
info.keysize = sizeof(BufferTag);
info.datasize = sizeof(Buffer);
info.hash = tag_hash;
hash_flags = (HASH_ELEM | HASH_FUNCTION);
SharedBufHash = (HTAB *) ShmemInitHash("Shared Buf Lookup Table",
NBuffers,NBuffers,
&info,hash_flags);
if (! SharedBufHash) {
elog(FATAL,"couldn't initialize shared buffer pool Hash Tbl");
exit(1);
}
}
BufferDesc *
BufTableLookup(BufferTag *tagPtr)
{
LookupEnt * result;
bool found;
if (tagPtr->blockNum == P_NEW)
return(NULL);
result = (LookupEnt *)
hash_search(SharedBufHash,(char *) tagPtr,HASH_FIND,&found);
if (! result){
elog(WARN,"BufTableLookup: BufferLookup table corrupted");
return(NULL);
}
if (! found) {
return(NULL);
}
return(&(BufferDescriptors[result->id]));
}
/*
* BufTableDelete
*/
bool
BufTableDelete(BufferDesc *buf)
{
LookupEnt * result;
bool found;
/* buffer not initialized or has been removed from
* table already. BM_DELETED keeps us from removing
* buffer twice.
*/
if (buf->flags & BM_DELETED) {
return(TRUE);
}
buf->flags |= BM_DELETED;
result = (LookupEnt *)
hash_search(SharedBufHash,(char *) &(buf->tag),HASH_REMOVE,&found);
if (! (result && found)) {
elog(WARN,"BufTableDelete: BufferLookup table corrupted");
return(FALSE);
}
return(TRUE);
}
bool
BufTableInsert(BufferDesc *buf)
{
LookupEnt * result;
bool found;
/* cannot insert it twice */
Assert (buf->flags & BM_DELETED);
buf->flags &= ~(BM_DELETED);
result = (LookupEnt *)
hash_search(SharedBufHash,(char *) &(buf->tag),HASH_ENTER,&found);
if (! result) {
Assert(0);
elog(WARN,"BufTableInsert: BufferLookup table corrupted");
return(FALSE);
}
/* found something else in the table ! */
if (found) {
Assert(0);
elog(WARN,"BufTableInsert: BufferLookup table corrupted");
return(FALSE);
}
result->id = buf->buf_id;
return(TRUE);
}
/* prints out collision stats for the buf table */
void
DBG_LookupListCheck(int nlookup)
{
nlookup = 10;
hash_stats("Shared",SharedBufHash);
}
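
The find-or-insert-with-found-flag convention of hash_search has a rough standard-library analogue in POSIX <search.h>; a runnable illustration of the same lookup pattern (POSIX hsearch has no counterpart to HASH_REMOVE, and the key string here is made up):

#include <search.h>
#include <stdio.h>

int main(void)
{
    ENTRY item, *e;

    hcreate(32);                    /* like ShmemInitHash, minus the shmem */

    item.key = "rel=123,blk=4";     /* stands in for a BufferTag */
    item.data = (void *) 7L;        /* stands in for a buffer id */

    e = hsearch(item, FIND);        /* like HASH_FIND */
    printf("before insert: %s\n", e ? "found" : "not found");

    hsearch(item, ENTER);           /* like HASH_ENTER */
    e = hsearch(item, FIND);
    printf("after insert: buffer id = %ld\n", (long) e->data);

    hdestroy();
    return 0;
}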

(File diff suppressed because it is too large.)

src/backend/storage/buffer/freelist.c

@@ -0,0 +1,285 @@
/*-------------------------------------------------------------------------
*
* freelist.c--
* routines for manipulating the buffer pool's replacement strategy
* freelist.
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/buffer/freelist.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
/*
* OLD COMMENTS
*
* Data Structures:
* SharedFreeList is a circular queue. Notice that this
* is a shared memory queue so the next/prev "ptrs" are
* buffer ids, not addresses.
*
* Sync: all routines in this file assume that the buffer
* semaphore has been acquired by the caller.
*/
#include <stdio.h>
#include "storage/bufmgr.h"
#include "storage/buf_internals.h" /* where declarations go */
#include "storage/spin.h"
#include "utils/elog.h"
static BufferDesc *SharedFreeList;
/* only actually used in debugging. The lock
* should be acquired before calling the freelist manager.
*/
extern SPINLOCK BufMgrLock;
#define IsInQueue(bf) \
Assert((bf->freeNext != INVALID_DESCRIPTOR));\
Assert((bf->freePrev != INVALID_DESCRIPTOR));\
Assert((bf->flags & BM_FREE))
#define NotInQueue(bf) \
Assert((bf->freeNext == INVALID_DESCRIPTOR));\
Assert((bf->freePrev == INVALID_DESCRIPTOR));\
Assert(! (bf->flags & BM_FREE))
/*
* AddBufferToFreelist --
*
* In theory, this is the only routine that needs to be changed
* if the buffer replacement strategy changes. Just change
* the manner in which buffers are added to the freelist queue.
* Currently, they are added on an LRU basis.
*/
void
AddBufferToFreelist(BufferDesc *bf)
{
#ifdef BMTRACE
_bm_trace(bf->tag.relId.dbId, bf->tag.relId.relId, bf->tag.blockNum,
BufferDescriptorGetBuffer(bf), BMT_DEALLOC);
#endif /* BMTRACE */
NotInQueue(bf);
/* link bf in between the current tail of the free list and its head */
bf->freePrev = SharedFreeList->freePrev;
bf->freeNext = Free_List_Descriptor;
/* insert new into chain */
BufferDescriptors[bf->freeNext].freePrev = bf->buf_id;
BufferDescriptors[bf->freePrev].freeNext = bf->buf_id;
}
#undef PinBuffer
/*
* PinBuffer -- make buffer unavailable for replacement.
*/
void
PinBuffer(BufferDesc *buf)
{
long b;
/* Assert (buf->refcount < 25); */
if (buf->refcount == 0) {
IsInQueue(buf);
/* remove from freelist queue */
BufferDescriptors[buf->freeNext].freePrev = buf->freePrev;
BufferDescriptors[buf->freePrev].freeNext = buf->freeNext;
buf->freeNext = buf->freePrev = INVALID_DESCRIPTOR;
/* mark buffer as no longer free */
buf->flags &= ~BM_FREE;
} else {
NotInQueue(buf);
}
b = BufferDescriptorGetBuffer(buf) - 1;
Assert(PrivateRefCount[b] >= 0);
if (PrivateRefCount[b] == 0 && LastRefCount[b] == 0)
buf->refcount++;
PrivateRefCount[b]++;
}
void
PinBuffer_Debug(char *file, int line, BufferDesc *buf)
{
PinBuffer(buf);
if (ShowPinTrace) {
Buffer buffer = BufferDescriptorGetBuffer(buf);
fprintf(stderr, "PIN(Pin) %ld relname = %s, blockNum = %d, \
refcount = %ld, file: %s, line: %d\n",
buffer, buf->sb_relname, buf->tag.blockNum,
PrivateRefCount[buffer - 1], file, line);
}
}
#undef UnpinBuffer
/*
* UnpinBuffer -- make buffer available for replacement.
*/
void
UnpinBuffer(BufferDesc *buf)
{
long b = BufferDescriptorGetBuffer(buf) - 1;
Assert(buf->refcount);
Assert(PrivateRefCount[b] > 0);
PrivateRefCount[b]--;
if (PrivateRefCount[b] == 0 && LastRefCount[b] == 0)
buf->refcount--;
NotInQueue(buf);
if (buf->refcount == 0) {
AddBufferToFreelist(buf);
buf->flags |= BM_FREE;
} else {
/* do nothing */
}
}
void
UnpinBuffer_Debug(char *file, int line, BufferDesc *buf)
{
UnpinBuffer(buf);
if (ShowPinTrace) {
Buffer buffer = BufferDescriptorGetBuffer(buf);
fprintf(stderr, "UNPIN(Unpin) %ld relname = %s, blockNum = %d, \
refcount = %ld, file: %s, line: %d\n",
buffer, buf->sb_relname, buf->tag.blockNum,
PrivateRefCount[buffer - 1], file, line);
}
}
/*
* GetFreeBuffer() -- get the 'next' buffer from the freelist.
*
*/
BufferDesc *
GetFreeBuffer()
{
BufferDesc *buf;
if (Free_List_Descriptor == SharedFreeList->freeNext) {
/* queue is empty. All buffers in the buffer pool are pinned. */
elog(WARN,"out of free buffers: time to abort !\n");
return(NULL);
}
buf = &(BufferDescriptors[SharedFreeList->freeNext]);
/* remove from freelist queue */
BufferDescriptors[buf->freeNext].freePrev = buf->freePrev;
BufferDescriptors[buf->freePrev].freeNext = buf->freeNext;
buf->freeNext = buf->freePrev = INVALID_DESCRIPTOR;
buf->flags &= ~(BM_FREE);
return(buf);
}
/*
* InitFreeList -- initialize the dummy buffer descriptor used
* as a freelist head.
*
* Assume: All of the buffers are already linked in a circular
* queue. Only called by postmaster and only during
* initialization.
*/
void
InitFreeList(bool init)
{
SharedFreeList = &(BufferDescriptors[Free_List_Descriptor]);
if (init) {
/* we only do this once, normally the postmaster */
SharedFreeList->data = INVALID_OFFSET;
SharedFreeList->flags = 0;
SharedFreeList->flags &= ~(BM_VALID | BM_DELETED | BM_FREE);
SharedFreeList->buf_id = Free_List_Descriptor;
/* insert it into a random spot in the circular queue */
SharedFreeList->freeNext = BufferDescriptors[0].freeNext;
SharedFreeList->freePrev = 0;
BufferDescriptors[SharedFreeList->freeNext].freePrev =
BufferDescriptors[SharedFreeList->freePrev].freeNext =
Free_List_Descriptor;
}
}
/*
* print out the free list and check for breaks.
*/
void
DBG_FreeListCheck(int nfree)
{
int i;
BufferDesc *buf;
buf = &(BufferDescriptors[SharedFreeList->freeNext]);
for (i=0;i<nfree;i++,buf = &(BufferDescriptors[buf->freeNext])) {
if (! (buf->flags & (BM_FREE))){
if (buf != SharedFreeList) {
printf("\tfree list corrupted: %d flags %x\n",
buf->buf_id,buf->flags);
} else {
printf("\tfree list corrupted: too short -- %d not %d\n",
i,nfree);
}
}
if ((BufferDescriptors[buf->freeNext].freePrev != buf->buf_id) ||
(BufferDescriptors[buf->freePrev].freeNext != buf->buf_id)) {
printf("\tfree list links corrupted: %d %ld %ld\n",
buf->buf_id,buf->freePrev,buf->freeNext);
}
}
if (buf != SharedFreeList) {
printf("\tfree list corrupted: %d-th buffer is %d\n",
nfree,buf->buf_id);
}
}
/*
* PrintBufferFreeList -
* prints the buffer free list, for debugging
*/
void
PrintBufferFreeList()
{
BufferDesc *buf;
if (SharedFreeList->freeNext == Free_List_Descriptor) {
printf("free list is empty.\n");
return;
}
buf = &(BufferDescriptors[SharedFreeList->freeNext]);
for (;;) {
int i = (buf - BufferDescriptors);
printf("[%-2d] (%s, %d) flags=0x%x, refcnt=%d %ld, nxt=%ld prv=%ld)\n",
i, buf->sb_relname, buf->tag.blockNum,
buf->flags, buf->refcount, PrivateRefCount[i],
buf->freeNext, buf->freePrev);
if (buf->freeNext == Free_List_Descriptor)
break;
buf = &(BufferDescriptors[buf->freeNext]);
}
}
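
A process-local model of the two-level refcounting that PinBuffer and UnpinBuffer implement: the shared count moves only on a backend's first pin and last unpin, while the private count tracks every pin. (A simplified sketch for one buffer that omits LastRefCount, the freelist, and all locking.)

#include <assert.h>
#include <stdio.h>

static unsigned sharedRefcount; /* buf->refcount: one per pinning backend */
static long privateRefCount;    /* this backend's PrivateRefCount[b] */

static void pin(void)
{
    if (privateRefCount == 0)
        sharedRefcount++;       /* first pin by this backend */
    privateRefCount++;
}

static void unpin(void)
{
    assert(privateRefCount > 0);
    privateRefCount--;
    if (privateRefCount == 0)
        sharedRefcount--;       /* last local pin released */
}

int main(void)
{
    pin(); pin(); pin();        /* three local pins, one shared bump */
    printf("shared=%u private=%ld\n", sharedRefcount, privateRefCount);

    unpin(); unpin(); unpin();  /* abort-time cleanup unpins exactly as often */
    printf("shared=%u private=%ld\n", sharedRefcount, privateRefCount);
    return 0;
}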

src/backend/storage/buffer/localbuf.c

@@ -0,0 +1,284 @@
/*-------------------------------------------------------------------------
*
* localbuf.c--
* local buffer manager. Fast buffer manager for temporary tables
* or special cases when the operation is not visible to other backends.
*
* When a relation is being created, its descriptor will have rd_islocal
* set to indicate that the local buffer manager should be used. During
* the same transaction in which the relation is created, any inserts
* into or selects from the newly created relation will use the local
* buffer pool. rd_islocal is reset at the end of the transaction
* (commit/abort). This is useful for queries like SELECT INTO TABLE
* and CREATE INDEX.
*
* Copyright (c) 1994-5, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/buffer/localbuf.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#include <sys/file.h>
#include <stdio.h>
#include <math.h>
#include <signal.h>
/* declarations split between these three files */
#include "storage/buf.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/shmem.h"
#include "storage/spin.h"
#include "storage/smgr.h"
#include "storage/lmgr.h"
#include "miscadmin.h"
#include "utils/builtins.h"
#include "utils/hsearch.h"
#include "utils/elog.h"
#include "utils/memutils.h"
#include "executor/execdebug.h" /* for NDirectFileRead */
#include "catalog/catalog.h"
int NLocBuffer = 64;
BufferDesc *LocalBufferDescriptors = NULL;
long *LocalRefCount = NULL;
static int nextFreeLocalBuf = 0;
/*#define LBDEBUG*/
/*
* LocalBufferAlloc -
* allocate a local buffer. We do round robin allocation for now.
*/
BufferDesc *
LocalBufferAlloc(Relation reln, BlockNumber blockNum, bool *foundPtr)
{
int i;
BufferDesc *bufHdr = (BufferDesc *) NULL;
if (blockNum == P_NEW) {
blockNum = reln->rd_nblocks;
reln->rd_nblocks++;
}
/* a low tech search for now -- not optimized for scans */
for (i=0; i < NLocBuffer; i++) {
if (LocalBufferDescriptors[i].tag.relId.relId == reln->rd_id &&
LocalBufferDescriptors[i].tag.blockNum == blockNum) {
#ifdef LBDEBUG
fprintf(stderr, "LB ALLOC (%d,%d) %d\n",
reln->rd_id, blockNum, -i-1);
#endif
LocalRefCount[i]++;
*foundPtr = TRUE;
return &LocalBufferDescriptors[i];
}
}
#ifdef LBDEBUG
fprintf(stderr, "LB ALLOC (%d,%d) %d\n",
reln->rd_id, blockNum, -nextFreeLocalBuf-1);
#endif
/* need to get a new buffer (round robin for now) */
for(i=0; i < NLocBuffer; i++) {
int b = (nextFreeLocalBuf + i) % NLocBuffer;
if (LocalRefCount[b]==0) {
bufHdr = &LocalBufferDescriptors[b];
LocalRefCount[b]++;
nextFreeLocalBuf = (b + 1) % NLocBuffer;
break;
}
}
if (bufHdr==NULL)
elog(WARN, "no empty local buffer.");
/*
* this buffer is not referenced but it might still be dirty (the
* last transaction to touch it doesn't need its contents but has
* not flushed it). if that's the case, write it out before
* reusing it!
*/
if (bufHdr->flags & BM_DIRTY) {
Relation bufrel = RelationIdCacheGetRelation(bufHdr->tag.relId.relId);
Assert(bufrel != NULL);
/* flush this page */
smgrwrite(bufrel->rd_rel->relsmgr, bufrel, bufHdr->tag.blockNum,
(char *) MAKE_PTR(bufHdr->data));
}
/*
* it's all ours now.
*/
bufHdr->tag.relId.relId = reln->rd_id;
bufHdr->tag.blockNum = blockNum;
bufHdr->flags &= ~BM_DIRTY;
/*
* lazy memory allocation. (see MAKE_PTR for why we need to do
* MAKE_OFFSET.)
*/
if (bufHdr->data == (SHMEM_OFFSET)0) {
char *data = (char *)malloc(BLCKSZ);
bufHdr->data = MAKE_OFFSET(data);
}
*foundPtr = FALSE;
return bufHdr;
}
/*
* WriteLocalBuffer -
* writes out a local buffer
*/
int
WriteLocalBuffer(Buffer buffer, bool release)
{
int bufid;
Assert(BufferIsLocal(buffer));
#ifdef LBDEBUG
fprintf(stderr, "LB WRITE %d\n", buffer);
#endif
bufid = - (buffer + 1);
LocalBufferDescriptors[bufid].flags |= BM_DIRTY;
if (release) {
Assert(LocalRefCount[bufid] > 0);
LocalRefCount[bufid]--;
}
return true;
}
/*
* FlushLocalBuffer -
* flushes a local buffer
*/
int
FlushLocalBuffer(Buffer buffer)
{
int bufid;
Relation bufrel;
BufferDesc *bufHdr;
Assert(BufferIsLocal(buffer));
#ifdef LBDEBUG
fprintf(stderr, "LB FLUSH %d\n", buffer);
#endif
bufid = - (buffer + 1);
bufHdr = &LocalBufferDescriptors[bufid];
bufHdr->flags &= ~BM_DIRTY;
bufrel = RelationIdCacheGetRelation(bufHdr->tag.relId.relId);
Assert(bufrel != NULL);
smgrflush(bufrel->rd_rel->relsmgr, bufrel, bufHdr->tag.blockNum,
(char *) MAKE_PTR(bufHdr->data));
Assert(LocalRefCount[bufid] > 0);
LocalRefCount[bufid]--;
return true;
}
/*
* InitLocalBuffer -
* init the local buffer cache. Since most queries (esp. multi-user ones)
* don't involve local buffers, we delay allocating memory for the
* actual buffers until we need them.
*/
void
InitLocalBuffer()
{
int i;
/*
* these aren't going away. I'm not gonna use palloc.
*/
LocalBufferDescriptors =
(BufferDesc *)malloc(sizeof(BufferDesc) * NLocBuffer);
memset(LocalBufferDescriptors, 0, sizeof(BufferDesc) * NLocBuffer);
nextFreeLocalBuf = 0;
for (i = 0; i < NLocBuffer; i++) {
BufferDesc *buf = &LocalBufferDescriptors[i];
/*
* negative to indicate local buffer. This is tricky: shared buffers
* start with 0. We have to start with -2. (Note that the routine
* BufferDescriptorGetBuffer adds 1 to buf_id so our first buffer id
* is -1.)
*/
buf->buf_id = - i - 2;
}
LocalRefCount =
(long *)malloc(sizeof(long) * NLocBuffer);
memset(LocalRefCount, 0, sizeof(long) * NLocBuffer);
}
/*
* LocalBufferSync -
* flush all dirty buffers in the local buffer cache. Since the buffer
* cache is only used for keeping relations visible during a transaction,
* we will not need these buffers again.
*/
void
LocalBufferSync()
{
int i;
for (i = 0; i < NLocBuffer; i++) {
BufferDesc *buf = &LocalBufferDescriptors[i];
Relation bufrel;
if (buf->flags & BM_DIRTY) {
#ifdef LBDEBUG
fprintf(stderr, "LB SYNC %d\n", -i-1);
#endif
bufrel = RelationIdCacheGetRelation(buf->tag.relId.relId);
Assert(bufrel != NULL);
smgrwrite(bufrel->rd_rel->relsmgr, bufrel, buf->tag.blockNum,
(char *) MAKE_PTR(buf->data));
buf->tag.relId.relId = InvalidOid;
buf->flags &= ~BM_DIRTY;
}
}
memset(LocalRefCount, 0, sizeof(long) * NLocBuffer);
}
void
ResetLocalBufferPool()
{
int i;
memset(LocalBufferDescriptors, 0, sizeof(BufferDesc) * NLocBuffer);
nextFreeLocalBuf = 0;
for (i = 0; i < NLocBuffer; i++) {
BufferDesc *buf = &LocalBufferDescriptors[i];
/* just like InitLocalBuffer() */
buf->buf_id = - i - 2;
}
memset(LocalRefCount, 0, sizeof(long) * NLocBuffer);
}
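
The negative-id arithmetic above round-trips as follows: InitLocalBuffer stores buf_id = -i - 2, BufferDescriptorGetBuffer hands out buf_id + 1 = -i - 1, and WriteLocalBuffer decodes the slot with -(buffer + 1). A standalone check:

#include <assert.h>
#include <stdio.h>

int main(void)
{
    for (int i = 0; i < 4; i++) {
        int  buf_id = -i - 2;        /* as set in InitLocalBuffer() */
        long buffer = buf_id + 1;    /* BufferDescriptorGetBuffer() */
        long bufid  = -(buffer + 1); /* decoded as in WriteLocalBuffer() */

        assert(buffer < 0);          /* so BufferIsLocal() holds */
        assert(bufid == i);          /* and we get the array slot back */
        printf("slot %d: buf_id=%d buffer=%ld\n", i, buf_id, buffer);
    }
    return 0;
}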

src/backend/storage/bufmgr.h

@@ -0,0 +1,112 @@
/*-------------------------------------------------------------------------
*
* bufmgr.h--
* POSTGRES buffer manager definitions.
*
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: bufmgr.h,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef BUFMGR_H
#define BUFMGR_H
#include "c.h"
#include "machine.h" /* for BLCKSZ */
#include "utils/rel.h"
#include "storage/buf_internals.h" /* UGLY! -- ay */
/*
* the maximum size of a disk block for any possible installation.
*
* in theory this could be anything, but in practice this is actually
* limited to 2^13 bytes because we have limited ItemIdData.lp_off and
* ItemIdData.lp_len to 13 bits (see itemid.h).
*/
#define MAXBLCKSZ 8192
typedef void *Block;
/* special pageno for bget */
#define P_NEW InvalidBlockNumber /* grow the file to get a new page */
typedef bits16 BufferLock;
/**********************************************************************
the rest is function defns in the bufmgr that are externally callable
**********************************************************************/
/*
* These routines are beaten on quite heavily, hence the macroization.
* See buf_internals.h for a related comment.
*/
#define BufferDescriptorGetBuffer(bdesc) ((bdesc)->buf_id + 1)
/*
* BufferIsPinned --
* True iff the buffer is pinned (and therefore valid)
*
* Note:
* Semantics are identical to BufferIsValid
* XXX - need to remove either one eventually.
*/
#define BufferIsPinned BufferIsValid
extern int ShowPinTrace;
/*
* prototypes for functions in bufmgr.c
*/
extern Buffer RelationGetBufferWithBuffer(Relation relation,
BlockNumber blockNumber, Buffer buffer);
extern Buffer ReadBuffer(Relation reln, BlockNumber blockNum);
extern Buffer ReadBuffer_Debug(char *file, int line, Relation reln,
BlockNumber blockNum);
extern int WriteBuffer(Buffer buffer);
extern void WriteBuffer_Debug(char *file, int line, Buffer buffer);
extern void DirtyBufferCopy(Oid dbid, Oid relid, BlockNumber blkno,
char *dest);
extern int WriteNoReleaseBuffer(Buffer buffer);
extern Buffer ReleaseAndReadBuffer(Buffer buffer, Relation relation,
BlockNumber blockNum);
extern void InitBufferPool(IPCKey key);
extern void PrintBufferUsage(FILE *statfp);
extern void ResetBufferUsage(void);
extern void ResetBufferPool(void);
extern int BufferPoolCheckLeak(void);
extern void FlushBufferPool(int StableMainMemoryFlag);
extern bool BufferIsValid(Buffer bufnum);
extern BlockNumber BufferGetBlockNumber(Buffer buffer);
extern Relation BufferGetRelation(Buffer buffer);
extern BlockNumber RelationGetNumberOfBlocks(Relation relation);
extern Block BufferGetBlock(Buffer buffer);
extern void ReleaseTmpRelBuffers(Relation tempreldesc);
extern void DropBuffers(Oid dbid);
extern void PrintBufferDescs(void);
extern void PrintPinnedBufs(void);
extern int BufferShmemSize(void);
extern void BufferPoolBlowaway(void);
extern void IncrBufferRefCount(Buffer buffer);
extern int ReleaseBuffer(Buffer buffer);
extern void IncrBufferRefCount_Debug(char *file, int line, Buffer buffer);
extern void ReleaseBuffer_Debug(char *file, int line, Buffer buffer);
extern int ReleaseAndReadBuffer_Debug(char *file,
int line,
Buffer buffer,
Relation relation,
BlockNumber blockNum);
extern void BufferRefCountReset(int *refcountsave);
extern void BufferRefCountRestore(int *refcountsave);
#endif /* BUFMGR_H */
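
Read together, the prototypes imply the usual call pattern. A sketch only, assuming a backend context with a relation and block number in hand; touch_block is a made-up name, and WriteBuffer is taken to mark the page dirty and release the pin, per the delayed-write notes in buf_init.c:

/* sketch: compiles against the declarations above, runs only in the backend */
void
touch_block(Relation reln, BlockNumber blkno)
{
    Buffer buf = ReadBuffer(reln, blkno);   /* pins the buffer */

    if (BufferIsValid(buf)) {
        Block block = BufferGetBlock(buf);  /* raw block contents */

        /* ... modify the block in place ... */
        (void) block;
        WriteBuffer(buf);   /* assumed: mark dirty and release the pin */
    }
}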

src/backend/storage/bufpage.h

@@ -0,0 +1,256 @@
/*-------------------------------------------------------------------------
*
* bufpage.h--
* Standard POSTGRES buffer page definitions.
*
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: bufpage.h,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef BUFPAGE_H
#define BUFPAGE_H
#include "c.h"
#include "machine.h" /* for BLCKSZ */
#include "storage/buf.h"
#include "storage/item.h"
#include "storage/itemid.h"
#include "storage/itemptr.h"
/*
* a postgres disk page is an abstraction layered on top of a postgres
* disk block (which is simply a unit of i/o, see block.h).
*
* specifically, while a disk block can be unformatted, a postgres
* disk page is always a slotted page of the form:
*
* +----------------+---------------------------------+
* | PageHeaderData | linp0 linp1 linp2 ... |
* +-----------+----+---------------------------------+
* | ... linpN | |
* +-----------+--------------------------------------+
* | ^ pd_lower |
* | |
* | v pd_upper |
* +-------------+------------------------------------+
* | | tupleN ... |
* +-------------+------------------+-----------------+
* | ... tuple2 tuple1 tuple0 | "special space" |
* +--------------------------------+-----------------+
* ^ pd_special
*
* a page is full when nothing can be added between pd_lower and
* pd_upper.
*
* all blocks written out by an access method must be disk pages.
*
* EXCEPTIONS:
*
* obviously, a page is not formatted before it is initialized by
* a call to PageInit.
*
* the contents of the special pg_variable/pg_time/pg_log tables are
* raw disk blocks with special formats. these are the only "access
* methods" that need not write disk pages.
*
* NOTES:
*
* linp0..N form an ItemId array. ItemPointers point into this array
* rather than pointing directly to a tuple.
*
* tuple0..N are added "backwards" on the page. because a tuple's
* ItemPointer points to its ItemId entry rather than its actual
* byte-offset position, tuples can be physically shuffled on a page
* whenever the need arises.
*
* AM-generic per-page information is kept in the pd_opaque field of
* the PageHeaderData. (this is currently only the page size.)
* AM-specific per-page data is kept in the area marked "special
* space"; each AM has an "opaque" structure defined somewhere that is
* stored as the page trailer. an access method should always
* initialize its pages with PageInit and then set its own opaque
* fields.
*/
typedef Pointer Page;
/*
* PageIsValid --
* True iff page is valid.
*/
#define PageIsValid(page) PointerIsValid(page)
/*
* location (byte offset) within a page.
*
* note that this is actually limited to 2^13 because we have limited
* ItemIdData.lp_off and ItemIdData.lp_len to 13 bits (see itemid.h).
*/
typedef uint16 LocationIndex;
/*
* space management information generic to any page
*
* od_pagesize - size in bytes.
* in reality, we need at least 64B to fit the
* page header, opaque space and a minimal tuple;
* on the high end, we can only support pages up
* to 8KB because lp_off/lp_len are 13 bits.
*/
typedef struct OpaqueData {
uint16 od_pagesize;
} OpaqueData;
typedef OpaqueData *Opaque;
/*
* disk page organization
*/
typedef struct PageHeaderData {
LocationIndex pd_lower; /* offset to start of free space */
LocationIndex pd_upper; /* offset to end of free space */
LocationIndex pd_special; /* offset to start of special space */
OpaqueData pd_opaque; /* AM-generic information */
ItemIdData pd_linp[1]; /* line pointers */
} PageHeaderData;
typedef PageHeaderData *PageHeader;
typedef enum {
ShufflePageManagerMode,
OverwritePageManagerMode
} PageManagerMode;
/* ----------------
* misc support macros
* ----------------
*/
/*
* XXX this is wrong -- ignores padding/alignment, variable page size,
* AM-specific opaque space at the end of the page (as in btrees), ...
* however, it at least serves as an upper bound for heap pages.
*/
#define MAXTUPLEN (BLCKSZ - sizeof (PageHeaderData))
/* ----------------------------------------------------------------
* page support macros
* ----------------------------------------------------------------
*/
/*
* PageIsValid -- This is defined in page.h.
*/
/*
* PageIsUsed --
* True iff the page is used.
*
* Note:
* Assumes page is valid.
*/
#define PageIsUsed(page) \
(AssertMacro(PageIsValid(page)) ? \
((bool) (((PageHeader) (page))->pd_lower != 0)) : false)
/*
* PageIsEmpty --
* returns true iff no itemid has been allocated on the page
*/
#define PageIsEmpty(page) \
(((PageHeader) (page))->pd_lower == \
(sizeof(PageHeaderData) - sizeof(ItemIdData)) ? true : false)
/*
* PageGetItemId --
* Returns an item identifier of a page.
*/
#define PageGetItemId(page, offsetNumber) \
((ItemId) (&((PageHeader) (page))->pd_linp[(-1) + (offsetNumber)]))
/* ----------------
* macros to access opaque space
* ----------------
*/
/*
* PageSizeIsValid --
* True iff the page size is valid.
*
* XXX currently all page sizes are "valid" but we only actually
* use BLCKSZ.
*/
#define PageSizeIsValid(pageSize) 1
/*
* PageGetPageSize --
* Returns the page size of a page.
*
* this can only be called on a formatted page (unlike
* BufferGetPageSize, which can be called on an unformatted page).
* however, it can be called on a page for which there is no buffer.
*/
#define PageGetPageSize(page) \
((Size) ((PageHeader) (page))->pd_opaque.od_pagesize)
/*
* PageSetPageSize --
* Sets the page size of a page.
*/
#define PageSetPageSize(page, size) \
((PageHeader) (page))->pd_opaque.od_pagesize = (size)
/* ----------------
* page special data macros
* ----------------
*/
/*
* PageGetSpecialSize --
* Returns size of special space on a page.
*
* Note:
* Assumes page is locked.
*/
#define PageGetSpecialSize(page) \
((uint16) (PageGetPageSize(page) - ((PageHeader)page)->pd_special))
/*
* PageGetSpecialPointer --
* Returns pointer to special space on a page.
*
* Note:
* Assumes page is locked.
*/
#define PageGetSpecialPointer(page) \
(AssertMacro(PageIsValid(page)) ? \
(char *) ((char *) (page) + ((PageHeader) (page))->pd_special) \
: (char *) 0)
/* ----------------------------------------------------------------
* extern declarations
* ----------------------------------------------------------------
*/
extern Size BufferGetPageSize(Buffer buffer);
extern Page BufferGetPage(Buffer buffer);
extern void PageInit(Page page, Size pageSize, Size specialSize);
extern Item PageGetItem(Page page, ItemId itemId);
extern OffsetNumber PageAddItem(Page page, Item item, Size size,
OffsetNumber offsetNumber, ItemIdFlags flags);
extern Page PageGetTempPage(Page page, Size specialSize);
extern void PageRestoreTempPage(Page tempPage, Page oldPage);
extern OffsetNumber PageGetMaxOffsetNumber(Page page);
extern void PageRepairFragmentation(Page page);
extern Size PageGetFreeSpace(Page page);
extern void PageManagerModeSet(PageManagerMode mode);
extern void PageIndexTupleDelete(Page page, OffsetNumber offset);
extern void PageIndexTupleDeleteAdjustLinePointers(PageHeader phdr,
char *location, Size size);
#endif /* BUFPAGE_H */
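
A sketch, assuming a backend context, of how an access method would format a fresh block with these routines; format_block is a made-up name and the item arguments are elided:

/* sketch: compiles against the declarations above, runs only in the backend */
void
format_block(Buffer buf)
{
    Page page = BufferGetPage(buf);

    /* lay down the slotted-page header; no AM-specific special space */
    PageInit(page, BufferGetPageSize(buf), 0);
    Assert(PageIsEmpty(page));

    /* tuples then go in with
     *   PageAddItem(page, item, size, offsetNumber, flags);
     * while PageGetFreeSpace(page) tracks the room left between
     * pd_lower and pd_upper. */
}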

src/backend/storage/fd.h

@@ -0,0 +1,96 @@
/*-------------------------------------------------------------------------
*
* fd.h--
* Virtual file descriptor definitions.
*
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: fd.h,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
/*
* calls:
*
* File {Close, Read, Write, Seek, Tell, Sync}
* {File Name Open, Allocate, Free} File
*
* These are NOT JUST RENAMINGS OF THE UNIX ROUTINES.
* use them for all file activity...
*
* File fd;
* fd = PathNameOpenFile("foo", O_RDONLY, 0666);
*
* use AllocateFile if you need a file descriptor in some other context.
* it will make sure that there is a file descriptor free
*
* use FreeFile to let the virtual file descriptor package know that
* there is now a free fd (when you are done with it)
*
* AllocateFile();
* FreeFile();
*/
#ifndef FD_H
#define FD_H
/*
* FileOpen uses the standard UNIX open(2) flags.
*/
#include <fcntl.h> /* for O_ on most */
#ifndef O_RDONLY
#include <sys/file.h> /* for O_ on the rest */
#endif /* O_RDONLY */
/*
* FileSeek uses the standard UNIX lseek(2) flags.
*/
#ifndef WIN32
#include <unistd.h> /* for SEEK_ on most */
#else
#ifndef SEEK_SET
#include <stdio.h> /* for SEEK_ on the rest */
#endif /* SEEK_SET */
#endif /* WIN32 */
#include "c.h"
#include "storage/block.h"
typedef char *FileName;
typedef int File;
/* originally in libpq-fs.h */
struct pgstat { /* just the fields we need from stat structure */
int st_ino;
int st_mode;
unsigned int st_size;
unsigned int st_sizehigh; /* high order bits */
/* 2^64 == 1.8 x 10^20 bytes */
int st_uid;
int st_atime_s; /* just the seconds */
int st_mtime_s; /* since SysV and the new BSD both have */
int st_ctime_s; /* usec fields.. */
};
/*
* prototypes for functions in fd.c
*/
extern void FileInvalidate(File file);
extern File FileNameOpenFile(FileName fileName, int fileFlags, int fileMode);
extern File PathNameOpenFile(FileName fileName, int fileFlags, int fileMode);
extern void FileClose(File file);
extern void FileUnlink(File file);
extern int FileRead(File file, char *buffer, int amount);
extern int FileWrite(File file, char *buffer, int amount);
extern long FileSeek(File file, long offset, int whence);
extern long FileTell(File file);
extern int FileTruncate(File file, int offset);
extern int FileSync(File file);
extern int FileNameUnlink(char *filename);
extern void AllocateFile(void);
extern void FreeFile(void);
extern void closeAllVfds(void);
extern void closeOneVfd(void);
#endif /* FD_H */
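
A sketch of the intended call sequence, assuming a backend context; scribble is a made-up name, and per the notes above every subsequent operation on the File must go through these wrappers rather than the raw UNIX calls:

/* sketch: compiles against the declarations above, runs only in the backend */
void
scribble(void)
{
    char buf[5];
    File fd = PathNameOpenFile("foo", O_CREAT | O_RDWR, 0666);

    if (fd < 0)          /* assumed failure convention: negative File */
        return;
    FileWrite(fd, "hello", 5);
    FileSeek(fd, 0L, SEEK_SET);
    FileRead(fd, buf, 5);
    FileSync(fd);
    FileClose(fd);
}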

src/backend/storage/file/Makefile.inc

@@ -0,0 +1,14 @@
#-------------------------------------------------------------------------
#
# Makefile.inc--
# Makefile for storage/file
#
# Copyright (c) 1994, Regents of the University of California
#
#
# IDENTIFICATION
# $Header: /cvsroot/pgsql/src/backend/storage/file/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:55 scrappy Exp $
#
#-------------------------------------------------------------------------
SUBSRCS+= fd.c

src/backend/storage/file/fd.c

@@ -0,0 +1,888 @@
/*-------------------------------------------------------------------------
*
* fd.c--
* Virtual file descriptor code.
*
* Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $Id: fd.c,v 1.1.1.1 1996/07/09 06:21:55 scrappy Exp $
*
* NOTES:
*
* This code manages a cache of 'virtual' file descriptors (VFDs).
* The server opens many file descriptors for a variety of reasons,
* including base tables, scratch files (e.g., sort and hash spool
* files), and random calls to C library routines like system(3); it
* is quite easy to exceed system limits on the number of open files a
* single process can have. (This is around 256 on many modern
* operating systems, but can be as low as 32 on others.)
*
* VFDs are managed as an LRU pool, with actual OS file descriptors
* being opened and closed as needed. Obviously, if a file is
* opened using these interfaces, all subsequent operations must also
* be through these interfaces (the File type is not a real file
* descriptor).
*
* For this scheme to work, most (if not all) routines throughout the
* server should use these interfaces instead of calling the C library
* routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
* may find ourselves short of real file descriptors anyway.
*
* This file used to contain a bunch of stuff to support RAID levels 0
* (jbod), 1 (duplex) and 5 (xor parity). That stuff is all gone
* because the parallel query processing code that called it is all
* gone. If you really need it you could get it from the original
* POSTGRES source.
*-------------------------------------------------------------------------
*/
#include <stdio.h>
#include <sys/file.h>
#include <sys/param.h>
#include <errno.h>
#include <sys/stat.h>
#include <string.h>
#include <unistd.h>
#include "c.h"
#include "miscadmin.h" /* for DataDir */
#include "utils/palloc.h"
#ifdef PORTNAME_sparc
/*
* the SunOS 4 NOFILE is a lie, because the default limit is *not* the
* maximum number of file descriptors you can have open.
*
* we have to either use this number (the default dtablesize) or
* explicitly call setrlimit(RLIMIT_NOFILE, NOFILE).
*/
#include <sys/user.h>
#undef NOFILE
#define NOFILE NOFILE_IN_U
#endif /* PORTNAME_sparc */
/*
* Problem: Postgres does a system(ld...) to do dynamic loading. This
* will open several extra files in addition to those used by
* Postgres. We need to do this hack to guarantee that there are file
* descriptors free for ld to use.
*
* The current solution is to limit the number of file descriptors
* that this code will allocate at one time (it leaves
* RESERVE_FOR_LD free).
*
* (Even though most dynamic loaders now use dlopen(3) or the
* equivalent, the OS must still open several files to perform the
* dynamic loading. Keep this here.)
*/
#define RESERVE_FOR_LD 10
/*
* If we are using weird storage managers, we may need to keep real
* file descriptors open so that the jukebox server doesn't think we
* have gone away (and no longer care about a platter or file that
* we've been using). This might be an actual file descriptor for a
* local jukebox interface that uses paths, or a socket connection for
* a network jukebox server. Since we can't be opening and closing
* these descriptors at whim, we must make allowances for them.
*/
#ifdef HP_JUKEBOX
#define RESERVE_FOR_JB 25
#define MAXFILES ((NOFILE - RESERVE_FOR_LD) - RESERVE_FOR_JB)
#else /* HP_JUKEBOX */
#define MAXFILES (NOFILE - RESERVE_FOR_LD)
#endif /* HP_JUKEBOX */
/* Debugging.... */
#ifdef FDDEBUG
# define DO_DB(A) A
#else
# define DO_DB(A) /* A */
#endif
#define VFD_CLOSED -1
#include "storage/fd.h"
#include "utils/elog.h"
#define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
typedef struct vfd {
signed short fd;
unsigned short fdstate;
#define FD_DIRTY (1 << 0)
File nextFree;
File lruMoreRecently;
File lruLessRecently;
long seekPos;
char *fileName;
int fileFlags;
int fileMode;
} Vfd;
/*
* Virtual File Descriptor array pointer and size. This grows as
* needed.
*/
static Vfd *VfdCache;
static Size SizeVfdCache = 0;
/*
* Minimum number of file descriptors known to be free.
*/
static int FreeFd = 0;
/*
* Number of file descriptors known to be open.
*/
static int nfile = 0;
/*
* we use the name of the null device in various places, mostly so
* that we can open it and find out if we really have any descriptors
* available or not.
*/
#ifndef WIN32
static char *Nulldev = "/dev/null";
static char Sep_char = '/';
#else
static char *Nulldev = "NUL";
static char Sep_char = '\\';
#endif /* WIN32 */
/*
* Private Routines
*
* Delete - delete a file from the Lru ring
* LruDelete - remove a file from the Lru ring and close
* Insert - put a file at the front of the Lru ring
* LruInsert - put a file at the front of the Lru ring and open
* AssertLruRoom - make sure that there is a free fd.
*
* the Least Recently Used ring is a doubly linked list that begins and
* ends on element zero.
*
* example:
*
* /--less----\ /---------\
* v \ v \
* #0 --more---> LeastRecentlyUsed --more-\ \
* ^\ | |
* \\less--> MostRecentlyUsedFile <---/ |
* \more---/ \--less--/
*
* AllocateVfd - grab a free (or new) file record (from VfdArray)
* FreeVfd - free a file record
*
*/
static void Delete(File file);
static void LruDelete(File file);
static void Insert(File file);
static int LruInsert (File file);
static void AssertLruRoom(void);
static File AllocateVfd(void);
static void FreeVfd(File file);
static int FileAccess(File file);
static File fileNameOpenFile(FileName fileName, int fileFlags, int fileMode);
static char *filepath(char *filename);
#if defined(FDDEBUG)
static void
_dump_lru()
{
int mru = VfdCache[0].lruLessRecently;
Vfd *vfdP = &VfdCache[mru];
printf("MOST %d ", mru);
while (mru != 0)
{
mru = vfdP->lruLessRecently;
vfdP = &VfdCache[mru];
printf("%d ", mru);
}
printf("LEAST\n");
}
#endif /* FDDEBUG */
static void
Delete(File file)
{
Vfd *fileP;
DO_DB(printf("DEBUG: Delete %d (%s)\n",
file, VfdCache[file].fileName));
DO_DB(_dump_lru());
Assert(file != 0);
fileP = &VfdCache[file];
VfdCache[fileP->lruLessRecently].lruMoreRecently =
VfdCache[file].lruMoreRecently;
VfdCache[fileP->lruMoreRecently].lruLessRecently =
VfdCache[file].lruLessRecently;
DO_DB(_dump_lru());
}
static void
LruDelete(File file)
{
Vfd *fileP;
int returnValue;
DO_DB(printf("DEBUG: LruDelete %d (%s)\n",
file, VfdCache[file].fileName));
Assert(file != 0);
fileP = &VfdCache[file];
/* delete the vfd record from the LRU ring */
Delete(file);
/* save the seek position */
fileP->seekPos = lseek(fileP->fd, 0L, SEEK_CUR);
Assert( fileP->seekPos != -1);
/* if we have written to the file, sync it */
if (fileP->fdstate & FD_DIRTY) {
returnValue = fsync(fileP->fd);
Assert(returnValue != -1);
fileP->fdstate &= ~FD_DIRTY;
}
/* close the file */
returnValue = close(fileP->fd);
Assert(returnValue != -1);
--nfile;
fileP->fd = VFD_CLOSED;
/* note that there is now one more free real file descriptor */
FreeFd++;
}
static void
Insert(File file)
{
Vfd *vfdP;
DO_DB(printf("DEBUG: Insert %d (%s)\n",
file, VfdCache[file].fileName));
DO_DB(_dump_lru());
vfdP = &VfdCache[file];
vfdP->lruMoreRecently = 0;
vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
VfdCache[0].lruLessRecently = file;
VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
DO_DB(_dump_lru());
}
static int
LruInsert (File file)
{
Vfd *vfdP;
int returnValue;
DO_DB(printf("DEBUG: LruInsert %d (%s)\n",
file, VfdCache[file].fileName));
vfdP = &VfdCache[file];
if (FileIsNotOpen(file)) {
int tmpfd;
/*
* Note, we check to see if there's a free file descriptor
* before attempting to open a file. One general way to do
* this is to try to open the null device which everybody
		 * should be able to open all the time.  If this fails, we
		 * assume there are no free file descriptors left.
*/
tryAgain:
tmpfd = open(Nulldev, O_CREAT|O_RDWR, 0666);
if (tmpfd < 0) {
FreeFd = 0;
errno = 0;
AssertLruRoom();
goto tryAgain;
} else {
close(tmpfd);
}
vfdP->fd = open(vfdP->fileName,vfdP->fileFlags,vfdP->fileMode);
if (vfdP->fd < 0) {
DO_DB(printf("RE_OPEN FAILED: %d\n",
errno));
return (vfdP->fd);
} else {
DO_DB(printf("RE_OPEN SUCCESS\n"));
++nfile;
}
/* seek to the right position */
if (vfdP->seekPos != 0L) {
returnValue =
lseek(vfdP->fd, vfdP->seekPos, SEEK_SET);
Assert(returnValue != -1);
}
/* init state on open */
vfdP->fdstate = 0x0;
/* note that a file descriptor has been used up */
if (FreeFd > 0)
FreeFd--;
}
/*
* put it at the head of the Lru ring
*/
Insert(file);
return (0);
}
static void
AssertLruRoom()
{
DO_DB(printf("DEBUG: AssertLruRoom (FreeFd = %d)\n",
FreeFd));
if (FreeFd <= 0 || nfile >= MAXFILES) {
LruDelete(VfdCache[0].lruMoreRecently);
}
}
static File
AllocateVfd()
{
Index i;
File file;
DO_DB(printf("DEBUG: AllocateVfd\n"));
if (SizeVfdCache == 0) {
/* initialize */
		VfdCache = (Vfd *)malloc(sizeof(Vfd));
		Assert(VfdCache != NULL);
VfdCache->nextFree = 0;
VfdCache->lruMoreRecently = 0;
VfdCache->lruLessRecently = 0;
VfdCache->fd = VFD_CLOSED;
VfdCache->fdstate = 0x0;
SizeVfdCache = 1;
}
if (VfdCache[0].nextFree == 0) {
/*
* The free list is empty so it is time to increase the
* size of the array
*/
VfdCache =(Vfd *)realloc(VfdCache, sizeof(Vfd)*SizeVfdCache*2);
Assert(VfdCache != NULL);
/*
* Set up the free list for the new entries
*/
for (i = SizeVfdCache; i < 2*SizeVfdCache; i++) {
memset((char *) &(VfdCache[i]), 0, sizeof(VfdCache[0]));
VfdCache[i].nextFree = i+1;
VfdCache[i].fd = VFD_CLOSED;
}
/*
* Element 0 is the first and last element of the free
* list
*/
VfdCache[0].nextFree = SizeVfdCache;
VfdCache[2*SizeVfdCache-1].nextFree = 0;
/*
* Record the new size
*/
SizeVfdCache *= 2;
}
file = VfdCache[0].nextFree;
VfdCache[0].nextFree = VfdCache[file].nextFree;
return file;
}
static void
FreeVfd(File file)
{
DO_DB(printf("DB: FreeVfd: %d (%s)\n",
file, VfdCache[file].fileName));
VfdCache[file].nextFree = VfdCache[0].nextFree;
VfdCache[0].nextFree = file;
}
static char *
filepath(char *filename)
{
char *buf;
char basename[16];
int len;
#ifndef WIN32
if (*filename != Sep_char) {
#else
if (!(filename[1] == ':' && filename[2] == Sep_char)) {
#endif /* WIN32 */
/* Either /base/ or \base\ */
sprintf(basename, "%cbase%c", Sep_char, Sep_char);
len = strlen(DataDir) + strlen(basename) + strlen(GetDatabaseName())
+ strlen(filename) + 2;
buf = (char*) palloc(len);
sprintf(buf, "%s%s%s%c%s",
DataDir, basename, GetDatabaseName(), Sep_char, filename);
} else {
buf = (char *) palloc(strlen(filename) + 1);
strcpy(buf, filename);
}
return(buf);
}
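/*
 * Example (paths assumed for illustration): with DataDir
 * "/usr/local/pgsql/data" and database "test", filepath("pg_class")
 * yields "/usr/local/pgsql/data/base/test/pg_class", while an
 * absolute name such as "/tmp/foo" is simply copied.
 */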
static int
FileAccess(File file)
{
int returnValue;
DO_DB(printf("DB: FileAccess %d (%s)\n",
file, VfdCache[file].fileName));
/*
* Is the file open? If not, close the least recently used,
* then open it and stick it at the head of the used ring
*/
if (FileIsNotOpen(file)) {
AssertLruRoom();
returnValue = LruInsert(file);
if (returnValue != 0)
return returnValue;
} else {
/*
		 * We now know that the file is open, so we move it to the
		 * head of the Lru ring to mark it most recently used.
*/
Delete(file);
Insert(file);
}
return (0);
}
/*
* Called when we get a shared invalidation message on some relation.
*/
void
FileInvalidate(File file)
{
if (!FileIsNotOpen(file)) {
LruDelete(file);
}
}
/* VARARGS2 */
static File
fileNameOpenFile(FileName fileName,
int fileFlags,
int fileMode)
{
static int osRanOut = 0;
File file;
Vfd *vfdP;
int tmpfd;
DO_DB(printf("DEBUG: FileNameOpenFile: %s %x %o\n",
fileName, fileFlags, fileMode));
file = AllocateVfd();
vfdP = &VfdCache[file];
if (nfile >= MAXFILES || (FreeFd == 0 && osRanOut)) {
AssertLruRoom();
}
tryAgain:
tmpfd = open(Nulldev, O_CREAT|O_RDWR, 0666);
if (tmpfd < 0) {
DO_DB(printf("DB: not enough descs, retry, er= %d\n",
errno));
errno = 0;
FreeFd = 0;
osRanOut = 1;
AssertLruRoom();
goto tryAgain;
} else {
close(tmpfd);
}
#ifdef WIN32
fileFlags |= _O_BINARY;
#endif /* WIN32 */
vfdP->fd = open(fileName,fileFlags,fileMode);
vfdP->fdstate = 0x0;
if (vfdP->fd < 0) {
FreeVfd(file);
return -1;
}
++nfile;
DO_DB(printf("DB: FNOF success %d\n",
vfdP->fd));
(void)LruInsert(file);
vfdP->fileName = malloc(strlen(fileName)+1);
strcpy(vfdP->fileName,fileName);
vfdP->fileFlags = fileFlags & ~(O_TRUNC|O_EXCL);
vfdP->fileMode = fileMode;
vfdP->seekPos = 0;
return file;
}
/*
* open a file in the database directory ($PGDATA/base/...)
*/
File
FileNameOpenFile(FileName fileName, int fileFlags, int fileMode)
{
File fd;
char *fname;
fname = filepath(fileName);
fd = fileNameOpenFile(fname, fileFlags, fileMode);
pfree(fname);
return(fd);
}
/*
* open a file in an arbitrary directory
*/
File
PathNameOpenFile(FileName fileName, int fileFlags, int fileMode)
{
return(fileNameOpenFile(fileName, fileFlags, fileMode));
}
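/*
 * Minimal usage sketch of the virtual file descriptor interface,
 * compiled out with #if 0.  The file name is assumed for
 * illustration; a File is an index into VfdCache, not an OS
 * descriptor, so callers may hold many more of these open than
 * MAXFILES real descriptors.
 */
#if 0
static void
vfd_example()
{
	File f;
	char buf[128];

	f = FileNameOpenFile("vfd_example", O_RDWR|O_CREAT, 0600);
	if (f < 0)
		elog(WARN, "vfd_example: open failed");
	memset(buf, 0, sizeof(buf));
	(void) FileWrite(f, buf, sizeof(buf));
	(void) FileSeek(f, 0L, SEEK_SET);
	(void) FileRead(f, buf, sizeof(buf));
	FileClose(f);
}
#endif /* 0 */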
void
FileClose(File file)
{
int returnValue;
DO_DB(printf("DEBUG: FileClose: %d (%s)\n",
file, VfdCache[file].fileName));
if (!FileIsNotOpen(file)) {
/* remove the file from the lru ring */
Delete(file);
/* record the new free operating system file descriptor */
FreeFd++;
/* if we did any writes, sync the file before closing */
if (VfdCache[file].fdstate & FD_DIRTY) {
returnValue = fsync(VfdCache[file].fd);
Assert(returnValue != -1);
VfdCache[file].fdstate &= ~FD_DIRTY;
}
/* close the file */
returnValue = close(VfdCache[file].fd);
Assert(returnValue != -1);
--nfile;
VfdCache[file].fd = VFD_CLOSED;
}
/*
* Add the Vfd slot to the free list
*/
FreeVfd(file);
/*
* Free the filename string
*/
free(VfdCache[file].fileName);
}
void
FileUnlink(File file)
{
int returnValue;
DO_DB(printf("DB: FileClose: %d (%s)\n",
file, VfdCache[file].fileName));
if (!FileIsNotOpen(file)) {
/* remove the file from the lru ring */
Delete(file);
/* record the new free operating system file descriptor */
FreeFd++;
/* if we did any writes, sync the file before closing */
if (VfdCache[file].fdstate & FD_DIRTY) {
returnValue = fsync(VfdCache[file].fd);
Assert(returnValue != -1);
VfdCache[file].fdstate &= ~FD_DIRTY;
}
/* close the file */
returnValue = close(VfdCache[file].fd);
Assert(returnValue != -1);
--nfile;
VfdCache[file].fd = VFD_CLOSED;
}
/* add the Vfd slot to the free list */
FreeVfd(file);
	/* unlink the physical file, then free the filename string */
unlink(VfdCache[file].fileName);
free(VfdCache[file].fileName);
}
int
FileRead(File file, char *buffer, int amount)
{
int returnCode;
DO_DB(printf("DEBUG: FileRead: %d (%s) %d 0x%x\n",
file, VfdCache[file].fileName, amount, buffer));
FileAccess(file);
returnCode = read(VfdCache[file].fd, buffer, amount);
if (returnCode > 0) {
VfdCache[file].seekPos += returnCode;
}
return returnCode;
}
int
FileWrite(File file, char *buffer, int amount)
{
int returnCode;
DO_DB(printf("DB: FileWrite: %d (%s) %d 0x%lx\n",
file, VfdCache[file].fileName, amount, buffer));
FileAccess(file);
returnCode = write(VfdCache[file].fd, buffer, amount);
if (returnCode > 0) { /* changed by Boris with Mao's advice */
VfdCache[file].seekPos += returnCode;
}
/* record the write */
VfdCache[file].fdstate |= FD_DIRTY;
return returnCode;
}
long
FileSeek(File file, long offset, int whence)
{
	long returnCode;
DO_DB(printf("DEBUG: FileSeek: %d (%s) %d %d\n",
file, VfdCache[file].fileName, offset, whence));
if (FileIsNotOpen(file)) {
switch(whence) {
case SEEK_SET:
VfdCache[file].seekPos = offset;
return offset;
case SEEK_CUR:
			VfdCache[file].seekPos += offset;
return VfdCache[file].seekPos;
case SEEK_END:
FileAccess(file);
returnCode = VfdCache[file].seekPos =
lseek(VfdCache[file].fd, offset, whence);
return returnCode;
default:
elog(WARN, "FileSeek: invalid whence: %d", whence);
break;
}
} else {
returnCode = VfdCache[file].seekPos =
lseek(VfdCache[file].fd, offset, whence);
return returnCode;
}
/*NOTREACHED*/
return(-1L);
}
/*
* XXX not actually used but here for completeness
*/
long
FileTell(File file)
{
DO_DB(printf("DEBUG: FileTell %d (%s)\n",
file, VfdCache[file].fileName));
return VfdCache[file].seekPos;
}
int
FileTruncate(File file, int offset)
{
int returnCode;
DO_DB(printf("DEBUG: FileTruncate %d (%s)\n",
file, VfdCache[file].fileName));
(void) FileSync(file);
(void) FileAccess(file);
returnCode = ftruncate(VfdCache[file].fd, offset);
return(returnCode);
}
int
FileSync(File file)
{
int returnCode;
/*
* If the file isn't open, then we don't need to sync it; we
* always sync files when we close them. Also, if we haven't
* done any writes that we haven't already synced, we can ignore
* the request.
*/
if (VfdCache[file].fd < 0 || !(VfdCache[file].fdstate & FD_DIRTY)) {
returnCode = 0;
} else {
returnCode = fsync(VfdCache[file].fd);
VfdCache[file].fdstate &= ~FD_DIRTY;
}
return returnCode;
}
int
FileNameUnlink(char *filename)
{
int retval;
char *fname;
fname = filepath(filename);
retval = unlink(fname);
pfree(fname);
return(retval);
}
/*
* if we want to be sure that we have a real file descriptor available
* (e.g., we want to know this in psort) we call AllocateFile to force
* availability. when we are done we call FreeFile to deallocate the
* descriptor.
*
* allocatedFiles keeps track of how many have been allocated so we
* can give a warning if there are too few left.
*/
static int allocatedFiles = 0;
void
AllocateFile()
{
int fd;
int fdleft;
while ((fd = open(Nulldev,O_WRONLY,0)) < 0) {
if (errno == EMFILE) {
errno = 0;
FreeFd = 0;
AssertLruRoom();
} else {
elog(WARN,"Open: %s in %s line %d\n", Nulldev,
__FILE__, __LINE__);
}
}
close(fd);
++allocatedFiles;
fdleft = MAXFILES - allocatedFiles;
if (fdleft < 6) {
elog(DEBUG,"warning: few usable file descriptors left (%d)", fdleft);
}
DO_DB(printf("DEBUG: AllocatedFile. FreeFd = %d\n",
FreeFd));
}
/*
* XXX What happens if FreeFile() is called without a previous
* AllocateFile()?
*/
void
FreeFile()
{
DO_DB(printf("DEBUG: FreeFile. FreeFd now %d\n",
FreeFd));
FreeFd++;
nfile++; /* dangerous */
Assert(allocatedFiles > 0);
--allocatedFiles;
}
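/*
 * Sketch of the intended AllocateFile/FreeFile pairing, compiled out
 * with #if 0.  Code that needs a real descriptor (e.g., for stdio)
 * reserves one first and releases it after closing; the scratch file
 * name is assumed for illustration.
 */
#if 0
static void
stdio_example()
{
	FILE *fp;

	AllocateFile();			/* force a real descriptor free */
	fp = fopen("/tmp/pg_scratch", "w");
	if (fp != NULL) {
		fputs("spill data\n", fp);
		fclose(fp);
	}
	FreeFile();			/* give the descriptor back */
}
#endif /* 0 */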
void
closeAllVfds()
{
int i;
for (i=0; i<SizeVfdCache; i++) {
if (!FileIsNotOpen(i))
LruDelete(i);
}
}
void
closeOneVfd()
{
int tmpfd;
tmpfd = open(Nulldev, O_CREAT | O_RDWR, 0666);
if (tmpfd < 0) {
FreeFd = 0;
AssertLruRoom();
FreeFd = 0;
}
else
close(tmpfd);
}

285
src/backend/storage/ipc.h Normal file
View File

@@ -0,0 +1,285 @@
/*-------------------------------------------------------------------------
*
* ipc.h--
* POSTGRES inter-process communication definitions.
*
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: ipc.h,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $
*
* NOTES
* This file is very architecture-specific. This stuff should actually
* be factored into the port/ directories.
*
*-------------------------------------------------------------------------
*/
#ifndef IPC_H
#define IPC_H
#include <sys/types.h>
#ifndef _IPC_
#define _IPC_
#include <sys/ipc.h>
#endif
#include "c.h"
/*
* Many architectures have support for user-level spinlocks (i.e., an
* atomic test-and-set instruction). However, we have only written
* spinlock code for the architectures listed.
*/
#if defined(PORTNAME_aix) || \
defined(PORTNAME_alpha) || \
defined(PORTNAME_hpux) || \
defined(PORTNAME_irix5) || \
defined(PORTNAME_next) || \
defined(PORTNAME_sparc) || \
defined(PORTNAME_sparc_solaris) || \
(defined(__i386__) && defined(__GNUC__))
#define HAS_TEST_AND_SET
#endif
#if defined(HAS_TEST_AND_SET)
#if defined(PORTNAME_next)
/*
* Use Mach mutex routines since these are, in effect, test-and-set
* spinlocks.
*/
#undef NEVER /* definition in cthreads.h conflicts with parse.h */
#include <mach/cthreads.h>
typedef struct mutex slock_t;
#else /* next */
#if defined(PORTNAME_aix)
/*
* The AIX C library has the cs(3) builtin for compare-and-set that
* operates on ints.
*/
typedef unsigned int slock_t;
#else /* aix */
#if defined(PORTNAME_alpha)
#include <sys/mman.h>
typedef msemaphore slock_t;
#else /* alpha */
#if defined(PORTNAME_hpux)
/*
* The PA-RISC "semaphore" for the LDWCX instruction is 4 bytes aligned
* to a 16-byte boundary.
*/
typedef struct { int sem[4]; } slock_t;
#else /* hpux */
#if defined(PORTNAME_irix5)
#include <abi_mutex.h>
typedef abilock_t slock_t;
#else /* irix5 */
/*
* On all other architectures spinlocks are a single byte.
*/
typedef unsigned char slock_t;
#endif /* irix5 */
#endif /* hpux */
#endif /* alpha */
#endif /* aix */
#endif /* next */
extern void S_LOCK(slock_t *lock);
extern void S_UNLOCK(slock_t *lock);
extern void S_INIT_LOCK(slock_t *lock);
#if defined(PORTNAME_hpux) || defined(PORTNAME_alpha) || defined(PORTNAME_irix5) || defined(PORTNAME_next)
extern int S_LOCK_FREE(slock_t *lock);
#else /* PORTNAME_hpux */
#define S_LOCK_FREE(lock) ((*lock) == 0)
#endif /* PORTNAME_hpux */
#endif /* HAS_TEST_AND_SET */
/*
* On architectures for which we have not implemented spinlocks (or
* cannot do so), we use System V semaphores. We also use them for
* long locks. For some reason union semun is never defined in the
* System V header files so we must do it ourselves.
*/
#if defined(sequent) || \
defined(PORTNAME_aix) || \
defined(PORTNAME_alpha) || \
defined(PORTNAME_hpux) || \
defined(PORTNAME_sparc_solaris) || \
defined(WIN32) || \
defined(PORTNAME_ultrix4)
union semun {
int val;
struct semid_ds *buf;
unsigned short *array;
};
#endif
typedef uint16 SystemPortAddress;
/* semaphore definitions */
#define IPCProtection (0600) /* access/modify by user only */
#define IPC_NMAXSEM 25 /* maximum number of semaphores */
#define IpcSemaphoreDefaultStartValue 255
#define IpcSharedLock (-1)
#define IpcExclusiveLock (-255)
#define IpcUnknownStatus (-1)
#define IpcInvalidArgument (-2)
#define IpcSemIdExist (-3)
#define IpcSemIdNotExist (-4)
typedef uint32 IpcSemaphoreKey; /* semaphore key */
typedef int IpcSemaphoreId;
/* shared memory definitions */
#define IpcMemCreationFailed (-1)
#define IpcMemIdGetFailed (-2)
#define IpcMemAttachFailed 0
typedef uint32 IPCKey;
#define PrivateIPCKey IPC_PRIVATE
#define DefaultIPCKey 17317
typedef uint32 IpcMemoryKey; /* shared memory key */
typedef int IpcMemoryId;
/* ipc.c */
extern void exitpg(int code);
extern void quasi_exitpg(void);
extern int on_exitpg(void (*function)(), caddr_t arg);
extern IpcSemaphoreId IpcSemaphoreCreate(IpcSemaphoreKey semKey,
int semNum, int permission, int semStartValue,
int removeOnExit, int *status);
extern void IpcSemaphoreSet(int semId, int semno, int value);
extern void IpcSemaphoreKill(IpcSemaphoreKey key);
extern void IpcSemaphoreLock(IpcSemaphoreId semId, int sem, int lock);
extern void IpcSemaphoreUnlock(IpcSemaphoreId semId, int sem, int lock);
extern int IpcSemaphoreGetCount(IpcSemaphoreId semId, int sem);
extern int IpcSemaphoreGetValue(IpcSemaphoreId semId, int sem);
extern IpcMemoryId IpcMemoryCreate(IpcMemoryKey memKey, uint32 size,
int permission);
extern IpcMemoryId IpcMemoryIdGet(IpcMemoryKey memKey, uint32 size);
extern void IpcMemoryDetach(int status, char *shmaddr);
extern char *IpcMemoryAttach(IpcMemoryId memId);
extern void IpcMemoryKill(IpcMemoryKey memKey);
extern void CreateAndInitSLockMemory(IPCKey key);
extern void AttachSLockMemory(IPCKey key);
#ifdef HAS_TEST_AND_SET
#define NSLOCKS 2048
#define NOLOCK 0
#define SHAREDLOCK 1
#define EXCLUSIVELOCK 2
typedef enum _LockId_ {
BUFMGRLOCKID,
LOCKLOCKID,
OIDGENLOCKID,
SHMEMLOCKID,
BINDINGLOCKID,
LOCKMGRLOCKID,
SINVALLOCKID,
#ifdef MAIN_MEMORY
MMCACHELOCKID,
#endif /* MAIN_MEMORY */
PROCSTRUCTLOCKID,
FIRSTFREELOCKID
} _LockId_;
#define MAX_SPINS FIRSTFREELOCKID
typedef struct slock {
slock_t locklock;
unsigned char flag;
short nshlocks;
slock_t shlock;
slock_t exlock;
slock_t comlock;
struct slock *next;
} SLock;
extern void ExclusiveLock(int lockid);
extern void ExclusiveUnlock(int lockid);
extern bool LockIsFree(int lockid);
#else /* HAS_TEST_AND_SET */
typedef enum _LockId_ {
SHMEMLOCKID,
BINDINGLOCKID,
BUFMGRLOCKID,
LOCKMGRLOCKID,
SINVALLOCKID,
#ifdef MAIN_MEMORY
MMCACHELOCKID,
#endif /* MAIN_MEMORY */
PROCSTRUCTLOCKID,
OIDGENLOCKID,
FIRSTFREELOCKID
} _LockId_;
#define MAX_SPINS FIRSTFREELOCKID
#endif /* HAS_TEST_AND_SET */
/*
* the following are originally in ipci.h but the prototypes have circular
* dependencies and most files include both ipci.h and ipc.h anyway, hence
* combined.
*
*/
/*
* Note:
* These must not hash to DefaultIPCKey or PrivateIPCKey.
*/
#define SystemPortAddressGetIPCKey(address) \
(28597 * (address) + 17491)
/*
* these keys are originally numbered from 1 to 12 consecutively but not
* all are used. The unused ones are removed. - ay 4/95.
*/
#define IPCKeyGetBufferMemoryKey(key) \
((key == PrivateIPCKey) ? key : 1 + (key))
#define IPCKeyGetSIBufferMemoryBlock(key) \
((key == PrivateIPCKey) ? key : 7 + (key))
#define IPCKeyGetSLockSharedMemoryKey(key) \
((key == PrivateIPCKey) ? key : 10 + (key))
#define IPCKeyGetSpinLockSemaphoreKey(key) \
((key == PrivateIPCKey) ? key : 11 + (key))
#define IPCKeyGetWaitIOSemaphoreKey(key) \
((key == PrivateIPCKey) ? key : 12 + (key))
/* --------------------------
* NOTE: This macro must always give the highest numbered key as every backend
* process forked off by the postmaster will be trying to acquire a semaphore
* with a unique key value starting at key+14 and incrementing up. Each
* backend uses the current key value then increments it by one.
* --------------------------
*/
#define IPCGetProcessSemaphoreInitKey(key) \
((key == PrivateIPCKey) ? key : 14 + (key))
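/*
 * For example, with DefaultIPCKey 17317: the buffer pool segment gets
 * key 17318, the SI buffer 17324, the slock segment 17327, the
 * spinlock semaphores 17328, the WaitIO semaphore 17329, and the
 * per-backend process semaphores start at 17331 and count upward.
 */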
/* ipci.c */
extern IPCKey SystemPortAddressCreateIPCKey(SystemPortAddress address);
extern void CreateSharedMemoryAndSemaphores(IPCKey key);
extern void AttachSharedMemoryAndSemaphores(IPCKey key);
#endif /* IPC_H */

15
src/backend/storage/ipc/Makefile.inc Normal file
View File

@@ -0,0 +1,15 @@
#-------------------------------------------------------------------------
#
# Makefile.inc--
# Makefile for storage/ipc
#
# Copyright (c) 1994, Regents of the University of California
#
#
# IDENTIFICATION
# $Header: /cvsroot/pgsql/src/backend/storage/ipc/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $
#
#-------------------------------------------------------------------------
SUBSRCS+= ipc.c ipci.c s_lock.c shmem.c shmqueue.c sinval.c \
sinvaladt.c spin.c

31
src/backend/storage/ipc/README Normal file
View File

@@ -0,0 +1,31 @@
$Header: /cvsroot/pgsql/src/backend/storage/ipc/README,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $
Mon Jul 18 11:09:22 PDT 1988 W.KLAS
Cache invalidation synchronization routines:
===========================================
The cache synchronization is done using a message queue. Every
backend can register a message which then has to be read by
all backends. A message read by all backends is removed from the
queue automatically. If a message has been lost because the buffer
was full, all backends that haven't read this message will be
notified that they have to reset their cache state. This is done
at the time when they try to read the message queue.
The message queue is implemented as a shared buffer segment. The
queue is circular, to allow fast insertion, reading (of invalidation
data), and maintenance of the buffer.
Access to this shared message buffer is synchronized by the lock manager.
The lock manager treats the buffer as a regular relation and sets
relation level locks (with mode = LockWait) to block backends while
another backend is writing or reading the buffer. The identifiers used
for this special 'relation' are database id = 0 and relation id = 0.
The current implementation prints regular (e)log information
when a message has been removed from the buffer because the buffer
is full, and a backend has to reset its cache state. The elog level
is NOTICE. This can be used to improve the behavior of backends
when invalidating or resetting their cache state.
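A minimal sketch of the bookkeeping (illustrative only -- the names,
sizes, and MAXBACKENDS below are not the actual sinvaladt.c
structures):

    typedef struct {
        int maxMsgs;                 /* capacity of the circular buffer */
        int msgsInserted;            /* total messages ever inserted */
        int msgsRead[MAXBACKENDS];   /* per-backend count of messages read */
    } SISketch;

    /* backend b has lost messages, and must reset its cache state,
       when msgsInserted - msgsRead[b] > maxMsgs */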

718
src/backend/storage/ipc/ipc.c Normal file
View File

@@ -0,0 +1,718 @@
/*-------------------------------------------------------------------------
*
* ipc.c--
* POSTGRES inter-process communication definitions.
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/ipc/ipc.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $
*
* NOTES
*
* Currently, semaphores are used (my understanding anyway) in two
* different ways:
* 1. as mutexes on machines that don't have test-and-set (eg.
* mips R3000).
* 2. for putting processes to sleep when waiting on a lock
* and waking them up when the lock is free.
* The number of semaphores in (1) is fixed and those are shared
* among all backends. In (2), there is 1 semaphore per process and those
* are not shared with anyone else.
* -ay 4/95
*
*-------------------------------------------------------------------------
*/
#include <sys/types.h>
#include <sys/file.h>
#include <stdio.h>
#include <errno.h>
/* XXX - the following dependency should be moved into the defaults.mk file */
#ifndef _IPC_
#define _IPC_
#include <sys/ipc.h>
#include <sys/sem.h>
#include <sys/shm.h>
#endif
#include "storage/ipc.h"
#include "utils/memutils.h"
#include "utils/elog.h"
#if defined(PORTNAME_bsd44)
int UsePrivateMemory = 1;
#else
int UsePrivateMemory = 0;
#endif
#if defined(PORTNAME_bsdi)
/* hacka, hacka, hacka (XXX) */
union semun {
int val; /* value for SETVAL */
struct semid_ds *buf; /* buffer for IPC_STAT & IPC_SET */
ushort *array; /* array for GETALL & SETALL */
};
#endif
/* ----------------------------------------------------------------
* exit() handling stuff
* ----------------------------------------------------------------
*/
#define MAX_ON_EXITS 20
static struct ONEXIT {
void (*function)();
caddr_t arg;
} onexit_list[ MAX_ON_EXITS ];
static int onexit_index;
typedef struct _PrivateMemStruct {
int id;
char *memptr;
} PrivateMem;
PrivateMem IpcPrivateMem[16];
static int
PrivateMemoryCreate(IpcMemoryKey memKey,
uint32 size)
{
static int memid = 0;
UsePrivateMemory = 1;
IpcPrivateMem[memid].id = memid;
IpcPrivateMem[memid].memptr = malloc(size);
if (IpcPrivateMem[memid].memptr == NULL)
elog(WARN, "PrivateMemoryCreate: not enough memory to malloc");
memset(IpcPrivateMem[memid].memptr, 0, size); /* XXX PURIFY */
return (memid++);
}
static char *
PrivateMemoryAttach(IpcMemoryId memid)
{
return ( IpcPrivateMem[memid].memptr );
}
/* ----------------------------------------------------------------
* exitpg
*
* this function calls all the callbacks registered
* for it (to free resources) and then calls exit.
* This should be the only function to call exit().
* -cim 2/6/90
* ----------------------------------------------------------------
*/
static int exitpg_inprogress = 0;
void
exitpg(int code)
{
int i;
/* ----------------
	 * if exitpg_inprogress is true, then it means that we
* are being invoked from within an on_exit() handler
* and so we return immediately to avoid recursion.
* ----------------
*/
if (exitpg_inprogress)
return;
exitpg_inprogress = 1;
/* ----------------
* call all the callbacks registered before calling exit().
* ----------------
*/
for (i = onexit_index - 1; i >= 0; --i)
(*onexit_list[i].function)(code, onexit_list[i].arg);
exit(code);
}
/* ------------------
* Run all of the on_exitpg routines but don't exit in the end.
* This is used by the postmaster to re-initialize shared memory and
* semaphores after a backend dies horribly
* ------------------
*/
void
quasi_exitpg()
{
int i;
/* ----------------
	 * if exitpg_inprogress is true, then it means that we
* are being invoked from within an on_exit() handler
* and so we return immediately to avoid recursion.
* ----------------
*/
if (exitpg_inprogress)
return;
exitpg_inprogress = 1;
/* ----------------
	 * call all the callbacks registered, but don't call exit().
* ----------------
*/
for (i = onexit_index - 1; i >= 0; --i)
(*onexit_list[i].function)(0, onexit_list[i].arg);
onexit_index = 0;
exitpg_inprogress = 0;
}
/* ----------------------------------------------------------------
* on_exitpg
*
* this function adds a callback function to the list of
* functions invoked by exitpg(). -cim 2/6/90
* ----------------------------------------------------------------
*/
int
on_exitpg(void (*function)(), caddr_t arg)
{
if (onexit_index >= MAX_ON_EXITS)
return(-1);
onexit_list[ onexit_index ].function = function;
onexit_list[ onexit_index ].arg = arg;
++onexit_index;
return(0);
}
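/*
 * Usage sketch, compiled out with #if 0: callbacks run in reverse
 * order of registration, each called with the exit code and its
 * registered argument.  The callback and argument are assumed for
 * illustration.
 */
#if 0
static void
say_goodbye(int code, caddr_t arg)
{
	fprintf(stderr, "exiting (%d): %s\n", code, (char *) arg);
}

static void
exitpg_example()
{
	(void) on_exitpg(say_goodbye, (caddr_t) "cleanup");
	exitpg(0);	/* runs say_goodbye(0, "cleanup"), then exit(0) */
}
#endif /* 0 */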
/****************************************************************************/
/* IPCPrivateSemaphoreKill(status, semId) */
/* */
/****************************************************************************/
static void
IPCPrivateSemaphoreKill(int status,
int semId) /* caddr_t */
{
union semun semun;
semctl(semId, 0, IPC_RMID, semun);
}
/****************************************************************************/
/* IPCPrivateMemoryKill(status, shmId) */
/* */
/****************************************************************************/
static void
IPCPrivateMemoryKill(int status,
int shmId) /* caddr_t */
{
if ( UsePrivateMemory ) {
/* free ( IpcPrivateMem[shmId].memptr ); */
} else {
if (shmctl(shmId, IPC_RMID, (struct shmid_ds *) NULL) < 0) {
elog(NOTICE, "IPCPrivateMemoryKill: shmctl(%d, %d, 0) failed: %m",
shmId, IPC_RMID);
}
}
}
/****************************************************************************/
/* IpcSemaphoreCreate(semKey, semNum, permission, semStartValue) */
/* */
/* - returns a semaphore identifier: */
/* */
/* if key doesn't exist: return a new id, status:= IpcSemIdNotExist */
/* if key exists: return the old id, status:= IpcSemIdExist */
/* if semNum > MAX : return # of argument, status:=IpcInvalidArgument */
/* */
/****************************************************************************/
/*
* Note:
* XXX This should be split into two different calls. One should
* XXX be used to create a semaphore set. The other to "attach" a
* XXX existing set. It should be an error for the semaphore set
 * XXX	to already exist or for it not to, respectively.
*
* Currently, the semaphore sets are "attached" and an error
* is detected only when a later shared memory attach fails.
*/
IpcSemaphoreId
IpcSemaphoreCreate(IpcSemaphoreKey semKey,
int semNum,
int permission,
int semStartValue,
int removeOnExit,
int *status)
{
int i;
int errStatus;
int semId;
u_short array[IPC_NMAXSEM];
union semun semun;
/* get a semaphore if non-existent */
/* check arguments */
if (semNum > IPC_NMAXSEM || semNum <= 0) {
*status = IpcInvalidArgument;
return(2); /* returns the number of the invalid argument */
}
semId = semget(semKey, 0, 0);
if (semId == -1) {
*status = IpcSemIdNotExist; /* there doesn't exist a semaphore */
#ifdef DEBUG_IPC
fprintf(stderr,"calling semget with %d, %d , %d\n",
semKey,
semNum,
IPC_CREAT|permission );
#endif
semId = semget(semKey, semNum, IPC_CREAT|permission);
if (semId < 0) {
perror("semget");
exitpg(3);
}
for (i = 0; i < semNum; i++) {
array[i] = semStartValue;
}
semun.array = array;
errStatus = semctl(semId, 0, SETALL, semun);
if (errStatus == -1) {
perror("semctl");
}
if (removeOnExit)
on_exitpg(IPCPrivateSemaphoreKill, (caddr_t)semId);
} else {
/* there is a semaphore id for this key */
*status = IpcSemIdExist;
}
#ifdef DEBUG_IPC
fprintf(stderr,"\nIpcSemaphoreCreate, status %d, returns %d\n",
*status,
semId );
fflush(stdout);
fflush(stderr);
#endif
return(semId);
}
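/*
 * Usage sketch, compiled out with #if 0 (the key is assumed for
 * illustration): create a one-semaphore set, then take and release a
 * shared lock on it.  Note that the "lock" arguments are the semop
 * increments, so IpcSharedLock (-1) decrements on lock and
 * IpcSemaphoreUnlock negates it again.
 */
#if 0
static void
sem_example()
{
	int status;
	IpcSemaphoreId id;

	id = IpcSemaphoreCreate((IpcSemaphoreKey) 17999, 1, IPCProtection,
				IpcSemaphoreDefaultStartValue, 1, &status);
	IpcSemaphoreLock(id, 0, IpcSharedLock);
	IpcSemaphoreUnlock(id, 0, IpcSharedLock);
}
#endif /* 0 */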
/****************************************************************************/
/* IpcSemaphoreSet() - sets the initial value of the semaphore */
/* */
/* note: the xxx_return variables are only used for debugging. */
/****************************************************************************/
static int IpcSemaphoreSet_return;
void
IpcSemaphoreSet(int semId, int semno, int value)
{
int errStatus;
union semun semun;
semun.val = value;
errStatus = semctl(semId, semno, SETVAL, semun);
IpcSemaphoreSet_return = errStatus;
if (errStatus == -1)
perror("semctl");
}
/****************************************************************************/
/* IpcSemaphoreKill(key) - removes a semaphore */
/* */
/****************************************************************************/
void
IpcSemaphoreKill(IpcSemaphoreKey key)
{
int semId;
union semun semun;
/* kill semaphore if existent */
semId = semget(key, 0, 0);
if (semId != -1)
semctl(semId, 0, IPC_RMID, semun);
}
/****************************************************************************/
/* IpcSemaphoreLock(semId, sem, lock) - locks a semaphore */
/* */
/* note: the xxx_return variables are only used for debugging. */
/****************************************************************************/
static int IpcSemaphoreLock_return;
void
IpcSemaphoreLock(IpcSemaphoreId semId, int sem, int lock)
{
extern int errno;
int errStatus;
struct sembuf sops;
sops.sem_op = lock;
sops.sem_flg = 0;
sops.sem_num = sem;
/* ----------------
* Note: if errStatus is -1 and errno == EINTR then it means we
* returned from the operation prematurely because we were
* sent a signal. So we try and lock the semaphore again.
* I am not certain this is correct, but the semantics aren't
	 * clear; in any case it fixes problems with parallel abort synchronization,
* namely that after processing an abort signal, the semaphore
* call returns with -1 (and errno == EINTR) before it should.
* -cim 3/28/90
* ----------------
*/
do {
errStatus = semop(semId, &sops, 1);
} while (errStatus == -1 && errno == EINTR);
IpcSemaphoreLock_return = errStatus;
if (errStatus == -1) {
perror("semop");
exitpg(255);
}
}
/****************************************************************************/
/* IpcSemaphoreUnlock(semId, sem, lock) - unlocks a semaphore */
/* */
/* note: the xxx_return variables are only used for debugging. */
/****************************************************************************/
static int IpcSemaphoreUnlock_return;
void
IpcSemaphoreUnlock(IpcSemaphoreId semId, int sem, int lock)
{
extern int errno;
int errStatus;
struct sembuf sops;
sops.sem_op = -lock;
sops.sem_flg = 0;
sops.sem_num = sem;
/* ----------------
* Note: if errStatus is -1 and errno == EINTR then it means we
* returned from the operation prematurely because we were
	 * sent a signal.  So we try and unlock the semaphore again.
	 * I am not certain this is correct, but the semantics aren't
	 * clear; in any case it fixes problems with parallel abort synchronization,
* namely that after processing an abort signal, the semaphore
* call returns with -1 (and errno == EINTR) before it should.
* -cim 3/28/90
* ----------------
*/
do {
errStatus = semop(semId, &sops, 1);
} while (errStatus == -1 && errno == EINTR);
IpcSemaphoreUnlock_return = errStatus;
if (errStatus == -1) {
perror("semop");
exitpg(255);
}
}
int
IpcSemaphoreGetCount(IpcSemaphoreId semId, int sem)
{
int semncnt;
union semun dummy; /* for Solaris */
semncnt = semctl(semId, sem, GETNCNT, dummy);
return semncnt;
}
int
IpcSemaphoreGetValue(IpcSemaphoreId semId, int sem)
{
int semval;
union semun dummy; /* for Solaris */
semval = semctl(semId, sem, GETVAL, dummy);
return semval;
}
/****************************************************************************/
/* IpcMemoryCreate(memKey) */
/* */
/* - returns the memory identifier, if creation succeeds */
/* returns IpcMemCreationFailed, if failure */
/****************************************************************************/
IpcMemoryId
IpcMemoryCreate(IpcMemoryKey memKey, uint32 size, int permission)
{
IpcMemoryId shmid;
if (memKey == PrivateIPCKey) {
/* private */
shmid = PrivateMemoryCreate(memKey, size);
}else {
shmid = shmget(memKey, size, IPC_CREAT|permission);
}
if (shmid < 0) {
fprintf(stderr,"IpcMemoryCreate: memKey=%d , size=%d , permission=%d",
memKey, size , permission );
perror("IpcMemoryCreate: shmget(..., create, ...) failed");
return(IpcMemCreationFailed);
}
/* if (memKey == PrivateIPCKey) */
on_exitpg(IPCPrivateMemoryKill, (caddr_t)shmid);
return(shmid);
}
/****************************************************************************/
/* IpcMemoryIdGet(memKey, size) returns the shared memory Id */
/* or IpcMemIdGetFailed */
/****************************************************************************/
IpcMemoryId
IpcMemoryIdGet(IpcMemoryKey memKey, uint32 size)
{
IpcMemoryId shmid;
shmid = shmget(memKey, size, 0);
if (shmid < 0) {
fprintf(stderr,"IpcMemoryIdGet: memKey=%d , size=%d , permission=%d",
memKey, size , 0 );
perror("IpcMemoryIdGet: shmget() failed");
return(IpcMemIdGetFailed);
}
return(shmid);
}
/****************************************************************************/
/* IpcMemoryDetach(status, shmaddr) removes a shared memory segment */
/* from a backend address space */
/* (only called by backends running under the postmaster) */
/****************************************************************************/
void
IpcMemoryDetach(int status, char *shmaddr)
{
if (shmdt(shmaddr) < 0) {
elog(NOTICE, "IpcMemoryDetach: shmdt(0x%x): %m", shmaddr);
}
}
/****************************************************************************/
/* IpcMemoryAttach(memId) returns the address of shared memory		*/
/* or IpcMemAttachFailed */
/* */
/* CALL IT: addr = (struct <MemoryStructure> *) IpcMemoryAttach(memId); */
/* */
/****************************************************************************/
char *
IpcMemoryAttach(IpcMemoryId memId)
{
char *memAddress;
if (UsePrivateMemory) {
memAddress = (char *) PrivateMemoryAttach(memId);
} else {
memAddress = (char *) shmat(memId, 0, 0);
}
/* if ( *memAddress == -1) { XXX ??? */
if ( memAddress == (char *)-1) {
perror("IpcMemoryAttach: shmat() failed");
return(IpcMemAttachFailed);
}
if (!UsePrivateMemory)
on_exitpg(IpcMemoryDetach, (caddr_t) memAddress);
return((char *) memAddress);
}
/****************************************************************************/
/* IpcMemoryKill(memKey) removes a shared memory segment */
/* (only called by the postmaster and standalone backends) */
/****************************************************************************/
void
IpcMemoryKill(IpcMemoryKey memKey)
{
IpcMemoryId shmid;
if (!UsePrivateMemory && (shmid = shmget(memKey, 0, 0)) >= 0) {
if (shmctl(shmid, IPC_RMID, (struct shmid_ds *) NULL) < 0) {
elog(NOTICE, "IpcMemoryKill: shmctl(%d, %d, 0) failed: %m",
shmid, IPC_RMID);
}
}
}
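/*
 * Usage sketch, compiled out with #if 0 (the key and size are assumed
 * for illustration): create a segment, attach it, then remove it.
 */
#if 0
static void
shmem_example()
{
	IpcMemoryId id;
	char *addr;

	id = IpcMemoryCreate((IpcMemoryKey) 17998, 1024, IPCProtection);
	addr = IpcMemoryAttach(id);
	if (addr != IpcMemAttachFailed)
		addr[0] = 'x';
	IpcMemoryKill((IpcMemoryKey) 17998);
}
#endif /* 0 */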
#ifdef HAS_TEST_AND_SET
/* ------------------
* use hardware locks to replace semaphores for sequent machines
* to avoid costs of swapping processes and to provide unlimited
* supply of locks.
* ------------------
*/
static SLock *SLockArray = NULL;
static SLock **FreeSLockPP;
static int *UnusedSLockIP;
static slock_t *SLockMemoryLock;
static IpcMemoryId SLockMemoryId = -1;
struct ipcdummy { /* to get alignment/size right */
SLock *free;
int unused;
slock_t memlock;
SLock slocks[NSLOCKS];
};
static int SLockMemorySize = sizeof(struct ipcdummy);
void
CreateAndInitSLockMemory(IPCKey key)
{
int id;
SLock *slckP;
SLockMemoryId = IpcMemoryCreate(key,
SLockMemorySize,
0700);
AttachSLockMemory(key);
*FreeSLockPP = NULL;
*UnusedSLockIP = (int)FIRSTFREELOCKID;
for (id=0; id<(int)FIRSTFREELOCKID; id++) {
slckP = &(SLockArray[id]);
S_INIT_LOCK(&(slckP->locklock));
slckP->flag = NOLOCK;
slckP->nshlocks = 0;
S_INIT_LOCK(&(slckP->shlock));
S_INIT_LOCK(&(slckP->exlock));
S_INIT_LOCK(&(slckP->comlock));
slckP->next = NULL;
}
return;
}
void
AttachSLockMemory(IPCKey key)
{
struct ipcdummy *slockM;
if (SLockMemoryId == -1)
SLockMemoryId = IpcMemoryIdGet(key,SLockMemorySize);
if (SLockMemoryId == -1)
elog(FATAL, "SLockMemory not in shared memory");
slockM = (struct ipcdummy *) IpcMemoryAttach(SLockMemoryId);
if (slockM == IpcMemAttachFailed)
elog(FATAL, "AttachSLockMemory: could not attach segment");
FreeSLockPP = (SLock **) &(slockM->free);
UnusedSLockIP = (int *) &(slockM->unused);
SLockMemoryLock = (slock_t *) &(slockM->memlock);
S_INIT_LOCK(SLockMemoryLock);
SLockArray = (SLock *) &(slockM->slocks[0]);
return;
}
#ifdef LOCKDEBUG
#define PRINT_LOCK(LOCK) printf("(locklock = %d, flag = %d, nshlocks = %d, \
shlock = %d, exlock =%d)\n", LOCK->locklock, \
LOCK->flag, LOCK->nshlocks, LOCK->shlock, \
LOCK->exlock)
#endif
void
ExclusiveLock(int lockid)
{
SLock *slckP;
slckP = &(SLockArray[lockid]);
#ifdef LOCKDEBUG
printf("ExclusiveLock(%d)\n", lockid);
printf("IN: ");
PRINT_LOCK(slckP);
#endif
ex_try_again:
S_LOCK(&(slckP->locklock));
switch (slckP->flag) {
case NOLOCK:
slckP->flag = EXCLUSIVELOCK;
S_LOCK(&(slckP->exlock));
S_LOCK(&(slckP->shlock));
S_UNLOCK(&(slckP->locklock));
#ifdef LOCKDEBUG
printf("OUT: ");
PRINT_LOCK(slckP);
#endif
return;
case SHAREDLOCK:
case EXCLUSIVELOCK:
S_UNLOCK(&(slckP->locklock));
S_LOCK(&(slckP->exlock));
S_UNLOCK(&(slckP->exlock));
goto ex_try_again;
}
}
void
ExclusiveUnlock(int lockid)
{
SLock *slckP;
slckP = &(SLockArray[lockid]);
#ifdef LOCKDEBUG
printf("ExclusiveUnlock(%d)\n", lockid);
printf("IN: ");
PRINT_LOCK(slckP);
#endif
S_LOCK(&(slckP->locklock));
/* -------------
* give favor to read processes
* -------------
*/
slckP->flag = NOLOCK;
if (slckP->nshlocks > 0) {
while (slckP->nshlocks > 0) {
S_UNLOCK(&(slckP->shlock));
S_LOCK(&(slckP->comlock));
}
S_UNLOCK(&(slckP->shlock));
}
else {
S_UNLOCK(&(slckP->shlock));
}
S_UNLOCK(&(slckP->exlock));
S_UNLOCK(&(slckP->locklock));
#ifdef LOCKDEBUG
printf("OUT: ");
PRINT_LOCK(slckP);
#endif
return;
}
bool
LockIsFree(int lockid)
{
return(SLockArray[lockid].flag == NOLOCK);
}
#endif /* HAS_TEST_AND_SET */

149
src/backend/storage/ipc/ipci.c Normal file
View File

@@ -0,0 +1,149 @@
/*-------------------------------------------------------------------------
*
* ipci.c--
* POSTGRES inter-process communication initialization code.
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/ipc/ipci.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#include "c.h"
#include "storage/ipc.h"
#include "storage/multilev.h"
#include "utils/elog.h"
#include "storage/sinval.h"
#include "storage/bufmgr.h"
#include "storage/proc.h"
#include "storage/smgr.h"
#include "storage/lock.h"
#include "miscadmin.h" /* for DebugLvl */
/*
* SystemPortAddressCreateMemoryKey --
* Returns a memory key given a port address.
*/
IPCKey
SystemPortAddressCreateIPCKey(SystemPortAddress address)
{
Assert(address < 32768); /* XXX */
return (SystemPortAddressGetIPCKey(address));
}
/*
* CreateSharedMemoryAndSemaphores --
* Creates and initializes shared memory and semaphores.
*/
/**************************************************
CreateSharedMemoryAndSemaphores
is called exactly *ONCE* by the postmaster.
It is *NEVER* called by the postgres backend
0) destroy any existing semaphores for both buffer
and lock managers.
1) create the appropriate *SHARED* memory segments
for the two resource managers.
**************************************************/
void
CreateSharedMemoryAndSemaphores(IPCKey key)
{
int size;
#ifdef HAS_TEST_AND_SET
/* ---------------
* create shared memory for slocks
* --------------
*/
CreateAndInitSLockMemory(IPCKeyGetSLockSharedMemoryKey(key));
#endif
/* ----------------
* kill and create the buffer manager buffer pool (and semaphore)
* ----------------
*/
CreateSpinlocks(IPCKeyGetSpinLockSemaphoreKey(key));
size = BufferShmemSize() + LockShmemSize();
#ifdef MAIN_MEMORY
size += MMShmemSize();
#endif /* MAIN_MEMORY */
if (DebugLvl > 1) {
fprintf(stderr, "binding ShmemCreate(key=%x, size=%d)\n",
IPCKeyGetBufferMemoryKey(key), size);
}
ShmemCreate(IPCKeyGetBufferMemoryKey(key), size);
ShmemBindingTabReset();
InitShmem(key, size);
InitBufferPool(key);
/* ----------------
* do the lock table stuff
* ----------------
*/
	InitLocks();
	if (InitMultiLevelLockm() == INVALID_TABLEID)
		elog(FATAL, "Couldn't create the lock table");
/* ----------------
* do process table stuff
* ----------------
*/
InitProcGlobal(key);
on_exitpg(ProcFreeAllSemaphores, 0);
CreateSharedInvalidationState(key);
}
/*
* AttachSharedMemoryAndSemaphores --
 *	Attaches to existing shared memory and semaphores.
*/
void
AttachSharedMemoryAndSemaphores(IPCKey key)
{
int size;
/* ----------------
* create rather than attach if using private key
* ----------------
*/
if (key == PrivateIPCKey) {
CreateSharedMemoryAndSemaphores(key);
return;
}
#ifdef HAS_TEST_AND_SET
/* ----------------
* attach the slock shared memory
* ----------------
*/
AttachSLockMemory(IPCKeyGetSLockSharedMemoryKey(key));
#endif
/* ----------------
* attach the buffer manager buffer pool (and semaphore)
* ----------------
*/
size = BufferShmemSize() + LockShmemSize();
InitShmem(key, size);
InitBufferPool(key);
/* ----------------
* initialize lock table stuff
* ----------------
*/
InitLocks();
if (InitMultiLevelLockm() == INVALID_TABLEID)
elog(FATAL, "Couldn't attach to the lock table");
AttachSharedInvalidationState(key);
}

440
src/backend/storage/ipc/s_lock.c Normal file
View File

@@ -0,0 +1,440 @@
/*-------------------------------------------------------------------------
*
* s_lock.c--
* This file contains the implementation (if any) for spinlocks.
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/ipc/Attic/s_lock.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
/*
* DESCRIPTION
* The following code fragment should be written (in assembly
* language) on machines that have a native test-and-set instruction:
*
* void
* S_LOCK(char_address)
* char *char_address;
* {
* while (test_and_set(char_address))
* ;
* }
*
* If this is not done, POSTGRES will default to using System V
* semaphores (and take a large performance hit -- around 40% of
* its time on a DS5000/240 is spent in semop(3)...).
*
* NOTES
* AIX has a test-and-set but the recommended interface is the cs(3)
* system call. This provides an 8-instruction (plus system call
* overhead) uninterruptible compare-and-set operation. True
* spinlocks might be faster but using cs(3) still speeds up the
* regression test suite by about 25%. I don't have an assembler
* manual for POWER in any case.
*
*/
#ifdef WIN32
#include <windows.h>
#endif /* WIN32 */
#include "storage/ipc.h"
#if defined(HAS_TEST_AND_SET)
#if defined (PORTNAME_next)
/*
* NEXTSTEP (mach)
* slock_t is defined as a struct mutex.
*/
void
S_LOCK(slock_t *lock)
{
mutex_lock(lock);
}
void
S_UNLOCK(slock_t *lock)
{
mutex_unlock(lock);
}
void
S_INIT_LOCK(slock_t *lock)
{
mutex_init(lock);
}
/* S_LOCK_FREE should return 1 if lock is free; 0 if lock is locked */
int
S_LOCK_FREE(slock_t *lock)
{
/* For Mach, we have to delve inside the entrails of `struct
mutex'. Ick! */
return (lock->lock == 0);
}
#endif /* PORTNAME_next */
#if defined(PORTNAME_irix5)
/*
* SGI IRIX 5
* slock_t is defined as a struct abilock_t, which has a single unsigned long
* member.
*
* This stuff may be supplemented in the future with Masato Kataoka's MIPS-II
* assembly from his NECEWS SVR4 port, but we probably ought to retain this
* for the R3000 chips out there.
*/
void
S_LOCK(slock_t *lock)
{
/* spin_lock(lock); */
while (!acquire_lock(lock))
;
}
void
S_UNLOCK(slock_t *lock)
{
(void)release_lock(lock);
}
void
S_INIT_LOCK(slock_t *lock)
{
(void)init_lock(lock);
}
/* S_LOCK_FREE should return 1 if lock is free; 0 if lock is locked */
int
S_LOCK_FREE(slock_t *lock)
{
return(stat_lock(lock)==UNLOCKED);
}
#endif /* PORTNAME_irix5 */
/*
* OSF/1 (Alpha AXP)
*
* Note that slock_t on the Alpha AXP is msemaphore instead of char
* (see storage/ipc.h).
*/
#if defined(PORTNAME_alpha)
void
S_LOCK(slock_t *lock)
{
while (msem_lock(lock, MSEM_IF_NOWAIT) < 0)
;
}
void
S_UNLOCK(slock_t *lock)
{
(void) msem_unlock(lock, 0);
}
void
S_INIT_LOCK(slock_t *lock)
{
(void) msem_init(lock, MSEM_UNLOCKED);
}
int
S_LOCK_FREE(slock_t *lock)
{
return(lock->msem_state ? 0 : 1);
}
#endif /* PORTNAME_alpha */
/*
* Solaris 2
*/
#if defined(PORTNAME_sparc_solaris)
/* defined in port/.../tas.s */
extern int tas(slock_t *lock);
void
S_LOCK(slock_t *lock)
{
while (tas(lock))
;
}
void
S_UNLOCK(slock_t *lock)
{
*lock = 0;
}
void
S_INIT_LOCK(slock_t *lock)
{
S_UNLOCK(lock);
}
#endif /* PORTNAME_sparc_solaris */
/*
* AIX (POWER)
*
* Note that slock_t on POWER/POWER2/PowerPC is int instead of char
* (see storage/ipc.h).
*/
#if defined(PORTNAME_aix)
void
S_LOCK(slock_t *lock)
{
while (cs((int *) lock, 0, 1))
;
}
void
S_UNLOCK(slock_t *lock)
{
*lock = 0;
}
void
S_INIT_LOCK(slock_t *lock)
{
S_UNLOCK(lock);
}
#endif /* PORTNAME_aix */
/*
* HP-UX (PA-RISC)
*
* Note that slock_t on PA-RISC is a structure instead of char
* (see storage/ipc.h).
*/
#if defined(PORTNAME_hpux)
/* defined in port/.../tas.s */
extern int tas(slock_t *lock);
/*
* a "set" slock_t has a single word cleared. a "clear" slock_t has
* all words set to non-zero.
*/
static slock_t clear_lock = { -1, -1, -1, -1 };
void
S_LOCK(slock_t *lock)
{
while (tas(lock))
;
}
void
S_UNLOCK(slock_t *lock)
{
*lock = clear_lock; /* struct assignment */
}
void
S_INIT_LOCK(slock_t *lock)
{
S_UNLOCK(lock);
}
int
S_LOCK_FREE(slock_t *lock)
{
register int *lock_word = (int *) (((long) lock + 15) & ~15);
return(*lock_word != 0);
}
#endif /* PORTNAME_hpux */
/*
* sun3
*/
#if (defined(sun) && ! defined(sparc))
void
S_LOCK(slock_t *lock)
{
while (tas(lock));
}
void
S_UNLOCK(slock_t *lock)
{
*lock = 0;
}
void
S_INIT_LOCK(slock_t *lock)
{
S_UNLOCK(lock);
}
static int
tas_dummy()
{
asm("LLA0:");
asm(" .data");
asm(" .text");
asm("|#PROC# 04");
asm(" .globl _tas");
asm("_tas:");
asm("|#PROLOGUE# 1");
asm(" movel sp@(0x4),a0");
asm(" tas a0@");
asm(" beq LLA1");
asm(" moveq #-128,d0");
asm(" rts");
asm("LLA1:");
asm(" moveq #0,d0");
asm(" rts");
asm(" .data");
}
#endif
/*
* SPARC (SunOS 4)
*/
#if defined(PORTNAME_sparc)
/* if we're using -ansi w/ gcc, use __asm__ instead of asm */
#if defined(__STRICT_ANSI__)
#define asm(x) __asm__(x)
#endif
static int
tas_dummy()
{
asm(".seg \"data\"");
asm(".seg \"text\"");
asm(".global _tas");
asm("_tas:");
/*
* Sparc atomic test and set (sparc calls it "atomic load-store")
*/
asm("ldstub [%r8], %r8");
/*
* Did test and set actually do the set?
*/
asm("tst %r8");
asm("be,a ReturnZero");
/*
* otherwise, just return.
*/
asm("clr %r8");
asm("mov 0x1, %r8");
asm("ReturnZero:");
asm("retl");
asm("nop");
}
void
S_LOCK(unsigned char *addr)
{
while (tas(addr));
}
/*
* addr should be as in the above S_LOCK routine
*/
void
S_UNLOCK(unsigned char *addr)
{
*addr = 0;
}
void
S_INIT_LOCK(unsigned char *addr)
{
*addr = 0;
}
#endif /* PORTNAME_sparc */
/*
* Linux and friends
*/
#if defined(PORTNAME_linux) || defined(PORTNAME_BSD44_derived)
int
tas(slock_t *m)
{
slock_t res;
__asm__("xchgb %0,%1":"=q" (res),"=m" (*m):"0" (0x1));
return(res);
}
void
S_LOCK(slock_t *lock)
{
while (tas(lock))
;
}
void
S_UNLOCK(slock_t *lock)
{
*lock = 0;
}
void
S_INIT_LOCK(slock_t *lock)
{
S_UNLOCK(lock);
}
#endif /* PORTNAME_linux || PORTNAME_BSD44_derived */
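/*
 * Usage sketch, compiled out with #if 0: whatever the port-specific
 * implementation, callers see the same three-call protocol --
 * initialize once, then bracket very short critical sections.
 */
#if 0
static slock_t example_lock;

static void
slock_example()
{
	S_INIT_LOCK(&example_lock);
	S_LOCK(&example_lock);
	/* ... a few instructions of critical section ... */
	S_UNLOCK(&example_lock);
}
#endif /* 0 */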
#endif /* HAS_TEST_AND_SET */
#ifdef WIN32
void
S_LOCK(HANDLE *lock)
{
int x = 0;
x = x / x;
}
void
S_UNLOCK(HANDLE *lock)
{
int x = 0;
x = x / x;
}
void
S_INIT_LOCK(HANDLE *lock)
{
int x = 0;
x = x / x;
}
#endif /*WIN32*/

561
src/backend/storage/ipc/shmem.c Normal file
View File

@@ -0,0 +1,561 @@
/*-------------------------------------------------------------------------
*
* shmem.c--
* create shared memory and initialize shared memory data structures.
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/ipc/shmem.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
/*
* POSTGRES processes share one or more regions of shared memory.
* The shared memory is created by a postmaster and is "attached to"
* by each of the backends. The routines in this file are used for
* allocating and binding to shared memory data structures.
*
* NOTES:
* (a) There are three kinds of shared memory data structures
* available to POSTGRES: fixed-size structures, queues and hash
* tables. Fixed-size structures contain things like global variables
* for a module and should never be allocated after the process
* initialization phase. Hash tables have a fixed maximum size, but
* their actual size can vary dynamically. When entries are added
* to the table, more space is allocated. Queues link data structures
* that have been allocated either as fixed size structures or as hash
* buckets. Each shared data structure has a string name to identify
* it (assigned in the module that declares it).
*
* (b) During initialization, each module looks for its
* shared data structures in a hash table called the "Binding Table".
* If the data structure is not present, the caller can allocate
* a new one and initialize it. If the data structure is present,
* the caller "attaches" to the structure by initializing a pointer
* in the local address space.
* The binding table has two purposes: first, it gives us
* a simple model of how the world looks when a backend process
* initializes. If something is present in the binding table,
* it is initialized. If it is not, it is uninitialized. Second,
* the binding table allows us to allocate shared memory on demand
* instead of trying to preallocate structures and hard-wire the
* sizes and locations in header files. If you are using a lot
* of shared memory in a lot of different places (and changing
* things during development), this is important.
*
* (c) memory allocation model: shared memory can never be
* freed, once allocated. Each hash table has its own free list,
* so hash buckets can be reused when an item is deleted. However,
* if one hash table grows very large and then shrinks, its space
* cannot be redistributed to other tables. We could build a simple
* hash bucket garbage collector if need be. Right now, it seems
* unnecessary.
*
* See InitSem() in sem.c for an example of how to use the
* binding table.
*
*/
#include <stdio.h>
#include <string.h>
#include "postgres.h"
#include "storage/ipc.h"
#include "storage/shmem.h"
#include "storage/spin.h"
#include "utils/hsearch.h"
#include "utils/elog.h"
/* shared memory global variables */
unsigned long ShmemBase = 0;		/* start address of shared memory */
static unsigned long ShmemEnd = 0;
static unsigned long ShmemSize = 0; /* current size (and default) */
SPINLOCK ShmemLock; /* lock for shared memory allocation */
SPINLOCK BindingLock; /* lock for binding table access */
static unsigned long *ShmemFreeStart = NULL; /* pointer to the OFFSET of
* first free shared memory
*/
static unsigned long *ShmemBindingTabOffset = NULL; /* start of the binding
* table (for bootstrap)
*/
static int ShmemBootstrap = FALSE; /* flag becomes true when shared mem
* is created by POSTMASTER
*/
static HTAB *BindingTable = NULL;
/* ---------------------
* ShmemBindingTabReset() - Resets the binding table to NULL....
* useful when the postmaster destroys existing shared memory
* and creates all new segments after a backend crash.
* ----------------------
*/
void
ShmemBindingTabReset()
{
BindingTable = (HTAB *)NULL;
}
/*
* CreateSharedRegion() --
*
* This routine is called once by the postmaster to
* initialize the shared buffer pool. Assume there is
* only one postmaster so no synchronization is necessary
* until after this routine completes successfully.
*
* key is a unique identifier for the shmem region.
* size is the size of the region.
*/
static IpcMemoryId ShmemId;
void
ShmemCreate(unsigned int key, unsigned int size)
{
if (size)
ShmemSize = size;
/* create shared mem region */
if ((ShmemId=IpcMemoryCreate(key,ShmemSize,IPCProtection))
==IpcMemCreationFailed) {
elog(FATAL,"ShmemCreate: cannot create region");
exit(1);
}
/* ShmemBootstrap is true if shared memory has been
* created, but not yet initialized. Only the
* postmaster/creator-of-all-things should have
* this flag set.
*/
ShmemBootstrap = TRUE;
}
/*
* InitShmem() -- map region into process address space
* and initialize shared data structures.
*
*/
int
InitShmem(unsigned int key, unsigned int size)
{
Pointer sharedRegion;
unsigned long currFreeSpace;
HASHCTL info;
int hash_flags;
BindingEnt * result,item;
bool found;
IpcMemoryId shmid;
/* if zero key, use default memory size */
if (size)
ShmemSize = size;
/* default key is 0 */
/* attach to shared memory region (SysV or BSD OS specific) */
if (ShmemBootstrap && key == PrivateIPCKey)
/* if we are running backend alone */
shmid = ShmemId;
else
shmid = IpcMemoryIdGet(IPCKeyGetBufferMemoryKey(key), ShmemSize);
sharedRegion = IpcMemoryAttach(shmid);
if (sharedRegion == NULL) {
elog(FATAL,"AttachSharedRegion: couldn't attach to shmem\n");
return(FALSE);
}
/* get pointers to the dimensions of shared memory */
ShmemBase = (unsigned long) sharedRegion;
ShmemEnd = (unsigned long) sharedRegion + ShmemSize;
currFreeSpace = 0;
/* First long in shared memory is the count of available space */
ShmemFreeStart = (unsigned long *) ShmemBase;
/* next is a shmem pointer to the binding table */
ShmemBindingTabOffset = ShmemFreeStart + 1;
currFreeSpace +=
sizeof(ShmemFreeStart) + sizeof(ShmemBindingTabOffset);
/* bootstrap initialize spin locks so we can start to use the
* allocator and binding table.
*/
if (! InitSpinLocks(ShmemBootstrap, IPCKeyGetSpinLockSemaphoreKey(key))) {
return(FALSE);
}
/* We have just allocated additional space for two spinlocks.
* Now setup the global free space count
*/
if (ShmemBootstrap) {
*ShmemFreeStart = currFreeSpace;
}
	/* if the free space count is still zero, the allocator won't work */
Assert(*ShmemFreeStart);
/* create OR attach to the shared memory binding table */
info.keysize = BTABLE_KEYSIZE;
info.datasize = BTABLE_DATASIZE;
hash_flags = (HASH_ELEM);
/* This will acquire the binding table lock, but not release it. */
BindingTable = ShmemInitHash("BindingTable",
BTABLE_SIZE,BTABLE_SIZE,
&info,hash_flags);
if (! BindingTable) {
elog(FATAL,"InitShmem: couldn't initialize Binding Table");
return(FALSE);
}
/* Now, check the binding table for an entry to the binding
* table. If there is an entry there, someone else created
* the table. Otherwise, we did and we have to initialize it.
*/
memset(item.key, 0, BTABLE_KEYSIZE);
strncpy(item.key,"BindingTable",BTABLE_KEYSIZE);
result = (BindingEnt *)
hash_search(BindingTable,(char *) &item,HASH_ENTER, &found);
if (! result ) {
elog(FATAL,"InitShmem: corrupted binding table");
return(FALSE);
}
if (! found) {
/* bootstrapping shmem: we have to initialize the
* binding table now.
*/
Assert(ShmemBootstrap);
result->location = MAKE_OFFSET(BindingTable->hctl);
*ShmemBindingTabOffset = result->location;
result->size = BTABLE_SIZE;
ShmemBootstrap = FALSE;
} else {
Assert(! ShmemBootstrap);
}
	/* now release the lock acquired in ShmemInitHash */
SpinRelease (BindingLock);
Assert (result->location == MAKE_OFFSET(BindingTable->hctl));
return(TRUE);
}
/*
* ShmemAlloc -- allocate word-aligned byte string from
* shared memory
*
* Assumes ShmemLock and ShmemFreeStart are initialized.
* Returns: real pointer to memory or NULL if we are out
* of space. Has to return a real pointer in order
 *	to be compatible with malloc().
*/
long *
ShmemAlloc(unsigned long size)
{
unsigned long tmpFree;
long *newSpace;
/*
* ensure space is word aligned.
*
* Word-alignment is not good enough. We have to be more
* conservative: doubles need 8-byte alignment. (We probably only need
* this on RISC platforms but this is not a big waste of space.)
* - ay 12/94
*/
if (size % sizeof(double))
size += sizeof(double) - (size % sizeof(double));
Assert(*ShmemFreeStart);
SpinAcquire(ShmemLock);
tmpFree = *ShmemFreeStart + size;
if (tmpFree <= ShmemSize) {
newSpace = (long *)MAKE_PTR(*ShmemFreeStart);
*ShmemFreeStart += size;
} else {
newSpace = NULL;
}
SpinRelease(ShmemLock);
if (! newSpace) {
elog(NOTICE,"ShmemAlloc: out of memory ");
}
return(newSpace);
}
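/*
 * Usage sketch (the struct name is hypothetical): callers request raw
 * bytes and cast the result.
 *
 *	MyShared *ptr = (MyShared *) ShmemAlloc(sizeof(MyShared));
 *	if (ptr == NULL)
 *		elog(NOTICE, "could not allocate MyShared");
 *
 * The requested size is first rounded up to a multiple of
 * sizeof(double), so successive allocations stay 8-byte aligned.
 */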
/*
* ShmemIsValid -- test if an offset refers to valid shared memory
*
* Returns TRUE if the pointer is valid.
*/
int
ShmemIsValid(unsigned long addr)
{
return ((addr<ShmemEnd) && (addr>=ShmemBase));
}
/*
* ShmemInitHash -- Create/Attach to and initialize
* shared memory hash table.
*
* Notes:
*
* assume caller is doing some kind of synchronization
 * so that two people don't try to create/initialize the
* table at once. Use SpinAlloc() to create a spinlock
* for the structure before creating the structure itself.
*/
HTAB *
ShmemInitHash(char *name, /* table string name for binding */
long init_size, /* initial size */
long max_size, /* max size of the table */
HASHCTL *infoP, /* info about key and bucket size */
int hash_flags) /* info about infoP */
{
bool found;
long * location;
/* shared memory hash tables have a fixed max size so that the
* control structures don't try to grow. The segbase is for
* calculating pointer values. The shared memory allocator
* must be specified.
*/
infoP->segbase = (long *) ShmemBase;
infoP->alloc = ShmemAlloc;
infoP->max_size = max_size;
hash_flags |= HASH_SHARED_MEM;
/* look it up in the binding table */
location =
ShmemInitStruct(name,my_log2(max_size) + sizeof(HHDR),&found);
/* binding table is corrupted. Let someone else give the
* error message since they have more information
*/
if (location == NULL) {
return(0);
}
/* it already exists, attach to it rather than allocate and
* initialize new space
*/
if (found) {
hash_flags |= HASH_ATTACH;
}
/* these structures were allocated or bound in ShmemInitStruct */
/* control information and parameters */
infoP->hctl = (long *) location;
/* directory for hash lookup */
infoP->dir = (long *) (location + sizeof(HHDR));
  return(hash_create(init_size, infoP, hash_flags));
}
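/*
 * Usage sketch, modeled on the BindingTable setup in InitShmem above
 * (the table name and the size constants are hypothetical):
 *
 *	HASHCTL info;
 *	HTAB *table;
 *
 *	info.keysize = MY_KEYSIZE;
 *	info.datasize = MY_DATASIZE;
 *	table = ShmemInitHash("MyTable", MY_SIZE, MY_SIZE,
 *			      &info, HASH_ELEM);
 *	if (! table)
 *		elog(FATAL, "could not initialize MyTable");
 */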
/*
* ShmemPIDLookup -- lookup process data structure using process id
*
* Returns: TRUE if no error. locationPtr is initialized if PID is
* found in the binding table.
*
* NOTES:
 *	the only indication of whether the PID was already registered
 *	is the value left in locationPtr.
*/
bool
ShmemPIDLookup(int pid, SHMEM_OFFSET* locationPtr)
{
BindingEnt * result,item;
bool found;
Assert (BindingTable);
memset(item.key, 0, BTABLE_KEYSIZE);
sprintf(item.key,"PID %d",pid);
SpinAcquire(BindingLock);
result = (BindingEnt *)
hash_search(BindingTable,(char *) &item, HASH_ENTER, &found);
if (! result) {
SpinRelease(BindingLock);
elog(WARN,"ShmemInitPID: BindingTable corrupted");
return(FALSE);
}
if (found) {
*locationPtr = result->location;
} else {
result->location = *locationPtr;
}
SpinRelease(BindingLock);
return (TRUE);
}
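/*
 * Usage sketch (hypothetical names; myStruct is assumed to live in
 * shared memory).  On entry *locationPtr holds the offset to register;
 * if the PID is already present, on exit it holds the offset that was
 * registered earlier.
 *
 *	SHMEM_OFFSET location = MAKE_OFFSET(myStruct);
 *	(void) ShmemPIDLookup(myPid, &location);
 *
 * and later, from another process:
 *
 *	SHMEM_OFFSET found = INVALID_OFFSET;
 *	if (ShmemPIDLookup(otherPid, &found) && found != INVALID_OFFSET)
 *		otherStruct = (MyStruct *) MAKE_PTR(found);
 */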
/*
* ShmemPIDDestroy -- destroy binding table entry for process
* using process id
*
* Returns: offset of the process struct in shared memory or
* INVALID_OFFSET if not found.
*
* Side Effect: removes the entry from the binding table
*/
SHMEM_OFFSET
ShmemPIDDestroy(int pid)
{
BindingEnt * result,item;
bool found;
SHMEM_OFFSET location;
Assert(BindingTable);
memset(item.key, 0, BTABLE_KEYSIZE);
sprintf(item.key,"PID %d",pid);
SpinAcquire(BindingLock);
result = (BindingEnt *)
hash_search(BindingTable,(char *) &item, HASH_REMOVE, &found);
if (found)
location = result->location;
SpinRelease(BindingLock);
if (! result) {
elog(WARN,"ShmemPIDDestroy: PID table corrupted");
return(INVALID_OFFSET);
}
if (found)
return (location);
else {
return(INVALID_OFFSET);
}
}
/*
* ShmemInitStruct -- Create/attach to a structure in shared
* memory.
*
* This is called during initialization to find or allocate
* a data structure in shared memory. If no other processes
* have created the structure, this routine allocates space
* for it. If it exists already, a pointer to the existing
* table is returned.
*
* Returns: real pointer to the object. FoundPtr is TRUE if
* the object is already in the binding table (hence, already
* initialized).
*/
long *
ShmemInitStruct(char *name, unsigned long size, bool *foundPtr)
{
BindingEnt * result,item;
long * structPtr;
strncpy(item.key,name,BTABLE_KEYSIZE);
item.location = BAD_LOCATION;
SpinAcquire(BindingLock);
if (! BindingTable) {
    /* Assert() is a macro now; it substitutes its argument inside
     * quotes, so keep the string in a variable. */
char *strname = "BindingTable";
    /* If the binding table doesn't exist, we fake it.
     *
     * If we are creating the first binding table, then let
     * ShmemAlloc() allocate the space for a new HTAB. Otherwise,
* find the old one and return that. Notice that the
* BindingLock is held until the binding table has been completely
* initialized.
*/
Assert (! strcmp(name,strname)) ;
if (ShmemBootstrap) {
/* in POSTMASTER/Single process */
*foundPtr = FALSE;
return((long *)ShmemAlloc(size));
} else {
Assert (ShmemBindingTabOffset);
*foundPtr = TRUE;
return((long *)MAKE_PTR(*ShmemBindingTabOffset));
}
} else {
    /* look it up in the binding table */
result = (BindingEnt *)
hash_search(BindingTable,(char *) &item,HASH_ENTER, foundPtr);
}
if (! result) {
SpinRelease(BindingLock);
elog(WARN,"ShmemInitStruct: Binding Table corrupted");
return(NULL);
} else if (*foundPtr) {
/*
* Structure is in the binding table so someone else has allocated
* it already. The size better be the same as the size we are
* trying to initialize to or there is a name conflict (or worse).
*/
if (result->size != size) {
SpinRelease(BindingLock);
elog(NOTICE,"ShmemInitStruct: BindingTable entry size is wrong");
/* let caller print its message too */
return(NULL);
}
structPtr = (long *)MAKE_PTR(result->location);
} else {
    /* It isn't in the table yet; allocate and initialize it */
structPtr = ShmemAlloc((long)size);
if (! structPtr) {
/* out of memory */
Assert (BindingTable);
(void) hash_search(BindingTable,(char *) &item,HASH_REMOVE, foundPtr);
SpinRelease(BindingLock);
*foundPtr = FALSE;
elog(NOTICE,"ShmemInitStruct: cannot allocate '%s'",
name);
return(NULL);
}
result->size = size;
result->location = MAKE_OFFSET(structPtr);
}
Assert (ShmemIsValid((unsigned long)structPtr));
SpinRelease(BindingLock);
return(structPtr);
}
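/*
 * Usage sketch of the create-or-attach pattern (hypothetical names).
 * Only the first process to arrive sees found == FALSE and must
 * initialize the contents:
 *
 *	bool found;
 *	MyShared *shared;
 *
 *	shared = (MyShared *) ShmemInitStruct("My Shared State",
 *					      sizeof(MyShared), &found);
 *	if (! shared)
 *		elog(FATAL, "could not find or allocate My Shared State");
 *	if (! found)
 *		shared->counter = 0;
 */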

View File

@@ -0,0 +1,251 @@
/*-------------------------------------------------------------------------
*
* shmqueue.c--
* shared memory linked lists
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/ipc/shmqueue.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $
*
* NOTES
*
* Package for managing doubly-linked lists in shared memory.
* The only tricky thing is that SHM_QUEUE will usually be a field
* in a larger record. SHMQueueGetFirst has to return a pointer
* to the record itself instead of a pointer to the SHMQueue field
* of the record. It takes an extra pointer and does some extra
* pointer arithmetic to do this correctly.
*
* NOTE: These are set up so they can be turned into macros some day.
*
*-------------------------------------------------------------------------
*/
#include <stdio.h> /* for sprintf() */
#include "postgres.h"
#include "storage/shmem.h" /* where the declarations go */
#include "utils/elog.h"
/*#define SHMQUEUE_DEBUG*/
#ifdef SHMQUEUE_DEBUG
#define SHMQUEUE_DEBUG_DEL /* deletions */
#define SHMQUEUE_DEBUG_HD /* head inserts */
#define SHMQUEUE_DEBUG_TL /* tail inserts */
#define SHMQUEUE_DEBUG_ELOG NOTICE
#endif /* SHMQUEUE_DEBUG */
/*
 * SHMQueueInit -- make the head of a new queue point
* to itself
*/
void
SHMQueueInit(SHM_QUEUE *queue)
{
Assert(SHM_PTR_VALID(queue));
(queue)->prev = (queue)->next = MAKE_OFFSET(queue);
}
/*
* SHMQueueIsDetached -- TRUE if element is not currently
* in a queue.
*/
bool
SHMQueueIsDetached(SHM_QUEUE *queue)
{
Assert(SHM_PTR_VALID(queue));
return ((queue)->prev == INVALID_OFFSET);
}
/*
* SHMQueueElemInit -- clear an element's links
*/
void
SHMQueueElemInit(SHM_QUEUE *queue)
{
Assert(SHM_PTR_VALID(queue));
(queue)->prev = (queue)->next = INVALID_OFFSET;
}
/*
* SHMQueueDelete -- remove an element from the queue and
* close the links
*/
void
SHMQueueDelete(SHM_QUEUE *queue)
{
SHM_QUEUE *nextElem = (SHM_QUEUE *) MAKE_PTR((queue)->next);
SHM_QUEUE *prevElem = (SHM_QUEUE *) MAKE_PTR((queue)->prev);
Assert(SHM_PTR_VALID(queue));
Assert(SHM_PTR_VALID(nextElem));
Assert(SHM_PTR_VALID(prevElem));
#ifdef SHMQUEUE_DEBUG_DEL
dumpQ(queue, "in SHMQueueDelete: begin");
#endif /* SHMQUEUE_DEBUG_DEL */
prevElem->next = (queue)->next;
nextElem->prev = (queue)->prev;
#ifdef SHMQUEUE_DEBUG_DEL
dumpQ((SHM_QUEUE *)MAKE_PTR(queue->prev), "in SHMQueueDelete: end");
#endif /* SHMQUEUE_DEBUG_DEL */
}
#ifdef SHMQUEUE_DEBUG
void
dumpQ(SHM_QUEUE *q, char *s)
{
char elem[16];
char buf[1024];
SHM_QUEUE *start = q;
int count = 0;
sprintf(buf, "q prevs: %x", MAKE_OFFSET(q));
q = (SHM_QUEUE *)MAKE_PTR(q->prev);
while (q != start)
{
sprintf(elem, "--->%x", MAKE_OFFSET(q));
strcat(buf, elem);
q = (SHM_QUEUE *)MAKE_PTR(q->prev);
if (q->prev == MAKE_OFFSET(q))
break;
if (count++ > 40)
{
strcat(buf, "BAD PREV QUEUE!!");
break;
}
}
sprintf(elem, "--->%x", MAKE_OFFSET(q));
strcat(buf, elem);
elog(SHMQUEUE_DEBUG_ELOG, "%s: %s", s, buf);
sprintf(buf, "q nexts: %x", MAKE_OFFSET(q));
count = 0;
q = (SHM_QUEUE *)MAKE_PTR(q->next);
while (q != start)
{
sprintf(elem, "--->%x", MAKE_OFFSET(q));
strcat(buf, elem);
q = (SHM_QUEUE *)MAKE_PTR(q->next);
if (q->next == MAKE_OFFSET(q))
break;
if (count++ > 10)
{
strcat(buf, "BAD NEXT QUEUE!!");
break;
}
}
sprintf(elem, "--->%x", MAKE_OFFSET(q));
strcat(buf, elem);
elog(SHMQUEUE_DEBUG_ELOG, "%s: %s", s, buf);
}
#endif /* SHMQUEUE_DEBUG */
/*
* SHMQueueInsertHD -- put elem in queue between the queue head
* and its "prev" element.
*/
void
SHMQueueInsertHD(SHM_QUEUE *queue, SHM_QUEUE *elem)
{
SHM_QUEUE *prevPtr = (SHM_QUEUE *) MAKE_PTR((queue)->prev);
SHMEM_OFFSET elemOffset = MAKE_OFFSET(elem);
Assert(SHM_PTR_VALID(queue));
Assert(SHM_PTR_VALID(elem));
#ifdef SHMQUEUE_DEBUG_HD
dumpQ(queue, "in SHMQueueInsertHD: begin");
#endif /* SHMQUEUE_DEBUG_HD */
(elem)->next = prevPtr->next;
(elem)->prev = queue->prev;
(queue)->prev = elemOffset;
prevPtr->next = elemOffset;
#ifdef SHMQUEUE_DEBUG_HD
dumpQ(queue, "in SHMQueueInsertHD: end");
#endif /* SHMQUEUE_DEBUG_HD */
}
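/*
 * SHMQueueInsertTL -- put elem in queue between the queue head
 *	and its "next" element.
 */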
void
SHMQueueInsertTL(SHM_QUEUE *queue, SHM_QUEUE *elem)
{
SHM_QUEUE *nextPtr = (SHM_QUEUE *) MAKE_PTR((queue)->next);
SHMEM_OFFSET elemOffset = MAKE_OFFSET(elem);
Assert(SHM_PTR_VALID(queue));
Assert(SHM_PTR_VALID(elem));
#ifdef SHMQUEUE_DEBUG_TL
dumpQ(queue, "in SHMQueueInsertTL: begin");
#endif /* SHMQUEUE_DEBUG_TL */
(elem)->prev = nextPtr->prev;
(elem)->next = queue->next;
(queue)->next = elemOffset;
nextPtr->prev = elemOffset;
#ifdef SHMQUEUE_DEBUG_TL
dumpQ(queue, "in SHMQueueInsertTL: end");
#endif /* SHMQUEUE_DEBUG_TL */
}
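/*
 * Usage sketch (hypothetical element type; the head and the elements
 * are all assumed to live in shared memory):
 *
 *	typedef struct {
 *		int		stuff;
 *		SHM_QUEUE	elem;
 *	} ELEMType;
 *
 *	SHM_QUEUE head;
 *	ELEMType *e1;
 *
 *	SHMQueueInit(&head);
 *	SHMQueueElemInit(&e1->elem);
 *	SHMQueueInsertTL(&head, &e1->elem);
 *	...
 *	SHMQueueDelete(&e1->elem);
 */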
/*
* SHMQueueFirst -- Get the first element from a queue
*
* First element is queue->next. If SHMQueue is part of
* a larger structure, we want to return a pointer to the
* whole structure rather than a pointer to its SHMQueue field.
* I.E. struct {
* int stuff;
* SHMQueue elem;
* } ELEMType;
 *	when this element is in a queue, (queue->next) points to struct.elem.
* nextQueue allows us to calculate the offset of the SHMQueue
* field in the structure.
*
 * A call to SHMQueueFirst should take these parameters:
*
* &(queueHead),&firstElem,&(firstElem->next)
*
 * Note that firstElem may well be uninitialized.  If firstElem
* is initially K, &(firstElem->next) will be K+ the offset to
* next.
*/
void
SHMQueueFirst(SHM_QUEUE *queue, Pointer *nextPtrPtr, SHM_QUEUE *nextQueue)
{
SHM_QUEUE *elemPtr = (SHM_QUEUE *) MAKE_PTR((queue)->next);
Assert(SHM_PTR_VALID(queue));
*nextPtrPtr = (Pointer) (((unsigned long) *nextPtrPtr) +
((unsigned long) elemPtr) - ((unsigned long) nextQueue));
  /*
    nextPtrPtr is a ptr to a structure linked in the queue;
    nextQueue is the SHMQueue field of that structure.
    *nextPtrPtr - nextQueue is 0 minus the offset of the queue
      field in the record.
    elemPtr + (*nextPtrPtr - nextQueue) is the start of the
      structure containing elemPtr.
   */
}
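/*
 * Usage sketch, following the calling convention described above
 * (ELEMType as sketched after SHMQueueInsertTL):
 *
 *	ELEMType *first;
 *
 *	SHMQueueFirst(&head, (Pointer *) &first, &first->elem);
 *
 * first need not point anywhere valid before the call; only the
 * constant difference between first and &first->elem enters the
 * arithmetic, so the result is a pointer to the enclosing ELEMType.
 */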
/*
* SHMQueueEmpty -- TRUE if queue head is only element, FALSE otherwise
*/
bool
SHMQueueEmpty(SHM_QUEUE *queue)
{
Assert(SHM_PTR_VALID(queue));
if (queue->prev == MAKE_OFFSET(queue))
{
	Assert(queue->next == MAKE_OFFSET(queue));
return(TRUE);
}
return(FALSE);
}

View File

@@ -0,0 +1,169 @@
/*-------------------------------------------------------------------------
*
* sinval.c--
* POSTGRES shared cache invalidation communication code.
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/ipc/sinval.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
/* #define INVALIDDEBUG 1 */
#include "postgres.h"
#include "storage/sinval.h"
#include "storage/sinvaladt.h"
#include "storage/spin.h"
#include "utils/elog.h"
extern SISeg *shmInvalBuffer;	/* the shared buffer segment,
				 * set by SISegmentAttach() */
extern BackendId MyBackendId;
extern BackendTag MyBackendTag;
SPINLOCK SInvalLock = (SPINLOCK) NULL;
/****************************************************************************/
/* CreateSharedInvalidationState(key) Create a buffer segment */
/* */
/* should be called only by the POSTMASTER */
/****************************************************************************/
void
CreateSharedInvalidationState(IPCKey key)
{
int status;
/* REMOVED
SISyncKill(IPCKeyGetSIBufferMemorySemaphoreKey(key));
SISyncInit(IPCKeyGetSIBufferMemorySemaphoreKey(key));
*/
/* SInvalLock gets set in spin.c, during spinlock init */
status = SISegmentInit(true, IPCKeyGetSIBufferMemoryBlock(key));
if (status == -1) {
elog(FATAL, "CreateSharedInvalidationState: failed segment init");
}
}
/****************************************************************************/
/* AttachSharedInvalidationState(key) Attach a buffer segment */
/* */
/* should be called only by the POSTMASTER */
/****************************************************************************/
void
AttachSharedInvalidationState(IPCKey key)
{
int status;
if (key == PrivateIPCKey) {
CreateSharedInvalidationState(key);
return;
}
/* SInvalLock gets set in spin.c, during spinlock init */
status = SISegmentInit(false, IPCKeyGetSIBufferMemoryBlock(key));
if (status == -1) {
elog(FATAL, "AttachSharedInvalidationState: failed segment init");
}
}
void
InitSharedInvalidationState()
{
SpinAcquire(SInvalLock);
if (!SIBackendInit(shmInvalBuffer))
{
SpinRelease(SInvalLock);
elog(FATAL, "Backend cache invalidation initialization failed");
}
SpinRelease(SInvalLock);
}
/*
* RegisterSharedInvalid --
 *	Adds an invalidation message to the shared buffer.
*
* Note:
* Assumes hash index is valid.
* Assumes item pointer is valid.
*/
/****************************************************************************/
/* RegisterSharedInvalid(cacheId, hashIndex, pointer) */
/* */
/* register a message in the buffer */
/* should be called by a backend */
/****************************************************************************/
void
RegisterSharedInvalid(int cacheId, /* XXX */
Index hashIndex,
ItemPointer pointer)
{
SharedInvalidData newInvalid;
/*
* This code has been hacked to accept two types of messages. This might
* be treated more generally in the future.
*
* (1)
* cacheId= system cache id
* hashIndex= system cache hash index for a (possibly) cached tuple
* pointer= pointer of (possibly) cached tuple
*
* (2)
* cacheId= special non-syscache id
* hashIndex= object id contained in (possibly) cached relation descriptor
* pointer= null
*/
newInvalid.cacheId = cacheId;
newInvalid.hashIndex = hashIndex;
if (ItemPointerIsValid(pointer)) {
ItemPointerCopy(pointer, &newInvalid.pointerData);
} else {
ItemPointerSetInvalid(&newInvalid.pointerData);
}
SpinAcquire(SInvalLock);
if (!SISetDataEntry(shmInvalBuffer, &newInvalid)) {
/* buffer full */
/* release a message, mark process cache states to be invalid */
SISetProcStateInvalid(shmInvalBuffer);
if (!SIDelDataEntry(shmInvalBuffer)) {
	    /* inconsistent buffer state -- should never happen */
SpinRelease(SInvalLock);
elog(FATAL, "RegisterSharedInvalid: inconsistent buffer state");
}
/* write again */
(void) SISetDataEntry(shmInvalBuffer, &newInvalid);
}
SpinRelease(SInvalLock);
}
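/*
 * Usage sketch of the two message forms described above (all names
 * are hypothetical):
 *
 *	(1) a (possibly) cached system cache tuple:
 *
 *	RegisterSharedInvalid(sysCacheId, tupleHashIndex, tuplePointer);
 *
 *	(2) a cached relation descriptor, keyed by object id:
 *
 *	RegisterSharedInvalid(specialCacheId, relationObjectId, NULL);
 */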
/*
* InvalidateSharedInvalid --
* Processes all entries in a shared cache invalidation state.
*/
/****************************************************************************/
/* InvalidateSharedInvalid(invalFunction, resetFunction) */
/* */
/* invalidate a message in the buffer (read and clean up) */
/* should be called by a backend */
/****************************************************************************/
void
InvalidateSharedInvalid(void (*invalFunction)(),
void (*resetFunction)())
{
SpinAcquire(SInvalLock);
SIReadEntryData(shmInvalBuffer, MyBackendId,
invalFunction, resetFunction);
SIDelExpiredDataEntries(shmInvalBuffer);
SpinRelease(SInvalLock);
}
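/*
 * A sketch of the callback contract, inferred from SIReadEntryData in
 * sinvaladt.c (function names are hypothetical): invalFunction gets the
 * fields of each unread message and should drop the matching local
 * cache entry; resetFunction is called instead when this backend's
 * cache state was marked reset, and should discard the whole cache.
 *
 *	static void
 *	MyInvalFunction(int cacheId, Index hashIndex, ItemPointer pointer)
 *	{
 *		...
 *	}
 *
 *	static void
 *	MyResetFunction()
 *	{
 *		...
 *	}
 *
 *	InvalidateSharedInvalid(MyInvalFunction, MyResetFunction);
 */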

View File

@@ -0,0 +1,797 @@
/*-------------------------------------------------------------------------
*
* sinvaladt.c--
* POSTGRES shared cache invalidation segment definitions.
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/ipc/sinvaladt.c,v 1.1.1.1 1996/07/09 06:21:54 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#include "storage/ipc.h"
#include "storage/sinvaladt.h"
#include "storage/lmgr.h"
#include "utils/elog.h"
#include "utils/palloc.h"
/* ----------------
* global variable notes
*
* SharedInvalidationSemaphore
*
* shmInvalBuffer
* the shared buffer segment, set by SISegmentAttach()
*
* MyBackendId
* might be removed later, used only for
* debugging in debug routines (end of file)
*
* SIDbId
* identification of buffer (disappears)
*
* SIRelId \
* SIDummyOid \ identification of buffer
* SIXidData /
* SIXid /
*
* XXX This file really needs to be cleaned up. We switched to using
* spinlocks to protect critical sections (as opposed to using fake
* relations and going through the lock manager) and some of the old
* cruft was 'ifdef'ed out, while other parts (now unused) are still
* compiled into the system. -mer 5/24/92
* ----------------
*/
#ifdef HAS_TEST_AND_SET
int SharedInvalidationLockId;
#else
IpcSemaphoreId SharedInvalidationSemaphore;
#endif
SISeg *shmInvalBuffer;
extern BackendId MyBackendId;
static void CleanupInvalidationState(int status, SISeg *segInOutP);
static BackendId SIAssignBackendId(SISeg *segInOutP, BackendTag backendTag);
static int SIGetNumEntries(SISeg *segP);
/************************************************************************/
/* SISetActiveProcess(segP, backendId) set the backend status active */
/* should be called only by the postmaster when creating a backend */
/************************************************************************/
/* XXX I suspect that the segP parameter is extraneous. -hirohama */
static void
SISetActiveProcess(SISeg *segInOutP, BackendId backendId)
{
/* mark all messages as read */
/* Assert(segP->procState[backendId - 1].tag == MyBackendTag); */
segInOutP->procState[backendId - 1].resetState = false;
segInOutP->procState[backendId - 1].limit = SIGetNumEntries(segInOutP);
}
/****************************************************************************/
/* SIBackendInit() initializes a backend to operate on the buffer */
/****************************************************************************/
int
SIBackendInit(SISeg *segInOutP)
{
LRelId LtCreateRelId();
TransactionId LMITransactionIdCopy();
Assert(MyBackendTag > 0);
MyBackendId = SIAssignBackendId(segInOutP, MyBackendTag);
if (MyBackendId == InvalidBackendTag)
return 0;
#ifdef INVALIDDEBUG
elog(DEBUG, "SIBackendInit: backend tag %d; backend id %d.",
MyBackendTag, MyBackendId);
#endif /* INVALIDDEBUG */
SISetActiveProcess(segInOutP, MyBackendId);
on_exitpg(CleanupInvalidationState, (caddr_t)segInOutP);
return 1;
}
/* ----------------
* SIAssignBackendId
* ----------------
*/
static BackendId
SIAssignBackendId(SISeg *segInOutP, BackendTag backendTag)
{
Index index;
ProcState *stateP;
stateP = NULL;
for (index = 0; index < MaxBackendId; index += 1) {
if (segInOutP->procState[index].tag == InvalidBackendTag ||
segInOutP->procState[index].tag == backendTag)
{
stateP = &segInOutP->procState[index];
break;
}
if (!PointerIsValid(stateP) ||
(segInOutP->procState[index].resetState &&
(!stateP->resetState ||
stateP->tag < backendTag)) ||
(!stateP->resetState &&
(segInOutP->procState[index].limit <
stateP->limit ||
stateP->tag < backendTag)))
{
stateP = &segInOutP->procState[index];
}
}
    /* verify that all remaining "procState" entries are checked for matching tags */
for (index += 1; index < MaxBackendId; index += 1) {
if (segInOutP->procState[index].tag == backendTag) {
elog (FATAL, "SIAssignBackendId: tag %d found twice",
backendTag);
}
}
if (stateP->tag != InvalidBackendTag) {
if (stateP->tag == backendTag) {
elog(NOTICE, "SIAssignBackendId: reusing tag %d",
backendTag);
} else {
elog(NOTICE,
"SIAssignBackendId: discarding tag %d",
stateP->tag);
return InvalidBackendTag;
}
}
stateP->tag = backendTag;
return (1 + stateP - &segInOutP->procState[0]);
}
/************************************************************************/
/* The following function should be called only by the postmaster !! */
/************************************************************************/
/************************************************************************/
/* SISetDeadProcess(segP, backendId) set the backend status DEAD */
/* should be called only by the postmaster when a backend died */
/************************************************************************/
static void
SISetDeadProcess(SISeg *segP, int backendId)
{
/* XXX call me.... */
segP->procState[backendId - 1].resetState = false;
segP->procState[backendId - 1].limit = -1;
segP->procState[backendId - 1].tag = InvalidBackendTag;
}
/*
* CleanupInvalidationState --
* Note:
* This is a temporary hack. ExitBackend should call this instead
* of exit (via on_exitpg).
*/
static void
CleanupInvalidationState(int status, /* XXX */
SISeg *segInOutP) /* XXX style */
{
Assert(PointerIsValid(segInOutP));
SISetDeadProcess(segInOutP, MyBackendId);
}
/************************************************************************/
/* SIComputeSize() - returns the size of a buffer segment		*/
/************************************************************************/
static SISegOffsets *
SIComputeSize(int *segSize)
{
int A, B, a, b, totalSize;
SISegOffsets *oP;
A = 0;
a = SizeSISeg; /* offset to first data entry */
b = SizeOfOneSISegEntry * MAXNUMMESSAGES;
B = A + a + b;
totalSize = B - A;
*segSize = totalSize;
oP = (SISegOffsets *) palloc(sizeof(SISegOffsets));
oP->startSegment = A;
    oP->offsetToFirstEntry = a;		/* relative to A */
oP->offsetToEndOfSegemnt = totalSize; /* relative to A */
return(oP);
}
/************************************************************************/
/* SISetStartEntrySection(segP, offset) - sets the offset */
/************************************************************************/
static void
SISetStartEntrySection(SISeg *segP, Offset offset)
{
segP->startEntrySection = offset;
}
/************************************************************************/
/* SIGetStartEntrySection(segP) - returns the offset			*/
/************************************************************************/
static Offset
SIGetStartEntrySection(SISeg *segP)
{
return(segP->startEntrySection);
}
/************************************************************************/
/* SISetEndEntrySection(segP, offset) - sets the offset */
/************************************************************************/
static void
SISetEndEntrySection(SISeg *segP, Offset offset)
{
segP->endEntrySection = offset;
}
/************************************************************************/
/* SISetEndEntryChain(segP, offset) - sets the offset */
/************************************************************************/
static void
SISetEndEntryChain(SISeg *segP, Offset offset)
{
segP->endEntryChain = offset;
}
/************************************************************************/
/* SIGetEndEntryChain(segP) - returns the offset			*/
/************************************************************************/
static Offset
SIGetEndEntryChain(SISeg *segP)
{
return(segP->endEntryChain);
}
/************************************************************************/
/* SISetStartEntryChain(segP, offset) - sets the offset */
/************************************************************************/
static void
SISetStartEntryChain(SISeg *segP, Offset offset)
{
segP->startEntryChain = offset;
}
/************************************************************************/
/* SIGetStartEntryChain(segP) - returns the offset */
/************************************************************************/
static Offset
SIGetStartEntryChain(SISeg *segP)
{
return(segP->startEntryChain);
}
/************************************************************************/
/* SISetNumEntries(segP, num) sets the current number of entries	*/
/************************************************************************/
static bool
SISetNumEntries(SISeg *segP, int num)
{
if ( num <= MAXNUMMESSAGES) {
segP->numEntries = num;
return(true);
} else {
return(false); /* table full */
}
}
/************************************************************************/
/* SIGetNumEntries(segP) - returns the current number of entries	*/
/************************************************************************/
static int
SIGetNumEntries(SISeg *segP)
{
return(segP->numEntries);
}
/************************************************************************/
/* SISetMaxNumEntries(segP, num) sets the maximal number of entries */
/************************************************************************/
static bool
SISetMaxNumEntries(SISeg *segP, int num)
{
if ( num <= MAXNUMMESSAGES) {
segP->maxNumEntries = num;
return(true);
} else {
return(false); /* wrong number */
}
}
/************************************************************************/
/* SIGetProcStateLimit(segP, i) returns the limit of read messages */
/************************************************************************/
static int
SIGetProcStateLimit(SISeg *segP, int i)
{
return(segP->procState[i].limit);
}
/************************************************************************/
/* SIIncNumEntries(segP, num) increments the current number of entries	*/
/************************************************************************/
static bool
SIIncNumEntries(SISeg *segP, int num)
{
if ((segP->numEntries + num) <= MAXNUMMESSAGES) {
segP->numEntries = segP->numEntries + num;
return(true);
} else {
return(false); /* table full */
}
}
/************************************************************************/
/* SIDecNumEntries(segP, num) decrements the current number of entries	*/
/************************************************************************/
static bool
SIDecNumEntries(SISeg *segP, int num)
{
if ((segP->numEntries - num) >= 0) {
segP->numEntries = segP->numEntries - num;
return(true);
} else {
return(false); /* not enough entries in table */
}
}
/************************************************************************/
/* SISetStartFreeSpace(segP, offset) - sets the offset */
/************************************************************************/
static void
SISetStartFreeSpace(SISeg *segP, Offset offset)
{
segP->startFreeSpace = offset;
}
/************************************************************************/
/* SIGetStartFreeSpace(segP) - returns the offset */
/************************************************************************/
static Offset
SIGetStartFreeSpace(SISeg *segP)
{
return(segP->startFreeSpace);
}
/************************************************************************/
/* SIGetFirstDataEntry(segP) returns first data entry */
/************************************************************************/
static SISegEntry *
SIGetFirstDataEntry(SISeg *segP)
{
SISegEntry *eP;
Offset startChain;
startChain = SIGetStartEntryChain(segP);
if (startChain == InvalidOffset)
return(NULL);
eP = (SISegEntry *) ((Pointer) segP +
SIGetStartEntrySection(segP) +
startChain );
return(eP);
}
/************************************************************************/
/* SIGetLastDataEntry(segP) returns last data entry in the chain */
/************************************************************************/
static SISegEntry *
SIGetLastDataEntry(SISeg *segP)
{
SISegEntry *eP;
Offset endChain;
endChain = SIGetEndEntryChain(segP);
if (endChain == InvalidOffset)
return(NULL);
eP = (SISegEntry *) ((Pointer) segP +
SIGetStartEntrySection(segP) +
endChain );
return(eP);
}
/************************************************************************/
/* SIGetNextDataEntry(segP, offset) returns next data entry */
/************************************************************************/
static SISegEntry *
SIGetNextDataEntry(SISeg *segP, Offset offset)
{
SISegEntry *eP;
if (offset == InvalidOffset)
return(NULL);
eP = (SISegEntry *) ((Pointer) segP +
SIGetStartEntrySection(segP) +
offset);
return(eP);
}
/************************************************************************/
/* SIGetNthDataEntry(segP, n) returns the n-th data entry in chain */
/************************************************************************/
static SISegEntry *
SIGetNthDataEntry(SISeg *segP,
		  int n)	/* must range from 1 to MAXNUMMESSAGES */
{
SISegEntry *eP;
int i;
if (n <= 0) return(NULL);
eP = SIGetFirstDataEntry(segP);
for (i = 1; i < n; i++) {
/* skip one and get the next */
eP = SIGetNextDataEntry(segP, eP->next);
}
return(eP);
}
/************************************************************************/
/* SIEntryOffset(segP, entryP) returns the offset for an entry pointer	*/
/************************************************************************/
static Offset
SIEntryOffset(SISeg *segP, SISegEntry *entryP)
{
/* relative to B !! */
return ((Offset) ((Pointer) entryP -
(Pointer) segP -
SIGetStartEntrySection(segP) ));
}
/************************************************************************/
/* SISetDataEntry(segP, data) - stores a message in the segment	*/
/************************************************************************/
bool
SISetDataEntry(SISeg *segP, SharedInvalidData *data)
{
Offset offsetToNewData;
SISegEntry *eP, *lastP;
bool SISegFull();
Offset SIEntryOffset();
Offset SIGetStartFreeSpace();
SISegEntry *SIGetFirstDataEntry();
SISegEntry *SIGetNextDataEntry();
SISegEntry *SIGetLastDataEntry();
if (!SIIncNumEntries(segP, 1))
return(false); /* no space */
/* get a free entry */
offsetToNewData = SIGetStartFreeSpace(segP);
eP = SIGetNextDataEntry(segP, offsetToNewData); /* it's a free one */
SISetStartFreeSpace(segP, eP->next);
/* fill it up */
eP->entryData = *data;
eP->isfree = false;
eP->next = InvalidOffset;
/* handle insertion point at the end of the chain !!*/
lastP = SIGetLastDataEntry(segP);
if (lastP == NULL) {
/* there is no chain, insert the first entry */
SISetStartEntryChain(segP, SIEntryOffset(segP, eP));
} else {
/* there is a last entry in the chain */
lastP->next = SIEntryOffset(segP, eP);
}
SISetEndEntryChain(segP, SIEntryOffset(segP, eP));
return(true);
}
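/*
 * Worked example (illustrative): if the free list is F1 -> F2 -> ...
 * and the chain is C1 -> C2 (end), SISetDataEntry pops F1 off the
 * free list (startFreeSpace := F1->next), fills it in, links it after
 * C2 (C2->next := offset of F1), and makes it the new end of chain.
 * SIDelDataEntry below performs the inverse operation at the head.
 */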
/************************************************************************/
/* SIDecProcLimit(segP, num) decrements all process limits */
/************************************************************************/
static void
SIDecProcLimit(SISeg *segP, int num)
{
int i;
for (i=0; i < MaxBackendId; i++) {
/* decrement only, if there is a limit > 0 */
if (segP->procState[i].limit > 0) {
segP->procState[i].limit = segP->procState[i].limit - num;
if (segP->procState[i].limit < 0) {
/* limit was not high enough, reset to zero */
/* negative means it's a dead backend */
segP->procState[i].limit = 0;
}
}
}
}
/************************************************************************/
/* SIDelDataEntry(segP) - free the FIRST entry */
/************************************************************************/
bool
SIDelDataEntry(SISeg *segP)
{
SISegEntry *e1P;
SISegEntry *SIGetFirstDataEntry();
if (!SIDecNumEntries(segP, 1)) {
/* no entries in buffer */
return(false);
}
e1P = SIGetFirstDataEntry(segP);
SISetStartEntryChain(segP, e1P->next);
if (SIGetStartEntryChain(segP) == InvalidOffset) {
/* it was the last entry */
SISetEndEntryChain(segP, InvalidOffset);
}
/* free the entry */
e1P->isfree = true;
e1P->next = SIGetStartFreeSpace(segP);
SISetStartFreeSpace(segP, SIEntryOffset(segP, e1P));
SIDecProcLimit(segP, 1);
return(true);
}
/************************************************************************/
/* SISetProcStateInvalid(segP) checks and marks a backend's state as	*/
/* invalid */
/************************************************************************/
void
SISetProcStateInvalid(SISeg *segP)
{
int i;
for (i=0; i < MaxBackendId; i++) {
if (segP->procState[i].limit == 0) {
/* backend i didn't read any message */
segP->procState[i].resetState = true;
/*XXX signal backend that it has to reset its internal cache ? */
}
}
}
/************************************************************************/
/* SIReadEntryData(segP, backendId, function) */
/* - marks messages to be read by id */
/* and executes function */
/************************************************************************/
void
SIReadEntryData(SISeg *segP,
int backendId,
void (*invalFunction)(),
void (*resetFunction)())
{
int i = 0;
SISegEntry *data;
Assert(segP->procState[backendId - 1].tag == MyBackendTag);
if (!segP->procState[backendId - 1].resetState) {
	/* invalidate data, but only those you have not seen yet !! */
/* therefore skip read messages */
data = SIGetNthDataEntry(segP,
SIGetProcStateLimit(segP, backendId - 1) + 1);
while (data != NULL) {
i++;
segP->procState[backendId - 1].limit++; /* one more message read */
invalFunction(data->entryData.cacheId,
data->entryData.hashIndex,
&data->entryData.pointerData);
data = SIGetNextDataEntry(segP, data->next);
}
/* SIDelExpiredDataEntries(segP); */
} else {
	/* backend must not read messages; its own state has to be reset */
	elog(NOTICE, "SIReadEntryData: cache state reset");
resetFunction(); /* XXXX call it here, parameters? */
/* new valid state--mark all messages "read" */
segP->procState[backendId - 1].resetState = false;
segP->procState[backendId - 1].limit = SIGetNumEntries(segP);
}
    /* sanity check: we cannot have read more messages than the buffer holds */
if (i > MAXNUMMESSAGES) {
elog(FATAL, "SIReadEntryData: Invalid segment state");
}
}
/************************************************************************/
/* SIDelExpiredDataEntries (segP) - removes irrelevant messages */
/************************************************************************/
void
SIDelExpiredDataEntries(SISeg *segP)
{
int min, i, h;
min = 9999999;
for (i = 0; i < MaxBackendId; i++) {
h = SIGetProcStateLimit(segP, i);
if (h >= 0) { /* backend active */
if (h < min ) min = h;
}
}
if (min != 9999999) {
/* we can remove min messages */
for (i = 1; i <= min; i++) {
/* this adjusts also the state limits!*/
if (!SIDelDataEntry(segP)) {
elog(FATAL, "SIDelExpiredDataEntries: Invalid segment state");
}
}
}
}
/************************************************************************/
/* SISegInit(segP) - initializes the segment */
/************************************************************************/
static void
SISegInit(SISeg *segP)
{
SISegOffsets *oP;
int segSize, i;
SISegEntry *eP;
oP = SIComputeSize(&segSize);
    /* set semaphore ids in the segment */
/* XXX */
SISetStartEntrySection(segP, oP->offsetToFirstEntry);
SISetEndEntrySection(segP, oP->offsetToEndOfSegemnt);
SISetStartFreeSpace(segP, 0);
SISetStartEntryChain(segP, InvalidOffset);
SISetEndEntryChain(segP, InvalidOffset);
(void) SISetNumEntries(segP, 0);
(void) SISetMaxNumEntries(segP, MAXNUMMESSAGES);
for (i = 0; i < MaxBackendId; i++) {
segP->procState[i].limit = -1; /* no backend active !!*/
segP->procState[i].resetState = false;
segP->procState[i].tag = InvalidBackendTag;
}
/* construct a chain of free entries */
for (i = 1; i < MAXNUMMESSAGES; i++) {
eP = (SISegEntry *) ((Pointer) segP +
SIGetStartEntrySection(segP) +
(i - 1) * sizeof(SISegEntry));
eP->isfree = true;
eP->next = i * sizeof(SISegEntry); /* relative to B */
}
    /* handle the last free entry separately */
eP = (SISegEntry *) ((Pointer) segP +
SIGetStartEntrySection(segP) +
(MAXNUMMESSAGES - 1) * sizeof(SISegEntry));
eP->isfree = true;
eP->next = InvalidOffset; /* it's the end of the chain !! */
/*
* Be tidy
*/
pfree(oP);
}
/************************************************************************/
/* SISegmentKill(key) - kill any segment */
/************************************************************************/
static void
SISegmentKill(int key) /* the corresponding key for the segment */
{
IpcMemoryKill(key);
}
/************************************************************************/
/* SISegmentGet(key, size) - get a shared segment of size <size> */
/* returns a segment id */
/************************************************************************/
static IpcMemoryId
SISegmentGet(int key, /* the corresponding key for the segment */
int size, /* size of segment in bytes */
bool create)
{
IpcMemoryId shmid;
if (create) {
shmid = IpcMemoryCreate(key, size, IPCProtection);
} else {
shmid = IpcMemoryIdGet(key, size);
}
return(shmid);
}
/************************************************************************/
/* SISegmentAttach(shmid) - attach a shared segment with id shmid */
/************************************************************************/
static void
SISegmentAttach(IpcMemoryId shmid)
{
shmInvalBuffer = (struct SISeg *) IpcMemoryAttach(shmid);
if (shmInvalBuffer == IpcMemAttachFailed) {
/* XXX use validity function */
elog(NOTICE, "SISegmentAttach: Could not attach segment");
elog(FATAL, "SISegmentAttach: %m");
}
}
/************************************************************************/
/* SISegmentInit(killExistingSegment, key) initialize segment */
/************************************************************************/
int
SISegmentInit(bool killExistingSegment, IPCKey key)
{
SISegOffsets *oP;
int segSize;
IpcMemoryId shmId;
bool create;
if (killExistingSegment) {
/* Kill existing segment */
/* set semaphore */
SISegmentKill(key);
/* Get a shared segment */
oP = SIComputeSize(&segSize);
/*
* Be tidy
*/
pfree(oP);
create = true;
shmId = SISegmentGet(key,segSize, create);
if (shmId < 0) {
perror("SISegmentGet: failed");
return(-1); /* an error */
}
/* Attach the shared cache invalidation segment */
/* sets the global variable shmInvalBuffer */
SISegmentAttach(shmId);
/* Init shared memory table */
SISegInit(shmInvalBuffer);
} else {
/* use an existing segment */
create = false;
shmId = SISegmentGet(key, 0, create);
if (shmId < 0) {
perror("SISegmentGet: getting an existent segment failed");
return(-1); /* an error */
}
/* Attach the shared cache invalidation segment */
SISegmentAttach(shmId);
}
return(1);
}

View File

@@ -0,0 +1,247 @@
/*-------------------------------------------------------------------------
*
* spin.c--
* routines for managing spin locks
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/ipc/Attic/spin.c,v 1.1.1.1 1996/07/09 06:21:55 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
/*
* POSTGRES has two kinds of locks: semaphores (which put the
* process to sleep) and spinlocks (which are supposed to be
* short term locks). Currently both are implemented as SysV
* semaphores, but presumably this can change if we move to
 * a machine with a test-and-set (TAS) instruction.  It's probably
* a good idea to think about (and allocate) short term and long
* term semaphores separately anyway.
*
* NOTE: These routines are not supposed to be widely used in Postgres.
* They are preserved solely for the purpose of porting Mark Sullivan's
* buffer manager to Postgres.
*/
#include <errno.h>
#include "postgres.h"
#include "storage/ipc.h"
#include "storage/shmem.h"
#include "storage/spin.h"
#include "storage/proc.h"
#include "utils/elog.h"
/* globals used in this file */
IpcSemaphoreId SpinLockId;
#ifdef HAS_TEST_AND_SET
/* real spin lock implementations */
bool
CreateSpinlocks(IPCKey key)
{
/* the spin lock shared memory must have been created by now */
return(TRUE);
}
bool
AttachSpinLocks(IPCKey key)
{
/* the spin lock shared memory must have been attached by now */
return(TRUE);
}
bool
InitSpinLocks(int init, IPCKey key)
{
extern SPINLOCK ShmemLock;
extern SPINLOCK BindingLock;
extern SPINLOCK BufMgrLock;
extern SPINLOCK LockMgrLock;
extern SPINLOCK ProcStructLock;
extern SPINLOCK SInvalLock;
extern SPINLOCK OidGenLockId;
#ifdef MAIN_MEMORY
extern SPINLOCK MMCacheLock;
#endif /* MAIN_MEMORY */
  /* These spinlocks have fixed locations in shmem */
ShmemLock = (SPINLOCK) SHMEMLOCKID;
BindingLock = (SPINLOCK) BINDINGLOCKID;
BufMgrLock = (SPINLOCK) BUFMGRLOCKID;
LockMgrLock = (SPINLOCK) LOCKMGRLOCKID;
ProcStructLock = (SPINLOCK) PROCSTRUCTLOCKID;
SInvalLock = (SPINLOCK) SINVALLOCKID;
OidGenLockId = (SPINLOCK) OIDGENLOCKID;
#ifdef MAIN_MEMORY
MMCacheLock = (SPINLOCK) MMCACHELOCKID;
#endif /* MAIN_MEMORY */
return(TRUE);
}
void
SpinAcquire(SPINLOCK lock)
{
ExclusiveLock(lock);
PROC_INCR_SLOCK(lock);
}
void
SpinRelease(SPINLOCK lock)
{
PROC_DECR_SLOCK(lock);
ExclusiveUnlock(lock);
}
bool
SpinIsLocked(SPINLOCK lock)
{
return(!LockIsFree(lock));
}
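/*
 * A sketch of the sort of test-and-set spinlock the file comment above
 * alludes to, written with present-day C11 atomics purely for
 * illustration; tas_acquire() spins until it observes the flag clear:
 *
 *	#include <stdatomic.h>
 *
 *	static atomic_flag tas_lock = ATOMIC_FLAG_INIT;
 *
 *	void tas_acquire() {
 *		while (atomic_flag_test_and_set(&tas_lock))
 *			;
 *	}
 *
 *	void tas_release() {
 *		atomic_flag_clear(&tas_lock);
 *	}
 */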
#else /* HAS_TEST_AND_SET */
/* Spinlocks are implemented using SysV semaphores */
/*
* SpinAcquire -- try to grab a spinlock
*
* FAILS if the semaphore is corrupted.
*/
void
SpinAcquire(SPINLOCK lock)
{
IpcSemaphoreLock(SpinLockId, lock, IpcExclusiveLock);
PROC_INCR_SLOCK(lock);
}
/*
* SpinRelease -- release a spin lock
*
* FAILS if the semaphore is corrupted
*/
void
SpinRelease(SPINLOCK lock)
{
  Assert(SpinIsLocked(lock));
PROC_DECR_SLOCK(lock);
IpcSemaphoreUnlock(SpinLockId, lock, IpcExclusiveLock);
}
bool
SpinIsLocked(SPINLOCK lock)
{
int semval;
semval = IpcSemaphoreGetValue(SpinLockId, lock);
return(semval < IpcSemaphoreDefaultStartValue);
}
/*
* CreateSpinlocks -- Create a sysV semaphore array for
* the spinlocks
*
*/
bool
CreateSpinlocks(IPCKey key)
{
int status;
IpcSemaphoreId semid;
semid = IpcSemaphoreCreate(key, MAX_SPINS, IPCProtection,
IpcSemaphoreDefaultStartValue, 1, &status);
if (status == IpcSemIdExist) {
IpcSemaphoreKill(key);
elog(NOTICE,"Destroying old spinlock semaphore");
semid = IpcSemaphoreCreate(key, MAX_SPINS, IPCProtection,
IpcSemaphoreDefaultStartValue, 1, &status);
}
if (semid >= 0) {
SpinLockId = semid;
return(TRUE);
}
/* cannot create spinlocks */
elog(FATAL,"CreateSpinlocks: cannot create spin locks");
return(FALSE);
}
/*
* Attach to existing spinlock set
*/
bool
AttachSpinLocks(IPCKey key)
{
IpcSemaphoreId id;
id = semget (key, MAX_SPINS, 0);
if (id < 0) {
if (errno == EEXIST) {
/* key is the name of someone else's semaphore */
elog (FATAL,"AttachSpinlocks: SPIN_KEY belongs to someone else");
}
/* cannot create spinlocks */
elog(FATAL,"AttachSpinlocks: cannot create spin locks");
return(FALSE);
}
SpinLockId = id;
return(TRUE);
}
/*
* InitSpinLocks -- Spinlock bootstrapping
*
* We need several spinlocks for bootstrapping:
 * BindingLock (for the shmem binding table),
* ShmemLock (for the shmem allocator), BufMgrLock (for buffer
* pool exclusive access), LockMgrLock (for the lock table), and
* ProcStructLock (a spin lock for the shared process structure).
* If there's a Sony WORM drive attached, we also have a spinlock
* (SJCacheLock) for it. Same story for the main memory storage mgr.
*
*/
bool
InitSpinLocks(int init, IPCKey key)
{
extern SPINLOCK ShmemLock;
extern SPINLOCK BindingLock;
extern SPINLOCK BufMgrLock;
extern SPINLOCK LockMgrLock;
extern SPINLOCK ProcStructLock;
extern SPINLOCK SInvalLock;
extern SPINLOCK OidGenLockId;
#ifdef MAIN_MEMORY
extern SPINLOCK MMCacheLock;
#endif /* MAIN_MEMORY */
if (!init || key != IPC_PRIVATE) {
    /* if bootstrapping with key IPC_PRIVATE, we are running a backend
     * by itself, so there are no spinlocks to attach to; otherwise
     * attach to the existing set here
     */
if (! AttachSpinLocks(key)) {
elog(FATAL,"InitSpinLocks: couldnt attach spin locks");
return(FALSE);
}
}
  /* These spinlocks have fixed locations in shmem */
ShmemLock = (SPINLOCK) SHMEMLOCKID;
BindingLock = (SPINLOCK) BINDINGLOCKID;
BufMgrLock = (SPINLOCK) BUFMGRLOCKID;
LockMgrLock = (SPINLOCK) LOCKMGRLOCKID;
ProcStructLock = (SPINLOCK) PROCSTRUCTLOCKID;
SInvalLock = (SPINLOCK) SINVALLOCKID;
OidGenLockId = (SPINLOCK) OIDGENLOCKID;
#ifdef MAIN_MEMORY
MMCacheLock = (SPINLOCK) MMCACHELOCKID;
#endif /* MAIN_MEMORY */
return(TRUE);
}
#endif /* HAS_TEST_AND_SET */

View File

@@ -0,0 +1,20 @@
/*-------------------------------------------------------------------------
*
* item.h--
* POSTGRES disk item definitions.
*
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: item.h,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef ITEM_H
#define ITEM_H
#include "c.h"
typedef Pointer Item;
#endif /* ITEM_H */

View File

@@ -0,0 +1,75 @@
/*-------------------------------------------------------------------------
*
* itemid.h--
* Standard POSTGRES buffer page item identifier definitions.
*
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: itemid.h,v 1.1.1.1 1996/07/09 06:21:52 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef ITEMID_H
#define ITEMID_H
typedef uint16 ItemOffset;
typedef uint16 ItemLength;
typedef bits16 ItemIdFlags;
typedef struct ItemIdData { /* line pointers */
unsigned lp_off:13, /* offset to find tup */
				/* can be reduced by 2 if necessary */
lp_flags:6, /* flags on tuple */
lp_len:13; /* length of tuple */
} ItemIdData;
typedef struct ItemIdData *ItemId;
#ifndef LP_USED
#define LP_USED 0x01 /* this line pointer is being used */
#endif
/* ----------------
* support macros
* ----------------
*/
/*
* ItemIdGetLength
*/
#define ItemIdGetLength(itemId) \
((itemId)->lp_len)
/*
* ItemIdGetOffset
*/
#define ItemIdGetOffset(itemId) \
((itemId)->lp_off)
/*
* ItemIdGetFlags
*/
#define ItemIdGetFlags(itemId) \
((itemId)->lp_flags)
/*
* ItemIdIsValid --
* True iff disk item identifier is valid.
*/
#define ItemIdIsValid(itemId) PointerIsValid(itemId)
/*
* ItemIdIsUsed --
* True iff disk item identifier is in use.
*
* Note:
* Assumes disk item identifier is valid.
*/
#define ItemIdIsUsed(itemId) \
(AssertMacro(ItemIdIsValid(itemId)) ? \
(bool) (((itemId)->lp_flags & LP_USED) != 0) : false)
#endif /* ITEMID_H */

View File

@@ -0,0 +1,44 @@
/*-------------------------------------------------------------------------
*
* itempos.h--
* Standard POSTGRES buffer page long item subposition definitions.
*
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: itempos.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef ITEMPOS_H
#define ITEMPOS_H
#include "c.h"
#include "storage/buf.h"
#include "storage/itemid.h"
typedef struct ItemSubpositionData {
Buffer op_db;
ItemId op_lpp;
char *op_cp; /* XXX */
uint32 op_len;
} ItemSubpositionData;
typedef ItemSubpositionData *ItemSubposition;
/*
* PNOBREAK(OBJP, LEN)
* struct objpos *OBJP;
* unsigned LEN;
*/
#define PNOBREAK(OBJP, LEN) ((OBJP)->op_len >= LEN)
/*
* PSKIP(OBJP, LEN)
* struct objpos *OBJP;
* unsigned LEN;
*/
#define PSKIP(OBJP, LEN)\
{ (OBJP)->op_cp += (LEN); (OBJP)->op_len -= (LEN); }
#endif /* ITEMPOS_H */

View File

@@ -0,0 +1,115 @@
/*-------------------------------------------------------------------------
*
* itemptr.h--
* POSTGRES disk item pointer definitions.
*
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: itemptr.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef ITEMPTR_H
#define ITEMPTR_H
#include "c.h"
#include "storage/block.h"
#include "storage/off.h"
#include "storage/itemid.h"
/*
* ItemPointer:
*
* this is a pointer to an item on another disk page in the same file.
* blkid tells us which block, posid tells us which entry in the linp
* (ItemIdData) array we want.
*/
typedef struct ItemPointerData {
BlockIdData ip_blkid;
OffsetNumber ip_posid;
} ItemPointerData;
typedef ItemPointerData *ItemPointer;
/* ----------------
* support macros
* ----------------
*/
/*
* ItemPointerIsValid --
* True iff the disk item pointer is not NULL.
*/
#define ItemPointerIsValid(pointer) \
((bool) (PointerIsValid(pointer) && ((pointer)->ip_posid != 0)))
/*
* ItemPointerGetBlockNumber --
* Returns the block number of a disk item pointer.
*/
#define ItemPointerGetBlockNumber(pointer) \
(AssertMacro(ItemPointerIsValid(pointer)) ? \
BlockIdGetBlockNumber(&(pointer)->ip_blkid) : (BlockNumber) 0)
/*
* ItemPointerGetOffsetNumber --
* Returns the offset number of a disk item pointer.
*/
#define ItemPointerGetOffsetNumber(pointer) \
(AssertMacro(ItemPointerIsValid(pointer)) ? \
(pointer)->ip_posid : \
InvalidOffsetNumber)
/*
* ItemPointerSet --
* Sets a disk item pointer to the specified block and offset.
*/
#define ItemPointerSet(pointer, blockNumber, offNum) \
Assert(PointerIsValid(pointer)); \
BlockIdSet(&((pointer)->ip_blkid), blockNumber); \
(pointer)->ip_posid = offNum
/*
* ItemPointerSetBlockNumber --
* Sets a disk item pointer to the specified block.
*/
#define ItemPointerSetBlockNumber(pointer, blockNumber) \
Assert(PointerIsValid(pointer)); \
BlockIdSet(&((pointer)->ip_blkid), blockNumber)
/*
* ItemPointerSetOffsetNumber --
* Sets a disk item pointer to the specified offset.
*/
#define ItemPointerSetOffsetNumber(pointer, offsetNumber) \
AssertMacro(PointerIsValid(pointer)); \
(pointer)->ip_posid = (offsetNumber)
/*
* ItemPointerCopy --
* Copies the contents of one disk item pointer to another.
*/
#define ItemPointerCopy(fromPointer, toPointer) \
Assert(PointerIsValid(toPointer)); \
Assert(PointerIsValid(fromPointer)); \
*(toPointer) = *(fromPointer)
/*
* ItemPointerSetInvalid --
* Sets a disk item pointer to be invalid.
*/
#define ItemPointerSetInvalid(pointer) \
Assert(PointerIsValid(pointer)); \
BlockIdSet(&((pointer)->ip_blkid), InvalidBlockNumber); \
(pointer)->ip_posid = InvalidOffsetNumber
/* ----------------
* externs
* ----------------
*/
extern bool ItemPointerEquals(ItemPointer pointer1, ItemPointer pointer2);
#endif /* ITEMPTR_H */

View File

@@ -0,0 +1,58 @@
/*-------------------------------------------------------------------------
*
* large_object.h--
* file of info for Postgres large objects. POSTGRES 4.2 supports
* zillions of large objects (internal, external, jaquith, inversion).
* Now we only support inversion.
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: large_object.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef LARGE_OBJECT_H
#define LARGE_OBJECT_H
#include "c.h"
#include "utils/rel.h"
#include "access/relscan.h"
/*
* This structure will eventually have lots more stuff associated with it.
*/
typedef struct LargeObjectDesc
{
Relation heap_r; /* heap relation */
Relation index_r; /* index relation on seqno attribute */
IndexScanDesc iscan; /* index scan we're using */
TupleDesc hdesc; /* heap relation tuple desc */
TupleDesc idesc; /* index relation tuple desc */
uint32 lowbyte; /* low byte on the current page */
uint32 highbyte; /* high byte on the current page */
uint32 offset; /* current seek pointer */
ItemPointerData htid; /* tid of current heap tuple */
#define IFS_RDLOCK (1 << 0)
#define IFS_WRLOCK (1 << 1)
#define IFS_ATEOF (1 << 2)
u_long flags; /* locking info, etc */
} LargeObjectDesc;
/*
* Function definitions...
*/
/* inversion stuff in inv_api.c */
extern LargeObjectDesc *inv_create(int flags);
extern LargeObjectDesc *inv_open(Oid lobjId, int flags);
extern void inv_close(LargeObjectDesc *obj_desc);
extern int inv_destroy(Oid lobjId);
extern int inv_stat(LargeObjectDesc *obj_desc, struct pgstat *stbuf);
extern int inv_seek(LargeObjectDesc *obj_desc, int offset, int whence);
extern int inv_tell(LargeObjectDesc *obj_desc);
extern int inv_read(LargeObjectDesc *obj_desc, char *buf, int nbytes);
extern int inv_write(LargeObjectDesc *obj_desc, char *buf, int nbytes);
#endif /* LARGE_OBJECT_H */

View File

@@ -0,0 +1,14 @@
#-------------------------------------------------------------------------
#
# Makefile.inc--
# Makefile for storage/large_object
#
# Copyright (c) 1994, Regents of the University of California
#
#
# IDENTIFICATION
# $Header: /cvsroot/pgsql/src/backend/storage/large_object/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:55 scrappy Exp $
#
#-------------------------------------------------------------------------
SUBSRCS+= inv_api.c

File diff suppressed because it is too large

View File

@@ -0,0 +1,84 @@
/*-------------------------------------------------------------------------
*
* lmgr.h--
* POSTGRES lock manager definitions.
*
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: lmgr.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef LMGR_H
#define LMGR_H
#include "postgres.h"
#include "storage/itemptr.h"
#include "storage/lock.h"
#include "utils/rel.h"
/*
* This was moved from pladt.h for the new lock manager. Want to obsolete
* all of the old code.
*/
typedef struct LRelId {
Oid relId; /* a relation identifier */
Oid dbId; /* a database identifier */
} LRelId;
typedef struct LockInfoData {
bool initialized;
LRelId lRelId;
TransactionId transactionIdData;
uint16 flags;
} LockInfoData;
typedef LockInfoData *LockInfo;
#define LockInfoIsValid(linfo) \
((PointerIsValid(linfo)) && ((LockInfo) linfo)->initialized)
extern LRelId RelationGetLRelId(Relation relation);
extern Oid LRelIdGetDatabaseId(LRelId lRelId);
extern Oid LRelIdGetRelationId(LRelId lRelId);
extern bool DatabaseIdIsMyDatabaseId(Oid databaseId);
extern bool LRelIdContainsMyDatabaseId(LRelId lRelId);
extern void RelationInitLockInfo(Relation relation);
extern void RelationDiscardLockInfo(Relation relation);
extern void RelationSetLockForDescriptorOpen(Relation relation);
extern void RelationSetLockForRead(Relation relation);
extern void RelationUnsetLockForRead(Relation relation);
extern void RelationSetLockForWrite(Relation relation);
extern void RelationUnsetLockForWrite(Relation relation);
extern void RelationSetLockForTupleRead(Relation relation,
ItemPointer itemPointer);
/* used in vaccum.c */
extern void RelationSetLockForWritePage(Relation relation,
ItemPointer itemPointer);
/* used in nbtpage.c, hashpage.c */
extern void RelationSetSingleWLockPage(Relation relation,
ItemPointer itemPointer);
extern void RelationUnsetSingleWLockPage(Relation relation,
ItemPointer itemPointer);
extern void RelationSetSingleRLockPage(Relation relation,
ItemPointer itemPointer);
extern void RelationUnsetSingleRLockPage(Relation relation,
ItemPointer itemPointer);
extern void RelationSetRIntentLock(Relation relation);
extern void RelationUnsetRIntentLock(Relation relation);
extern void RelationSetWIntentLock(Relation relation);
extern void RelationUnsetWIntentLock(Relation relation);
extern void RelationSetLockForExtend(Relation relation);
extern void RelationUnsetLockForExtend(Relation relation);
extern void LRelIdAssign(LRelId *lRelId, Oid dbId, Oid relId);
/* single.c */
extern bool SingleLockReln(LockInfo linfo, LOCKT lockt, int action);
extern bool SingleLockPage(LockInfo linfo, ItemPointer tidPtr,
LOCKT lockt, int action);
#endif /* LMGR_H */

View File

@@ -0,0 +1,14 @@
#-------------------------------------------------------------------------
#
# Makefile.inc--
# Makefile for storage/lmgr
#
# Copyright (c) 1994, Regents of the University of California
#
#
# IDENTIFICATION
# $Header: /cvsroot/pgsql/src/backend/storage/lmgr/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:55 scrappy Exp $
#
#-------------------------------------------------------------------------
SUBSRCS+= lmgr.c lock.c multi.c proc.c single.c

View File

@@ -0,0 +1,93 @@
$Header: /cvsroot/pgsql/src/backend/storage/lmgr/README,v 1.1.1.1 1996/07/09 06:21:55 scrappy Exp $
This file is an attempt to save me (and future code maintainers) some
time and a lot of headaches. The existing lock manager code at the time
of this writing (June 16 1992) can best be described as confusing. The
complexity seems inherent in lock manager functionality, but variable
names chosen in the current implementation really confuse me every time
I have to track down a bug. Also, what gets done where and by whom isn't
always clear....
Starting with the data structures the lock manager relies upon...
(NOTE - these will undoubtedly change over time and it is likely
that this file won't always be updated along with the structs.)
The lock manager's LOCK:
tag -
The key fields that are used for hashing locks in the shared memory
lock hash table. This is kept as a separate struct to ensure that we
always zero out the correct number of bytes. This is a problem as
part of the tag is an itempointer which is 6 bytes and causes 2
additional bytes to be added as padding.
tag.relId -
Uniquely identifies the relation that the lock corresponds to.
tag.dbId -
Uniquely identifies the database in which the relation lives. If
this is a shared system relation (e.g. pg_user) the dbId should be
set to 0.
tag.tupleId -
Uniquely identifies the block/page within the relation and the
tuple within the block. If we are setting a table level lock,
both the blockId and tupleId (in an item pointer this is called
the position) are set to invalid; if it is a page level lock, the
blockId is valid while the tupleId is still invalid. Finally, if
this is a tuple level lock (we currently never do this) then both
the blockId and tupleId are set to valid specifications. This is
how we get the appearance of a multi-level lock table while using
only a single table (see Gray's paper on 2 phase locking if
you are puzzled about how multi-level lock tables work).
mask -
This field indicates what types of locks are currently held in the
given lock. It is used (against the lock table's conflict table)
to determine if a new lock request will conflict with existing
lock types held. Conflicts are determined by bitwise AND operations
between the mask and the conflict table entry for the given lock type
to be set. (A compilable toy version of this test appears at the end
of this file.) The current representation is that each bit (1 through 5)
is set when that lock type (WRITE, READ, WRITE INTENT, READ INTENT, EXTEND)
has been acquired for the lock.
waitProcs -
This is a shared memory queue of all process structures corresponding to
a backend that is waiting (sleeping) until another backend releases this
lock. The process structure holds the information needed to determine
if it should be woken up when this lock is released. If, for example,
we are releasing a read lock and the process is sleeping trying to acquire
a read lock then there is no point in waking it since the lock being
released isn't what caused it to sleep in the first place. There will
be more on this below (when I get to releasing locks and waking sleeping
process routines).
nHolding -
Keeps a count of how many acquisition attempts have been made on this
lock. The count includes attempts by processes which were put
to sleep due to conflicts. It also counts the same backend twice
if, for example, a backend process first acquires a read and then
acquires a write.
holders -
Keeps a count of how many locks of each type have been attempted. Only
elements 1 through MAX_LOCK_TYPES are used as they correspond to the lock
type defined constants (WRITE through EXTEND). Summing the values of
holders should come out equal to nHolding.
nActive -
Keeps a count of how many times this lock has been successfully acquired.
This count does not include attempts that were rejected due to conflicts,
but can count the same backend twice (e.g. a read then a write -- since
it's the same transaction this won't cause a conflict)
activeHolders -
Keeps a count of how many locks of each type are currently held. Once again
only elements 1 through MAX_LOCK_TYPES are used (0 is not). Also, like
holders, summing the values of activeHolders should total to the value
of nActive.
This is all I had the stomach for right now..... I will get back to this
someday. -mer 17 June 1992 12:00 am
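
To make the mask test described under "mask" above concrete, here is a
compilable toy version of the conflict check. The bit positions follow the
text; the names are local stand-ins rather than the backend's actual
constants:

#include <stdio.h>

/* bit positions 1..5, mirroring the lock types named above */
#define WRITE_LOCK    1
#define READ_LOCK     2
#define WRITE_INTENT  3
#define READ_INTENT   4
#define EXTEND_LOCK   5

int main(void)
{
    /* conflict table entry for READ: reads conflict with writes */
    int readConflicts = (1 << WRITE_LOCK) | (1 << WRITE_INTENT);

    /* a lock whose mask says a WRITE is currently held */
    int mask = (1 << WRITE_LOCK);

    if (mask & readConflicts)
        printf("new READ request conflicts; requester must sleep\n");
    else
        printf("no conflict; grant the READ\n");
    return 0;
}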

View File

@@ -0,0 +1,933 @@
/*-------------------------------------------------------------------------
*
* lmgr.c--
* POSTGRES lock manager code
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/lmgr/lmgr.c,v 1.1.1.1 1996/07/09 06:21:56 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
/* #define LOCKDEBUGALL 1 */
/* #define LOCKDEBUG 1 */
#ifdef LOCKDEBUGALL
#define LOCKDEBUG 1
#endif /* LOCKDEBUGALL */
#include "postgres.h"
#include "access/heapam.h"
#include "access/htup.h"
#include "access/relscan.h"
#include "access/skey.h"
#include "utils/tqual.h"
#include "access/xact.h"
#include "storage/block.h"
#include "storage/buf.h"
#include "storage/itemptr.h"
#include "storage/bufpage.h"
#include "storage/multilev.h"
#include "storage/lmgr.h"
#include "utils/elog.h"
#include "utils/palloc.h"
#include "utils/rel.h"
#include "catalog/catname.h"
#include "catalog/catalog.h"
#include "catalog/pg_class.h"
#include "nodes/memnodes.h"
#include "storage/bufmgr.h"
#include "access/transam.h" /* for AmiTransactionId */
/* ----------------
*
* ----------------
*/
#define MaxRetries 4 /* XXX about 1/4 minute--a hack */
#define IntentReadRelationLock 0x0100
#define ReadRelationLock 0x0200
#define IntentWriteRelationLock 0x0400
#define WriteRelationLock 0x0800
#define IntentReadPageLock 0x1000
#define ReadTupleLock 0x2000
#define TupleLevelLockCountMask 0x000f
#define TupleLevelLockLimit 10
extern Oid MyDatabaseId;
static LRelId VariableRelationLRelId = {
RelOid_pg_variable,
InvalidOid
};
/* ----------------
* RelationGetLRelId
* ----------------
*/
#ifdef LOCKDEBUG
#define LOCKDEBUG_10 \
elog(NOTICE, "RelationGetLRelId(%s) invalid lockInfo", \
RelationGetRelationName(relation));
#else
#define LOCKDEBUG_10
#endif /* LOCKDEBUG */
/*
* RelationGetLRelId --
* Returns "lock" relation identifier for a relation.
*/
LRelId
RelationGetLRelId(Relation relation)
{
LockInfo linfo;
/* ----------------
* sanity checks
* ----------------
*/
Assert(RelationIsValid(relation));
linfo = (LockInfo) relation->lockInfo;
/* ----------------
* initialize lock info if necessary
* ----------------
*/
if (! LockInfoIsValid(linfo)) {
LOCKDEBUG_10;
RelationInitLockInfo(relation);
linfo = (LockInfo) relation->lockInfo;
}
/* ----------------
* XXX hack to prevent problems during
* VARIABLE relation initialization
* ----------------
*/
if (strcmp(RelationGetRelationName(relation)->data,
VariableRelationName) == 0) {
return (VariableRelationLRelId);
}
return (linfo->lRelId);
}
/*
* LRelIdGetDatabaseId --
* Returns database identifier for a "lock" relation identifier.
*/
/* ----------------
* LRelIdGetDatabaseId
*
* Note: The argument may not be correct, if it is not used soon
* after it is created.
* ----------------
*/
Oid
LRelIdGetDatabaseId(LRelId lRelId)
{
return (lRelId.dbId);
}
/*
* LRelIdGetRelationId --
* Returns relation identifier for a "lock" relation identifier.
*/
Oid
LRelIdGetRelationId(LRelId lRelId)
{
return (lRelId.relId);
}
/*
* DatabaseIdIsMyDatabaseId --
* True iff database object identifier is valid in my present database.
*/
bool
DatabaseIdIsMyDatabaseId(Oid databaseId)
{
return (bool)
(!OidIsValid(databaseId) || databaseId == MyDatabaseId);
}
/*
* LRelIdContainsMyDatabaseId --
* True iff "lock" relation identifier is valid in my present database.
*/
bool
LRelIdContainsMyDatabaseId(LRelId lRelId)
{
return (bool)
(!OidIsValid(lRelId.dbId) || lRelId.dbId == MyDatabaseId);
}
/*
* RelationInitLockInfo --
* Initializes the lock information in a relation descriptor.
*/
/* ----------------
* RelationInitLockInfo
*
* XXX processingVariable is a hack to prevent problems during
* VARIABLE relation initialization.
* ----------------
*/
void
RelationInitLockInfo(Relation relation)
{
LockInfo info;
char *relname;
Oid relationid;
bool processingVariable;
extern Oid MyDatabaseId; /* XXX use include */
extern GlobalMemory CacheCxt;
/* ----------------
* sanity checks
* ----------------
*/
Assert(RelationIsValid(relation));
Assert(OidIsValid(RelationGetRelationId(relation)));
/* ----------------
* get information from relation descriptor
* ----------------
*/
info = (LockInfo) relation->lockInfo;
relname = (char *) RelationGetRelationName(relation);
relationid = RelationGetRelationId(relation);
processingVariable = (strcmp(relname, VariableRelationName) == 0);
/* ----------------
* create a new lockinfo if not already done
* ----------------
*/
if (! PointerIsValid(info))
{
MemoryContext oldcxt;
oldcxt = MemoryContextSwitchTo((MemoryContext)CacheCxt);
info = (LockInfo)palloc(sizeof(LockInfoData));
MemoryContextSwitchTo(oldcxt);
}
else if (processingVariable) {
if (IsTransactionState()) {
TransactionIdStore(GetCurrentTransactionId(),
&info->transactionIdData);
}
info->flags = 0x0;
return; /* prevent an infinite loop--still true? */
}
else if (info->initialized)
{
/* ------------
* If we've already initialized we're done.
* ------------
*/
return;
}
/* ----------------
* initialize lockinfo.dbId and .relId appropriately
* ----------------
*/
if (IsSharedSystemRelationName(relname))
LRelIdAssign(&info->lRelId, InvalidOid, relationid);
else
LRelIdAssign(&info->lRelId, MyDatabaseId, relationid);
/* ----------------
* store the transaction id in the lockInfo field
* ----------------
*/
if (processingVariable)
TransactionIdStore(AmiTransactionId,
&info->transactionIdData);
else if (IsTransactionState())
TransactionIdStore(GetCurrentTransactionId(),
&info->transactionIdData);
else
StoreInvalidTransactionId(&(info->transactionIdData));
/* ----------------
* initialize rest of lockinfo
* ----------------
*/
info->flags = 0x0;
info->initialized = (bool)true;
relation->lockInfo = (Pointer) info;
}
/* ----------------
* RelationDiscardLockInfo
* ----------------
*/
#ifdef LOCKDEBUG
#define LOCKDEBUG_20 \
elog(DEBUG, "DiscardLockInfo: NULL relation->lockInfo")
#else
#define LOCKDEBUG_20
#endif /* LOCKDEBUG */
/*
* RelationDiscardLockInfo --
* Discards the lock information in a relation descriptor.
*/
void
RelationDiscardLockInfo(Relation relation)
{
if (! LockInfoIsValid(relation->lockInfo)) {
LOCKDEBUG_20;
return;
}
pfree(relation->lockInfo);
relation->lockInfo = NULL;
}
/*
* RelationSetLockForDescriptorOpen --
* Sets read locks for a relation descriptor.
*/
#ifdef LOCKDEBUGALL
#define LOCKDEBUGALL_30 \
elog(DEBUG, "RelationSetLockForDescriptorOpen(%s[%d,%d]) called", \
RelationGetRelationName(relation), lRelId.dbId, lRelId.relId)
#else
#define LOCKDEBUGALL_30
#endif /* LOCKDEBUGALL*/
void
RelationSetLockForDescriptorOpen(Relation relation)
{
/* ----------------
* sanity checks
* ----------------
*/
Assert(RelationIsValid(relation));
if (LockingDisabled())
return;
LOCKDEBUGALL_30;
/* ----------------
* read lock catalog tuples which compose the relation descriptor
* XXX race condition? XXX For now, do nothing.
* ----------------
*/
}
/* ----------------
* RelationSetLockForRead
* ----------------
*/
#ifdef LOCKDEBUG
#define LOCKDEBUG_40 \
elog(DEBUG, "RelationSetLockForRead(%s[%d,%d]) called", \
RelationGetRelationName(relation), lRelId.dbId, lRelId.relId)
#else
#define LOCKDEBUG_40
#endif /* LOCKDEBUG*/
/*
* RelationSetLockForRead --
* Sets relation level read lock.
*/
void
RelationSetLockForRead(Relation relation)
{
LockInfo linfo;
/* ----------------
* sanity checks
* ----------------
*/
Assert(RelationIsValid(relation));
if (LockingDisabled())
return;
LOCKDEBUG_40;
/* ----------------
* If we don't have lock info on the reln just go ahead and
* lock it without trying to short circuit the lock manager.
* ----------------
*/
if (!LockInfoIsValid(relation->lockInfo))
{
RelationInitLockInfo(relation);
linfo = (LockInfo) relation->lockInfo;
linfo->flags |= ReadRelationLock;
MultiLockReln(linfo, READ_LOCK);
return;
}
else
linfo = (LockInfo) relation->lockInfo;
MultiLockReln(linfo, READ_LOCK);
}
/* ----------------
* RelationUnsetLockForRead
* ----------------
*/
#ifdef LOCKDEBUG
#define LOCKDEBUG_50 \
elog(DEBUG, "RelationUnsetLockForRead(%s[%d,%d]) called", \
RelationGetRelationName(relation), lRelId.dbId, lRelId.relId)
#else
#define LOCKDEBUG_50
#endif /* LOCKDEBUG*/
/*
* RelationUnsetLockForRead --
* Unsets relation level read lock.
*/
void
RelationUnsetLockForRead(Relation relation)
{
LockInfo linfo;
/* ----------------
* sanity check
* ----------------
*/
Assert(RelationIsValid(relation));
if (LockingDisabled())
return;
linfo = (LockInfo) relation->lockInfo;
/* ----------------
* If we don't have lock info on the reln just go ahead and
* release it.
* ----------------
*/
if (!LockInfoIsValid(linfo))
{
elog(WARN,
"Releasing a lock on %s with invalid lock information",
RelationGetRelationName(relation));
}
MultiReleaseReln(linfo, READ_LOCK);
}
/* ----------------
* RelationSetLockForWrite(relation)
* ----------------
*/
#ifdef LOCKDEBUG
#define LOCKDEBUG_60 \
elog(DEBUG, "RelationSetLockForWrite(%s[%d,%d]) called", \
RelationGetRelationName(relation), lRelId.dbId, lRelId.relId)
#else
#define LOCKDEBUG_60
#endif /* LOCKDEBUG*/
/*
* RelationSetLockForWrite --
* Sets relation level write lock.
*/
void
RelationSetLockForWrite(Relation relation)
{
LockInfo linfo;
/* ----------------
* sanity checks
* ----------------
*/
Assert(RelationIsValid(relation));
if (LockingDisabled())
return;
LOCKDEBUG_60;
/* ----------------
* If we don't have lock info on the reln just go ahead and
* lock it without trying to short circuit the lock manager.
* ----------------
*/
if (!LockInfoIsValid(relation->lockInfo))
{
RelationInitLockInfo(relation);
linfo = (LockInfo) relation->lockInfo;
linfo->flags |= WriteRelationLock;
MultiLockReln(linfo, WRITE_LOCK);
return;
}
else
linfo = (LockInfo) relation->lockInfo;
MultiLockReln(linfo, WRITE_LOCK);
}
/* ----------------
* RelationUnsetLockForWrite
* ----------------
*/
#ifdef LOCKDEBUG
#define LOCKDEBUG_70 \
elog(DEBUG, "RelationUnsetLockForWrite(%s[%d,%d]) called", \
RelationGetRelationName(relation), lRelId.dbId, lRelId.relId);
#else
#define LOCKDEBUG_70
#endif /* LOCKDEBUG */
/*
* RelationUnsetLockForWrite --
* Unsets relation level write lock.
*/
void
RelationUnsetLockForWrite(Relation relation)
{
LockInfo linfo;
/* ----------------
* sanity checks
* ----------------
*/
Assert(RelationIsValid(relation));
if (LockingDisabled()) {
return;
}
linfo = (LockInfo) relation->lockInfo;
if (!LockInfoIsValid(linfo))
{
elog(WARN,
"Releasing a lock on %s with invalid lock information",
RelationGetRelationName(relation));
}
MultiReleaseReln(linfo, WRITE_LOCK);
}
/* ----------------
* RelationSetLockForTupleRead
* ----------------
*/
#ifdef LOCKDEBUG
#define LOCKDEBUG_80 \
elog(DEBUG, "RelationSetLockForTupleRead(%s[%d,%d], 0x%x) called", \
RelationGetRelationName(relation), lRelId.dbId, lRelId.relId, \
itemPointer)
#define LOCKDEBUG_81 \
elog(DEBUG, "RelationSetLockForTupleRead() escalating");
#else
#define LOCKDEBUG_80
#define LOCKDEBUG_81
#endif /* LOCKDEBUG */
/*
* RelationSetLockForTupleRead --
* Sets tuple level read lock.
*/
void
RelationSetLockForTupleRead(Relation relation, ItemPointer itemPointer)
{
LockInfo linfo;
TransactionId curXact;
/* ----------------
* sanity checks
* ----------------
*/
Assert(RelationIsValid(relation));
if (LockingDisabled())
return;
LOCKDEBUG_80;
/* ---------------------
* If our lock info is invalid don't bother trying to short circuit
* the lock manager.
* ---------------------
*/
if (!LockInfoIsValid(relation->lockInfo))
{
RelationInitLockInfo(relation);
linfo = (LockInfo) relation->lockInfo;
linfo->flags |=
IntentReadRelationLock |
IntentReadPageLock |
ReadTupleLock;
MultiLockTuple(linfo, itemPointer, READ_LOCK);
return;
}
else
linfo = (LockInfo) relation->lockInfo;
/* ----------------
* no need to set a lower granularity lock
* ----------------
*/
curXact = GetCurrentTransactionId();
if ((linfo->flags & ReadRelationLock) &&
TransactionIdEquals(curXact, linfo->transactionIdData))
{
return;
}
/* ----------------
* If we don't already have a tuple lock this transaction
* ----------------
*/
if (!( (linfo->flags & ReadTupleLock) &&
TransactionIdEquals(curXact, linfo->transactionIdData) )) {
linfo->flags |=
IntentReadRelationLock |
IntentReadPageLock |
ReadTupleLock;
/* clear count */
linfo->flags &= ~TupleLevelLockCountMask;
} else {
if (TupleLevelLockLimit == (TupleLevelLockCountMask &
linfo->flags)) {
LOCKDEBUG_81;
/* escalate */
MultiLockReln(linfo, READ_LOCK);
/* clear count */
linfo->flags &= ~TupleLevelLockCountMask;
return;
}
/* increment count */
linfo->flags =
(linfo->flags & ~TupleLevelLockCountMask) |
(1 + (TupleLevelLockCountMask & linfo->flags));
}
TransactionIdStore(curXact, &linfo->transactionIdData);
/* ----------------
* Lock the tuple.
* ----------------
*/
MultiLockTuple(linfo, itemPointer, READ_LOCK);
}
/* ----------------
* RelationSetLockForReadPage
* ----------------
*/
#ifdef LOCKDEBUG
#define LOCKDEBUG_90 \
elog(DEBUG, "RelationSetLockForReadPage(%s[%d,%d], @%d) called", \
RelationGetRelationName(relation), lRelId.dbId, lRelId.relId, page);
#else
#define LOCKDEBUG_90
#endif /* LOCKDEBUG*/
/* ----------------
* RelationSetLockForWritePage
* ----------------
*/
#ifdef LOCKDEBUG
#define LOCKDEBUG_100 \
elog(DEBUG, "RelationSetLockForWritePage(%s[%d,%d], @%d) called", \
RelationGetRelationName(relation), lRelId.dbId, lRelId.relId, page);
#else
#define LOCKDEBUG_100
#endif /* LOCKDEBUG */
/*
* RelationSetLockForWritePage --
* Sets write lock on a page.
*/
void
RelationSetLockForWritePage(Relation relation,
ItemPointer itemPointer)
{
/* ----------------
* sanity checks
* ----------------
*/
Assert(RelationIsValid(relation));
if (LockingDisabled())
return;
/* ---------------
* Make sure linfo is initialized
* ---------------
*/
if (!LockInfoIsValid(relation->lockInfo))
RelationInitLockInfo(relation);
/* ----------------
* attempt to set lock
* ----------------
*/
MultiLockPage((LockInfo) relation->lockInfo, itemPointer, WRITE_LOCK);
}
/* ----------------
* RelationUnsetLockForReadPage
* ----------------
*/
#ifdef LOCKDEBUG
#define LOCKDEBUG_110 \
elog(DEBUG, "RelationUnsetLockForReadPage(%s[%d,%d], @%d) called", \
RelationGetRelationName(relation), lRelId.dbId, lRelId.relId, page)
#else
#define LOCKDEBUG_110
#endif /* LOCKDEBUG */
/* ----------------
* RelationUnsetLockForWritePage
* ----------------
*/
#ifdef LOCKDEBUG
#define LOCKDEBUG_120 \
elog(DEBUG, "RelationUnsetLockForWritePage(%s[%d,%d], @%d) called", \
RelationGetRelationName(relation), lRelId.dbId, lRelId.relId, page)
#else
#define LOCKDEBUG_120
#endif /* LOCKDEBUG */
/*
* Set a single level write page lock. Assumes that you already
* have a write intent lock on the relation.
*/
void
RelationSetSingleWLockPage(Relation relation,
ItemPointer itemPointer)
{
/* ----------------
* sanity checks
* ----------------
*/
Assert(RelationIsValid(relation));
if (LockingDisabled())
return;
if (!LockInfoIsValid(relation->lockInfo))
RelationInitLockInfo(relation);
SingleLockPage((LockInfo)relation->lockInfo, itemPointer, WRITE_LOCK, !UNLOCK);
}
/*
* Unset a single level write page lock
*/
void
RelationUnsetSingleWLockPage(Relation relation,
ItemPointer itemPointer)
{
/* ----------------
* sanity checks
* ----------------
*/
Assert(RelationIsValid(relation));
if (LockingDisabled())
return;
if (!LockInfoIsValid(relation->lockInfo))
elog(WARN,
"Releasing a lock on %s with invalid lock information",
RelationGetRelationName(relation));
SingleLockPage((LockInfo)relation->lockInfo, itemPointer, WRITE_LOCK, UNLOCK);
}
/*
* Set a single level read page lock. Assumes you already have a read
* intent lock set on the relation.
*/
void
RelationSetSingleRLockPage(Relation relation,
ItemPointer itemPointer)
{
/* ----------------
* sanity checks
* ----------------
*/
Assert(RelationIsValid(relation));
if (LockingDisabled())
return;
if (!LockInfoIsValid(relation->lockInfo))
RelationInitLockInfo(relation);
SingleLockPage((LockInfo)relation->lockInfo, itemPointer, READ_LOCK, !UNLOCK);
}
/*
* Unset a single level read page lock.
*/
void
RelationUnsetSingleRLockPage(Relation relation,
ItemPointer itemPointer)
{
/* ----------------
* sanity checks
* ----------------
*/
Assert(RelationIsValid(relation));
if (LockingDisabled())
return;
if (!LockInfoIsValid(relation->lockInfo))
elog(WARN,
"Releasing a lock on %s with invalid lock information",
RelationGetRelationName(relation));
SingleLockPage((LockInfo)relation->lockInfo, itemPointer, READ_LOCK, UNLOCK);
}
/*
* Set a read intent lock on a relation.
*
* Usually these are set in a multi-level table when you are acquiring a
* page level lock, i.e., to acquire a lock on a page you first acquire
* an intent lock on the entire relation. Acquiring an intent lock alone
* allows one to use the single level locking routines later. Good for
* index scans that do a lot of page level locking.
*/
void
RelationSetRIntentLock(Relation relation)
{
/* -----------------
* Sanity check
* -----------------
*/
Assert(RelationIsValid(relation));
if (LockingDisabled())
return;
if (!LockInfoIsValid(relation->lockInfo))
RelationInitLockInfo(relation);
SingleLockReln((LockInfo)relation->lockInfo, READ_LOCK+INTENT, !UNLOCK);
}
/*
* Unset a read intent lock on a relation
*/
void
RelationUnsetRIntentLock(Relation relation)
{
/* -----------------
* Sanity check
* -----------------
*/
Assert(RelationIsValid(relation));
if (LockingDisabled())
return;
if (!LockInfoIsValid(relation->lockInfo))
RelationInitLockInfo(relation);
SingleLockReln((LockInfo)relation->lockInfo, READ_LOCK+INTENT, UNLOCK);
}
/*
* Set a write intent lock on a relation. For a more complete explanation
* see RelationSetRIntentLock()
*/
void
RelationSetWIntentLock(Relation relation)
{
/* -----------------
* Sanity check
* -----------------
*/
Assert(RelationIsValid(relation));
if (LockingDisabled())
return;
if (!LockInfoIsValid(relation->lockInfo))
RelationInitLockInfo(relation);
SingleLockReln((LockInfo)relation->lockInfo, WRITE_LOCK+INTENT, !UNLOCK);
}
/*
* Unset a write intent lock.
*/
void
RelationUnsetWIntentLock(Relation relation)
{
/* -----------------
* Sanity check
* -----------------
*/
Assert(RelationIsValid(relation));
if (LockingDisabled())
return;
if (!LockInfoIsValid(relation->lockInfo))
RelationInitLockInfo(relation);
SingleLockReln((LockInfo)relation->lockInfo, WRITE_LOCK+INTENT, UNLOCK);
}
/*
* Extend locks are used primarily in tertiary storage devices such as
* a WORM disk jukebox. Sometimes need exclusive access to extend a
* file by a block.
*/
void
RelationSetLockForExtend(Relation relation)
{
/* -----------------
* Sanity check
* -----------------
*/
Assert(RelationIsValid(relation));
if (LockingDisabled())
return;
if (!LockInfoIsValid(relation->lockInfo))
RelationInitLockInfo(relation);
MultiLockReln((LockInfo) relation->lockInfo, EXTEND_LOCK);
}
void
RelationUnsetLockForExtend(Relation relation)
{
/* -----------------
* Sanity check
* -----------------
*/
Assert(RelationIsValid(relation));
if (LockingDisabled())
return;
if (!LockInfoIsValid(relation->lockInfo))
RelationInitLockInfo(relation);
MultiReleaseReln((LockInfo) relation->lockInfo, EXTEND_LOCK);
}
/*
* Create an LRelId --- Why not just pass in a pointer to the storage?
*/
void
LRelIdAssign(LRelId *lRelId, Oid dbId, Oid relId)
{
lRelId->dbId = dbId;
lRelId->relId = relId;
}
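
The low four bits of linfo->flags serve as the per-transaction tuple-lock
counter that RelationSetLockForTupleRead escalates on. A standalone model
of just that counter arithmetic -- the two constants are copied from this
file, everything else is a stand-in:

#include <stdio.h>

#define TupleLevelLockCountMask 0x000f
#define TupleLevelLockLimit     10

int main(void)
{
    unsigned short flags = 0;   /* stands in for linfo->flags */
    int i;

    for (i = 0; i < 12; i++) {
        if ((TupleLevelLockCountMask & flags) == TupleLevelLockLimit) {
            printf("request %d: escalate to a relation-level read lock\n", i + 1);
            flags &= ~TupleLevelLockCountMask;    /* clear the count */
            break;
        }
        /* increment the packed count without disturbing the other flag bits */
        flags = (flags & ~TupleLevelLockCountMask) |
                (1 + (TupleLevelLockCountMask & flags));
        printf("request %d: tuple lock, count now %d\n",
               i + 1, flags & TupleLevelLockCountMask);
    }
    return 0;
}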

File diff suppressed because it is too large

View File

@@ -0,0 +1,415 @@
/*-------------------------------------------------------------------------
*
* multi.c--
* multi level lock table manager
*
* Standard multi-level lock manager as per the Gray paper
* (at least, that is what it is supposed to be). We implement
* three levels -- RELN, PAGE, TUPLE. Tuple is actually a TID,
* a physical record pointer. It isn't an object id.
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/lmgr/Attic/multi.c,v 1.1.1.1 1996/07/09 06:21:56 scrappy Exp $
*
* NOTES:
* (1) The lock.c module assumes that the caller here is doing
* two phase locking.
*
*-------------------------------------------------------------------------
*/
#include <stdio.h>
#include <string.h>
#include "storage/lmgr.h"
#include "storage/multilev.h"
#include "utils/rel.h"
#include "utils/elog.h"
#include "miscadmin.h" /* MyDatabaseId */
/*
* INTENT indicates to higher level that a lower level lock has been
* set. For example, a write lock on a tuple conflicts with a write
* lock on a relation. This conflict is detected as a WRITE_INTENT/
* WRITE conflict between the tuple's intent lock and the relation's
* write lock.
*/
static int MultiConflicts[] = {
(int)NULL,
/* All reads and writes at any level conflict with a write lock */
(1 << WRITE_LOCK)|(1 << WRITE_INTENT)|(1 << READ_LOCK)|(1 << READ_INTENT),
/* read locks conflict with write locks at curr and lower levels */
(1 << WRITE_LOCK)| (1 << WRITE_INTENT),
/* write intent locks */
(1 << READ_LOCK) | (1 << WRITE_LOCK),
/* read intent locks*/
(1 << WRITE_LOCK),
/* extend locks for archive storage manager conflict only w/extend locks */
(1 << EXTEND_LOCK)
};
/*
* write locks have higher priority than read locks and extend locks. May
* want to treat INTENT locks differently.
*/
static int MultiPrios[] = {
(int)NULL,
2,
1,
2,
1,
1
};
/*
* Lock table identifier for this lock table. The multi-level
* lock table is ONE lock table, not three.
*/
LockTableId MultiTableId = (LockTableId)NULL;
LockTableId ShortTermTableId = (LockTableId)NULL;
/*
* Create the lock table described by MultiConflicts and MultiPrios.
*/
LockTableId
InitMultiLevelLockm()
{
int tableId;
/* -----------------------
* If we're already initialized just return the table id.
* -----------------------
*/
if (MultiTableId)
return MultiTableId;
tableId = LockTabInit("LockTable", MultiConflicts, MultiPrios, 5);
MultiTableId = tableId;
if (! (MultiTableId)) {
elog(WARN,"InitMultiLockm: couldnt initialize lock table");
}
/* -----------------------
* No short term lock table for now. -Jeff 15 July 1991
*
* ShortTermTableId = LockTabRename(tableId);
* if (! (ShortTermTableId)) {
* elog(WARN,"InitMultiLockm: couldnt rename lock table");
* }
* -----------------------
*/
return MultiTableId;
}
/*
* MultiLockReln -- lock a relation
*
* Returns: TRUE if the lock can be set, FALSE otherwise.
*/
bool
MultiLockReln(LockInfo linfo, LOCKT lockt)
{
LOCKTAG tag;
/* LOCKTAG has two bytes of padding, unfortunately. The
* hash function will return miss if the padding bytes aren't
* zero'd.
*/
memset(&tag,0,sizeof(tag));
tag.relId = linfo->lRelId.relId;
tag.dbId = linfo->lRelId.dbId;
return(MultiAcquire(MultiTableId, &tag, lockt, RELN_LEVEL));
}
/*
* MultiLockTuple -- Lock the TID associated with a tuple
*
* Returns: TRUE if lock is set, FALSE otherwise.
*
* Side Effects: causes intention level locks to be set
* at the page and relation level.
*/
bool
MultiLockTuple(LockInfo linfo, ItemPointer tidPtr, LOCKT lockt)
{
LOCKTAG tag;
/* LOCKTAG has two bytes of padding, unfortunately. The
* hash function will return miss if the padding bytes aren't
* zero'd.
*/
memset(&tag,0,sizeof(tag));
tag.relId = linfo->lRelId.relId;
tag.dbId = linfo->lRelId.dbId;
/* lock the individual tuple identified by tidPtr */
tag.tupleId = *tidPtr;
return(MultiAcquire(MultiTableId, &tag, lockt, TUPLE_LEVEL));
}
/*
* same as above at page level
*/
bool
MultiLockPage(LockInfo linfo, ItemPointer tidPtr, LOCKT lockt)
{
LOCKTAG tag;
/* LOCKTAG has two bytes of padding, unfortunately. The
* hash function will return miss if the padding bytes aren't
* zero'd.
*/
memset(&tag,0,sizeof(tag));
/* ----------------------------
* Now we want to set the page offset to be invalid
* and lock the block. There is some confusion here as to what
* a page is. In Postgres a page is an 8k block, however this
* block may be partitioned into many subpages which are sometimes
* also called pages. The term is overloaded, so don't be fooled
* when we say lock the page we mean the 8k block. -Jeff 16 July 1991
* ----------------------------
*/
tag.relId = linfo->lRelId.relId;
tag.dbId = linfo->lRelId.dbId;
BlockIdCopy(&(tag.tupleId.ip_blkid), &(tidPtr->ip_blkid));
return(MultiAcquire(MultiTableId, &tag, lockt, PAGE_LEVEL));
}
/*
* MultiAcquire -- acquire multi level lock at requested level
*
* Returns: TRUE if lock is set, FALSE if not
* Side Effects:
*/
bool
MultiAcquire(LockTableId tableId,
LOCKTAG *tag,
LOCKT lockt,
LOCK_LEVEL level)
{
LOCKT locks[N_LEVELS];
int i,status;
LOCKTAG xxTag, *tmpTag = &xxTag;
int retStatus = TRUE;
/*
* Three levels implemented. If we set a low level (e.g. Tuple)
* lock, we must set INTENT locks on the higher levels. The
* intent lock detects conflicts between the low level lock
* and an existing high level lock. For example, setting a
* write lock on a tuple in a relation is disallowed if there
* is an existing read lock on the entire relation. The
* write lock would set a WRITE + INTENT lock on the relation
* and that lock would conflict with the read.
*/
switch (level) {
case RELN_LEVEL:
locks[0] = lockt;
locks[1] = NO_LOCK;
locks[2] = NO_LOCK;
break;
case PAGE_LEVEL:
locks[0] = lockt + INTENT;
locks[1] = lockt;
locks[2] = NO_LOCK;
break;
case TUPLE_LEVEL:
locks[0] = lockt + INTENT;
locks[1] = lockt + INTENT;
locks[2] = lockt;
break;
default:
elog(WARN,"MultiAcquire: bad lock level");
return(FALSE);
}
/*
* construct a new tag as we go. Always loop through all levels,
* but if we aren't setting a low level lock, locks[i] is set to
* NO_LOCK for the lower levels. Always start from the highest
* level and go to the lowest level.
*/
memset(tmpTag,0,sizeof(*tmpTag));
tmpTag->relId = tag->relId;
tmpTag->dbId = tag->dbId;
for (i=0;i<N_LEVELS;i++) {
if (locks[i] != NO_LOCK) {
switch (i) {
case RELN_LEVEL:
/* -------------
* Set the block # and offset to invalid
* -------------
*/
BlockIdSet(&(tmpTag->tupleId.ip_blkid), InvalidBlockNumber);
tmpTag->tupleId.ip_posid = InvalidOffsetNumber;
break;
case PAGE_LEVEL:
/* -------------
* Copy the block #, set the offset to invalid
* -------------
*/
BlockIdCopy(&(tmpTag->tupleId.ip_blkid),
&(tag->tupleId.ip_blkid));
tmpTag->tupleId.ip_posid = InvalidOffsetNumber;
break;
case TUPLE_LEVEL:
/* --------------
* Copy the entire tuple id.
* --------------
*/
ItemPointerCopy(&tmpTag->tupleId, &tag->tupleId);
break;
}
status = LockAcquire(tableId, tmpTag, locks[i]);
if (! status) {
/* failed for some reason. Before returning we have
* to release all of the locks we just acquired.
* MultiRelease(xx,xx,xx, i) means release starting from
* the last level lock we successfully acquired
*/
retStatus = FALSE;
(void) MultiRelease(tableId, tag, lockt, i);
/* now leave the loop. Don't try for any more locks */
break;
}
}
}
return(retStatus);
}
/* ------------------
* Release a page in the multi-level lock table
* ------------------
*/
bool
MultiReleasePage(LockInfo linfo, ItemPointer tidPtr, LOCKT lockt)
{
LOCKTAG tag;
/* ------------------
* LOCKTAG has two bytes of padding, unfortunately. The
* hash function will return miss if the padding bytes aren't
* zero'd.
* ------------------
*/
memset(&tag, 0,sizeof(LOCKTAG));
tag.relId = linfo->lRelId.relId;
tag.dbId = linfo->lRelId.dbId;
BlockIdCopy(&(tag.tupleId.ip_blkid), &(tidPtr->ip_blkid));
return (MultiRelease(MultiTableId, &tag, lockt, PAGE_LEVEL));
}
/* ------------------
* Release a relation in the multi-level lock table
* ------------------
*/
bool
MultiReleaseReln(LockInfo linfo, LOCKT lockt)
{
LOCKTAG tag;
/* ------------------
* LOCKTAG has two bytes of padding, unfortunately. The
* hash function will return miss if the padding bytes aren't
* zero'd.
* ------------------
*/
memset(&tag, 0, sizeof(LOCKTAG));
tag.relId = linfo->lRelId.relId;
tag.dbId = linfo->lRelId.dbId;
return (MultiRelease(MultiTableId, &tag, lockt, RELN_LEVEL));
}
/*
* MultiRelease -- release a multi-level lock
*
* Returns: TRUE if successful, FALSE otherwise.
*/
bool
MultiRelease(LockTableId tableId,
LOCKTAG *tag,
LOCKT lockt,
LOCK_LEVEL level)
{
LOCKT locks[N_LEVELS];
int i,status;
LOCKTAG xxTag, *tmpTag = &xxTag;
/*
* same level scheme as MultiAcquire().
*/
switch (level) {
case RELN_LEVEL:
locks[0] = lockt;
locks[1] = NO_LOCK;
locks[2] = NO_LOCK;
break;
case PAGE_LEVEL:
locks[0] = lockt + INTENT;
locks[1] = lockt;
locks[2] = NO_LOCK;
break;
case TUPLE_LEVEL:
locks[0] = lockt + INTENT;
locks[1] = lockt + INTENT;
locks[2] = lockt;
break;
default:
elog(WARN,"MultiRelease: bad lockt");
}
/*
* again, construct the tag on the fly. This time, however,
* we release the locks in the REVERSE order -- from lowest
* level to highest level.
*
* Must zero out the tag to set the padding bytes to zero and ensure
* hashing consistency.
*/
memset(tmpTag, 0, sizeof(*tmpTag));
tmpTag->relId = tag->relId;
tmpTag->dbId = tag->dbId;
for (i=(N_LEVELS-1); i>=0; i--) {
if (locks[i] != NO_LOCK) {
switch (i) {
case RELN_LEVEL:
/* -------------
* Set the block # and offset to invalid
* -------------
*/
BlockIdSet(&(tmpTag->tupleId.ip_blkid), InvalidBlockNumber);
tmpTag->tupleId.ip_posid = InvalidOffsetNumber;
break;
case PAGE_LEVEL:
/* -------------
* Copy the block #, set the offset to invalid
* -------------
*/
BlockIdCopy(&(tmpTag->tupleId.ip_blkid),
&(tag->tupleId.ip_blkid));
tmpTag->tupleId.ip_posid = InvalidOffsetNumber;
break;
case TUPLE_LEVEL:
ItemPointerCopy(&tmpTag->tupleId, &tag->tupleId);
break;
}
status = LockRelease(tableId, tmpTag, locks[i]);
if (! status) {
elog(WARN,"MultiRelease: couldn't release after error");
}
}
}
/* all levels successfully released */
return true;
}
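
MultiAcquire's level scheme boils down to filling a three-slot array and
walking it top-down, so a tuple lock drags intent locks onto the page and
relation levels above it. A compilable sketch of just that cascade; the
lock-mode values are stand-ins for the real ones in multilev.h:

#include <stdio.h>

#define NO_LOCK   0
#define READ_LOCK 2    /* stand-in value */
#define INTENT    4    /* stand-in value, used as lockt + INTENT */
#define N_LEVELS  3

enum { RELN_LEVEL, PAGE_LEVEL, TUPLE_LEVEL };

int main(void)
{
    static const char *name[] = { "relation", "page", "tuple" };
    int locks[N_LEVELS];
    int lockt = READ_LOCK;
    int level = TUPLE_LEVEL;    /* try PAGE_LEVEL or RELN_LEVEL as well */
    int i;

    switch (level) {
    case RELN_LEVEL:
        locks[0] = lockt;          locks[1] = NO_LOCK;        locks[2] = NO_LOCK; break;
    case PAGE_LEVEL:
        locks[0] = lockt + INTENT; locks[1] = lockt;          locks[2] = NO_LOCK; break;
    case TUPLE_LEVEL:
        locks[0] = lockt + INTENT; locks[1] = lockt + INTENT; locks[2] = lockt;   break;
    }

    /* acquire from the highest level down, exactly as MultiAcquire does */
    for (i = 0; i < N_LEVELS; i++)
        if (locks[i] != NO_LOCK)
            printf("%s level: acquire mode %d\n", name[i], locks[i]);
    return 0;
}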

View File

@@ -0,0 +1,826 @@
/*-------------------------------------------------------------------------
*
* proc.c--
* routines to manage per-process shared memory data structure
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/lmgr/proc.c,v 1.1.1.1 1996/07/09 06:21:57 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
/*
* Each postgres backend gets one of these. We'll use it to
* clean up after the process should the process suddenly die.
*
*
* Interface (a):
* ProcSleep(), ProcWakeup(), ProcWakeupNext(),
* ProcQueueAlloc() -- create a shm queue for sleeping processes
* ProcQueueInit() -- create a queue without allocing memory
*
* Locking and waiting for buffers can cause the backend to be
* put to sleep. Whoever releases the lock, etc. wakes the
* process up again (and gives it an error code so it knows
* whether it was awoken on an error condition).
*
* Interface (b):
*
* ProcReleaseLocks -- frees the locks associated with this process,
* ProcKill -- destroys the shared memory state (and locks)
* associated with the process.
*
* 5/15/91 -- removed the buffer pool based lock chain in favor
* of a shared memory lock chain. The write-protection is
* more expensive if the lock chain is in the buffer pool.
* The only reason I kept the lock chain in the buffer pool
* in the first place was to allow the lock table to grow larger
* than available shared memory and that isn't going to work
* without a lot of unimplemented support anyway.
*
* 4/7/95 -- instead of allocating a set of 1 semaphore per process, we
* allocate a semaphore from a set of PROC_NSEMS_PER_SET semaphores
* shared among backends (we keep a few sets of semaphores around).
* This is so that we can support more backends. (system-wide semaphore
* sets run out pretty fast.) -ay 4/95
*
*/
#include <sys/time.h>
#ifndef WIN32
#include <unistd.h>
#endif /* WIN32 */
#include <string.h>
#include <sys/types.h>
#include "libpq/pqsignal.h" /* substitute for <signal.h> */
#if defined(PORTNAME_bsdi)
/* hacka, hacka, hacka (XXX) */
union semun {
int val; /* value for SETVAL */
struct semid_ds *buf; /* buffer for IPC_STAT & IPC_SET */
ushort *array; /* array for GETALL & SETALL */
};
#endif
#include "access/xact.h"
#include "utils/hsearch.h"
#include "utils/elog.h"
#include "storage/buf.h"
#include "storage/lock.h"
#include "storage/shmem.h"
#include "storage/spin.h"
#include "storage/proc.h"
/*
* timeout (in seconds) for resolving possible deadlock
*/
#ifndef DEADLOCK_TIMEOUT
#define DEADLOCK_TIMEOUT 60
#endif
/* --------------------
* Spin lock for manipulating the shared process data structure:
* ProcGlobal.... Adding an extra spin lock seemed like the smallest
* hack to get around reading and updating this structure in shared
* memory. -mer 17 July 1991
* --------------------
*/
SPINLOCK ProcStructLock;
/*
* For cleanup routines. Don't cleanup if the initialization
* has not happened.
*/
static bool ProcInitialized = FALSE;
static PROC_HDR *ProcGlobal = NULL;
PROC *MyProc = NULL;
static void ProcKill(int exitStatus, int pid);
static void ProcGetNewSemKeyAndNum(IPCKey *key, int *semNum);
static void ProcFreeSem(IpcSemaphoreKey semKey, int semNum);
#if defined(PORTNAME_linux)
extern int HandleDeadLock(int);
#else
extern int HandleDeadLock(void);
#endif
/*
* InitProcGlobal -
* initializes the global process table. We put it here so that
* the postmaster can do this initialization. (ProcFreeAllSemaphores needs
* to read this table on exiting the postmaster. If we have the first
* backend do this, starting up and killing the postmaster without
* starting any backends will be a problem.)
*/
void
InitProcGlobal(IPCKey key)
{
bool found = false;
/* attach to the free list */
ProcGlobal = (PROC_HDR *)
ShmemInitStruct("Proc Header",(unsigned)sizeof(PROC_HDR),&found);
/* --------------------
* We're the first - initialize.
* --------------------
*/
if (! found)
{
int i;
ProcGlobal->numProcs = 0;
ProcGlobal->freeProcs = INVALID_OFFSET;
ProcGlobal->currKey = IPCGetProcessSemaphoreInitKey(key);
for (i=0; i < MAX_PROC_SEMS/PROC_NSEMS_PER_SET; i++)
ProcGlobal->freeSemMap[i] = 0;
}
}
/* ------------------------
* InitProc -- create a per-process data structure for this process
* used by the lock manager on semaphore queues.
* ------------------------
*/
void
InitProcess(IPCKey key)
{
bool found = false;
int pid;
int semstat;
unsigned long location, myOffset;
/* ------------------
* Routine called if deadlock timer goes off. See ProcSleep()
* ------------------
*/
#ifndef WIN32
signal(SIGALRM, HandleDeadLock);
#endif /* WIN32 we'll have to figure out how to handle this later */
SpinAcquire(ProcStructLock);
/* attach to the free list */
ProcGlobal = (PROC_HDR *)
ShmemInitStruct("Proc Header",(unsigned)sizeof(PROC_HDR),&found);
if (!found) {
/* this should not happen. InitProcGlobal() is called before this. */
elog(WARN, "InitProcess: Proc Header uninitialized");
}
if (MyProc != NULL)
{
SpinRelease(ProcStructLock);
elog(WARN,"ProcInit: you already exist");
return;
}
/* try to get a proc from the free list first */
myOffset = ProcGlobal->freeProcs;
if (myOffset != INVALID_OFFSET)
{
MyProc = (PROC *) MAKE_PTR(myOffset);
ProcGlobal->freeProcs = MyProc->links.next;
}
else
{
/* have to allocate one. We can't use the normal binding
* table mechanism because the proc structure is stored
* by PID instead of by a global name (need to look it
* up by PID when we cleanup dead processes).
*/
MyProc = (PROC *) ShmemAlloc((unsigned)sizeof(PROC));
if (! MyProc)
{
SpinRelease(ProcStructLock);
elog (FATAL,"cannot create new proc: out of memory");
}
/* this cannot be initialized until after the buffer pool */
SHMQueueInit(&(MyProc->lockQueue));
MyProc->procId = ProcGlobal->numProcs;
ProcGlobal->numProcs++;
}
/*
* zero out the spin lock counts and set the sLocks field for
* ProcStructLock to 1 as we have acquired this spinlock above but
* didn't record it since we didn't have MyProc until now.
*/
memset(MyProc->sLocks, 0, sizeof(MyProc->sLocks));
MyProc->sLocks[ProcStructLock] = 1;
if (IsUnderPostmaster) {
IPCKey semKey;
int semNum;
int semId;
union semun semun;
ProcGetNewSemKeyAndNum(&semKey, &semNum);
semId = IpcSemaphoreCreate(semKey,
PROC_NSEMS_PER_SET,
IPCProtection,
IpcSemaphoreDefaultStartValue,
0,
&semstat);
/*
* we might be reusing a semaphore that belongs to a dead
* backend. So be careful and reinitialize its value here.
*/
semun.val = IpcSemaphoreDefaultStartValue;
semctl(semId, semNum, SETVAL, semun);
IpcSemaphoreLock(semId, semNum, IpcExclusiveLock);
MyProc->sem.semId = semId;
MyProc->sem.semNum = semNum;
MyProc->sem.semKey = semKey;
} else {
MyProc->sem.semId = -1;
}
/* ----------------------
* Release the lock.
* ----------------------
*/
SpinRelease(ProcStructLock);
MyProc->pid = 0;
#if 0
MyProc->pid = MyPid;
#endif
/* ----------------
* Start keeping spin lock stats from here on. Any botch before
* this initialization is forever botched
* ----------------
*/
memset(MyProc->sLocks, 0, MAX_SPINS*sizeof(*MyProc->sLocks));
/* -------------------------
* Install ourselves in the binding table. The name to
* use is determined by the OS-assigned process id. That
* allows the cleanup process to find us after any untimely
* exit.
* -------------------------
*/
pid = getpid();
location = MAKE_OFFSET(MyProc);
if ((! ShmemPIDLookup(pid,&location)) || (location != MAKE_OFFSET(MyProc)))
{
elog(FATAL,"InitProc: ShmemPID table broken");
}
MyProc->errType = NO_ERROR;
SHMQueueElemInit(&(MyProc->links));
on_exitpg(ProcKill, (caddr_t)pid);
ProcInitialized = TRUE;
}
/*
* ProcReleaseLocks() -- release all locks associated with this process
*
*/
void
ProcReleaseLocks()
{
if (!MyProc)
return;
LockReleaseAll(1,&MyProc->lockQueue);
}
/*
* ProcRemove -
* used by the postmaster to clean up the global tables. This also frees
* up the semaphore used for the lmgr of the process. (We have to do
* this in the postmaster instead of doing an IpcSemaphoreKill on exiting
* the process because the semaphore set is shared among backends and
* we don't want to remove others' semaphores on exit.)
*/
bool
ProcRemove(int pid)
{
SHMEM_OFFSET location;
PROC *proc;
location = INVALID_OFFSET;
location = ShmemPIDDestroy(pid);
if (location == INVALID_OFFSET)
return(FALSE);
proc = (PROC *) MAKE_PTR(location);
SpinAcquire(ProcStructLock);
ProcFreeSem(proc->sem.semKey, proc->sem.semNum);
proc->links.next = ProcGlobal->freeProcs;
ProcGlobal->freeProcs = MAKE_OFFSET(proc);
SpinRelease(ProcStructLock);
return(TRUE);
}
/*
* ProcKill() -- Destroy the per-proc data structure for
* this process. Release any of its held spin locks.
*/
static void
ProcKill(int exitStatus, int pid)
{
PROC *proc;
SHMEM_OFFSET location;
/* --------------------
* If this is a FATAL exit the postmaster will have to kill all the
* existing backends and reinitialize shared memory, so we don't
* need to do anything here.
* --------------------
*/
if (exitStatus != 0)
return;
if (! pid)
{
pid = getpid();
}
ShmemPIDLookup(pid,&location);
if (location == INVALID_OFFSET)
return;
proc = (PROC *) MAKE_PTR(location);
if (proc != MyProc) {
Assert( pid != getpid() );
} else
MyProc = NULL;
/* ---------------
* Assume one lock table.
* ---------------
*/
ProcReleaseSpins(proc);
LockReleaseAll(1,&proc->lockQueue);
/* ----------------
* get off the wait queue
* ----------------
*/
LockLockTable();
if (proc->links.next != INVALID_OFFSET) {
Assert(proc->waitLock->waitProcs.size > 0);
SHMQueueDelete(&(proc->links));
--proc->waitLock->waitProcs.size;
}
SHMQueueElemInit(&(proc->links));
UnlockLockTable();
return;
}
/*
* ProcQueue package: routines for putting processes to sleep
* and waking them up
*/
/*
* ProcQueueAlloc -- alloc/attach to a shared memory process queue
*
* Returns: a pointer to the queue or NULL
* Side Effects: Initializes the queue if we allocated one
*/
PROC_QUEUE *
ProcQueueAlloc(char *name)
{
bool found;
PROC_QUEUE *queue = (PROC_QUEUE *)
ShmemInitStruct(name,(unsigned)sizeof(PROC_QUEUE),&found);
if (! queue)
{
return(NULL);
}
if (! found)
{
ProcQueueInit(queue);
}
return(queue);
}
/*
* ProcQueueInit -- initialize a shared memory process queue
*/
void
ProcQueueInit(PROC_QUEUE *queue)
{
SHMQueueInit(&(queue->links));
queue->size = 0;
}
/*
* ProcSleep -- put a process to sleep
*
* P() on the semaphore should put us to sleep. The process
* semaphore is cleared by default, so the first time we try
* to acquire it, we sleep.
*
* ASSUME: that no one will fiddle with the queue until after
* we release the spin lock.
*
* NOTES: The process queue is now a priority queue for locking.
*/
int
ProcSleep(PROC_QUEUE *queue,
SPINLOCK spinlock,
int token,
int prio,
LOCK *lock)
{
int i;
PROC *proc;
#ifndef WIN32 /* figure this out later */
struct itimerval timeval, dummy;
#endif /* WIN32 */
proc = (PROC *) MAKE_PTR(queue->links.prev);
for (i=0;i<queue->size;i++)
{
if (proc->prio < prio)
proc = (PROC *) MAKE_PTR(proc->links.prev);
else
break;
}
MyProc->token = token;
MyProc->waitLock = lock;
/* -------------------
* currently, we only need this for the ProcWakeup routines
* -------------------
*/
TransactionIdStore((TransactionId) GetCurrentTransactionId(), &MyProc->xid);
/* -------------------
* assume that these two operations are atomic (because
* of the spinlock).
* -------------------
*/
SHMQueueInsertTL(&(proc->links),&(MyProc->links));
queue->size++;
SpinRelease(spinlock);
/* --------------
* Postgres does not have any deadlock detection code and for this
* reason we must set a timer to wake up the process in the event of
* a deadlock. For now the timer is set for 1 minute and we assume that
* any process which sleeps for this amount of time is deadlocked and will
* receive a SIGALRM signal. The handler should release the processes
* semaphore and abort the current transaction.
*
* Need to zero out struct to set the interval and the micro seconds fields
* to 0.
* --------------
*/
#ifndef WIN32
memset(&timeval, 0, sizeof(struct itimerval));
timeval.it_value.tv_sec = DEADLOCK_TIMEOUT;
if (setitimer(ITIMER_REAL, &timeval, &dummy))
elog(FATAL, "ProcSleep: Unable to set timer for process wakeup");
#endif /* WIN32 */
/* --------------
* if someone wakes us between SpinRelease and IpcSemaphoreLock,
* IpcSemaphoreLock will not block. The wakeup is "saved" by
* the semaphore implementation.
* --------------
*/
IpcSemaphoreLock(MyProc->sem.semId, MyProc->sem.semNum, IpcExclusiveLock);
/* ---------------
* We were awoken before a timeout - now disable the timer
* ---------------
*/
#ifndef WIN32
timeval.it_value.tv_sec = 0;
if (setitimer(ITIMER_REAL, &timeval, &dummy))
elog(FATAL, "ProcSleep: Unable to diable timer for process wakeup");
#endif /* WIN32 */
/* ----------------
* We were assumed to be in a critical section when we went
* to sleep.
* ----------------
*/
SpinAcquire(spinlock);
return(MyProc->errType);
}
/*
* ProcWakeup -- wake up a process by releasing its private semaphore.
*
* remove the process from the wait queue and set its links invalid.
* RETURN: the next process in the wait queue.
*/
PROC *
ProcWakeup(PROC *proc, int errType)
{
PROC *retProc;
/* assume that spinlock has been acquired */
if (proc->links.prev == INVALID_OFFSET ||
proc->links.next == INVALID_OFFSET)
return((PROC *) NULL);
retProc = (PROC *) MAKE_PTR(proc->links.prev);
/* you have to update waitLock->waitProcs.size yourself */
SHMQueueDelete(&(proc->links));
SHMQueueElemInit(&(proc->links));
proc->errType = errType;
IpcSemaphoreUnlock(proc->sem.semId, proc->sem.semNum, IpcExclusiveLock);
return retProc;
}
/*
* ProcGetId --
*/
int
ProcGetId()
{
return( MyProc->procId );
}
/*
* ProcLockWakeup -- routine for waking up processes when a lock is
* released.
*/
int
ProcLockWakeup(PROC_QUEUE *queue, char *ltable, char *lock)
{
PROC *proc;
int count;
if (! queue->size)
return(STATUS_NOT_FOUND);
proc = (PROC *) MAKE_PTR(queue->links.prev);
count = 0;
while ((LockResolveConflicts ((LOCKTAB *) ltable,
(LOCK *) lock,
proc->token,
proc->xid) == STATUS_OK))
{
/* there was a waiting process, grant it the lock before waking it
* up. This will prevent another process from seizing the lock
* between the time we release the lock master (spinlock) and
* the time that the awoken process begins executing again.
*/
GrantLock((LOCK *) lock, proc->token);
queue->size--;
/*
* ProcWakeup removes proc from the lock waiting process queue and
* returns the next proc in chain. If a writer just dropped
* its lock and there are several waiting readers, wake them all up.
*/
proc = ProcWakeup(proc, NO_ERROR);
count++;
if (!proc || queue->size == 0)
break;
}
if (count)
return(STATUS_OK);
else
/* Something is still blocking us. May have deadlocked. */
return(STATUS_NOT_FOUND);
}
void
ProcAddLock(SHM_QUEUE *elem)
{
SHMQueueInsertTL(&MyProc->lockQueue,elem);
}
/* --------------------
* We only get to this routine if we got SIGALRM after DEADLOCK_TIMEOUT
* while waiting for a lock to be released by some other process. After
* the one minute deadline we assume we have a deadlock and must abort
* this transaction. We must also indicate that I'm no longer waiting
* on a lock so that other processes don't try to wake me up and screw
* up my semaphore.
* --------------------
*/
int
#if defined(PORTNAME_linux)
HandleDeadLock(int i)
#else
HandleDeadLock()
#endif
{
LOCK *lock;
int size;
LockLockTable();
/* ---------------------
* Check to see if we've been awoken by anyone in the interim.
*
* If we have we can return and resume our transaction -- happy day.
* Before we are awoken the process releasing the lock grants it to
* us so we know that we don't have to wait anymore.
*
* Damn these names are LONG! -mer
* ---------------------
*/
if (IpcSemaphoreGetCount(MyProc->sem.semId, MyProc->sem.semNum) ==
IpcSemaphoreDefaultStartValue) {
UnlockLockTable();
return 1;
}
/*
* you would think this would be unnecessary, but...
*
* this also means we've been removed already. in some ports
* (e.g., sparc and aix) the semop(2) implementation is such that
* we can actually end up in this handler after someone has removed
* us from the queue and bopped the semaphore *but the test above
* fails to detect the semaphore update* (presumably something weird
* having to do with the order in which the semaphore wakeup signal
* and SIGALRM get handled).
*/
if (MyProc->links.prev == INVALID_OFFSET ||
MyProc->links.next == INVALID_OFFSET) {
UnlockLockTable();
return(1);
}
lock = MyProc->waitLock;
size = lock->waitProcs.size; /* so we can look at this in the core */
/* ------------------------
* Get this process off the lock's wait queue
* ------------------------
*/
Assert(lock->waitProcs.size > 0);
--lock->waitProcs.size;
SHMQueueDelete(&(MyProc->links));
SHMQueueElemInit(&(MyProc->links));
/* ------------------
* Unlock my semaphore so that the count is right for next time.
* I was awoken by a signal, not by someone unlocking my semaphore.
* ------------------
*/
IpcSemaphoreUnlock(MyProc->sem.semId, MyProc->sem.semNum, IpcExclusiveLock);
/* -------------
* Set MyProc->errType to STATUS_ERROR so that we abort after
* returning from this handler.
* -------------
*/
MyProc->errType = STATUS_ERROR;
/*
* if this doesn't follow the IpcSemaphoreUnlock then we get lock
* table corruption ("LockReplace: xid table corrupted") due to
* race conditions. i don't claim to understand this...
*/
UnlockLockTable();
elog(NOTICE, "Timeout -- possible deadlock");
return 0;
}
void
ProcReleaseSpins(PROC *proc)
{
int i;
if (!proc)
proc = MyProc;
if (!proc)
return;
for (i=0; i < (int)MAX_SPINS; i++)
{
if (proc->sLocks[i])
{
Assert(proc->sLocks[i] == 1);
SpinRelease(i);
}
}
}
/*****************************************************************************
*
*****************************************************************************/
/*
* ProcGetNewSemKeyAndNum -
* scan the free semaphore bitmap and allocate a single semaphore from
* a semaphore set. (If the semaphore set doesn't exist yet,
* IpcSemaphoreCreate will create it. Otherwise, we use the existing
* semaphore set.)
*/
static void
ProcGetNewSemKeyAndNum(IPCKey *key, int *semNum)
{
int i;
int32 *freeSemMap = ProcGlobal->freeSemMap;
unsigned int fullmask;
/*
* we hold ProcStructLock when entering this routine. We scan through
* the bitmap to look for a free semaphore.
*/
fullmask = ~0 >> (32 - PROC_NSEMS_PER_SET);
for(i=0; i < MAX_PROC_SEMS/PROC_NSEMS_PER_SET; i++) {
int mask = 1;
int j;
if (freeSemMap[i] == fullmask)
continue; /* none free for this set */
for(j = 0; j < PROC_NSEMS_PER_SET; j++) {
if ((freeSemMap[i] & mask) == 0) {
/*
* a free semaphore found. Mark it as allocated.
*/
freeSemMap[i] |= mask;
*key = ProcGlobal->currKey + i;
*semNum = j;
return;
}
mask <<= 1;
}
}
/* if we reach here, all the semaphores are in use. */
elog(WARN, "InitProc: cannot allocate a free semaphore");
}
/*
* ProcFreeSem -
* free up our semaphore in the semaphore set. If we're the last one
* in the set, also remove the semaphore set.
*/
static void
ProcFreeSem(IpcSemaphoreKey semKey, int semNum)
{
int mask;
int i;
int32 *freeSemMap = ProcGlobal->freeSemMap;
i = semKey - ProcGlobal->currKey;
mask = ~(1 << semNum);
freeSemMap[i] &= mask;
if (freeSemMap[i]==0)
IpcSemaphoreKill(semKey);
}
/*
* ProcFreeAllSemaphores -
* on exiting the postmaster, we free up all the semaphores allocated
* to the lmgrs of the backends.
*/
void
ProcFreeAllSemaphores()
{
int i;
int32 *freeSemMap = ProcGlobal->freeSemMap;
for(i=0; i < MAX_PROC_SEMS/PROC_NSEMS_PER_SET; i++) {
if (freeSemMap[i]!=0)
IpcSemaphoreKill(ProcGlobal->currKey + i);
}
}
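
ProcGetNewSemKeyAndNum and ProcFreeSem above amount to a bitmap allocator.
A self-contained model of the same scan-and-mark logic; the set count and
the 16 semaphores per set are assumptions standing in for MAX_PROC_SEMS
and PROC_NSEMS_PER_SET:

#include <stdio.h>
#include <stdint.h>

#define NSETS          4     /* stand-in for MAX_PROC_SEMS/PROC_NSEMS_PER_SET */
#define NSEMS_PER_SET 16     /* stand-in for PROC_NSEMS_PER_SET */

static int32_t freeSemMap[NSETS];

/* find a clear bit, mark it, and report (set, semaphore-within-set) */
static int alloc_sem(int *set, int *num)
{
    uint32_t fullmask = ~0u >> (32 - NSEMS_PER_SET);
    int i, j;

    for (i = 0; i < NSETS; i++) {
        if ((uint32_t) freeSemMap[i] == fullmask)
            continue;                       /* this set is exhausted */
        for (j = 0; j < NSEMS_PER_SET; j++)
            if (!(freeSemMap[i] & (1 << j))) {
                freeSemMap[i] |= (1 << j);  /* mark it allocated */
                *set = i;
                *num = j;
                return 1;
            }
    }
    return 0;                               /* every semaphore is in use */
}

static void free_sem(int set, int num)
{
    freeSemMap[set] &= ~(1 << num);
    if (freeSemMap[set] == 0)
        printf("set %d now empty: this is where IpcSemaphoreKill would run\n", set);
}

int main(void)
{
    int s, n;

    if (alloc_sem(&s, &n))
        printf("allocated set %d, semaphore %d\n", s, n);
    free_sem(s, n);
    return 0;
}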

View File

@@ -0,0 +1,86 @@
/*-------------------------------------------------------------------------
*
* single.c--
* set single locks in the multi-level lock hierarchy
*
* Sometimes we don't want to set all levels of the multi-level
* lock hierarchy at once. This allows us to set and release
* one level at a time. It's useful in index scans when
* you can set an intent lock at the beginning and thereafter
* only set page locks. Tends to speed things up.
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/lmgr/Attic/single.c,v 1.1.1.1 1996/07/09 06:21:57 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#include <string.h>
#include "storage/lmgr.h" /* where the declarations go */
#include "storage/lock.h"
#include "storage/multilev.h"
#include "utils/rel.h"
/*
* SingleLockReln -- lock a relation
*
* Returns: TRUE if the lock can be set, FALSE otherwise.
*/
bool
SingleLockReln(LockInfo linfo, LOCKT lockt, int action)
{
LOCKTAG tag;
/*
* LOCKTAG has two bytes of padding, unfortunately. The
* hash function will return miss if the padding bytes aren't
* zero'd.
*/
memset(&tag,0,sizeof(tag));
tag.relId = linfo->lRelId.relId;
tag.dbId = linfo->lRelId.dbId;
BlockIdSet(&(tag.tupleId.ip_blkid), InvalidBlockNumber);
tag.tupleId.ip_posid = InvalidOffsetNumber;
if (action == UNLOCK)
return(LockRelease(MultiTableId, &tag, lockt));
else
return(LockAcquire(MultiTableId, &tag, lockt));
}
/*
* SingleLockPage -- use multi-level lock table, but lock
* only at the page level.
*
* Assumes that an INTENT lock has already been set in the
* multi-level lock table.
*
*/
bool
SingleLockPage(LockInfo linfo,
ItemPointer tidPtr,
LOCKT lockt,
int action)
{
LOCKTAG tag;
/*
* LOCKTAG has two bytes of padding, unfortunately. The
* hash function will return miss if the padding bytes aren't
* zero'd.
*/
memset(&tag,0,sizeof(tag));
tag.relId = linfo->lRelId.relId;
tag.dbId = linfo->lRelId.dbId;
BlockIdCopy(&(tag.tupleId.ip_blkid), &(tidPtr->ip_blkid));
tag.tupleId.ip_posid = InvalidOffsetNumber;
if (action == UNLOCK)
return(LockRelease(MultiTableId, &tag, lockt));
else
return(LockAcquire(MultiTableId, &tag, lockt));
}
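
As the header comment says, the point of these routines is one intent lock
up front and bare page locks thereafter. A stand-in sketch of that
index-scan pattern; the printf bodies substitute for the real lock-manager
calls named in the comments:

#include <stdio.h>

static void set_read_intent(const char *rel)       { printf("R-intent lock on %s\n", rel); }
static void unset_read_intent(const char *rel)     { printf("drop R-intent lock on %s\n", rel); }
static void rlock_page(const char *rel, int blk)   { printf("  read-lock %s page %d\n", rel, blk); }
static void runlock_page(const char *rel, int blk) { printf("  unlock %s page %d\n", rel, blk); }

int main(void)
{
    const char *rel = "some_btree_index";   /* hypothetical relation name */
    int blk;

    set_read_intent(rel);                /* RelationSetRIntentLock() */
    for (blk = 0; blk < 3; blk++) {
        rlock_page(rel, blk);            /* RelationSetSingleRLockPage() */
        /* ... examine the page ... */
        runlock_page(rel, blk);          /* RelationUnsetSingleRLockPage() */
    }
    unset_read_intent(rel);              /* RelationUnsetRIntentLock() */
    return 0;
}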

218
src/backend/storage/lock.h Normal file
View File

@@ -0,0 +1,218 @@
/*-------------------------------------------------------------------------
*
* lock.h--
*
*
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: lock.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef LOCK_H_
#define LOCK_H_
#include "postgres.h"
#include "storage/itemptr.h"
#include "storage/shmem.h"
#include "storage/spin.h"
#include "storage/backendid.h"
#include "utils/hsearch.h"
extern SPINLOCK LockMgrLock;
typedef int MASK;
#define INIT_TABLE_SIZE 100
#define MAX_TABLE_SIZE 1000
/* ----------------------
* The following defines are used to estimate how much shared
* memory the lock manager is going to require.
*
* NBACKENDS - The number of concurrently running backends
* NLOCKS_PER_XACT - The number of unique locks acquired in a transaction
* NLOCKENTS - The maximum number of lock entries in the lock table.
* ----------------------
*/
#define NBACKENDS 50
#define NLOCKS_PER_XACT 40
#define NLOCKENTS (NLOCKS_PER_XACT*NBACKENDS)
typedef int LOCK_TYPE;
typedef int LOCKT;
typedef int LockTableId;
/* MAX_LOCKTYPES cannot be larger than the bits in MASK */
#define MAX_LOCKTYPES 6
/*
* MAX_TABLES corresponds to the number of spin locks allocated in
* CreateSpinLocks() or the number of shared memory locations allocated
* for lock table spin locks in the case of machines with TAS instructions.
*/
#define MAX_TABLES 2
#define INVALID_TABLEID 0
/*typedef struct LOCK LOCK; */
typedef struct ltag {
Oid relId;
Oid dbId;
ItemPointerData tupleId;
} LOCKTAG;
#define TAGSIZE (sizeof(LOCKTAG))
/* This is the control structure for a lock table. It
* lives in shared memory:
*
* tableID -- the handle used by the lock table's clients to
* refer to the table.
*
* nLockTypes -- number of lock types (READ,WRITE,etc) that
* are defined on this lock table
*
* conflictTab -- this is an array of bitmasks showing lock
* type conflicts. conflictTab[i] is a mask with the j-th bit
* turned on if lock types i and j conflict.
*
* prio -- each locktype has a priority, so, for example, waiting
* writers can be given priority over readers (to avoid
* starvation).
*
* masterlock -- synchronizes access to the table
*
*/
typedef struct lockctl {
LockTableId tableId;
int nLockTypes;
int conflictTab[MAX_LOCKTYPES];
int prio[MAX_LOCKTYPES];
SPINLOCK masterLock;
} LOCKCTL;
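/*
 * Illustrative sketch (not part of the original header): one way a
 * two-mode conflict table and priority vector might be filled in
 * before calling LockTabInit().  The mode numbering and bit layout
 * here are hypothetical, shown only to make conflictTab[] concrete.
 */
#ifdef EXAMPLE_ONLY
static MASK example_conflicts[MAX_LOCKTYPES] = {
	0,				/* slot 0 unused */
	(1 << 1) | (1 << 2),		/* mode 1 (write) conflicts with 1, 2 */
	(1 << 1)			/* mode 2 (read) conflicts with 1 only */
};
static int example_prios[MAX_LOCKTYPES] = {0, 2, 1};	/* writers first */
#endif /* EXAMPLE_ONLY */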
/*
* lockHash -- hash table on lock Ids,
* xidHash -- hash on xid and lockId in case
* multiple processes are holding the lock
* ctl - control structure described above.
*/
typedef struct ltable {
HTAB *lockHash;
HTAB *xidHash;
LOCKCTL *ctl;
} LOCKTAB;
/* -----------------------
* A transaction never conflicts with its own locks. Hence, if
* multiple transactions hold non-conflicting locks on the same
* data, private per-transaction information must be stored in the
* XID table. The tag is XID + shared memory lock address so that
* all locks can use the same XID table. The private information
* we store is the number of locks of each type (holders) and the
* total number of locks (nHolding) held by the transaction.
*
* NOTE: --
* There were some problems with the fact that currently TransactionIdData
* is a 5 byte entity and compilers long word aligning of structure fields.
* If the 3 byte padding is put in front of the actual xid data then the
* hash function (which uses XID_TAGSIZE when deciding how many bytes of a
* struct to look at for the key) might only see the last two bytes of the xid.
*
 * Clearly this is not good since it's likely that these bytes will be the
 * same for many transactions and hence they will share the same entry in
 * the hash table, causing the entry to be corrupted. For this long-winded
* reason I have put the tag in a struct of its own to ensure that the
* XID_TAGSIZE is computed correctly. It used to be sizeof (SHMEM_OFFSET) +
* sizeof(TransactionIdData) which != sizeof(XIDTAG).
*
* Finally since the hash function will now look at all 12 bytes of the tag
* the padding bytes MUST be zero'd before use in hash_search() as they
* will have random values otherwise. Jeff 22 July 1991.
* -----------------------
*/
typedef struct XIDTAG {
SHMEM_OFFSET lock;
int pid;
TransactionId xid;
} XIDTAG;
typedef struct XIDLookupEnt {
/* tag */
XIDTAG tag;
/* data */
int holders[MAX_LOCKTYPES];
int nHolding;
SHM_QUEUE queue;
} XIDLookupEnt;
#define XID_TAGSIZE (sizeof(XIDTAG))
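/*
 * Illustrative sketch (not part of the original header): per the note
 * above, the padding bytes of an XIDTAG must be zero'd before it is
 * used as a hash key.  The function name is hypothetical; hash_search()
 * and HASH_FIND come from utils/hsearch.h.
 */
#ifdef EXAMPLE_ONLY
static XIDLookupEnt *
example_xid_lookup(HTAB *xidHash, SHMEM_OFFSET lockOffset,
		   int pid, TransactionId xid, bool *foundPtr)
{
	XIDTAG tag;

	memset(&tag, 0, XID_TAGSIZE);	/* make the padding deterministic */
	tag.lock = lockOffset;
	tag.pid = pid;
	tag.xid = xid;
	return ((XIDLookupEnt *) hash_search(xidHash, (char *) &tag,
					     HASH_FIND, foundPtr));
}
#endif /* EXAMPLE_ONLY */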
/* originally in procq.h */
typedef struct procQueue {
SHM_QUEUE links;
int size;
} PROC_QUEUE;
/*
* lock information:
*
* tag -- uniquely identifies the object being locked
* mask -- union of the conflict masks of all lock types
* currently held on this object.
* waitProcs -- queue of processes waiting for this lock
* holders -- count of each lock type currently held on the
* lock.
* nHolding -- total locks of all types.
*/
typedef struct Lock {
/* hash key */
LOCKTAG tag;
/* data */
int mask;
PROC_QUEUE waitProcs;
int holders[MAX_LOCKTYPES];
int nHolding;
int activeHolders[MAX_LOCKTYPES];
int nActive;
} LOCK;
#define LockGetLock_nHolders(l) ((l)->nHolding)
#define LockDecrWaitHolders(lock, lockt) \
do { \
	(lock)->nHolding--; \
	(lock)->holders[lockt]--; \
} while (0)
#define LockLockTable() SpinAcquire(LockMgrLock);
#define UnlockLockTable() SpinRelease(LockMgrLock);
extern SPINLOCK LockMgrLock;
/*
* function prototypes
*/
extern void InitLocks(void);
extern void LockDisable(int status);
extern LockTableId LockTabInit(char *tabName, MASK *conflictsP, int *prioP,
int ntypes);
extern LockTableId LockTabRename(LockTableId tableId);
extern bool LockAcquire(LockTableId tableId, LOCKTAG *lockName, LOCKT lockt);
extern int LockResolveConflicts(LOCKTAB *ltable, LOCK *lock, LOCKT lockt,
TransactionId xid);
extern int WaitOnLock(LOCKTAB *ltable, LockTableId tableId, LOCK *lock,
LOCKT lockt);
extern bool LockRelease(LockTableId tableId, LOCKTAG *lockName, LOCKT lockt);
extern void GrantLock(LOCK *lock, LOCKT lockt);
extern bool LockReleaseAll(LockTableId tableId, SHM_QUEUE *lockQueue);
extern int LockShmemSize(void);
extern bool LockingDisabled(void);
#endif /* LOCK_H_ */

64
src/backend/storage/multilev.h Normal file
@@ -0,0 +1,64 @@
/*-------------------------------------------------------------------------
*
* multilev.h--
* multi level lock table consts/defs for single.c and multi.c and their
* clients
*
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: multilev.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef MULTILEV_H
#define MULTILEV_H
#include "storage/lock.h"
#include "storage/lmgr.h"
#define READ_LOCK 2
#define WRITE_LOCK 1
/* Any time a small-granularity READ/WRITE lock is set,
 * higher-granularity READ_INTENT/WRITE_INTENT locks must
 * also be set. A read intent lock has the value READ+INTENT
 * in this implementation.
*/
#define NO_LOCK 0
#define INTENT 2
#define READ_INTENT (READ_LOCK+INTENT)
#define WRITE_INTENT (WRITE_LOCK+INTENT)
#define EXTEND_LOCK 5
#define SHORT_TERM 1
#define LONG_TERM 2
#define UNLOCK 0
#define N_LEVELS 3
#define RELN_LEVEL 0
#define PAGE_LEVEL 1
#define TUPLE_LEVEL 2
typedef int LOCK_LEVEL;
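/*
 * Illustrative sketch (not part of the original header): the intent
 * modes are derived arithmetically from the base modes, so the mode a
 * coarser level needs can be computed rather than looked up.  The
 * function name is hypothetical.
 */
#ifdef EXAMPLE_ONLY
static LOCKT
example_intent_mode(LOCKT lockt)
{
	/* READ_LOCK -> READ_INTENT, WRITE_LOCK -> WRITE_INTENT */
	return (lockt + INTENT);
}
#endif /* EXAMPLE_ONLY */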
/* multi.c */
extern LockTableId MultiTableId;
extern LockTableId ShortTermTableId;
/*
* function prototypes
*/
extern LockTableId InitMultiLevelLockm(void);
extern bool MultiLockReln(LockInfo linfo, LOCKT lockt);
extern bool MultiLockTuple(LockInfo linfo, ItemPointer tidPtr, LOCKT lockt);
extern bool MultiLockPage(LockInfo linfo, ItemPointer tidPtr, LOCKT lockt);
extern bool MultiAcquire(LockTableId tableId, LOCKTAG *tag, LOCKT lockt,
LOCK_LEVEL level);
extern bool MultiReleasePage(LockInfo linfo, ItemPointer tidPtr, LOCKT lockt);
extern bool MultiReleaseReln(LockInfo linfo, LOCKT lockt);
extern bool MultiRelease(LockTableId tableId, LOCKTAG *tag, LOCKT lockt,
LOCK_LEVEL level);
#endif /* MULTILEV_H */

60
src/backend/storage/off.h Normal file
@@ -0,0 +1,60 @@
/*-------------------------------------------------------------------------
*
* off.h--
* POSTGRES disk "offset" definitions.
*
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: off.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef OFF_H
#define OFF_H
#include "c.h"
#include "machine.h" /* for BLCKSZ */
#include "storage/itemid.h"
/*
* OffsetNumber:
*
* this is a 1-based index into the linp (ItemIdData) array in the
* header of each disk page.
*/
typedef uint16 OffsetNumber;
#define InvalidOffsetNumber ((OffsetNumber) 0)
#define FirstOffsetNumber ((OffsetNumber) 1)
#define MaxOffsetNumber ((OffsetNumber) (BLCKSZ / sizeof(ItemIdData)))
#define OffsetNumberMask (0xffff) /* valid uint16 bits */
/* ----------------
* support macros
* ----------------
*/
/*
* OffsetNumberIsValid --
* True iff the offset number is valid.
*/
#define OffsetNumberIsValid(offsetNumber) \
((bool) ((offsetNumber != InvalidOffsetNumber) && \
(offsetNumber <= MaxOffsetNumber)))
/*
* OffsetNumberNext --
* OffsetNumberPrev --
* Increments/decrements the argument. These macros look pointless
* but they help us disambiguate the different manipulations on
 * OffsetNumbers (e.g., sometimes we subtract one from an
* OffsetNumber to move back, and sometimes we do so to form a
* real C array index).
*/
#define OffsetNumberNext(offsetNumber) \
((OffsetNumber) (1 + (offsetNumber)))
#define OffsetNumberPrev(offsetNumber) \
((OffsetNumber) (-1 + (offsetNumber)))
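/*
 * Illustrative sketch (not part of the original header): walking a
 * contiguous range of offset numbers with the macros above.  The
 * function name is hypothetical; "maxoff" would normally come from
 * PageGetMaxOffsetNumber() in bufpage.c.
 */
#ifdef EXAMPLE_ONLY
static void
example_walk_offsets(OffsetNumber maxoff)
{
	OffsetNumber offnum;

	for (offnum = FirstOffsetNumber;
	     offnum <= maxoff;
	     offnum = OffsetNumberNext(offnum)) {
		/* ... process the line pointer at "offnum" ... */
	}
}
#endif /* EXAMPLE_ONLY */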
#endif /* OFF_H */

26
src/backend/storage/page.h Normal file
@@ -0,0 +1,26 @@
/*-------------------------------------------------------------------------
*
* page.h--
* POSTGRES buffer page abstraction definitions.
*
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: page.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef PAGE_H
#define PAGE_H
#include "c.h"
typedef Pointer Page;
/*
* PageIsValid --
* True iff page is valid.
*/
#define PageIsValid(page) PointerIsValid(page)
#endif /* PAGE_H */

16
src/backend/storage/page/Makefile.inc Normal file
@@ -0,0 +1,16 @@
#-------------------------------------------------------------------------
#
# Makefile.inc--
# Makefile for storage/page
#
# Copyright (c) 1994, Regents of the University of California
#
#
# IDENTIFICATION
# $Header: /cvsroot/pgsql/src/backend/storage/page/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:58 scrappy Exp $
#
#-------------------------------------------------------------------------
SUBSRCS+= bufpage.c itemptr.c

519
src/backend/storage/page/bufpage.c Normal file
@@ -0,0 +1,519 @@
/*-------------------------------------------------------------------------
*
* bufpage.c--
* POSTGRES standard buffer page code.
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/page/bufpage.c,v 1.1.1.1 1996/07/09 06:21:58 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#include <sys/types.h>
#include <sys/file.h>
#include "c.h"
#include "storage/item.h"
#include "storage/buf.h"
#include "storage/bufmgr.h"
#include "utils/elog.h"
#include "utils/palloc.h"
#include "utils/memutils.h"
#include "storage/bufpage.h"
#include "lib/qsort.h"
static bool PageManagerShuffle = true; /* default is shuffle mode */
/* ----------------------------------------------------------------
* Buffer support functions
* ----------------------------------------------------------------
*/
/*
* BufferGetPageSize --
* Returns the page size within a buffer.
*
* Notes:
* Assumes buffer is valid.
*
* The buffer can be a raw disk block and need not contain a valid
* (formatted) disk page.
*/
Size
BufferGetPageSize(Buffer buffer)
{
Size pageSize;
Assert(BufferIsValid(buffer));
pageSize = BLCKSZ; /* XXX dig out of buffer descriptor */
Assert(PageSizeIsValid(pageSize));
return (pageSize);
}
/*
* BufferGetPage --
* Returns the page associated with a buffer.
*/
Page
BufferGetPage(Buffer buffer)
{
return (Page) BufferGetBlock(buffer);
}
/* ----------------------------------------------------------------
* Page support functions
* ----------------------------------------------------------------
*/
/*
* PageInit --
* Initializes the contents of a page.
*/
void
PageInit(Page page, Size pageSize, Size specialSize)
{
PageHeader p = (PageHeader) page;
Assert(pageSize == BLCKSZ);
Assert(pageSize >
specialSize + sizeof(PageHeaderData) - sizeof(ItemIdData));
specialSize = DOUBLEALIGN(specialSize);
p->pd_lower = sizeof(PageHeaderData) - sizeof(ItemIdData);
p->pd_upper = pageSize - specialSize;
p->pd_special = pageSize - specialSize;
PageSetPageSize(page, pageSize);
}
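/*
 * Illustrative sketch (not part of the original file): the state
 * PageInit() leaves behind.  On a fresh page the linp array is empty,
 * so pd_lower points at its (empty) start and pd_upper == pd_special
 * == pageSize - DOUBLEALIGN(specialSize).  The function name is
 * hypothetical.
 */
#ifdef EXAMPLE_ONLY
static void
example_check_fresh_page(Page page)
{
	PageHeader p = (PageHeader) page;

	Assert(p->pd_lower == sizeof(PageHeaderData) - sizeof(ItemIdData));
	Assert(p->pd_upper == p->pd_special);
}
#endif /* EXAMPLE_ONLY */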
/*
* PageGetItem --
* Retrieves an item on the given page.
*
* Note:
 * This does not change the status of any of the resources passed.
* The semantics may change in the future.
*/
Item
PageGetItem(Page page, ItemId itemId)
{
Item item;
Assert(PageIsValid(page));
Assert((*itemId).lp_flags & LP_USED);
item = (Item)(((char *)page) + (*itemId).lp_off);
return (item);
}
/*
* PageAddItem --
* Adds item to the given page.
*
* Note:
* This does not assume that the item resides on a single page.
 * It is the responsibility of the caller to act appropriately
* depending on this fact. The "pskip" routines provide a
* friendlier interface, in this case.
*
 * This does not change the status of any of the resources passed.
* The semantics may change in the future.
*
* This routine should probably be combined with others?
*/
/* ----------------
* PageAddItem
*
* add an item to a page.
*
* Notes on interface:
* If offsetNumber is valid, shuffle ItemId's down to make room
* to use it, if PageManagerShuffle is true. If PageManagerShuffle is
* false, then overwrite the specified ItemId. (PageManagerShuffle is
* true by default, and is modified by calling PageManagerModeSet.)
* If offsetNumber is not valid, then assign one by finding the first
* one that is both unused and deallocated.
*
* NOTE: If offsetNumber is valid, and PageManagerShuffle is true, it
* is assumed that there is room on the page to shuffle the ItemId's
* down by one.
* ----------------
*/
OffsetNumber
PageAddItem(Page page,
Item item,
Size size,
OffsetNumber offsetNumber,
ItemIdFlags flags)
{
	register int i;
Size alignedSize;
Offset lower;
Offset upper;
ItemId itemId;
ItemId fromitemId, toitemId;
OffsetNumber limit;
bool shuffled = false;
/*
* Find first unallocated offsetNumber
*/
limit = OffsetNumberNext(PageGetMaxOffsetNumber(page));
/* was offsetNumber passed in? */
if (OffsetNumberIsValid(offsetNumber)) {
if (PageManagerShuffle == true) {
/* shuffle ItemId's (Do the PageManager Shuffle...) */
for (i = (limit - 1); i >= offsetNumber; i--) {
fromitemId = &((PageHeader)page)->pd_linp[i - 1];
toitemId = &((PageHeader)page)->pd_linp[i];
*toitemId = *fromitemId;
}
shuffled = true; /* need to increase "lower" */
} else { /* overwrite mode */
itemId = &((PageHeader)page)->pd_linp[offsetNumber - 1];
if (((*itemId).lp_flags & LP_USED) ||
((*itemId).lp_len != 0)) {
elog(WARN, "PageAddItem: tried overwrite of used ItemId");
return (InvalidOffsetNumber);
}
}
} else { /* offsetNumber was not passed in, so find one */
/* look for "recyclable" (unused & deallocated) ItemId */
for (offsetNumber = 1; offsetNumber < limit; offsetNumber++) {
itemId = &((PageHeader)page)->pd_linp[offsetNumber - 1];
if ((((*itemId).lp_flags & LP_USED) == 0) &&
((*itemId).lp_len == 0))
break;
}
}
if (offsetNumber > limit)
lower = (Offset) (((char *) (&((PageHeader)page)->pd_linp[offsetNumber])) - ((char *) page));
else if (offsetNumber == limit || shuffled == true)
lower = ((PageHeader)page)->pd_lower + sizeof (ItemIdData);
else
lower = ((PageHeader)page)->pd_lower;
alignedSize = DOUBLEALIGN(size);
upper = ((PageHeader)page)->pd_upper - alignedSize;
if (lower > upper) {
return (InvalidOffsetNumber);
}
itemId = &((PageHeader)page)->pd_linp[offsetNumber - 1];
(*itemId).lp_off = upper;
(*itemId).lp_len = size;
(*itemId).lp_flags = flags;
memmove((char *)page + upper, item, size);
((PageHeader)page)->pd_lower = lower;
((PageHeader)page)->pd_upper = upper;
return (offsetNumber);
}
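/*
 * Illustrative sketch (not part of the original file): a minimal
 * PageAddItem() call.  Passing InvalidOffsetNumber lets the routine
 * pick the first recyclable line pointer itself; LP_USED marks the
 * new entry as in use.  The function name is hypothetical.
 */
#ifdef EXAMPLE_ONLY
static OffsetNumber
example_add_item(Page page, Item item, Size size)
{
	OffsetNumber offnum;

	offnum = PageAddItem(page, item, size, InvalidOffsetNumber, LP_USED);
	if (offnum == InvalidOffsetNumber)
		elog(WARN, "example_add_item: no room left on page");
	return (offnum);
}
#endif /* EXAMPLE_ONLY */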
/*
* PageGetTempPage --
* Get a temporary page in local memory for special processing
*/
Page
PageGetTempPage(Page page, Size specialSize)
{
Size pageSize;
Size size;
Page temp;
PageHeader thdr;
pageSize = PageGetPageSize(page);
if ((temp = (Page) palloc(pageSize)) == (Page) NULL)
elog(FATAL, "Cannot allocate %d bytes for temp page.", pageSize);
thdr = (PageHeader) temp;
/* copy old page in */
memmove(temp, page, pageSize);
/* clear out the middle */
size = (pageSize - sizeof(PageHeaderData)) + sizeof(ItemIdData);
size -= DOUBLEALIGN(specialSize);
memset((char *) &(thdr->pd_linp[0]), 0, size);
/* set high, low water marks */
thdr->pd_lower = sizeof (PageHeaderData) - sizeof (ItemIdData);
thdr->pd_upper = pageSize - DOUBLEALIGN(specialSize);
return (temp);
}
/*
* PageRestoreTempPage --
* Copy temporary page back to permanent page after special processing
* and release the temporary page.
*/
void
PageRestoreTempPage(Page tempPage, Page oldPage)
{
Size pageSize;
pageSize = PageGetPageSize(tempPage);
memmove((char *) oldPage, (char *) tempPage, pageSize);
pfree(tempPage);
}
/*
* PageGetMaxOffsetNumber --
* Returns the maximum offset number used by the given page.
*
 * NOTE: The offset is invalid if the page is empty.
* Test whether PageIsEmpty before calling this routine
* and/or using its return value.
*/
OffsetNumber
PageGetMaxOffsetNumber(Page page)
{
LocationIndex low;
OffsetNumber i;
low = ((PageHeader) page)->pd_lower;
i = (low - (sizeof(PageHeaderData) - sizeof(ItemIdData)))
/ sizeof(ItemIdData);
return(i);
}
/* ----------------
* itemid stuff for PageRepairFragmentation
* ----------------
*/
struct itemIdSortData {
int offsetindex; /* linp array index */
ItemIdData itemiddata;
};
/*
 * Sort in descending lp_off order, so the tuple stored highest on the
 * page is moved first when the page is compacted.
 */
static int
itemidcompare(struct itemIdSortData *itemidp1, struct itemIdSortData *itemidp2)
{
if (itemidp1->itemiddata.lp_off == itemidp2->itemiddata.lp_off)
return(0);
else if (itemidp1->itemiddata.lp_off < itemidp2->itemiddata.lp_off)
return(1);
else
return(-1);
}
/*
* PageRepairFragmentation --
* Frees fragmented space on a page.
*/
void
PageRepairFragmentation(Page page)
{
int i;
struct itemIdSortData *itemidbase, *itemidptr;
ItemId lp;
int nline, nused;
int itemidcompare();
Offset upper;
Size alignedSize;
nline = (int16) PageGetMaxOffsetNumber(page);
nused = 0;
for (i=0; i<nline; i++) {
lp = ((PageHeader)page)->pd_linp + i;
if ((*lp).lp_flags & LP_USED)
nused++;
}
if (nused == 0) {
for (i=0; i<nline; i++) {
lp = ((PageHeader)page)->pd_linp + i;
if ((*lp).lp_len > 0) /* unused, but allocated */
(*lp).lp_len = 0; /* indicate unused & deallocated */
}
((PageHeader)page)->pd_upper = ((PageHeader)page)->pd_special;
} else { /* nused != 0 */
itemidbase = (struct itemIdSortData *)
palloc(sizeof(struct itemIdSortData) * nused);
memset((char *) itemidbase, 0, sizeof(struct itemIdSortData) * nused);
itemidptr = itemidbase;
for (i=0; i<nline; i++) {
lp = ((PageHeader)page)->pd_linp + i;
if ((*lp).lp_flags & LP_USED) {
itemidptr->offsetindex = i;
itemidptr->itemiddata = *lp;
itemidptr++;
} else {
if ((*lp).lp_len > 0) /* unused, but allocated */
(*lp).lp_len = 0; /* indicate unused & deallocated */
}
}
/* sort itemIdSortData array...*/
pg_qsort((char *) itemidbase, nused, sizeof(struct itemIdSortData),
(void*) itemidcompare);
/* compactify page */
((PageHeader)page)->pd_upper = ((PageHeader)page)->pd_special;
for (i=0, itemidptr = itemidbase; i<nused; i++, itemidptr++) {
lp = ((PageHeader)page)->pd_linp + itemidptr->offsetindex;
alignedSize = DOUBLEALIGN((*lp).lp_len);
upper = ((PageHeader)page)->pd_upper - alignedSize;
memmove((char *) page + upper,
(char *)page + (*lp).lp_off,
(*lp).lp_len);
(*lp).lp_off = upper;
((PageHeader)page)->pd_upper = upper;
}
pfree(itemidbase);
}
}
/*
* PageGetFreeSpace --
* Returns the size of the free (allocatable) space on a page.
*/
Size
PageGetFreeSpace(Page page)
{
Size space;
space = ((PageHeader)page)->pd_upper - ((PageHeader)page)->pd_lower;
if (space < sizeof (ItemIdData)) {
return (0);
}
space -= sizeof (ItemIdData); /* XXX not always true */
return (space);
}
/*
* PageManagerModeSet --
*
* Sets mode to either: ShufflePageManagerMode (the default) or
* OverwritePageManagerMode. For use by access methods code
* for determining semantics of PageAddItem when the offsetNumber
* argument is passed in.
*/
void
PageManagerModeSet(PageManagerMode mode)
{
if (mode == ShufflePageManagerMode)
PageManagerShuffle = true;
else if (mode == OverwritePageManagerMode)
PageManagerShuffle = false;
}
/*
*----------------------------------------------------------------
* PageIndexTupleDelete
*----------------------------------------------------------------
*
* This routine does the work of removing a tuple from an index page.
*/
void
PageIndexTupleDelete(Page page, OffsetNumber offnum)
{
PageHeader phdr;
char *addr;
ItemId tup;
Size size;
char *locn;
int nbytes;
int offidx;
phdr = (PageHeader) page;
/* change offset number to offset index */
offidx = offnum - 1;
tup = PageGetItemId(page, offnum);
size = ItemIdGetLength(tup);
size = DOUBLEALIGN(size);
/* location of deleted tuple data */
locn = (char *) (page + ItemIdGetOffset(tup));
/*
* First, we want to get rid of the pd_linp entry for the index
* tuple. We copy all subsequent linp's back one slot in the
* array.
*/
nbytes = phdr->pd_lower -
((char *)&phdr->pd_linp[offidx + 1] - (char *) phdr);
memmove((char *) &(phdr->pd_linp[offidx]),
(char *) &(phdr->pd_linp[offidx + 1]),
nbytes);
/*
* Now move everything between the old upper bound (beginning of tuple
* space) and the beginning of the deleted tuple forward, so that
* space in the middle of the page is left free. If we've just deleted
* the tuple at the beginning of tuple space, then there's no need
* to do the copy (and bcopy on some architectures SEGV's if asked
* to move zero bytes).
*/
/* beginning of tuple space */
addr = (char *) (page + phdr->pd_upper);
if (locn != addr)
memmove(addr + size, addr, (int) (locn - addr));
/* adjust free space boundary pointers */
phdr->pd_upper += size;
phdr->pd_lower -= sizeof (ItemIdData);
/* finally, we need to adjust the linp entries that remain */
if (!PageIsEmpty(page))
PageIndexTupleDeleteAdjustLinePointers(phdr, locn, size);
}
/*
*----------------------------------------------------------------
* PageIndexTupleDeleteAdjustLinePointers
*----------------------------------------------------------------
*
* Once the line pointers and tuple data have been shifted around
* on the page, we need to go down the line pointer vector and
* adjust pointers to reflect new locations. Anything that used
* to be before the deleted tuple's data was moved forward by the
* size of the deleted tuple.
*
* This routine does the work of adjusting the line pointers.
* Location is where the tuple data used to lie; size is how
* much space it occupied. We assume that size has been aligned
* as required by the time we get here.
*
* This routine should never be called on an empty page.
*/
void
PageIndexTupleDeleteAdjustLinePointers(PageHeader phdr,
char *location,
Size size)
{
int i;
/* location is an index into the page... */
location -= (int) phdr;
for (i = PageGetMaxOffsetNumber((Page) phdr) - 1; i >= 0; i--) {
if (phdr->pd_linp[i].lp_off <= (unsigned) location) {
phdr->pd_linp[i].lp_off += size;
}
}
}

40
src/backend/storage/page/itemptr.c Normal file
@@ -0,0 +1,40 @@
/*-------------------------------------------------------------------------
*
* itemptr.c--
* POSTGRES disk item pointer code.
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/page/itemptr.c,v 1.1.1.1 1996/07/09 06:21:59 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#include "c.h"
#include "storage/block.h"
#include "storage/off.h"
#include "storage/itemptr.h"
#include "storage/bufpage.h"
/*
* ItemPointerEquals --
* Returns true if both item pointers point to the same item,
* otherwise returns false.
*
* Note:
* Assumes that the disk item pointers are not NULL.
*/
bool
ItemPointerEquals(ItemPointer pointer1, ItemPointer pointer2)
{
if (ItemPointerGetBlockNumber(pointer1) ==
ItemPointerGetBlockNumber(pointer2) &&
ItemPointerGetOffsetNumber(pointer1) ==
ItemPointerGetOffsetNumber(pointer2))
return(true);
else
return(false);
}

33
src/backend/storage/pagenum.h Normal file
@@ -0,0 +1,33 @@
/*-------------------------------------------------------------------------
*
* pagenum.h--
* POSTGRES page number definitions.
*
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: pagenum.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef PAGENUM_H
#define PAGENUM_H
#include "c.h"
#include "storage/page.h"
typedef uint16 PageNumber;
typedef uint32 LogicalPageNumber;
#define InvalidLogicalPageNumber 0
/*
* LogicalPageNumberIsValid --
* True iff the logical page number is valid.
*/
#define LogicalPageNumberIsValid(pageNumber) \
((bool)((pageNumber) != InvalidLogicalPageNumber))
#endif /* PAGENUM_H */

64
src/backend/storage/pos.h Normal file
@@ -0,0 +1,64 @@
/*-------------------------------------------------------------------------
*
* pos.h--
* POSTGRES "position" definitions.
*
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: pos.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef POS_H
#define POS_H
#include "c.h"
/*
* a 'position' used to be <pagenumber, offset> in postgres. this has
* been changed to just <offset> as the notion of having multiple pages
* within a block has been removed.
*
* the 'offset' abstraction is somewhat confusing. it is NOT a byte
* offset within the page; instead, it is an offset into the line
 * pointer array contained on every page that stores (heap or index)
* tuples.
*/
typedef bits16 PositionIdData;
typedef PositionIdData *PositionId;
/* ----------------
* support macros
* ----------------
*/
/*
* PositionIdIsValid --
* True iff the position identifier is valid.
*/
#define PositionIdIsValid(positionId) \
PointerIsValid(positionId)
/*
* PositionIdSetInvalid --
* Make an invalid position.
*/
#define PositionIdSetInvalid(positionId) \
*(positionId) = (bits16) 0
/*
* PositionIdSet --
* Sets a position identifier to the specified value.
*/
#define PositionIdSet(positionId, offsetNumber) \
*(positionId) = (offsetNumber)
/*
* PositionIdGetOffsetNumber --
* Retrieve the offset number from a position identifier.
*/
#define PositionIdGetOffsetNumber(positionId) \
((OffsetNumber) *(positionId))
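/*
 * Illustrative sketch (not part of the original header): round-tripping
 * an offset number through a PositionIdData.  The function name is
 * hypothetical; OffsetNumber is declared in storage/off.h.
 */
#ifdef EXAMPLE_ONLY
static OffsetNumber
example_position_roundtrip(OffsetNumber offnum)
{
	PositionIdData pos;

	PositionIdSet(&pos, offnum);
	return (PositionIdGetOffsetNumber(&pos));
}
#endif /* EXAMPLE_ONLY */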
#endif /* POS_H */

127
src/backend/storage/proc.h Normal file
@@ -0,0 +1,127 @@
/*-------------------------------------------------------------------------
*
* proc.h--
*
*
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: proc.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef _PROC_H_
#define _PROC_H_
#include "storage/ipc.h"
#include "storage/lock.h"
#ifndef WIN32
#include <sys/sem.h>
#else
/* This is because WIN32 already defines PROC */
#define PROC PGL_PROC
#endif /* WIN32 */
#include "storage/shmem.h"
typedef struct {
int sleeplock;
int semNum;
IpcSemaphoreId semId;
IpcSemaphoreKey semKey;
} SEMA;
/*
* Each backend has:
*/
typedef struct proc {
/* proc->links MUST BE THE FIRST ELEMENT OF STRUCT (see ProcWakeup()) */
SHM_QUEUE links; /* proc can be waiting for one event(lock) */
SEMA sem; /* ONE semaphore to sleep on */
int errType; /* error code tells why we woke up */
int procId; /* unique number for this structure
* NOT unique per backend, these things
* are reused after the backend dies.
*/
int critSects; /* If critSects > 0, we are in sensitive
* routines that cannot be recovered when
* the process fails.
*/
int prio; /* priority for sleep queue */
TransactionId xid; /* transaction currently being executed
* by this proc
*/
LOCK * waitLock; /* Lock we're sleeping on */
int token; /* info for proc wakeup routines */
int pid; /* This procs process id */
short sLocks[MAX_SPINS]; /* Spin lock stats */
SHM_QUEUE lockQueue; /* locks associated with current transaction */
} PROC;
/*
* MAX_PROC_SEMS is the maximum number of per-process semaphores (those used
* by the lock mgr) we can keep track of. PROC_NSEMS_PER_SET is the number
* of semaphores in each (sys-V) semaphore set allocated. (Be careful not
 * to set it to greater than 32. Otherwise, the bitmap will overflow.)
*/
#define MAX_PROC_SEMS 128
#define PROC_NSEMS_PER_SET 16
typedef struct procglobal {
SHMEM_OFFSET freeProcs;
int numProcs;
IPCKey currKey;
int32 freeSemMap[MAX_PROC_SEMS/PROC_NSEMS_PER_SET];
} PROC_HDR;
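/*
 * Illustrative sketch (not part of the original header): how the
 * freeSemMap bitmap could be consulted.  One int32 covers one
 * semaphore set of PROC_NSEMS_PER_SET semaphores; the exact bit
 * assignment here is hypothetical.
 */
#ifdef EXAMPLE_ONLY
static bool
example_sem_in_use(PROC_HDR *procHdr, int semNum)
{
	int32 word = procHdr->freeSemMap[semNum / PROC_NSEMS_PER_SET];

	return ((word & (1 << (semNum % PROC_NSEMS_PER_SET))) != 0);
}
#endif /* EXAMPLE_ONLY */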
extern PROC *MyProc;
#define PROC_INCR_SLOCK(lock) if (MyProc) (MyProc->sLocks[(lock)])++
#define PROC_DECR_SLOCK(lock) if (MyProc) (MyProc->sLocks[(lock)])--
/*
* flags explaining why process woke up
*/
#define NO_ERROR 0
#define ERR_TIMEOUT 1
#define ERR_BUFFER_IO 2
#define MAX_PRIO 50
#define MIN_PRIO (-1)
extern SPINLOCK ProcStructLock;
/*
* Function Prototypes
*/
extern void InitProcess(IPCKey key);
extern void ProcReleaseLocks(void);
extern bool ProcRemove(int pid);
/* extern bool ProcKill(int exitStatus, int pid); */
/* make static in storage/lmgr/proc.c -- jolly */
extern PROC_QUEUE *ProcQueueAlloc(char *name);
extern void ProcQueueInit(PROC_QUEUE *queue);
extern int ProcSleep(PROC_QUEUE *queue, SPINLOCK spinlock, int token,
int prio, LOCK *lock);
extern PROC *ProcWakeup(PROC *proc, int errType);
extern int ProcGetId(void);
extern int ProcLockWakeup(PROC_QUEUE *queue, char * ltable, char * lock);
extern void ProcAddLock(SHM_QUEUE *elem);
#if defined(PORTNAME_linux)
extern int HandleDeadLock(int);
#else
extern int HandleDeadLock(void);
#endif
extern void ProcReleaseSpins(PROC *proc);
extern void ProcFreeAllSemaphores(void);
#endif /* _PROC_H_ */

104
src/backend/storage/shmem.h Normal file
@@ -0,0 +1,104 @@
/*-------------------------------------------------------------------------
*
* shmem.h--
* shared memory management structures
*
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: shmem.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef SHMEM_H
#define SHMEM_H
#include "storage/spin.h" /* for SPINLOCK */
#include "utils/hsearch.h" /* for HTAB */
/* The shared memory region can start at a different address
* in every process. Shared memory "pointers" are actually
* offsets relative to the start of the shared memory region(s).
*/
typedef unsigned long SHMEM_OFFSET;
#define INVALID_OFFSET (-1)
#define BAD_LOCATION (-1)
/* start of the lowest shared memory region. For now, assume that
* there is only one shared memory region
*/
extern SHMEM_OFFSET ShmemBase;
/* coerce an offset into a pointer in this process's address space */
#define MAKE_PTR(xx_offs)\
(ShmemBase+((unsigned long)(xx_offs)))
/* coerce a pointer into a shmem offset */
#define MAKE_OFFSET(xx_ptr)\
(SHMEM_OFFSET) (((unsigned long)(xx_ptr))-ShmemBase)
#define SHM_PTR_VALID(xx_ptr)\
(((unsigned long)xx_ptr) > ShmemBase)
/* cannot have an offset to ShmemFreeStart (offset 0) */
#define SHM_OFFSET_VALID(xx_offs)\
((xx_offs != 0) && (xx_offs != INVALID_OFFSET))
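/*
 * Illustrative sketch (not part of the original header): converting a
 * pointer inside the shared region to an offset and back.  Assumes the
 * pointer really lies within the region, as the macros require.  The
 * function name is hypothetical.
 */
#ifdef EXAMPLE_ONLY
static long *
example_offset_roundtrip(long *shmPtr)
{
	SHMEM_OFFSET offs = MAKE_OFFSET(shmPtr);

	Assert(SHM_OFFSET_VALID(offs));
	return ((long *) MAKE_PTR(offs));
}
#endif /* EXAMPLE_ONLY */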
extern SPINLOCK ShmemLock;
extern SPINLOCK BindingLock;
/* shmemqueue.c */
typedef struct SHM_QUEUE {
SHMEM_OFFSET prev;
SHMEM_OFFSET next;
} SHM_QUEUE;
/* shmem.c */
extern void ShmemBindingTabReset();
extern void ShmemCreate(unsigned int key, unsigned int size);
extern int InitShmem(unsigned int key, unsigned int size);
extern long *ShmemAlloc(unsigned long size);
extern int ShmemIsValid(unsigned long addr);
extern HTAB *ShmemInitHash(char *name, long init_size, long max_size,
HASHCTL *infoP, int hash_flags);
extern bool ShmemPIDLookup(int pid, SHMEM_OFFSET* locationPtr);
extern SHMEM_OFFSET ShmemPIDDestroy(int pid);
extern long *ShmemInitStruct(char *name, unsigned long size,
bool *foundPtr);
typedef int TableID;
/* size constants for the binding table */
/* max size of data structure string name */
#define BTABLE_KEYSIZE (50)
/* data in binding table hash bucket */
#define BTABLE_DATASIZE (sizeof(BindingEnt) - BTABLE_KEYSIZE)
/* maximum size of the binding table */
#define BTABLE_SIZE (100)
/* this is a hash bucket in the binding table */
typedef struct {
char key[BTABLE_KEYSIZE]; /* string name */
unsigned long location; /* location in shared mem */
unsigned long size; /* numbytes allocated for the
* structure
*/
} BindingEnt;
/*
* prototypes for functions in shmqueue.c
*/
extern void SHMQueueInit(SHM_QUEUE *queue);
extern bool SHMQueueIsDetached(SHM_QUEUE *queue);
extern void SHMQueueElemInit(SHM_QUEUE *queue);
extern void SHMQueueDelete(SHM_QUEUE *queue);
extern void SHMQueueInsertHD(SHM_QUEUE *queue, SHM_QUEUE *elem);
extern void SHMQueueInsertTL(SHM_QUEUE *queue, SHM_QUEUE *elem);
extern void SHMQueueFirst(SHM_QUEUE *queue, Pointer *nextPtrPtr,
SHM_QUEUE *nextQueue);
extern bool SHMQueueEmpty(SHM_QUEUE *queue);
#endif /* SHMEM_H */

33
src/backend/storage/sinval.h Normal file
@@ -0,0 +1,33 @@
/*-------------------------------------------------------------------------
*
* sinval.h--
* POSTGRES shared cache invalidation communication definitions.
*
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: sinval.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef SINVAL_H
#define SINVAL_H
#include "c.h"
#include "storage/spin.h"
#include "storage/ipc.h"
#include "storage/itemptr.h"
#include "storage/backendid.h"
extern SPINLOCK SInvalLock;
extern void CreateSharedInvalidationState(IPCKey key);
extern void AttachSharedInvalidationState(IPCKey key);
extern void InitSharedInvalidationState();
extern void RegisterSharedInvalid(int cacheId, Index hashIndex,
ItemPointer pointer);
extern void InvalidateSharedInvalid(void (*invalFunction)(),
void (*resetFunction)());
#endif /* SINVAL_H */

126
src/backend/storage/sinvaladt.h Normal file
@@ -0,0 +1,126 @@
/*-------------------------------------------------------------------------
*
* sinvaladt.h--
* POSTGRES shared cache invalidation segment definitions.
*
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: sinvaladt.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef SINVALADT_H
#define SINVALADT_H
#include "postgres.h" /* XXX */
#include "storage/ipc.h"
#include "storage/itemptr.h"
#include "storage/sinval.h"
/*
 * The structure of the shared cache invalidation segment
*
*/
/*
A------------- Header info --------------
criticalSectionSemaphoreId
generalSemaphoreId
startEntrySection (offset a)
endEntrySection (offset a + b)
startFreeSpace (offset relative to B)
startEntryChain (offset relative to B)
endEntryChain (offset relative to B)
numEntries
maxNumEntries
procState[MaxBackendId] --> limit
resetState (bool)
a tag (POSTID)
B------------- Start entry section -------
SISegEntry --> entryData --> ... (see SharedInvalidData!)
isfree (bool)
next (offset to next entry in chain )
b .... (dynamically growing down)
C----------------End shared segment -------
*/
/* Parameters (configurable) *******************************************/
#define MaxBackendId 32 /* maximum number of backends */
#define MAXNUMMESSAGES 1000 /* maximum number of messages in seg*/
#define InvalidOffset 1000000000 /* an invalid offset (End of chain) */
typedef struct ProcState {
int limit; /* the number of read messages */
bool resetState; /* true, if backend has to reset its state */
int tag; /* special tag, received from the postmaster */
} ProcState;
typedef struct SISeg {
IpcSemaphoreId criticalSectionSemaphoreId; /* semaphore id */
IpcSemaphoreId generalSemaphoreId; /* semaphore id */
Offset startEntrySection; /* (offset a) */
Offset endEntrySection; /* (offset a + b) */
Offset startFreeSpace; /* (offset relative to B) */
Offset startEntryChain; /* (offset relative to B) */
Offset endEntryChain; /* (offset relative to B) */
int numEntries;
int maxNumEntries;
ProcState procState[MaxBackendId]; /* reflects the invalidation state */
/* here starts the entry section, controlled by offsets */
} SISeg;
#define SizeSISeg sizeof(SISeg)
typedef struct SharedInvalidData {
int cacheId; /* XXX */
Index hashIndex;
ItemPointerData pointerData;
} SharedInvalidData;
typedef SharedInvalidData *SharedInvalid;
typedef struct SISegEntry {
SharedInvalidData entryData; /* the message data */
bool isfree; /* entry free? */
Offset next; /* offset to next entry*/
} SISegEntry;
#define SizeOfOneSISegEntry sizeof(SISegEntry)
typedef struct SISegOffsets {
Offset startSegment; /* always 0 (for now) */
Offset offsetToFirstEntry; /* A + a = B */
Offset offsetToEndOfSegemnt; /* A + a + b */
} SISegOffsets;
/****************************************************************************/
/* synchronization of the shared buffer access */
/* access to the buffer is synchronized by the lock manager !! */
/****************************************************************************/
#define SI_LockStartValue 255
#define SI_SharedLock (-1)
#define SI_ExclusiveLock (-255)
extern SISeg *shmInvalBuffer;
/*
* prototypes for functions in sinvaladt.c
*/
extern int SIBackendInit(SISeg *segInOutP);
extern int SISegmentInit(bool killExistingSegment, IPCKey key);
extern bool SISetDataEntry(SISeg *segP, SharedInvalidData *data);
extern void SISetProcStateInvalid(SISeg *segP);
extern bool SIDelDataEntry(SISeg *segP);
extern void SIReadEntryData(SISeg *segP, int backendId,
void (*invalFunction)(), void (*resetFunction)());
extern void SIDelExpiredDataEntries(SISeg *segP);
#endif /* SINVALADT_H */

84
src/backend/storage/smgr.h Normal file
@@ -0,0 +1,84 @@
/*-------------------------------------------------------------------------
*
* smgr.h--
* storage manager switch public interface declarations.
*
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: smgr.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef SMGR_H
#define SMGR_H
#include "utils/rel.h"
#include "storage/spin.h" /* for SPINLOCK */
#define SM_FAIL 0
#define SM_SUCCESS 1
#define DEFAULT_SMGR 0
extern int smgrinit(void);
extern void smgrshutdown(int dummy);
extern int smgrcreate(int16 which, Relation reln);
extern int smgrunlink(int16 which, Relation reln);
extern int smgrextend(int16 which, Relation reln, char *buffer);
extern int smgropen(int16 which, Relation reln);
extern int smgrclose(int16 which, Relation reln);
extern int smgrread(int16 which, Relation reln, BlockNumber blocknum,
char *buffer);
extern int smgrwrite(int16 which, Relation reln, BlockNumber blocknum,
char *buffer);
extern int smgrflush(int16 which, Relation reln, BlockNumber blocknum,
char *buffer);
extern int smgrblindwrt(int16 which, char *dbname, char *relname, Oid dbid,
Oid relid, BlockNumber blkno, char *buffer);
extern int smgrnblocks(int16 which, Relation reln);
extern int smgrcommit(void);
extern int smgrabort(void);
extern bool smgriswo(int16 smgrno);
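/*
 * Illustrative sketch (not part of the original header): calling
 * through the storage manager switch.  Every routine reports SM_FAIL
 * or SM_SUCCESS, so callers check the return code.  The function name
 * is hypothetical; "buffer" must hold BLCKSZ bytes.
 */
#ifdef EXAMPLE_ONLY
static int
example_read_block(Relation reln, BlockNumber blocknum, char *buffer)
{
	if (smgrread(DEFAULT_SMGR, reln, blocknum, buffer) == SM_FAIL)
		return (SM_FAIL);
	/* ... the block is now in "buffer" ... */
	return (SM_SUCCESS);
}
#endif /* EXAMPLE_ONLY */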
/* internals: move me elsewhere -- ay 7/94 */
/* in md.c */
extern int mdinit(void);
extern int mdcreate(Relation reln);
extern int mdunlink(Relation reln);
extern int mdextend(Relation reln, char *buffer);
extern int mdopen(Relation reln);
extern int mdclose(Relation reln);
extern int mdread(Relation reln, BlockNumber blocknum, char *buffer);
extern int mdwrite(Relation reln, BlockNumber blocknum, char *buffer);
extern int mdflush(Relation reln, BlockNumber blocknum, char *buffer);
extern int mdblindwrt(char *dbstr, char *relstr, Oid dbid, Oid relid,
BlockNumber blkno, char *buffer);
extern int mdnblocks(Relation reln);
extern int mdcommit(void);
extern int mdabort(void);
/* mm.c */
extern SPINLOCK MMCacheLock;
extern int mminit(void);
extern int mmshutdown(void);
extern int mmcreate(Relation reln);
extern int mmunlink(Relation reln);
extern int mmextend(Relation reln, char *buffer);
extern int mmopen(Relation reln);
extern int mmclose(Relation reln);
extern int mmread(Relation reln, BlockNumber blocknum, char *buffer);
extern int mmwrite(Relation reln, BlockNumber blocknum, char *buffer);
extern int mmflush(Relation reln, BlockNumber blocknum, char *buffer);
extern int mmblindwrt(char *dbstr, char *relstr, Oid dbid, Oid relid,
BlockNumber blkno, char *buffer);
extern int mmnblocks(Relation reln);
extern int mmcommit(void);
extern int mmabort(void);
extern int MMShmemSize(void);
#endif /* SMGR_H */

14
src/backend/storage/smgr/Makefile.inc Normal file
@@ -0,0 +1,14 @@
#-------------------------------------------------------------------------
#
# Makefile.inc--
# Makefile for storage/smgr
#
# Copyright (c) 1994, Regents of the University of California
#
#
# IDENTIFICATION
# $Header: /cvsroot/pgsql/src/backend/storage/smgr/Attic/Makefile.inc,v 1.1.1.1 1996/07/09 06:21:59 scrappy Exp $
#
#-------------------------------------------------------------------------
SUBSRCS+= md.c mm.c smgr.c smgrtype.c

40
src/backend/storage/smgr/README Normal file
@@ -0,0 +1,40 @@
# $Header: /cvsroot/pgsql/src/backend/storage/smgr/README,v 1.1.1.1 1996/07/09 06:21:59 scrappy Exp $
This directory contains the code that supports the Postgres storage manager
switch and all of the installed storage managers. In released systems,
the only supported storage manager is the magnetic disk manager. At UC
Berkeley, the Sony WORM optical disk jukebox and persistent main memory are
also supported.
As of Postgres Release 3.0, every relation in the system is tagged with the
storage manager on which it resides. The storage manager switch code turns
what used to be filesystem operations into operations on the correct store
for any given relation.
The files in this directory, and their contents, are
smgrtype.c Storage manager type -- maps string names to storage manager
IDs and provides simple comparison operators. This is the
regproc support for type 'smgr' in the system catalogs.
smgr.c The storage manager switch dispatch code. The routines in
this file call the appropriate storage manager to do hardware
accesses requested by the backend.
md.c The magnetic disk storage manager.
mm.c The persistent main memory storage manager (#undef'ed in
tmp/c.h for all distributed systems).
sj.c The sony jukebox storage manager and cache management code
(#undef'ed in tmp/c.h for all distributed systems). The
routines in this file allocate extents, maintain block
maps, and guarantee the persistence and coherency of a cache
of jukebox blocks on magnetic disk.
pgjb.c The postgres jukebox interface routines. The routines here
handle exclusion on the physical device and translate requests
from the storage manager code (sj.c) into jbaccess calls.
jbaccess.c Access code for the physical Sony jukebox device. This code
was swiped from Andy McFadden's jblib.a code at UC Berkeley.

697
src/backend/storage/smgr/md.c Normal file
@@ -0,0 +1,697 @@
/*-------------------------------------------------------------------------
*
* md.c--
* This code manages relations that reside on magnetic disk.
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/smgr/md.c,v 1.1.1.1 1996/07/09 06:21:59 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#include <stdio.h> /* for sprintf() */
#include <sys/file.h>
#include "postgres.h"
#include "miscadmin.h" /* for DataDir */
#include "machine.h"
#include "storage/smgr.h" /* where the declarations go */
#include "storage/block.h"
#include "storage/fd.h"
#include "utils/mcxt.h"
#include "utils/rel.h"
#include "utils/elog.h"
#include "utils/palloc.h"
#include "catalog/catalog.h"
#undef DIAGNOSTIC
/*
* The magnetic disk storage manager keeps track of open file descriptors
* in its own descriptor pool. This happens for two reasons. First, at
* transaction boundaries, we walk the list of descriptors and flush
* anything that we've dirtied in the current transaction. Second, we
* have to support relations of > 4GBytes. In order to do this, we break
* relations up into chunks of < 2GBytes and store one chunk in each of
* several files that represent the relation.
*/
typedef struct _MdfdVec {
int mdfd_vfd; /* fd number in vfd pool */
uint16 mdfd_flags; /* clean, dirty */
int mdfd_lstbcnt; /* most recent block count */
struct _MdfdVec *mdfd_chain; /* for large relations */
} MdfdVec;
static int Nfds = 100;
static MdfdVec *Md_fdvec = (MdfdVec *) NULL;
static int CurFd = 0;
static MemoryContext MdCxt;
#define MDFD_DIRTY (uint16) 0x01
#define RELSEG_SIZE 262144 /* (2 ** 31) / 8192 -- 2GB file */
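/*
 * Illustrative sketch (not part of the original file): the segment
 * arithmetic the routines below repeat.  A block's segment file and
 * its byte offset within that file both fall out of RELSEG_SIZE.  The
 * function name is hypothetical.
 */
#ifdef EXAMPLE_ONLY
static void
example_segment_math(int blkno, int *segno, long *seekpos)
{
	*segno = blkno / RELSEG_SIZE;			/* which segment file */
	*seekpos = (long) (BLCKSZ * (blkno % RELSEG_SIZE)); /* offset in it */
}
#endif /* EXAMPLE_ONLY */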
/* routines declared here */
static MdfdVec *_mdfd_openseg(Relation reln, int segno, int oflags);
static MdfdVec *_mdfd_getseg(Relation reln, int blkno, int oflag);
static int _fdvec_ext(void);
static BlockNumber _mdnblocks(File file, Size blcksz);
/*
* mdinit() -- Initialize private state for magnetic disk storage manager.
*
* We keep a private table of all file descriptors. Whenever we do
* a write to one, we mark it dirty in our table. Whenever we force
* changes to disk, we mark the file descriptor clean. At transaction
* commit, we force changes to disk for all dirty file descriptors.
* This routine allocates and initializes the table.
*
* Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
*/
int
mdinit()
{
MemoryContext oldcxt;
MdCxt = (MemoryContext) CreateGlobalMemory("MdSmgr");
if (MdCxt == (MemoryContext) NULL)
return (SM_FAIL);
oldcxt = MemoryContextSwitchTo(MdCxt);
Md_fdvec = (MdfdVec *) palloc(Nfds * sizeof(MdfdVec));
(void) MemoryContextSwitchTo(oldcxt);
if (Md_fdvec == (MdfdVec *) NULL)
return (SM_FAIL);
memset(Md_fdvec, 0, Nfds * sizeof(MdfdVec));
return (SM_SUCCESS);
}
int
mdcreate(Relation reln)
{
int fd, vfd;
int tmp;
char *path;
extern bool IsBootstrapProcessingMode();
path = relpath(&(reln->rd_rel->relname.data[0]));
fd = FileNameOpenFile(path, O_RDWR|O_CREAT|O_EXCL, 0600);
/*
* If the file already exists and is empty, we pretend that the
* create succeeded. During bootstrap processing, we skip that check,
* because pg_time, pg_variable, and pg_log get created before their
* .bki file entries are processed.
*/
if (fd < 0) {
if ((fd = FileNameOpenFile(path, O_RDWR, 0600)) >= 0) {
if (!IsBootstrapProcessingMode() &&
FileRead(fd, (char *) &tmp, sizeof(tmp)) != 0) {
FileClose(fd);
return (-1);
}
}
}
if (CurFd >= Nfds) {
if (_fdvec_ext() == SM_FAIL)
return (-1);
}
Md_fdvec[CurFd].mdfd_vfd = fd;
Md_fdvec[CurFd].mdfd_flags = (uint16) 0;
Md_fdvec[CurFd].mdfd_chain = (MdfdVec *) NULL;
Md_fdvec[CurFd].mdfd_lstbcnt = 0;
vfd = CurFd++;
return (vfd);
}
/*
* mdunlink() -- Unlink a relation.
*/
int
mdunlink(Relation reln)
{
int fd;
int i;
MdfdVec *v, *ov;
MemoryContext oldcxt;
char fname[20]; /* XXX should have NAMESIZE defined */
char tname[20];
/* On Windows NT you can't unlink a file if it is open so we have
** to do this.
*/
#ifdef WIN32
(void) mdclose(reln);
#endif /* WIN32 */
memset(fname,0,20);
strncpy(fname, RelationGetRelationName(reln)->data, 16);
if (FileNameUnlink(fname) < 0)
return (SM_FAIL);
/* unlink all the overflow files for large relations */
for (i = 1; ; i++) {
#ifdef WIN32
(void) mdclose(reln);
#endif /* WIN32 */
sprintf(tname, "%s.%d", fname, i);
if (FileNameUnlink(tname) < 0)
break;
}
/* finally, clean out the mdfd vector */
fd = RelationGetFile(reln);
Md_fdvec[fd].mdfd_flags = (uint16) 0;
oldcxt = MemoryContextSwitchTo(MdCxt);
for (v = &Md_fdvec[fd]; v != (MdfdVec *) NULL; ) {
ov = v;
v = v->mdfd_chain;
if (ov != &Md_fdvec[fd])
pfree(ov);
}
Md_fdvec[fd].mdfd_chain = (MdfdVec *) NULL;
(void) MemoryContextSwitchTo(oldcxt);
return (SM_SUCCESS);
}
/*
* mdextend() -- Add a block to the specified relation.
*
* This routine returns SM_FAIL or SM_SUCCESS, with errno set as
* appropriate.
*/
int
mdextend(Relation reln, char *buffer)
{
long pos;
int nblocks;
MdfdVec *v;
nblocks = mdnblocks(reln);
v = _mdfd_getseg(reln, nblocks, O_CREAT);
if ((pos = FileSeek(v->mdfd_vfd, 0L, SEEK_END)) < 0)
return (SM_FAIL);
if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ)
return (SM_FAIL);
/* remember that we did a write, so we can sync at xact commit */
v->mdfd_flags |= MDFD_DIRTY;
/* try to keep the last block count current, though it's just a hint */
if ((v->mdfd_lstbcnt = (++nblocks % RELSEG_SIZE)) == 0)
v->mdfd_lstbcnt = RELSEG_SIZE;
#ifdef DIAGNOSTIC
if (_mdnblocks(v->mdfd_vfd, BLCKSZ) > RELSEG_SIZE
|| v->mdfd_lstbcnt > RELSEG_SIZE)
elog(FATAL, "segment too big!");
#endif
return (SM_SUCCESS);
}
/*
* mdopen() -- Open the specified relation.
*/
int
mdopen(Relation reln)
{
char *path;
int fd;
int vfd;
if (CurFd >= Nfds) {
if (_fdvec_ext() == SM_FAIL)
return (-1);
}
path = relpath(&(reln->rd_rel->relname.data[0]));
fd = FileNameOpenFile(path, O_RDWR, 0600);
/* this should only happen during bootstrap processing */
if (fd < 0)
fd = FileNameOpenFile(path, O_RDWR|O_CREAT|O_EXCL, 0600);
Md_fdvec[CurFd].mdfd_vfd = fd;
Md_fdvec[CurFd].mdfd_flags = (uint16) 0;
Md_fdvec[CurFd].mdfd_chain = (MdfdVec *) NULL;
Md_fdvec[CurFd].mdfd_lstbcnt = _mdnblocks(fd, BLCKSZ);
#ifdef DIAGNOSTIC
if (Md_fdvec[CurFd].mdfd_lstbcnt > RELSEG_SIZE)
elog(FATAL, "segment too big on relopen!");
#endif
vfd = CurFd++;
return (vfd);
}
/*
* mdclose() -- Close the specified relation.
*
* Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
*/
int
mdclose(Relation reln)
{
int fd;
MdfdVec *v;
fd = RelationGetFile(reln);
for (v = &Md_fdvec[fd]; v != (MdfdVec *) NULL; v = v->mdfd_chain) {
/* may be closed already */
if (v->mdfd_vfd < 0)
continue;
/*
* We sync the file descriptor so that we don't need to reopen it at
* transaction commit to force changes to disk.
*/
FileSync(v->mdfd_vfd);
FileClose(v->mdfd_vfd);
/* mark this file descriptor as clean in our private table */
v->mdfd_flags &= ~MDFD_DIRTY;
}
return (SM_SUCCESS);
}
/*
* mdread() -- Read the specified block from a relation.
*
* Returns SM_SUCCESS or SM_FAIL.
*/
int
mdread(Relation reln, BlockNumber blocknum, char *buffer)
{
int status;
long seekpos;
int nbytes;
MdfdVec *v;
v = _mdfd_getseg(reln, blocknum, 0);
seekpos = (long) (BLCKSZ * (blocknum % RELSEG_SIZE));
#ifdef DIAGNOSTIC
if (seekpos >= BLCKSZ * RELSEG_SIZE)
elog(FATAL, "seekpos too big!");
#endif
if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos) {
return (SM_FAIL);
}
status = SM_SUCCESS;
if ((nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ) {
if (nbytes == 0) {
memset(buffer, 0, BLCKSZ);
} else {
status = SM_FAIL;
}
}
return (status);
}
/*
* mdwrite() -- Write the supplied block at the appropriate location.
*
* Returns SM_SUCCESS or SM_FAIL.
*/
int
mdwrite(Relation reln, BlockNumber blocknum, char *buffer)
{
int status;
long seekpos;
MdfdVec *v;
v = _mdfd_getseg(reln, blocknum, 0);
seekpos = (long) (BLCKSZ * (blocknum % RELSEG_SIZE));
#ifdef DIAGNOSTIC
if (seekpos >= BLCKSZ * RELSEG_SIZE)
elog(FATAL, "seekpos too big!");
#endif
if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos) {
return (SM_FAIL);
}
status = SM_SUCCESS;
if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ)
status = SM_FAIL;
v->mdfd_flags |= MDFD_DIRTY;
return (status);
}
/*
* mdflush() -- Synchronously write a block to disk.
*
* This is exactly like mdwrite(), but doesn't return until the file
* system buffer cache has been flushed.
*/
int
mdflush(Relation reln, BlockNumber blocknum, char *buffer)
{
int status;
long seekpos;
MdfdVec *v;
v = _mdfd_getseg(reln, blocknum, 0);
seekpos = (long) (BLCKSZ * (blocknum % RELSEG_SIZE));
#ifdef DIAGNOSTIC
if (seekpos >= BLCKSZ * RELSEG_SIZE)
elog(FATAL, "seekpos too big!");
#endif
if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos) {
return (SM_FAIL);
}
/* write and sync the block */
status = SM_SUCCESS;
if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ
|| FileSync(v->mdfd_vfd) < 0)
status = SM_FAIL;
/*
* By here, the block is written and changes have been forced to stable
* storage. Mark the descriptor as clean until the next write, so we
* don't sync it again unnecessarily at transaction commit.
*/
v->mdfd_flags &= ~MDFD_DIRTY;
return (status);
}
/*
* mdblindwrt() -- Write a block to disk blind.
*
* We have to be able to do this using only the name and OID of
* the database and relation in which the block belongs. This
* is a synchronous write.
*/
int
mdblindwrt(char *dbstr,
char *relstr,
Oid dbid,
Oid relid,
BlockNumber blkno,
char *buffer)
{
int fd;
int segno;
long seekpos;
int status;
char *path;
int nchars;
/* be sure we have enough space for the '.segno', if any */
segno = blkno / RELSEG_SIZE;
if (segno > 0)
nchars = 10;
else
nchars = 0;
/* construct the path to the file and open it */
if (dbid == (Oid) 0) {
path = (char *) palloc(strlen(DataDir) + sizeof(NameData) + 2 + nchars);
if (segno == 0)
sprintf(path, "%s/%.*s", DataDir, NAMEDATALEN, relstr);
else
sprintf(path, "%s/%.*s.%d", DataDir, NAMEDATALEN, relstr, segno);
} else {
path = (char *) palloc(strlen(DataDir) + strlen("/base/") + 2 * sizeof(NameData) + 2 + nchars);
if (segno == 0)
sprintf(path, "%s/base/%.*s/%.*s", DataDir, NAMEDATALEN,
dbstr, NAMEDATALEN, relstr);
else
sprintf(path, "%s/base/%.*s/%.*s.%d", DataDir, NAMEDATALEN, dbstr,
NAMEDATALEN, relstr, segno);
}
if ((fd = open(path, O_RDWR, 0600)) < 0)
return (SM_FAIL);
/* seek to the right spot */
seekpos = (long) (BLCKSZ * (blkno % RELSEG_SIZE));
if (lseek(fd, seekpos, SEEK_SET) != seekpos) {
(void) close(fd);
return (SM_FAIL);
}
status = SM_SUCCESS;
/* write and sync the block */
if (write(fd, buffer, BLCKSZ) != BLCKSZ || fsync(fd) < 0)
status = SM_FAIL;
if (close(fd) < 0)
status = SM_FAIL;
pfree(path);
return (status);
}
/*
* mdnblocks() -- Get the number of blocks stored in a relation.
*
* Returns # of blocks or -1 on error.
*/
int
mdnblocks(Relation reln)
{
int fd;
MdfdVec *v;
int nblocks;
int segno;
fd = RelationGetFile(reln);
v = &Md_fdvec[fd];
#ifdef DIAGNOSTIC
if (_mdnblocks(v->mdfd_vfd, BLCKSZ) > RELSEG_SIZE)
elog(FATAL, "segment too big in getseg!");
#endif
segno = 0;
for (;;) {
if (v->mdfd_lstbcnt == RELSEG_SIZE
|| (nblocks = _mdnblocks(v->mdfd_vfd, BLCKSZ)) == RELSEG_SIZE) {
v->mdfd_lstbcnt = RELSEG_SIZE;
segno++;
if (v->mdfd_chain == (MdfdVec *) NULL) {
v->mdfd_chain = _mdfd_openseg(reln, segno, O_CREAT);
if (v->mdfd_chain == (MdfdVec *) NULL)
elog(WARN, "cannot count blocks for %.16s -- open failed",
RelationGetRelationName(reln));
}
v = v->mdfd_chain;
} else {
return ((segno * RELSEG_SIZE) + nblocks);
}
}
}
/*
* mdcommit() -- Commit a transaction.
*
* All changes to magnetic disk relations must be forced to stable
* storage. This routine makes a pass over the private table of
* file descriptors. Any descriptors to which we have done writes,
* but not synced, are synced here.
*
* Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
*/
int
mdcommit()
{
int i;
MdfdVec *v;
for (i = 0; i < CurFd; i++) {
for (v = &Md_fdvec[i]; v != (MdfdVec *) NULL; v = v->mdfd_chain) {
if (v->mdfd_flags & MDFD_DIRTY) {
if (FileSync(v->mdfd_vfd) < 0)
return (SM_FAIL);
v->mdfd_flags &= ~MDFD_DIRTY;
}
}
}
return (SM_SUCCESS);
}
/*
* mdabort() -- Abort a transaction.
*
* Changes need not be forced to disk at transaction abort. We mark
* all file descriptors as clean here. Always returns SM_SUCCESS.
*/
int
mdabort()
{
int i;
MdfdVec *v;
for (i = 0; i < CurFd; i++) {
for (v = &Md_fdvec[i]; v != (MdfdVec *) NULL; v = v->mdfd_chain) {
v->mdfd_flags &= ~MDFD_DIRTY;
}
}
return (SM_SUCCESS);
}
/*
* _fdvec_ext() -- Extend the md file descriptor vector.
*
* The file descriptor vector must be large enough to hold at least
* 'fd' entries.
*/
static
int _fdvec_ext()
{
MdfdVec *nvec;
MemoryContext oldcxt;
Nfds *= 2;
oldcxt = MemoryContextSwitchTo(MdCxt);
nvec = (MdfdVec *) palloc(Nfds * sizeof(MdfdVec));
memset(nvec, 0, Nfds * sizeof(MdfdVec));
memmove(nvec, (char *) Md_fdvec, (Nfds / 2) * sizeof(MdfdVec));
pfree(Md_fdvec);
(void) MemoryContextSwitchTo(oldcxt);
Md_fdvec = nvec;
return (SM_SUCCESS);
}
static MdfdVec *
_mdfd_openseg(Relation reln, int segno, int oflags)
{
MemoryContext oldcxt;
MdfdVec *v;
int fd;
bool dofree;
char *path, *fullpath;
/* be sure we have enough space for the '.segno', if any */
path = relpath(RelationGetRelationName(reln)->data);
dofree = false;
if (segno > 0) {
dofree = true;
fullpath = (char *) palloc(strlen(path) + 12);
sprintf(fullpath, "%s.%d", path, segno);
} else
fullpath = path;
/* open the file */
fd = PathNameOpenFile(fullpath, O_RDWR|oflags, 0600);
if (dofree)
pfree(fullpath);
if (fd < 0)
return ((MdfdVec *) NULL);
/* allocate an mdfdvec entry for it */
oldcxt = MemoryContextSwitchTo(MdCxt);
v = (MdfdVec *) palloc(sizeof(MdfdVec));
(void) MemoryContextSwitchTo(oldcxt);
/* fill the entry */
v->mdfd_vfd = fd;
v->mdfd_flags = (uint16) 0;
v->mdfd_chain = (MdfdVec *) NULL;
v->mdfd_lstbcnt = _mdnblocks(fd, BLCKSZ);
#ifdef DIAGNOSTIC
if (v->mdfd_lstbcnt > RELSEG_SIZE)
elog(FATAL, "segment too big on open!");
#endif
/* all done */
return (v);
}
static MdfdVec *
_mdfd_getseg(Relation reln, int blkno, int oflag)
{
MdfdVec *v;
int segno;
int fd;
int i;
fd = RelationGetFile(reln);
if (fd < 0) {
if ((fd = mdopen(reln)) < 0)
elog(WARN, "cannot open relation %.16s",
RelationGetRelationName(reln));
reln->rd_fd = fd;
}
for (v = &Md_fdvec[fd], segno = blkno / RELSEG_SIZE, i = 1;
segno > 0;
i++, segno--) {
if (v->mdfd_chain == (MdfdVec *) NULL) {
v->mdfd_chain = _mdfd_openseg(reln, i, oflag);
if (v->mdfd_chain == (MdfdVec *) NULL)
elog(WARN, "cannot open segment %d of relation %.16s",
i, RelationGetRelationName(reln));
}
v = v->mdfd_chain;
}
return (v);
}
static BlockNumber
_mdnblocks(File file, Size blcksz)
{
long len;
	/* "len" is the offset of the last byte, or -1 if the file is empty */
	len = FileSeek(file, 0L, SEEK_END) - 1;
	return ((BlockNumber) ((len < 0) ? 0 : 1 + len / blcksz));
}

586
src/backend/storage/smgr/mm.c Normal file
@@ -0,0 +1,586 @@
/*-------------------------------------------------------------------------
*
* mm.c--
* main memory storage manager
*
* This code manages relations that reside in (presumably stable)
* main memory.
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/smgr/Attic/mm.c,v 1.1.1.1 1996/07/09 06:21:59 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#ifdef MAIN_MEMORY
#include <math.h>
#include "machine.h"
#include "storage/ipc.h"
#include "storage/smgr.h" /* where the declarations go */
#include "storage/block.h"
#include "storage/shmem.h"
#include "storage/spin.h"
#include "utils/hsearch.h"
#include "utils/rel.h"
#include "utils/elog.h"
#include "utils/memutils.h"
/*
* MMCacheTag -- Unique triplet for blocks stored by the main memory
* storage manager.
*/
typedef struct MMCacheTag {
Oid mmct_dbid;
Oid mmct_relid;
BlockNumber mmct_blkno;
} MMCacheTag;
/*
* Shared-memory hash table for main memory relations contains
* entries of this form.
*/
typedef struct MMHashEntry {
MMCacheTag mmhe_tag;
int mmhe_bufno;
} MMHashEntry;
/*
* MMRelTag -- Unique identifier for each relation that is stored in the
* main-memory storage manager.
*/
typedef struct MMRelTag {
    Oid mmrt_dbid;
    Oid mmrt_relid;
} MMRelTag;
/*
* Shared-memory hash table for # blocks in main memory relations contains
* entries of this form.
*/
typedef struct MMRelHashEntry {
    MMRelTag mmrhe_tag;
    int      mmrhe_nblocks;
} MMRelHashEntry;
#define MMNBUFFERS 10
#define MMNRELATIONS 2
SPINLOCK MMCacheLock;
extern bool IsPostmaster;
extern Oid MyDatabaseId;
static int *MMCurTop;
static int *MMCurRelno;
static MMCacheTag *MMBlockTags;
static char *MMBlockCache;
static HTAB *MMCacheHT;
static HTAB *MMRelCacheHT;
int
mminit()
{
    char *mmcacheblk;
    int mmsize = 0;
    bool found;
    HASHCTL info;

    SpinAcquire(MMCacheLock);

    mmsize += MAXALIGN(BLCKSZ * MMNBUFFERS);
    mmsize += MAXALIGN(sizeof(*MMCurTop));
    mmsize += MAXALIGN(sizeof(*MMCurRelno));
    mmsize += MAXALIGN((MMNBUFFERS * sizeof(MMCacheTag)));
    mmcacheblk = (char *) ShmemInitStruct("Main memory smgr", mmsize, &found);

    if (mmcacheblk == (char *) NULL) {
        SpinRelease(MMCacheLock);
        return (SM_FAIL);
    }

    info.keysize = sizeof(MMCacheTag);
    info.datasize = sizeof(int);
    info.hash = tag_hash;

    MMCacheHT = (HTAB *) ShmemInitHash("Main memory store HT",
                                       MMNBUFFERS, MMNBUFFERS,
                                       &info, (HASH_ELEM|HASH_FUNCTION));

    if (MMCacheHT == (HTAB *) NULL) {
        SpinRelease(MMCacheLock);
        return (SM_FAIL);
    }

    info.keysize = sizeof(MMRelTag);
    info.datasize = sizeof(int);
    info.hash = tag_hash;

    MMRelCacheHT = (HTAB *) ShmemInitHash("Main memory rel HT",
                                          MMNRELATIONS, MMNRELATIONS,
                                          &info, (HASH_ELEM|HASH_FUNCTION));

    if (MMRelCacheHT == (HTAB *) NULL) {
        SpinRelease(MMCacheLock);
        return (SM_FAIL);
    }

    if (IsPostmaster) {
        memset(mmcacheblk, 0, mmsize);
        SpinRelease(MMCacheLock);
        return (SM_SUCCESS);
    }

    SpinRelease(MMCacheLock);

    MMCurTop = (int *) mmcacheblk;
    mmcacheblk += sizeof(int);
    MMCurRelno = (int *) mmcacheblk;
    mmcacheblk += sizeof(int);
    MMBlockTags = (MMCacheTag *) mmcacheblk;
    mmcacheblk += (MMNBUFFERS * sizeof(MMCacheTag));
    MMBlockCache = mmcacheblk;

    return (SM_SUCCESS);
}
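
/*
 * Editor's sketch, not part of the original file: how mminit() carves
 * up the shared block in a backend (the postmaster only zeroes it):
 *
 *	mmcacheblk
 *	|
 *	v
 *	+----------+------------+--------------------+----------------------+
 *	| MMCurTop | MMCurRelno | MMBlockTags:       | MMBlockCache:        |
 *	| (int)    | (int)      | MMNBUFFERS tags    | MMNBUFFERS * BLCKSZ  |
 *	+----------+------------+--------------------+----------------------+
 */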
int
mmshutdown()
{
    return (SM_SUCCESS);
}
int
mmcreate(Relation reln)
{
    MMRelHashEntry *entry;
    bool found;
    MMRelTag tag;

    SpinAcquire(MMCacheLock);

    if (*MMCurRelno == MMNRELATIONS) {
        SpinRelease(MMCacheLock);
        return (SM_FAIL);
    }

    (*MMCurRelno)++;

    tag.mmrt_relid = reln->rd_id;
    if (reln->rd_rel->relisshared)
        tag.mmrt_dbid = (Oid) 0;
    else
        tag.mmrt_dbid = MyDatabaseId;

    entry = (MMRelHashEntry *) hash_search(MMRelCacheHT,
                                           (char *) &tag, HASH_ENTER, &found);

    if (entry == (MMRelHashEntry *) NULL) {
        SpinRelease(MMCacheLock);
        elog(FATAL, "main memory storage mgr rel cache hash table corrupt");
    }

    if (found) {
        /* already exists; undo the relation-count bump above */
        (*MMCurRelno)--;
        SpinRelease(MMCacheLock);
        return (SM_FAIL);
    }

    entry->mmrhe_nblocks = 0;

    SpinRelease(MMCacheLock);

    return (SM_SUCCESS);
}
/*
* mmunlink() -- Unlink a relation.
*/
int
mmunlink(Relation reln)
{
    int i;
    Oid reldbid;
    MMHashEntry *entry;
    MMRelHashEntry *rentry;
    bool found;
    MMRelTag rtag;

    if (reln->rd_rel->relisshared)
        reldbid = (Oid) 0;
    else
        reldbid = MyDatabaseId;

    SpinAcquire(MMCacheLock);

    for (i = 0; i < MMNBUFFERS; i++) {
        if (MMBlockTags[i].mmct_dbid == reldbid
            && MMBlockTags[i].mmct_relid == reln->rd_id) {

            entry = (MMHashEntry *) hash_search(MMCacheHT,
                                                (char *) &MMBlockTags[i],
                                                HASH_REMOVE, &found);
            if (entry == (MMHashEntry *) NULL || !found) {
                SpinRelease(MMCacheLock);
                elog(FATAL, "mmunlink: cache hash table corrupted");
            }
            MMBlockTags[i].mmct_dbid = (Oid) 0;
            MMBlockTags[i].mmct_relid = (Oid) 0;
            MMBlockTags[i].mmct_blkno = (BlockNumber) 0;
        }
    }

    rtag.mmrt_dbid = reldbid;
    rtag.mmrt_relid = reln->rd_id;

    rentry = (MMRelHashEntry *) hash_search(MMRelCacheHT, (char *) &rtag,
                                            HASH_REMOVE, &found);

    if (rentry == (MMRelHashEntry *) NULL || !found) {
        SpinRelease(MMCacheLock);
        elog(FATAL, "mmunlink: rel cache hash table corrupted");
    }

    (*MMCurRelno)--;

    SpinRelease(MMCacheLock);

    return (SM_SUCCESS);
}
/*
* mmextend() -- Add a block to the specified relation.
*
* This routine returns SM_FAIL or SM_SUCCESS, with errno set as
* appropriate.
*/
int
mmextend(Relation reln, char *buffer)
{
    MMRelHashEntry *rentry;
    MMHashEntry *entry;
    int i;
    Oid reldbid;
    int offset;
    bool found;
    MMRelTag rtag;
    MMCacheTag tag;

    if (reln->rd_rel->relisshared)
        reldbid = (Oid) 0;
    else
        reldbid = MyDatabaseId;

    tag.mmct_dbid = rtag.mmrt_dbid = reldbid;
    tag.mmct_relid = rtag.mmrt_relid = reln->rd_id;

    SpinAcquire(MMCacheLock);

    if (*MMCurTop == MMNBUFFERS) {
        for (i = 0; i < MMNBUFFERS; i++) {
            if (MMBlockTags[i].mmct_dbid == 0 &&
                MMBlockTags[i].mmct_relid == 0)
                break;
        }
        if (i == MMNBUFFERS) {
            SpinRelease(MMCacheLock);
            return (SM_FAIL);
        }
    } else {
        i = *MMCurTop;
        (*MMCurTop)++;
    }

    rentry = (MMRelHashEntry *) hash_search(MMRelCacheHT, (char *) &rtag,
                                            HASH_FIND, &found);
    if (rentry == (MMRelHashEntry *) NULL || !found) {
        SpinRelease(MMCacheLock);
        elog(FATAL, "mmextend: rel cache hash table corrupt");
    }

    tag.mmct_blkno = rentry->mmrhe_nblocks;

    entry = (MMHashEntry *) hash_search(MMCacheHT, (char *) &tag,
                                        HASH_ENTER, &found);
    if (entry == (MMHashEntry *) NULL || found) {
        SpinRelease(MMCacheLock);
        elog(FATAL, "mmextend: cache hash table corrupt");
    }

    entry->mmhe_bufno = i;
    MMBlockTags[i].mmct_dbid = reldbid;
    MMBlockTags[i].mmct_relid = reln->rd_id;
    MMBlockTags[i].mmct_blkno = rentry->mmrhe_nblocks;

    /* page numbers are zero-based, so we increment this at the end */
    (rentry->mmrhe_nblocks)++;

    /* write the extended page */
    offset = (i * BLCKSZ);
    memmove(&(MMBlockCache[offset]), buffer, BLCKSZ);

    SpinRelease(MMCacheLock);

    return (SM_SUCCESS);
}
/*
* mmopen() -- Open the specified relation.
*/
int
mmopen(Relation reln)
{
    /* automatically successful */
    return (0);
}
/*
* mmclose() -- Close the specified relation.
*
* Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
*/
int
mmclose(Relation reln)
{
    /* automatically successful */
    return (SM_SUCCESS);
}
/*
* mmread() -- Read the specified block from a relation.
*
* Returns SM_SUCCESS or SM_FAIL.
*/
int
mmread(Relation reln, BlockNumber blocknum, char *buffer)
{
    MMHashEntry *entry;
    bool found;
    int offset;
    MMCacheTag tag;

    if (reln->rd_rel->relisshared)
        tag.mmct_dbid = (Oid) 0;
    else
        tag.mmct_dbid = MyDatabaseId;

    tag.mmct_relid = reln->rd_id;
    tag.mmct_blkno = blocknum;

    SpinAcquire(MMCacheLock);

    entry = (MMHashEntry *) hash_search(MMCacheHT, (char *) &tag,
                                        HASH_FIND, &found);

    if (entry == (MMHashEntry *) NULL) {
        SpinRelease(MMCacheLock);
        elog(FATAL, "mmread: hash table corrupt");
    }

    if (!found) {
        /* reading nonexistent pages is defined to fill them with zeroes */
        SpinRelease(MMCacheLock);
        memset(buffer, 0, BLCKSZ);
        return (SM_SUCCESS);
    }

    offset = (entry->mmhe_bufno * BLCKSZ);
    memmove(buffer, &MMBlockCache[offset], BLCKSZ);

    SpinRelease(MMCacheLock);

    return (SM_SUCCESS);
}
/*
* mmwrite() -- Write the supplied block at the appropriate location.
*
* Returns SM_SUCCESS or SM_FAIL.
*/
int
mmwrite(Relation reln, BlockNumber blocknum, char *buffer)
{
    MMHashEntry *entry;
    bool found;
    int offset;
    MMCacheTag tag;

    if (reln->rd_rel->relisshared)
        tag.mmct_dbid = (Oid) 0;
    else
        tag.mmct_dbid = MyDatabaseId;

    tag.mmct_relid = reln->rd_id;
    tag.mmct_blkno = blocknum;

    SpinAcquire(MMCacheLock);

    entry = (MMHashEntry *) hash_search(MMCacheHT, (char *) &tag,
                                        HASH_FIND, &found);

    if (entry == (MMHashEntry *) NULL) {
        SpinRelease(MMCacheLock);
        elog(FATAL, "mmwrite: hash table corrupt");
    }

    if (!found) {
        SpinRelease(MMCacheLock);
        elog(FATAL, "mmwrite: hash table missing requested page");
    }

    offset = (entry->mmhe_bufno * BLCKSZ);
    memmove(&MMBlockCache[offset], buffer, BLCKSZ);

    SpinRelease(MMCacheLock);

    return (SM_SUCCESS);
}
/*
* mmflush() -- Synchronously write a block to stable storage.
*
* For main-memory relations, this is exactly equivalent to mmwrite().
*/
int
mmflush(Relation reln, BlockNumber blocknum, char *buffer)
{
    return (mmwrite(reln, blocknum, buffer));
}
/*
* mmblindwrt() -- Write a block to stable storage blind.
*
* We have to be able to do this using only the name and OID of
* the database and relation in which the block belongs.
*/
int
mmblindwrt(char *dbstr,
           char *relstr,
           Oid dbid,
           Oid relid,
           BlockNumber blkno,
           char *buffer)
{
    return (SM_FAIL);
}
/*
* mmnblocks() -- Get the number of blocks stored in a relation.
*
* Returns # of blocks or -1 on error.
*/
int
mmnblocks(Relation reln)
{
    MMRelTag rtag;
    MMRelHashEntry *rentry;
    bool found;
    int nblocks;

    if (reln->rd_rel->relisshared)
        rtag.mmrt_dbid = (Oid) 0;
    else
        rtag.mmrt_dbid = MyDatabaseId;

    rtag.mmrt_relid = reln->rd_id;

    SpinAcquire(MMCacheLock);

    rentry = (MMRelHashEntry *) hash_search(MMRelCacheHT, (char *) &rtag,
                                            HASH_FIND, &found);

    if (rentry == (MMRelHashEntry *) NULL) {
        SpinRelease(MMCacheLock);
        elog(FATAL, "mmnblocks: rel cache hash table corrupt");
    }

    if (found)
        nblocks = rentry->mmrhe_nblocks;
    else
        nblocks = -1;

    SpinRelease(MMCacheLock);

    return (nblocks);
}
/*
* mmcommit() -- Commit a transaction.
*
* Returns SM_SUCCESS or SM_FAIL with errno set as appropriate.
*/
int
mmcommit()
{
    return (SM_SUCCESS);
}
/*
* mmabort() -- Abort a transaction.
*/
int
mmabort()
{
    return (SM_SUCCESS);
}
/*
* MMShmemSize() -- Declare amount of shared memory we require.
*
* The shared memory initialization code creates a block of shared
* memory exactly big enough to hold all the structures it needs to.
* This routine declares how much space the main memory storage
* manager will use.
*/
int
MMShmemSize()
{
    int size = 0;
    int nbuckets;
    int nsegs;
    int tmp;

    /*
     * first compute space occupied by the (dbid,relid,blkno) hash table
     */

    nbuckets = 1 << (int) my_log2((MMNBUFFERS - 1) / DEF_FFACTOR + 1);
    nsegs = 1 << (int) my_log2((nbuckets - 1) / DEF_SEGSIZE + 1);

    size += MAXALIGN(my_log2(MMNBUFFERS) * sizeof(void *));
    size += MAXALIGN(sizeof(HHDR));
    size += nsegs * MAXALIGN(DEF_SEGSIZE * sizeof(SEGMENT));
    tmp = (int) ceil((double) MMNBUFFERS / BUCKET_ALLOC_INCR);
    size += tmp * BUCKET_ALLOC_INCR *
            (MAXALIGN(sizeof(BUCKET_INDEX)) +
             MAXALIGN(sizeof(MMHashEntry)));	/* contains hash key */

    /*
     * now do the same for the rel hash table
     */

    size += MAXALIGN(my_log2(MMNRELATIONS) * sizeof(void *));
    size += MAXALIGN(sizeof(HHDR));
    size += nsegs * MAXALIGN(DEF_SEGSIZE * sizeof(SEGMENT));
    tmp = (int) ceil((double) MMNRELATIONS / BUCKET_ALLOC_INCR);
    size += tmp * BUCKET_ALLOC_INCR *
            (MAXALIGN(sizeof(BUCKET_INDEX)) +
             MAXALIGN(sizeof(MMRelHashEntry)));	/* contains hash key */

    /*
     * finally, add in the memory block we use directly
     */

    size += MAXALIGN(BLCKSZ * MMNBUFFERS);
    size += MAXALIGN(sizeof(*MMCurTop));
    size += MAXALIGN(sizeof(*MMCurRelno));
    size += MAXALIGN(MMNBUFFERS * sizeof(MMCacheTag));

    return (size);
}
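
/*
 * Editor's sketch, not part of the original file: a typical call
 * sequence into this storage manager, dispatched through the smgr
 * switch (see smgr.c):
 *
 *	int2 mmid = smgrin("main memory");
 *
 *	smgrcreate(mmid, reln);				... register the relation
 *	smgrextend(mmid, reln, buffer);			... becomes block 0
 *	smgrread(mmid, reln, (BlockNumber) 0, buffer);	... reads it back
 *	smgrnblocks(mmid, reln);			... returns 1
 */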
#endif /* MAIN_MEMORY */

View File

@@ -0,0 +1,371 @@
/*-------------------------------------------------------------------------
*
* smgr.c--
* public interface routines to storage manager switch.
*
* All file system operations in POSTGRES dispatch through these
* routines.
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/smgr/smgr.c,v 1.1.1.1 1996/07/09 06:21:59 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#include <string.h>
#include "postgres.h"
#include "machine.h"
#include "storage/ipc.h"
#include "storage/smgr.h"
#include "storage/block.h"
#include "utils/rel.h"
#include "utils/elog.h"
#include "utils/palloc.h"
typedef struct f_smgr {
    int (*smgr_init)();		/* may be NULL */
    int (*smgr_shutdown)();	/* may be NULL */
    int (*smgr_create)();
    int (*smgr_unlink)();
    int (*smgr_extend)();
    int (*smgr_open)();
    int (*smgr_close)();
    int (*smgr_read)();
    int (*smgr_write)();
    int (*smgr_flush)();
    int (*smgr_blindwrt)();
    int (*smgr_nblocks)();
    int (*smgr_commit)();	/* may be NULL */
    int (*smgr_abort)();	/* may be NULL */
} f_smgr;
/*
* The weird placement of commas in this init block is to keep the compiler
* happy, regardless of what storage managers we have (or don't have).
*/
static f_smgr smgrsw[] = {

    /* magnetic disk */
    { mdinit, NULL, mdcreate, mdunlink, mdextend, mdopen, mdclose,
      mdread, mdwrite, mdflush, mdblindwrt, mdnblocks, mdcommit, mdabort },

#ifdef MAIN_MEMORY
    /* main memory */
    { mminit, mmshutdown, mmcreate, mmunlink, mmextend, mmopen, mmclose,
      mmread, mmwrite, mmflush, mmblindwrt, mmnblocks, mmcommit, mmabort },

#endif /* MAIN_MEMORY */
};
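
/*
 * Editor's sketch, not part of the original file: a hypothetical third
 * storage manager (say a "sony jukebox" guarded by SONY_JUKEBOX) would
 * add one row here, plus matching entries in smgrwo[] below and in
 * StorageManager[] in smgrtype.c:
 *
 *	#ifdef SONY_JUKEBOX
 *	{ sjinit, sjshutdown, sjcreate, sjunlink, sjextend, sjopen, sjclose,
 *	  sjread, sjwrite, sjflush, sjblindwrt, sjnblocks, sjcommit, sjabort },
 *	#endif
 */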
/*
* This array records which storage managers are write-once, and which
* support overwrite. A 'true' entry means that the storage manager is
* write-once. In the best of all possible worlds, there would be no
* write-once storage managers.
*/
static bool smgrwo[] = {
    false,		/* magnetic disk */
#ifdef MAIN_MEMORY
    false,		/* main memory */
#endif /* MAIN_MEMORY */
};
static int NSmgr = lengthof(smgrsw);
/*
* smgrinit(), smgrshutdown() -- Initialize or shut down all storage
* managers.
*
*/
int
smgrinit()
{
    int i;
    extern char *smgrout();

    for (i = 0; i < NSmgr; i++) {
        if (smgrsw[i].smgr_init) {
            if ((*(smgrsw[i].smgr_init))() == SM_FAIL)
                elog(FATAL, "initialization failed on %s", smgrout(i));
        }
    }

    /* register the shutdown proc */
    on_exitpg(smgrshutdown, 0);

    return (SM_SUCCESS);
}
void
smgrshutdown(int dummy)
{
    int i;
    extern char *smgrout();

    for (i = 0; i < NSmgr; i++) {
        if (smgrsw[i].smgr_shutdown) {
            if ((*(smgrsw[i].smgr_shutdown))() == SM_FAIL)
                elog(FATAL, "shutdown failed on %s", smgrout(i));
        }
    }
}
/*
* smgrcreate() -- Create a new relation.
*
* This routine takes a reldesc, creates the relation on the appropriate
* device, and returns a file descriptor for it.
*/
int
smgrcreate(int16 which, Relation reln)
{
    int fd;

    if ((fd = (*(smgrsw[which].smgr_create))(reln)) < 0)
        elog(WARN, "cannot create %.*s",
             NAMEDATALEN, &(reln->rd_rel->relname.data[0]));

    return (fd);
}
/*
* smgrunlink() -- Unlink a relation.
*
* The relation is removed from the store.
*/
int
smgrunlink(int16 which, Relation reln)
{
    int status;

    if ((status = (*(smgrsw[which].smgr_unlink))(reln)) == SM_FAIL)
        elog(WARN, "cannot unlink %.*s",
             NAMEDATALEN, &(reln->rd_rel->relname.data[0]));

    return (status);
}
/*
* smgrextend() -- Add a new block to a file.
*
* Returns SM_SUCCESS on success; aborts the current transaction on
* failure.
*/
int
smgrextend(int16 which, Relation reln, char *buffer)
{
    int status;

    status = (*(smgrsw[which].smgr_extend))(reln, buffer);

    if (status == SM_FAIL)
        elog(WARN, "%.*s: cannot extend",
             NAMEDATALEN, &(reln->rd_rel->relname.data[0]));

    return (status);
}
/*
* smgropen() -- Open a relation using a particular storage manager.
*
* Returns the fd for the open relation on success, aborts the
* transaction on failure.
*/
int
smgropen(int16 which, Relation reln)
{
    int fd;

    if ((fd = (*(smgrsw[which].smgr_open))(reln)) < 0)
        elog(WARN, "cannot open %.*s",
             NAMEDATALEN, &(reln->rd_rel->relname.data[0]));

    return (fd);
}
/*
* smgrclose() -- Close a relation.
*
* Returns SM_SUCCESS on success, aborts on failure.
*/
int
smgrclose(int16 which, Relation reln)
{
    if ((*(smgrsw[which].smgr_close))(reln) == SM_FAIL)
        elog(WARN, "cannot close %.*s",
             NAMEDATALEN, &(reln->rd_rel->relname.data[0]));

    return (SM_SUCCESS);
}
/*
* smgrread() -- read a particular block from a relation into the supplied
* buffer.
*
* This routine is called from the buffer manager in order to
* instantiate pages in the shared buffer cache. All storage managers
* return pages in the format that POSTGRES expects. This routine
* dispatches the read. On success, it returns SM_SUCCESS. On failure,
* the current transaction is aborted.
*/
int
smgrread(int16 which, Relation reln, BlockNumber blocknum, char *buffer)
{
    int status;

    status = (*(smgrsw[which].smgr_read))(reln, blocknum, buffer);

    if (status == SM_FAIL)
        elog(WARN, "cannot read block %d of %.*s",
             blocknum, NAMEDATALEN, &(reln->rd_rel->relname.data[0]));

    return (status);
}
/*
* smgrwrite() -- Write the supplied buffer out.
*
* This is not a synchronous write -- the interface for that is
* smgrflush(). The buffer is written out via the appropriate
* storage manager. This routine returns SM_SUCCESS or aborts
* the current transaction.
*/
int
smgrwrite(int16 which, Relation reln, BlockNumber blocknum, char *buffer)
{
    int status;

    status = (*(smgrsw[which].smgr_write))(reln, blocknum, buffer);

    if (status == SM_FAIL)
        elog(WARN, "cannot write block %d of %.*s",
             blocknum, NAMEDATALEN, &(reln->rd_rel->relname.data[0]));

    return (status);
}
/*
* smgrflush() -- A synchronous smgrwrite().
*/
int
smgrflush(int16 which, Relation reln, BlockNumber blocknum, char *buffer)
{
    int status;

    status = (*(smgrsw[which].smgr_flush))(reln, blocknum, buffer);

    if (status == SM_FAIL)
        elog(WARN, "cannot flush block %d of %.*s to stable store",
             blocknum, NAMEDATALEN, &(reln->rd_rel->relname.data[0]));

    return (status);
}
/*
* smgrblindwrt() -- Write a page out blind.
*
* In some cases, we may find a page in the buffer cache that we
* can't make a reldesc for. This happens, for example, when we
* want to reuse a dirty page that was written by a transaction
* that has not yet committed, which created a new relation. In
* this case, the buffer manager will call smgrblindwrt() with
* the name and OID of the database and the relation to which the
* buffer belongs. Every storage manager must be able to force
* this page down to stable storage in this circumstance.
*/
int
smgrblindwrt(int16 which,
             char *dbname,
             char *relname,
             Oid dbid,
             Oid relid,
             BlockNumber blkno,
             char *buffer)
{
    char *dbstr;
    char *relstr;
    int status;

    dbstr = pstrdup(dbname);
    relstr = pstrdup(relname);

    status = (*(smgrsw[which].smgr_blindwrt))(dbstr, relstr, dbid, relid,
                                              blkno, buffer);

    if (status == SM_FAIL)
        elog(WARN, "cannot write block %d of %s [%s] blind",
             blkno, relstr, dbstr);

    pfree(dbstr);
    pfree(relstr);

    return (status);
}
/*
* smgrnblocks() -- Calculate the number of POSTGRES blocks in the
* supplied relation.
*
* Returns the number of blocks on success, aborts the current
* transaction on failure.
*/
int
smgrnblocks(int16 which, Relation reln)
{
    int nblocks;

    if ((nblocks = (*(smgrsw[which].smgr_nblocks))(reln)) < 0)
        elog(WARN, "cannot count blocks for %.*s",
             NAMEDATALEN, &(reln->rd_rel->relname.data[0]));

    return (nblocks);
}
/*
* smgrcommit(), smgrabort() -- Commit or abort changes made during the
* current transaction.
*/
int
smgrcommit()
{
    int i;
    extern char *smgrout();

    for (i = 0; i < NSmgr; i++) {
        if (smgrsw[i].smgr_commit) {
            if ((*(smgrsw[i].smgr_commit))() == SM_FAIL)
                elog(FATAL, "transaction commit failed on %s", smgrout(i));
        }
    }

    return (SM_SUCCESS);
}

int
smgrabort()
{
    int i;
    extern char *smgrout();

    for (i = 0; i < NSmgr; i++) {
        if (smgrsw[i].smgr_abort) {
            if ((*(smgrsw[i].smgr_abort))() == SM_FAIL)
                elog(FATAL, "transaction abort failed on %s", smgrout(i));
        }
    }

    return (SM_SUCCESS);
}
bool
smgriswo(int16 smgrno)
{
    if (smgrno < 0 || smgrno >= NSmgr)
        elog(WARN, "illegal storage manager number %d", smgrno);

    return (smgrwo[smgrno]);
}

View File

@@ -0,0 +1,82 @@
/*-------------------------------------------------------------------------
*
* smgrtype.c--
* storage manager type
*
* Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/smgr/smgrtype.c,v 1.1.1.1 1996/07/09 06:21:59 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#include <string.h>
#include "postgres.h"
#include "utils/builtins.h" /* where the declarations go */
#include "utils/elog.h"
#include "utils/palloc.h"
#include "storage/smgr.h"
typedef struct smgrid {
    char *smgr_name;
} smgrid;
/*
* StorageManager[] -- List of defined storage managers.
*
* The weird comma placement is to keep compilers happy no matter
* which of these is (or is not) defined.
*/
static smgrid StorageManager[] = {
    {"magnetic disk"},
#ifdef MAIN_MEMORY
    {"main memory"}
#endif /* MAIN_MEMORY */
};
static int NStorageManagers = lengthof(StorageManager);
int2
smgrin(char *s)
{
    int i;

    for (i = 0; i < NStorageManagers; i++) {
        if (strcmp(s, StorageManager[i].smgr_name) == 0)
            return ((int2) i);
    }

    elog(WARN, "smgrin: illegal storage manager name %s", s);
    return 0;
}
char *
smgrout(int2 i)
{
    char *s;

    if (i >= NStorageManagers || i < 0)
        elog(WARN, "Illegal storage manager id %d", i);

    s = (char *) palloc(strlen(StorageManager[i].smgr_name) + 1);
    strcpy(s, StorageManager[i].smgr_name);
    return (s);
}
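
/*
 * Editor's sketch, not part of the original file: smgrin() and
 * smgrout() invert one another.  Given the table above:
 *
 *	int2 which;
 *	char *name;
 *
 *	which = smgrin("magnetic disk");	... returns (int2) 0
 *	name = smgrout(which);			... palloc'd copy, "magnetic disk"
 */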
bool
smgreq(int2 a, int2 b)
{
    if (a == b)
        return (true);
    return (false);
}

bool
smgrne(int2 a, int2 b)
{
    if (a == b)
        return (false);
    return (true);
}

View File

@@ -0,0 +1,38 @@
/*-------------------------------------------------------------------------
*
* spin.h--
* synchronization routines
*
*
* Copyright (c) 1994, Regents of the University of California
*
* $Id: spin.h,v 1.1.1.1 1996/07/09 06:21:53 scrappy Exp $
*
*-------------------------------------------------------------------------
*/
#ifndef SPIN_H
#define SPIN_H
#include "ipc.h"
/*
* two implementations of spin locks
*
* sequent, sparc, sun3: real spin locks. uses a TAS instruction; see
* src/storage/ipc/s_lock.c for details.
*
* default: fake spin locks using semaphores. see spin.c
*
*/
typedef int SPINLOCK;
extern bool CreateSpinlocks(IPCKey key);
extern bool AttachSpinLocks(IPCKey key);
extern bool InitSpinLocks(int init, IPCKey key);
extern void SpinAcquire(SPINLOCK lock);
extern void SpinRelease(SPINLOCK lock);
extern bool SpinIsLocked(SPINLOCK lock);
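
/*
 * Editor's sketch, not part of the original file: the usage pattern
 * followed throughout the storage managers.  Note that in mm.c every
 * error path between the acquire and the release does its own
 * SpinRelease() before calling elog():
 *
 *	SpinAcquire(MMCacheLock);
 *	... examine or update the shared structures ...
 *	SpinRelease(MMCacheLock);
 */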
#endif /* SPIN_H */