Implement new 'lightweight lock manager' that's intermediate between the existing lock manager and spinlocks: it understands exclusive vs shared lock but has few other fancy features. Replace most uses of spinlocks with lightweight locks. All remaining uses of spinlocks have very short lock hold times (a few dozen instructions), so tweak spinlock backoff code to work efficiently given this assumption. All per my proposal on pghackers 26-Sep-01.
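For orientation before the hunks below: the new API that replaces SpinAcquire/SpinRelease is a two-mode acquire/release pair. A minimal sketch of the calling convention, assembled from this diff (error handling elided):

    /* Old style: a spinlock that busy-waits on contention. */
    SpinAcquire(BufMgrLock);
    /* ... manipulate shared buffer-manager state ... */
    SpinRelease(BufMgrLock);

    /* New style: a lightweight lock that understands shared vs exclusive
     * and puts waiters to sleep instead of spinning.  LW_SHARED admits
     * concurrent holders; LW_EXCLUSIVE excludes everyone else. */
    LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
    /* ... manipulate shared buffer-manager state ... */
    LWLockRelease(BufMgrLock);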
src/backend/storage/buffer/README

@@ -1,4 +1,4 @@
-$Header: /cvsroot/pgsql/src/backend/storage/buffer/README,v 1.2 2001/08/25 18:52:42 tgl Exp $
+$Header: /cvsroot/pgsql/src/backend/storage/buffer/README,v 1.3 2001/09/29 04:02:22 tgl Exp $

Notes about shared buffer access rules
--------------------------------------

@@ -30,12 +30,10 @@ Buffer locks: there are two kinds of buffer locks, shared and exclusive,
which act just as you'd expect: multiple backends can hold shared locks on
the same buffer, but an exclusive lock prevents anyone else from holding
either shared or exclusive lock. (These can alternatively be called READ
-and WRITE locks.) These locks are short-term: they should not be held for
-long. They are implemented as per-buffer spinlocks, so another backend
-trying to acquire a competing lock will spin as long as you hold yours!
-Buffer locks are acquired and released by LockBuffer(). It will *not* work
-for a single backend to try to acquire multiple locks on the same buffer.
-One must pin a buffer before trying to lock it.
+and WRITE locks.) These locks are intended to be short-term: they should not
+be held for long. Buffer locks are acquired and released by LockBuffer().
+It will *not* work for a single backend to try to acquire multiple locks on
+the same buffer. One must pin a buffer before trying to lock it.

Buffer access rules:

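A sketch of the calling sequence these rules imply, using functions that appear elsewhere in this commit (ReadBuffer, LockBuffer, ReleaseBuffer); reln and blockNum are placeholders:

    Buffer buf = ReadBuffer(reln, blockNum);  /* pins the buffer */
    LockBuffer(buf, BUFFER_LOCK_SHARE);       /* short-term read lock */
    /* ... examine the page while it cannot change under us ... */
    LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    ReleaseBuffer(buf);                       /* drop the pin */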
src/backend/storage/buffer/buf_init.c

@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
-* $Header: /cvsroot/pgsql/src/backend/storage/buffer/buf_init.c,v 1.43 2001/07/06 21:04:25 tgl Exp $
+* $Header: /cvsroot/pgsql/src/backend/storage/buffer/buf_init.c,v 1.44 2001/09/29 04:02:22 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -28,10 +28,9 @@
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
-#include "storage/s_lock.h"
#include "storage/shmem.h"
#include "storage/smgr.h"
-#include "storage/spin.h"
+#include "storage/lwlock.h"
#include "utils/builtins.h"
#include "utils/hsearch.h"
#include "utils/memutils.h"
@@ -117,8 +116,6 @@ bool *BufferDirtiedByMe; /* T if buf has been dirtied in cur xact */
*
*/

-SPINLOCK BufMgrLock;
-
long int ReadBufferCount;
long int ReadLocalBufferCount;
long int BufferHitCount;
@@ -151,7 +148,7 @@ InitBufferPool(void)
* anyone else attached to the shmem at this point, we've got
* problems.
*/
-SpinAcquire(BufMgrLock);
+LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);

#ifdef BMTRACE
CurTraceBuf = (long *) ShmemInitStruct("Buffer trace",
@@ -186,8 +183,8 @@ InitBufferPool(void)

/*
* link the buffers into a circular, doubly-linked list to
-* initialize free list. Still don't know anything about
-* replacement strategy in this file.
+* initialize free list, and initialize the buffer headers.
+* Still don't know anything about replacement strategy in this file.
*/
for (i = 0; i < Data_Descriptors; block += BLCKSZ, buf++, i++)
{
@@ -197,12 +194,15 @@ InitBufferPool(void)
buf->freePrev = i - 1;

CLEAR_BUFFERTAG(&(buf->tag));
+buf->buf_id = i;
+
buf->data = MAKE_OFFSET(block);
buf->flags = (BM_DELETED | BM_FREE | BM_VALID);
buf->refcount = 0;
-buf->buf_id = i;
-S_INIT_LOCK(&(buf->io_in_progress_lock));
-S_INIT_LOCK(&(buf->cntx_lock));
+buf->io_in_progress_lock = LWLockAssign();
+buf->cntx_lock = LWLockAssign();
+buf->cntxDirty = false;
+buf->wait_backend_id = 0;
}

/* close the circular queue */
@@ -214,7 +214,7 @@ InitBufferPool(void)
InitBufTable();
InitFreeList(!foundDescs);

-SpinRelease(BufMgrLock);
+LWLockRelease(BufMgrLock);
}

/*
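Note the change of representation in the loop above: the per-buffer locks are no longer in-place spinlock fields initialized with S_INIT_LOCK(), but handles obtained from LWLockAssign(). A sketch of the difference (the slock_t and LWLockId field types are an inference from this diff, not shown in it):

    /* Before: each buffer header embedded a raw spinlock. */
    slock_t   cntx_lock;            /* S_INIT_LOCK(&(buf->cntx_lock)); */

    /* After: each buffer header holds the identifier of a lightweight
     * lock preallocated in shared memory, handed out at startup. */
    LWLockId  cntx_lock;            /* buf->cntx_lock = LWLockAssign(); */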
src/backend/storage/buffer/buf_table.c

@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
-* $Header: /cvsroot/pgsql/src/backend/storage/buffer/buf_table.c,v 1.21 2001/03/22 03:59:44 momjian Exp $
+* $Header: /cvsroot/pgsql/src/backend/storage/buffer/buf_table.c,v 1.22 2001/09/29 04:02:22 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -23,8 +23,7 @@
*
* Synchronization:
*
-* All routines in this file assume buffer manager spinlock is
-* held by their caller.
+* All routines in this file assume BufMgrLock is held by their caller.
*/

#include "postgres.h"

src/backend/storage/buffer/bufmgr.c

@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
-* $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.116 2001/07/06 21:04:25 tgl Exp $
+* $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.117 2001/09/29 04:02:23 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -59,7 +59,6 @@
(*((XLogRecPtr*)MAKE_PTR((bufHdr)->data)))


-extern SPINLOCK BufMgrLock;
extern long int ReadBufferCount;
extern long int ReadLocalBufferCount;
extern long int BufferHitCount;
@@ -76,7 +75,7 @@ extern long int LocalBufferFlushCount;
*/
bool SharedBufferChanged = false;

-static void WaitIO(BufferDesc *buf, SPINLOCK spinlock);
+static void WaitIO(BufferDesc *buf);
static void StartBufferIO(BufferDesc *buf, bool forInput);
static void TerminateBufferIO(BufferDesc *buf);
static void ContinueBufferIO(BufferDesc *buf, bool forInput);
@@ -130,7 +129,7 @@ ReadBuffer(Relation reln, BlockNumber blockNum)
/*
* ReadBufferInternal -- internal version of ReadBuffer with more options
*
-* bufferLockHeld: if true, caller already acquired the bufmgr spinlock.
+* bufferLockHeld: if true, caller already acquired the bufmgr lock.
* (This is assumed never to be true if dealing with a local buffer!)
*/
static Buffer
@@ -179,7 +178,7 @@ ReadBufferInternal(Relation reln, BlockNumber blockNum,
* block is not currently in memory.
*/
if (!bufferLockHeld)
-SpinAcquire(BufMgrLock);
+LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
bufHdr = BufferAlloc(reln, blockNum, &found);
if (found)
{
@@ -188,7 +187,7 @@ ReadBufferInternal(Relation reln, BlockNumber blockNum,
}
}

-/* At this point we do NOT hold the bufmgr spinlock. */
+/* At this point we do NOT hold the bufmgr lock. */

if (!bufHdr)
return InvalidBuffer;
@@ -208,9 +207,9 @@ ReadBufferInternal(Relation reln, BlockNumber blockNum,
*/
if (!isLocalBuf)
{
-SpinAcquire(BufMgrLock);
+LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
StartBufferIO(bufHdr, false);
-SpinRelease(BufMgrLock);
+LWLockRelease(BufMgrLock);
}
}

@@ -243,7 +242,7 @@ ReadBufferInternal(Relation reln, BlockNumber blockNum,
}

/* lock buffer manager again to update IO IN PROGRESS */
-SpinAcquire(BufMgrLock);
+LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);

if (status == SM_FAIL)
{
@@ -251,7 +250,7 @@ ReadBufferInternal(Relation reln, BlockNumber blockNum,

if (!BufTableDelete(bufHdr))
{
-SpinRelease(BufMgrLock);
+LWLockRelease(BufMgrLock);
elog(FATAL, "BufRead: buffer table broken after IO error");
}
/* remember that BufferAlloc() pinned the buffer */
@@ -274,7 +273,7 @@ ReadBufferInternal(Relation reln, BlockNumber blockNum,
/* If anyone was waiting for IO to complete, wake them up now */
TerminateBufferIO(bufHdr);

-SpinRelease(BufMgrLock);
+LWLockRelease(BufMgrLock);

if (status == SM_FAIL)
return InvalidBuffer;
@@ -322,7 +321,7 @@ BufferAlloc(Relation reln,
*foundPtr = TRUE;
if (inProgress) /* confirm end of IO */
{
-WaitIO(buf, BufMgrLock);
+WaitIO(buf);
inProgress = (buf->flags & BM_IO_IN_PROGRESS);
}
if (BUFFER_IS_BROKEN(buf))
@@ -354,7 +353,7 @@ BufferAlloc(Relation reln,

if (!(*foundPtr))
StartBufferIO(buf, true);
-SpinRelease(BufMgrLock);
+LWLockRelease(BufMgrLock);

return buf;
}
@@ -364,7 +363,7 @@ BufferAlloc(Relation reln,
/*
* Didn't find it in the buffer pool. We'll have to initialize a new
* buffer. First, grab one from the free list. If it's dirty, flush
-* it to disk. Remember to unlock BufMgr spinlock while doing the IOs.
+* it to disk. Remember to unlock BufMgrLock while doing the IOs.
*/
inProgress = FALSE;
for (buf = (BufferDesc *) NULL; buf == (BufferDesc *) NULL;)
@@ -502,7 +501,7 @@ BufferAlloc(Relation reln,
*foundPtr = TRUE;
if (inProgress)
{
-WaitIO(buf2, BufMgrLock);
+WaitIO(buf2);
inProgress = (buf2->flags & BM_IO_IN_PROGRESS);
}
if (BUFFER_IS_BROKEN(buf2))
@@ -510,7 +509,7 @@ BufferAlloc(Relation reln,

if (!(*foundPtr))
StartBufferIO(buf2, true);
-SpinRelease(BufMgrLock);
+LWLockRelease(BufMgrLock);

return buf2;
}
@@ -534,7 +533,7 @@ BufferAlloc(Relation reln,

if (!BufTableDelete(buf))
{
-SpinRelease(BufMgrLock);
+LWLockRelease(BufMgrLock);
elog(FATAL, "buffer wasn't in the buffer table");
}

@@ -542,7 +541,7 @@ BufferAlloc(Relation reln,

if (!BufTableInsert(buf))
{
-SpinRelease(BufMgrLock);
+LWLockRelease(BufMgrLock);
elog(FATAL, "Buffer in lookup table twice");
}

@@ -561,7 +560,7 @@ BufferAlloc(Relation reln,
_bm_trace((reln->rd_rel->relisshared ? 0 : MyDatabaseId), RelationGetRelid(reln), blockNum, BufferDescriptorGetBuffer(buf), BMT_ALLOCNOTFND);
#endif /* BMTRACE */

-SpinRelease(BufMgrLock);
+LWLockRelease(BufMgrLock);

return buf;
}
@@ -595,13 +594,13 @@ WriteBuffer(Buffer buffer)

SharedBufferChanged = true;

-SpinAcquire(BufMgrLock);
+LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
Assert(bufHdr->refcount > 0);

bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);

UnpinBuffer(bufHdr);
-SpinRelease(BufMgrLock);
+LWLockRelease(BufMgrLock);

return TRUE;
}
@@ -625,12 +624,12 @@ WriteNoReleaseBuffer(Buffer buffer)

SharedBufferChanged = true;

-SpinAcquire(BufMgrLock);
+LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
Assert(bufHdr->refcount > 0);

bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);

-SpinRelease(BufMgrLock);
+LWLockRelease(BufMgrLock);

return STATUS_OK;
}
@@ -639,10 +638,10 @@ WriteNoReleaseBuffer(Buffer buffer)
#undef ReleaseAndReadBuffer
/*
* ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
-* to save a spinlock release/acquire.
+* to save a lock release/acquire.
*
* Also, if the passed buffer is valid and already contains the desired block
-* number, we simply return it without ever acquiring the spinlock at all.
+* number, we simply return it without ever acquiring the lock at all.
* Since the passed buffer must be pinned, it's OK to examine its block
* number without getting the lock first.
*
@@ -652,7 +651,7 @@ WriteNoReleaseBuffer(Buffer buffer)
*
* Also note: while it will work to call this routine with blockNum == P_NEW,
* it's best to avoid doing so, since that would result in calling
-* smgrnblocks() while holding the bufmgr spinlock, hence some loss of
+* smgrnblocks() while holding the bufmgr lock, hence some loss of
* concurrency.
*/
Buffer
@@ -684,7 +683,7 @@ ReleaseAndReadBuffer(Buffer buffer,
PrivateRefCount[buffer - 1]--;
else
{
-SpinAcquire(BufMgrLock);
+LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
UnpinBuffer(bufHdr);
return ReadBufferInternal(relation, blockNum, true);
}
@@ -712,12 +711,11 @@ BufferSync()

for (i = 0, bufHdr = BufferDescriptors; i < NBuffers; i++, bufHdr++)
{
-
-SpinAcquire(BufMgrLock);
+LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);

if (!(bufHdr->flags & BM_VALID))
{
-SpinRelease(BufMgrLock);
+LWLockRelease(BufMgrLock);
continue;
}

@@ -731,7 +729,7 @@ BufferSync()
*/
if (!(bufHdr->flags & BM_DIRTY) && !(bufHdr->cntxDirty))
{
-SpinRelease(BufMgrLock);
+LWLockRelease(BufMgrLock);
continue;
}

@@ -741,11 +739,11 @@ BufferSync()
*/
if (bufHdr->flags & BM_IO_IN_PROGRESS)
{
-WaitIO(bufHdr, BufMgrLock);
+WaitIO(bufHdr);
if (!(bufHdr->flags & BM_VALID) ||
(!(bufHdr->flags & BM_DIRTY) && !(bufHdr->cntxDirty)))
{
-SpinRelease(BufMgrLock);
+LWLockRelease(BufMgrLock);
continue;
}
}
@@ -761,7 +759,7 @@ BufferSync()
buffer = BufferDescriptorGetBuffer(bufHdr);
rnode = bufHdr->tag.rnode;

-SpinRelease(BufMgrLock);
+LWLockRelease(BufMgrLock);

/*
* Try to find relation for buffer
@@ -784,10 +782,10 @@ BufferSync()
* should not be able to write it while we were busy with locking
* and log flushing because of we setted IO flag.
*/
-SpinAcquire(BufMgrLock);
+LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
Assert(bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty);
bufHdr->flags &= ~BM_JUST_DIRTIED;
-SpinRelease(BufMgrLock);
+LWLockRelease(BufMgrLock);

if (reln == (Relation) NULL)
{
@@ -822,7 +820,7 @@ BufferSync()
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
BufferFlushCount++;

-SpinAcquire(BufMgrLock);
+LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);

bufHdr->flags &= ~BM_IO_IN_PROGRESS; /* mark IO finished */
TerminateBufferIO(bufHdr); /* Sync IO finished */
@@ -834,7 +832,7 @@ BufferSync()
if (!(bufHdr->flags & BM_JUST_DIRTIED))
bufHdr->flags &= ~BM_DIRTY;
UnpinBuffer(bufHdr);
-SpinRelease(BufMgrLock);
+LWLockRelease(BufMgrLock);

/* drop refcnt obtained by RelationNodeCacheGetRelation */
if (reln != (Relation) NULL)
@@ -846,24 +844,25 @@ BufferSync()
/*
* WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
*
-* Should be entered with buffer manager spinlock held; releases it before
+* Should be entered with buffer manager lock held; releases it before
* waiting and re-acquires it afterwards.
*/
static void
-WaitIO(BufferDesc *buf, SPINLOCK spinlock)
+WaitIO(BufferDesc *buf)
{
/*
* Changed to wait until there's no IO - Inoue 01/13/2000
*
* Note this is *necessary* because an error abort in the process
* doing I/O could release the io_in_progress_lock prematurely.
* See AbortBufferIO.
*/
while ((buf->flags & BM_IO_IN_PROGRESS) != 0)
{
-SpinRelease(spinlock);
-HOLD_INTERRUPTS(); /* don't want to die() holding the lock... */
-S_LOCK(&(buf->io_in_progress_lock));
-S_UNLOCK(&(buf->io_in_progress_lock));
-RESUME_INTERRUPTS();
-SpinAcquire(spinlock);
+LWLockRelease(BufMgrLock);
+LWLockAcquire(buf->io_in_progress_lock, LW_SHARED);
+LWLockRelease(buf->io_in_progress_lock);
+LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
}
}
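Reassembled for readability, the rewritten WaitIO now reads as a sleep-wait rather than a spin (a sketch built only from the lines above; the HOLD/RESUME_INTERRUPTS bracketing disappears, presumably because lwlock.c handles interrupt holdoff itself):

    static void
    WaitIO(BufferDesc *buf)
    {
        while ((buf->flags & BM_IO_IN_PROGRESS) != 0)
        {
            LWLockRelease(BufMgrLock);
            /* Blocks until whoever is doing the I/O drops its exclusive
             * hold on io_in_progress_lock. */
            LWLockAcquire(buf->io_in_progress_lock, LW_SHARED);
            LWLockRelease(buf->io_in_progress_lock);
            LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
        }
    }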

@@ -932,9 +931,9 @@ ResetBufferPool(bool isCommit)
BufferDesc *buf = &BufferDescriptors[i];

PrivateRefCount[i] = 1; /* make sure we release shared pin */
-SpinAcquire(BufMgrLock);
+LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
UnpinBuffer(buf);
-SpinRelease(BufMgrLock);
+LWLockRelease(BufMgrLock);
Assert(PrivateRefCount[i] == 0);
}
}
@@ -1039,7 +1038,7 @@ BufferReplace(BufferDesc *bufHdr)
/* To check if block content changed while flushing. - vadim 01/17/97 */
bufHdr->flags &= ~BM_JUST_DIRTIED;

-SpinRelease(BufMgrLock);
+LWLockRelease(BufMgrLock);

/*
* No need to lock buffer context - no one should be able to end
@@ -1067,7 +1066,7 @@ BufferReplace(BufferDesc *bufHdr)
if (reln != (Relation) NULL)
RelationDecrementReferenceCount(reln);

-SpinAcquire(BufMgrLock);
+LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);

if (status == SM_FAIL)
return FALSE;
@@ -1140,7 +1139,8 @@ DropRelationBuffers(Relation rel)
return;
}

-SpinAcquire(BufMgrLock);
+LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);

for (i = 1; i <= NBuffers; i++)
{
bufHdr = &BufferDescriptors[i - 1];
@@ -1155,7 +1155,7 @@ recheck:
*/
if (bufHdr->flags & BM_IO_IN_PROGRESS)
{
-WaitIO(bufHdr, BufMgrLock);
+WaitIO(bufHdr);

/*
* By now, the buffer very possibly belongs to some other
@@ -1189,7 +1189,7 @@ recheck:
}
}

-SpinRelease(BufMgrLock);
+LWLockRelease(BufMgrLock);
}

/* ---------------------------------------------------------------------
@@ -1223,7 +1223,8 @@ DropRelFileNodeBuffers(RelFileNode rnode)
}
}

-SpinAcquire(BufMgrLock);
+LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);

for (i = 1; i <= NBuffers; i++)
{
bufHdr = &BufferDescriptors[i - 1];
@@ -1238,7 +1239,7 @@ recheck:
*/
if (bufHdr->flags & BM_IO_IN_PROGRESS)
{
-WaitIO(bufHdr, BufMgrLock);
+WaitIO(bufHdr);

/*
* By now, the buffer very possibly belongs to some other
@@ -1272,7 +1273,7 @@ recheck:
}
}

-SpinRelease(BufMgrLock);
+LWLockRelease(BufMgrLock);
}

/* ---------------------------------------------------------------------
@@ -1292,7 +1293,8 @@ DropBuffers(Oid dbid)
int i;
BufferDesc *bufHdr;

-SpinAcquire(BufMgrLock);
+LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);

for (i = 1; i <= NBuffers; i++)
{
bufHdr = &BufferDescriptors[i - 1];
@@ -1313,7 +1315,7 @@ recheck:
*/
if (bufHdr->flags & BM_IO_IN_PROGRESS)
{
-WaitIO(bufHdr, BufMgrLock);
+WaitIO(bufHdr);

/*
* By now, the buffer very possibly belongs to some other
@@ -1337,7 +1339,8 @@ recheck:
BufTableDelete(bufHdr);
}
}
-SpinRelease(BufMgrLock);
+
+LWLockRelease(BufMgrLock);
}

/* -----------------------------------------------------------------
@@ -1355,7 +1358,7 @@ PrintBufferDescs()

if (IsUnderPostmaster)
{
-SpinAcquire(BufMgrLock);
+LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
for (i = 0; i < NBuffers; ++i, ++buf)
{
elog(DEBUG, "[%02d] (freeNext=%d, freePrev=%d, rel=%u/%u, \
@@ -1365,7 +1368,7 @@ blockNum=%u, flags=0x%x, refcount=%d %ld)",
buf->tag.blockNum, buf->flags,
buf->refcount, PrivateRefCount[i]);
}
-SpinRelease(BufMgrLock);
+LWLockRelease(BufMgrLock);
}
else
{
@@ -1386,7 +1389,7 @@ PrintPinnedBufs()
int i;
BufferDesc *buf = BufferDescriptors;

-SpinAcquire(BufMgrLock);
+LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
for (i = 0; i < NBuffers; ++i, ++buf)
{
if (PrivateRefCount[i] > 0)
@@ -1397,7 +1400,7 @@ blockNum=%u, flags=0x%x, refcount=%d %ld)",
buf->tag.blockNum, buf->flags,
buf->refcount, PrivateRefCount[i]);
}
-SpinRelease(BufMgrLock);
+LWLockRelease(BufMgrLock);
}

/*
@@ -1514,7 +1517,8 @@ FlushRelationBuffers(Relation rel, BlockNumber firstDelBlock)
return 0;
}

-SpinAcquire(BufMgrLock);
+LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);

for (i = 0; i < NBuffers; i++)
{
bufHdr = &BufferDescriptors[i];
@@ -1524,8 +1528,8 @@ FlushRelationBuffers(Relation rel, BlockNumber firstDelBlock)
{
PinBuffer(bufHdr);
if (bufHdr->flags & BM_IO_IN_PROGRESS)
-WaitIO(bufHdr, BufMgrLock);
-SpinRelease(BufMgrLock);
+WaitIO(bufHdr);
+LWLockRelease(BufMgrLock);

/*
* Force XLOG flush for buffer' LSN
@@ -1537,16 +1541,16 @@ FlushRelationBuffers(Relation rel, BlockNumber firstDelBlock)
* Now it's safe to write buffer to disk
*/

-SpinAcquire(BufMgrLock);
+LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
if (bufHdr->flags & BM_IO_IN_PROGRESS)
-WaitIO(bufHdr, BufMgrLock);
+WaitIO(bufHdr);

if (bufHdr->flags & BM_DIRTY || bufHdr->cntxDirty)
{
bufHdr->flags &= ~BM_JUST_DIRTIED;
StartBufferIO(bufHdr, false); /* output IO start */

-SpinRelease(BufMgrLock);
+LWLockRelease(BufMgrLock);

status = smgrwrite(DEFAULT_SMGR, rel,
bufHdr->tag.blockNum,
@@ -1560,7 +1564,7 @@ FlushRelationBuffers(Relation rel, BlockNumber firstDelBlock)

BufferFlushCount++;

-SpinAcquire(BufMgrLock);
+LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
bufHdr->flags &= ~BM_IO_IN_PROGRESS;
TerminateBufferIO(bufHdr);
Assert(!(bufHdr->flags & BM_JUST_DIRTIED));
@@ -1578,7 +1582,7 @@ FlushRelationBuffers(Relation rel, BlockNumber firstDelBlock)
}
if (!(bufHdr->flags & BM_FREE))
{
-SpinRelease(BufMgrLock);
+LWLockRelease(BufMgrLock);
elog(NOTICE, "FlushRelationBuffers(%s, %u): block %u is referenced (private %ld, global %d)",
RelationGetRelationName(rel), firstDelBlock,
bufHdr->tag.blockNum,
@@ -1589,7 +1593,7 @@ FlushRelationBuffers(Relation rel, BlockNumber firstDelBlock)
BufTableDelete(bufHdr);
}
}
-SpinRelease(BufMgrLock);
+LWLockRelease(BufMgrLock);
return 0;
}

@@ -1621,9 +1625,9 @@ ReleaseBuffer(Buffer buffer)
PrivateRefCount[buffer - 1]--;
else
{
-SpinAcquire(BufMgrLock);
+LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
UnpinBuffer(bufHdr);
-SpinRelease(BufMgrLock);
+LWLockRelease(BufMgrLock);
}

return STATUS_OK;
@@ -1919,13 +1923,18 @@ SetBufferCommitInfoNeedsSave(Buffer buffer)
if ((bufHdr->flags & (BM_DIRTY | BM_JUST_DIRTIED)) !=
(BM_DIRTY | BM_JUST_DIRTIED))
{
-SpinAcquire(BufMgrLock);
+LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
Assert(bufHdr->refcount > 0);
bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
-SpinRelease(BufMgrLock);
+LWLockRelease(BufMgrLock);
}
}

/*
* Release buffer context locks for shared buffers.
*
* Used to clean up after errors.
*/
void
UnlockBuffers(void)
{
@@ -1942,36 +1951,15 @@ UnlockBuffers(void)
Assert(BufferIsValid(i + 1));
buf = &(BufferDescriptors[i]);

-HOLD_INTERRUPTS(); /* don't want to die() holding the lock... */
+HOLD_INTERRUPTS(); /* don't want to die() partway through... */

-S_LOCK(&(buf->cntx_lock));
-
-if (buflocks & BL_R_LOCK)
-{
-Assert(buf->r_locks > 0);
-(buf->r_locks)--;
-}
-if (buflocks & BL_RI_LOCK)
-{
-/*
-* Someone else could remove our RI lock when acquiring W
-* lock. This is possible if we came here from elog(ERROR)
-* from IpcSemaphore{Lock|Unlock}(WaitCLSemId). And so we
-* don't do Assert(buf->ri_lock) here.
-*/
-buf->ri_lock = false;
-}
-if (buflocks & BL_W_LOCK)
-{
-Assert(buf->w_lock);
-buf->w_lock = false;
-}
-
-S_UNLOCK(&(buf->cntx_lock));
+/*
+* The buffer's cntx_lock has already been released by lwlock.c.
+*/

if (buflocks & BL_PIN_COUNT_LOCK)
{
-SpinAcquire(BufMgrLock);
+LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
/*
* Don't complain if flag bit not set; it could have been reset
* but we got a cancel/die interrupt before getting the signal.
@@ -1979,7 +1967,7 @@ UnlockBuffers(void)
if ((buf->flags & BM_PIN_COUNT_WAITER) != 0 &&
buf->wait_backend_id == MyBackendId)
buf->flags &= ~BM_PIN_COUNT_WAITER;
-SpinRelease(BufMgrLock);
+LWLockRelease(BufMgrLock);
ProcCancelWaitForSignal();
}

@@ -1989,94 +1977,31 @@ UnlockBuffers(void)
}
}

-/* Max time to wait to acquire a buffer read or write lock */
-#define BUFFER_LOCK_TIMEOUT (10*60*1000000) /* 10 minutes */
-
/*
* Acquire or release the cntx_lock for the buffer.
*/
void
LockBuffer(Buffer buffer, int mode)
{
BufferDesc *buf;
-bits8 *buflock;

Assert(BufferIsValid(buffer));
if (BufferIsLocal(buffer))
return;

buf = &(BufferDescriptors[buffer - 1]);
-buflock = &(BufferLocks[buffer - 1]);
-
-HOLD_INTERRUPTS(); /* don't want to die() holding the lock... */
-
-S_LOCK(&(buf->cntx_lock));

if (mode == BUFFER_LOCK_UNLOCK)
-{
-if (*buflock & BL_R_LOCK)
-{
-Assert(buf->r_locks > 0);
-Assert(!(buf->w_lock));
-Assert(!(*buflock & (BL_W_LOCK | BL_RI_LOCK)));
-(buf->r_locks)--;
-*buflock &= ~BL_R_LOCK;
-}
-else if (*buflock & BL_W_LOCK)
-{
-Assert(buf->w_lock);
-Assert(buf->r_locks == 0);
-Assert(!(*buflock & (BL_R_LOCK | BL_RI_LOCK)));
-buf->w_lock = false;
-*buflock &= ~BL_W_LOCK;
-}
-else
-{
-S_UNLOCK(&(buf->cntx_lock));
-RESUME_INTERRUPTS();
-elog(ERROR, "UNLockBuffer: buffer %d is not locked", buffer);
-}
+LWLockRelease(buf->cntx_lock);
-}
else if (mode == BUFFER_LOCK_SHARE)
-{
-unsigned i = 0;
-
-Assert(!(*buflock & (BL_R_LOCK | BL_W_LOCK | BL_RI_LOCK)));
-while (buf->ri_lock || buf->w_lock)
-{
-S_UNLOCK(&(buf->cntx_lock));
-RESUME_INTERRUPTS();
-S_LOCK_SLEEP(&(buf->cntx_lock), i++, BUFFER_LOCK_TIMEOUT);
-HOLD_INTERRUPTS();
-S_LOCK(&(buf->cntx_lock));
-}
-(buf->r_locks)++;
-*buflock |= BL_R_LOCK;
+LWLockAcquire(buf->cntx_lock, LW_SHARED);
-}
else if (mode == BUFFER_LOCK_EXCLUSIVE)
{
-unsigned i = 0;
-
-Assert(!(*buflock & (BL_R_LOCK | BL_W_LOCK | BL_RI_LOCK)));
-while (buf->r_locks > 0 || buf->w_lock)
-{
-if (buf->r_locks > 3 || (*buflock & BL_RI_LOCK))
-{
-
-/*
-* Our RI lock might be removed by concurrent W lock
-* acquiring (see what we do with RI locks below when our
-* own W acquiring succeeded) and so we set RI lock again
-* if we already did this.
-*/
-*buflock |= BL_RI_LOCK;
-buf->ri_lock = true;
-}
-S_UNLOCK(&(buf->cntx_lock));
-RESUME_INTERRUPTS();
-S_LOCK_SLEEP(&(buf->cntx_lock), i++, BUFFER_LOCK_TIMEOUT);
-HOLD_INTERRUPTS();
-S_LOCK(&(buf->cntx_lock));
-}
-buf->w_lock = true;
-*buflock |= BL_W_LOCK;
+LWLockAcquire(buf->cntx_lock, LW_EXCLUSIVE);

/*
* This is not the best place to set cntxDirty flag (eg indices do
@@ -2085,27 +2010,11 @@ LockBuffer(Buffer buffer, int mode)
* changes with XLogInsert() - see comments in BufferSync().
*/
buf->cntxDirty = true;

-if (*buflock & BL_RI_LOCK)
-{
-
-/*
-* It's possible to remove RI locks acquired by another W
-* lockers here, but they'll take care about it.
-*/
-buf->ri_lock = false;
-*buflock &= ~BL_RI_LOCK;
-}
}
else
{
-S_UNLOCK(&(buf->cntx_lock));
-RESUME_INTERRUPTS();
elog(ERROR, "LockBuffer: unknown lock mode %d", mode);
}
-
-S_UNLOCK(&(buf->cntx_lock));
-RESUME_INTERRUPTS();
}
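The whole hand-rolled r_locks/w_lock/ri_lock state machine collapses to one LWLock call per mode. Reassembled from the lines above, the surviving body of LockBuffer is essentially:

    buf = &(BufferDescriptors[buffer - 1]);

    if (mode == BUFFER_LOCK_UNLOCK)
        LWLockRelease(buf->cntx_lock);
    else if (mode == BUFFER_LOCK_SHARE)
        LWLockAcquire(buf->cntx_lock, LW_SHARED);
    else if (mode == BUFFER_LOCK_EXCLUSIVE)
    {
        LWLockAcquire(buf->cntx_lock, LW_EXCLUSIVE);
        /* see comments in BufferSync() about the cntxDirty flag */
        buf->cntxDirty = true;
    }
    else
        elog(ERROR, "LockBuffer: unknown lock mode %d", mode);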

/*
@@ -2152,25 +2061,25 @@ LockBufferForCleanup(Buffer buffer)
{
/* Try to acquire lock */
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
-SpinAcquire(BufMgrLock);
+LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
Assert(bufHdr->refcount > 0);
if (bufHdr->refcount == 1)
{
/* Successfully acquired exclusive lock with pincount 1 */
-SpinRelease(BufMgrLock);
+LWLockRelease(BufMgrLock);
return;
}
/* Failed, so mark myself as waiting for pincount 1 */
if (bufHdr->flags & BM_PIN_COUNT_WAITER)
{
-SpinRelease(BufMgrLock);
+LWLockRelease(BufMgrLock);
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
elog(ERROR, "Multiple backends attempting to wait for pincount 1");
}
bufHdr->wait_backend_id = MyBackendId;
bufHdr->flags |= BM_PIN_COUNT_WAITER;
*buflock |= BL_PIN_COUNT_LOCK;
-SpinRelease(BufMgrLock);
+LWLockRelease(BufMgrLock);
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
/* Wait to be signaled by UnpinBuffer() */
ProcWaitForSignal();
@@ -2183,8 +2092,7 @@ LockBufferForCleanup(Buffer buffer)
* Functions for IO error handling
*
* Note : We assume that nested buffer IO never occur.
-* i.e at most one io_in_progress spinlock is held
-* per proc.
+* i.e at most one io_in_progress lock is held per proc.
*/
static BufferDesc *InProgressBuf = (BufferDesc *) NULL;
static bool IsForInput;
@@ -2207,18 +2115,7 @@ StartBufferIO(BufferDesc *buf, bool forInput)
Assert(!(buf->flags & BM_IO_IN_PROGRESS));
buf->flags |= BM_IO_IN_PROGRESS;

-/*
-* There used to be
-*
-* Assert(S_LOCK_FREE(&(buf->io_in_progress_lock)));
-*
-* here, but that's wrong because of the way WaitIO works: someone else
-* waiting for the I/O to complete will succeed in grabbing the lock
-* for a few instructions, and if we context-swap back to here the
-* Assert could fail. Tiny window for failure, but I've seen it
-* happen -- tgl
-*/
-S_LOCK(&(buf->io_in_progress_lock));
+LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);

InProgressBuf = buf;
IsForInput = forInput;
@@ -2238,7 +2135,7 @@ static void
TerminateBufferIO(BufferDesc *buf)
{
Assert(buf == InProgressBuf);
-S_UNLOCK(&(buf->io_in_progress_lock));
+LWLockRelease(buf->io_in_progress_lock);
InProgressBuf = (BufferDesc *) 0;
}

@@ -2271,7 +2168,6 @@ InitBufferIO(void)

/*
* Clean up any active buffer I/O after an error.
-* This function is called from ProcReleaseSpins().
* BufMgrLock isn't held when this function is called.
*
* If I/O was in progress, we always set BM_IO_ERROR.
@@ -2283,7 +2179,16 @@ AbortBufferIO(void)

if (buf)
{
-SpinAcquire(BufMgrLock);
+/*
+* Since LWLockReleaseAll has already been called,
+* we're not holding the buffer's io_in_progress_lock.
+* We have to re-acquire it so that we can use TerminateBufferIO.
+* Anyone who's executing WaitIO on the buffer will be in a busy spin
+* until we succeed in doing this.
+*/
+LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE);
+
+LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
Assert(buf->flags & BM_IO_IN_PROGRESS);
if (IsForInput)
Assert(!(buf->flags & BM_DIRTY) && !(buf->cntxDirty));
@@ -2302,7 +2207,7 @@ AbortBufferIO(void)
buf->flags |= BM_IO_ERROR;
buf->flags &= ~BM_IO_IN_PROGRESS;
TerminateBufferIO(buf);
-SpinRelease(BufMgrLock);
+LWLockRelease(BufMgrLock);
}
}
src/backend/storage/buffer/freelist.c

@@ -9,7 +9,7 @@
*
*
* IDENTIFICATION
-* $Header: /cvsroot/pgsql/src/backend/storage/buffer/freelist.c,v 1.24 2001/07/06 21:04:26 tgl Exp $
+* $Header: /cvsroot/pgsql/src/backend/storage/buffer/freelist.c,v 1.25 2001/09/29 04:02:23 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -29,6 +29,7 @@

#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
+#include "storage/ipc.h"
#include "storage/proc.h"

src/backend/storage/freespace/freespace.c

@@ -8,7 +8,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
-* $Header: /cvsroot/pgsql/src/backend/storage/freespace/freespace.c,v 1.4 2001/07/19 21:25:37 tgl Exp $
+* $Header: /cvsroot/pgsql/src/backend/storage/freespace/freespace.c,v 1.5 2001/09/29 04:02:23 tgl Exp $
*
*
* NOTES:
@@ -56,6 +56,7 @@

#include "storage/freespace.h"
#include "storage/itemid.h"
+#include "storage/lwlock.h"
#include "storage/shmem.h"


@@ -122,9 +123,6 @@ struct FSMChunk
};


-SPINLOCK FreeSpaceLock; /* in Shmem or created in
-* CreateSpinlocks() */
-
int MaxFSMRelations; /* these are set by guc.c */
int MaxFSMPages;

@@ -256,7 +254,7 @@ GetPageWithFreeSpace(RelFileNode *rel, Size spaceNeeded)
FSMRelation *fsmrel;
BlockNumber freepage;

-SpinAcquire(FreeSpaceLock);
+LWLockAcquire(FreeSpaceLock, LW_EXCLUSIVE);
/*
* We always add a rel to the hashtable when it is inquired about.
*/
@@ -279,7 +277,7 @@ GetPageWithFreeSpace(RelFileNode *rel, Size spaceNeeded)
fsmrel->threshold = (Size) cur_avg;
}
freepage = find_free_space(fsmrel, spaceNeeded);
-SpinRelease(FreeSpaceLock);
+LWLockRelease(FreeSpaceLock);
return freepage;
}

@@ -299,7 +297,7 @@ RecordFreeSpace(RelFileNode *rel, BlockNumber page, Size spaceAvail)
/* Sanity check: ensure spaceAvail will fit into ItemLength */
AssertArg(spaceAvail < BLCKSZ);

-SpinAcquire(FreeSpaceLock);
+LWLockAcquire(FreeSpaceLock, LW_EXCLUSIVE);
/*
* We choose not to add rels to the hashtable unless they've been
* inquired about with GetPageWithFreeSpace. Also, a Record operation
@@ -308,11 +306,11 @@ RecordFreeSpace(RelFileNode *rel, BlockNumber page, Size spaceAvail)
fsmrel = lookup_fsm_rel(rel);
if (fsmrel)
fsm_record_free_space(fsmrel, page, spaceAvail);
-SpinRelease(FreeSpaceLock);
+LWLockRelease(FreeSpaceLock);
}

/*
-* RecordAndGetPageWithFreeSpace - combo form to save one spinlock and
+* RecordAndGetPageWithFreeSpace - combo form to save one lock and
* hash table lookup cycle.
*/
BlockNumber
@@ -327,7 +325,7 @@ RecordAndGetPageWithFreeSpace(RelFileNode *rel,
/* Sanity check: ensure spaceAvail will fit into ItemLength */
AssertArg(oldSpaceAvail < BLCKSZ);

-SpinAcquire(FreeSpaceLock);
+LWLockAcquire(FreeSpaceLock, LW_EXCLUSIVE);
/*
* We always add a rel to the hashtable when it is inquired about.
*/
@@ -351,7 +349,7 @@ RecordAndGetPageWithFreeSpace(RelFileNode *rel,
fsm_record_free_space(fsmrel, oldPage, oldSpaceAvail);
/* Do the Get */
freepage = find_free_space(fsmrel, spaceNeeded);
-SpinRelease(FreeSpaceLock);
+LWLockRelease(FreeSpaceLock);
return freepage;
}

@@ -378,7 +376,7 @@ MultiRecordFreeSpace(RelFileNode *rel,
FSMRelation *fsmrel;
int i;

-SpinAcquire(FreeSpaceLock);
+LWLockAcquire(FreeSpaceLock, LW_EXCLUSIVE);
fsmrel = lookup_fsm_rel(rel);
if (fsmrel)
{
@@ -437,7 +435,7 @@ MultiRecordFreeSpace(RelFileNode *rel,
fsm_record_free_space(fsmrel, page, avail);
}
}
-SpinRelease(FreeSpaceLock);
+LWLockRelease(FreeSpaceLock);
}

/*
@@ -452,11 +450,11 @@ FreeSpaceMapForgetRel(RelFileNode *rel)
{
FSMRelation *fsmrel;

-SpinAcquire(FreeSpaceLock);
+LWLockAcquire(FreeSpaceLock, LW_EXCLUSIVE);
fsmrel = lookup_fsm_rel(rel);
if (fsmrel)
delete_fsm_rel(fsmrel);
-SpinRelease(FreeSpaceLock);
+LWLockRelease(FreeSpaceLock);
}

/*
@@ -474,14 +472,14 @@ FreeSpaceMapForgetDatabase(Oid dbid)
FSMRelation *fsmrel,
*nextrel;

-SpinAcquire(FreeSpaceLock);
+LWLockAcquire(FreeSpaceLock, LW_EXCLUSIVE);
for (fsmrel = FreeSpaceMap->relList; fsmrel; fsmrel = nextrel)
{
nextrel = fsmrel->nextRel; /* in case we delete it */
if (fsmrel->key.tblNode == dbid)
delete_fsm_rel(fsmrel);
}
-SpinRelease(FreeSpaceLock);
+LWLockRelease(FreeSpaceLock);
}

src/backend/storage/ipc/ipc.c

@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
-* $Header: /cvsroot/pgsql/src/backend/storage/ipc/ipc.c,v 1.68 2001/09/04 00:22:34 petere Exp $
+* $Header: /cvsroot/pgsql/src/backend/storage/ipc/ipc.c,v 1.69 2001/09/29 04:02:23 tgl Exp $
*
* NOTES
*
@@ -34,7 +34,6 @@
#include <unistd.h>

#include "storage/ipc.h"
-#include "storage/s_lock.h"
/* In Ultrix, sem.h and shm.h must be included AFTER ipc.h */
#ifdef HAVE_SYS_SEM_H
#include <sys/sem.h>
@@ -306,7 +305,7 @@ InternalIpcSemaphoreCreate(IpcSemaphoreKey semKey,
if (errno == ENOSPC)
fprintf(stderr,
"\nThis error does *not* mean that you have run out of disk space.\n\n"
-"It occurs either because system limit for the maximum number of\n"
+"It occurs because either the system limit for the maximum number of\n"
"semaphore sets (SEMMNI), or the system wide maximum number of\n"
"semaphores (SEMMNS), would be exceeded. You need to raise the\n"
"respective kernel parameter. Look into the PostgreSQL documentation\n"
@@ -416,8 +415,8 @@ IpcSemaphoreLock(IpcSemaphoreId semId, int sem, bool interruptOK)
* record acquiring the lock. (This is currently true for lockmanager
* locks, since the process that granted us the lock did all the
* necessary state updates. It's not true for SysV semaphores used to
-* emulate spinlocks --- but our performance on such platforms is so
-* horrible anyway that I'm not going to worry too much about it.)
+* implement LW locks or emulate spinlocks --- but the wait time for
+* such locks should not be very long, anyway.)
*/
do
{
src/backend/storage/ipc/ipci.c

@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
-* $Header: /cvsroot/pgsql/src/backend/storage/ipc/ipci.c,v 1.42 2001/08/25 18:52:42 tgl Exp $
+* $Header: /cvsroot/pgsql/src/backend/storage/ipc/ipci.c,v 1.43 2001/09/29 04:02:23 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -22,6 +22,7 @@
#include "storage/bufmgr.h"
#include "storage/freespace.h"
#include "storage/lmgr.h"
+#include "storage/lwlock.h"
#include "storage/proc.h"
#include "storage/sinval.h"
#include "storage/spin.h"
@@ -53,7 +54,7 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int maxBackends)
size += LockShmemSize(maxBackends);
size += XLOGShmemSize();
size += CLOGShmemSize();
-size += SLockShmemSize();
+size += LWLockShmemSize();
size += SInvalShmemSize(maxBackends);
size += FreeSpaceShmemSize();
#ifdef STABLE_MEMORY_STORAGE
@@ -74,13 +75,24 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int maxBackends)
/*
* First initialize spinlocks --- needed by InitShmemAllocation()
*/
-CreateSpinlocks(seghdr);
+CreateSpinlocks();

/*
-* Set up shmem.c hashtable
+* Set up shared memory allocation mechanism
*/
InitShmemAllocation(seghdr);

+/*
+* Now initialize LWLocks, which do shared memory allocation and
+* are needed for InitShmemIndex.
+*/
+CreateLWLocks();
+
+/*
+* Set up shmem.c index hashtable
+*/
+InitShmemIndex();
+
/*
* Set up xlog, clog, and buffers
*/
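The startup sequence thus becomes a strict layering; the order below is taken directly from the hunk above, with the reasons condensed into comments:

    CreateSpinlocks();           /* raw spinlocks first; nothing else works yet */
    InitShmemAllocation(seghdr); /* bump allocator, guarded by a raw spinlock   */
    CreateLWLocks();             /* LWLocks live in shmem, so they come next    */
    InitShmemIndex();            /* the index hashtable needs ShmemIndexLock,
                                  * which is itself an LWLock                   */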
src/backend/storage/ipc/shmem.c

@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
-* $Header: /cvsroot/pgsql/src/backend/storage/ipc/shmem.c,v 1.58 2001/09/07 00:27:29 tgl Exp $
+* $Header: /cvsroot/pgsql/src/backend/storage/ipc/shmem.c,v 1.59 2001/09/29 04:02:23 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -61,8 +61,10 @@
#include "postgres.h"

#include "access/transam.h"
+#include "storage/spin.h"
#include "utils/tqual.h"


/* shared memory global variables */

static PGShmemHeader *ShmemSegHdr; /* shared mem segment header */
@@ -71,9 +73,7 @@ SHMEM_OFFSET ShmemBase; /* start address of shared memory */

static SHMEM_OFFSET ShmemEnd; /* end+1 address of shared memory */

-SPINLOCK ShmemLock; /* lock for shared memory allocation */
-
-SPINLOCK ShmemIndexLock; /* lock for shmem index access */
+static slock_t *ShmemLock; /* spinlock for shared memory allocation */

static HTAB *ShmemIndex = NULL; /* primary index hashtable for shmem */

@@ -81,63 +81,33 @@ static bool ShmemBootstrap = false; /* bootstrapping shmem index? */


/*
-* InitShmemAllocation() --- set up shared-memory allocation and index table.
+* InitShmemAllocation() --- set up shared-memory allocation.
+*
+* Note: the argument should be declared "PGShmemHeader *seghdr",
+* but we use void to avoid having to include ipc.h in shmem.h.
*/
void
-InitShmemAllocation(PGShmemHeader *seghdr)
+InitShmemAllocation(void *seghdr)
{
-HASHCTL info;
-int hash_flags;
-ShmemIndexEnt *result,
-item;
-bool found;
+PGShmemHeader *shmhdr = (PGShmemHeader *) seghdr;

/* Set up basic pointers to shared memory */
-ShmemSegHdr = seghdr;
-ShmemBase = (SHMEM_OFFSET) seghdr;
-ShmemEnd = ShmemBase + seghdr->totalsize;
+ShmemSegHdr = shmhdr;
+ShmemBase = (SHMEM_OFFSET) shmhdr;
+ShmemEnd = ShmemBase + shmhdr->totalsize;

/*
-* Since ShmemInitHash calls ShmemInitStruct, which expects the
-* ShmemIndex hashtable to exist already, we have a bit of a
-* circularity problem in initializing the ShmemIndex itself. We set
-* ShmemBootstrap to tell ShmemInitStruct to fake it.
+* Initialize the spinlock used by ShmemAlloc. We have to do the
+* space allocation the hard way, since ShmemAlloc can't be called yet.
*/
+ShmemLock = (slock_t *) (((char *) shmhdr) + shmhdr->freeoffset);
+shmhdr->freeoffset += MAXALIGN(sizeof(slock_t));
+Assert(shmhdr->freeoffset <= shmhdr->totalsize);
+
+SpinLockInit(ShmemLock);
+
+/* ShmemIndex can't be set up yet (need LWLocks first) */
ShmemIndex = (HTAB *) NULL;
ShmemBootstrap = true;

-/* create the shared memory shmem index */
-info.keysize = SHMEM_INDEX_KEYSIZE;
-info.datasize = SHMEM_INDEX_DATASIZE;
-hash_flags = HASH_ELEM;
-
-/* This will acquire the shmem index lock, but not release it. */
-ShmemIndex = ShmemInitHash("ShmemIndex",
-SHMEM_INDEX_SIZE, SHMEM_INDEX_SIZE,
-&info, hash_flags);
-if (!ShmemIndex)
-elog(FATAL, "InitShmemAllocation: couldn't initialize Shmem Index");
-
-/*
-* Now, create an entry in the hashtable for the index itself.
-*/
-MemSet(item.key, 0, SHMEM_INDEX_KEYSIZE);
-strncpy(item.key, "ShmemIndex", SHMEM_INDEX_KEYSIZE);
-
-result = (ShmemIndexEnt *)
-hash_search(ShmemIndex, (char *) &item, HASH_ENTER, &found);
-if (!result)
-elog(FATAL, "InitShmemAllocation: corrupted shmem index");
-
-Assert(ShmemBootstrap && !found);
-
-result->location = MAKE_OFFSET(ShmemIndex->hctl);
-result->size = SHMEM_INDEX_SIZE;
-
-ShmemBootstrap = false;
-
-/* now release the lock acquired in ShmemInitStruct */
-SpinRelease(ShmemIndexLock);

/*
* Initialize ShmemVariableCache for transaction manager.
@@ -167,9 +137,9 @@ ShmemAlloc(Size size)
*/
size = MAXALIGN(size);

-Assert(ShmemSegHdr);
+Assert(ShmemSegHdr != NULL);

-SpinAcquire(ShmemLock);
+SpinLockAcquire(ShmemLock);

newFree = ShmemSegHdr->freeoffset + size;
if (newFree <= ShmemSegHdr->totalsize)
@@ -180,7 +150,7 @@ ShmemAlloc(Size size)
else
newSpace = NULL;

-SpinRelease(ShmemLock);
+SpinLockRelease(ShmemLock);

if (!newSpace)
elog(NOTICE, "ShmemAlloc: out of memory");
@@ -199,6 +169,60 @@ ShmemIsValid(unsigned long addr)
return (addr < ShmemEnd) && (addr >= ShmemBase);
}

+/*
+* InitShmemIndex() --- set up shmem index table.
+*/
+void
+InitShmemIndex(void)
+{
+HASHCTL info;
+int hash_flags;
+ShmemIndexEnt *result,
+item;
+bool found;
+
+/*
+* Since ShmemInitHash calls ShmemInitStruct, which expects the
+* ShmemIndex hashtable to exist already, we have a bit of a
+* circularity problem in initializing the ShmemIndex itself. We set
+* ShmemBootstrap to tell ShmemInitStruct to fake it.
+*/
+ShmemBootstrap = true;
+
+/* create the shared memory shmem index */
+info.keysize = SHMEM_INDEX_KEYSIZE;
+info.datasize = SHMEM_INDEX_DATASIZE;
+hash_flags = HASH_ELEM;
+
+/* This will acquire the shmem index lock, but not release it. */
+ShmemIndex = ShmemInitHash("ShmemIndex",
+SHMEM_INDEX_SIZE, SHMEM_INDEX_SIZE,
+&info, hash_flags);
+if (!ShmemIndex)
+elog(FATAL, "InitShmemIndex: couldn't initialize Shmem Index");
+
+/*
+* Now, create an entry in the hashtable for the index itself.
+*/
+MemSet(item.key, 0, SHMEM_INDEX_KEYSIZE);
+strncpy(item.key, "ShmemIndex", SHMEM_INDEX_KEYSIZE);
+
+result = (ShmemIndexEnt *)
+hash_search(ShmemIndex, (char *) &item, HASH_ENTER, &found);
+if (!result)
+elog(FATAL, "InitShmemIndex: corrupted shmem index");
+
+Assert(ShmemBootstrap && !found);
+
+result->location = MAKE_OFFSET(ShmemIndex->hctl);
+result->size = SHMEM_INDEX_SIZE;
+
+ShmemBootstrap = false;
+
+/* now release the lock acquired in ShmemInitStruct */
+LWLockRelease(ShmemIndexLock);
+}
+
/*
* ShmemInitHash -- Create/Attach to and initialize
* shared memory hash table.
@@ -207,8 +231,7 @@ ShmemIsValid(unsigned long addr)
*
* assume caller is doing some kind of synchronization
* so that two people dont try to create/initialize the
-* table at once. Use SpinAlloc() to create a spinlock
-* for the structure before creating the structure itself.
+* table at once.
*/
HTAB *
ShmemInitHash(char *name, /* table string name for shmem index */
@@ -283,7 +306,7 @@ ShmemInitStruct(char *name, Size size, bool *foundPtr)
strncpy(item.key, name, SHMEM_INDEX_KEYSIZE);
item.location = BAD_LOCATION;

-SpinAcquire(ShmemIndexLock);
+LWLockAcquire(ShmemIndexLock, LW_EXCLUSIVE);

if (!ShmemIndex)
{
@@ -306,7 +329,7 @@ ShmemInitStruct(char *name, Size size, bool *foundPtr)

if (!result)
{
-SpinRelease(ShmemIndexLock);
+LWLockRelease(ShmemIndexLock);
elog(ERROR, "ShmemInitStruct: Shmem Index corrupted");
return NULL;
}
@@ -320,7 +343,7 @@ ShmemInitStruct(char *name, Size size, bool *foundPtr)
*/
if (result->size != size)
{
-SpinRelease(ShmemIndexLock);
+LWLockRelease(ShmemIndexLock);

elog(NOTICE, "ShmemInitStruct: ShmemIndex entry size is wrong");
/* let caller print its message too */
@@ -337,7 +360,7 @@ ShmemInitStruct(char *name, Size size, bool *foundPtr)
/* out of memory */
Assert(ShmemIndex);
hash_search(ShmemIndex, (char *) &item, HASH_REMOVE, foundPtr);
-SpinRelease(ShmemIndexLock);
+LWLockRelease(ShmemIndexLock);
*foundPtr = FALSE;

elog(NOTICE, "ShmemInitStruct: cannot allocate '%s'",
@@ -349,6 +372,6 @@ ShmemInitStruct(char *name, Size size, bool *foundPtr)
}
Assert(ShmemIsValid((unsigned long) structPtr));

-SpinRelease(ShmemIndexLock);
+LWLockRelease(ShmemIndexLock);
return structPtr;
}
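ShmemAlloc itself stays on a raw spinlock, since a bump allocation is only a few instructions — exactly the "very short hold time" case the commit message reserves spinlocks for. A sketch of the resulting function, assembled from the hunks above; the two lines inside the if-block are not shown in the diff, so they are a guess at the obvious implementation:

    size = MAXALIGN(size);
    Assert(ShmemSegHdr != NULL);

    SpinLockAcquire(ShmemLock);            /* held for a few instructions */
    newFree = ShmemSegHdr->freeoffset + size;
    if (newFree <= ShmemSegHdr->totalsize)
    {
        newSpace = (void *) MAKE_PTR(ShmemSegHdr->freeoffset); /* guessed */
        ShmemSegHdr->freeoffset = newFree;                     /* guessed */
    }
    else
        newSpace = NULL;
    SpinLockRelease(ShmemLock);

    if (!newSpace)
        elog(NOTICE, "ShmemAlloc: out of memory");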
src/backend/storage/ipc/sinval.c

@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
-* $Header: /cvsroot/pgsql/src/backend/storage/ipc/sinval.c,v 1.40 2001/08/26 16:56:00 tgl Exp $
+* $Header: /cvsroot/pgsql/src/backend/storage/ipc/sinval.c,v 1.41 2001/09/29 04:02:24 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -23,8 +23,6 @@
#include "miscadmin.h"


-SPINLOCK SInvalLock = (SPINLOCK) NULL;
-
/****************************************************************************/
/* CreateSharedInvalidationState() Initialize SI buffer */
/* */
@@ -33,7 +31,7 @@ SPINLOCK SInvalLock = (SPINLOCK) NULL;
void
CreateSharedInvalidationState(int maxBackends)
{
-/* SInvalLock must be initialized already, during spinlock init */
+/* SInvalLock must be initialized already, during LWLock init */
SIBufferInit(maxBackends);
}

@@ -46,9 +44,9 @@ InitBackendSharedInvalidationState(void)
{
int flag;

-SpinAcquire(SInvalLock);
+LWLockAcquire(SInvalLock, LW_EXCLUSIVE);
flag = SIBackendInit(shmInvalBuffer);
-SpinRelease(SInvalLock);
+LWLockRelease(SInvalLock);
if (flag < 0) /* unexpected problem */
elog(FATAL, "Backend cache invalidation initialization failed");
if (flag == 0) /* expected problem: MaxBackends exceeded */
@@ -64,9 +62,9 @@ SendSharedInvalidMessage(SharedInvalidationMessage *msg)
{
bool insertOK;

-SpinAcquire(SInvalLock);
+LWLockAcquire(SInvalLock, LW_EXCLUSIVE);
insertOK = SIInsertDataEntry(shmInvalBuffer, msg);
-SpinRelease(SInvalLock);
+LWLockRelease(SInvalLock);
if (!insertOK)
elog(DEBUG, "SendSharedInvalidMessage: SI buffer overflow");
}
@@ -86,9 +84,25 @@ ReceiveSharedInvalidMessages(

for (;;)
{
-SpinAcquire(SInvalLock);
+/*
+* We can run SIGetDataEntry in parallel with other backends running
+* SIGetDataEntry for themselves, since each instance will modify
+* only fields of its own backend's ProcState, and no instance will
+* look at fields of other backends' ProcStates. We express this
+* by grabbing SInvalLock in shared mode. Note that this is not
+* exactly the normal (read-only) interpretation of a shared lock!
+* Look closely at the interactions before allowing SInvalLock to
+* be grabbed in shared mode for any other reason!
+*
+* The routines later in this file that use shared mode are okay
+* with this, because they aren't looking at the ProcState fields
+* associated with SI message transfer; they only use the ProcState
+* array as an easy way to find all the PROC structures.
+*/
+LWLockAcquire(SInvalLock, LW_SHARED);
getResult = SIGetDataEntry(shmInvalBuffer, MyBackendId, &data);
-SpinRelease(SInvalLock);
+LWLockRelease(SInvalLock);

if (getResult == 0)
break; /* nothing more to do */
if (getResult < 0)
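This is the one place the commit leans on LW_SHARED for something other than plain read-sharing: concurrent SIGetDataEntry calls may each write, but only to their own backend's slot. The resulting two-tier pattern, condensed from the hunks in this file:

    /* Consumers: shared lock suffices, since each backend touches only
     * its own ProcState slot while reading the common message queue. */
    LWLockAcquire(SInvalLock, LW_SHARED);
    getResult = SIGetDataEntry(shmInvalBuffer, MyBackendId, &data);
    LWLockRelease(SInvalLock);

    /* Producers and queue pruning still take the lock exclusively. */
    LWLockAcquire(SInvalLock, LW_EXCLUSIVE);
    SIDelExpiredDataEntries(shmInvalBuffer);
    LWLockRelease(SInvalLock);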

@@ -108,9 +122,9 @@ ReceiveSharedInvalidMessages(
/* If we got any messages, try to release dead messages */
if (gotMessage)
{
-SpinAcquire(SInvalLock);
+LWLockAcquire(SInvalLock, LW_EXCLUSIVE);
SIDelExpiredDataEntries(shmInvalBuffer);
-SpinRelease(SInvalLock);
+LWLockRelease(SInvalLock);
}
}

@@ -149,7 +163,7 @@ DatabaseHasActiveBackends(Oid databaseId, bool ignoreMyself)
ProcState *stateP = segP->procState;
int index;

-SpinAcquire(SInvalLock);
+LWLockAcquire(SInvalLock, LW_SHARED);

for (index = 0; index < segP->lastBackend; index++)
{
@@ -170,7 +184,7 @@ DatabaseHasActiveBackends(Oid databaseId, bool ignoreMyself)
}
}

-SpinRelease(SInvalLock);
+LWLockRelease(SInvalLock);

return result;
}
@@ -186,7 +200,7 @@ TransactionIdIsInProgress(TransactionId xid)
ProcState *stateP = segP->procState;
int index;

-SpinAcquire(SInvalLock);
+LWLockAcquire(SInvalLock, LW_SHARED);

for (index = 0; index < segP->lastBackend; index++)
{
@@ -206,7 +220,7 @@ TransactionIdIsInProgress(TransactionId xid)
}
}

-SpinRelease(SInvalLock);
+LWLockRelease(SInvalLock);

return result;
}
@@ -237,7 +251,7 @@ GetOldestXmin(bool allDbs)

result = GetCurrentTransactionId();

-SpinAcquire(SInvalLock);
+LWLockAcquire(SInvalLock, LW_SHARED);

for (index = 0; index < segP->lastBackend; index++)
{
@@ -265,7 +279,7 @@ GetOldestXmin(bool allDbs)
}
}

-SpinRelease(SInvalLock);
+LWLockRelease(SInvalLock);

return result;
}
@@ -298,7 +312,7 @@ GetSnapshotData(bool serializable)

snapshot->xmin = GetCurrentTransactionId();

-SpinAcquire(SInvalLock);
+LWLockAcquire(SInvalLock, LW_SHARED);

/*
* There can be no more than lastBackend active transactions, so this
@@ -307,15 +321,12 @@ GetSnapshotData(bool serializable)
snapshot->xip = (TransactionId *)
malloc(segP->lastBackend * sizeof(TransactionId));
if (snapshot->xip == NULL)
-{
-SpinRelease(SInvalLock);
elog(ERROR, "Memory exhausted in GetSnapshotData");
-}

/*--------------------
* Unfortunately, we have to call ReadNewTransactionId() after acquiring
* SInvalLock above. It's not good because ReadNewTransactionId() does
-* SpinAcquire(XidGenLockId), but *necessary*. We need to be sure that
+* LWLockAcquire(XidGenLock), but *necessary*. We need to be sure that
* no transactions exit the set of currently-running transactions
* between the time we fetch xmax and the time we finish building our
* snapshot. Otherwise we could have a situation like this:
@@ -373,7 +384,7 @@ GetSnapshotData(bool serializable)
if (serializable)
MyProc->xmin = snapshot->xmin;

-SpinRelease(SInvalLock);
+LWLockRelease(SInvalLock);

/* Serializable snapshot must be computed before any other... */
Assert(TransactionIdIsValid(MyProc->xmin));
@@ -439,7 +450,7 @@ GetUndoRecPtr(void)
XLogRecPtr tempr;
int index;

-SpinAcquire(SInvalLock);
+LWLockAcquire(SInvalLock, LW_SHARED);

for (index = 0; index < segP->lastBackend; index++)
{
@@ -458,7 +469,7 @@ GetUndoRecPtr(void)
}
}

-SpinRelease(SInvalLock);
+LWLockRelease(SInvalLock);

return (urec);
}
@@ -470,7 +481,7 @@ GetUndoRecPtr(void)
* knows that the backend isn't going to go away, so we do not bother with
* locking.
*/
-struct proc *
+struct PROC *
BackendIdGetProc(BackendId procId)
{
SISeg *segP = shmInvalBuffer;
@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/ipc/sinvaladt.c,v 1.40 2001/06/19 19:42:15 tgl Exp $
* $Header: /cvsroot/pgsql/src/backend/storage/ipc/sinvaladt.c,v 1.41 2001/09/29 04:02:24 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -83,7 +83,7 @@ SIBufferInit(int maxBackends)
* <0 Some other failure (not currently used)
*
* NB: this routine, and all following ones, must be executed with the
* SInvalLock spinlock held, since there may be multiple backends trying
* SInvalLock lock held, since there may be multiple backends trying
* to access the buffer.
*/
int
@@ -152,7 +152,7 @@ CleanupInvalidationState(int status, Datum arg)

Assert(PointerIsValid(segP));

SpinAcquire(SInvalLock);
LWLockAcquire(SInvalLock, LW_EXCLUSIVE);

/* Mark myself inactive */
segP->procState[MyBackendId - 1].nextMsgNum = -1;
@@ -167,7 +167,7 @@ CleanupInvalidationState(int status, Datum arg)
}
segP->lastBackend = i;

SpinRelease(SInvalLock);
LWLockRelease(SInvalLock);
}

/*
@@ -267,6 +267,10 @@ SISetProcStateInvalid(SISeg *segP)
* 1: next SI message has been extracted into *data
* (there may be more messages available after this one!)
* -1: SI reset message extracted
*
* NB: this can run in parallel with other instances of SIGetDataEntry
* executing on behalf of other backends. See comments in sinval.c in
* ReceiveSharedInvalidMessages().
*/
int
SIGetDataEntry(SISeg *segP, int backendId,

@@ -4,7 +4,7 @@
# Makefile for storage/lmgr
#
# IDENTIFICATION
# $Header: /cvsroot/pgsql/src/backend/storage/lmgr/Makefile,v 1.16 2001/09/27 19:10:02 tgl Exp $
# $Header: /cvsroot/pgsql/src/backend/storage/lmgr/Makefile,v 1.17 2001/09/29 04:02:24 tgl Exp $
#
#-------------------------------------------------------------------------

@@ -12,7 +12,7 @@ subdir = src/backend/storage/lmgr
top_builddir = ../../../..
include $(top_builddir)/src/Makefile.global

OBJS = lmgr.o lock.o proc.o deadlock.o spin.o s_lock.o
OBJS = lmgr.o lock.o proc.o deadlock.o lwlock.o spin.o s_lock.o

all: SUBSYS.o

@@ -1,4 +1,49 @@
$Header: /cvsroot/pgsql/src/backend/storage/lmgr/README,v 1.8 2001/01/26 18:23:12 tgl Exp $
$Header: /cvsroot/pgsql/src/backend/storage/lmgr/README,v 1.9 2001/09/29 04:02:24 tgl Exp $


LOCKING OVERVIEW

Postgres uses three types of interprocess locks:

* Spinlocks. These are intended for *very* short-term locks. If a lock
is to be held more than a few dozen instructions, or across any sort of
kernel call (or even a call to a nontrivial subroutine), don't use a spinlock.
Spinlocks are primarily used as infrastructure for lightweight locks.
They are implemented using a hardware atomic-test-and-set instruction,
if available. Waiting processes busy-loop until they can get the lock.
There is no provision for deadlock detection, automatic release on error,
or any other nicety. There is a timeout if the lock cannot be gotten after
a minute or so (which is approximately forever in comparison to the intended
lock hold time, so this is certainly an error condition).
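
As a concrete illustration of the intended pattern (a minimal sketch, not
code from this tree; the shared counter is an invented example field):

	volatile slock_t *mutex = ...;	/* set up once with SpinLockInit() */

	SpinLockAcquire(mutex);
	shared->counter++;		/* a few instructions, no kernel calls */
	SpinLockRelease(mutex);

Everything between the acquire and the release is the entire critical
section, and it must stay that short.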

* Lightweight locks (LWLocks). These locks are typically used to interlock
access to datastructures in shared memory. LWLocks support both exclusive
and shared lock modes (for read/write and read-only access to a shared object).
There is no provision for deadlock detection, but the LWLock manager will
automatically release held LWLocks during elog() recovery, so it is safe to
raise an error while holding LWLocks. Obtaining or releasing an LWLock is
quite fast (a few dozen instructions) when there is no contention for the
lock. When a process has to wait for an LWLock, it blocks on a SysV semaphore
so as to not consume CPU time. Waiting processes will be granted the lock
in arrival order. There is no timeout.
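
Typical usage looks like this (a sketch only; SomeSharedLock and the shared
structure are invented names, but LWLockAcquire/LWLockRelease and the two
lock modes are the real API):

	LWLockAcquire(SomeSharedLock, LW_SHARED);	/* LW_EXCLUSIVE to modify */
	value = shared->someField;			/* examine shared state */
	LWLockRelease(SomeSharedLock);

If elog(ERROR) is raised between the acquire and the release, the lock is
released automatically during error recovery, as noted above.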

* Regular locks (a/k/a heavyweight locks). The regular lock manager supports
a variety of lock modes with table-driven semantics, and it has full deadlock
detection and automatic release at transaction end. Regular locks should be
used for all user-driven lock requests.

Acquisition of either a spinlock or a lightweight lock causes query cancel
and die() interrupts to be held off until all such locks are released.
No such restriction exists for regular locks, however. Also note that we
can accept query cancel and die() interrupts while waiting for a regular
lock, but we will not accept them while waiting for spinlocks or LW locks.
It is therefore not a good idea to use LW locks when the wait time might
exceed a few seconds.
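
When even a short sleep is unacceptable, LWLockConditionalAcquire (see
lwlock.c below) can be used to fail immediately rather than wait; again a
sketch with an invented lock id:

	if (LWLockConditionalAcquire(SomeSharedLock, LW_EXCLUSIVE))
	{
		/* got the lock without blocking; do the work, then release */
		LWLockRelease(SomeSharedLock);
	}
	else
	{
		/* lock is busy; caller must get by without it */
	}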

The rest of this README file discusses the regular lock manager in detail.


LOCK DATA STRUCTURES

There are two fundamental lock structures: the per-lockable-object LOCK
struct, and the per-lock-holder HOLDER struct. A LOCK object exists

@@ -12,7 +12,7 @@
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/lmgr/deadlock.c,v 1.3 2001/03/22 03:59:46 momjian Exp $
* $Header: /cvsroot/pgsql/src/backend/storage/lmgr/deadlock.c,v 1.4 2001/09/29 04:02:24 tgl Exp $
*
* Interface:
*
@@ -172,8 +172,8 @@ InitDeadLockChecking(void)
*
* We must have already locked the master lock before being called.
* NOTE: although the lockctl structure appears to allow each lock
* table to have a different spinlock, all locks that can block had
* better use the same spinlock, else this code will not be adequately
* table to have a different LWLock, all locks that can block had
* better use the same LWLock, else this code will not be adequately
* interlocked!
*/
bool

@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/lmgr/lock.c,v 1.95 2001/09/27 16:29:12 tgl Exp $
* $Header: /cvsroot/pgsql/src/backend/storage/lmgr/lock.c,v 1.96 2001/09/29 04:02:24 tgl Exp $
*
* NOTES
* Outside modules can create a lock table and acquire/release
@@ -78,8 +78,8 @@ static char *lock_mode_names[] =
* TRACE_LOCK_TABLE -- trace locks on this table (oid) unconditionally
* DEBUG_DEADLOCKS -- currently dumps locks at untimely occasions ;)
*
* Furthermore, but in storage/ipc/spin.c:
* TRACE_SPINLOCKS -- trace spinlocks (pretty useless)
* Furthermore, but in storage/lmgr/lwlock.c:
* TRACE_LWLOCKS -- trace lightweight locks (pretty useless)
*
* Define LOCK_DEBUG at compile time to get all these enabled.
* --------
@@ -151,10 +151,6 @@ HOLDER_PRINT(const char *where, const HOLDER *holderP)
#endif /* not LOCK_DEBUG */


SPINLOCK LockMgrLock; /* in Shmem or created in
* CreateSpinlocks() */

/*
* These are to simplify/speed up some bit arithmetic.
*
@@ -230,12 +226,6 @@ LockMethodInit(LOCKMETHODTABLE *lockMethodTable,
/*
* LockMethodTableInit -- initialize a lock table structure
*
* Notes:
* (a) a lock table has four separate entries in the shmem index
* table. This is because every shared hash table and spinlock
* has its name stored in the shmem index at its creation. It
* is wasteful, in this case, but not much space is involved.
*
* NOTE: data structures allocated here are allocated permanently, using
* TopMemoryContext and shared memory. We don't ever release them anyway,
* and in normal multi-backend operation the lock table structures set up
@@ -277,9 +267,9 @@ LockMethodTableInit(char *tabName,
MemoryContextAlloc(TopMemoryContext, sizeof(LOCKMETHODTABLE));

/*
* find/acquire the spinlock for the table
* Lock the LWLock for the table (probably not necessary here)
*/
SpinAcquire(LockMgrLock);
LWLockAcquire(LockMgrLock, LW_EXCLUSIVE);

/*
* allocate a control structure from shared memory or attach to it if
@@ -356,7 +346,7 @@ LockMethodTableInit(char *tabName,
/* init ctl data structures */
LockMethodInit(lockMethodTable, conflictsP, prioP, numModes);

SpinRelease(LockMgrLock);
LWLockRelease(LockMgrLock);

pfree(shmemName);

@@ -464,7 +454,7 @@ LockAcquire(LOCKMETHOD lockmethod, LOCKTAG *locktag,
HTAB *holderTable;
bool found;
LOCK *lock;
SPINLOCK masterLock;
LWLockId masterLock;
LOCKMETHODTABLE *lockMethodTable;
int status;
int myHolding[MAX_LOCKMODES];
@@ -489,7 +479,7 @@ LockAcquire(LOCKMETHOD lockmethod, LOCKTAG *locktag,

masterLock = lockMethodTable->ctl->masterLock;

SpinAcquire(masterLock);
LWLockAcquire(masterLock, LW_EXCLUSIVE);

/*
* Find or create a lock with this tag
@@ -499,7 +489,7 @@ LockAcquire(LOCKMETHOD lockmethod, LOCKTAG *locktag,
HASH_ENTER, &found);
if (!lock)
{
SpinRelease(masterLock);
LWLockRelease(masterLock);
elog(FATAL, "LockAcquire: lock table %d is corrupted", lockmethod);
return FALSE;
}
@@ -544,7 +534,7 @@ LockAcquire(LOCKMETHOD lockmethod, LOCKTAG *locktag,
HASH_ENTER, &found);
if (!holder)
{
SpinRelease(masterLock);
LWLockRelease(masterLock);
elog(FATAL, "LockAcquire: holder table corrupted");
return FALSE;
}
@@ -617,7 +607,7 @@ LockAcquire(LOCKMETHOD lockmethod, LOCKTAG *locktag,
{
GrantLock(lock, holder, lockmode);
HOLDER_PRINT("LockAcquire: owning", holder);
SpinRelease(masterLock);
LWLockRelease(masterLock);
return TRUE;
}

@@ -630,7 +620,7 @@ LockAcquire(LOCKMETHOD lockmethod, LOCKTAG *locktag,
{
GrantLock(lock, holder, lockmode);
HOLDER_PRINT("LockAcquire: my other XID owning", holder);
SpinRelease(masterLock);
LWLockRelease(masterLock);
return TRUE;
}

@@ -677,7 +667,7 @@ LockAcquire(LOCKMETHOD lockmethod, LOCKTAG *locktag,
LOCK_PRINT("LockAcquire: conditional lock failed", lock, lockmode);
Assert((lock->nRequested > 0) && (lock->requested[lockmode] >= 0));
Assert(lock->nGranted <= lock->nRequested);
SpinRelease(masterLock);
LWLockRelease(masterLock);
return FALSE;
}

@@ -719,14 +709,14 @@ LockAcquire(LOCKMETHOD lockmethod, LOCKTAG *locktag,
HOLDER_PRINT("LockAcquire: INCONSISTENT", holder);
LOCK_PRINT("LockAcquire: INCONSISTENT", lock, lockmode);
/* Should we retry ? */
SpinRelease(masterLock);
LWLockRelease(masterLock);
return FALSE;
}
HOLDER_PRINT("LockAcquire: granted", holder);
LOCK_PRINT("LockAcquire: granted", lock, lockmode);
}

SpinRelease(masterLock);
LWLockRelease(masterLock);

return status == STATUS_OK;
}
@@ -879,7 +869,7 @@ GrantLock(LOCK *lock, HOLDER *holder, LOCKMODE lockmode)
* Caller must have set MyProc->heldLocks to reflect locks already held
* on the lockable object by this process (under all XIDs).
*
* The locktable spinlock must be held at entry.
* The locktable's masterLock must be held at entry.
*/
static int
WaitOnLock(LOCKMETHOD lockmethod, LOCKMODE lockmode,
@@ -925,7 +915,7 @@ WaitOnLock(LOCKMETHOD lockmethod, LOCKMODE lockmode,
* needed, will happen in xact cleanup (see above for motivation).
*/
LOCK_PRINT("WaitOnLock: aborting on lock", lock, lockmode);
SpinRelease(lockMethodTable->ctl->masterLock);
LWLockRelease(lockMethodTable->ctl->masterLock);
elog(ERROR, "deadlock detected");
/* not reached */
}
@@ -998,7 +988,7 @@ LockRelease(LOCKMETHOD lockmethod, LOCKTAG *locktag,
TransactionId xid, LOCKMODE lockmode)
{
LOCK *lock;
SPINLOCK masterLock;
LWLockId masterLock;
bool found;
LOCKMETHODTABLE *lockMethodTable;
HOLDER *holder;
@@ -1023,7 +1013,7 @@ LockRelease(LOCKMETHOD lockmethod, LOCKTAG *locktag,
}

masterLock = lockMethodTable->ctl->masterLock;
SpinAcquire(masterLock);
LWLockAcquire(masterLock, LW_EXCLUSIVE);

/*
* Find a lock with this tag
@@ -1038,14 +1028,14 @@ LockRelease(LOCKMETHOD lockmethod, LOCKTAG *locktag,
*/
if (!lock)
{
SpinRelease(masterLock);
LWLockRelease(masterLock);
elog(NOTICE, "LockRelease: locktable corrupted");
return FALSE;
}

if (!found)
{
SpinRelease(masterLock);
LWLockRelease(masterLock);
elog(NOTICE, "LockRelease: no such lock");
return FALSE;
}
@@ -1065,7 +1055,7 @@ LockRelease(LOCKMETHOD lockmethod, LOCKTAG *locktag,
HASH_FIND_SAVE, &found);
if (!holder || !found)
{
SpinRelease(masterLock);
LWLockRelease(masterLock);
#ifdef USER_LOCKS
if (!found && lockmethod == USER_LOCKMETHOD)
elog(NOTICE, "LockRelease: no lock with this tag");
@@ -1084,7 +1074,7 @@ LockRelease(LOCKMETHOD lockmethod, LOCKTAG *locktag,
{
HOLDER_PRINT("LockRelease: WRONGTYPE", holder);
Assert(holder->holding[lockmode] >= 0);
SpinRelease(masterLock);
LWLockRelease(masterLock);
elog(NOTICE, "LockRelease: you don't own a lock of type %s",
lock_mode_names[lockmode]);
return FALSE;
@@ -1139,7 +1129,7 @@ LockRelease(LOCKMETHOD lockmethod, LOCKTAG *locktag,
&found);
if (!lock || !found)
{
SpinRelease(masterLock);
LWLockRelease(masterLock);
elog(NOTICE, "LockRelease: remove lock, table corrupted");
return FALSE;
}
@@ -1167,7 +1157,7 @@ LockRelease(LOCKMETHOD lockmethod, LOCKTAG *locktag,
HASH_REMOVE_SAVED, &found);
if (!holder || !found)
{
SpinRelease(masterLock);
LWLockRelease(masterLock);
elog(NOTICE, "LockRelease: remove holder, table corrupted");
return FALSE;
}
@@ -1179,7 +1169,7 @@ LockRelease(LOCKMETHOD lockmethod, LOCKTAG *locktag,
if (wakeupNeeded)
ProcLockWakeup(lockMethodTable, lock);

SpinRelease(masterLock);
LWLockRelease(masterLock);
return TRUE;
}

@@ -1201,7 +1191,7 @@ LockReleaseAll(LOCKMETHOD lockmethod, PROC *proc,
SHM_QUEUE *procHolders = &(proc->procHolders);
HOLDER *holder;
HOLDER *nextHolder;
SPINLOCK masterLock;
LWLockId masterLock;
LOCKMETHODTABLE *lockMethodTable;
int i,
numLockModes;
@@ -1225,7 +1215,7 @@ LockReleaseAll(LOCKMETHOD lockmethod, PROC *proc,
numLockModes = lockMethodTable->ctl->numLockModes;
masterLock = lockMethodTable->ctl->masterLock;

SpinAcquire(masterLock);
LWLockAcquire(masterLock, LW_EXCLUSIVE);

holder = (HOLDER *) SHMQueueNext(procHolders, procHolders,
offsetof(HOLDER, procLink));
@@ -1321,7 +1311,7 @@ LockReleaseAll(LOCKMETHOD lockmethod, PROC *proc,
&found);
if (!holder || !found)
{
SpinRelease(masterLock);
LWLockRelease(masterLock);
elog(NOTICE, "LockReleaseAll: holder table corrupted");
return FALSE;
}
@@ -1340,7 +1330,7 @@ LockReleaseAll(LOCKMETHOD lockmethod, PROC *proc,
HASH_REMOVE, &found);
if (!lock || !found)
{
SpinRelease(masterLock);
LWLockRelease(masterLock);
elog(NOTICE, "LockReleaseAll: cannot remove lock from HTAB");
return FALSE;
}
@@ -1352,7 +1342,7 @@ next_item:
holder = nextHolder;
}

SpinRelease(masterLock);
LWLockRelease(masterLock);

#ifdef LOCK_DEBUG
if (lockmethod == USER_LOCKMETHOD ? Trace_userlocks : Trace_locks)

src/backend/storage/lmgr/lwlock.c (new file, 483 lines)
@@ -0,0 +1,483 @@
/*-------------------------------------------------------------------------
*
* lwlock.c
* Lightweight lock manager
*
* Lightweight locks are intended primarily to provide mutual exclusion of
* access to shared-memory data structures. Therefore, they offer both
* exclusive and shared lock modes (to support read/write and read-only
* access to a shared object). There are few other frammishes. User-level
* locking should be done with the full lock manager --- which depends on
* an LWLock to protect its shared state.
*
*
* Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/lmgr/lwlock.c,v 1.1 2001/09/29 04:02:24 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"

#include "access/clog.h"
#include "storage/lwlock.h"
#include "storage/proc.h"
#include "storage/spin.h"


typedef struct LWLock
{
slock_t mutex; /* Protects LWLock and queue of PROCs */
char exclusive; /* # of exclusive holders (0 or 1) */
int shared; /* # of shared holders (0..MaxBackends) */
PROC *head; /* head of list of waiting PROCs */
PROC *tail; /* tail of list of waiting PROCs */
/* tail is undefined when head is NULL */
} LWLock;

/*
* This points to the array of LWLocks in shared memory. Backends inherit
* the pointer by fork from the postmaster. LWLockIds are indexes into
* the array.
*/
static LWLock *LWLockArray = NULL;
/* shared counter for dynamic allocation of LWLockIds */
static int *LWLockCounter;


/*
* We use this structure to keep track of locked LWLocks for release
* during error recovery. The maximum size could be determined at runtime
* if necessary, but it seems unlikely that more than a few locks could
* ever be held simultaneously.
*/
#define MAX_SIMUL_LWLOCKS 100

static int num_held_lwlocks = 0;
static LWLockId held_lwlocks[MAX_SIMUL_LWLOCKS];


#ifdef LOCK_DEBUG
bool Trace_lwlocks = false;

inline static void
PRINT_LWDEBUG(const char *where, LWLockId lockid, const LWLock *lock)
{
if (Trace_lwlocks)
elog(DEBUG, "%s(%d): excl %d shared %d head %p",
where, (int) lockid,
(int) lock->exclusive, lock->shared, lock->head);
}

#else /* not LOCK_DEBUG */
#define PRINT_LWDEBUG(a,b,c)
#endif /* LOCK_DEBUG */


/*
* Compute number of LWLocks to allocate.
*/
int
NumLWLocks(void)
{
int numLocks;

/*
* Possibly this logic should be spread out among the affected modules,
* the same way that shmem space estimation is done. But for now,
* there are few enough users of LWLocks that we can get away with
* just keeping the knowledge here.
*/

/* Predefined LWLocks */
numLocks = (int) NumFixedLWLocks;

/* bufmgr.c needs two for each shared buffer */
numLocks += 2 * NBuffers;

/* clog.c needs one per CLOG buffer */
numLocks += NUM_CLOG_BUFFERS;

/* Perhaps create a few more for use by user-defined modules? */

return numLocks;
}


/*
* Compute shmem space needed for LWLocks.
*/
int
LWLockShmemSize(void)
{
int numLocks = NumLWLocks();
uint32 spaceLocks;

/* Allocate the LWLocks plus space for shared allocation counter. */
spaceLocks = numLocks * sizeof(LWLock) + 2 * sizeof(int);
spaceLocks = MAXALIGN(spaceLocks);

return (int) spaceLocks;
}


/*
* Allocate shmem space for LWLocks and initialize the locks.
*/
void
CreateLWLocks(void)
{
int numLocks = NumLWLocks();
uint32 spaceLocks = LWLockShmemSize();
LWLock *lock;
int id;

/* Allocate space */
LWLockArray = (LWLock *) ShmemAlloc(spaceLocks);

/*
* Initialize all LWLocks to "unlocked" state
*/
for (id = 0, lock = LWLockArray; id < numLocks; id++, lock++)
{
SpinLockInit(&lock->mutex);
lock->exclusive = 0;
lock->shared = 0;
lock->head = NULL;
lock->tail = NULL;
}

/*
* Initialize the dynamic-allocation counter at the end of the array
*/
LWLockCounter = (int *) lock;
LWLockCounter[0] = (int) NumFixedLWLocks;
LWLockCounter[1] = numLocks;
}


/*
* LWLockAssign - assign a dynamically-allocated LWLock number
*
* NB: we do not currently try to interlock this. Could perhaps use
* ShmemLock spinlock if there were any need to assign LWLockIds after
* shmem setup.
*/
LWLockId
LWLockAssign(void)
{
if (LWLockCounter[0] >= LWLockCounter[1])
elog(FATAL, "No more LWLockIds available");
return (LWLockId) (LWLockCounter[0]++);
}


/*
* LWLockAcquire - acquire a lightweight lock in the specified mode
*
* If the lock is not available, sleep until it is.
*
* Side effect: cancel/die interrupts are held off until lock release.
*/
void
LWLockAcquire(LWLockId lockid, LWLockMode mode)
{
LWLock *lock = LWLockArray + lockid;
bool mustwait;

PRINT_LWDEBUG("LWLockAcquire", lockid, lock);

/*
* Lock out cancel/die interrupts until we exit the code section
* protected by the LWLock. This ensures that interrupts will not
* interfere with manipulations of data structures in shared memory.
*/
HOLD_INTERRUPTS();

/* Acquire mutex. Time spent holding mutex should be short! */
SpinLockAcquire_NoHoldoff(&lock->mutex);

/* If I can get the lock, do so quickly. */
if (mode == LW_EXCLUSIVE)
{
if (lock->exclusive == 0 && lock->shared == 0)
{
lock->exclusive++;
mustwait = false;
}
else
mustwait = true;
}
else
{
/*
* If there is someone waiting (presumably for exclusive access),
* queue up behind him even though I could get the lock. This
* prevents a stream of read locks from starving a writer.
*/
if (lock->exclusive == 0 && lock->head == NULL)
{
lock->shared++;
mustwait = false;
}
else
mustwait = true;
}

if (mustwait)
{
/* Add myself to wait queue */
PROC *proc = MyProc;
int extraWaits = 0;

/*
* If we don't have a PROC structure, there's no way to wait.
* This should never occur, since MyProc should only be null
* during shared memory initialization.
*/
if (proc == NULL)
elog(FATAL, "LWLockAcquire: can't wait without a PROC structure");

proc->lwWaiting = true;
proc->lwExclusive = (mode == LW_EXCLUSIVE);
proc->lwWaitLink = NULL;
if (lock->head == NULL)
lock->head = proc;
else
lock->tail->lwWaitLink = proc;
lock->tail = proc;

/* Can release the mutex now */
SpinLockRelease_NoHoldoff(&lock->mutex);

/*
* Wait until awakened.
*
* Since we share the process wait semaphore with the regular lock
* manager and ProcWaitForSignal, and we may need to acquire an LWLock
* while one of those is pending, it is possible that we get awakened
* for a reason other than being granted the LWLock. If so, loop back
* and wait again. Once we've gotten the lock, re-increment the sema
* by the number of additional signals received, so that the lock
* manager or signal manager will see the received signal when it
* next waits.
*/
for (;;)
{
/* "false" means cannot accept cancel/die interrupt here. */
IpcSemaphoreLock(proc->sem.semId, proc->sem.semNum, false);
if (!proc->lwWaiting)
break;
extraWaits++;
}
/*
* The awakener already updated the lock struct's state, so we
* don't need to do anything more to it. Just need to fix the
* semaphore count.
*/
while (extraWaits-- > 0)
IpcSemaphoreUnlock(proc->sem.semId, proc->sem.semNum);
}
else
{
/* Got the lock without waiting */
SpinLockRelease_NoHoldoff(&lock->mutex);
}

/* Add lock to list of locks held by this backend */
Assert(num_held_lwlocks < MAX_SIMUL_LWLOCKS);
held_lwlocks[num_held_lwlocks++] = lockid;
}

/*
* LWLockConditionalAcquire - acquire a lightweight lock in the specified mode
*
* If the lock is not available, return FALSE with no side-effects.
*
* If successful, cancel/die interrupts are held off until lock release.
*/
bool
LWLockConditionalAcquire(LWLockId lockid, LWLockMode mode)
{
LWLock *lock = LWLockArray + lockid;
bool mustwait;

PRINT_LWDEBUG("LWLockConditionalAcquire", lockid, lock);

/*
* Lock out cancel/die interrupts until we exit the code section
* protected by the LWLock. This ensures that interrupts will not
* interfere with manipulations of data structures in shared memory.
*/
HOLD_INTERRUPTS();

/* Acquire mutex. Time spent holding mutex should be short! */
SpinLockAcquire_NoHoldoff(&lock->mutex);

/* If I can get the lock, do so quickly. */
if (mode == LW_EXCLUSIVE)
{
if (lock->exclusive == 0 && lock->shared == 0)
{
lock->exclusive++;
mustwait = false;
}
else
mustwait = true;
}
else
{
/*
* If there is someone waiting (presumably for exclusive access),
* queue up behind him even though I could get the lock. This
* prevents a stream of read locks from starving a writer.
*/
if (lock->exclusive == 0 && lock->head == NULL)
{
lock->shared++;
mustwait = false;
}
else
mustwait = true;
}

/* We are done updating shared state of the lock itself. */
SpinLockRelease_NoHoldoff(&lock->mutex);

if (mustwait)
{
/* Failed to get lock, so release interrupt holdoff */
RESUME_INTERRUPTS();
}
else
{
/* Add lock to list of locks held by this backend */
Assert(num_held_lwlocks < MAX_SIMUL_LWLOCKS);
held_lwlocks[num_held_lwlocks++] = lockid;
}

return !mustwait;
}

/*
* LWLockRelease - release a previously acquired lock
*/
void
LWLockRelease(LWLockId lockid)
{
LWLock *lock = LWLockArray + lockid;
PROC *head;
PROC *proc;
int i;

PRINT_LWDEBUG("LWLockRelease", lockid, lock);

/*
* Remove lock from list of locks held. Usually, but not always,
* it will be the latest-acquired lock; so search array backwards.
*/
for (i = num_held_lwlocks; --i >= 0; )
{
if (lockid == held_lwlocks[i])
break;
}
if (i < 0)
elog(ERROR, "LWLockRelease: lock %d is not held", (int) lockid);
num_held_lwlocks--;
for (; i < num_held_lwlocks; i++)
held_lwlocks[i] = held_lwlocks[i+1];

/* Acquire mutex. Time spent holding mutex should be short! */
SpinLockAcquire_NoHoldoff(&lock->mutex);

/* Release my hold on lock */
if (lock->exclusive > 0)
lock->exclusive--;
else
{
Assert(lock->shared > 0);
lock->shared--;
}

/*
* See if I need to awaken any waiters. If I released a non-last shared
* hold, there cannot be anything to do.
*/
head = lock->head;
if (head != NULL)
{
if (lock->exclusive == 0 && lock->shared == 0)
{
/*
* Remove the to-be-awakened PROCs from the queue, and update the
* lock state to show them as holding the lock.
*/
proc = head;
if (proc->lwExclusive)
{
lock->exclusive++;
}
else
{
lock->shared++;
while (proc->lwWaitLink != NULL &&
!proc->lwWaitLink->lwExclusive)
{
proc = proc->lwWaitLink;
lock->shared++;
}
}
/* proc is now the last PROC to be released */
lock->head = proc->lwWaitLink;
proc->lwWaitLink = NULL;
}
else
{
/* lock is still held, can't awaken anything */
head = NULL;
}
}

/* We are done updating shared state of the lock itself. */
SpinLockRelease_NoHoldoff(&lock->mutex);

/*
* Awaken any waiters I removed from the queue.
*/
while (head != NULL)
{
proc = head;
head = proc->lwWaitLink;
proc->lwWaitLink = NULL;
proc->lwWaiting = false;
IpcSemaphoreUnlock(proc->sem.semId, proc->sem.semNum);
}

/*
* Now okay to allow cancel/die interrupts.
*/
RESUME_INTERRUPTS();
}


/*
* LWLockReleaseAll - release all currently-held locks
*
* Used to clean up after elog(ERROR). An important difference between this
* function and retail LWLockRelease calls is that InterruptHoldoffCount is
* unchanged by this operation. This is necessary since InterruptHoldoffCount
* has been set to an appropriate level earlier in error recovery. We could
* decrement it below zero if we allow it to drop for each released lock!
*/
void
LWLockReleaseAll(void)
{
while (num_held_lwlocks > 0)
{
HOLD_INTERRUPTS(); /* match the upcoming RESUME_INTERRUPTS */

LWLockRelease(held_lwlocks[num_held_lwlocks-1]);
}
}

@@ -8,15 +8,11 @@
*
*
* IDENTIFICATION
* $Header: /cvsroot/pgsql/src/backend/storage/lmgr/proc.c,v 1.108 2001/09/21 17:06:12 tgl Exp $
* $Header: /cvsroot/pgsql/src/backend/storage/lmgr/proc.c,v 1.109 2001/09/29 04:02:24 tgl Exp $
*
*-------------------------------------------------------------------------
*/
/*
* Each postgres backend gets one of these. We'll use it to
* clean up after the process should the process suddenly die.
*
*
* Interface (a):
* ProcSleep(), ProcWakeup(),
* ProcQueueAlloc() -- create a shm queue for sleeping processes
@@ -75,27 +71,31 @@
#include "access/xact.h"
#include "storage/proc.h"
#include "storage/sinval.h"
#include "storage/spin.h"


int DeadlockTimeout = 1000;

/* --------------------
* Spin lock for manipulating the shared process data structure:
* ProcGlobal.... Adding an extra spin lock seemed like the smallest
* hack to get around reading and updating this structure in shared
* memory. -mer 17 July 1991
* --------------------
PROC *MyProc = NULL;

/*
* This spinlock protects the freelist of recycled PROC structures and the
* bitmap of free semaphores. We cannot use an LWLock because the LWLock
* manager depends on already having a PROC and a wait semaphore! But these
* structures are touched relatively infrequently (only at backend startup
* or shutdown) and not for very long, so a spinlock is okay.
*/
SPINLOCK ProcStructLock;
static slock_t *ProcStructLock = NULL;

static PROC_HDR *ProcGlobal = NULL;

PROC *MyProc = NULL;
static PROC *DummyProc = NULL;

static bool waitingForLock = false;
static bool waitingForSignal = false;

static void ProcKill(void);
static void DummyProcKill(void);
static void ProcGetNewSemIdAndNum(IpcSemaphoreId *semId, int *semNum);
static void ProcFreeSem(IpcSemaphoreId semId, int semNum);
static void ZeroProcSemaphore(PROC *proc);
@@ -128,9 +128,12 @@ InitProcGlobal(int maxBackends)
Size procGlobalSize;
bool found = false;

/* Compute size for ProcGlobal structure */
/*
* Compute size for ProcGlobal structure. Note we need one more sema
* besides those used for regular backends.
*/
Assert(maxBackends > 0);
semMapEntries = PROC_SEM_MAP_ENTRIES(maxBackends);
semMapEntries = PROC_SEM_MAP_ENTRIES(maxBackends+1);
procGlobalSize = sizeof(PROC_HDR) + (semMapEntries-1) * sizeof(SEM_MAP_ENTRY);

/* Create or attach to the ProcGlobal shared structure */
@@ -178,13 +181,26 @@ InitProcGlobal(int maxBackends)
false);
ProcGlobal->procSemMap[i].procSemId = semId;
}

/*
* Pre-allocate a PROC structure for dummy (checkpoint) processes,
* and reserve the last sema of the precreated semas for it.
*/
DummyProc = (PROC *) ShmemAlloc(sizeof(PROC));
DummyProc->pid = 0; /* marks DummyProc as not in use */
i = semMapEntries-1;
ProcGlobal->procSemMap[i].freeSemMap |= 1 << (PROC_NSEMS_PER_SET-1);
DummyProc->sem.semId = ProcGlobal->procSemMap[i].procSemId;
DummyProc->sem.semNum = PROC_NSEMS_PER_SET-1;

/* Create ProcStructLock spinlock, too */
ProcStructLock = (slock_t *) ShmemAlloc(sizeof(slock_t));
SpinLockInit(ProcStructLock);
}
}

/* ------------------------
* InitProc -- create a per-process data structure for this process
* used by the lock manager on semaphore queues.
* ------------------------
/*
* InitProcess -- create a per-process data structure for this backend
*/
void
InitProcess(void)
@@ -202,39 +218,27 @@ InitProcess(void)
elog(ERROR, "InitProcess: you already exist");

/*
* ProcStructLock protects the freelist of PROC entries and the map
* of free semaphores. Note that when we acquire it here, we do not
* have a PROC entry and so the ownership of the spinlock is not
* recorded anywhere; even if it was, until we register ProcKill as
* an on_shmem_exit callback, there is no exit hook that will cause
* owned spinlocks to be released. Upshot: during the first part of
* this routine, be careful to release the lock manually before any
* elog(), else you'll have a stuck spinlock to add to your woes.
* try to get a proc struct from the free list first
*/
SpinAcquire(ProcStructLock);
SpinLockAcquire(ProcStructLock);

/* try to get a proc struct from the free list first */
myOffset = ProcGlobal->freeProcs;

if (myOffset != INVALID_OFFSET)
{
MyProc = (PROC *) MAKE_PTR(myOffset);
ProcGlobal->freeProcs = MyProc->links.next;
SpinLockRelease(ProcStructLock);
}
else
{
/*
* have to allocate one. We can't use the normal shmem index
* table mechanism because the proc structure is stored by PID
* instead of by a global name (need to look it up by PID when we
* cleanup dead processes).
* have to allocate a new one.
*/
SpinLockRelease(ProcStructLock);
MyProc = (PROC *) ShmemAlloc(sizeof(PROC));
if (!MyProc)
{
SpinRelease(ProcStructLock);
elog(FATAL, "cannot create new proc: out of memory");
}
}

/*
@@ -246,39 +250,30 @@ InitProcess(void)
MyProc->errType = STATUS_OK;
MyProc->xid = InvalidTransactionId;
MyProc->xmin = InvalidTransactionId;
MyProc->logRec.xrecoff = 0;
MyProc->waitLock = NULL;
MyProc->waitHolder = NULL;
MyProc->pid = MyProcPid;
MyProc->databaseId = MyDatabaseId;
MyProc->logRec.xrecoff = 0;
MyProc->lwWaiting = false;
MyProc->lwExclusive = false;
MyProc->lwWaitLink = NULL;
MyProc->waitLock = NULL;
MyProc->waitHolder = NULL;
SHMQueueInit(&(MyProc->procHolders));
/*
* Zero out the spin lock counts and set the sLocks field for
* ProcStructLock to 1 as we have acquired this spinlock above but
* didn't record it since we didn't have MyProc until now.
*/
MemSet(MyProc->sLocks, 0, sizeof(MyProc->sLocks));
MyProc->sLocks[ProcStructLock] = 1;

/*
* Arrange to clean up at backend exit. Once we do this, owned
* spinlocks will be released on exit, and so we can be a lot less
* tense about errors.
* Arrange to clean up at backend exit.
*/
on_shmem_exit(ProcKill, 0);

/*
* Set up a wait-semaphore for the proc. (We rely on ProcKill to clean
* up if this fails.)
* up MyProc if this fails.)
*/
if (IsUnderPostmaster)
ProcGetNewSemIdAndNum(&MyProc->sem.semId, &MyProc->sem.semNum);

/* Done with freelist and sem map */
SpinRelease(ProcStructLock);

/*
* We might be reusing a semaphore that belongs to a dead backend.
* We might be reusing a semaphore that belonged to a failed process.
* So be careful and reinitialize its value here.
*/
if (MyProc->sem.semId >= 0)
@@ -291,6 +286,65 @@ InitProcess(void)
InitDeadLockChecking();
}

/*
* InitDummyProcess -- create a dummy per-process data structure
*
* This is called by checkpoint processes so that they will have a MyProc
* value that's real enough to let them wait for LWLocks. The PROC and
* sema that are assigned are the extra ones created during InitProcGlobal.
*/
void
InitDummyProcess(void)
{
/*
* ProcGlobal should be set by a previous call to InitProcGlobal
* (we inherit this by fork() from the postmaster).
*/
if (ProcGlobal == NULL || DummyProc == NULL)
elog(STOP, "InitDummyProcess: Proc Header uninitialized");

if (MyProc != NULL)
elog(ERROR, "InitDummyProcess: you already exist");

/*
* DummyProc should not presently be in use by anyone else
*/
if (DummyProc->pid != 0)
elog(FATAL, "InitDummyProcess: DummyProc is in use by PID %d",
DummyProc->pid);
MyProc = DummyProc;

/*
* Initialize all fields of MyProc, except MyProc->sem which was
* set up by InitProcGlobal.
*/
MyProc->pid = MyProcPid; /* marks DummyProc as in use by me */
SHMQueueElemInit(&(MyProc->links));
MyProc->errType = STATUS_OK;
MyProc->xid = InvalidTransactionId;
MyProc->xmin = InvalidTransactionId;
MyProc->databaseId = MyDatabaseId;
MyProc->logRec.xrecoff = 0;
MyProc->lwWaiting = false;
MyProc->lwExclusive = false;
MyProc->lwWaitLink = NULL;
MyProc->waitLock = NULL;
MyProc->waitHolder = NULL;
SHMQueueInit(&(MyProc->procHolders));

/*
* Arrange to clean up at process exit.
*/
on_shmem_exit(DummyProcKill, 0);

/*
* We might be reusing a semaphore that belonged to a failed process.
* So be careful and reinitialize its value here.
*/
if (MyProc->sem.semId >= 0)
ZeroProcSemaphore(MyProc);
}

/*
* Initialize the proc's wait-semaphore to count zero.
*/
@@ -330,10 +384,10 @@ LockWaitCancel(void)
disable_sigalrm_interrupt();

/* Unlink myself from the wait queue, if on it (might not be anymore!) */
LockLockTable();
LWLockAcquire(LockMgrLock, LW_EXCLUSIVE);
if (MyProc->links.next != INVALID_OFFSET)
RemoveFromWaitQueue(MyProc);
UnlockLockTable();
LWLockRelease(LockMgrLock);

/*
* Reset the proc wait semaphore to zero. This is necessary in the
@@ -381,15 +435,18 @@ ProcReleaseLocks(bool isCommit)

/*
* ProcKill() -- Destroy the per-proc data structure for
* this process. Release any of its held spin locks.
* this process. Release any of its held LW locks.
*/
static void
ProcKill(void)
{
Assert(MyProc != NULL);

/* Release any spinlocks I am holding */
ProcReleaseSpins(MyProc);
/* Release any LW locks I am holding */
LWLockReleaseAll();

/* Abort any buffer I/O in progress */
AbortBufferIO();

/* Get off any wait queue I might be on */
LockWaitCancel();
@@ -402,7 +459,7 @@ ProcKill(void)
LockReleaseAll(USER_LOCKMETHOD, MyProc, true, InvalidTransactionId);
#endif

SpinAcquire(ProcStructLock);
SpinLockAcquire(ProcStructLock);

/* Free up my wait semaphore, if I got one */
if (MyProc->sem.semId >= 0)
@@ -412,10 +469,35 @@ ProcKill(void)
MyProc->links.next = ProcGlobal->freeProcs;
ProcGlobal->freeProcs = MAKE_OFFSET(MyProc);

/* PROC struct isn't mine anymore; stop tracking spinlocks with it! */
/* PROC struct isn't mine anymore */
MyProc = NULL;

SpinRelease(ProcStructLock);
SpinLockRelease(ProcStructLock);
}

/*
* DummyProcKill() -- Cut-down version of ProcKill for dummy (checkpoint)
* processes. The PROC and sema are not released, only marked
* as not-in-use.
*/
static void
DummyProcKill(void)
{
Assert(MyProc != NULL && MyProc == DummyProc);

/* Release any LW locks I am holding */
LWLockReleaseAll();

/* Abort any buffer I/O in progress */
AbortBufferIO();

/* I can't be on regular lock queues, so needn't check */

/* Mark DummyProc no longer in use */
MyProc->pid = 0;

/* PROC struct isn't mine anymore */
MyProc = NULL;
}


@@ -464,13 +546,13 @@ ProcQueueInit(PROC_QUEUE *queue)
* Caller must have set MyProc->heldLocks to reflect locks already held
* on the lockable object by this process (under all XIDs).
*
* Locktable's spinlock must be held at entry, and will be held
* Locktable's masterLock must be held at entry, and will be held
* at exit.
*
* Result: STATUS_OK if we acquired the lock, STATUS_ERROR if not (deadlock).
*
* ASSUME: that no one will fiddle with the queue until after
* we release the spin lock.
* we release the masterLock.
*
* NOTES: The process queue is now a priority queue for locking.
*
@@ -484,7 +566,7 @@ ProcSleep(LOCKMETHODTABLE *lockMethodTable,
HOLDER *holder)
{
LOCKMETHODCTL *lockctl = lockMethodTable->ctl;
SPINLOCK spinlock = lockctl->masterLock;
LWLockId masterLock = lockctl->masterLock;
PROC_QUEUE *waitQueue = &(lock->waitProcs);
int myHeldLocks = MyProc->heldLocks;
bool early_deadlock = false;
@@ -595,14 +677,14 @@ ProcSleep(LOCKMETHODTABLE *lockMethodTable,
waitingForLock = true;

/*
* Release the locktable's spin lock.
* Release the locktable's masterLock.
*
* NOTE: this may also cause us to exit critical-section state, possibly
* allowing a cancel/die interrupt to be accepted. This is OK because
* we have recorded the fact that we are waiting for a lock, and so
* LockWaitCancel will clean up if cancel/die happens.
*/
SpinRelease(spinlock);
LWLockRelease(masterLock);

/*
* Set timer so we can wake up after awhile and check for a deadlock.
@@ -617,7 +699,7 @@ ProcSleep(LOCKMETHODTABLE *lockMethodTable,
elog(FATAL, "ProcSleep: Unable to set timer for process wakeup");

/*
* If someone wakes us between SpinRelease and IpcSemaphoreLock,
* If someone wakes us between LWLockRelease and IpcSemaphoreLock,
* IpcSemaphoreLock will not block. The wakeup is "saved" by the
* semaphore implementation. Note also that if HandleDeadLock is
* invoked but does not detect a deadlock, IpcSemaphoreLock() will
@@ -644,12 +726,9 @@ ProcSleep(LOCKMETHODTABLE *lockMethodTable,
waitingForLock = false;

/*
* Re-acquire the locktable's spin lock.
*
* We could accept a cancel/die interrupt here. That's OK because the
* lock is now registered as being held by this process.
* Re-acquire the locktable's masterLock.
*/
SpinAcquire(spinlock);
LWLockAcquire(masterLock, LW_EXCLUSIVE);

/*
* We don't have to do anything else, because the awaker did all the
@@ -674,7 +753,7 @@ ProcWakeup(PROC *proc, int errType)
{
PROC *retProc;

/* assume that spinlock has been acquired */
/* assume that masterLock has been acquired */

/* Proc should be sleeping ... */
if (proc->links.prev == INVALID_OFFSET ||
@@ -777,11 +856,11 @@ HandleDeadLock(SIGNAL_ARGS)
/*
* Acquire locktable lock. Note that the SIGALRM interrupt had better
* not be enabled anywhere that this process itself holds the
* locktable lock, else this will wait forever. Also note that this
* calls SpinAcquire which creates a critical section, so that this
* locktable lock, else this will wait forever. Also note that
* LWLockAcquire creates a critical section, so that this
* routine cannot be interrupted by cancel/die interrupts.
*/
LockLockTable();
LWLockAcquire(LockMgrLock, LW_EXCLUSIVE);

/*
* Check to see if we've been awoken by anyone in the interim.
@@ -799,7 +878,7 @@ HandleDeadLock(SIGNAL_ARGS)
if (MyProc->links.prev == INVALID_OFFSET ||
MyProc->links.next == INVALID_OFFSET)
{
UnlockLockTable();
LWLockRelease(LockMgrLock);
errno = save_errno;
return;
}
@@ -812,7 +891,7 @@ HandleDeadLock(SIGNAL_ARGS)
if (!DeadLockCheck(MyProc))
{
/* No deadlock, so keep waiting */
UnlockLockTable();
LWLockRelease(LockMgrLock);
errno = save_errno;
return;
}
@@ -846,30 +925,10 @@ HandleDeadLock(SIGNAL_ARGS)
* wakable because we're not in front of them anymore. However,
* RemoveFromWaitQueue took care of waking up any such processes.
*/
UnlockLockTable();
LWLockRelease(LockMgrLock);
errno = save_errno;
}

void
ProcReleaseSpins(PROC *proc)
{
int i;

if (!proc)
proc = MyProc;

if (!proc)
return;
for (i = 0; i < (int) MAX_SPINS; i++)
{
if (proc->sLocks[i])
{
Assert(proc->sLocks[i] == 1);
SpinRelease(i);
}
}
AbortBufferIO();
}

/*
* ProcWaitForSignal - wait for a signal from another backend.
@@ -994,10 +1053,7 @@ ProcGetNewSemIdAndNum(IpcSemaphoreId *semId, int *semNum)
SEM_MAP_ENTRY *procSemMap = ProcGlobal->procSemMap;
int32 fullmask = (1 << PROC_NSEMS_PER_SET) - 1;

/*
* we hold ProcStructLock when entering this routine. We scan through
* the bitmap to look for a free semaphore.
*/
SpinLockAcquire(ProcStructLock);

for (i = 0; i < semMapEntries; i++)
{
@@ -1018,12 +1074,17 @@ ProcGetNewSemIdAndNum(IpcSemaphoreId *semId, int *semNum)

*semId = procSemMap[i].procSemId;
*semNum = j;

SpinLockRelease(ProcStructLock);

return;
}
mask <<= 1;
}
}

SpinLockRelease(ProcStructLock);

/*
* If we reach here, all the semaphores are in use. This is one of the
* possible places to detect "too many backends", so give the standard
@@ -1036,6 +1097,8 @@ ProcGetNewSemIdAndNum(IpcSemaphoreId *semId, int *semNum)
/*
* ProcFreeSem -
* free up our semaphore in the semaphore set.
*
* Caller is assumed to hold ProcStructLock.
*/
static void
ProcFreeSem(IpcSemaphoreId semId, int semNum)
@@ -1054,6 +1117,7 @@ ProcFreeSem(IpcSemaphoreId semId, int semNum)
return;
}
}
/* can't elog here!!! */
fprintf(stderr, "ProcFreeSem: no ProcGlobal entry for semId %d\n", semId);
}

@@ -1,14 +1,15 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* s_lock.c
|
||||
* Spinlock support routines
|
||||
* Hardware-dependent implementation of spinlocks.
|
||||
*
|
||||
*
|
||||
* Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $Header: /cvsroot/pgsql/src/backend/storage/lmgr/s_lock.c,v 1.1 2001/09/27 19:10:02 tgl Exp $
|
||||
* $Header: /cvsroot/pgsql/src/backend/storage/lmgr/s_lock.c,v 1.2 2001/09/29 04:02:25 tgl Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@@ -17,49 +18,14 @@
|
||||
#include <sys/time.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "miscadmin.h"
|
||||
#include "storage/s_lock.h"
|
||||
|
||||
|
||||
/*----------
|
||||
* Each time we busy spin we select the next element of this array as the
|
||||
* number of microseconds to wait. This accomplishes pseudo random back-off.
|
||||
*
|
||||
* Note that on most platforms, specified values will be rounded up to the
|
||||
* next multiple of a clock tick, which is often ten milliseconds (10000).
|
||||
* So, we are being way overoptimistic to assume that these different values
|
||||
* are really different, other than the last. But there are a few platforms
|
||||
* with better-than-usual timekeeping, and on these we will get pretty good
|
||||
* pseudo-random behavior.
|
||||
*
|
||||
* Total time to cycle through all 20 entries will be at least 100 msec,
|
||||
* more commonly (10 msec resolution) 220 msec, and on some platforms
|
||||
* as much as 420 msec (when the remainder of the current tick cycle is
|
||||
* ignored in deciding when to time out, as on FreeBSD and older Linuxen).
|
||||
* We use the 100msec figure to figure max_spins, so actual timeouts may
|
||||
* be as much as four times the nominal value, but will never be less.
|
||||
*----------
|
||||
*/
|
||||
#define S_NSPINCYCLE 20
|
||||
|
||||
int s_spincycle[S_NSPINCYCLE] =
|
||||
{1, 10, 100, 1000,
|
||||
10000, 1000, 1000, 1000,
|
||||
10000, 1000, 1000, 10000,
|
||||
1000, 1000, 10000, 1000,
|
||||
10000, 1000, 10000, 30000
|
||||
};
|
||||
|
||||
#define AVG_SPINCYCLE 5000 /* average entry in microsec: 100ms / 20 */
|
||||
|
||||
#define DEFAULT_TIMEOUT (100*1000000) /* default timeout: 100 sec */
|
||||
|
||||
|
||||
/*
|
||||
* s_lock_stuck() - complain about a stuck spinlock
|
||||
*/
|
||||
static void
|
||||
s_lock_stuck(volatile slock_t *lock, const char *file, const int line)
|
||||
s_lock_stuck(volatile slock_t *lock, const char *file, int line)
|
||||
{
|
||||
fprintf(stderr,
|
||||
"\nFATAL: s_lock(%p) at %s:%d, stuck spinlock. Aborting.\n",
|
||||
@@ -72,69 +38,41 @@ s_lock_stuck(volatile slock_t *lock, const char *file, const int line)
|
||||
|
||||
|
||||
/*
|
||||
* s_lock_sleep() - sleep a pseudo-random amount of time, check for timeout
|
||||
*
|
||||
* The 'timeout' is given in microsec, or may be 0 for "infinity". Note that
|
||||
* this will be a lower bound (a fairly loose lower bound, on most platforms).
|
||||
*
|
||||
* 'microsec' is the number of microsec to delay per loop. Normally
|
||||
* 'microsec' is 0, specifying to use the next s_spincycle[] value.
|
||||
* Some callers may pass a nonzero interval, specifying to use exactly that
|
||||
* delay value rather than a pseudo-random delay.
|
||||
* s_lock(lock) - platform-independent portion of waiting for a spinlock.
|
||||
*/
|
||||
void
|
||||
s_lock_sleep(unsigned spins, int timeout, int microsec,
|
||||
volatile slock_t *lock,
|
||||
const char *file, const int line)
|
||||
{
|
||||
struct timeval delay;
|
||||
|
||||
if (microsec > 0)
|
||||
{
|
||||
delay.tv_sec = microsec / 1000000;
|
||||
delay.tv_usec = microsec % 1000000;
|
||||
}
|
||||
else
|
||||
{
|
||||
delay.tv_sec = 0;
|
||||
delay.tv_usec = s_spincycle[spins % S_NSPINCYCLE];
|
||||
microsec = AVG_SPINCYCLE; /* use average to figure timeout */
|
||||
}
|
||||
|
||||
if (timeout > 0)
|
||||
{
|
||||
unsigned max_spins = timeout / microsec;
|
||||
|
||||
if (spins > max_spins)
|
||||
s_lock_stuck(lock, file, line);
|
||||
}
|
||||
|
||||
(void) select(0, NULL, NULL, NULL, &delay);
|
||||
}
|
||||
|
||||
|
||||
/*
 * s_lock(lock) - take a spinlock with backoff
 */
void
s_lock(volatile slock_t *lock, const char *file, const int line)
s_lock(volatile slock_t *lock, const char *file, int line)
{
	unsigned	spins = 0;
	unsigned	delays = 0;
	struct timeval delay;

	/*
	 * If you are thinking of changing this code, be careful.  This same
	 * loop logic is used in other places that call TAS() directly.
	 * We loop tightly for awhile, then delay using select() and try again.
	 * Preferably, "awhile" should be a small multiple of the maximum time
	 * we expect a spinlock to be held.  100 iterations seems about right.
	 *
	 * While waiting for a lock, we check for cancel/die interrupts (which is
	 * a no-op if we are inside a critical section).  The interrupt check
	 * can be omitted in places that know they are inside a critical
	 * section.  Note that an interrupt must NOT be accepted after
	 * acquiring the lock.
	 * We use a 10 millisec select delay because that is the lower limit on
	 * many platforms.  The timeout is figured on this delay only, and so the
	 * nominal 1 minute is a lower bound.
	 */
#define SPINS_PER_DELAY		100
#define DELAY_MSEC			10
#define TIMEOUT_MSEC		(60 * 1000)

	while (TAS(lock))
	{
		s_lock_sleep(spins++, DEFAULT_TIMEOUT, 0, lock, file, line);
		CHECK_FOR_INTERRUPTS();
		if (++spins > SPINS_PER_DELAY)
		{
			if (++delays > (TIMEOUT_MSEC / DELAY_MSEC))
				s_lock_stuck(lock, file, line);

			delay.tv_sec = 0;
			delay.tv_usec = DELAY_MSEC * 1000;
			(void) select(0, NULL, NULL, NULL, &delay);

			spins = 0;
		}
	}
}

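For context, the file/line arguments threaded through s_lock() and s_lock_stuck() come from the TAS macro layer; a minimal sketch of how the s_lock.h fast path presumably hands off to this backoff loop (shown only for illustration, the real macro lives in s_lock.h):

/* Sketch: try TAS() inline once; enter the backoff loop only on contention */
#define S_LOCK(lock) \
	do { \
		if (TAS(lock)) \
			s_lock((lock), __FILE__, __LINE__); \
	} while (0)
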
@@ -1,197 +1,45 @@
/*-------------------------------------------------------------------------
 *
 * spin.c
 *	  routines for managing spin locks
 *	  Hardware-independent implementation of spinlocks.
 *
 *
 * For machines that have test-and-set (TAS) instructions, s_lock.h/.c
 * define the spinlock implementation.  This file contains only a stub
 * implementation for spinlocks using SysV semaphores.  The semaphore method
 * is too slow to be very useful :-(
 *
 * POSTGRES has two kinds of locks: semaphores (which put the
 * process to sleep) and spinlocks (which are supposed to be
 * short term locks).  Spinlocks are implemented via test-and-set (TAS)
 * instructions if possible, else via semaphores.  The semaphore method
 * is too slow to be useful :-(
 *
 * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $Header: /cvsroot/pgsql/src/backend/storage/lmgr/spin.c,v 1.1 2001/09/27 19:10:02 tgl Exp $
 *	  $Header: /cvsroot/pgsql/src/backend/storage/lmgr/spin.c,v 1.2 2001/09/29 04:02:25 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <errno.h>
#if !defined(HAS_TEST_AND_SET) && defined(HAVE_SYS_SEM_H)
#ifdef HAVE_SYS_SEM_H
#include <sys/sem.h>
#endif

#include "miscadmin.h"
#include "storage/lwlock.h"
#include "storage/proc.h"
#include "storage/s_lock.h"

/* Probably should move these to an appropriate header file */
extern SPINLOCK BufMgrLock;
extern SPINLOCK OidGenLockId;
extern SPINLOCK XidGenLockId;
extern SPINLOCK ControlFileLockId;
extern SPINLOCK ShmemLock;
extern SPINLOCK ShmemIndexLock;
extern SPINLOCK LockMgrLock;
extern SPINLOCK SInvalLock;
extern SPINLOCK ProcStructLock;
extern SPINLOCK FreeSpaceLock;
#ifdef STABLE_MEMORY_STORAGE
extern SPINLOCK MMCacheLock;
#endif


/*
 * Initialize identifiers for permanent spinlocks during startup
 *
 * The same identifiers are used for both TAS and semaphore implementations,
 * although in one case they are indexes into a shmem array and in the other
 * they are semaphore numbers.
 */
static void
InitSpinLockIDs(void)
{
	BufMgrLock = (SPINLOCK) BUFMGRLOCKID;
	OidGenLockId = (SPINLOCK) OIDGENLOCKID;
	XidGenLockId = (SPINLOCK) XIDGENLOCKID;
	ControlFileLockId = (SPINLOCK) CNTLFILELOCKID;
	ShmemLock = (SPINLOCK) SHMEMLOCKID;
	ShmemIndexLock = (SPINLOCK) SHMEMINDEXLOCKID;
	LockMgrLock = (SPINLOCK) LOCKMGRLOCKID;
	SInvalLock = (SPINLOCK) SINVALLOCKID;
	ProcStructLock = (SPINLOCK) PROCSTRUCTLOCKID;
	FreeSpaceLock = (SPINLOCK) FREESPACELOCKID;
#ifdef STABLE_MEMORY_STORAGE
	MMCacheLock = (SPINLOCK) MMCACHELOCKID;
#endif
}
#include "storage/spin.h"

#ifdef HAS_TEST_AND_SET

/* real spin lock implementation */

typedef struct slock
{
	slock_t		shlock;
} SLock;

#ifdef LOCK_DEBUG
bool		Trace_spinlocks = false;

inline static void
PRINT_SLDEBUG(const char *where, SPINLOCK lockid, const SLock *lock)
{
	if (Trace_spinlocks)
		elog(DEBUG, "%s: id=%d", where, lockid);
}

#else							/* not LOCK_DEBUG */
#define PRINT_SLDEBUG(a,b,c)
#endif   /* not LOCK_DEBUG */


static SLock *SLockArray = NULL;

#define SLOCKMEMORYSIZE		((int) MAX_SPINS * sizeof(SLock))

/*
 * SLockShmemSize --- return shared-memory space needed
 */
int
SLockShmemSize(void)
{
	return MAXALIGN(SLOCKMEMORYSIZE);
}

/*
 * CreateSpinlocks --- create and initialize spinlocks during startup
 */
void
CreateSpinlocks(PGShmemHeader *seghdr)
CreateSpinlocks(void)
{
	int			id;

	/*
	 * We must allocate the space "by hand" because shmem.c isn't up yet
	 */
	SLockArray = (SLock *) (((char *) seghdr) + seghdr->freeoffset);
	seghdr->freeoffset += MAXALIGN(SLOCKMEMORYSIZE);
	Assert(seghdr->freeoffset <= seghdr->totalsize);

	/*
	 * Initialize all spinlocks to "unlocked" state
	 */
	for (id = 0; id < (int) MAX_SPINS; id++)
	{
		SLock	   *slckP = &(SLockArray[id]);

		S_INIT_LOCK(&(slckP->shlock));
	}

	/*
	 * Assign indexes for fixed spinlocks
	 */
	InitSpinLockIDs();
}

void
SpinAcquire(SPINLOCK lockid)
{
	SLock	   *slckP = &(SLockArray[lockid]);

	PRINT_SLDEBUG("SpinAcquire", lockid, slckP);

	/*
	 * Acquire the lock, then record that we have done so (for recovery in
	 * case of elog(ERROR) while holding the lock).  Note we assume here
	 * that S_LOCK will not accept cancel/die interrupts once it has
	 * acquired the lock.  However, interrupts should be accepted while
	 * waiting, if InterruptHoldoffCount is zero.
	 */
	S_LOCK(&(slckP->shlock));
	PROC_INCR_SLOCK(lockid);

	/*
	 * Lock out cancel/die interrupts until we exit the code section
	 * protected by the spinlock.  This ensures that interrupts will not
	 * interfere with manipulations of data structures in shared memory.
	 */
	HOLD_INTERRUPTS();

	PRINT_SLDEBUG("SpinAcquire/done", lockid, slckP);
}

void
SpinRelease(SPINLOCK lockid)
{
	SLock	   *slckP = &(SLockArray[lockid]);

	PRINT_SLDEBUG("SpinRelease", lockid, slckP);

	/*
	 * Check that we are actually holding the lock we are releasing. This
	 * can be done only after MyProc has been initialized.
	 */
	Assert(!MyProc || MyProc->sLocks[lockid] > 0);

	/*
	 * Record that we no longer hold the spinlock, and release it.
	 */
	PROC_DECR_SLOCK(lockid);
	S_UNLOCK(&(slckP->shlock));

	/*
	 * Exit the interrupt holdoff entered in SpinAcquire().
	 */
	RESUME_INTERRUPTS();

	PRINT_SLDEBUG("SpinRelease/done", lockid, slckP);
	/* no-op when we have TAS spinlocks */
}

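Taken together, the pair above gives callers a simple bracket around shared-memory work; a hedged sketch of the usage pattern (ShmemLock is one of the fixed SPINLOCK identifiers declared earlier in this file):

	/* Illustrative caller pattern only, not part of this commit */
	SpinAcquire(ShmemLock);		/* S_LOCK spin, then HOLD_INTERRUPTS() */
	/* ... manipulate shared data; MyProc->sLocks[] records ownership so
	 * an elog(ERROR) raised while holding the lock can be recovered ... */
	SpinRelease(ShmemLock);		/* S_UNLOCK, then RESUME_INTERRUPTS() */
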
#else							/* !HAS_TEST_AND_SET */
@@ -199,11 +47,7 @@ SpinRelease(SPINLOCK lockid)
/*
 * No TAS, so spinlocks are implemented using SysV semaphores.
 *
 * We support two slightly different APIs here: SpinAcquire/SpinRelease
 * work with SPINLOCK integer indexes for the permanent spinlocks, which
 * are all assumed to live in the first spinlock semaphore set.  There
 * is also an emulation of the s_lock.h TAS-spinlock macros; for that case,
 * typedef slock_t stores the semId and sem number of the sema to use.
 * Typedef slock_t stores the semId and sem number of the sema to use.
 * The semas needed are created by CreateSpinlocks and doled out by
 * s_init_lock_sema.
 *
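The slock_t the comment refers to is therefore not a simple flag in this build; a sketch of the layout it implies (the actual typedef lives in s_lock.h, so the field names here are only an assumption):

	/* Assumed shape of the semaphore-backed slock_t described above */
	typedef struct
	{
		IpcSemaphoreId	semId;	/* which SysV semaphore set */
		int				sem;	/* which semaphore within that set */
	} slock_t;
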
@@ -228,35 +72,26 @@ static int nextSpinLock = 0; /* next free spinlock index */

static void SpinFreeAllSemaphores(void);

/*
 * SLockShmemSize --- return shared-memory space needed
 */
int
SLockShmemSize(void)
{
	return 0;
}

/*
 * CreateSpinlocks --- create and initialize spinlocks during startup
 */
void
CreateSpinlocks(PGShmemHeader *seghdr)
CreateSpinlocks(void)
{
	int			i;

	if (SpinLockIds == NULL)
	{
		/*
		 * Compute number of spinlocks needed.  If this logic gets any
		 * more complicated, it should be distributed into the affected
		 * modules, similar to the way shmem space estimation is handled.
		 * Compute number of spinlocks needed.  It would be cleaner to
		 * distribute this logic into the affected modules,
		 * similar to the way shmem space estimation is handled.
		 *
		 * For now, though, we just need the fixed spinlocks (MAX_SPINS), two
		 * spinlocks per shared disk buffer, and four spinlocks for XLOG.
		 * For now, though, we just need a few spinlocks (10 should be
		 * plenty) plus one for each LWLock.
		 */
		numSpinLocks = (int) MAX_SPINS + 2 * NBuffers + 4;
		numSpinLocks = NumLWLocks() + 10;

		/* might as well round up to a multiple of SPINLOCKS_PER_SET */
		numSpinSets = (numSpinLocks - 1) / SPINLOCKS_PER_SET + 1;
@@ -288,14 +123,8 @@ CreateSpinlocks(PGShmemHeader *seghdr)
										   false);
	}

	/*
	 * Assign indexes for fixed spinlocks
	 */
	Assert(MAX_SPINS <= SPINLOCKS_PER_SET);
	InitSpinLockIDs();

	/* Init counter for allocating dynamic spinlocks */
	nextSpinLock = MAX_SPINS;
	nextSpinLock = 0;
}

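The rounding step above is plain ceiling division: if SPINLOCKS_PER_SET were 16 (a value picked purely for illustration) and NumLWLocks() + 10 came to 42, then numSpinSets = (42 - 1) / 16 + 1 = 3, i.e. three semaphore sets covering 48 slots for the 42 requested spinlocks.
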
/*
@@ -318,49 +147,6 @@ SpinFreeAllSemaphores(void)
	SpinLockIds = NULL;
}

/*
 * SpinAcquire -- grab a fixed spinlock
 *
 * FAILS if the semaphore is corrupted.
 */
void
SpinAcquire(SPINLOCK lock)
{
	/*
	 * See the TAS() version of this routine for primary commentary.
	 *
	 * NOTE we must pass interruptOK = false to IpcSemaphoreLock, to ensure
	 * that a cancel/die interrupt cannot prevent us from recording
	 * ownership of a lock we have just acquired.
	 */
	IpcSemaphoreLock(SpinLockIds[0], lock, false);
	PROC_INCR_SLOCK(lock);
	HOLD_INTERRUPTS();
}

/*
 * SpinRelease -- release a fixed spin lock
 *
 * FAILS if the semaphore is corrupted
 */
void
SpinRelease(SPINLOCK lock)
{
	/* See the TAS() version of this routine for commentary */
#ifdef USE_ASSERT_CHECKING
	/* Check it's locked */
	int			semval;

	semval = IpcSemaphoreGetValue(SpinLockIds[0], lock);
	Assert(semval < 1);
#endif
	Assert(!MyProc || MyProc->sLocks[lock] > 0);
	PROC_DECR_SLOCK(lock);
	IpcSemaphoreUnlock(SpinLockIds[0], lock);
	RESUME_INTERRUPTS();
}

/*
 * s_lock.h hardware-spinlock emulation
 */

@@ -11,17 +11,19 @@
 *
 *
 * IDENTIFICATION
 *	  $Header: /cvsroot/pgsql/src/backend/storage/smgr/Attic/mm.c,v 1.24 2001/06/27 23:31:39 tgl Exp $
 *	  $Header: /cvsroot/pgsql/src/backend/storage/smgr/Attic/mm.c,v 1.25 2001/09/29 04:02:25 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"
#include "miscadmin.h"

#ifdef STABLE_MEMORY_STORAGE

#include <math.h>

#include "storage/smgr.h"
#include "miscadmin.h"


#ifdef STABLE_MEMORY_STORAGE

/*
 * MMCacheTag -- Unique triplet for blocks stored by the main memory
@@ -71,8 +73,6 @@ typedef struct MMRelHashEntry
#define MMNBUFFERS		10
#define MMNRELATIONS	2

SPINLOCK	MMCacheLock;

static int *MMCurTop;
static int *MMCurRelno;
static MMCacheTag *MMBlockTags;
@@ -88,7 +88,7 @@ mminit()
	bool		found;
	HASHCTL		info;

	SpinAcquire(MMCacheLock);
	LWLockAcquire(MMCacheLock, LW_EXCLUSIVE);

	mmsize += MAXALIGN(BLCKSZ * MMNBUFFERS);
	mmsize += MAXALIGN(sizeof(*MMCurTop));
@@ -98,7 +98,7 @@ mminit()

	if (mmcacheblk == (char *) NULL)
	{
		SpinRelease(MMCacheLock);
		LWLockRelease(MMCacheLock);
		return SM_FAIL;
	}

@@ -112,7 +112,7 @@ mminit()

	if (MMCacheHT == (HTAB *) NULL)
	{
		SpinRelease(MMCacheLock);
		LWLockRelease(MMCacheLock);
		return SM_FAIL;
	}

@@ -126,18 +126,18 @@ mminit()

	if (MMRelCacheHT == (HTAB *) NULL)
	{
		SpinRelease(MMCacheLock);
		LWLockRelease(MMCacheLock);
		return SM_FAIL;
	}

	if (IsUnderPostmaster)		/* was IsPostmaster bjm */
	{
		MemSet(mmcacheblk, 0, mmsize);
		SpinRelease(MMCacheLock);
		LWLockRelease(MMCacheLock);
		return SM_SUCCESS;
	}

	SpinRelease(MMCacheLock);
	LWLockRelease(MMCacheLock);

	MMCurTop = (int *) mmcacheblk;
	mmcacheblk += sizeof(int);
@@ -163,11 +163,11 @@ mmcreate(Relation reln)
	bool		found;
	MMRelTag	tag;

	SpinAcquire(MMCacheLock);
	LWLockAcquire(MMCacheLock, LW_EXCLUSIVE);

	if (*MMCurRelno == MMNRELATIONS)
	{
		SpinRelease(MMCacheLock);
		LWLockRelease(MMCacheLock);
		return SM_FAIL;
	}

@@ -184,20 +184,20 @@ mmcreate(Relation reln)

	if (entry == (MMRelHashEntry *) NULL)
	{
		SpinRelease(MMCacheLock);
		LWLockRelease(MMCacheLock);
		elog(FATAL, "main memory storage mgr rel cache hash table corrupt");
	}

	if (found)
	{
		/* already exists */
		SpinRelease(MMCacheLock);
		LWLockRelease(MMCacheLock);
		return SM_FAIL;
	}

	entry->mmrhe_nblocks = 0;

	SpinRelease(MMCacheLock);
	LWLockRelease(MMCacheLock);

	return SM_SUCCESS;
}
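All the mm.c hunks that follow repeat the one mechanical substitution visible in mminit() and mmcreate() above: every SpinAcquire/SpinRelease pair on MMCacheLock becomes an LWLock pair, with each early-exit path releasing the lock first. Condensed to its skeleton (illustrative only; lookup_failed stands in for whatever failure condition each routine tests):

	LWLockAcquire(MMCacheLock, LW_EXCLUSIVE);	/* was SpinAcquire(MMCacheLock) */
	if (lookup_failed)							/* any early-exit path */
	{
		LWLockRelease(MMCacheLock);				/* was SpinRelease(MMCacheLock) */
		return SM_FAIL;
	}
	/* ... normal work on the main-memory cache ... */
	LWLockRelease(MMCacheLock);
	return SM_SUCCESS;
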
@@ -211,30 +211,24 @@ int
mmunlink(RelFileNode rnode)
{
	int			i;
	Oid			reldbid;
	MMHashEntry *entry;
	MMRelHashEntry *rentry;
	bool		found;
	MMRelTag	rtag;

	if (reln->rd_rel->relisshared)
		reldbid = (Oid) 0;
	else
		reldbid = MyDatabaseId;

	SpinAcquire(MMCacheLock);
	LWLockAcquire(MMCacheLock, LW_EXCLUSIVE);

	for (i = 0; i < MMNBUFFERS; i++)
	{
		if (MMBlockTags[i].mmct_dbid == reldbid
			&& MMBlockTags[i].mmct_relid == RelationGetRelid(reln))
		if (MMBlockTags[i].mmct_dbid == rnode.tblNode
			&& MMBlockTags[i].mmct_relid == rnode.relNode)
		{
			entry = (MMHashEntry *) hash_search(MMCacheHT,
												(char *) &MMBlockTags[i],
												HASH_REMOVE, &found);
			if (entry == (MMHashEntry *) NULL || !found)
			{
				SpinRelease(MMCacheLock);
				LWLockRelease(MMCacheLock);
				elog(FATAL, "mmunlink: cache hash table corrupted");
			}
			MMBlockTags[i].mmct_dbid = (Oid) 0;
@@ -242,21 +236,21 @@ mmunlink(RelFileNode rnode)
			MMBlockTags[i].mmct_blkno = (BlockNumber) 0;
		}
	}
	rtag.mmrt_dbid = reldbid;
	rtag.mmrt_relid = RelationGetRelid(reln);
	rtag.mmrt_dbid = rnode.tblNode;
	rtag.mmrt_relid = rnode.relNode;

	rentry = (MMRelHashEntry *) hash_search(MMRelCacheHT, (char *) &rtag,
											HASH_REMOVE, &found);

	if (rentry == (MMRelHashEntry *) NULL || !found)
	{
		SpinRelease(MMCacheLock);
		LWLockRelease(MMCacheLock);
		elog(FATAL, "mmunlink: rel cache hash table corrupted");
	}

	(*MMCurRelno)--;

	SpinRelease(MMCacheLock);
	LWLockRelease(MMCacheLock);
	return 1;
}

@@ -286,7 +280,7 @@ mmextend(Relation reln, BlockNumber blocknum, char *buffer)
	tag.mmct_dbid = rtag.mmrt_dbid = reldbid;
	tag.mmct_relid = rtag.mmrt_relid = RelationGetRelid(reln);

	SpinAcquire(MMCacheLock);
	LWLockAcquire(MMCacheLock, LW_EXCLUSIVE);

	if (*MMCurTop == MMNBUFFERS)
	{
@@ -298,7 +292,7 @@ mmextend(Relation reln, BlockNumber blocknum, char *buffer)
		}
		if (i == MMNBUFFERS)
		{
			SpinRelease(MMCacheLock);
			LWLockRelease(MMCacheLock);
			return SM_FAIL;
		}
	}
@@ -312,7 +306,7 @@ mmextend(Relation reln, BlockNumber blocknum, char *buffer)
									HASH_FIND, &found);
	if (rentry == (MMRelHashEntry *) NULL || !found)
	{
		SpinRelease(MMCacheLock);
		LWLockRelease(MMCacheLock);
		elog(FATAL, "mmextend: rel cache hash table corrupt");
	}

@@ -322,7 +316,7 @@ mmextend(Relation reln, BlockNumber blocknum, char *buffer)
								 HASH_ENTER, &found);
	if (entry == (MMHashEntry *) NULL || found)
	{
		SpinRelease(MMCacheLock);
		LWLockRelease(MMCacheLock);
		elog(FATAL, "mmextend: cache hash table corrupt");
	}

@@ -338,7 +332,7 @@ mmextend(Relation reln, BlockNumber blocknum, char *buffer)
	offset = (i * BLCKSZ);
	memmove(&(MMBlockCache[offset]), buffer, BLCKSZ);

	SpinRelease(MMCacheLock);
	LWLockRelease(MMCacheLock);

	return SM_SUCCESS;
}
@@ -386,20 +380,20 @@ mmread(Relation reln, BlockNumber blocknum, char *buffer)
	tag.mmct_relid = RelationGetRelid(reln);
	tag.mmct_blkno = blocknum;

	SpinAcquire(MMCacheLock);
	LWLockAcquire(MMCacheLock, LW_EXCLUSIVE);
	entry = (MMHashEntry *) hash_search(MMCacheHT, (char *) &tag,
										HASH_FIND, &found);

	if (entry == (MMHashEntry *) NULL)
	{
		SpinRelease(MMCacheLock);
		LWLockRelease(MMCacheLock);
		elog(FATAL, "mmread: hash table corrupt");
	}

	if (!found)
	{
		/* reading nonexistent pages is defined to fill them with zeroes */
		SpinRelease(MMCacheLock);
		LWLockRelease(MMCacheLock);
		MemSet(buffer, 0, BLCKSZ);
		return SM_SUCCESS;
	}
@@ -407,7 +401,7 @@ mmread(Relation reln, BlockNumber blocknum, char *buffer)
	offset = (entry->mmhe_bufno * BLCKSZ);
	memmove(buffer, &MMBlockCache[offset], BLCKSZ);

	SpinRelease(MMCacheLock);
	LWLockRelease(MMCacheLock);

	return SM_SUCCESS;
}
@@ -433,26 +427,26 @@ mmwrite(Relation reln, BlockNumber blocknum, char *buffer)
	tag.mmct_relid = RelationGetRelid(reln);
	tag.mmct_blkno = blocknum;

	SpinAcquire(MMCacheLock);
	LWLockAcquire(MMCacheLock, LW_EXCLUSIVE);
	entry = (MMHashEntry *) hash_search(MMCacheHT, (char *) &tag,
										HASH_FIND, &found);

	if (entry == (MMHashEntry *) NULL)
	{
		SpinRelease(MMCacheLock);
		LWLockRelease(MMCacheLock);
		elog(FATAL, "mmwrite: hash table corrupt");
	}

	if (!found)
	{
		SpinRelease(MMCacheLock);
		LWLockRelease(MMCacheLock);
		elog(FATAL, "mmwrite: hash table missing requested page");
	}

	offset = (entry->mmhe_bufno * BLCKSZ);
	memmove(&MMBlockCache[offset], buffer, BLCKSZ);

	SpinRelease(MMCacheLock);
	LWLockRelease(MMCacheLock);

	return SM_SUCCESS;
}
@@ -506,14 +500,14 @@ mmnblocks(Relation reln)

	rtag.mmrt_relid = RelationGetRelid(reln);

	SpinAcquire(MMCacheLock);
	LWLockAcquire(MMCacheLock, LW_EXCLUSIVE);

	rentry = (MMRelHashEntry *) hash_search(MMRelCacheHT, (char *) &rtag,
											HASH_FIND, &found);

	if (rentry == (MMRelHashEntry *) NULL)
	{
		SpinRelease(MMCacheLock);
		LWLockRelease(MMCacheLock);
		elog(FATAL, "mmnblocks: rel cache hash table corrupt");
	}

@@ -522,7 +516,7 @@ mmnblocks(Relation reln)
	else
		nblocks = InvalidBlockNumber;

	SpinRelease(MMCacheLock);
	LWLockRelease(MMCacheLock);

	return nblocks;
}
@@ -11,7 +11,7 @@
 *
 *
 * IDENTIFICATION
 *	  $Header: /cvsroot/pgsql/src/backend/storage/smgr/smgr.c,v 1.52 2001/07/02 20:50:46 tgl Exp $
 *	  $Header: /cvsroot/pgsql/src/backend/storage/smgr/smgr.c,v 1.53 2001/09/29 04:02:25 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -19,6 +19,7 @@

#include "storage/bufmgr.h"
#include "storage/freespace.h"
#include "storage/ipc.h"
#include "storage/smgr.h"
#include "utils/memutils.h"