Revise lock manager to support "session level" locks as well as "transaction level" locks.

A session lock is not released at transaction commit (but it is released on transaction abort, to ensure recovery after an elog(ERROR)). In VACUUM, use a session lock to protect the master table while vacuuming a TOAST table, so that the TOAST table can be done in an independent transaction. I also took this opportunity to do some cleanup and renaming in the lock code. The previously noted bug in ProcLockWakeup, that it couldn't wake up any waiters beyond the first non-wakeable waiter, is now fixed. Also found a previously unknown bug of the same kind (failure to scan all members of a lock queue in some cases) in DeadLockCheck. This might have led to failure to detect a deadlock condition, resulting in indefinite waits, but it's difficult to characterize the conditions required to trigger a failure.
2025-11-10 17:42:29 +03:00 · 2000-12-22 00:51:54 +00:00
parent b2145e9365
commit 6cc842abd3
11 changed files with 1030 additions and 972 deletions
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/transam/xact.c,v 1.89 2000/12/18 00:44:45 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/transam/xact.c,v 1.90 2000/12/22 00:51:53 tgl Exp $
 *
 * NOTES
 *		Transaction aborts can now occur two ways:
@@ -741,7 +741,7 @@ AtCommit_Locks(void)
 	 *	Then you're up a creek! -mer 5/24/92
 	 * ----------------
 	 */
-	ProcReleaseLocks();
+	ProcReleaseLocks(true);
 }

 /* --------------------------------
@@ -828,7 +828,7 @@ AtAbort_Locks(void)
 	 *	Then you're up a creek without a paddle! -mer
 	 * ----------------
 	 */
-	ProcReleaseLocks();
+	ProcReleaseLocks(false);
 }


--- a/src/backend/commands/vacuum.c
+++ b/src/backend/commands/vacuum.c
@@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/commands/vacuum.c,v 1.177 2000/12/08 06:43:44 inoue Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/commands/vacuum.c,v 1.178 2000/12/22 00:51:53 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -61,7 +61,7 @@ static void vacuum_init(void);
 static void vacuum_shutdown(void);
 static void vac_vacuum(NameData *VacRelP, bool analyze, List *anal_cols2);
 static VRelList getrels(NameData *VacRelP);
-static void vacuum_rel(Oid relid, bool is_toastrel);
+static void vacuum_rel(Oid relid);
 static void scan_heap(VRelStats *vacrelstats, Relation onerel, VacPageList vacuum_pages, VacPageList fraged_pages);
 static void repair_frag(VRelStats *vacrelstats, Relation onerel, VacPageList vacuum_pages, VacPageList fraged_pages, int nindices, Relation *Irel);
 static void vacuum_heap(VRelStats *vacrelstats, Relation onerel, VacPageList vacpagelist);
@@ -239,7 +239,7 @@ vac_vacuum(NameData *VacRelP, bool analyze, List *anal_cols2)
 	/* vacuum each heap relation */
 	for (cur = vrl; cur != (VRelList) NULL; cur = cur->vrl_next)
 	{
-		vacuum_rel(cur->vrl_relid, false);
+		vacuum_rel(cur->vrl_relid);
 		/* analyze separately so locking is minimized */
 		if (analyze)
 			analyze_rel(cur->vrl_relid, anal_cols2, MESSAGE_LEVEL);
@@ -308,7 +308,7 @@ getrels(NameData *VacRelP)

 		if (rkind != RELKIND_RELATION)
 		{
-			elog(NOTICE, "Vacuum: can not process indecies, views and certain system tables");
+			elog(NOTICE, "Vacuum: can not process indices, views and certain system tables");
 			continue;
 		}

@@ -342,23 +342,25 @@ getrels(NameData *VacRelP)
 *	vacuum_rel() -- vacuum one heap relation
 *
 *		This routine vacuums a single heap, cleans out its indices, and
- *		updates its statistics num_pages and num_tuples statistics.
+ *		updates its num_pages and num_tuples statistics.
 *
 *		Doing one heap at a time incurs extra overhead, since we need to
 *		check that the heap exists again just before we vacuum it.	The
 *		reason that we do this is so that vacuuming can be spread across
 *		many small transactions.  Otherwise, two-phase locking would require
 *		us to lock the entire database during one pass of the vacuum cleaner.
+ *
+ *		At entry and exit, we are not inside a transaction.
 */
 static void
-vacuum_rel(Oid relid, bool is_toastrel)
+vacuum_rel(Oid relid)
 {
 	Relation	onerel;
+	LockRelId	onerelid;
 	VacPageListData vacuum_pages; /* List of pages to vacuum and/or clean
-								 * indices */
+								   * indices */
 	VacPageListData fraged_pages; /* List of pages with space enough for
-								 * re-using */
-	VacPage    *vacpage;
+								   * re-using */
 	Relation   *Irel;
 	int32		nindices,
 				i;
@@ -366,8 +368,8 @@ vacuum_rel(Oid relid, bool is_toastrel)
 	bool		reindex = false;
 	Oid			toast_relid;

-	if (!is_toastrel)
-		StartTransactionCommand();
+	/* Begin a transaction for vacuuming this relation */
+	StartTransactionCommand();

 	/*
 	 * Check for user-requested abort.	Note we want this to be inside a
@@ -384,8 +386,7 @@ vacuum_rel(Oid relid, bool is_toastrel)
 							  ObjectIdGetDatum(relid),
 							  0, 0, 0))
 	{
-		if (!is_toastrel)
-			CommitTransactionCommand();
+		CommitTransactionCommand();
 		return;
 	}

@@ -403,13 +404,25 @@ vacuum_rel(Oid relid, bool is_toastrel)
 		elog(NOTICE, "Skipping \"%s\" --- only table owner can VACUUM it",
 			 RelationGetRelationName(onerel));
 		heap_close(onerel, AccessExclusiveLock);
-		if (!is_toastrel)
-			CommitTransactionCommand();
+		CommitTransactionCommand();
 		return;
 	}

 	/*
-	 * Remember the relation'ss TOAST relation for later
+	 * Get a session-level exclusive lock too.  This will protect our
+	 * exclusive access to the relation across multiple transactions,
+	 * so that we can vacuum the relation's TOAST table (if any) secure
+	 * in the knowledge that no one is diddling the parent relation.
+	 *
+	 * NOTE: this cannot block, even if someone else is waiting for access,
+	 * because the lock manager knows that both lock requests are from the
+	 * same process.
+	 */
+	onerelid = onerel->rd_lockInfo.lockRelId;
+	LockRelationForSession(&onerelid, AccessExclusiveLock);
+
+	/*
+	 * Remember the relation's TOAST relation for later
 	 */
 	toast_relid = onerel->rd_rel->reltoastrelid;

@@ -500,21 +513,6 @@ vacuum_rel(Oid relid, bool is_toastrel)
 	if (reindex)
 		activate_indexes_of_a_table(relid, true);

-	/*
-	 * ok - free vacuum_pages list of reaped pages
-	 *
-	 * Isn't this a waste of code?  Upcoming commit should free memory, no?
-	 */
-	if (vacuum_pages.num_pages > 0)
-	{
-		vacpage = vacuum_pages.pagedesc;
-		for (i = 0; i < vacuum_pages.num_pages; i++, vacpage++)
-			pfree(*vacpage);
-		pfree(vacuum_pages.pagedesc);
-		if (fraged_pages.num_pages > 0)
-			pfree(fraged_pages.pagedesc);
-	}
-
 	/* all done with this class, but hold lock until commit */
 	heap_close(onerel, NoLock);

@@ -523,19 +521,25 @@ vacuum_rel(Oid relid, bool is_toastrel)
 					vacrelstats->num_tuples, vacrelstats->hasindex,
 					vacrelstats);

+	/*
+	 * Complete the transaction and free all temporary memory used.
+	 */
+	CommitTransactionCommand();
+
 	/*
 	 * If the relation has a secondary toast one, vacuum that too
-	 * while we still hold the lock on the master table. We don't
-	 * need to propagate "analyze" to it, because the toaster
+	 * while we still hold the session lock on the master table.
+	 * We don't need to propagate "analyze" to it, because the toaster
 	 * always uses hardcoded index access and statistics are
 	 * totally unimportant for toast relations
 	 */
 	if (toast_relid != InvalidOid)
-		vacuum_rel(toast_relid, true);
+		vacuum_rel(toast_relid);

-	/* next command frees attribute stats */
-	if (!is_toastrel)
-		CommitTransactionCommand();
+	/*
+	 * Now release the session-level lock on the master table.
+	 */
+	UnlockRelationForSession(&onerelid, AccessExclusiveLock);
 }

 /*
@@ -1786,9 +1790,13 @@ failed to add item with len = %lu to page %u (free space %lu, nusd %u, noff %u)"
 	if (num_moved > 0)
 	{
 		/*
-		 * We have to commit our tuple' movings before we'll truncate
-		 * relation, but we shouldn't lose our locks. And so - quick hack:
-		 * record status of current transaction as committed, and continue.
+		 * We have to commit our tuple movings before we truncate the
+		 * relation.  Ideally we should do Commit/StartTransactionCommand
+		 * here, relying on the session-level table lock to protect our
+		 * exclusive access to the relation.  However, that would require
+		 * a lot of extra code to close and re-open the relation, indices,
+		 * etc.  For now, a quick hack: record status of current transaction
+		 * as committed, and continue.
 		 */
 		RecordTransactionCommit();
 	}
@@ -1852,7 +1860,7 @@ failed to add item with len = %lu to page %u (free space %lu, nusd %u, noff %u)"
 	/* 
 	 * Reflect the motion of system tuples to catalog cache here.
 	 */
-        CommandCounterIncrement();
+	CommandCounterIncrement();

 	if (Nvacpagelist.num_pages > 0)
 	{
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/storage/ipc/ipci.c,v 1.37 2000/12/03 17:18:10 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/storage/ipc/ipci.c,v 1.38 2000/12/22 00:51:54 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -84,7 +84,7 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int maxBackends)
 	 * Set up lock manager
 	 */
 	InitLocks();
-	if (InitLockTable() == INVALID_TABLEID)
+	if (InitLockTable(maxBackends) == INVALID_TABLEID)
 		elog(FATAL, "Couldn't create the lock table");

 	/*
--- a/src/backend/storage/lmgr/README
+++ b/src/backend/storage/lmgr/README
@@ -1,8 +1,15 @@
-$Header: /cvsroot/pgsql/src/backend/storage/lmgr/README,v 1.3 1998/07/06 18:16:07 momjian Exp $
+$Header: /cvsroot/pgsql/src/backend/storage/lmgr/README,v 1.4 2000/12/22 00:51:54 tgl Exp $

-There are two fundemental lock structures.  Lock methods describe the
-locking behavior.  We currently only support multi-level locking.  Lock
-modes describe the mode of the lock(read/write or shared/exclusive). 
+There are two fundamental lock structures: the per-lockable-object LOCK
+struct, and the per-lock-holder HOLDER struct.  A LOCK object exists
+for each lockable object that currently has locks held or requested on it.
+A HOLDER struct exists for each transaction that is holding or requesting
+lock(s) on each LOCK object.
+
+Lock methods describe the overall locking behavior.  Currently there are
+two lock methods: DEFAULT and USER.  (USER locks are non-blocking.)
+
+Lock modes describe the type of the lock (read/write or shared/exclusive). 
 See src/tools/backend/index.html and src/include/storage/lock.h for more
 details.

@@ -12,10 +19,10 @@ The lock manager's LOCK:

 tag -
    The key fields that are used for hashing locks in the shared memory
-    lock hash table.  This is kept as a separate struct to ensure that we
-    always zero out the correct number of bytes.  This is a problem as
-    part of the tag is an itempointer which is 6 bytes and causes 2
-    additional bytes to be added as padding.
+    lock hash table.  This is declared as a separate struct to ensure that
+    we always zero out the correct number of bytes.  It is critical that
+    any alignment-padding bytes the compiler might insert in the struct
+    be zeroed out, else the hash computation will be random.

    tag.relId -
 	Uniquely identifies the relation that the lock corresponds to.
@@ -30,7 +37,7 @@ tag -
 	tuple within the block.  If we are setting a table level lock
 	both the blockId and tupleId (in an item pointer this is called
 	the position) are set to invalid, if it is a page level lock the
-	blockId is valid, while the tuleId is still invalid.  Finally if
+	blockId is valid, while the tupleId is still invalid.  Finally if
 	this is a tuple level lock (we currently never do this) then both
 	the blockId and tupleId are set to valid specifications.  This is
 	how we get the appearance of a multi-level lock table while using
@@ -38,9 +45,9 @@ tag -
 	you are puzzled about how multi-level lock tables work).

 mask -
-    This field indicates what types of locks are currently held in the
-    given lock.  It is used (against the lock table's conflict table)
-    to determine if the new lock request will conflict with existing
+    This field indicates what types of locks are currently held on the
+    given lockable object.  It is used (against the lock table's conflict
+    table) to determine if the new lock request will conflict with existing
    lock types held.  Conficts are determined by bitwise AND operations
    between the mask and the conflict table entry for the given lock type
    to be set.  The current representation is that each bit (1 through 5)
@@ -73,7 +80,7 @@ holders -

 nActive -
    Keeps a count of how many times this lock has been succesfully acquired.
-    This count does not include attempts that were rejected due to conflicts,
+    This count does not include attempts that are waiting due to conflicts,
    but can count the same backend twice (e.g. a read then a write -- since
    its the same transaction this won't cause a conflict)

@@ -85,3 +92,39 @@ activeHolders -

 ---------------------------------------------------------------------------

+The lock manager's HOLDER:
+
+tag -
+    The key fields that are used for hashing entries in the shared memory
+    holder hash table.  This is declared as a separate struct to ensure that
+    we always zero out the correct number of bytes.
+
+    tag.lock
+        SHMEM offset of the LOCK object this holder is for.
+
+    tag.pid
+        PID of backend process that owns this holder.
+
+    tag.xid
+        XID of transaction this holder is for, or InvalidTransactionId
+        if the holder is for session-level locking.
+
+    Note that this structure will support multiple transactions running
+    concurrently in one backend, which may be handy if we someday decide
+    to support nested transactions.  Currently, the XID field is only needed
+    to distinguish per-transaction locks from session locks.  User locks
+    are always session locks, and we also use session locks for multi-
+    transaction operations like VACUUM.
+
+holders -
+    The number of successfully acquired locks of each type for this holder.
+    (CAUTION: the semantics are not the same as the LOCK's holders[] array,
+    which counts both acquired and pending requests.  Probably a different
+    name should be used...)
+
+nHolding -
+    Sum of the holders[] array.
+
+queue -
+    List link for shared memory queue of all the HOLDER objects for the
+    same backend.
--- a/src/backend/storage/lmgr/lmgr.c
+++ b/src/backend/storage/lmgr/lmgr.c
@@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/storage/lmgr/lmgr.c,v 1.42 2000/11/30 01:39:08 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/storage/lmgr/lmgr.c,v 1.43 2000/12/22 00:51:54 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -16,6 +16,7 @@
 #include "postgres.h"

 #include "access/transam.h"
+#include "access/xact.h"
 #include "catalog/catalog.h"
 #include "miscadmin.h"
 #include "storage/lmgr.h"
@@ -72,16 +73,17 @@ LOCKMETHOD	LongTermTableId = (LOCKMETHOD) NULL;
 * Create the lock table described by LockConflicts and LockPrios.
 */
 LOCKMETHOD
-InitLockTable()
+InitLockTable(int maxBackends)
 {
 	int			lockmethod;

 	lockmethod = LockMethodTableInit("LockTable",
-							LockConflicts, LockPrios, MAX_LOCKMODES - 1);
+									 LockConflicts, LockPrios,
+									 MAX_LOCKMODES - 1, maxBackends);
 	LockTableId = lockmethod;

 	if (!(LockTableId))
-		elog(ERROR, "InitLockTable: couldnt initialize lock table");
+		elog(ERROR, "InitLockTable: couldn't initialize lock table");

 #ifdef USER_LOCKS

@@ -90,10 +92,7 @@ InitLockTable()
 	 */
 	LongTermTableId = LockMethodTableRename(LockTableId);
 	if (!(LongTermTableId))
-	{
-		elog(ERROR,
-			 "InitLockTable: couldn't rename long-term lock table");
-	}
+		elog(ERROR, "InitLockTable: couldn't rename long-term lock table");
 #endif

 	return LockTableId;
@@ -139,7 +138,7 @@ LockRelation(Relation relation, LOCKMODE lockmode)
 	tag.dbId = relation->rd_lockInfo.lockRelId.dbId;
 	tag.objId.blkno = InvalidBlockNumber;

-	if (!LockAcquire(LockTableId, &tag, lockmode))
+	if (!LockAcquire(LockTableId, &tag, GetCurrentTransactionId(), lockmode))
 		elog(ERROR, "LockRelation: LockAcquire failed");

 	/*
@@ -169,7 +168,55 @@ UnlockRelation(Relation relation, LOCKMODE lockmode)
 	tag.dbId = relation->rd_lockInfo.lockRelId.dbId;
 	tag.objId.blkno = InvalidBlockNumber;

-	LockRelease(LockTableId, &tag, lockmode);
+	LockRelease(LockTableId, &tag, GetCurrentTransactionId(), lockmode);
+}
+
+/*
+ *		LockRelationForSession
+ *
+ * This routine grabs a session-level lock on the target relation.  The
+ * session lock persists across transaction boundaries.  It will be removed
+ * when UnlockRelationForSession() is called, or if an elog(ERROR) occurs,
+ * or if the backend exits.
+ *
+ * Note that one should also grab a transaction-level lock on the rel
+ * in any transaction that actually uses the rel, to ensure that the
+ * relcache entry is up to date.
+ *
+ * relid identifies the relation to lock: its relId and dbId are copied
+ * into the lock tag, and the tag's block number is set to
+ * InvalidBlockNumber, the same tag layout LockRelation() uses.  lockmode
+ * is the lock mode to acquire.
+ *
+ * The tag is zeroed with MemSet before being filled in so that any padding
+ * bytes are zero as well.  The request is handed to LockAcquire() with
+ * InvalidTransactionId in place of the current transaction's XID; that is
+ * what marks the lock as session-level rather than transaction-level.
+ *
+ * Does nothing if LockingDisabled() is true.  Raises elog(ERROR) if
+ * LockAcquire() reports failure.
+ */
+void
+LockRelationForSession(LockRelId *relid, LOCKMODE lockmode)
+{
+	LOCKTAG		tag;
+
+	if (LockingDisabled())
+		return;
+
+	MemSet(&tag, 0, sizeof(tag));
+	tag.relId = relid->relId;
+	tag.dbId = relid->dbId;
+	tag.objId.blkno = InvalidBlockNumber;
+
+	if (!LockAcquire(LockTableId, &tag, InvalidTransactionId, lockmode))
+		elog(ERROR, "LockRelationForSession: LockAcquire failed");
+}
+
+/*
+ *		UnlockRelationForSession
+ *
+ * Release a session-level lock on the relation identified by relid.
+ *
+ * relid supplies the relId and dbId placed in the lock tag; the tag's
+ * block number is set to InvalidBlockNumber.  lockmode is the lock mode
+ * to release.  The tag is zeroed with MemSet before being filled in so
+ * that any padding bytes are zero as well.
+ *
+ * InvalidTransactionId is passed to LockRelease() in place of the current
+ * transaction's XID, which is how session-level lock holders are tagged.
+ *
+ * Does nothing if LockingDisabled() is true.
+ */
+void
+UnlockRelationForSession(LockRelId *relid, LOCKMODE lockmode)
+{
+	LOCKTAG		tag;
+
+	if (LockingDisabled())
+		return;
+
+	MemSet(&tag, 0, sizeof(tag));
+	tag.relId = relid->relId;
+	tag.dbId = relid->dbId;
+	tag.objId.blkno = InvalidBlockNumber;
+
+	LockRelease(LockTableId, &tag, InvalidTransactionId, lockmode);
 }

 /*
@@ -188,7 +235,7 @@ LockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode)
 	tag.dbId = relation->rd_lockInfo.lockRelId.dbId;
 	tag.objId.blkno = blkno;

-	if (!LockAcquire(LockTableId, &tag, lockmode))
+	if (!LockAcquire(LockTableId, &tag, GetCurrentTransactionId(), lockmode))
 		elog(ERROR, "LockPage: LockAcquire failed");
 }

@@ -208,7 +255,7 @@ UnlockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode)
 	tag.dbId = relation->rd_lockInfo.lockRelId.dbId;
 	tag.objId.blkno = blkno;

-	LockRelease(LockTableId, &tag, lockmode);
+	LockRelease(LockTableId, &tag, GetCurrentTransactionId(), lockmode);
 }

 void
@@ -221,10 +268,10 @@ XactLockTableInsert(TransactionId xid)

 	MemSet(&tag, 0, sizeof(tag));
 	tag.relId = XactLockTableId;
-	tag.dbId = InvalidOid;
+	tag.dbId = InvalidOid;		/* xids are globally unique */
 	tag.objId.xid = xid;

-	if (!LockAcquire(LockTableId, &tag, ExclusiveLock))
+	if (!LockAcquire(LockTableId, &tag, xid, ExclusiveLock))
 		elog(ERROR, "XactLockTableInsert: LockAcquire failed");
 }

@@ -242,7 +289,7 @@ XactLockTableDelete(TransactionId xid)
 	tag.dbId = InvalidOid;
 	tag.objId.xid = xid;

-	LockRelease(LockTableId, &tag, ExclusiveLock);
+	LockRelease(LockTableId, &tag, xid, ExclusiveLock);
 }
 #endif

@@ -259,10 +306,10 @@ XactLockTableWait(TransactionId xid)
 	tag.dbId = InvalidOid;
 	tag.objId.xid = xid;

-	if (!LockAcquire(LockTableId, &tag, ShareLock))
+	if (!LockAcquire(LockTableId, &tag, GetCurrentTransactionId(), ShareLock))
 		elog(ERROR, "XactLockTableWait: LockAcquire failed");

-	LockRelease(LockTableId, &tag, ShareLock);
+	LockRelease(LockTableId, &tag, GetCurrentTransactionId(), ShareLock);

 	/*
 	 * Transaction was committed/aborted/crashed - we have to update
--- a/src/backend/storage/lmgr/lock.c
+++ b/src/backend/storage/lmgr/lock.c
--- a/src/backend/storage/lmgr/proc.c
+++ b/src/backend/storage/lmgr/proc.c
@@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/storage/lmgr/proc.c,v 1.88 2000/12/18 17:33:41 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/storage/lmgr/proc.c,v 1.89 2000/12/22 00:51:54 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -48,7 +48,7 @@
 *		This is so that we can support more backends. (system-wide semaphore
 *		sets run out pretty fast.)				  -ay 4/95
 *
- * $Header: /cvsroot/pgsql/src/backend/storage/lmgr/proc.c,v 1.88 2000/12/18 17:33:41 tgl Exp $
+ * $Header: /cvsroot/pgsql/src/backend/storage/lmgr/proc.c,v 1.89 2000/12/22 00:51:54 tgl Exp $
 */
 #include "postgres.h"

@@ -74,13 +74,14 @@
 #include <sys/sem.h>
 #endif

+#include "access/xact.h"
 #include "storage/proc.h"



 void		HandleDeadLock(SIGNAL_ARGS);
 static void ProcFreeAllSemaphores(void);
-static bool GetOffWaitqueue(PROC *);
+static bool GetOffWaitQueue(PROC *);

 int DeadlockTimeout = 1000;

@@ -300,50 +301,76 @@ InitProcess(void)

 /* -----------------------
 * get process off any wait queue it might be on
+ *
+ * NB: this does not remove the process' holder object, nor the lock object,
+ * even though their holder counts might now have gone to zero.  That will
+ * happen during a subsequent LockReleaseAll call, which we expect will happen
+ * during transaction cleanup.  (Removal of a proc from its wait queue by
+ * this routine can only happen if we are aborting the transaction.)
 * -----------------------
 */
 static bool
-GetOffWaitqueue(PROC *proc)
+GetOffWaitQueue(PROC *proc)
 {
-	bool		getoffed = false;
+	bool		gotoff = false;

 	LockLockTable();
 	if (proc->links.next != INVALID_OFFSET)
 	{
-		int			lockmode = proc->token;
-		LOCK	*waitLock = proc->waitLock;
+		LOCK   *waitLock = proc->waitLock;
+		LOCKMODE lockmode = proc->waitLockMode;

+		/* Remove proc from lock's wait queue */
 		Assert(waitLock);
 		Assert(waitLock->waitProcs.size > 0);
 		SHMQueueDelete(&(proc->links));
 		--waitLock->waitProcs.size;
+
+		/* Undo increments of holder counts by waiting process */
 		Assert(waitLock->nHolding > 0);
 		Assert(waitLock->nHolding > proc->waitLock->nActive);
 		--waitLock->nHolding;
 		Assert(waitLock->holders[lockmode] > 0);
 		--waitLock->holders[lockmode];
+		/* don't forget to clear waitMask bit if appropriate */
 		if (waitLock->activeHolders[lockmode] == waitLock->holders[lockmode])
 			waitLock->waitMask &= ~(1 << lockmode);
-		ProcLockWakeup(&(waitLock->waitProcs), LOCK_LOCKMETHOD(*waitLock), waitLock);
-		getoffed = true;
+
+		/* Clean up the proc's own state */
+		SHMQueueElemInit(&(proc->links));
+		proc->waitLock = NULL;
+		proc->waitHolder = NULL;
+
+		/* See if any other waiters can be woken up now */
+		ProcLockWakeup(LOCK_LOCKMETHOD(*waitLock), waitLock);
+
+		gotoff = true;
 	}
-	SHMQueueElemInit(&(proc->links));
 	UnlockLockTable();

-	return getoffed;
+	return gotoff;
 }

 /*
- * ProcReleaseLocks() -- release all locks associated with current transaction
+ * ProcReleaseLocks() -- release locks associated with current transaction
+ *			at transaction commit or abort
 *
+ * At commit, we release only locks tagged with the current transaction's XID,
+ * leaving those marked with XID 0 (ie, session locks) undisturbed.  At abort,
+ * we release all locks including XID 0, because we need to clean up after
+ * a failure.  This logic will need extension if we ever support nested
+ * transactions.
+ *
+ * Note that user locks are not released in either case.
 */
 void
-ProcReleaseLocks()
+ProcReleaseLocks(bool isCommit)
 {
 	if (!MyProc)
 		return;
-	LockReleaseAll(DEFAULT_LOCKMETHOD, &MyProc->lockQueue);
-	GetOffWaitqueue(MyProc);
+	GetOffWaitQueue(MyProc);
+	LockReleaseAll(DEFAULT_LOCKMETHOD, MyProc,
+				   !isCommit, GetCurrentTransactionId());
 }

 /*
@@ -384,47 +411,47 @@ static void
 ProcKill(int exitStatus, Datum pid)
 {
 	PROC	   *proc;
-	SHMEM_OFFSET location;

 	/* --------------------
 	 * If this is a FATAL exit the postmaster will have to kill all the
-	 * existing backends and reinitialize shared memory.  So all we don't
+	 * existing backends and reinitialize shared memory.  So we don't
 	 * need to do anything here.
 	 * --------------------
 	 */
 	if (exitStatus != 0)
 		return;

-	ShmemPIDLookup(MyProcPid, &location);
-	if (location == INVALID_OFFSET)
-		return;
+	if ((int) pid == MyProcPid)
+	{
+		proc = MyProc;
+		MyProc = NULL;
+	}
+	else
+	{
+		/* This path is dead code at the moment ... */
+		SHMEM_OFFSET location = INVALID_OFFSET;

-	proc = (PROC *) MAKE_PTR(location);
+		ShmemPIDLookup((int) pid, &location);
+		if (location == INVALID_OFFSET)
+			return;
+		proc = (PROC *) MAKE_PTR(location);
+	}

-	Assert(proc == MyProc || (int)pid != MyProcPid);
+	Assert(proc);

-	MyProc = NULL;
-
-	/* ---------------
-	 * Assume one lock table.
-	 * ---------------
-	 */
+	/* Release any spinlocks the proc is holding */
 	ProcReleaseSpins(proc);
-	LockReleaseAll(DEFAULT_LOCKMETHOD, &proc->lockQueue);
+
+	/* Get the proc off any wait queue it might be on */
+	GetOffWaitQueue(proc);
+
+	/* Remove from the standard lock table */
+	LockReleaseAll(DEFAULT_LOCKMETHOD, proc, true, InvalidTransactionId);

 #ifdef USER_LOCKS
-
-	/*
-	 * Assume we have a second lock table.
-	 */
-	LockReleaseAll(USER_LOCKMETHOD, &proc->lockQueue);
+	/* Remove from the user lock table */
+	LockReleaseAll(USER_LOCKMETHOD, proc, true, InvalidTransactionId);
 #endif
-
-	/* ----------------
-	 * get off the wait queue
-	 * ----------------
-	 */
-	GetOffWaitqueue(proc);
 }

 /*
@@ -488,10 +515,10 @@ SetWaitingForLock(bool waiting)
 		}
 		if (QueryCancel)		/* cancel request pending */
 		{
-			if (GetOffWaitqueue(MyProc))
+			if (GetOffWaitQueue(MyProc))
 			{
 				lockWaiting = false;
-				elog(ERROR, "Query cancel requested while waiting lock");
+				elog(ERROR, "Query cancel requested while waiting for lock");
 			}
 		}
 	}
@@ -519,8 +546,8 @@ LockWaitCancel(void)
    set_alarm(B_INFINITE_TIMEOUT, B_PERIODIC_ALARM);
 #endif /* __BEOS__ */
        
-	if (GetOffWaitqueue(MyProc))
-		elog(ERROR, "Query cancel requested while waiting lock");
+	if (GetOffWaitQueue(MyProc))
+		elog(ERROR, "Query cancel requested while waiting for lock");
 }

 /*
@@ -538,18 +565,19 @@ LockWaitCancel(void)
 * NOTES: The process queue is now a priority queue for locking.
 */
 int
-ProcSleep(PROC_QUEUE *waitQueue,/* lock->waitProcs */
-		  LOCKMETHODCTL *lockctl,
-		  int token,			/* lockmode */
-		  LOCK *lock)
+ProcSleep(LOCKMETHODCTL *lockctl,
+		  LOCKMODE lockmode,
+		  LOCK *lock,
+		  HOLDER *holder)
 {
-	int			i;
+	PROC_QUEUE *waitQueue = &(lock->waitProcs);
 	SPINLOCK	spinlock = lockctl->masterLock;
-	PROC	   *proc;
-	int			myMask = (1 << token);
+	int			myMask = (1 << lockmode);
 	int			waitMask = lock->waitMask;
+	PROC	   *proc;
+	int			i;
 	int			aheadHolders[MAX_LOCKMODES];
-	bool		selfConflict = (lockctl->conflictTab[token] & myMask),
+	bool		selfConflict = (lockctl->conflictTab[lockmode] & myMask),
 				prevSame = false;
 #ifndef __BEOS__
 	struct itimerval timeval,
@@ -558,26 +586,28 @@ ProcSleep(PROC_QUEUE *waitQueue,/* lock->waitProcs */
    bigtime_t time_interval;
 #endif

-	MyProc->token = token;
 	MyProc->waitLock = lock;
+	MyProc->waitHolder = holder;
+	MyProc->waitLockMode = lockmode;
+	/* We assume the caller set up MyProc->holdLock */

 	proc = (PROC *) MAKE_PTR(waitQueue->links.prev);

 	/* if we don't conflict with any waiter - be first in queue */
-	if (!(lockctl->conflictTab[token] & waitMask))
+	if (!(lockctl->conflictTab[lockmode] & waitMask))
 		goto ins;

 	for (i = 1; i < MAX_LOCKMODES; i++)
 		aheadHolders[i] = lock->activeHolders[i];
-	(aheadHolders[token])++;
+	(aheadHolders[lockmode])++;

 	for (i = 0; i < waitQueue->size; i++)
 	{
 		/* am I waiting for him ? */
-		if (lockctl->conflictTab[token] & proc->holdLock)
+		if (lockctl->conflictTab[lockmode] & proc->holdLock)
 		{
 			/* is he waiting for me ? */
-			if (lockctl->conflictTab[proc->token] & MyProc->holdLock)
+			if (lockctl->conflictTab[proc->waitLockMode] & MyProc->holdLock)
 			{
 				/* Yes, report deadlock failure */
 				MyProc->errType = STATUS_ERROR;
@@ -586,10 +616,10 @@ ProcSleep(PROC_QUEUE *waitQueue,/* lock->waitProcs */
 			/* being waiting for him - go past */
 		}
 		/* if he waits for me */
-		else if (lockctl->conflictTab[proc->token] & MyProc->holdLock)
+		else if (lockctl->conflictTab[proc->waitLockMode] & MyProc->holdLock)
 			break;
 		/* if conflicting locks requested */
-		else if (lockctl->conflictTab[proc->token] & myMask)
+		else if (lockctl->conflictTab[proc->waitLockMode] & myMask)
 		{

 			/*
@@ -604,13 +634,13 @@ ProcSleep(PROC_QUEUE *waitQueue,/* lock->waitProcs */
 		 * Last attempt to don't move any more: if we don't conflict with
 		 * rest waiters in queue.
 		 */
-		else if (!(lockctl->conflictTab[token] & waitMask))
+		else if (!(lockctl->conflictTab[lockmode] & waitMask))
 			break;

-		prevSame = (proc->token == token);
-		(aheadHolders[proc->token])++;
-		if (aheadHolders[proc->token] == lock->holders[proc->token])
-			waitMask &= ~(1 << proc->token);
+		prevSame = (proc->waitLockMode == lockmode);
+		(aheadHolders[proc->waitLockMode])++;
+		if (aheadHolders[proc->waitLockMode] == lock->holders[proc->waitLockMode])
+			waitMask &= ~(1 << proc->waitLockMode);
 		proc = (PROC *) MAKE_PTR(proc->links.prev);
 	}

@@ -692,10 +722,8 @@ ins:;

 rt:;

-#ifdef LOCK_DEBUG
-	/* Just to get meaningful debug messages from DumpLocks() */
-	MyProc->waitLock = (LOCK *) NULL;
-#endif
+	MyProc->waitLock = NULL;
+	MyProc->waitHolder = NULL;

 	return MyProc->errType;
 }
@@ -704,7 +732,7 @@ rt:;
 /*
 * ProcWakeup -- wake up a process by releasing its private semaphore.
 *
- *	 remove the process from the wait queue and set its links invalid.
+ *	 Also remove the process from the wait queue and set its links invalid.
 *	 RETURN: the next process in the wait queue.
 */
 PROC *
@@ -720,9 +748,9 @@ ProcWakeup(PROC *proc, int errType)

 	retProc = (PROC *) MAKE_PTR(proc->links.prev);

-	/* you have to update waitLock->waitProcs.size yourself */
 	SHMQueueDelete(&(proc->links));
 	SHMQueueElemInit(&(proc->links));
+	(proc->waitLock->waitProcs.size)--;

 	proc->errType = errType;

@@ -736,65 +764,70 @@ ProcWakeup(PROC *proc, int errType)
 *		released.
 */
 int
-ProcLockWakeup(PROC_QUEUE *queue, LOCKMETHOD lockmethod, LOCK *lock)
+ProcLockWakeup(LOCKMETHOD lockmethod, LOCK *lock)
 {
+	PROC_QUEUE *queue = &(lock->waitProcs);
 	PROC	   *proc;
-	int			count = 0;
-	int			last_locktype = 0;
+	int			awoken = 0;
+	LOCKMODE	last_lockmode = 0;
 	int			queue_size = queue->size;

-	Assert(queue->size >= 0);
+	Assert(queue_size >= 0);

-	if (!queue->size)
+	if (!queue_size)
 		return STATUS_NOT_FOUND;

 	proc = (PROC *) MAKE_PTR(queue->links.prev);
-	while ((queue_size--) && (proc))
-	{

-		/*
-		 * This proc will conflict as the previous one did, don't even
-		 * try.
-		 */
-		if (proc->token == last_locktype)
-			continue;
+	while (queue_size-- > 0)
+	{
+		if (proc->waitLockMode == last_lockmode)
+		{
+			/*
+			 * This proc will conflict as the previous one did, don't even
+			 * try.
+			 */
+			goto nextProc;
+		}

 		/*
 		 * Does this proc conflict with locks held by others ?
 		 */
 		if (LockResolveConflicts(lockmethod,
+								 proc->waitLockMode,
 								 lock,
-								 proc->token,
-								 proc->xid,
-								 (XIDLookupEnt *) NULL) != STATUS_OK)
+								 proc->waitHolder,
+								 proc,
+								 NULL) != STATUS_OK)
 		{
-			if (count != 0)
+			/* Yes.  Quit if we already awoke at least one process. */
+			if (awoken != 0)
 				break;
-			last_locktype = proc->token;
-			continue;
+			/* Otherwise, see if any later waiters can be awoken. */
+			last_lockmode = proc->waitLockMode;
+			goto nextProc;
 		}

 		/*
-		 * there was a waiting process, grant it the lock before waking it
-		 * up.	This will prevent another process from seizing the lock
-		 * between the time we release the lock master (spinlock) and the
-		 * time that the awoken process begins executing again.
+		 * OK to wake up this sleeping process.
 		 */
-		GrantLock(lock, proc->token);
+		GrantLock(lock, proc->waitHolder, proc->waitLockMode);
+		proc = ProcWakeup(proc, NO_ERROR);
+		awoken++;

 		/*
-		 * ProcWakeup removes proc from the lock waiting process queue and
-		 * returns the next proc in chain.
+		 * ProcWakeup removes proc from the lock's waiting process queue
+		 * and returns the next proc in chain; don't use prev link.
 		 */
+		continue;

-		count++;
-		queue->size--;
-		proc = ProcWakeup(proc, NO_ERROR);
+nextProc:
+		proc = (PROC *) MAKE_PTR(proc->links.prev);
 	}

 	Assert(queue->size >= 0);

-	if (count)
+	if (awoken)
 		return STATUS_OK;
 	else
 	{
@@ -802,9 +835,10 @@ ProcLockWakeup(PROC_QUEUE *queue, LOCKMETHOD lockmethod, LOCK *lock)
 #ifdef LOCK_DEBUG
 		if (lock->tag.lockmethod == USER_LOCKMETHOD ? Trace_userlocks : Trace_locks)
 		{
-			elog(DEBUG, "ProcLockWakeup: lock(%lx) can't wake up any process", MAKE_OFFSET(lock));
+			elog(DEBUG, "ProcLockWakeup: lock(%lx) can't wake up any process",
+				 MAKE_OFFSET(lock));
 			if (Debug_deadlocks)
-			DumpAllLocks();
+				DumpAllLocks();
 		}
 #endif
 		return STATUS_NOT_FOUND;
@@ -872,10 +906,12 @@ HandleDeadLock(SIGNAL_ARGS)
 	 */
 	mywaitlock = MyProc->waitLock;
 	Assert(mywaitlock->waitProcs.size > 0);
-	lockWaiting = false;
 	--mywaitlock->waitProcs.size;
 	SHMQueueDelete(&(MyProc->links));
 	SHMQueueElemInit(&(MyProc->links));
+	MyProc->waitLock = NULL;
+	MyProc->waitHolder = NULL;
+	lockWaiting = false;

 	/* ------------------
 	 * Unlock my semaphore so that the interrupted ProcSleep() call can finish.