Two-phase commit. Original patch by Heikki Linnakangas, with additional
hacking by Alvaro Herrera and Tom Lane.
2025-11-10 17:42:29 +03:00 · 2005-06-17 22:32:51 +00:00
parent 5495575903
commit d0a89683a3
61 changed files with 4454 additions and 439 deletions
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/ipc/ipci.c,v 1.76 2005/05/19 21:35:46 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/ipc/ipci.c,v 1.77 2005/06/17 22:32:45 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -17,6 +17,7 @@
 #include "access/clog.h"
 #include "access/multixact.h"
 #include "access/subtrans.h"
+#include "access/twophase.h"
 #include "access/xlog.h"
 #include "miscadmin.h"
 #include "postmaster/bgwriter.h"
@@ -54,9 +55,7 @@
 * memory.	This is true for a standalone backend, false for a postmaster.
 */
 void
-CreateSharedMemoryAndSemaphores(bool makePrivate,
-								int maxBackends,
-								int port)
+CreateSharedMemoryAndSemaphores(bool makePrivate, int port)
 {
 	PGShmemHeader *seghdr = NULL;

@@ -72,15 +71,16 @@ CreateSharedMemoryAndSemaphores(bool makePrivate,
 		 */
 		size = hash_estimate_size(SHMEM_INDEX_SIZE, sizeof(ShmemIndexEnt));
 		size += BufferShmemSize();
-		size += LockShmemSize(maxBackends);
-		size += ProcGlobalShmemSize(maxBackends);
+		size += LockShmemSize();
+		size += ProcGlobalShmemSize();
 		size += XLOGShmemSize();
 		size += CLOGShmemSize();
 		size += SUBTRANSShmemSize();
+		size += TwoPhaseShmemSize();
 		size += MultiXactShmemSize();
 		size += LWLockShmemSize();
-		size += ProcArrayShmemSize(maxBackends);
-		size += SInvalShmemSize(maxBackends);
+		size += ProcArrayShmemSize();
+		size += SInvalShmemSize(MaxBackends);
 		size += FreeSpaceShmemSize();
 		size += BgWriterShmemSize();
 #ifdef EXEC_BACKEND
@@ -100,7 +100,7 @@ CreateSharedMemoryAndSemaphores(bool makePrivate,
 		/*
 		 * Create semaphores
 		 */
-		numSemas = ProcGlobalSemas(maxBackends);
+		numSemas = ProcGlobalSemas();
 		numSemas += SpinlockSemas();
 		PGReserveSemaphores(numSemas, port);
 	}
@@ -144,6 +144,7 @@ CreateSharedMemoryAndSemaphores(bool makePrivate,
 	XLOGShmemInit();
 	CLOGShmemInit();
 	SUBTRANSShmemInit();
+	TwoPhaseShmemInit();
 	MultiXactShmemInit();
 	InitBufferPool();

@@ -151,18 +152,18 @@ CreateSharedMemoryAndSemaphores(bool makePrivate,
 	 * Set up lock manager
 	 */
 	InitLocks();
-	InitLockTable(maxBackends);
+	InitLockTable();

 	/*
 	 * Set up process table
 	 */
-	InitProcGlobal(maxBackends);
-	CreateSharedProcArray(maxBackends);
+	InitProcGlobal();
+	CreateSharedProcArray();

 	/*
 	 * Set up shared-inval messaging
 	 */
-	CreateSharedInvalidationState(maxBackends);
+	CreateSharedInvalidationState(MaxBackends);

 	/*
 	 * Set up free-space map
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -11,6 +11,11 @@
 * Because of various subtle race conditions it is critical that a backend
 * hold the correct locks while setting or clearing its MyProc->xid field.
 * See notes in GetSnapshotData.
+ *
+ * The process array now also includes PGPROC structures representing
+ * prepared transactions.  The xid and subxids fields of these are valid,
+ * as is the procLocks list.  They can be distinguished from regular backend
+ * PGPROCs at need by checking for pid == 0.
 * 
 *
 * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
@@ -18,13 +23,14 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/ipc/procarray.c,v 1.2 2005/05/19 23:57:11 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/ipc/procarray.c,v 1.3 2005/06/17 22:32:45 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
 #include "postgres.h"

 #include "access/subtrans.h"
+#include "access/twophase.h"
 #include "miscadmin.h"
 #include "storage/proc.h"
 #include "storage/procarray.h"
@@ -76,25 +82,23 @@ static void DisplayXidCache(void);
 * Report shared-memory space needed by CreateSharedProcArray.
 */
 int
-ProcArrayShmemSize(int maxBackends)
+ProcArrayShmemSize(void)
 {
-	/* sizeof(ProcArrayStruct) includes the first array element */
-	return MAXALIGN(sizeof(ProcArrayStruct) +
-					(maxBackends - 1) * sizeof(PGPROC *));
+	return MAXALIGN(offsetof(ProcArrayStruct, procs) +
+					(MaxBackends + max_prepared_xacts) * sizeof(PGPROC *));
 }

 /*
 * Initialize the shared PGPROC array during postmaster startup.
 */
 void
-CreateSharedProcArray(int maxBackends)
+CreateSharedProcArray(void)
 {
 	bool		found;

 	/* Create or attach to the ProcArray shared structure */
 	procArray = (ProcArrayStruct *)
-		ShmemInitStruct("Proc Array", ProcArrayShmemSize(maxBackends),
-						&found);
+		ShmemInitStruct("Proc Array", ProcArrayShmemSize(), &found);

 	if (!found)
 	{
@@ -102,18 +106,15 @@ CreateSharedProcArray(int maxBackends)
 		 * We're the first - initialize.
 		 */
 		procArray->numProcs = 0;
-		procArray->maxProcs = maxBackends;
+		procArray->maxProcs = MaxBackends + max_prepared_xacts;
 	}
 }

 /*
- * Add my own PGPROC (found in the global MyProc) to the shared array.
- *
- * This must be called during backend startup, after fully initializing
- * the contents of MyProc.
+ * Add the specified PGPROC to the shared array.
 */
 void
-ProcArrayAddMyself(void)
+ProcArrayAdd(PGPROC *proc)
 {
 	ProcArrayStruct *arrayP = procArray;

@@ -132,32 +133,32 @@ ProcArrayAddMyself(void)
 				 errmsg("sorry, too many clients already")));
 	}

-	arrayP->procs[arrayP->numProcs] = MyProc;
+	arrayP->procs[arrayP->numProcs] = proc;
 	arrayP->numProcs++;

 	LWLockRelease(ProcArrayLock);
 }

 /*
- * Remove my own PGPROC (found in the global MyProc) from the shared array.
- *
- * This must be called during backend shutdown.
+ * Remove the specified PGPROC from the shared array.
 */
 void
-ProcArrayRemoveMyself(void)
+ProcArrayRemove(PGPROC *proc)
 {
 	ProcArrayStruct *arrayP = procArray;
 	int			index;

 #ifdef XIDCACHE_DEBUG
-	DisplayXidCache();
+	/* dump stats at backend shutdown, but not prepared-xact end */
+	if (proc->pid != 0)
+		DisplayXidCache();
 #endif

 	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);

 	for (index = 0; index < arrayP->numProcs; index++)
 	{
-		if (arrayP->procs[index] == MyProc)
+		if (arrayP->procs[index] == proc)
 		{
 			arrayP->procs[index] = arrayP->procs[arrayP->numProcs - 1];
 			arrayP->numProcs--;
@@ -169,7 +170,7 @@ ProcArrayRemoveMyself(void)
 	/* Ooops */
 	LWLockRelease(ProcArrayLock);

-	elog(LOG, "failed to find my own proc %p in ProcArray", MyProc);
+	elog(LOG, "failed to find proc %p in ProcArray", proc);
 }


@@ -329,6 +330,55 @@ result_known:
 	return result;
 }

+/*
+ * TransactionIdIsActive -- is xid the top-level XID of an active backend?
+ *
+ * Unlike TransactionIdIsInProgress, this skips prepared transactions
+ * (array entries whose pid is 0).  Only each entry's top-level xid is
+ * compared; subxids are ignored since no current caller needs them.
+ */
+bool
+TransactionIdIsActive(TransactionId xid)
+{
+	bool		result = false;
+	ProcArrayStruct *arrayP = procArray;
+	int			i;
+
+	/*
+	 * Don't bother checking a transaction older than RecentXmin; it
+	 * could not possibly still be running.
+	 */
+	if (TransactionIdPrecedes(xid, RecentXmin))
+		return false;
+
+	LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+	for (i = 0; i < arrayP->numProcs; i++)
+	{
+		PGPROC	   *proc = arrayP->procs[i];
+
+		/* Fetch xid just once - see GetNewTransactionId */
+		TransactionId pxid = proc->xid;
+
+		if (!TransactionIdIsValid(pxid))
+			continue;			/* entry has no XID assigned */
+
+		if (proc->pid == 0)
+			continue;			/* pid 0 marks a prepared transaction */
+
+		if (TransactionIdEquals(pxid, xid))
+		{
+			result = true;
+			break;
+		}
+	}
+
+	LWLockRelease(ProcArrayLock);
+
+	return result;
+}
+
+
 /*
 * GetOldestXmin -- returns oldest transaction that was running
 *					when any current transaction was started.
@@ -441,12 +491,12 @@ GetSnapshotData(Snapshot snapshot, bool serializable)
 		   TransactionIdIsValid(MyProc->xmin));

 	/*
-	 * Allocating space for MaxBackends xids is usually overkill;
+	 * Allocating space for maxProcs xids is usually overkill;
 	 * numProcs would be sufficient.  But it seems better to do the
 	 * malloc while not holding the lock, so we can't look at numProcs.
 	 *
 	 * This does open a possibility for avoiding repeated malloc/free: since
-	 * MaxBackends does not change at runtime, we can simply reuse the
+	 * maxProcs does not change at runtime, we can simply reuse the
 	 * previous xip array if any.  (This relies on the fact that all
 	 * callers pass static SnapshotData structs.)
 	 */
@@ -456,7 +506,7 @@ GetSnapshotData(Snapshot snapshot, bool serializable)
 		 * First call for this snapshot
 		 */
 		snapshot->xip = (TransactionId *)
-			malloc(MaxBackends * sizeof(TransactionId));
+			malloc(arrayP->maxProcs * sizeof(TransactionId));
 		if (snapshot->xip == NULL)
 			ereport(ERROR,
 					(errcode(ERRCODE_OUT_OF_MEMORY),
@@ -602,14 +652,21 @@ DatabaseHasActiveBackends(Oid databaseId, bool ignoreMyself)

 /*
 * BackendPidGetProc -- get a backend's PGPROC given its PID
+ *
+ * Returns NULL if not found.  Note that it is up to the caller to be
+ * sure that the question remains meaningful for long enough for the
+ * answer to be used ...
 */
-struct PGPROC *
+PGPROC *
 BackendPidGetProc(int pid)
 {
 	PGPROC	   *result = NULL;
 	ProcArrayStruct *arrayP = procArray;
 	int			index;

+	if (pid == 0)				/* never match dummy PGPROCs */
+		return NULL;
+
 	LWLockAcquire(ProcArrayLock, LW_SHARED);

 	for (index = 0; index < arrayP->numProcs; index++)
@@ -642,10 +699,8 @@ IsBackendPid(int pid)
 *		active transactions.  This is used as a heuristic to decide if
 *		a pre-XLOG-flush delay is worthwhile during commit.
 *
- * An active transaction is something that has written at least one XLOG
- * record; read-only transactions don't count.  Also, do not count backends
- * that are blocked waiting for locks, since they are not going to get to
- * run until someone else commits.
+ * Do not count backends that are blocked waiting for locks, since they are
+ * not going to get to run until someone else commits.
 */
 int
 CountActiveBackends(void)
@@ -656,7 +711,7 @@ CountActiveBackends(void)

 	/*
 	 * Note: for speed, we don't acquire ProcArrayLock.  This is a little bit
-	 * bogus, but since we are only testing xrecoff for zero or nonzero,
+	 * bogus, but since we are only testing fields for zero or nonzero,
 	 * it should be OK.  The result is only used for heuristic purposes
 	 * anyway...
 	 */
@@ -666,7 +721,9 @@ CountActiveBackends(void)

 		if (proc == MyProc)
 			continue;			/* do not count myself */
-		if (proc->logRec.xrecoff == 0)
+		if (proc->pid == 0)
+			continue;			/* do not count prepared xacts */
+		if (proc->xid == InvalidTransactionId)
 			continue;			/* do not count if not in a transaction */
 		if (proc->waitLock != NULL)
 			continue;			/* do not count if blocked on a lock */
@@ -676,25 +733,6 @@ CountActiveBackends(void)
 	return count;
 }

-/*
- * CountEmptyBackendSlots - count empty slots in backend process table
- *
- * Acquiring the lock here is almost certainly overkill, but just in
- * case fetching an int is not atomic on your machine ...
- */
-int
-CountEmptyBackendSlots(void)
-{
-	int			count;
-
-	LWLockAcquire(ProcArrayLock, LW_SHARED);
-
-	count = procArray->maxProcs - procArray->numProcs;
-
-	LWLockRelease(ProcArrayLock);
-
-	return count;
-}

 #define XidCacheRemove(i) \
 	do { \
--- a/src/backend/storage/lmgr/lmgr.c
+++ b/src/backend/storage/lmgr/lmgr.c
@@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/lmgr/lmgr.c,v 1.76 2005/06/14 22:15:32 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/lmgr/lmgr.c,v 1.77 2005/06/17 22:32:45 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -77,7 +77,7 @@ static LOCKMETHODID LockTableId = INVALID_LOCKMETHOD;
 * Create the lock table described by LockConflicts
 */
 void
-InitLockTable(int maxBackends)
+InitLockTable(void)
 {
 	LOCKMETHODID LongTermTableId;

@@ -91,8 +91,7 @@ InitLockTable(int maxBackends)
 	/* number of lock modes is lengthof()-1 because of dummy zero */
 	LockTableId = LockMethodTableInit("LockTable",
 									  LockConflicts,
-									  lengthof(LockConflicts) - 1,
-									  maxBackends);
+									  lengthof(LockConflicts) - 1);
 	if (!LockMethodIsValid(LockTableId))
 		elog(ERROR, "could not initialize lock table");
 	Assert(LockTableId == DEFAULT_LOCKMETHOD);
--- a/src/backend/storage/lmgr/lock.c
+++ b/src/backend/storage/lmgr/lock.c
@@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/lmgr/lock.c,v 1.155 2005/06/14 22:15:32 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/lmgr/lock.c,v 1.156 2005/06/17 22:32:45 tgl Exp $
 *
 * NOTES
 *	  Outside modules can create a lock table and acquire/release
@@ -33,6 +33,8 @@
 #include <signal.h>
 #include <unistd.h>

+#include "access/twophase.h"
+#include "access/twophase_rmgr.h"
 #include "access/xact.h"
 #include "miscadmin.h"
 #include "storage/proc.h"
@@ -44,7 +46,15 @@
 /* This configuration variable is used to set the lock table size */
 int			max_locks_per_xact; /* set by guc.c */

-#define NLOCKENTS(maxBackends)	(max_locks_per_xact * (maxBackends))
+#define NLOCKENTS()	(max_locks_per_xact * (MaxBackends + max_prepared_xacts))
+
+
+/* Record that's written to 2PC state file when a lock is persisted */
+typedef struct TwoPhaseLockRecord
+{
+	LOCKTAG		locktag;	/* tag of the lock (from LOCALLOCK tag.lock) */
+	LOCKMODE	lockmode;	/* mode held (from LOCALLOCK tag.mode) */
+} TwoPhaseLockRecord;


 /*
@@ -168,8 +178,7 @@ static void CleanUpLock(LOCKMETHODID lockmethodid, LOCK *lock,


 /*
- * InitLocks -- Init the lock module.  Create a private data
- *		structure for constructing conflict masks.
+ * InitLocks -- Init the lock module.  Nothing to do here at present.
 */
 void
 InitLocks(void)
@@ -222,8 +231,7 @@ LockMethodInit(LockMethod lockMethodTable,
 LOCKMETHODID
 LockMethodTableInit(const char *tabName,
 					const LOCKMASK *conflictsP,
-					int numModes,
-					int maxBackends)
+					int numModes)
 {
 	LockMethod	newLockMethod;
 	LOCKMETHODID lockmethodid;
@@ -239,7 +247,7 @@ LockMethodTableInit(const char *tabName,
 			 numModes, MAX_LOCKMODES - 1);

 	/* Compute init/max size to request for lock hashtables */
-	max_table_size = NLOCKENTS(maxBackends);
+	max_table_size = NLOCKENTS();
 	init_table_size = max_table_size / 2;

 	/* Allocate a string for the shmem index table lookups. */
@@ -1418,10 +1426,10 @@ LockReleaseAll(LOCKMETHODID lockmethodid, bool allLocks)
 	while (proclock)
 	{
 		bool		wakeupNeeded = false;
-		PROCLOCK   *nextHolder;
+		PROCLOCK   *nextplock;

 		/* Get link first, since we may unlink/delete this proclock */
-		nextHolder = (PROCLOCK *) SHMQueueNext(procLocks, &proclock->procLink,
+		nextplock = (PROCLOCK *) SHMQueueNext(procLocks, &proclock->procLink,
 										   offsetof(PROCLOCK, procLink));

 		Assert(proclock->tag.proc == MAKE_OFFSET(MyProc));
@@ -1474,7 +1482,7 @@ LockReleaseAll(LOCKMETHODID lockmethodid, bool allLocks)
 		CleanUpLock(lockmethodid, lock, proclock, wakeupNeeded);

 next_item:
-		proclock = nextHolder;
+		proclock = nextplock;
 	}

 	LWLockRelease(masterLock);
@@ -1605,14 +1613,262 @@ LockReassignCurrentOwner(void)
 }


+/*
+ * AtPrepare_Locks
+ *		Do the preparatory work for a PREPARE: make 2PC state file records
+ *		for all locks currently held.
+ *
+ * Only DEFAULT_LOCKMETHOD locks are recorded; user locks are
+ * non-transactional and are therefore ignored.
+ *
+ * We error out if any session lock is held (should be OK since only VACUUM
+ * uses those) or if any lock is on a temporary object (since that would
+ * mess up the current backend if it tries to exit before the prepared
+ * xact is committed).
+ */
+void
+AtPrepare_Locks(void)
+{
+	LOCKMETHODID lockmethodid = DEFAULT_LOCKMETHOD;
+	HASH_SEQ_STATUS status;
+	LOCALLOCK  *locallock;
+
+	/*
+	 * We don't need to touch shared memory for this --- all the necessary
+	 * state information is in the locallock table.
+	 */
+	hash_seq_init(&status, LockMethodLocalHash[lockmethodid]);
+
+	while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL)
+	{
+		TwoPhaseLockRecord record;
+		LOCALLOCKOWNER *lockOwners = locallock->lockOwners;
+		int		i;
+
+		/* Ignore items that are not of the lockmethod to be processed */
+		if (LOCALLOCK_LOCKMETHOD(*locallock) != lockmethodid)
+			continue;
+
+		/* Ignore it if we don't actually hold the lock */
+		if (locallock->nLocks <= 0)
+			continue;
+
+		/* Scan to verify there are no session locks (owner == NULL) */
+		for (i = locallock->numLockOwners - 1; i >= 0; i--)
+		{
+			/* elog not ereport since this should not happen */
+			if (lockOwners[i].owner == NULL)
+				elog(ERROR, "cannot PREPARE when session locks exist");
+		}
+
+		/* Can't handle it if the lock is on a temporary object */
+		if (locallock->isTempObject)
+			ereport(ERROR,
+					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+					 errmsg("cannot PREPARE a transaction that has operated on temporary tables")));
+
+		/*
+		 * Create a 2PC record holding the lock's tag and mode.
+		 */
+		memcpy(&(record.locktag), &(locallock->tag.lock), sizeof(LOCKTAG));
+		record.lockmode = locallock->tag.mode;
+
+		RegisterTwoPhaseRecord(TWOPHASE_RM_LOCK_ID, 0,
+							   &record, sizeof(TwoPhaseLockRecord));
+	}
+}
+
+/*
+ * PostPrepare_Locks
+ *		Clean up after successful PREPARE
+ *
+ * Here, we want to transfer ownership of our locks to the dummy PGPROC
+ * that TwoPhaseGetDummyProc returns for the prepared transaction xid, and
+ * to clean out the corresponding entries in the LOCALLOCK table.
+ *
+ * Note: by removing the LOCALLOCK entries, we are leaving dangling
+ * pointers in the transaction's resource owner.  This is OK at the
+ * moment since resowner.c doesn't try to free locks retail at a toplevel
+ * transaction commit or abort.  We could alternatively zero out nLocks
+ * and leave the LOCALLOCK entries to be garbage-collected by LockReleaseAll,
+ * but that probably costs more cycles.
+ */
+void
+PostPrepare_Locks(TransactionId xid)
+{
+	PGPROC	   *newproc = TwoPhaseGetDummyProc(xid);
+	LOCKMETHODID lockmethodid = DEFAULT_LOCKMETHOD;
+	HASH_SEQ_STATUS status;
+	SHM_QUEUE  *procLocks = &(MyProc->procLocks);
+	LWLockId	masterLock;
+	LockMethod	lockMethodTable;
+	int			numLockModes;
+	LOCALLOCK  *locallock;
+	PROCLOCK   *proclock;
+	PROCLOCKTAG proclocktag;
+	bool		found;
+	LOCK	   *lock;
+
+	/* This is a critical section: any error means big trouble */
+	START_CRIT_SECTION();
+
+	lockMethodTable = LockMethods[lockmethodid];
+	if (!lockMethodTable)
+		elog(ERROR, "unrecognized lock method: %d", lockmethodid);
+
+	numLockModes = lockMethodTable->numLockModes;
+	masterLock = lockMethodTable->masterLock;
+
+	/*
+	 * First we run through the locallock table and get rid of unwanted
+	 * entries, then we scan the process's proclocks and transfer them
+	 * to the target proc.
+	 *
+	 * We do this separately because we may have multiple locallock
+	 * entries pointing to the same proclock, and we daren't end up with
+	 * any dangling pointers.
+	 */
+	hash_seq_init(&status, LockMethodLocalHash[lockmethodid]);
+
+	while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL)
+	{
+		if (locallock->proclock == NULL || locallock->lock == NULL)
+		{
+			/*
+			 * We must've run out of shared memory while trying to set up
+			 * this lock.  Just forget the local entry.
+			 */
+			Assert(locallock->nLocks == 0);
+			RemoveLocalLock(locallock);
+			continue;
+		}
+
+		/* Ignore items that are not of the lockmethod to be removed */
+		if (LOCALLOCK_LOCKMETHOD(*locallock) != lockmethodid)
+			continue;
+
+		/* AtPrepare_Locks already checked there are no session locks */
+
+		/* Mark the proclock to show we need to release this lockmode */
+		if (locallock->nLocks > 0)
+			locallock->proclock->releaseMask |= LOCKBIT_ON(locallock->tag.mode);
+
+		/* And remove the locallock hashtable entry */
+		RemoveLocalLock(locallock);
+	}
+
+	LWLockAcquire(masterLock, LW_EXCLUSIVE);
+
+	proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks,
+										 offsetof(PROCLOCK, procLink));
+
+	while (proclock)
+	{
+		PROCLOCK   *nextplock;
+		LOCKMASK	holdMask;
+		PROCLOCK   *newproclock;
+
+		/* Get link first, since we may unlink/delete this proclock */
+		nextplock = (PROCLOCK *) SHMQueueNext(procLocks, &proclock->procLink,
+										   offsetof(PROCLOCK, procLink));
+
+		Assert(proclock->tag.proc == MAKE_OFFSET(MyProc));
+
+		lock = (LOCK *) MAKE_PTR(proclock->tag.lock);
+
+		/* Ignore items that are not of the lockmethod to be transferred */
+		if (LOCK_LOCKMETHOD(*lock) != lockmethodid)
+			goto next_item;
+
+		PROCLOCK_PRINT("PostPrepare_Locks", proclock);
+		LOCK_PRINT("PostPrepare_Locks", lock, 0);
+		Assert(lock->nRequested >= 0);
+		Assert(lock->nGranted >= 0);
+		Assert(lock->nGranted <= lock->nRequested);
+		Assert((proclock->holdMask & ~lock->grantMask) == 0);
+
+		/*
+		 * Since there were no session locks, we should be releasing all locks
+		 */
+		if (proclock->releaseMask != proclock->holdMask)
+			elog(PANIC, "we seem to have dropped a bit somewhere");
+
+		holdMask = proclock->holdMask;
+
+		/*
+		 * We cannot simply modify proclock->tag.proc to reassign ownership
+		 * of the lock, because that's part of the hash key and the proclock
+		 * would then be in the wrong hash chain.  So, unlink and delete the
+		 * old proclock; create a new one with the right contents; and link
+		 * it into place.  We do it in this order to be certain we won't
+		 * run out of shared memory (the way dynahash.c works, the deleted
+		 * object is certain to be available for reallocation).
+		 */
+		SHMQueueDelete(&proclock->lockLink);
+		SHMQueueDelete(&proclock->procLink);
+		if (!hash_search(LockMethodProcLockHash[lockmethodid],
+						 (void *) &(proclock->tag),
+						 HASH_REMOVE, NULL))
+			elog(PANIC, "proclock table corrupted");
+
+		/*
+		 * Create the hash key for the new proclock entry.
+		 */
+		MemSet(&proclocktag, 0, sizeof(PROCLOCKTAG));	/* must clear padding */
+		proclocktag.lock = MAKE_OFFSET(lock);
+		proclocktag.proc = MAKE_OFFSET(newproc);
+
+		newproclock = (PROCLOCK *) hash_search(LockMethodProcLockHash[lockmethodid],
+											   (void *) &proclocktag,
+											   HASH_ENTER_NULL, &found);
+		if (!newproclock)
+		    ereport(PANIC,		/* should not happen */
+					(errcode(ERRCODE_OUT_OF_MEMORY),
+					 errmsg("out of shared memory"),
+					 errdetail("Not enough memory for reassigning the prepared transaction's locks.")));
+
+		/*
+		 * If new, initialize the new entry
+		 */
+		if (!found)
+		{
+			newproclock->holdMask = 0;
+			newproclock->releaseMask = 0;
+			/* Add new proclock to appropriate lists */
+			SHMQueueInsertBefore(&lock->procLocks, &newproclock->lockLink);
+			SHMQueueInsertBefore(&newproc->procLocks, &newproclock->procLink);
+			PROCLOCK_PRINT("PostPrepare_Locks: new", newproclock);
+		}
+		else
+		{
+			PROCLOCK_PRINT("PostPrepare_Locks: found", newproclock);
+			Assert((newproclock->holdMask & ~lock->grantMask) == 0);
+		}
+
+		/*
+		 * Hand the old proclock's held lock modes over to the new proclock.
+		 */
+		Assert((newproclock->holdMask & holdMask) == 0);
+		newproclock->holdMask |= holdMask;
+
+next_item:
+		proclock = nextplock;
+	}
+
+	LWLockRelease(masterLock);
+
+	END_CRIT_SECTION();
+}
+
+
 /*
 * Estimate shared-memory space used for lock tables
 */
 int
-LockShmemSize(int maxBackends)
+LockShmemSize(void)
 {
 	int			size = 0;
-	long		max_table_size = NLOCKENTS(maxBackends);
+	long		max_table_size = NLOCKENTS();

 	/* lock method headers */
 	size += MAX_LOCK_METHODS * MAXALIGN(sizeof(LockMethodData));
@@ -1704,21 +1960,19 @@ GetLockmodeName(LOCKMODE mode)

 #ifdef LOCK_DEBUG
 /*
- * Dump all locks in the MyProc->procLocks list.
+ * Dump all locks in the given proc's procLocks list.
 *
 * Must have already acquired the masterLock.
 */
 void
-DumpLocks(void)
+DumpLocks(PGPROC *proc)
 {
-	PGPROC	   *proc;
 	SHM_QUEUE  *procLocks;
 	PROCLOCK   *proclock;
 	LOCK	   *lock;
 	int			lockmethodid = DEFAULT_LOCKMETHOD;
 	LockMethod	lockMethodTable;

-	proc = MyProc;
 	if (proc == NULL)
 		return;

@@ -1793,3 +2047,254 @@ DumpAllLocks(void)
 }

 #endif   /* LOCK_DEBUG */
+
+/*
+ * LOCK 2PC resource manager's routines
+ */
+
+/*
+ * Re-acquire a lock belonging to a transaction that was prepared.
+ *
+ * This function is run at db startup, so re-acquiring the locks can never
+ * conflict with running transactions because there are none.  We assume
+ * that the lock state represented by the stored 2PC files is legal.
+ */
+void
+lock_twophase_recover(TransactionId xid, uint16 info,
+					  void *recdata, uint32 len)
+{
+	TwoPhaseLockRecord *rec = (TwoPhaseLockRecord *) recdata;
+	PGPROC	   *proc = TwoPhaseGetDummyProc(xid);
+	LOCKTAG	   *locktag;
+	LOCKMODE	lockmode;
+	LOCKMETHODID lockmethodid;
+	LOCK	   *lock;
+	PROCLOCK   *proclock;
+	PROCLOCKTAG proclocktag;
+	bool		found;
+	LWLockId	masterLock;
+	LockMethod	lockMethodTable;
+
+	Assert(len == sizeof(TwoPhaseLockRecord));
+	locktag = &rec->locktag;
+	lockmode = rec->lockmode;
+	lockmethodid = locktag->locktag_lockmethodid;
+
+	Assert(lockmethodid < NumLockMethods);
+	lockMethodTable = LockMethods[lockmethodid];
+	if (!lockMethodTable)
+		elog(ERROR, "unrecognized lock method: %d", lockmethodid);
+
+	masterLock = lockMethodTable->masterLock;
+
+	LWLockAcquire(masterLock, LW_EXCLUSIVE);
+
+	/*
+	 * Find or create a lock with this tag.
+	 */
+	lock = (LOCK *) hash_search(LockMethodLockHash[lockmethodid],
+								(void *) locktag,
+								HASH_ENTER_NULL, &found);
+	if (!lock)
+	{
+		LWLockRelease(masterLock);
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of shared memory"),
+		errhint("You may need to increase max_locks_per_transaction.")));
+	}
+
+	/*
+	 * If it's a new lock object, initialize it
+	 */
+	if (!found)
+	{
+		lock->grantMask = 0;
+		lock->waitMask = 0;
+		SHMQueueInit(&(lock->procLocks));
+		ProcQueueInit(&(lock->waitProcs));
+		lock->nRequested = 0;
+		lock->nGranted = 0;
+		MemSet(lock->requested, 0, sizeof(int) * MAX_LOCKMODES);
+		MemSet(lock->granted, 0, sizeof(int) * MAX_LOCKMODES);
+		LOCK_PRINT("lock_twophase_recover: new", lock, lockmode);
+	}
+	else
+	{
+		LOCK_PRINT("lock_twophase_recover: found", lock, lockmode);
+		Assert((lock->nRequested >= 0) && (lock->requested[lockmode] >= 0));
+		Assert((lock->nGranted >= 0) && (lock->granted[lockmode] >= 0));
+		Assert(lock->nGranted <= lock->nRequested);
+	}
+
+	/*
+	 * Create the hash key for the proclock table.
+	 */
+	MemSet(&proclocktag, 0, sizeof(PROCLOCKTAG));	/* must clear padding */
+	proclocktag.lock = MAKE_OFFSET(lock);
+	proclocktag.proc = MAKE_OFFSET(proc);
+
+	/*
+	 * Find or create a proclock entry with this tag
+	 */
+	proclock = (PROCLOCK *) hash_search(LockMethodProcLockHash[lockmethodid],
+										(void *) &proclocktag,
+										HASH_ENTER_NULL, &found);
+	if (!proclock)
+	{
+		/* Oops, not enough shmem for the proclock */
+		if (lock->nRequested == 0)
+		{
+			/*
+			 * There are no other requestors of this lock, so garbage-collect
+			 * the lock object.  We *must* do this to avoid a permanent leak
+			 * of shared memory, because there won't be anything to cause
+			 * anyone to release the lock object later.
+			 */
+			Assert(SHMQueueEmpty(&(lock->procLocks)));
+			if (!hash_search(LockMethodLockHash[lockmethodid],
+							 (void *) &(lock->tag),
+							 HASH_REMOVE, NULL))
+				elog(PANIC, "lock table corrupted");
+		}
+		LWLockRelease(masterLock);
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of shared memory"),
+		errhint("You may need to increase max_locks_per_transaction.")));
+	}
+
+	/*
+	 * If new, initialize the new entry
+	 */
+	if (!found)
+	{
+		proclock->holdMask = 0;
+		proclock->releaseMask = 0;
+		/* Add proclock to appropriate lists */
+		SHMQueueInsertBefore(&lock->procLocks, &proclock->lockLink);
+		SHMQueueInsertBefore(&proc->procLocks, &proclock->procLink);
+		PROCLOCK_PRINT("lock_twophase_recover: new", proclock);
+	}
+	else
+	{
+		PROCLOCK_PRINT("lock_twophase_recover: found", proclock);
+		Assert((proclock->holdMask & ~lock->grantMask) == 0);
+	}
+
+	/*
+	 * lock->nRequested and lock->requested[] count the total number of
+	 * requests, whether granted or waiting, so increment those
+	 * immediately.
+	 */
+	lock->nRequested++;
+	lock->requested[lockmode]++;
+	Assert((lock->nRequested > 0) && (lock->requested[lockmode] > 0));
+
+	/*
+	 * The dummy proc shouldn't already hold the desired lock.
+	 */
+	if (proclock->holdMask & LOCKBIT_ON(lockmode))
+		elog(ERROR, "lock %s on object %u/%u/%u is already held",
+			 lock_mode_names[lockmode],
+			 lock->tag.locktag_field1, lock->tag.locktag_field2,
+			 lock->tag.locktag_field3);
+
+	/*
+	 * We ignore any possible conflicts and just grant ourselves the lock.
+	 */
+	GrantLock(lock, proclock, lockmode);
+
+	LWLockRelease(masterLock);
+}
+
+/*
+ * 2PC processing routine for COMMIT PREPARED case.
+ *
+ * Release the prepared xact's (dummy PGPROC's) lock named by the 2PC record.
+ */
+void
+lock_twophase_postcommit(TransactionId xid, uint16 info,
+						 void *recdata, uint32 len)
+{
+	TwoPhaseLockRecord *rec = (TwoPhaseLockRecord *) recdata;
+	PGPROC	   *proc = TwoPhaseGetDummyProc(xid);
+	LOCKTAG	   *locktag;
+	LOCKMODE	lockmode;
+	LOCKMETHODID lockmethodid;
+	PROCLOCKTAG proclocktag;
+	LOCK	   *lock;
+	PROCLOCK   *proclock;
+	LWLockId	masterLock;
+	LockMethod	lockMethodTable;
+	bool		wakeupNeeded;
+
+	Assert(len == sizeof(TwoPhaseLockRecord));
+	locktag = &rec->locktag;
+	lockmode = rec->lockmode;
+	lockmethodid = locktag->locktag_lockmethodid;
+
+	Assert(lockmethodid < NumLockMethods);
+	lockMethodTable = LockMethods[lockmethodid];
+	if (!lockMethodTable)
+		elog(ERROR, "unrecognized lock method: %d", lockmethodid);
+
+	masterLock = lockMethodTable->masterLock;
+
+	LWLockAcquire(masterLock, LW_EXCLUSIVE);
+
+	/*
+	 * Re-find the lock object (it had better be there).
+	 */
+	lock = (LOCK *) hash_search(LockMethodLockHash[lockmethodid],
+								(void *) locktag,
+								HASH_FIND, NULL);
+	if (!lock)
+		elog(PANIC, "failed to re-find shared lock object");
+
+	/*
+	 * Re-find the proclock object (ditto).
+	 */
+	MemSet(&proclocktag, 0, sizeof(PROCLOCKTAG));	/* must clear padding */
+	proclocktag.lock = MAKE_OFFSET(lock);
+	proclocktag.proc = MAKE_OFFSET(proc);
+	proclock = (PROCLOCK *) hash_search(LockMethodProcLockHash[lockmethodid],
+										(void *) &proclocktag,
+										HASH_FIND, NULL);
+	if (!proclock)
+		elog(PANIC, "failed to re-find shared proclock object");
+
+	/*
+	 * Double-check that we are actually holding a lock of the type we
+	 * want to release; if not, just emit a WARNING and give up.
+	 */
+	if (!(proclock->holdMask & LOCKBIT_ON(lockmode)))
+	{
+		PROCLOCK_PRINT("lock_twophase_postcommit: WRONGTYPE", proclock);
+		LWLockRelease(masterLock);
+		elog(WARNING, "you don't own a lock of type %s",
+			 lock_mode_names[lockmode]);
+		return;
+	}
+
+	/*
+	 * Do the releasing.  CleanUpLock will wake any waiters that can now run.
+	 */
+	wakeupNeeded = UnGrantLock(lock, lockmode, proclock, lockMethodTable);
+
+	CleanUpLock(lockmethodid, lock, proclock, wakeupNeeded);
+
+	LWLockRelease(masterLock);
+}
+
+/*
+ * 2PC processing routine for ROLLBACK PREPARED case.
+ *
+ * Lock release is the same as in the COMMIT case, so just call that routine.
+ */
+void
+lock_twophase_postabort(TransactionId xid, uint16 info,
+						void *recdata, uint32 len)
+{
+	lock_twophase_postcommit(xid, info, recdata, len);
+}
--- a/src/backend/storage/lmgr/proc.c
+++ b/src/backend/storage/lmgr/proc.c
@@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/lmgr/proc.c,v 1.159 2005/06/14 22:15:32 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/lmgr/proc.c,v 1.160 2005/06/17 22:32:45 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -92,13 +92,13 @@ static bool CheckStatementTimeout(void);
 * Report shared-memory space needed by InitProcGlobal.
 */
 int
-ProcGlobalShmemSize(int maxBackends)
+ProcGlobalShmemSize(void)
 {
 	int			size = 0;

 	size += MAXALIGN(sizeof(PROC_HDR)); /* ProcGlobal */
 	size += MAXALIGN(NUM_DUMMY_PROCS * sizeof(PGPROC));	/* DummyProcs */
-	size += MAXALIGN(maxBackends * sizeof(PGPROC));		/* MyProcs */
+	size += MAXALIGN(MaxBackends * sizeof(PGPROC));		/* MyProcs */
 	size += MAXALIGN(sizeof(slock_t)); /* ProcStructLock */

 	return size;
@@ -108,10 +108,10 @@ ProcGlobalShmemSize(int maxBackends)
 * Report number of semaphores needed by InitProcGlobal.
 */
 int
-ProcGlobalSemas(int maxBackends)
+ProcGlobalSemas(void)
 {
 	/* We need a sema per backend, plus one for each dummy process. */
-	return maxBackends + NUM_DUMMY_PROCS;
+	return MaxBackends + NUM_DUMMY_PROCS;
 }

 /*
@@ -134,7 +134,7 @@ ProcGlobalSemas(int maxBackends)
 *	  postmaster, not in backends.
 */
 void
-InitProcGlobal(int maxBackends)
+InitProcGlobal(void)
 {
 	bool		foundProcGlobal,
 				foundDummy;
@@ -170,13 +170,13 @@ InitProcGlobal(int maxBackends)
 		 * Pre-create the PGPROC structures and create a semaphore for
 		 * each.
 		 */
-		procs = (PGPROC *) ShmemAlloc(maxBackends * sizeof(PGPROC));
+		procs = (PGPROC *) ShmemAlloc(MaxBackends * sizeof(PGPROC));
 		if (!procs)
 			ereport(FATAL,
 					(errcode(ERRCODE_OUT_OF_MEMORY),
 					 errmsg("out of shared memory")));
-		MemSet(procs, 0, maxBackends * sizeof(PGPROC));
-		for (i = 0; i < maxBackends; i++)
+		MemSet(procs, 0, MaxBackends * sizeof(PGPROC));
+		for (i = 0; i < MaxBackends; i++)
 		{
 			PGSemaphoreCreate(&(procs[i].sem));
 			procs[i].links.next = ProcGlobal->freeProcs;
@@ -254,7 +254,6 @@ InitProcess(void)
 	MyProc->xmin = InvalidTransactionId;
 	MyProc->pid = MyProcPid;
 	MyProc->databaseId = MyDatabaseId;
-	MyProc->logRec.xrecoff = 0;
 	MyProc->lwWaiting = false;
 	MyProc->lwExclusive = false;
 	MyProc->lwWaitLink = NULL;
@@ -265,7 +264,7 @@ InitProcess(void)
 	/*
 	 * Add our PGPROC to the PGPROC array in shared memory.
 	 */
-	ProcArrayAddMyself();
+	ProcArrayAdd(MyProc);

 	/*
 	 * Arrange to clean up at backend exit.
@@ -332,7 +331,6 @@ InitDummyProcess(int proctype)
 	MyProc->xid = InvalidTransactionId;
 	MyProc->xmin = InvalidTransactionId;
 	MyProc->databaseId = MyDatabaseId;
-	MyProc->logRec.xrecoff = 0;
 	MyProc->lwWaiting = false;
 	MyProc->lwExclusive = false;
 	MyProc->lwWaitLink = NULL;
@@ -352,6 +350,35 @@ InitDummyProcess(int proctype)
 	PGSemaphoreReset(&MyProc->sem);
 }

+/*
+ * Report whether the free list holds at least N PGPROC objects.
+ *
+ * Note: the list is walked under ProcStructLock, so keep N small.
+ */
+bool
+HaveNFreeProcs(int n)
+{
+	/* use volatile pointer to prevent code rearrangement */
+	volatile PROC_HDR *hdr = ProcGlobal;
+	SHMEM_OFFSET cur;
+
+	SpinLockAcquire(ProcStructLock);
+
+	/* Step along the free list, one entry per PGPROC still wanted. */
+	for (cur = hdr->freeProcs;
+		 n > 0 && cur != INVALID_OFFSET;
+		 n--)
+	{
+		PGPROC	   *entry = (PGPROC *) MAKE_PTR(cur);
+
+		cur = entry->links.next;
+	}
+
+	SpinLockRelease(ProcStructLock);
+
+	return (n <= 0);
+}
+
 /*
 * Cancel any pending wait for lock, when aborting a transaction.
 *
@@ -478,7 +505,7 @@ ProcKill(int code, Datum arg)
 #endif

 	/* Remove our PGPROC from the PGPROC array in shared memory */
-	ProcArrayRemoveMyself();
+	ProcArrayRemove(MyProc);

 	SpinLockAcquire(ProcStructLock);

--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -11,7 +11,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.89 2005/06/06 20:22:58 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.90 2005/06/17 22:32:46 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -434,7 +434,7 @@ smgrscheduleunlink(SMgrRelation reln, bool isTemp)
 *		during transactional operations, since it can't be undone.
 *
 *		If isRedo is true, it is okay for the underlying file to be gone
- *		already.  (In practice isRedo will always be true.)
+ *		already.
 *
 * This also implies smgrclose() on the SMgrRelation object.
 */
@@ -677,6 +677,30 @@ smgrimmedsync(SMgrRelation reln)
 						reln->smgr_rnode.relNode)));
 }

+
+/*
+ *	PostPrepare_smgr -- Clean up after a successful PREPARE
+ *
+ * Discard the in-memory list of pending relation deletes.  Those deletes
+ * are already recorded in the 2PC state file, so from here on they are
+ * no longer smgr's responsibility.
+ */
+void
+PostPrepare_smgr(void)
+{
+	PendingRelDelete *entry;
+
+	/* Pop entries off the head until the list is empty. */
+	while ((entry = pendingDeletes) != NULL)
+	{
+		/* unlink first, so the list never points at freed memory */
+		pendingDeletes = entry->next;
+		/* must explicitly free the list entry */
+		pfree(entry);
+	}
+}
+
+
 /*
 *	smgrDoPendingDeletes() -- Take care of relation deletes at end of xact.
 *