diff --git a/src/backend/access/hash/README b/src/backend/access/hash/README
index 3ff70cde3ce..a894236885f 100644
--- a/src/backend/access/hash/README
+++ b/src/backend/access/hash/README
@@ -1,4 +1,4 @@
-$PostgreSQL: pgsql/src/backend/access/hash/README,v 1.4 2003/11/29 19:51:40 pgsql Exp $
+$PostgreSQL: pgsql/src/backend/access/hash/README,v 1.4.8.1 2007/04/19 20:24:10 tgl Exp $
 
 This directory contains an implementation of hash indexing for Postgres.
 
@@ -72,6 +72,18 @@ index, and preparing to allocate additional overflow pages after those
 bucket pages.  hashm_spares[] entries before S cannot change anymore,
 since that would require moving already-created bucket pages.
 
+The last page nominally used by the index is always determinable from
+hashm_spares[S].  To avoid complaints from smgr, the logical EOF as seen by
+the filesystem and smgr must always be greater than or equal to this page.
+We have to allow the case "greater than" because it's possible that during
+an index extension we crash after allocating filesystem space and before
+updating the metapage.  Note that on filesystems that allow "holes" in
+files, it's entirely likely that pages before the logical EOF are not yet
+allocated: when we allocate a new splitpoint's worth of bucket pages, we
+physically zero the last such page to force the EOF up, and the first such
+page will be used immediately, but the intervening pages are not written
+until needed.
+
 Since overflow pages may be recycled if enough tuples are deleted from their
 bucket, we need a way to keep track of currently-free overflow pages.  The
 state of each overflow page (0 = available, 1 = not available)
@@ -305,6 +317,10 @@ we can just error out without any great harm being done.
 Free space management
 ---------------------
 
+(Question: why is this so complicated?  Why not just have a linked list
+of free pages with the list head in the metapage?  It's not like we
+avoid needing to modify the metapage with all this.)
+
 Free space management consists of two sub-algorithms, one for reserving
 an overflow page to add to a bucket chain, and one for returning an empty
 overflow page to the free pool.
diff --git a/src/backend/access/hash/hashovfl.c b/src/backend/access/hash/hashovfl.c
index 4fe0301c75d..f95e1c6d7f8 100644
--- a/src/backend/access/hash/hashovfl.c
+++ b/src/backend/access/hash/hashovfl.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *    $PostgreSQL: pgsql/src/backend/access/hash/hashovfl.c,v 1.53 2006/11/19 21:33:22 tgl Exp $
+ *    $PostgreSQL: pgsql/src/backend/access/hash/hashovfl.c,v 1.53.2.1 2007/04/19 20:24:10 tgl Exp $
  *
  * NOTES
  *    Overflow pages look like ordinary relation pages.
@@ -272,19 +272,12 @@ _hash_getovflpage(Relation rel, Buffer metabuf)
     blkno = bitno_to_blkno(metap, bit);
 
     /*
-     * We have to fetch the page with P_NEW to ensure smgr's idea of the
+     * Fetch the page with _hash_getnewbuf to ensure smgr's idea of the
      * relation length stays in sync with ours.  XXX It's annoying to do this
      * with metapage write lock held; would be better to use a lock that
-     * doesn't block incoming searches.  Best way to fix it would be to stop
-     * maintaining hashm_spares[hashm_ovflpoint] and rely entirely on the
-     * smgr relation length to track where new overflow pages come from;
-     * then we could release the metapage before we do the smgrextend.
-     * FIXME later (not in beta...)
+     * doesn't block incoming searches.
      */
-    newbuf = _hash_getbuf(rel, P_NEW, HASH_WRITE);
-    if (BufferGetBlockNumber(newbuf) != blkno)
-        elog(ERROR, "unexpected hash relation size: %u, should be %u",
-             BufferGetBlockNumber(newbuf), blkno);
+    newbuf = _hash_getnewbuf(rel, blkno, HASH_WRITE);
 
     metap->hashm_spares[splitnum]++;
 
@@ -507,19 +500,14 @@ _hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno)
     /*
      * It is okay to write-lock the new bitmap page while holding metapage
      * write lock, because no one else could be contending for the new page.
-     * Also, the metapage lock makes it safe to extend the index using P_NEW,
-     * which we want to do to ensure the smgr's idea of the relation size
-     * stays in step with ours.
+     * Also, the metapage lock makes it safe to extend the index using
+     * _hash_getnewbuf.
      *
      * There is some loss of concurrency in possibly doing I/O for the new
      * page while holding the metapage lock, but this path is taken so seldom
      * that it's not worth worrying about.
      */
-    buf = _hash_getbuf(rel, P_NEW, HASH_WRITE);
-    if (BufferGetBlockNumber(buf) != blkno)
-        elog(ERROR, "unexpected hash relation size: %u, should be %u",
-             BufferGetBlockNumber(buf), blkno);
-
+    buf = _hash_getnewbuf(rel, blkno, HASH_WRITE);
     pg = BufferGetPage(buf);
 
     /* initialize the page */
diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c
index 0f643836a1c..f08404d0536 100644
--- a/src/backend/access/hash/hashpage.c
+++ b/src/backend/access/hash/hashpage.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *    $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.61 2006/11/19 21:33:23 tgl Exp $
+ *    $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.61.2.1 2007/04/19 20:24:10 tgl Exp $
  *
  * NOTES
  *    Postgres hash pages look like ordinary relation pages.  The opaque
@@ -36,7 +36,8 @@
 #include "utils/lsyscache.h"
 
 
-static BlockNumber _hash_alloc_buckets(Relation rel, uint32 nblocks);
+static bool _hash_alloc_buckets(Relation rel, BlockNumber firstblock,
+                    uint32 nblocks);
 static void _hash_splitbucket(Relation rel, Buffer metabuf,
                   Bucket obucket, Bucket nbucket,
                   BlockNumber start_oblkno,
@@ -104,8 +105,9 @@ _hash_droplock(Relation rel, BlockNumber whichlock, int access)
  *        requested buffer and its reference count has been incremented
  *        (ie, the buffer is "locked and pinned").
  *
- *        blkno == P_NEW is allowed, but it is caller's responsibility to
- *        ensure that only one process can extend the index at a time.
+ *        P_NEW is disallowed because this routine should only be used
+ *        to access pages that are known to be before the filesystem EOF.
+ *        Extending the index should be done with _hash_getnewbuf.
  *
  *        All call sites should call either _hash_checkpage or _hash_pageinit
  *        on the returned page, depending on whether the block is expected
@@ -116,6 +118,9 @@ _hash_getbuf(Relation rel, BlockNumber blkno, int access)
 {
     Buffer        buf;
 
+    if (blkno == P_NEW)
+        elog(ERROR, "hash AM does not use P_NEW");
+
     buf = ReadBuffer(rel, blkno);
 
     if (access != HASH_NOLOCK)
@@ -125,6 +130,51 @@ _hash_getbuf(Relation rel, BlockNumber blkno, int access)
     return buf;
 }
 
+/*
+ *    _hash_getnewbuf() -- Get a new page at the end of the index.
+ *
+ *        This has the same API as _hash_getbuf, except that we are adding
+ *        a page to the index, and hence expect the page to be past the
+ *        logical EOF.  (However, we have to support the case where it isn't,
+ *        since a prior try might have crashed after extending the filesystem
+ *        EOF but before updating the metapage to reflect the added page.)
+ *
+ *        It is caller's responsibility to ensure that only one process can
+ *        extend the index at a time.
+ *
+ *        All call sites should call _hash_pageinit on the returned page.
+ *        Also, it's difficult to imagine why access would not be HASH_WRITE.
+ */
+Buffer
+_hash_getnewbuf(Relation rel, BlockNumber blkno, int access)
+{
+    BlockNumber nblocks = RelationGetNumberOfBlocks(rel);
+    Buffer        buf;
+
+    if (blkno == P_NEW)
+        elog(ERROR, "hash AM does not use P_NEW");
+    if (blkno > nblocks)
+        elog(ERROR, "access to noncontiguous page in hash index \"%s\"",
+             RelationGetRelationName(rel));
+
+    /* smgr insists we use P_NEW to extend the relation */
+    if (blkno == nblocks)
+    {
+        buf = ReadBuffer(rel, P_NEW);
+        if (BufferGetBlockNumber(buf) != blkno)
+            elog(ERROR, "unexpected hash relation size: %u, should be %u",
+                 BufferGetBlockNumber(buf), blkno);
+    }
+    else
+        buf = ReadBuffer(rel, blkno);
+
+    if (access != HASH_NOLOCK)
+        LockBuffer(buf, access);
+
+    /* ref count and lock type are correct */
+    return buf;
+}
+
 /*
  *    _hash_relbuf() -- release a locked buffer.
  *
@@ -238,12 +288,11 @@ _hash_metapinit(Relation rel)
 
     /*
      * We initialize the metapage, the first two bucket pages, and the
-     * first bitmap page in sequence, using P_NEW to cause smgrextend()
-     * calls to occur.  This ensures that the smgr level has the right
-     * idea of the physical index length.
+     * first bitmap page in sequence, using _hash_getnewbuf to cause
+     * smgrextend() calls to occur.  This ensures that the smgr level
+     * has the right idea of the physical index length.
      */
-    metabuf = _hash_getbuf(rel, P_NEW, HASH_WRITE);
-    Assert(BufferGetBlockNumber(metabuf) == HASH_METAPAGE);
+    metabuf = _hash_getnewbuf(rel, HASH_METAPAGE, HASH_WRITE);
     pg = BufferGetPage(metabuf);
     _hash_pageinit(pg, BufferGetPageSize(metabuf));
 
@@ -296,8 +345,7 @@ _hash_metapinit(Relation rel)
      */
     for (i = 0; i <= 1; i++)
     {
-        buf = _hash_getbuf(rel, P_NEW, HASH_WRITE);
-        Assert(BufferGetBlockNumber(buf) == BUCKET_TO_BLKNO(metap, i));
+        buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i), HASH_WRITE);
         pg = BufferGetPage(buf);
         _hash_pageinit(pg, BufferGetPageSize(buf));
         pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
@@ -345,7 +393,6 @@ _hash_expandtable(Relation rel, Buffer metabuf)
     Bucket        old_bucket;
     Bucket        new_bucket;
     uint32        spare_ndx;
-    BlockNumber firstblock = InvalidBlockNumber;
     BlockNumber start_oblkno;
     BlockNumber start_nblkno;
     uint32        maxbucket;
@@ -397,11 +444,45 @@ _hash_expandtable(Relation rel, Buffer metabuf)
     if (metap->hashm_maxbucket >= (uint32) 0x7FFFFFFE)
         goto fail;
 
+    /*
+     * Determine which bucket is to be split, and attempt to lock the old
+     * bucket.  If we can't get the lock, give up.
+     *
+     * The lock protects us against other backends, but not against our own
+     * backend.  Must check for active scans separately.
+     */
+    new_bucket = metap->hashm_maxbucket + 1;
+
+    old_bucket = (new_bucket & metap->hashm_lowmask);
+
+    start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket);
+
+    if (_hash_has_active_scan(rel, old_bucket))
+        goto fail;
+
+    if (!_hash_try_getlock(rel, start_oblkno, HASH_EXCLUSIVE))
+        goto fail;
+
+    /*
+     * Likewise lock the new bucket (should never fail).
+     *
+     * Note: it is safe to compute the new bucket's blkno here, even though
+     * we may still need to update the BUCKET_TO_BLKNO mapping.  This is
+     * because the current value of hashm_spares[hashm_ovflpoint] correctly
+     * shows where we are going to put a new splitpoint's worth of buckets.
+     */
+    start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket);
+
+    if (_hash_has_active_scan(rel, new_bucket))
+        elog(ERROR, "scan in progress on supposedly new bucket");
+
+    if (!_hash_try_getlock(rel, start_nblkno, HASH_EXCLUSIVE))
+        elog(ERROR, "could not get lock on supposedly new bucket");
+
     /*
      * If the split point is increasing (hashm_maxbucket's log base 2
      * increases), we need to allocate a new batch of bucket pages.
      */
-    new_bucket = metap->hashm_maxbucket + 1;
     spare_ndx = _hash_log2(new_bucket + 1);
     if (spare_ndx > metap->hashm_ovflpoint)
     {
@@ -412,34 +493,15 @@ _hash_expandtable(Relation rel, Buffer metabuf)
          * this maps one-to-one to blocks required, but someday we may need
          * a more complicated calculation here.
          */
-        firstblock = _hash_alloc_buckets(rel, new_bucket);
-        if (firstblock == InvalidBlockNumber)
-            goto fail;          /* can't split due to BlockNumber overflow */
+        if (!_hash_alloc_buckets(rel, start_nblkno, new_bucket))
+        {
+            /* can't split due to BlockNumber overflow */
+            _hash_droplock(rel, start_oblkno, HASH_EXCLUSIVE);
+            _hash_droplock(rel, start_nblkno, HASH_EXCLUSIVE);
+            goto fail;
+        }
     }
 
-    /*
-     * Determine which bucket is to be split, and attempt to lock the old
-     * bucket.  If we can't get the lock, give up.
-     *
-     * The lock protects us against other backends, but not against our own
-     * backend.  Must check for active scans separately.
-     *
-     * Ideally we would lock the new bucket too before proceeding, but if we
-     * are about to cross a splitpoint then the BUCKET_TO_BLKNO mapping isn't
-     * correct yet.  For simplicity we update the metapage first and then
-     * lock.  This should be okay because no one else should be trying to lock
-     * the new bucket yet...
-     */
-    old_bucket = (new_bucket & metap->hashm_lowmask);
-
-    start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket);
-
-    if (_hash_has_active_scan(rel, old_bucket))
-        goto fail;
-
-    if (!_hash_try_getlock(rel, start_oblkno, HASH_EXCLUSIVE))
-        goto fail;
-
     /*
      * Okay to proceed with split.  Update the metapage bucket mapping info.
      *
@@ -472,20 +534,6 @@ _hash_expandtable(Relation rel, Buffer metabuf)
         metap->hashm_ovflpoint = spare_ndx;
     }
 
-    /* now we can compute the new bucket's primary block number */
-    start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket);
-
-    /* if we added a splitpoint, should match result of _hash_alloc_buckets */
-    if (firstblock != InvalidBlockNumber &&
-        firstblock != start_nblkno)
-        elog(PANIC, "unexpected hash relation size: %u, should be %u",
-             firstblock, start_nblkno);
-
-    Assert(!_hash_has_active_scan(rel, new_bucket));
-
-    if (!_hash_try_getlock(rel, start_nblkno, HASH_EXCLUSIVE))
-        elog(PANIC, "could not get lock on supposedly new bucket");
-
     /* Done mucking with metapage */
     END_CRIT_SECTION();
 
@@ -551,23 +599,16 @@ fail:
  * for the purpose.  OTOH, adding a splitpoint is a very infrequent operation,
  * so it may not be worth worrying about.
  *
- * Returns the first block number in the new splitpoint's range, or
- * InvalidBlockNumber if allocation failed due to BlockNumber overflow.
+ * Returns TRUE if successful, or FALSE if allocation failed due to
+ * BlockNumber overflow.
  */
-static BlockNumber
-_hash_alloc_buckets(Relation rel, uint32 nblocks)
+static bool
+_hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)
 {
-    BlockNumber firstblock;
     BlockNumber lastblock;
     BlockNumber endblock;
     char        zerobuf[BLCKSZ];
 
-    /*
-     * Since we hold metapage lock, no one else is either splitting or
-     * allocating a new page in _hash_getovflpage(); hence it's safe to
-     * assume that the relation length isn't changing under us.
-     */
-    firstblock = RelationGetNumberOfBlocks(rel);
     lastblock = firstblock + nblocks - 1;
 
     /*
@@ -575,12 +616,12 @@ _hash_alloc_buckets(Relation rel, uint32 nblocks)
      * extend the index anymore.
      */
     if (lastblock < firstblock || lastblock == InvalidBlockNumber)
-        return InvalidBlockNumber;
-
-    /* Note: we assume RelationGetNumberOfBlocks did RelationOpenSmgr for us */
+        return false;
 
     MemSet(zerobuf, 0, sizeof(zerobuf));
 
+    RelationOpenSmgr(rel);
+
     /*
      * XXX If the extension results in creation of new segment files,
      * we have to make sure that each non-last file is correctly filled out to
@@ -597,7 +638,7 @@ _hash_alloc_buckets(Relation rel, uint32 nblocks)
 
     smgrextend(rel->rd_smgr, lastblock, zerobuf, rel->rd_istemp);
 
-    return firstblock;
+    return true;
 }
 
 
diff --git a/src/include/access/hash.h b/src/include/access/hash.h
index 77ed99baa68..016e51d09f9 100644
--- a/src/include/access/hash.h
+++ b/src/include/access/hash.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/hash.h,v 1.73 2006/07/13 16:49:19 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/access/hash.h,v 1.73.2.1 2007/04/19 20:24:10 tgl Exp $
  *
  * NOTES
  *        modeled after Margo Seltzer's hash implementation for unix.
@@ -281,6 +281,7 @@ extern void _hash_getlock(Relation rel, BlockNumber whichlock, int access);
 extern bool _hash_try_getlock(Relation rel, BlockNumber whichlock, int access);
 extern void _hash_droplock(Relation rel, BlockNumber whichlock, int access);
 extern Buffer _hash_getbuf(Relation rel, BlockNumber blkno, int access);
+extern Buffer _hash_getnewbuf(Relation rel, BlockNumber blkno, int access);
 extern void _hash_relbuf(Relation rel, Buffer buf);
 extern void _hash_dropbuf(Relation rel, Buffer buf);
 extern void _hash_wrtbuf(Relation rel, Buffer buf);
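
The arithmetic behind the README's new invariant can be made concrete with a
short standalone sketch (not part of the patch above).  It assumes a
simplified model: once splitpoint S has been allocated, the index nominally
consists of the metapage (block 0), 2^S bucket pages, and the non-bucket
(overflow/bitmap) pages counted by hashm_spares[S] interleaved among them, so
the last nominally used block is 2^S + hashm_spares[S]; the physical EOF may
legitimately be beyond that, which is exactly the case _hash_getnewbuf now
tolerates.  All *_demo names below are hypothetical, not PostgreSQL symbols.

/*
 * Illustrative sketch only -- not part of the patch.  Under the assumption
 * that spares_demo[S] counts every non-bucket page allocated so far, the
 * last block nominally used at splitpoint S is 2^S + spares_demo[S].
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define HASH_MAX_SPLITPOINTS_DEMO 32

/* Last block nominally used when the highest allocated splitpoint is S. */
static uint32_t
last_used_block_demo(uint32_t S, const uint32_t *spares_demo)
{
    return ((uint32_t) 1 << S) + spares_demo[S];
}

int
main(void)
{
    /* pretend values: 1 non-bucket page by splitpoint 1, 3 by splitpoints 2-3 */
    uint32_t    spares_demo[HASH_MAX_SPLITPOINTS_DEMO] = {0, 1, 3, 3};
    uint32_t    S;

    /* the index starts with two buckets (see _hash_metapinit), i.e. splitpoint 1 */
    for (S = 1; S <= 3; S++)
        printf("splitpoint %" PRIu32 ": %" PRIu32 " buckets, last nominal block %" PRIu32 "\n",
               S, (uint32_t) 1 << S, last_used_block_demo(S, spares_demo));
    return 0;
}

With the pretend spares values above, the sketch prints last nominal blocks
3, 7, and 11 for splitpoints 1 through 3; the filesystem EOF must always be
at or beyond that block, which is what _hash_getnewbuf checks and enforces.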