Back-port changes of Jan 16 and 17 to "revoke" pending fsync requests during

DROP TABLE and DROP DATABASE. Should prevent unexpected "permission denied" failures on Windows, and is cleaner on other platforms too since we no longer have to take it on faith that ENOENT is okay during an fsync attempt. Patched as far back as 8.1; per recent discussion I think we are not going to worry about Windows-specific issues in 8.0 anymore.
2025-12-02 23:42:46 +03:00 · 2007-01-27 20:15:47 +00:00
parent 043fcd6616
commit cb476c1ec3
4 changed files with 283 additions and 101 deletions
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -13,7 +13,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/commands/dbcommands.c,v 1.187 2006/11/05 22:42:08 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/commands/dbcommands.c,v 1.187.2.1 2007/01/27 20:15:47 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -40,6 +40,7 @@
 #include "postmaster/bgwriter.h"
 #include "storage/freespace.h"
 #include "storage/procarray.h"
 #include "storage/smgr.h"
 #include "utils/acl.h"
 #include "utils/builtins.h"
 #include "utils/flatfiles.h"
@@ -643,6 +644,12 @@ dropdb(const char *dbname, bool missing_ok)
 	 */
 	FreeSpaceMapForgetDatabase(db_id);
 	/*
 	 * Tell bgwriter to forget any pending fsync requests for files in the
 	 * database; else it'll fail at next checkpoint.
 	 */
 	ForgetDatabaseFsyncRequests(db_id);
 	/*
 	 * On Windows, force a checkpoint so that the bgwriter doesn't hold any
 	 * open files, which would cause rmdir() to fail.
--- a/src/backend/postmaster/bgwriter.c
+++ b/src/backend/postmaster/bgwriter.c
@@ -2,7 +2,7 @@
 *
 * bgwriter.c
 *
- * The background writer (bgwriter) is new in Postgres 8.0.  It attempts
+ * The background writer (bgwriter) is new as of Postgres 8.0.  It attempts
 * to keep regular backends from having to write out dirty shared buffers
 * (which they would only do when needing to free a shared buffer to read in
 * another page).  In the best scenario all writes from shared buffers will
@@ -37,7 +37,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.33 2006/12/01 19:55:28 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.33.2.1 2007/01/27 20:15:47 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -103,8 +103,8 @@
 typedef struct
 {
 	RelFileNode rnode;
-	BlockNumber segno;
+	BlockNumber segno;			/* see md.c for special values */
-	/* might add a request-type field later */
+	/* might add a real request-type field later; not needed yet */
 } BgWriterRequest;
 typedef struct
@@ -700,6 +700,11 @@ RequestCheckpoint(bool waitforit, bool warnontime)
 * the backend calls this routine to pass over knowledge that the relation
 * is dirty and must be fsync'd before next checkpoint.
 *
 * segno specifies which segment (not block!) of the relation needs to be
 * fsync'd.  (Since the valid range is much less than BlockNumber, we can
 * use high values for special flags; that's all internal to md.c, which
 * see for details.)
 *
 * If we are unable to pass over the request (at present, this can happen
 * if the shared memory queue is full), we return false.  That forces
 * the backend to do its own fsync.  We hope that will be even more seldom.
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.123 2006/11/20 01:07:56 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.123.2.1 2007/01/27 20:15:47 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -30,6 +30,23 @@
 /* interval for calling AbsorbFsyncRequests in mdsync */
 #define FSYNCS_PER_ABSORB		10
 /* special values for the segno arg to RememberFsyncRequest */
 #define FORGET_RELATION_FSYNC	(InvalidBlockNumber)
 #define FORGET_DATABASE_FSYNC	(InvalidBlockNumber-1)
 /*
 * On Windows, we have to interpret EACCES as possibly meaning the same as
 * ENOENT, because if a file is unlinked-but-not-yet-gone on that platform,
 * that's what you get.  Ugh.  This code is designed so that we don't
 * actually believe these cases are okay without further evidence (namely,
 * a pending fsync request getting revoked ... see mdsync).
 */
 #ifndef WIN32
 #define FILE_POSSIBLY_DELETED(err)  ((err) == ENOENT)
 #else
 #define FILE_POSSIBLY_DELETED(err)  ((err) == ENOENT || (err) == EACCES)
 #endif
 /*
 *	The magnetic disk storage manager keeps track of open file
 *	descriptors in its own descriptor pool.  This is done to make it
@@ -92,9 +109,8 @@ static MemoryContext MdCxt;		/* context for all md.c allocations */
 * we keep track of pending fsync operations: we need to remember all relation
 * segments that have been written since the last checkpoint, so that we can
 * fsync them down to disk before completing the next checkpoint.  This hash
- * table remembers the pending operations.	We use a hash table not because
+ * table remembers the pending operations.	We use a hash table mostly as
- * we want to look up individual operations, but simply as a convenient way
+ * a convenient way of eliminating duplicate requests.
 * of eliminating duplicate requests.
 *
 * (Regular backends do not track pending operations locally, but forward
 * them to the bgwriter.)
@@ -103,6 +119,12 @@ typedef struct
 {
 	RelFileNode rnode;			/* the targeted relation */
 	BlockNumber segno;			/* which segment */
 } PendingOperationTag;
 typedef struct
 {
 	PendingOperationTag tag;	/* hash table key (must be first!) */
 	int			failures;		/* number of failed attempts to fsync */
 } PendingOperationEntry;
 static HTAB *pendingOpsTable = NULL;
@@ -145,7 +167,7 @@ mdinit(void)
 		HASHCTL		hash_ctl;
 		MemSet(&hash_ctl, 0, sizeof(hash_ctl));
-		hash_ctl.keysize = sizeof(PendingOperationEntry);
+		hash_ctl.keysize = sizeof(PendingOperationTag);
 		hash_ctl.entrysize = sizeof(PendingOperationEntry);
 		hash_ctl.hash = tag_hash;
 		hash_ctl.hcxt = MdCxt;
@@ -228,6 +250,12 @@ mdunlink(RelFileNode rnode, bool isRedo)
 	int			save_errno = 0;
 	char	   *path;
 	/*
 	 * We have to clean out any pending fsync requests for the doomed relation,
 	 * else the next mdsync() will fail.
 	 */
 	ForgetRelationFsyncRequests(rnode);
 	path = relpath(rnode);
 	/* Delete the first segment, or only segment if not doing segmenting */
@@ -374,7 +402,7 @@ mdopen(SMgrRelation reln, bool allowNotFound)
 		if (fd < 0)
 		{
 			pfree(path);
-			if (allowNotFound && errno == ENOENT)
+			if (allowNotFound && FILE_POSSIBLY_DELETED(errno))
 				return NULL;
 			ereport(ERROR,
 					(errcode_for_file_access(),
@@ -727,18 +755,33 @@ mdimmedsync(SMgrRelation reln)
 bool
 mdsync(void)
 {
-	HASH_SEQ_STATUS hstat;
+	bool		need_retry;
 	PendingOperationEntry *entry;
 	int			absorb_counter;
 	if (!pendingOpsTable)
 		return false;
 	/*
 	 * The fsync table could contain requests to fsync relations that have
 	 * been deleted (unlinked) by the time we get to them.  Rather than
 	 * just hoping an ENOENT (or EACCES on Windows) error can be ignored,
 	 * what we will do is retry the whole process after absorbing fsync
 	 * request messages again.  Since mdunlink() queues a "revoke" message
 	 * before actually unlinking, the fsync request is guaranteed to be gone
 	 * the second time if it really was this case.  DROP DATABASE likewise
 	 * has to tell us to forget fsync requests before it starts deletions.
 	 */
 	do {
 		HASH_SEQ_STATUS hstat;
 		PendingOperationEntry *entry;
 		int			absorb_counter;
 		need_retry = false;
 		/*
 		 * If we are in the bgwriter, the sync had better include all fsync
-	 * requests that were queued by backends before the checkpoint REDO point
+		 * requests that were queued by backends before the checkpoint REDO
-	 * was determined.	We go that a little better by accepting all requests
+		 * point was determined. We go that a little better by accepting all
-	 * queued up to the point where we start fsync'ing.
+		 * requests queued up to the point where we start fsync'ing.
 		 */
 		AbsorbFsyncRequests();
@@ -747,9 +790,9 @@ mdsync(void)
 		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
 		{
 			/*
-		 * If fsync is off then we don't have to bother opening the file at
+			 * If fsync is off then we don't have to bother opening the file
-		 * all.  (We delay checking until this point so that changing fsync on
+			 * at all.  (We delay checking until this point so that changing
-		 * the fly behaves sensibly.)
+			 * fsync on the fly behaves sensibly.)
 			 */
 			if (enableFsync)
 			{
@@ -757,67 +800,85 @@ mdsync(void)
 				MdfdVec    *seg;
 				/*
-			 * If in bgwriter, absorb pending requests every so often to
+				 * If in bgwriter, we want to absorb pending requests every so
-			 * prevent overflow of the fsync request queue.  The hashtable
+				 * often to prevent overflow of the fsync request queue.  This
-			 * code does not specify whether entries added by this will be
+				 * could result in deleting the current entry out from under
-			 * visited by our search, but we don't really care: it's OK if we
+				 * our hashtable scan, so the procedure is to fall out of the
-			 * do, and OK if we don't.
+				 * scan and start over from the top of the function.
 				 */
 				if (--absorb_counter <= 0)
 				{
-				AbsorbFsyncRequests();
+					need_retry = true;
-				absorb_counter = FSYNCS_PER_ABSORB;
+					break;
 				}
 				/*
-			 * Find or create an smgr hash entry for this relation. This may
+				 * Find or create an smgr hash entry for this relation. This
-			 * seem a bit unclean -- md calling smgr?  But it's really the
+				 * may seem a bit unclean -- md calling smgr?  But it's really
-			 * best solution.  It ensures that the open file reference isn't
+				 * the best solution.  It ensures that the open file reference
-			 * permanently leaked if we get an error here. (You may say "but
+				 * isn't permanently leaked if we get an error here. (You may
-			 * an unreferenced SMgrRelation is still a leak!" Not really,
+				 * say "but an unreferenced SMgrRelation is still a leak!" Not
-			 * because the only case in which a checkpoint is done by a
+				 * really, because the only case in which a checkpoint is done
-			 * process that isn't about to shut down is in the bgwriter, and
+				 * by a process that isn't about to shut down is in the
-			 * it will periodically do smgrcloseall().	This fact justifies
+				 * bgwriter, and it will periodically do smgrcloseall(). This
-			 * our not closing the reln in the success path either, which is a
+				 * fact justifies our not closing the reln in the success path
-			 * good thing since in non-bgwriter cases we couldn't safely do
+				 * either, which is a good thing since in non-bgwriter cases
-			 * that.)  Furthermore, in many cases the relation will have been
+				 * we couldn't safely do that.)  Furthermore, in many cases
-			 * dirtied through this same smgr relation, and so we can save a
+				 * the relation will have been dirtied through this same smgr
-			 * file open/close cycle.
+				 * relation, and so we can save a file open/close cycle.
 				 */
-			reln = smgropen(entry->rnode);
+				reln = smgropen(entry->tag.rnode);
 				/*
-			 * It is possible that the relation has been dropped or truncated
+				 * It is possible that the relation has been dropped or
-			 * since the fsync request was entered.  Therefore, we have to
+				 * truncated since the fsync request was entered.  Therefore,
-			 * allow file-not-found errors.  This applies both during
+				 * allow ENOENT, but only if we didn't fail once already on
-			 * _mdfd_getseg() and during FileSync, since fd.c might have
+				 * this file.  This applies both during _mdfd_getseg() and
-			 * closed the file behind our back.
+				 * during FileSync, since fd.c might have closed the file
 				 * behind our back.
 				 */
 				seg = _mdfd_getseg(reln,
-							   entry->segno * ((BlockNumber) RELSEG_SIZE),
+								   entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
 								   true);
-			if (seg)
+				if (seg == NULL ||
 					FileSync(seg->mdfd_vfd) < 0)
 				{
-				if (FileSync(seg->mdfd_vfd) < 0 &&
+					/*
-					errno != ENOENT)
+					 * XXX is there any point in allowing more than one try?
 					 * Don't see one at the moment, but easy to change the
 					 * test here if so.
 					 */
 					if (!FILE_POSSIBLY_DELETED(errno) ||
 						++(entry->failures) > 1)
 					{
 						ereport(LOG,
 								(errcode_for_file_access(),
 								 errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
-									entry->segno,
+										entry->tag.segno,
-									entry->rnode.spcNode,
+										entry->tag.rnode.spcNode,
-									entry->rnode.dbNode,
+										entry->tag.rnode.dbNode,
-									entry->rnode.relNode)));
+										entry->tag.rnode.relNode)));
 						return false;
 					}
 					else
 						ereport(DEBUG1,
 								(errcode_for_file_access(),
 								 errmsg("could not fsync segment %u of relation %u/%u/%u, but retrying: %m",
 										entry->tag.segno,
 										entry->tag.rnode.spcNode,
 										entry->tag.rnode.dbNode,
 										entry->tag.rnode.relNode)));
 					need_retry = true;
 					continue;	/* don't delete the hashtable entry */
 				}
 			}
 			/* Okay, delete this entry */
-		if (hash_search(pendingOpsTable, entry,
+			if (hash_search(pendingOpsTable, &entry->tag,
 							HASH_REMOVE, NULL) == NULL)
 				elog(ERROR, "pendingOpsTable corrupted");
 		}
 	} while (need_retry);
 	return true;
 }
@@ -839,14 +900,8 @@ register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
 {
 	if (pendingOpsTable)
 	{
-		PendingOperationEntry entry;
+		/* push it into local pending-ops table */
-
+		RememberFsyncRequest(reln->smgr_rnode, seg->mdfd_segno);
 		/* ensure any pad bytes in the struct are zeroed */
 		MemSet(&entry, 0, sizeof(entry));
 		entry.rnode = reln->smgr_rnode;
 		entry.segno = seg->mdfd_segno;
 		(void) hash_search(pendingOpsTable, &entry, HASH_ENTER, NULL);
 		return true;
 	}
 	else
@@ -865,22 +920,135 @@ register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
 *
 * We stuff the fsync request into the local hash table for execution
 * during the bgwriter's next checkpoint.
 *
 * The range of possible segment numbers is way less than the range of
 * BlockNumber, so we can reserve high values of segno for special purposes.
 * We define two: FORGET_RELATION_FSYNC means to drop pending fsyncs for
 * a relation, and FORGET_DATABASE_FSYNC means to drop pending fsyncs for
 * a whole database.  (These are a tad slow because the hash table has to be
 * searched linearly, but it doesn't seem worth rethinking the table structure
 * for them.)
 */
 void
 RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
 {
 	PendingOperationEntry entry;
 	Assert(pendingOpsTable);
-	/* ensure any pad bytes in the struct are zeroed */
+	if (segno == FORGET_RELATION_FSYNC)
-	MemSet(&entry, 0, sizeof(entry));
+	{
-	entry.rnode = rnode;
+		/* Remove any pending requests for the entire relation */
-	entry.segno = segno;
+		HASH_SEQ_STATUS hstat;
 		PendingOperationEntry *entry;
-	(void) hash_search(pendingOpsTable, &entry, HASH_ENTER, NULL);
+		hash_seq_init(&hstat, pendingOpsTable);
 		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
 		{
 			if (RelFileNodeEquals(entry->tag.rnode, rnode))
 			{
 				/* Okay, delete this entry */
 				if (hash_search(pendingOpsTable, &entry->tag,
 								HASH_REMOVE, NULL) == NULL)
 					elog(ERROR, "pendingOpsTable corrupted");
 			}
 		}
 	}
 	else if (segno == FORGET_DATABASE_FSYNC)
 	{
 		/* Remove any pending requests for the entire database */
 		HASH_SEQ_STATUS hstat;
 		PendingOperationEntry *entry;
 		hash_seq_init(&hstat, pendingOpsTable);
 		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
 		{
 			if (entry->tag.rnode.dbNode == rnode.dbNode)
 			{
 				/* Okay, delete this entry */
 				if (hash_search(pendingOpsTable, &entry->tag,
 								HASH_REMOVE, NULL) == NULL)
 					elog(ERROR, "pendingOpsTable corrupted");
 			}
 		}
 	}
 	else
 	{
 		/* Normal case: enter a request to fsync this segment */
 		PendingOperationTag key;
 		PendingOperationEntry *entry;
 		bool		found;
 		/* ensure any pad bytes in the hash key are zeroed */
 		MemSet(&key, 0, sizeof(key));
 		key.rnode = rnode;
 		key.segno = segno;
 		entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
 													  &key,
 													  HASH_ENTER,
 													  &found);
 		if (!found)				/* new entry, so initialize it */
 			entry->failures = 0;
 	}
 }
 /*
 * ForgetRelationFsyncRequests -- ensure any fsyncs for a rel are forgotten
 */
 void
 ForgetRelationFsyncRequests(RelFileNode rnode)
 {
 	if (pendingOpsTable)
 	{
 		/* standalone backend or startup process: fsync state is local */
 		RememberFsyncRequest(rnode, FORGET_RELATION_FSYNC);
 	}
 	else if (IsUnderPostmaster)
 	{
 		/*
 		 * Notify the bgwriter about it.  If we fail to queue the revoke
 		 * message, we have to sleep and try again ... ugly, but hopefully
 		 * won't happen often.
 		 *
 		 * XXX should we CHECK_FOR_INTERRUPTS in this loop?  Escaping with
 		 * an error would leave the no-longer-used file still present on
 		 * disk, which would be bad, so I'm inclined to assume that the
 		 * bgwriter will always empty the queue soon.
 		 */
 		while (!ForwardFsyncRequest(rnode, FORGET_RELATION_FSYNC))
 			pg_usleep(10000L);	/* 10 msec seems a good number */
 		/*
 		 * Note we don't wait for the bgwriter to actually absorb the
 		 * revoke message; see mdsync() for the implications.
 		 */
 	}
 }
 /*
 * ForgetDatabaseFsyncRequests -- ensure any fsyncs for a DB are forgotten
 */
 void
 ForgetDatabaseFsyncRequests(Oid dbid)
 {
 	RelFileNode rnode;
 	rnode.dbNode = dbid;
 	rnode.spcNode = 0;
 	rnode.relNode = 0;
 	if (pendingOpsTable)
 	{
 		/* standalone backend or startup process: fsync state is local */
 		RememberFsyncRequest(rnode, FORGET_DATABASE_FSYNC);
 	}
 	else if (IsUnderPostmaster)
 	{
 		/* see notes in ForgetRelationFsyncRequests */
 		while (!ForwardFsyncRequest(rnode, FORGET_DATABASE_FSYNC))
 			pg_usleep(10000L);	/* 10 msec seems a good number */
 	}
 }
 /*
 *	_fdvec_alloc() -- Make a MdfdVec object.
 */
@@ -984,7 +1152,7 @@ _mdfd_getseg(SMgrRelation reln, BlockNumber blkno, bool allowNotFound)
 								(segstogo == 1 || InRecovery) ? O_CREAT : 0);
 			if (v->mdfd_chain == NULL)
 			{
-				if (allowNotFound && errno == ENOENT)
+				if (allowNotFound && FILE_POSSIBLY_DELETED(errno))
 					return NULL;
 				ereport(ERROR,
 						(errcode_for_file_access(),
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -7,7 +7,7 @@
 * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- * $PostgreSQL: pgsql/src/include/storage/smgr.h,v 1.55 2006/03/24 04:32:13 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/storage/smgr.h,v 1.55.2.1 2007/01/27 20:15:47 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -107,6 +107,8 @@ extern bool mdimmedsync(SMgrRelation reln);
 extern bool mdsync(void);
 extern void RememberFsyncRequest(RelFileNode rnode, BlockNumber segno);
 extern void ForgetRelationFsyncRequests(RelFileNode rnode);
 extern void ForgetDatabaseFsyncRequests(Oid dbid);
 /* smgrtype.c */
 extern Datum smgrout(PG_FUNCTION_ARGS);