mirror of
				https://github.com/postgres/postgres.git
				synced 2025-11-03 09:13:20 +03:00 
			
		
		
		
	Rearrange mdsync() looping logic to avoid the problem that a sufficiently
fast flow of new fsync requests can prevent mdsync() from ever completing. This was an unforeseen consequence of a patch added in Mar 2006 to prevent the fsync request queue from overflowing. Problem identified by Heikki Linnakangas and independently by ITAGAKI Takahiro; fix based on ideas from Takahiro-san, Heikki, and Tom. Back-patch as far as 8.1 because a previous back-patch introduced the problem into 8.1 ...
This commit is contained in:
		@@ -8,7 +8,7 @@
 | 
			
		||||
 *
 | 
			
		||||
 *
 | 
			
		||||
 * IDENTIFICATION
 | 
			
		||||
 *	  $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.118.2.2 2007/01/27 20:15:55 tgl Exp $
 | 
			
		||||
 *	  $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.118.2.3 2007/04/12 17:11:07 tgl Exp $
 | 
			
		||||
 *
 | 
			
		||||
 *-------------------------------------------------------------------------
 | 
			
		||||
 */
 | 
			
		||||
@@ -122,14 +122,19 @@ typedef struct
 | 
			
		||||
	BlockNumber segno;			/* which segment */
 | 
			
		||||
} PendingOperationTag;
 | 
			
		||||
 | 
			
		||||
typedef uint16 CycleCtr;		/* can be any convenient integer size */
 | 
			
		||||
 | 
			
		||||
typedef struct
 | 
			
		||||
{
 | 
			
		||||
	PendingOperationTag tag;	/* hash table key (must be first!) */
 | 
			
		||||
	int			failures;		/* number of failed attempts to fsync */
 | 
			
		||||
	bool		canceled;		/* T => request canceled, not yet removed */
 | 
			
		||||
	CycleCtr	cycle_ctr;		/* mdsync_cycle_ctr when request was made */
 | 
			
		||||
} PendingOperationEntry;
 | 
			
		||||
 | 
			
		||||
static HTAB *pendingOpsTable = NULL;
 | 
			
		||||
 | 
			
		||||
static CycleCtr mdsync_cycle_ctr = 0;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
/* local routines */
 | 
			
		||||
static MdfdVec *mdopen(SMgrRelation reln, bool allowNotFound);
 | 
			
		||||
@@ -749,34 +754,22 @@ mdimmedsync(SMgrRelation reln)
 | 
			
		||||
 | 
			
		||||
/*
 | 
			
		||||
 *	mdsync() -- Sync previous writes to stable storage.
 | 
			
		||||
 *
 | 
			
		||||
 * This is only called during checkpoints, and checkpoints should only
 | 
			
		||||
 * occur in processes that have created a pendingOpsTable.
 | 
			
		||||
 */
 | 
			
		||||
bool
 | 
			
		||||
mdsync(void)
 | 
			
		||||
{
 | 
			
		||||
	bool		need_retry;
 | 
			
		||||
	static bool mdsync_in_progress = false;
 | 
			
		||||
 | 
			
		||||
	if (!pendingOpsTable)
 | 
			
		||||
		return false;
 | 
			
		||||
 | 
			
		||||
	/*
 | 
			
		||||
	 * The fsync table could contain requests to fsync relations that have
 | 
			
		||||
	 * been deleted (unlinked) by the time we get to them.  Rather than
 | 
			
		||||
	 * just hoping an ENOENT (or EACCES on Windows) error can be ignored,
 | 
			
		||||
	 * what we will do is retry the whole process after absorbing fsync
 | 
			
		||||
	 * request messages again.  Since mdunlink() queues a "revoke" message
 | 
			
		||||
	 * before actually unlinking, the fsync request is guaranteed to be gone
 | 
			
		||||
	 * the second time if it really was this case.  DROP DATABASE likewise
 | 
			
		||||
	 * has to tell us to forget fsync requests before it starts deletions.
 | 
			
		||||
	 */
 | 
			
		||||
	do {
 | 
			
		||||
	HASH_SEQ_STATUS hstat;
 | 
			
		||||
	PendingOperationEntry *entry;
 | 
			
		||||
	int			absorb_counter;
 | 
			
		||||
 | 
			
		||||
		need_retry = false;
 | 
			
		||||
	/*
 | 
			
		||||
	 * This is only called during checkpoints, and checkpoints should only
 | 
			
		||||
	 * occur in processes that have created a pendingOpsTable.
 | 
			
		||||
	 */
 | 
			
		||||
	if (!pendingOpsTable)
 | 
			
		||||
		return false;
 | 
			
		||||
 | 
			
		||||
	/*
 | 
			
		||||
	 * If we are in the bgwriter, the sync had better include all fsync
 | 
			
		||||
@@ -786,33 +779,100 @@ mdsync(void)
 | 
			
		||||
	 */
 | 
			
		||||
	AbsorbFsyncRequests();
 | 
			
		||||
 | 
			
		||||
	/*
 | 
			
		||||
	 * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
 | 
			
		||||
	 * checkpoint), we want to ignore fsync requests that are entered into the
 | 
			
		||||
	 * hashtable after this point --- they should be processed next time,
 | 
			
		||||
	 * instead.  We use mdsync_cycle_ctr to tell old entries apart from new
 | 
			
		||||
	 * ones: new ones will have cycle_ctr equal to the incremented value of
 | 
			
		||||
	 * mdsync_cycle_ctr.
 | 
			
		||||
	 *
 | 
			
		||||
	 * In normal circumstances, all entries present in the table at this
 | 
			
		||||
	 * point will have cycle_ctr exactly equal to the current (about to be old)
 | 
			
		||||
	 * value of mdsync_cycle_ctr.  However, if we fail partway through the
 | 
			
		||||
	 * fsync'ing loop, then older values of cycle_ctr might remain when we
 | 
			
		||||
	 * come back here to try again.  Repeated checkpoint failures would
 | 
			
		||||
	 * eventually wrap the counter around to the point where an old entry
 | 
			
		||||
	 * might appear new, causing us to skip it, possibly allowing a checkpoint
 | 
			
		||||
	 * to succeed that should not have.  To forestall wraparound, any time
 | 
			
		||||
	 * the previous mdsync() failed to complete, run through the table and
 | 
			
		||||
	 * forcibly set cycle_ctr = mdsync_cycle_ctr.
 | 
			
		||||
	 *
 | 
			
		||||
	 * Think not to merge this loop with the main loop, as the problem is
 | 
			
		||||
	 * exactly that that loop may fail before having visited all the entries.
 | 
			
		||||
	 * From a performance point of view it doesn't matter anyway, as this
 | 
			
		||||
	 * path will never be taken in a system that's functioning normally.
 | 
			
		||||
	 */
 | 
			
		||||
	if (mdsync_in_progress)
 | 
			
		||||
	{
 | 
			
		||||
		/* prior try failed, so update any stale cycle_ctr values */
 | 
			
		||||
		hash_seq_init(&hstat, pendingOpsTable);
 | 
			
		||||
		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
 | 
			
		||||
		{
 | 
			
		||||
			entry->cycle_ctr = mdsync_cycle_ctr;
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	/* Advance counter so that new hashtable entries are distinguishable */
 | 
			
		||||
	mdsync_cycle_ctr++;
 | 
			
		||||
 | 
			
		||||
	/* Set flag to detect failure if we don't reach the end of the loop */
 | 
			
		||||
	mdsync_in_progress = true;
 | 
			
		||||
 | 
			
		||||
	/* Now scan the hashtable for fsync requests to process */
 | 
			
		||||
	absorb_counter = FSYNCS_PER_ABSORB;
 | 
			
		||||
	hash_seq_init(&hstat, pendingOpsTable);
 | 
			
		||||
	while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
 | 
			
		||||
	{
 | 
			
		||||
		/*
 | 
			
		||||
		 * If the entry is new then don't process it this time.  Note that
 | 
			
		||||
		 * "continue" bypasses the hash-remove call at the bottom of the loop.
 | 
			
		||||
		 */
 | 
			
		||||
		if (entry->cycle_ctr == mdsync_cycle_ctr)
 | 
			
		||||
			continue;
 | 
			
		||||
 | 
			
		||||
		/* Else assert we haven't missed it */
 | 
			
		||||
		Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr);
 | 
			
		||||
 | 
			
		||||
		/*
 | 
			
		||||
		 * If fsync is off then we don't have to bother opening the file
 | 
			
		||||
		 * at all.  (We delay checking until this point so that changing
 | 
			
		||||
			 * fsync on the fly behaves sensibly.)
 | 
			
		||||
		 * fsync on the fly behaves sensibly.)  Also, if the entry is
 | 
			
		||||
		 * marked canceled, fall through to delete it.
 | 
			
		||||
		 */
 | 
			
		||||
			if (enableFsync)
 | 
			
		||||
		if (enableFsync && !entry->canceled)
 | 
			
		||||
		{
 | 
			
		||||
				SMgrRelation reln;
 | 
			
		||||
				MdfdVec    *seg;
 | 
			
		||||
			int			failures;
 | 
			
		||||
 | 
			
		||||
			/*
 | 
			
		||||
			 * If in bgwriter, we want to absorb pending requests every so
 | 
			
		||||
				 * often to prevent overflow of the fsync request queue.  This
 | 
			
		||||
				 * could result in deleting the current entry out from under
 | 
			
		||||
				 * our hashtable scan, so the procedure is to fall out of the
 | 
			
		||||
				 * scan and start over from the top of the function.
 | 
			
		||||
			 * often to prevent overflow of the fsync request queue.  It is
 | 
			
		||||
			 * unspecified whether newly-added entries will be visited by
 | 
			
		||||
			 * hash_seq_search, but we don't care since we don't need to
 | 
			
		||||
			 * process them anyway.
 | 
			
		||||
			 */
 | 
			
		||||
			if (--absorb_counter <= 0)
 | 
			
		||||
			{
 | 
			
		||||
					need_retry = true;
 | 
			
		||||
					break;
 | 
			
		||||
				AbsorbFsyncRequests();
 | 
			
		||||
				absorb_counter = FSYNCS_PER_ABSORB;
 | 
			
		||||
			}
 | 
			
		||||
 | 
			
		||||
			/*
 | 
			
		||||
			 * The fsync table could contain requests to fsync segments that
 | 
			
		||||
			 * have been deleted (unlinked) by the time we get to them.
 | 
			
		||||
			 * Rather than just hoping an ENOENT (or EACCES on Windows) error
 | 
			
		||||
			 * can be ignored, what we do on error is absorb pending requests
 | 
			
		||||
			 * and then retry.  Since mdunlink() queues a "revoke" message
 | 
			
		||||
			 * before actually unlinking, the fsync request is guaranteed to
 | 
			
		||||
			 * be marked canceled after the absorb if it really was this case.
 | 
			
		||||
			 * DROP DATABASE likewise has to tell us to forget fsync requests
 | 
			
		||||
			 * before it starts deletions.
 | 
			
		||||
			 */
 | 
			
		||||
			for (failures = 0; ; failures++)	/* loop exits at "break" */
 | 
			
		||||
			{
 | 
			
		||||
				SMgrRelation reln;
 | 
			
		||||
				MdfdVec    *seg;
 | 
			
		||||
 | 
			
		||||
				/*
 | 
			
		||||
				 * Find or create an smgr hash entry for this relation. This
 | 
			
		||||
				 * may seem a bit unclean -- md calling smgr?  But it's really
 | 
			
		||||
@@ -833,7 +893,7 @@ mdsync(void)
 | 
			
		||||
				/*
 | 
			
		||||
				 * It is possible that the relation has been dropped or
 | 
			
		||||
				 * truncated since the fsync request was entered.  Therefore,
 | 
			
		||||
				 * allow ENOENT, but only if we didn't fail once already on
 | 
			
		||||
				 * allow ENOENT, but only if we didn't fail already on
 | 
			
		||||
				 * this file.  This applies both during _mdfd_getseg() and
 | 
			
		||||
				 * during FileSync, since fd.c might have closed the file
 | 
			
		||||
				 * behind our back.
 | 
			
		||||
@@ -841,16 +901,17 @@ mdsync(void)
 | 
			
		||||
				seg = _mdfd_getseg(reln,
 | 
			
		||||
								   entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
 | 
			
		||||
								   true);
 | 
			
		||||
				if (seg == NULL ||
 | 
			
		||||
					FileSync(seg->mdfd_vfd) < 0)
 | 
			
		||||
				{
 | 
			
		||||
				if (seg != NULL &&
 | 
			
		||||
					FileSync(seg->mdfd_vfd) >= 0)
 | 
			
		||||
					break;		/* success; break out of retry loop */
 | 
			
		||||
 | 
			
		||||
				/*
 | 
			
		||||
					 * XXX is there any point in allowing more than one try?
 | 
			
		||||
				 * XXX is there any point in allowing more than one retry?
 | 
			
		||||
				 * Don't see one at the moment, but easy to change the
 | 
			
		||||
				 * test here if so.
 | 
			
		||||
				 */
 | 
			
		||||
				if (!FILE_POSSIBLY_DELETED(errno) ||
 | 
			
		||||
						++(entry->failures) > 1)
 | 
			
		||||
					failures > 0)
 | 
			
		||||
				{
 | 
			
		||||
					ereport(LOG,
 | 
			
		||||
							(errcode_for_file_access(),
 | 
			
		||||
@@ -869,17 +930,30 @@ mdsync(void)
 | 
			
		||||
									entry->tag.rnode.spcNode,
 | 
			
		||||
									entry->tag.rnode.dbNode,
 | 
			
		||||
									entry->tag.rnode.relNode)));
 | 
			
		||||
					need_retry = true;
 | 
			
		||||
					continue;	/* don't delete the hashtable entry */
 | 
			
		||||
				}
 | 
			
		||||
 | 
			
		||||
				/*
 | 
			
		||||
				 * Absorb incoming requests and check to see if canceled.
 | 
			
		||||
				 */
 | 
			
		||||
				AbsorbFsyncRequests();
 | 
			
		||||
				absorb_counter = FSYNCS_PER_ABSORB;	/* might as well... */
 | 
			
		||||
 | 
			
		||||
				if (entry->canceled)
 | 
			
		||||
					break;
 | 
			
		||||
			}	/* end retry loop */
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
			/* Okay, delete this entry */
 | 
			
		||||
		/*
 | 
			
		||||
		 * If we get here, either we fsync'd successfully, or we don't have
 | 
			
		||||
		 * to because enableFsync is off, or the entry is (now) marked
 | 
			
		||||
		 * canceled.  Okay to delete it.
 | 
			
		||||
		 */
 | 
			
		||||
		if (hash_search(pendingOpsTable, &entry->tag,
 | 
			
		||||
						HASH_REMOVE, NULL) == NULL)
 | 
			
		||||
			elog(ERROR, "pendingOpsTable corrupted");
 | 
			
		||||
		}
 | 
			
		||||
	} while (need_retry);
 | 
			
		||||
	}	/* end loop over hashtable entries */
 | 
			
		||||
 | 
			
		||||
	/* Flag successful completion of mdsync */
 | 
			
		||||
	mdsync_in_progress = false;
 | 
			
		||||
 | 
			
		||||
	return true;
 | 
			
		||||
}
 | 
			
		||||
@@ -924,8 +998,8 @@ register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
 | 
			
		||||
 *
 | 
			
		||||
 * The range of possible segment numbers is way less than the range of
 | 
			
		||||
 * BlockNumber, so we can reserve high values of segno for special purposes.
 | 
			
		||||
 * We define two: FORGET_RELATION_FSYNC means to drop pending fsyncs for
 | 
			
		||||
 * a relation, and FORGET_DATABASE_FSYNC means to drop pending fsyncs for
 | 
			
		||||
 * We define two: FORGET_RELATION_FSYNC means to cancel pending fsyncs for
 | 
			
		||||
 * a relation, and FORGET_DATABASE_FSYNC means to cancel pending fsyncs for
 | 
			
		||||
 * a whole database.  (These are a tad slow because the hash table has to be
 | 
			
		||||
 * searched linearly, but it doesn't seem worth rethinking the table structure
 | 
			
		||||
 * for them.)
 | 
			
		||||
@@ -946,10 +1020,8 @@ RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
 | 
			
		||||
		{
 | 
			
		||||
			if (RelFileNodeEquals(entry->tag.rnode, rnode))
 | 
			
		||||
			{
 | 
			
		||||
				/* Okay, delete this entry */
 | 
			
		||||
				if (hash_search(pendingOpsTable, &entry->tag,
 | 
			
		||||
								HASH_REMOVE, NULL) == NULL)
 | 
			
		||||
					elog(ERROR, "pendingOpsTable corrupted");
 | 
			
		||||
				/* Okay, cancel this entry */
 | 
			
		||||
				entry->canceled = true;
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
@@ -964,10 +1036,8 @@ RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
 | 
			
		||||
		{
 | 
			
		||||
			if (entry->tag.rnode.dbNode == rnode.dbNode)
 | 
			
		||||
			{
 | 
			
		||||
				/* Okay, delete this entry */
 | 
			
		||||
				if (hash_search(pendingOpsTable, &entry->tag,
 | 
			
		||||
								HASH_REMOVE, NULL) == NULL)
 | 
			
		||||
					elog(ERROR, "pendingOpsTable corrupted");
 | 
			
		||||
				/* Okay, cancel this entry */
 | 
			
		||||
				entry->canceled = true;
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
@@ -987,8 +1057,25 @@ RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
 | 
			
		||||
													  &key,
 | 
			
		||||
													  HASH_ENTER,
 | 
			
		||||
													  &found);
 | 
			
		||||
		if (!found)				/* new entry, so initialize it */
 | 
			
		||||
			entry->failures = 0;
 | 
			
		||||
		/* if new or previously canceled entry, initialize it */
 | 
			
		||||
		if (!found || entry->canceled)
 | 
			
		||||
		{
 | 
			
		||||
			entry->canceled = false;
 | 
			
		||||
			entry->cycle_ctr = mdsync_cycle_ctr;
 | 
			
		||||
		}
 | 
			
		||||
		/*
 | 
			
		||||
		 * NB: it's intentional that we don't change cycle_ctr if the entry
 | 
			
		||||
		 * already exists.  The fsync request must be treated as old, even
 | 
			
		||||
		 * though the new request will be satisfied too by any subsequent
 | 
			
		||||
		 * fsync.
 | 
			
		||||
		 *
 | 
			
		||||
		 * However, if the entry is present but is marked canceled, we should
 | 
			
		||||
		 * act just as though it wasn't there.  The only case where this could
 | 
			
		||||
		 * happen would be if a file had been deleted, we received but did not
 | 
			
		||||
		 * yet act on the cancel request, and the same relfilenode was then
 | 
			
		||||
		 * assigned to a new file.  We mustn't lose the new request, but
 | 
			
		||||
		 * it should be considered new not old.
 | 
			
		||||
		 */
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user