mirror of https://github.com/postgres/postgres.git
Rethink checkpointer's fsync-request table representation.
Instead of having one hash table entry per relation/fork/segment, just have
one per relation, and use bitmapsets to represent which specific segments
need to be fsync'd.  This eliminates the need to scan the whole hash table
to implement FORGET_RELATION_FSYNC, which fixes the O(N^2) behavior recently
demonstrated by Jeff Janes for cases involving lots of TRUNCATE or DROP
TABLE operations during a single checkpoint cycle.  Per an idea from
Robert Haas.

(FORGET_DATABASE_FSYNC still sucks, but since dropping a database is a
pretty expensive operation anyway, we'll live with that.)

In passing, improve the delayed-unlink code: remove the pass over the list
in mdpreckpt, since it wasn't doing anything for us except supporting a
useless Assert in mdpostckpt, and fix mdpostckpt so that it will absorb
fsync requests every so often when clearing a large backlog of deletion
requests.
parent 3072b7bade
commit be86e3dd5b
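To make the shape of the change concrete before reading the diff: the sketch
below is a simplified, self-contained model of the new representation, not
the PostgreSQL code.  RelId stands in for RelFileNode, and a fixed-width
uint64_t stands in for the Bitmapset the patch actually uses (so segment
numbers are capped at 63 here; the real code has no such limit).  It shows
why FORGET_RELATION_FSYNC no longer needs a whole-table scan: all pending
state for a relation lives in a single entry found by one hash lookup.

    /*
     * Simplified model of the commit's idea (not the PostgreSQL code).
     * The old table keyed entries by (relation, fork, segment), so
     * cancelling all requests for one relation meant scanning every
     * entry; the new table keys by relation only and stores per-fork
     * segment bitmaps, so cancellation touches one entry.
     */
    #include <stdint.h>
    #include <stdio.h>

    #define MAX_FORKNUM 3       /* main, fsm, vm, init forks */

    typedef uint32_t RelId;     /* stand-in for RelFileNode */

    typedef struct
    {
        RelId     rnode;                        /* hash key: the relation */
        uint64_t  requests[MAX_FORKNUM + 1];    /* bit n set => fsync segment n */
        int       canceled[MAX_FORKNUM + 1];    /* fsyncs canceled for fork? */
    } PendingEntry;

    /* Record that segment `segno` of fork `fork` needs fsync. */
    static void
    remember_fsync(PendingEntry *e, int fork, int segno)
    {
        e->requests[fork] |= UINT64_C(1) << segno;
    }

    /* FORGET_RELATION_FSYNC becomes O(forks), independent of table size. */
    static void
    forget_relation_fsync(PendingEntry *e)
    {
        for (int f = 0; f <= MAX_FORKNUM; f++)
        {
            e->requests[f] = 0;
            e->canceled[f] = 1;     /* lets an in-progress sync scan notice */
        }
    }

    int
    main(void)
    {
        PendingEntry e = {.rnode = 16384};

        remember_fsync(&e, 0, 2);   /* segment 2 of the main fork */
        remember_fsync(&e, 0, 5);
        forget_relation_fsync(&e);  /* e.g. the relation was truncated */
        printf("main-fork requests after forget: %llu\n",
               (unsigned long long) e.requests[0]);
        return 0;
    }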
src/backend/storage/smgr/md.c

@@ -32,8 +32,9 @@
 #include "pg_trace.h"
 
 
-/* interval for calling AbsorbFsyncRequests in mdsync */
+/* intervals for calling AbsorbFsyncRequests in mdsync and mdpostckpt */
 #define FSYNCS_PER_ABSORB 10
+#define UNLINKS_PER_ABSORB 10
 
 /*
  * Special values for the segno arg to RememberFsyncRequest.
@@ -51,7 +52,7 @@
  * ENOENT, because if a file is unlinked-but-not-yet-gone on that platform,
  * that's what you get.  Ugh.  This code is designed so that we don't
  * actually believe these cases are okay without further evidence (namely,
- * a pending fsync request getting revoked ... see mdsync).
+ * a pending fsync request getting canceled ... see mdsync).
  */
 #ifndef WIN32
 #define FILE_POSSIBLY_DELETED(err) ((err) == ENOENT)
@@ -111,12 +112,12 @@ static MemoryContext MdCxt;    /* context for all md.c allocations */
 
 
 /*
- * In some contexts (currently, standalone backends and the checkpointer process)
+ * In some contexts (currently, standalone backends and the checkpointer)
  * we keep track of pending fsync operations: we need to remember all relation
  * segments that have been written since the last checkpoint, so that we can
  * fsync them down to disk before completing the next checkpoint.  This hash
  * table remembers the pending operations.  We use a hash table mostly as
- * a convenient way of eliminating duplicate requests.
+ * a convenient way of merging duplicate requests.
 *
  * We use a similar mechanism to remember no-longer-needed files that can
  * be deleted after the next checkpoint, but we use a linked list instead of
@@ -129,20 +130,16 @@ static MemoryContext MdCxt;    /* context for all md.c allocations */
  * (Regular backends do not track pending operations locally, but forward
  * them to the checkpointer.)
  */
-typedef struct
-{
-    RelFileNode rnode;          /* the targeted relation */
-    ForkNumber  forknum;        /* which fork */
-    BlockNumber segno;          /* which segment */
-} PendingOperationTag;
-
 typedef uint16 CycleCtr;        /* can be any convenient integer size */
 
 typedef struct
 {
-    PendingOperationTag tag;    /* hash table key (must be first!) */
-    bool        canceled;       /* T => request canceled, not yet removed */
-    CycleCtr    cycle_ctr;      /* mdsync_cycle_ctr when request was made */
+    RelFileNode rnode;          /* hash table key (must be first!) */
+    CycleCtr    cycle_ctr;      /* mdsync_cycle_ctr of oldest request */
+    /* requests[f] has bit n set if we need to fsync segment n of fork f */
+    Bitmapset  *requests[MAX_FORKNUM + 1];
+    /* canceled[f] is true if we canceled fsyncs for fork "recently" */
+    bool        canceled[MAX_FORKNUM + 1];
 } PendingOperationEntry;
 
 typedef struct
@@ -206,7 +203,7 @@ mdinit(void)
         HASHCTL     hash_ctl;
 
         MemSet(&hash_ctl, 0, sizeof(hash_ctl));
-        hash_ctl.keysize = sizeof(PendingOperationTag);
+        hash_ctl.keysize = sizeof(RelFileNode);
         hash_ctl.entrysize = sizeof(PendingOperationEntry);
         hash_ctl.hash = tag_hash;
         hash_ctl.hcxt = MdCxt;
@@ -227,10 +224,19 @@ mdinit(void)
 void
 SetForwardFsyncRequests(void)
 {
-    /* Perform any pending ops we may have queued up */
+    /* Perform any pending fsyncs we may have queued up, then drop table */
     if (pendingOpsTable)
+    {
         mdsync();
+        hash_destroy(pendingOpsTable);
+    }
     pendingOpsTable = NULL;
+
+    /*
+     * We should not have any pending unlink requests, since mdunlink doesn't
+     * queue unlink requests when isRedo.
+     */
+    Assert(pendingUnlinks == NIL);
 }
 
 /*
@@ -1046,8 +1052,11 @@ mdsync(void)
     hash_seq_init(&hstat, pendingOpsTable);
     while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
     {
+        ForkNumber  forknum;
+
         /*
-         * If the entry is new then don't process it this time.  Note that
+         * If the entry is new then don't process it this time; it might
+         * contain multiple fsync-request bits, but they are all new.  Note
          * "continue" bypasses the hash-remove call at the bottom of the loop.
          */
         if (entry->cycle_ctr == mdsync_cycle_ctr)
@@ -1057,21 +1066,42 @@ mdsync(void)
         Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr);
 
         /*
-         * If fsync is off then we don't have to bother opening the file at
-         * all.  (We delay checking until this point so that changing fsync on
-         * the fly behaves sensibly.)  Also, if the entry is marked canceled,
-         * fall through to delete it.
+         * Scan over the forks and segments represented by the entry.
+         *
+         * The bitmap manipulations are slightly tricky, because we can call
+         * AbsorbFsyncRequests() inside the loop and that could result in
+         * bms_add_member() modifying and even re-palloc'ing the bitmapsets.
+         * This is okay because we unlink each bitmapset from the hashtable
+         * entry before scanning it.  That means that any incoming fsync
+         * requests will be processed now if they reach the table before we
+         * begin to scan their fork.
         */
-        if (enableFsync && !entry->canceled)
+        for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+        {
+            Bitmapset  *requests = entry->requests[forknum];
+            int         segno;
+
+            entry->requests[forknum] = NULL;
+            entry->canceled[forknum] = false;
+
+            while ((segno = bms_first_member(requests)) >= 0)
             {
                 int         failures;
 
                 /*
-                 * If in checkpointer, we want to absorb pending requests every so
-                 * often to prevent overflow of the fsync request queue.  It is
-                 * unspecified whether newly-added entries will be visited by
-                 * hash_seq_search, but we don't care since we don't need to
-                 * process them anyway.
+                 * If fsync is off then we don't have to bother opening the
+                 * file at all.  (We delay checking until this point so that
+                 * changing fsync on the fly behaves sensibly.)
+                 */
+                if (!enableFsync)
+                    continue;
+
+                /*
+                 * If in checkpointer, we want to absorb pending requests
+                 * every so often to prevent overflow of the fsync request
+                 * queue.  It is unspecified whether newly-added entries will
+                 * be visited by hash_seq_search, but we don't care since we
+                 * don't need to process them anyway.
                  */
                 if (--absorb_counter <= 0)
                 {
@@ -1080,47 +1110,43 @@ mdsync(void)
                 }
 
                 /*
-                 * The fsync table could contain requests to fsync segments that
-                 * have been deleted (unlinked) by the time we get to them. Rather
-                 * than just hoping an ENOENT (or EACCES on Windows) error can be
-                 * ignored, what we do on error is absorb pending requests and
-                 * then retry.  Since mdunlink() queues a "revoke" message before
-                 * actually unlinking, the fsync request is guaranteed to be
-                 * marked canceled after the absorb if it really was this case.
-                 * DROP DATABASE likewise has to tell us to forget fsync requests
-                 * before it starts deletions.
+                 * The fsync table could contain requests to fsync segments
+                 * that have been deleted (unlinked) by the time we get to
+                 * them.  Rather than just hoping an ENOENT (or EACCES on
+                 * Windows) error can be ignored, what we do on error is
+                 * absorb pending requests and then retry.  Since mdunlink()
+                 * queues a "cancel" message before actually unlinking, the
+                 * fsync request is guaranteed to be marked canceled after the
+                 * absorb if it really was this case.  DROP DATABASE likewise
+                 * has to tell us to forget fsync requests before it starts
+                 * deletions.
                  */
                 for (failures = 0;; failures++) /* loop exits at "break" */
                 {
                     SMgrRelation reln;
                     MdfdVec    *seg;
                     char       *path;
+                    int         save_errno;
 
                     /*
-                     * Find or create an smgr hash entry for this relation. This
-                     * may seem a bit unclean -- md calling smgr?  But it's really
-                     * the best solution.  It ensures that the open file reference
-                     * isn't permanently leaked if we get an error here. (You may
-                     * say "but an unreferenced SMgrRelation is still a leak!" Not
-                     * really, because the only case in which a checkpoint is done
-                     * by a process that isn't about to shut down is in the
-                     * checkpointer, and it will periodically do smgrcloseall().
-                     * This fact justifies our not closing the reln in the success
-                     * path either, which is a good thing since in
-                     * non-checkpointer cases we couldn't safely do that.)
+                     * Find or create an smgr hash entry for this relation.
+                     * This may seem a bit unclean -- md calling smgr?  But
+                     * it's really the best solution.  It ensures that the
+                     * open file reference isn't permanently leaked if we get
+                     * an error here.  (You may say "but an unreferenced
+                     * SMgrRelation is still a leak!"  Not really, because the
+                     * only case in which a checkpoint is done by a process
+                     * that isn't about to shut down is in the checkpointer,
+                     * and it will periodically do smgrcloseall().  This fact
+                     * justifies our not closing the reln in the success path
+                     * either, which is a good thing since in non-checkpointer
+                     * cases we couldn't safely do that.)
                      */
-                    reln = smgropen(entry->tag.rnode, InvalidBackendId);
+                    reln = smgropen(entry->rnode, InvalidBackendId);
 
-                    /*
-                     * It is possible that the relation has been dropped or
-                     * truncated since the fsync request was entered. Therefore,
-                     * allow ENOENT, but only if we didn't fail already on this
-                     * file. This applies both during _mdfd_getseg() and during
-                     * FileSync, since fd.c might have closed the file behind our
-                     * back.
-                     */
-                    seg = _mdfd_getseg(reln, entry->tag.forknum,
-                                       entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
+                    /* Attempt to open and fsync the target segment */
+                    seg = _mdfd_getseg(reln, forknum,
+                                       (BlockNumber) segno * (BlockNumber) RELSEG_SIZE,
                                        false, EXTENSION_RETURN_NULL);
 
                     INSTR_TIME_SET_CURRENT(sync_start);
@@ -1128,6 +1154,7 @@ mdsync(void)
                     if (seg != NULL &&
                         FileSync(seg->mdfd_vfd) >= 0)
                     {
+                        /* Success; update statistics about sync timing */
                         INSTR_TIME_SET_CURRENT(sync_end);
                         sync_diff = sync_end;
                         INSTR_TIME_SUBTRACT(sync_diff, sync_start);
@@ -1138,23 +1165,36 @@ mdsync(void)
                         processed++;
                         if (log_checkpoints)
                             elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f msec",
-                                 processed, FilePathName(seg->mdfd_vfd), (double) elapsed / 1000);
+                                 processed,
+                                 FilePathName(seg->mdfd_vfd),
+                                 (double) elapsed / 1000);
 
-                        break;  /* success; break out of retry loop */
+                        break;  /* out of retry loop */
                     }
 
+                    /* Compute file name for use in message */
+                    save_errno = errno;
+                    path = _mdfd_segpath(reln, forknum, (BlockNumber) segno);
+                    errno = save_errno;
+
                     /*
+                     * It is possible that the relation has been dropped or
+                     * truncated since the fsync request was entered.
+                     * Therefore, allow ENOENT, but only if we didn't fail
+                     * already on this file.  This applies both for
+                     * _mdfd_getseg() and for FileSync, since fd.c might have
+                     * closed the file behind our back.
+                     *
                      * XXX is there any point in allowing more than one retry?
-                     * Don't see one at the moment, but easy to change the test
-                     * here if so.
+                     * Don't see one at the moment, but easy to change the
+                     * test here if so.
                      */
-                    path = _mdfd_segpath(reln, entry->tag.forknum,
-                                         entry->tag.segno);
                     if (!FILE_POSSIBLY_DELETED(errno) ||
                         failures > 0)
                         ereport(ERROR,
                                 (errcode_for_file_access(),
-                                 errmsg("could not fsync file \"%s\": %m", path)));
+                                 errmsg("could not fsync file \"%s\": %m",
+                                        path)));
                     else
                         ereport(DEBUG1,
                                 (errcode_for_file_access(),
@@ -1163,24 +1203,39 @@ mdsync(void)
                     pfree(path);
 
                     /*
-                     * Absorb incoming requests and check to see if canceled.
+                     * Absorb incoming requests and check to see if a cancel
+                     * arrived for this relation fork.
                      */
                     AbsorbFsyncRequests();
                     absorb_counter = FSYNCS_PER_ABSORB; /* might as well... */
 
-                    if (entry->canceled)
+                    if (entry->canceled[forknum])
                         break;
                 }               /* end retry loop */
             }
+            bms_free(requests);
+        }
 
         /*
-         * If we get here, either we fsync'd successfully, or we don't have to
-         * because enableFsync is off, or the entry is (now) marked canceled.
-         * Okay to delete it.
+         * We've finished everything that was requested before we started to
+         * scan the entry.  If no new requests have been inserted meanwhile,
+         * remove the entry.  Otherwise, update its cycle counter, as all the
+         * requests now in it must have arrived during this cycle.
         */
-        if (hash_search(pendingOpsTable, &entry->tag,
+        for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+        {
+            if (entry->requests[forknum] != NULL)
+                break;
+        }
+        if (forknum <= MAX_FORKNUM)
+            entry->cycle_ctr = mdsync_cycle_ctr;
+        else
+        {
+            /* Okay to remove it */
+            if (hash_search(pendingOpsTable, &entry->rnode,
                         HASH_REMOVE, NULL) == NULL)
                 elog(ERROR, "pendingOpsTable corrupted");
+        }
     }                           /* end loop over hashtable entries */
 
     /* Return sync performance metrics for report at checkpoint end */
@@ -1209,21 +1264,6 @@ mdsync(void)
 void
 mdpreckpt(void)
 {
-    ListCell   *cell;
-
-    /*
-     * In case the prior checkpoint wasn't completed, stamp all entries in the
-     * list with the current cycle counter.  Anything that's in the list at
-     * the start of checkpoint can surely be deleted after the checkpoint is
-     * finished, regardless of when the request was made.
-     */
-    foreach(cell, pendingUnlinks)
-    {
-        PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);
-
-        entry->cycle_ctr = mdckpt_cycle_ctr;
-    }
-
     /*
      * Any unlink requests arriving after this point will be assigned the next
      * cycle counter, and won't be unlinked until next checkpoint.
@@ -1239,6 +1279,9 @@ mdpreckpt(void)
 void
 mdpostckpt(void)
 {
+    int         absorb_counter;
+
+    absorb_counter = UNLINKS_PER_ABSORB;
     while (pendingUnlinks != NIL)
     {
         PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks);
@@ -1247,13 +1290,15 @@ mdpostckpt(void)
         /*
         * New entries are appended to the end, so if the entry is new we've
        * reached the end of old entries.
+        *
+        * Note: if just the right number of consecutive checkpoints fail, we
+        * could be fooled here by cycle_ctr wraparound.  However, the only
+        * consequence is that we'd delay unlinking for one more checkpoint,
+        * which is perfectly tolerable.
         */
        if (entry->cycle_ctr == mdckpt_cycle_ctr)
            break;
 
-        /* Else assert we haven't missed it */
-        Assert((CycleCtr) (entry->cycle_ctr + 1) == mdckpt_cycle_ctr);
-
        /* Unlink the file */
        path = relpathperm(entry->rnode, MAIN_FORKNUM);
        if (unlink(path) < 0)
@@ -1272,8 +1317,21 @@ mdpostckpt(void)
         }
         pfree(path);
 
+        /* And remove the list entry */
         pendingUnlinks = list_delete_first(pendingUnlinks);
         pfree(entry);
+
+        /*
+         * As in mdsync, we don't want to stop absorbing fsync requests for a
+         * long time when there are many deletions to be done.  We can safely
+         * call AbsorbFsyncRequests() at this point in the loop (note it might
+         * try to delete list entries).
+         */
+        if (--absorb_counter <= 0)
+        {
+            AbsorbFsyncRequests();
+            absorb_counter = UNLINKS_PER_ABSORB;
+        }
     }
 }
 
@@ -1353,7 +1411,7 @@ register_unlink(RelFileNodeBackend rnode)
 /*
  * RememberFsyncRequest() -- callback from checkpointer side of fsync request
  *
- * We stuff most fsync requests into the local hash table for execution
+ * We stuff fsync requests into the local hash table for execution
 * during the checkpointer's next checkpoint.  UNLINK requests go into a
 * separate linked list, however, because they get processed separately.
 *
@@ -1365,10 +1423,11 @@ register_unlink(RelFileNodeBackend rnode)
 * - FORGET_DATABASE_FSYNC means to cancel pending fsyncs for a whole database
 * - UNLINK_RELATION_REQUEST is a request to delete the file after the next
 *   checkpoint.
+ * Note also that we're assuming real segment numbers don't exceed INT_MAX.
 *
- * (Handling the FORGET_* requests is a tad slow because the hash table has
- * to be searched linearly, but it doesn't seem worth rethinking the table
- * structure for them.)
+ * (Handling FORGET_DATABASE_FSYNC requests is a tad slow because the hash
+ * table has to be searched linearly, but dropping a database is a pretty
+ * heavyweight operation anyhow, so we'll live with it.)
 */
 void
 RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
@@ -1378,18 +1437,37 @@ RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
     if (segno == FORGET_RELATION_FSYNC)
     {
         /* Remove any pending requests for the relation (one or all forks) */
-        HASH_SEQ_STATUS hstat;
         PendingOperationEntry *entry;
 
-        hash_seq_init(&hstat, pendingOpsTable);
-        while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
+        entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
+                                                      &rnode,
+                                                      HASH_FIND,
+                                                      NULL);
+        if (entry)
         {
-            if (RelFileNodeEquals(entry->tag.rnode, rnode) &&
-                (entry->tag.forknum == forknum ||
-                 forknum == InvalidForkNumber))
+            /*
+             * We can't just delete the entry since mdsync could have an
+             * active hashtable scan.  Instead we delete the bitmapsets; this
+             * is safe because of the way mdsync is coded.  We also set the
+             * "canceled" flags so that mdsync can tell that a cancel arrived
+             * for the fork(s).
+             */
+            if (forknum == InvalidForkNumber)
             {
-                /* Okay, cancel this entry */
-                entry->canceled = true;
+                /* remove requests for all forks */
+                for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+                {
+                    bms_free(entry->requests[forknum]);
+                    entry->requests[forknum] = NULL;
+                    entry->canceled[forknum] = true;
+                }
+            }
+            else
+            {
+                /* remove requests for single fork */
+                bms_free(entry->requests[forknum]);
+                entry->requests[forknum] = NULL;
+                entry->canceled[forknum] = true;
             }
         }
     }
@@ -1406,10 +1484,15 @@ RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
         hash_seq_init(&hstat, pendingOpsTable);
         while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
         {
-            if (entry->tag.rnode.dbNode == rnode.dbNode)
+            if (entry->rnode.dbNode == rnode.dbNode)
             {
-                /* Okay, cancel this entry */
-                entry->canceled = true;
+                /* remove requests for all forks */
+                for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+                {
+                    bms_free(entry->requests[forknum]);
+                    entry->requests[forknum] = NULL;
+                    entry->canceled[forknum] = true;
+                }
             }
         }
 
@@ -1449,40 +1532,32 @@ RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
     else
     {
         /* Normal case: enter a request to fsync this segment */
-        PendingOperationTag key;
+        MemoryContext oldcxt = MemoryContextSwitchTo(MdCxt);
         PendingOperationEntry *entry;
         bool        found;
 
-        /* ensure any pad bytes in the hash key are zeroed */
-        MemSet(&key, 0, sizeof(key));
-        key.rnode = rnode;
-        key.forknum = forknum;
-        key.segno = segno;
-
         entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
-                                                      &key,
+                                                      &rnode,
                                                       HASH_ENTER,
                                                       &found);
-        /* if new or previously canceled entry, initialize it */
-        if (!found || entry->canceled)
+        /* if new entry, initialize it */
+        if (!found)
         {
-            entry->canceled = false;
             entry->cycle_ctr = mdsync_cycle_ctr;
+            MemSet(entry->requests, 0, sizeof(entry->requests));
+            MemSet(entry->canceled, 0, sizeof(entry->canceled));
         }
 
         /*
         * NB: it's intentional that we don't change cycle_ctr if the entry
-        * already exists.  The fsync request must be treated as old, even
-        * though the new request will be satisfied too by any subsequent
-        * fsync.
-        *
-        * However, if the entry is present but is marked canceled, we should
-        * act just as though it wasn't there.  The only case where this could
-        * happen would be if a file had been deleted, we received but did not
-        * yet act on the cancel request, and the same relfilenode was then
-        * assigned to a new file.  We mustn't lose the new request, but it
-        * should be considered new not old.
+        * already exists.  The cycle_ctr must represent the oldest fsync
+        * request that could be in the entry.
         */
+
+        entry->requests[forknum] = bms_add_member(entry->requests[forknum],
+                                                  (int) segno);
+
+        MemoryContextSwitchTo(oldcxt);
     }
 }
 
@@ -1503,7 +1578,7 @@ ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum)
     else if (IsUnderPostmaster)
     {
         /*
-         * Notify the checkpointer about it.  If we fail to queue the revoke
+         * Notify the checkpointer about it.  If we fail to queue the cancel
         * message, we have to sleep and try again ... ugly, but hopefully
         * won't happen often.
         *
@@ -1517,7 +1592,7 @@ ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum)
 
         /*
         * Note we don't wait for the checkpointer to actually absorb the
-        * revoke message; see mdsync() for the implications.
+        * cancel message; see mdsync() for the implications.
        */
     }
 }
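One detail of the mdsync() change above is worth spelling out: the loop
detaches each fork's bitmapset from the hashtable entry before draining it,
so that AbsorbFsyncRequests(), called inside the loop, can install a fresh
set for newly arriving requests without disturbing the scan.  The toy
program below demonstrates that detach-then-drain pattern; it is a sketch,
not PostgreSQL code.  A uint64_t stands in for Bitmapset, and first_member()
mimics bms_first_member(), which removes and returns the smallest member and
returns -1 once the set is empty; __builtin_ctzll assumes GCC or Clang.

    /*
     * Toy demonstration of the detach-then-drain pattern used by mdsync(),
     * with a uint64_t standing in for PostgreSQL's Bitmapset.
     */
    #include <stdint.h>
    #include <stdio.h>

    typedef struct { uint64_t requests; } Entry;

    /* analogue of bms_first_member: pop lowest set bit, -1 if empty */
    static int
    first_member(uint64_t *set)
    {
        if (*set == 0)
            return -1;
        int bit = __builtin_ctzll(*set);    /* index of lowest set bit */
        *set &= *set - 1;                   /* clear it */
        return bit;
    }

    int
    main(void)
    {
        Entry    e = {.requests = (1u << 1) | (1u << 4) | (1u << 9)};

        /* detach before scanning, as mdsync does with entry->requests[forknum] */
        uint64_t requests = e.requests;
        e.requests = 0;             /* new requests would accumulate here */

        int seg;
        while ((seg = first_member(&requests)) >= 0)
            printf("would fsync segment %d\n", seg);    /* prints 1, 4, 9 */
        return 0;
    }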