diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c index 417e3bb0d1b..92fd4276cd1 100644 --- a/src/backend/postmaster/checkpointer.c +++ b/src/backend/postmaster/checkpointer.c @@ -105,7 +105,7 @@ */ typedef struct { - RelFileNodeBackend rnode; + RelFileNode rnode; ForkNumber forknum; BlockNumber segno; /* see md.c for special values */ /* might add a real request-type field later; not needed yet */ @@ -924,17 +924,22 @@ CheckpointerShmemSize(void) void CheckpointerShmemInit(void) { + Size size = CheckpointerShmemSize(); bool found; CheckpointerShmem = (CheckpointerShmemStruct *) ShmemInitStruct("Checkpointer Data", - CheckpointerShmemSize(), + size, &found); if (!found) { - /* First time through, so initialize */ - MemSet(CheckpointerShmem, 0, sizeof(CheckpointerShmemStruct)); + /* + * First time through, so initialize. Note that we zero the whole + * requests array; this is so that CompactCheckpointerRequestQueue + * can assume that any pad bytes in the request structs are zeroes. + */ + MemSet(CheckpointerShmem, 0, size); SpinLockInit(&CheckpointerShmem->ckpt_lck); CheckpointerShmem->max_requests = NBuffers; } @@ -1091,11 +1096,15 @@ RequestCheckpoint(int flags) * Forward a file-fsync request from a backend to the checkpointer * * Whenever a backend is compelled to write directly to a relation - * (which should be seldom, if the checkpointer is getting its job done), + * (which should be seldom, if the background writer is getting its job done), * the backend calls this routine to pass over knowledge that the relation * is dirty and must be fsync'd before next checkpoint. We also use this * opportunity to count such writes for statistical purposes. * + * This functionality is only supported for regular (not backend-local) + * relations, so the rnode argument is intentionally RelFileNode not + * RelFileNodeBackend. + * * segno specifies which segment (not block!) of the relation needs to be * fsync'd. (Since the valid range is much less than BlockNumber, we can * use high values for special flags; that's all internal to md.c, which @@ -1112,8 +1121,7 @@ RequestCheckpoint(int flags) * let the backend know by returning false. */ bool -ForwardFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum, - BlockNumber segno) +ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno) { CheckpointerRequest *request; bool too_full; @@ -1169,6 +1177,7 @@ ForwardFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum, /* * CompactCheckpointerRequestQueue * Remove duplicates from the request queue to avoid backend fsyncs. + * Returns "true" if any entries were removed. * * Although a full fsync request queue is not common, it can lead to severe * performance problems when it does happen. So far, this situation has @@ -1178,7 +1187,7 @@ ForwardFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum, * gets very expensive and can slow down the whole system. * * Trying to do this every time the queue is full could lose if there - * aren't any removable entries. But should be vanishingly rare in + * aren't any removable entries. But that should be vanishingly rare in * practice: there's one queue entry per shared buffer. */ static bool @@ -1200,18 +1209,20 @@ CompactCheckpointerRequestQueue(void) /* must hold CheckpointerCommLock in exclusive mode */ Assert(LWLockHeldByMe(CheckpointerCommLock)); + /* Initialize skip_slot array */ + skip_slot = palloc0(sizeof(bool) * CheckpointerShmem->num_requests); + /* Initialize temporary hash table */ MemSet(&ctl, 0, sizeof(ctl)); ctl.keysize = sizeof(CheckpointerRequest); ctl.entrysize = sizeof(struct CheckpointerSlotMapping); ctl.hash = tag_hash; + ctl.hcxt = CurrentMemoryContext; + htab = hash_create("CompactCheckpointerRequestQueue", CheckpointerShmem->num_requests, &ctl, - HASH_ELEM | HASH_FUNCTION); - - /* Initialize skip_slot array */ - skip_slot = palloc0(sizeof(bool) * CheckpointerShmem->num_requests); + HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT); /* * The basic idea here is that a request can be skipped if it's followed @@ -1226,19 +1237,28 @@ CompactCheckpointerRequestQueue(void) * anyhow), but it's not clear that the extra complexity would buy us * anything. */ - for (n = 0; n < CheckpointerShmem->num_requests; ++n) + for (n = 0; n < CheckpointerShmem->num_requests; n++) { CheckpointerRequest *request; struct CheckpointerSlotMapping *slotmap; bool found; + /* + * We use the request struct directly as a hashtable key. This + * assumes that any padding bytes in the structs are consistently the + * same, which should be okay because we zeroed them in + * CheckpointerShmemInit. Note also that RelFileNode had better + * contain no pad bytes. + */ request = &CheckpointerShmem->requests[n]; slotmap = hash_search(htab, request, HASH_ENTER, &found); if (found) { + /* Duplicate, so mark the previous occurrence as skippable */ skip_slot[slotmap->slot] = true; - ++num_skipped; + num_skipped++; } + /* Remember slot containing latest occurrence of this request value */ slotmap->slot = n; } @@ -1253,7 +1273,8 @@ CompactCheckpointerRequestQueue(void) } /* We found some duplicates; remove them. */ - for (n = 0, preserve_count = 0; n < CheckpointerShmem->num_requests; ++n) + preserve_count = 0; + for (n = 0; n < CheckpointerShmem->num_requests; n++) { if (skip_slot[n]) continue; diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 78145472e16..a3bf9a4d44e 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -2049,7 +2049,7 @@ DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber forkNum, int i; /* If it's a local relation, it's localbuf.c's problem. */ - if (rnode.backend != InvalidBackendId) + if (RelFileNodeBackendIsTemp(rnode)) { if (rnode.backend == MyBackendId) DropRelFileNodeLocalBuffers(rnode.node, forkNum, firstDelBlock); @@ -2103,7 +2103,7 @@ DropRelFileNodeAllBuffers(RelFileNodeBackend rnode) int i; /* If it's a local relation, it's localbuf.c's problem. */ - if (rnode.backend != InvalidBackendId) + if (RelFileNodeBackendIsTemp(rnode)) { if (rnode.backend == MyBackendId) DropRelFileNodeAllLocalBuffers(rnode.node); diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index e5dec9d2a32..c05315d8849 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -38,7 +38,7 @@ /* * Special values for the segno arg to RememberFsyncRequest. * - * Note that CompactcheckpointerRequestQueue assumes that it's OK to remove an + * Note that CompactCheckpointerRequestQueue assumes that it's OK to remove an * fsync request from the queue if an identical, subsequent request is found. * See comments there before making changes here. */ @@ -122,13 +122,17 @@ static MemoryContext MdCxt; /* context for all md.c allocations */ * be deleted after the next checkpoint, but we use a linked list instead of * a hash table, because we don't expect there to be any duplicate requests. * + * These mechanisms are only used for non-temp relations; we never fsync + * temp rels, nor do we need to postpone their deletion (see comments in + * mdunlink). + * * (Regular backends do not track pending operations locally, but forward * them to the checkpointer.) */ typedef struct { - RelFileNodeBackend rnode; /* the targeted relation */ - ForkNumber forknum; + RelFileNode rnode; /* the targeted relation */ + ForkNumber forknum; /* which fork */ BlockNumber segno; /* which segment */ } PendingOperationTag; @@ -143,7 +147,7 @@ typedef struct typedef struct { - RelFileNodeBackend rnode; /* the dead relation to delete */ + RelFileNode rnode; /* the dead relation to delete */ CycleCtr cycle_ctr; /* mdckpt_cycle_ctr when request was made */ } PendingUnlinkEntry; @@ -302,11 +306,11 @@ mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo) /* * mdunlink() -- Unlink a relation. * - * Note that we're passed a RelFileNode --- by the time this is called, + * Note that we're passed a RelFileNodeBackend --- by the time this is called, * there won't be an SMgrRelation hashtable entry anymore. * - * Actually, we don't unlink the first segment file of the relation, but - * just truncate it to zero length, and record a request to unlink it after + * For regular relations, we don't unlink the first segment file of the rel, + * but just truncate it to zero length, and record a request to unlink it after * the next checkpoint. Additional segments can be unlinked immediately, * however. Leaving the empty file in place prevents that relfilenode * number from being reused. The scenario this protects us from is: @@ -323,6 +327,12 @@ mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo) * number until it's safe, because relfilenode assignment skips over any * existing file. * + * We do not need to go through this dance for temp relations, though, because + * we never make WAL entries for temp rels, and so a temp rel poses no threat + * to the health of a regular rel that has taken over its relfilenode number. + * The fact that temp rels and regular rels have different file naming + * patterns provides additional safety. + * * All the above applies only to the relation's main fork; other forks can * just be removed immediately, since they are not needed to prevent the * relfilenode number from being recycled. Also, we do not carefully @@ -345,16 +355,18 @@ mdunlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo) /* * We have to clean out any pending fsync requests for the doomed - * relation, else the next mdsync() will fail. + * relation, else the next mdsync() will fail. There can't be any such + * requests for a temp relation, though. */ - ForgetRelationFsyncRequests(rnode, forkNum); + if (!RelFileNodeBackendIsTemp(rnode)) + ForgetRelationFsyncRequests(rnode.node, forkNum); path = relpath(rnode, forkNum); /* * Delete or truncate the first segment. */ - if (isRedo || forkNum != MAIN_FORKNUM) + if (isRedo || forkNum != MAIN_FORKNUM || RelFileNodeBackendIsTemp(rnode)) { ret = unlink(path); if (ret < 0 && errno != ENOENT) @@ -1077,12 +1089,8 @@ mdsync(void) * This fact justifies our not closing the reln in the success * path either, which is a good thing since in * non-checkpointer cases we couldn't safely do that.) - * Furthermore, in many cases the relation will have been - * dirtied through this same smgr relation, and so we can save - * a file open/close cycle. */ - reln = smgropen(entry->tag.rnode.node, - entry->tag.rnode.backend); + reln = smgropen(entry->tag.rnode, InvalidBackendId); /* * It is possible that the relation has been dropped or @@ -1228,7 +1236,7 @@ mdpostckpt(void) Assert((CycleCtr) (entry->cycle_ctr + 1) == mdckpt_cycle_ctr); /* Unlink the file */ - path = relpath(entry->rnode, MAIN_FORKNUM); + path = relpathperm(entry->rnode, MAIN_FORKNUM); if (unlink(path) < 0) { /* @@ -1255,21 +1263,24 @@ mdpostckpt(void) * * If there is a local pending-ops table, just make an entry in it for * mdsync to process later. Otherwise, try to pass off the fsync request - * to the background writer process. If that fails, just do the fsync - * locally before returning (we expect this will not happen often enough + * to the checkpointer process. If that fails, just do the fsync + * locally before returning (we hope this will not happen often enough * to be a performance problem). */ static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) { + /* Temp relations should never be fsync'd */ + Assert(!SmgrIsTemp(reln)); + if (pendingOpsTable) { /* push it into local pending-ops table */ - RememberFsyncRequest(reln->smgr_rnode, forknum, seg->mdfd_segno); + RememberFsyncRequest(reln->smgr_rnode.node, forknum, seg->mdfd_segno); } else { - if (ForwardFsyncRequest(reln->smgr_rnode, forknum, seg->mdfd_segno)) + if (ForwardFsyncRequest(reln->smgr_rnode.node, forknum, seg->mdfd_segno)) return; /* passed it off successfully */ ereport(DEBUG1, @@ -1286,16 +1297,23 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) /* * register_unlink() -- Schedule a file to be deleted after next checkpoint * + * We don't bother passing in the fork number, because this is only used + * with main forks. + * * As with register_dirty_segment, this could involve either a local or * a remote pending-ops table. */ static void register_unlink(RelFileNodeBackend rnode) { + /* Should never be used with temp relations */ + Assert(!RelFileNodeBackendIsTemp(rnode)); + if (pendingOpsTable) { /* push it into local pending-ops table */ - RememberFsyncRequest(rnode, MAIN_FORKNUM, UNLINK_RELATION_REQUEST); + RememberFsyncRequest(rnode.node, MAIN_FORKNUM, + UNLINK_RELATION_REQUEST); } else { @@ -1307,7 +1325,7 @@ register_unlink(RelFileNodeBackend rnode) * XXX should we just leave the file orphaned instead? */ Assert(IsUnderPostmaster); - while (!ForwardFsyncRequest(rnode, MAIN_FORKNUM, + while (!ForwardFsyncRequest(rnode.node, MAIN_FORKNUM, UNLINK_RELATION_REQUEST)) pg_usleep(10000L); /* 10 msec seems a good number */ } @@ -1333,8 +1351,7 @@ register_unlink(RelFileNodeBackend rnode) * structure for them.) */ void -RememberFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum, - BlockNumber segno) +RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno) { Assert(pendingOpsTable); @@ -1347,7 +1364,7 @@ RememberFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum, hash_seq_init(&hstat, pendingOpsTable); while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL) { - if (RelFileNodeBackendEquals(entry->tag.rnode, rnode) && + if (RelFileNodeEquals(entry->tag.rnode, rnode) && entry->tag.forknum == forknum) { /* Okay, cancel this entry */ @@ -1368,7 +1385,7 @@ RememberFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum, hash_seq_init(&hstat, pendingOpsTable); while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL) { - if (entry->tag.rnode.node.dbNode == rnode.node.dbNode) + if (entry->tag.rnode.dbNode == rnode.dbNode) { /* Okay, cancel this entry */ entry->canceled = true; @@ -1382,7 +1399,7 @@ RememberFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum, PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell); next = lnext(cell); - if (entry->rnode.node.dbNode == rnode.node.dbNode) + if (entry->rnode.dbNode == rnode.dbNode) { pendingUnlinks = list_delete_cell(pendingUnlinks, cell, prev); pfree(entry); @@ -1397,6 +1414,9 @@ RememberFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum, MemoryContext oldcxt = MemoryContextSwitchTo(MdCxt); PendingUnlinkEntry *entry; + /* PendingUnlinkEntry doesn't store forknum, since it's always MAIN */ + Assert(forknum == MAIN_FORKNUM); + entry = palloc(sizeof(PendingUnlinkEntry)); entry->rnode = rnode; entry->cycle_ctr = mdckpt_cycle_ctr; @@ -1446,10 +1466,10 @@ RememberFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum, } /* - * ForgetRelationFsyncRequests -- forget any fsyncs for a rel + * ForgetRelationFsyncRequests -- forget any fsyncs for a relation fork */ void -ForgetRelationFsyncRequests(RelFileNodeBackend rnode, ForkNumber forknum) +ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum) { if (pendingOpsTable) { @@ -1484,12 +1504,11 @@ ForgetRelationFsyncRequests(RelFileNodeBackend rnode, ForkNumber forknum) void ForgetDatabaseFsyncRequests(Oid dbid) { - RelFileNodeBackend rnode; + RelFileNode rnode; - rnode.node.dbNode = dbid; - rnode.node.spcNode = 0; - rnode.node.relNode = 0; - rnode.backend = InvalidBackendId; + rnode.dbNode = dbid; + rnode.spcNode = 0; + rnode.relNode = 0; if (pendingOpsTable) { diff --git a/src/include/postmaster/bgwriter.h b/src/include/postmaster/bgwriter.h index 996065c2edf..2e97e6aea55 100644 --- a/src/include/postmaster/bgwriter.h +++ b/src/include/postmaster/bgwriter.h @@ -31,7 +31,7 @@ extern void CheckpointerMain(void) __attribute__((noreturn)); extern void RequestCheckpoint(int flags); extern void CheckpointWriteDelay(int flags, double progress); -extern bool ForwardFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum, +extern bool ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno); extern void AbsorbFsyncRequests(void); diff --git a/src/include/storage/relfilenode.h b/src/include/storage/relfilenode.h index 60c38295375..5ec1d8f7177 100644 --- a/src/include/storage/relfilenode.h +++ b/src/include/storage/relfilenode.h @@ -69,6 +69,10 @@ typedef enum ForkNumber * Note: in pg_class, relfilenode can be zero to denote that the relation * is a "mapped" relation, whose current true filenode number is available * from relmapper.c. Again, this case is NOT allowed in RelFileNodes. + * + * Note: various places use RelFileNode in hashtable keys. Therefore, + * there *must not* be any unused padding bytes in this struct. That + * should be safe as long as all the fields are of type Oid. */ typedef struct RelFileNode { @@ -79,7 +83,11 @@ typedef struct RelFileNode /* * Augmenting a relfilenode with the backend ID provides all the information - * we need to locate the physical storage. + * we need to locate the physical storage. The backend ID is InvalidBackendId + * for regular relations (those accessible to more than one backend), or the + * owning backend's ID for backend-local relations. Backend-local relations + * are always transient and removed in case of a database crash; they are + * never WAL-logged or fsync'd. */ typedef struct RelFileNodeBackend { @@ -87,11 +95,15 @@ typedef struct RelFileNodeBackend BackendId backend; } RelFileNodeBackend; +#define RelFileNodeBackendIsTemp(rnode) \ + ((rnode).backend != InvalidBackendId) + /* * Note: RelFileNodeEquals and RelFileNodeBackendEquals compare relNode first * since that is most likely to be different in two unequal RelFileNodes. It * is probably redundant to compare spcNode if the other fields are found equal, - * but do it anyway to be sure. + * but do it anyway to be sure. Likewise for checking the backend ID in + * RelFileNodeBackendEquals. */ #define RelFileNodeEquals(node1, node2) \ ((node1).relNode == (node2).relNode && \ diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index f8fc2b2d6e8..3560d539076 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -69,7 +69,7 @@ typedef struct SMgrRelationData typedef SMgrRelationData *SMgrRelation; #define SmgrIsTemp(smgr) \ - ((smgr)->smgr_rnode.backend != InvalidBackendId) + RelFileNodeBackendIsTemp((smgr)->smgr_rnode) extern void smgrinit(void); extern SMgrRelation smgropen(RelFileNode rnode, BackendId backend); @@ -124,10 +124,9 @@ extern void mdsync(void); extern void mdpostckpt(void); extern void SetForwardFsyncRequests(void); -extern void RememberFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum, +extern void RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno); -extern void ForgetRelationFsyncRequests(RelFileNodeBackend rnode, - ForkNumber forknum); +extern void ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum); extern void ForgetDatabaseFsyncRequests(Oid dbid); /* smgrtype.c */