diff --git a/src/backend/access/rmgrdesc/mxactdesc.c b/src/backend/access/rmgrdesc/mxactdesc.c index 572951ec2f1..5b8134f2582 100644 --- a/src/backend/access/rmgrdesc/mxactdesc.c +++ b/src/backend/access/rmgrdesc/mxactdesc.c @@ -70,6 +70,14 @@ multixact_desc(StringInfo buf, XLogReaderState *record) for (i = 0; i < xlrec->nmembers; i++) out_member(buf, &xlrec->members[i]); } + else if (info == XLOG_MULTIXACT_TRUNCATE_ID) + { + xl_multixact_truncate *xlrec = (xl_multixact_truncate *) rec; + + appendStringInfo(buf, "offsets [%u, %u), members [%u, %u)", + xlrec->startTruncOff, xlrec->endTruncOff, + xlrec->startTruncMemb, xlrec->endTruncMemb); + } } const char * @@ -88,6 +96,9 @@ multixact_identify(uint8 info) case XLOG_MULTIXACT_CREATE_ID: id = "CREATE_ID"; break; + case XLOG_MULTIXACT_TRUNCATE_ID: + id = "TRUNCATE_ID"; + break; } return id; diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index 1933a87d656..5e3357f87f7 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -49,9 +49,7 @@ * value is removed; the cutoff value is stored in pg_class. The minimum value * across all tables in each database is stored in pg_database, and the global * minimum across all databases is part of pg_control and is kept in shared - * memory. At checkpoint time, after the value is known flushed in WAL, any - * files that correspond to multixacts older than that value are removed. - * (These files are also removed when a restartpoint is executed.) + * memory. Whenever that minimum is advanced, the SLRUs are truncated. 
* * When new multixactid values are to be created, care is taken that the * counter does not fall within the wraparound horizon considering the global @@ -83,6 +81,7 @@ #include "postmaster/autovacuum.h" #include "storage/lmgr.h" #include "storage/pmsignal.h" +#include "storage/proc.h" #include "storage/procarray.h" #include "utils/builtins.h" #include "utils/memutils.h" @@ -109,6 +108,7 @@ ((xid) / (MultiXactOffset) MULTIXACT_OFFSETS_PER_PAGE) #define MultiXactIdToOffsetEntry(xid) \ ((xid) % (MultiXactOffset) MULTIXACT_OFFSETS_PER_PAGE) +#define MultiXactIdToOffsetSegment(xid) (MultiXactIdToOffsetPage(xid) / SLRU_PAGES_PER_SEGMENT) /* * The situation for members is a bit more complex: we store one byte of @@ -153,6 +153,7 @@ /* page in which a member is to be found */ #define MXOffsetToMemberPage(xid) ((xid) / (TransactionId) MULTIXACT_MEMBERS_PER_PAGE) +#define MXOffsetToMemberSegment(xid) (MXOffsetToMemberPage(xid) / SLRU_PAGES_PER_SEGMENT) /* Location (byte offset within page) of flag word for a given member */ #define MXOffsetToFlagsOffset(xid) \ @@ -212,19 +213,20 @@ typedef struct MultiXactStateData Oid oldestMultiXactDB; /* - * Oldest multixact offset that is potentially referenced by a - * multixact referenced by a relation. We don't always know this value, - * so there's a flag here to indicate whether or not we currently do. + * Oldest multixact offset that is potentially referenced by a multixact + * referenced by a relation. We don't always know this value, so there's + * a flag here to indicate whether or not we currently do. */ MultiXactOffset oldestOffset; bool oldestOffsetKnown; /* - * This is what the previous checkpoint stored as the truncate position. - * This value is the oldestMultiXactId that was valid when a checkpoint - * was last executed. + * True if a multixact truncation WAL record was replayed since the last + * checkpoint. This is used to trigger 'legacy truncations', i.e. 
truncate + * by looking at the data directory during WAL replay, when the primary is + * too old to generate truncation records. */ - MultiXactId lastCheckpointedOldest; + bool sawTruncationInCkptCycle; /* support for anti-wraparound measures */ MultiXactId multiVacLimit; @@ -233,8 +235,7 @@ typedef struct MultiXactStateData MultiXactId multiWrapLimit; /* support for members anti-wraparound measures */ - MultiXactOffset offsetStopLimit; - bool offsetStopLimitKnown; + MultiXactOffset offsetStopLimit; /* known if oldestOffsetKnown */ /* * Per-backend data starts here. We have two arrays stored in the area @@ -364,12 +365,14 @@ static bool MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2); static void ExtendMultiXactOffset(MultiXactId multi); static void ExtendMultiXactMember(MultiXactOffset offset, int nmembers); -static void DetermineSafeOldestOffset(MultiXactId oldestMXact); static bool MultiXactOffsetWouldWrap(MultiXactOffset boundary, MultiXactOffset start, uint32 distance); -static bool SetOffsetVacuumLimit(bool finish_setup); +static bool SetOffsetVacuumLimit(void); static bool find_multixact_start(MultiXactId multi, MultiXactOffset *result); static void WriteMZeroPageXlogRec(int pageno, uint8 info); +static void WriteMTruncateXlogRec(Oid oldestMultiDB, + MultiXactId startOff, MultiXactId endOff, + MultiXactOffset startMemb, MultiXactOffset endMemb); /* @@ -1099,7 +1102,7 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset) *---------- */ #define OFFSET_WARN_SEGMENTS 20 - if (MultiXactState->offsetStopLimitKnown && + if (MultiXactState->oldestOffsetKnown && MultiXactOffsetWouldWrap(MultiXactState->offsetStopLimit, nextOffset, nmembers)) { @@ -1139,7 +1142,7 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset) SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); } - if (MultiXactState->offsetStopLimitKnown && + if (MultiXactState->oldestOffsetKnown && MultiXactOffsetWouldWrap(MultiXactState->offsetStopLimit, nextOffset, 
nmembers + MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT * OFFSET_WARN_SEGMENTS)) @@ -2010,20 +2013,24 @@ StartupMultiXact(void) /* * This must be called ONCE at the end of startup/recovery. - * - * We don't need any locks here, really; the SLRU locks are taken only because - * slru.c expects to be called with locks held. */ void TrimMultiXact(void) { - MultiXactId multi = MultiXactState->nextMXact; - MultiXactOffset offset = MultiXactState->nextOffset; - MultiXactId oldestMXact; + MultiXactId nextMXact; + MultiXactOffset offset; + MultiXactId oldestMXact; + Oid oldestMXactDB; int pageno; int entryno; int flagsoff; + LWLockAcquire(MultiXactGenLock, LW_SHARED); + nextMXact = MultiXactState->nextMXact; + offset = MultiXactState->nextOffset; + oldestMXact = MultiXactState->oldestMultiXactId; + oldestMXactDB = MultiXactState->oldestMultiXactDB; + LWLockRelease(MultiXactGenLock); /* Clean up offsets state */ LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE); @@ -2031,20 +2038,20 @@ TrimMultiXact(void) /* * (Re-)Initialize our idea of the latest page number for offsets. */ - pageno = MultiXactIdToOffsetPage(multi); + pageno = MultiXactIdToOffsetPage(nextMXact); MultiXactOffsetCtl->shared->latest_page_number = pageno; /* * Zero out the remainder of the current offsets page. See notes in * TrimCLOG() for motivation. 
*/ - entryno = MultiXactIdToOffsetEntry(multi); + entryno = MultiXactIdToOffsetEntry(nextMXact); if (entryno != 0) { int slotno; MultiXactOffset *offptr; - slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, multi); + slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact); offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; offptr += entryno; @@ -2093,12 +2100,13 @@ TrimMultiXact(void) LWLockRelease(MultiXactMemberControlLock); - if (SetOffsetVacuumLimit(true) && IsUnderPostmaster) - SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); - LWLockAcquire(MultiXactGenLock, LW_SHARED); - oldestMXact = MultiXactState->lastCheckpointedOldest; + /* signal that we're officially up */ + LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); + MultiXactState->finishedStartup = true; LWLockRelease(MultiXactGenLock); - DetermineSafeOldestOffset(oldestMXact); + + /* Now compute how far away the next members wraparound is. */ + SetMultiXactIdLimit(oldestMXact, oldestMXactDB); } /* @@ -2267,8 +2275,20 @@ SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid) (errmsg("MultiXactId wrap limit is %u, limited by database with OID %u", multiWrapLimit, oldest_datoid))); + /* + * Computing the actual limits is only possible once the data directory is + * in a consistent state. There's no need to compute the limits while + * still replaying WAL - no decisions about new multis are made even + * though multixact creations might be replayed. So we'll only do further + * checks after TrimMultiXact() has been called. + */ + if (!MultiXactState->finishedStartup) + return; + + Assert(!InRecovery); + /* Set limits for offset vacuum. 
*/ - needs_offset_vacuum = SetOffsetVacuumLimit(false); + needs_offset_vacuum = SetOffsetVacuumLimit(); /* * If past the autovacuum force point, immediately signal an autovac @@ -2278,11 +2298,11 @@ SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid) * another iteration immediately if there are still any old databases. */ if ((MultiXactIdPrecedes(multiVacLimit, curMulti) || - needs_offset_vacuum) && IsUnderPostmaster && !InRecovery) + needs_offset_vacuum) && IsUnderPostmaster) SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER); /* Give an immediate warning if past the wrap warn point */ - if (MultiXactIdPrecedes(multiWarnLimit, curMulti) && !InRecovery) + if (MultiXactIdPrecedes(multiWarnLimit, curMulti)) { char *oldest_datname; @@ -2350,27 +2370,39 @@ MultiXactAdvanceNextMXact(MultiXactId minMulti, } /* - * Update our oldestMultiXactId value, but only if it's more recent than - * what we had. However, even if not, always update the oldest multixact - * offset limit. + * Update our oldestMultiXactId value, but only if it's more recent than what + * we had. + * + * This may only be called during WAL replay. */ void MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB) { - if (MultiXactIdPrecedes(MultiXactState->oldestMultiXactId, oldestMulti)) - SetMultiXactIdLimit(oldestMulti, oldestMultiDB); -} + Assert(InRecovery); -/* - * Update the "safe truncation point". This is the newest value of oldestMulti - * that is known to be flushed as part of a checkpoint record. 
- */ -void -MultiXactSetSafeTruncate(MultiXactId safeTruncateMulti) -{ - LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); - MultiXactState->lastCheckpointedOldest = safeTruncateMulti; - LWLockRelease(MultiXactGenLock); + if (MultiXactIdPrecedes(MultiXactState->oldestMultiXactId, oldestMulti)) + { + /* + * If there has been a truncation on the master, detected by seeing a + * moving oldestMulti, without a corresponding truncation record, we + * know that the primary is still running an older version of postgres + * that doesn't yet log multixact truncations. So perform the + * truncation ourselves. + */ + if (!MultiXactState->sawTruncationInCkptCycle) + { + ereport(LOG, + (errmsg("performing legacy multixact truncation"), + errdetail("Legacy truncations are sometimes performed when replaying WAL from an older primary."), + errhint("Upgrade the primary, it is susceptible to data corruption."))); + TruncateMultiXact(oldestMulti, oldestMultiDB, true); + } + + SetMultiXactIdLimit(oldestMulti, oldestMultiDB); + } + + /* only looked at in the startup process, no lock necessary */ + MultiXactState->sawTruncationInCkptCycle = false; } /* @@ -2526,132 +2558,56 @@ GetOldestMultiXactId(void) } /* - * Based on the given oldest MultiXactId, determine what's the oldest member - * offset and install the limit info in MultiXactState, where it can be used to - * prevent overrun of old data in the members SLRU area. + * Determine how aggressively we need to vacuum in order to prevent member + * wraparound. + * + * To do so determine what's the oldest member offset and install the limit + * info in MultiXactState, where it can be used to prevent overrun of old data + * in the members SLRU area. + * + * The return value is true if emergency autovacuum is required and false + * otherwise. 
*/ -static void -DetermineSafeOldestOffset(MultiXactId oldestMXact) +static bool +SetOffsetVacuumLimit(void) { - MultiXactOffset oldestOffset; + MultiXactId oldestMultiXactId; + MultiXactId nextMXact; + MultiXactOffset oldestOffset = 0; /* placate compiler */ + MultiXactOffset prevOldestOffset; MultiXactOffset nextOffset; - MultiXactOffset offsetStopLimit; - MultiXactOffset prevOffsetStopLimit; - MultiXactId nextMXact; - bool finishedStartup; - bool prevOffsetStopLimitKnown; + bool oldestOffsetKnown = false; + bool prevOldestOffsetKnown; + MultiXactOffset offsetStopLimit = 0; - /* Fetch values from shared memory. */ + /* + * NB: Have to prevent concurrent truncation, we might otherwise try to + * look up an oldestMulti that's concurrently getting truncated away. + */ + LWLockAcquire(MultiXactTruncationLock, LW_SHARED); + + /* Read relevant fields from shared memory. */ LWLockAcquire(MultiXactGenLock, LW_SHARED); - finishedStartup = MultiXactState->finishedStartup; + oldestMultiXactId = MultiXactState->oldestMultiXactId; nextMXact = MultiXactState->nextMXact; nextOffset = MultiXactState->nextOffset; - prevOffsetStopLimit = MultiXactState->offsetStopLimit; - prevOffsetStopLimitKnown = MultiXactState->offsetStopLimitKnown; + prevOldestOffsetKnown = MultiXactState->oldestOffsetKnown; + prevOldestOffset = MultiXactState->oldestOffset; + Assert(MultiXactState->finishedStartup); LWLockRelease(MultiXactGenLock); - /* Don't worry about this until after we've started up. */ - if (!finishedStartup) - return; - /* * Determine the offset of the oldest multixact. Normally, we can read * the offset from the multixact itself, but there's an important special * case: if there are no multixacts in existence at all, oldestMXact * obviously can't point to one. It will instead point to the multixact * ID that will be assigned the next time one is needed.
- * - * NB: oldestMXact should be the oldest multixact that still exists in the - * SLRU, unlike in SetOffsetVacuumLimit, where we do this same computation - * based on the oldest value that might be referenced in a table. - */ - if (nextMXact == oldestMXact) - oldestOffset = nextOffset; - else - { - bool oldestOffsetKnown; - - oldestOffsetKnown = find_multixact_start(oldestMXact, &oldestOffset); - if (!oldestOffsetKnown) - { - ereport(LOG, - (errmsg("MultiXact member wraparound protections are disabled because oldest checkpointed MultiXact %u does not exist on disk", - oldestMXact))); - return; - } - } - - /* move back to start of the corresponding segment */ - offsetStopLimit = oldestOffset - (oldestOffset % - (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT)); - /* always leave one segment before the wraparound point */ - offsetStopLimit -= (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT); - - /* if nothing has changed, we're done */ - if (prevOffsetStopLimitKnown && offsetStopLimit == prevOffsetStopLimit) - return; - - LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); - MultiXactState->offsetStopLimit = offsetStopLimit; - MultiXactState->offsetStopLimitKnown = true; - LWLockRelease(MultiXactGenLock); - - if (!prevOffsetStopLimitKnown && IsUnderPostmaster) - ereport(LOG, - (errmsg("MultiXact member wraparound protections are now enabled"))); - ereport(DEBUG1, - (errmsg("MultiXact member stop limit is now %u based on MultiXact %u", - offsetStopLimit, oldestMXact))); -} - -/* - * Determine how aggressively we need to vacuum in order to prevent member - * wraparound. - * - * To determine the oldest multixact ID, we look at oldestMultiXactId, not - * lastCheckpointedOldest. That's because vacuuming can't help with anything - * older than oldestMultiXactId; anything older than that isn't referenced - * by any table. Offsets older than oldestMultiXactId but not as old as - * lastCheckpointedOldest will go away after the next checkpoint. 
- * - * The return value is true if emergency autovacuum is required and false - * otherwise. - */ -static bool -SetOffsetVacuumLimit(bool finish_setup) -{ - MultiXactId oldestMultiXactId; - MultiXactId nextMXact; - bool finishedStartup; - MultiXactOffset oldestOffset = 0; /* placate compiler */ - MultiXactOffset nextOffset; - bool oldestOffsetKnown = false; - MultiXactOffset prevOldestOffset; - bool prevOldestOffsetKnown; - - /* Read relevant fields from shared memory. */ - LWLockAcquire(MultiXactGenLock, LW_SHARED); - oldestMultiXactId = MultiXactState->oldestMultiXactId; - nextMXact = MultiXactState->nextMXact; - nextOffset = MultiXactState->nextOffset; - finishedStartup = MultiXactState->finishedStartup; - prevOldestOffset = MultiXactState->oldestOffset; - prevOldestOffsetKnown = MultiXactState->oldestOffsetKnown; - LWLockRelease(MultiXactGenLock); - - /* Don't do this until after any recovery is complete. */ - if (!finishedStartup && !finish_setup) - return false; - - /* - * If no multixacts exist, then oldestMultiXactId will be the next - * multixact that will be created, rather than an existing multixact. */ if (oldestMultiXactId == nextMXact) { /* - * When the next multixact gets created, it will be stored at the - * next offset. + * When the next multixact gets created, it will be stored at the next + * offset. */ oldestOffset = nextOffset; oldestOffsetKnown = true; @@ -2659,55 +2615,67 @@ SetOffsetVacuumLimit(bool finish_setup) else { /* - * Figure out where the oldest existing multixact's offsets are stored. - * Due to bugs in early release of PostgreSQL 9.3.X and 9.4.X, the - * supposedly-earliest multixact might not really exist. We are + * Figure out where the oldest existing multixact's offsets are + * stored. Due to bugs in early release of PostgreSQL 9.3.X and 9.4.X, + * the supposedly-earliest multixact might not really exist. We are * careful not to fail in that case. 
*/ oldestOffsetKnown = find_multixact_start(oldestMultiXactId, &oldestOffset); - } - /* - * Except when initializing the system for the first time, there's no - * need to update anything if we don't know the oldest offset or if it - * hasn't changed. - */ - if (finish_setup || - (oldestOffsetKnown && !prevOldestOffsetKnown) || - (oldestOffsetKnown && prevOldestOffset != oldestOffset)) - { - /* Install the new limits. */ - LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); - MultiXactState->oldestOffset = oldestOffset; - MultiXactState->oldestOffsetKnown = oldestOffsetKnown; - MultiXactState->finishedStartup = true; - LWLockRelease(MultiXactGenLock); - - /* Log the info */ if (oldestOffsetKnown) ereport(DEBUG1, (errmsg("oldest MultiXactId member is at offset %u", - oldestOffset))); + oldestOffset))); else - ereport(DEBUG1, - (errmsg("oldest MultiXactId member offset unknown"))); + ereport(LOG, + (errmsg("MultiXact member wraparound protections are disabled because oldest checkpointed MultiXact %u does not exist on disk", + oldestMultiXactId))); } + LWLockRelease(MultiXactTruncationLock); + /* - * If we failed to get the oldest offset this time, but we have a value - * from a previous pass through this function, assess the need for - * autovacuum based on that old value rather than automatically forcing - * it. + * If we can, compute limits (and install them in MultiXactState) to prevent + * overrun of old data in the members SLRU area. We can only do so if the + * oldest offset is known though.
*/ - if (prevOldestOffsetKnown && !oldestOffsetKnown) + if (oldestOffsetKnown) { + /* move back to start of the corresponding segment */ + offsetStopLimit = oldestOffset - (oldestOffset % + (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT)); + + /* always leave one segment before the wraparound point */ + offsetStopLimit -= (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT); + + if (!prevOldestOffsetKnown && IsUnderPostmaster) + ereport(LOG, + (errmsg("MultiXact member wraparound protections are now enabled"))); + ereport(DEBUG1, + (errmsg("MultiXact member stop limit is now %u based on MultiXact %u", + offsetStopLimit, oldestMultiXactId))); + } + else if (prevOldestOffsetKnown) + { + /* + * If we failed to get the oldest offset this time, but we have a + * value from a previous pass through this function, use the old value + * rather than automatically forcing an emergency autovacuum. + */ oldestOffset = prevOldestOffset; oldestOffsetKnown = true; } + /* Install the computed values */ + LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); + MultiXactState->oldestOffset = oldestOffset; + MultiXactState->oldestOffsetKnown = oldestOffsetKnown; + MultiXactState->offsetStopLimit = offsetStopLimit; + LWLockRelease(MultiXactGenLock); + /* - * Do we need an emergency autovacuum? If we're not sure, assume yes. + * Do we need an emergency autovacuum? If we're not sure, assume yes. */ return !oldestOffsetKnown || (nextOffset - oldestOffset > MULTIXACT_MEMBER_SAFE_THRESHOLD); @@ -2720,7 +2688,7 @@ SetOffsetVacuumLimit(bool finish_setup) * boundary point, hence the name. The reason we don't want to use the regular * 2^31-modulo arithmetic here is that we want to be able to use the whole of * the 2^32-1 space here, allowing for more multixacts that would fit - * otherwise. See also SlruScanDirCbRemoveMembers. + * otherwise.
*/ static bool MultiXactOffsetWouldWrap(MultiXactOffset boundary, MultiXactOffset start, @@ -2766,6 +2734,9 @@ MultiXactOffsetWouldWrap(MultiXactOffset boundary, MultiXactOffset start, * * Returns false if the file containing the multi does not exist on disk. * Otherwise, returns true and sets *result to the starting member offset. + * + * This function does not prevent concurrent truncation, so if that's + * required, the caller has to protect against that. */ static bool find_multixact_start(MultiXactId multi, MultiXactOffset *result) @@ -2776,9 +2747,22 @@ find_multixact_start(MultiXactId multi, MultiXactOffset *result) int slotno; MultiXactOffset *offptr; + /* XXX: Remove || AmStartupProcess() after WAL page magic bump */ + Assert(MultiXactState->finishedStartup || AmStartupProcess()); + pageno = MultiXactIdToOffsetPage(multi); entryno = MultiXactIdToOffsetEntry(multi); + /* + * Flush out dirty data, so SimpleLruDoesPhysicalPageExist() can work correctly. + * SimpleLruFlush() is a pretty big hammer for that. Alternatively we + * could add an in-memory version of the page-exists check, but find_multixact_start + * is called infrequently, and it doesn't seem bad to flush buffers to + * disk before truncation. + */ + SimpleLruFlush(MultiXactOffsetCtl, true); + SimpleLruFlush(MultiXactMemberCtl, true); + if (!SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, pageno)) return false; @@ -2884,65 +2868,6 @@ MultiXactMemberFreezeThreshold(void) return multixacts - victim_multixacts; } -/* - * SlruScanDirectory callback. - * This callback deletes segments that are outside the range determined by - * the given page numbers. - * - * Both range endpoints are exclusive (that is, segments containing any of - * those pages are kept.)
- */ -typedef struct MembersLiveRange -{ - int rangeStart; - int rangeEnd; -} MembersLiveRange; - -static bool -SlruScanDirCbRemoveMembers(SlruCtl ctl, char *filename, int segpage, - void *data) -{ - MembersLiveRange *range = (MembersLiveRange *) data; - MultiXactOffset nextOffset; - - if ((segpage == range->rangeStart) || - (segpage == range->rangeEnd)) - return false; /* easy case out */ - - /* - * To ensure that no segment is spuriously removed, we must keep track of - * new segments added since the start of the directory scan; to do this, - * we update our end-of-range point as we run. - * - * As an optimization, we can skip looking at shared memory if we know for - * certain that the current segment must be kept. This is so because - * nextOffset never decreases, and we never increase rangeStart during any - * one run. - */ - if (!((range->rangeStart > range->rangeEnd && - segpage > range->rangeEnd && segpage < range->rangeStart) || - (range->rangeStart < range->rangeEnd && - (segpage < range->rangeStart || segpage > range->rangeEnd)))) - return false; - - /* - * Update our idea of the end of the live range. - */ - LWLockAcquire(MultiXactGenLock, LW_SHARED); - nextOffset = MultiXactState->nextOffset; - LWLockRelease(MultiXactGenLock); - range->rangeEnd = MXOffsetToMemberPage(nextOffset); - - /* Recheck the deletion condition. 
If it still holds, perform deletion */ - if ((range->rangeStart > range->rangeEnd && - segpage > range->rangeEnd && segpage < range->rangeStart) || - (range->rangeStart < range->rangeEnd && - (segpage < range->rangeStart || segpage > range->rangeEnd))) - SlruDeleteSegment(ctl, filename); - - return false; /* keep going */ -} - typedef struct mxtruncinfo { int earliestExistingPage; @@ -2966,37 +2891,115 @@ SlruScanDirCbFindEarliest(SlruCtl ctl, char *filename, int segpage, void *data) return false; /* keep going */ } + +/* + * Delete members segments [oldest, newOldest) + * + * The members SLRU can, in contrast to the offsets one, be filled to almost + * the full range at once. This means SimpleLruTruncate() can't trivially be + * used - instead the to-be-deleted range is computed using the offsets + * SLRU. C.f. TruncateMultiXact(). + */ +static void +PerformMembersTruncation(MultiXactOffset oldestOffset, MultiXactOffset newOldestOffset) +{ + const int maxsegment = MXOffsetToMemberSegment(MaxMultiXactOffset); + int startsegment = MXOffsetToMemberSegment(oldestOffset); + int endsegment = MXOffsetToMemberSegment(newOldestOffset); + int segment = startsegment; + + /* + * Delete all the segments but the last one. The last segment can still + * contain, possibly partially, valid data. + */ + while (segment != endsegment) + { + elog(DEBUG2, "truncating multixact members segment %x", segment); + SlruDeleteSegment(MultiXactMemberCtl, segment); + + /* move to next segment, handling wraparound correctly */ + if (segment == maxsegment) + segment = 0; + else + segment += 1; + } +} + +/* + * Delete offsets segments [oldest, newOldest) + */ +static void +PerformOffsetsTruncation(MultiXactId oldestMulti, MultiXactId newOldestMulti) +{ + /* + * We step back one multixact to avoid passing a cutoff page that hasn't + * been created yet in the rare case that oldestMulti would be the first + * item on a page and oldestMulti == nextMulti. 
In that case, if we + * didn't subtract one, we'd trigger SimpleLruTruncate's wraparound + * detection. + */ + SimpleLruTruncate(MultiXactOffsetCtl, + MultiXactIdToOffsetPage(PreviousMultiXactId(newOldestMulti))); +} + /* * Remove all MultiXactOffset and MultiXactMember segments before the oldest * ones still of interest. * - * On a primary, this is called by the checkpointer process after a checkpoint - * has been flushed; during crash recovery, it's called from - * CreateRestartPoint(). In the latter case, we rely on the fact that - * xlog_redo() will already have called MultiXactAdvanceOldest(). Our - * latest_page_number will already have been initialized by StartupMultiXact() - * and kept up to date as new pages are zeroed. + * On a primary this is called as part of vacuum (via + * vac_truncate_clog()). During recovery truncation is normally done by + * replaying truncation WAL records instead of this routine; the exception is + * when replaying records from an older primary that doesn't yet generate + * truncation WAL records. In that case truncation is triggered by + * MultiXactAdvanceOldest(). + * + * newOldestMulti is the oldest currently required multixact, newOldestMultiDB + * is one of the databases preventing newOldestMulti from increasing. */ void -TruncateMultiXact(void) +TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB, bool in_recovery) { - MultiXactId oldestMXact; + MultiXactId oldestMulti; + MultiXactId nextMulti; + MultiXactOffset newOldestOffset; MultiXactOffset oldestOffset; - MultiXactId nextMXact; - MultiXactOffset nextOffset; + MultiXactOffset nextOffset; mxtruncinfo trunc; MultiXactId earliest; - MembersLiveRange range; - Assert(AmCheckpointerProcess() || AmStartupProcess() || - !IsPostmasterEnvironment); + /* + * Need to allow being called in recovery for backwards compatibility, + * when an updated standby replays WAL generated by a non-updated primary. 
+ */ + Assert(in_recovery || !RecoveryInProgress()); + Assert(!in_recovery || AmStartupProcess()); + Assert(in_recovery || MultiXactState->finishedStartup); + + /* + * We can only allow one truncation to happen at once. Otherwise parts of + * members might vanish while we're doing lookups or similar. There's no + * need to have an interlock with creating new multis or such, since those + * are constrained by the limits (which only grow, never shrink). + */ + LWLockAcquire(MultiXactTruncationLock, LW_EXCLUSIVE); LWLockAcquire(MultiXactGenLock, LW_SHARED); - oldestMXact = MultiXactState->lastCheckpointedOldest; - nextMXact = MultiXactState->nextMXact; + nextMulti = MultiXactState->nextMXact; nextOffset = MultiXactState->nextOffset; + oldestMulti = MultiXactState->oldestMultiXactId; LWLockRelease(MultiXactGenLock); - Assert(MultiXactIdIsValid(oldestMXact)); + Assert(MultiXactIdIsValid(oldestMulti)); + + /* + * Make sure to only attempt truncation if there are values to truncate + * away. In normal processing values shouldn't go backwards, but there are + * some corner cases (due to bugs) where that's possible. + */ + if (MultiXactIdPrecedesOrEquals(newOldestMulti, oldestMulti)) + { + LWLockRelease(MultiXactTruncationLock); + return; + } /* * Note we can't just plow ahead with the truncation; it's possible that @@ -3004,6 +3007,9 @@ TruncateMultiXact(void) * going to attempt to read the offsets page to determine where to * truncate the members SLRU. So we first scan the directory to determine * the earliest offsets page number that we can read without error. + * + * NB: It's also possible that the page that oldestMulti is on has already + * been truncated away, and we crashed before updating oldestMulti.
*/ trunc.earliestExistingPage = -1; SlruScanDirectory(MultiXactOffsetCtl, SlruScanDirCbFindEarliest, &trunc); @@ -3011,19 +3017,10 @@ TruncateMultiXact(void) if (earliest < FirstMultiXactId) earliest = FirstMultiXactId; - /* - * If there's nothing to remove, we can bail out early. - * - * Due to bugs in early releases of PostgreSQL 9.3.X and 9.4.X, - * oldestMXact might point to a multixact that does not exist. - * Autovacuum will eventually advance it to a value that does exist, - * and we want to set a proper offsetStopLimit when that happens, - * so call DetermineSafeOldestOffset here even if we're not actually - * truncating. - */ - if (MultiXactIdPrecedes(oldestMXact, earliest)) + /* If there's nothing to remove, we can bail out early. */ + if (MultiXactIdPrecedes(oldestMulti, earliest)) { - DetermineSafeOldestOffset(oldestMXact); + LWLockRelease(MultiXactTruncationLock); return; } @@ -3032,49 +3029,104 @@ TruncateMultiXact(void) * the starting offset of the oldest multixact. * * Hopefully, find_multixact_start will always work here, because we've - * already checked that it doesn't precede the earliest MultiXact on - * disk. But if it fails, don't truncate anything, and log a message. + * already checked that it doesn't precede the earliest MultiXact on disk. + * But if it fails, don't truncate anything, and log a message. 
*/ - if (oldestMXact == nextMXact) - oldestOffset = nextOffset; /* there are NO MultiXacts */ - else if (!find_multixact_start(oldestMXact, &oldestOffset)) + if (oldestMulti == nextMulti) + { + /* there are NO MultiXacts */ + oldestOffset = nextOffset; + } + else if (!find_multixact_start(oldestMulti, &oldestOffset)) { ereport(LOG, (errmsg("oldest MultiXact %u not found, earliest MultiXact %u, skipping truncation", - oldestMXact, earliest))); + oldestMulti, earliest))); + LWLockRelease(MultiXactTruncationLock); return; } /* - * To truncate MultiXactMembers, we need to figure out the active page - * range and delete all files outside that range. The start point is the - * start of the segment containing the oldest offset; an end point of the - * segment containing the next offset to use is enough. The end point is - * updated as MultiXactMember gets extended concurrently, elsewhere. + * Secondly compute up to where to truncate. Lookup the corresponding + * member offset for newOldestMulti for that. 
*/ - range.rangeStart = MXOffsetToMemberPage(oldestOffset); - range.rangeStart -= range.rangeStart % SLRU_PAGES_PER_SEGMENT; + if (newOldestMulti == nextMulti) + { + /* there are NO MultiXacts */ + newOldestOffset = nextOffset; + } + else if (!find_multixact_start(newOldestMulti, &newOldestOffset)) + { + ereport(LOG, + (errmsg("cannot truncate up to MultiXact %u because it does not exist on disk, skipping truncation", + newOldestMulti))); + LWLockRelease(MultiXactTruncationLock); + return; + } - range.rangeEnd = MXOffsetToMemberPage(nextOffset); - - SlruScanDirectory(MultiXactMemberCtl, SlruScanDirCbRemoveMembers, &range); + elog(DEBUG1, "performing multixact truncation: " + "offsets [%u, %u), offsets segments [%x, %x), " + "members [%u, %u), members segments [%x, %x)", + oldestMulti, newOldestMulti, + MultiXactIdToOffsetSegment(oldestMulti), + MultiXactIdToOffsetSegment(newOldestMulti), + oldestOffset, newOldestOffset, + MXOffsetToMemberSegment(oldestOffset), + MXOffsetToMemberSegment(newOldestOffset)); /* - * Now we can truncate MultiXactOffset. We step back one multixact to - * avoid passing a cutoff page that hasn't been created yet in the rare - * case that oldestMXact would be the first item on a page and oldestMXact - * == nextMXact. In that case, if we didn't subtract one, we'd trigger - * SimpleLruTruncate's wraparound detection. + * Do truncation, and the WAL logging of the truncation, in a critical + * section. That way offsets/members cannot get out of sync anymore, i.e. + * once consistent the newOldestMulti will always exist in members, even + * if we crashed in the wrong moment. */ - SimpleLruTruncate(MultiXactOffsetCtl, - MultiXactIdToOffsetPage(PreviousMultiXactId(oldestMXact))); + START_CRIT_SECTION(); /* - * Now, and only now, we can advance the stop point for multixact members. - * If we did it any sooner, the segments we deleted above might already - * have been overwritten with new members. That would be bad. 
+ * Prevent checkpoints from being scheduled concurrently. This is critical + * because otherwise a truncation record might not be replayed after a + * crash/basebackup, even though the state of the data directory would + * require it. It's not possible (startup process doesn't have a PGXACT + * entry), and not needed, to do this during recovery, when performing an + * old-style truncation, though. There the entire scheduling depends on + * the replayed WAL records which will be the same after a possible crash. */ - DetermineSafeOldestOffset(oldestMXact); + if (!in_recovery) + { + Assert(!MyPgXact->delayChkpt); + MyPgXact->delayChkpt = true; + } + + /* WAL log truncation */ + if (!in_recovery) + WriteMTruncateXlogRec(newOldestMultiDB, + oldestMulti, newOldestMulti, + oldestOffset, newOldestOffset); + + /* + * Update in-memory limits before performing the truncation, while inside + * the critical section: Have to do it before truncation, to prevent + * concurrent lookups of those values. Has to be inside the critical + * section as otherwise a future call to this function would error out, + * while looking up the oldest member in offsets, if our caller crashes + * before updating the limits. + */ + LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); + MultiXactState->oldestMultiXactId = newOldestMulti; + MultiXactState->oldestMultiXactDB = newOldestMultiDB; + LWLockRelease(MultiXactGenLock); + + /* First truncate members */ + PerformMembersTruncation(oldestOffset, newOldestOffset); + + /* Then offsets */ + PerformOffsetsTruncation(oldestMulti, newOldestMulti); + + if (!in_recovery) + MyPgXact->delayChkpt = false; + + END_CRIT_SECTION(); + LWLockRelease(MultiXactTruncationLock); } /* @@ -3170,6 +3222,34 @@ WriteMZeroPageXlogRec(int pageno, uint8 info) (void) XLogInsert(RM_MULTIXACT_ID, info); } +/* + * Write a TRUNCATE xlog record + * + * We must flush the xlog record to disk before returning --- see notes in + * TruncateCLOG(). 
+ */ +static void +WriteMTruncateXlogRec(Oid oldestMultiDB, + MultiXactId startTruncOff, MultiXactId endTruncOff, + MultiXactOffset startTruncMemb, MultiXactOffset endTruncMemb) +{ + XLogRecPtr recptr; + xl_multixact_truncate xlrec; + + xlrec.oldestMultiDB = oldestMultiDB; + + xlrec.startTruncOff = startTruncOff; + xlrec.endTruncOff = endTruncOff; + + xlrec.startTruncMemb = startTruncMemb; + xlrec.endTruncMemb = endTruncMemb; + + XLogBeginInsert(); + XLogRegisterData((char *) (&xlrec), SizeOfMultiXactTruncate); + recptr = XLogInsert(RM_MULTIXACT_ID, XLOG_MULTIXACT_TRUNCATE_ID); + XLogFlush(recptr); +} + /* * MULTIXACT resource manager's routines */ @@ -3252,6 +3332,49 @@ multixact_redo(XLogReaderState *record) LWLockRelease(XidGenLock); } } + else if (info == XLOG_MULTIXACT_TRUNCATE_ID) + { + xl_multixact_truncate xlrec; + int pageno; + + memcpy(&xlrec, XLogRecGetData(record), + SizeOfMultiXactTruncate); + + elog(DEBUG1, "replaying multixact truncation: " + "offsets [%u, %u), offsets segments [%x, %x), " + "members [%u, %u), members segments [%x, %x)", + xlrec.startTruncOff, xlrec.endTruncOff, + MultiXactIdToOffsetSegment(xlrec.startTruncOff), + MultiXactIdToOffsetSegment(xlrec.endTruncOff), + xlrec.startTruncMemb, xlrec.endTruncMemb, + MXOffsetToMemberSegment(xlrec.startTruncMemb), + MXOffsetToMemberSegment(xlrec.endTruncMemb)); + + /* should not be required, but more than cheap enough */ + LWLockAcquire(MultiXactTruncationLock, LW_EXCLUSIVE); + + /* + * Advance the horizon values, so they're current at the end of + * recovery. + */ + SetMultiXactIdLimit(xlrec.endTruncOff, xlrec.oldestMultiDB); + + PerformMembersTruncation(xlrec.startTruncMemb, xlrec.endTruncMemb); + + /* + * During XLOG replay, latest_page_number isn't necessarily set up + * yet; insert a suitable value to bypass the sanity test in + * SimpleLruTruncate. 
+ */ + pageno = MultiXactIdToOffsetPage(xlrec.endTruncOff); + MultiXactOffsetCtl->shared->latest_page_number = pageno; + PerformOffsetsTruncation(xlrec.startTruncOff, xlrec.endTruncOff); + + LWLockRelease(MultiXactTruncationLock); + + /* only looked at in the startup process, no lock necessary */ + MultiXactState->sawTruncationInCkptCycle = true; + } else elog(PANIC, "multixact_redo: unknown op code %u", info); } diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c index 5fcea113eaf..90c7cf56e63 100644 --- a/src/backend/access/transam/slru.c +++ b/src/backend/access/transam/slru.c @@ -134,6 +134,7 @@ static int SlruSelectLRUPage(SlruCtl ctl, int pageno); static bool SlruScanDirCbDeleteCutoff(SlruCtl ctl, char *filename, int segpage, void *data); +static void SlruInternalDeleteSegment(SlruCtl ctl, char *filename); /* * Initialization of shared memory @@ -1075,7 +1076,7 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno) * Flush dirty pages to disk during checkpoint or database shutdown */ void -SimpleLruFlush(SlruCtl ctl, bool checkpoint) +SimpleLruFlush(SlruCtl ctl, bool allow_redirtied) { SlruShared shared = ctl->shared; SlruFlushData fdata; @@ -1096,11 +1097,11 @@ SimpleLruFlush(SlruCtl ctl, bool checkpoint) SlruInternalWritePage(ctl, slotno, &fdata); /* - * When called during a checkpoint, we cannot assert that the slot is - * clean now, since another process might have re-dirtied it already. - * That's okay. + * In some places (e.g. checkpoints), we cannot assert that the slot + * is clean now, since another process might have re-dirtied it + * already. That's okay. 
*/ - Assert(checkpoint || + Assert(allow_redirtied || shared->page_status[slotno] == SLRU_PAGE_EMPTY || (shared->page_status[slotno] == SLRU_PAGE_VALID && !shared->page_dirty[slotno])); @@ -1210,8 +1211,14 @@ restart:; (void) SlruScanDirectory(ctl, SlruScanDirCbDeleteCutoff, &cutoffPage); } -void -SlruDeleteSegment(SlruCtl ctl, char *filename) +/* + * Delete an individual SLRU segment, identified by the filename. + * + * NB: This does not touch the SLRU buffers themselves, callers have to ensure + * they either can't yet contain anything, or have already been cleaned out. + */ +static void +SlruInternalDeleteSegment(SlruCtl ctl, char *filename) { char path[MAXPGPATH]; @@ -1221,6 +1228,64 @@ SlruDeleteSegment(SlruCtl ctl, char *filename) unlink(path); } +/* + * Delete an individual SLRU segment, identified by the segment number. + */ +void +SlruDeleteSegment(SlruCtl ctl, int segno) +{ + SlruShared shared = ctl->shared; + int slotno; + char path[MAXPGPATH]; + bool did_write; + + /* Clean out any possibly existing references to the segment. */ + LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE); +restart: + did_write = false; + for (slotno = 0; slotno < shared->num_slots; slotno++) + { + int pagesegno = shared->page_number[slotno] / SLRU_PAGES_PER_SEGMENT; + + if (shared->page_status[slotno] == SLRU_PAGE_EMPTY) + continue; + + /* not the segment we're looking for */ + if (pagesegno != segno) + continue; + + /* If page is clean, just change state to EMPTY (expected case). */ + if (shared->page_status[slotno] == SLRU_PAGE_VALID && + !shared->page_dirty[slotno]) + { + shared->page_status[slotno] = SLRU_PAGE_EMPTY; + continue; + } + + /* Same logic as SimpleLruTruncate() */ + if (shared->page_status[slotno] == SLRU_PAGE_VALID) + SlruInternalWritePage(ctl, slotno, NULL); + else + SimpleLruWaitIO(ctl, slotno); + + did_write = true; + } + + /* + * Be extra careful and re-check. The IO functions release the control + * lock, so new pages could have been read in. 
+ */ + if (did_write) + goto restart; + + snprintf(path, MAXPGPATH, "%s/%04X", ctl->Dir, segno); + ereport(DEBUG2, + (errmsg("removing file \"%s\"", path))); + unlink(path); + + LWLockRelease(shared->ControlLock); +} + /* * SlruScanDirectory callback * This callback reports true if there's any segment prior to the one @@ -1249,7 +1314,7 @@ SlruScanDirCbDeleteCutoff(SlruCtl ctl, char *filename, int segpage, void *data) int cutoffPage = *(int *) data; if (ctl->PagePrecedes(segpage, cutoffPage)) - SlruDeleteSegment(ctl, filename); + SlruInternalDeleteSegment(ctl, filename); return false; /* keep going */ } @@ -1261,7 +1326,7 @@ SlruScanDirCbDeleteCutoff(SlruCtl ctl, char *filename, int segpage, void *data) bool SlruScanDirCbDeleteAll(SlruCtl ctl, char *filename, int segpage, void *data) { - SlruDeleteSegment(ctl, filename); + SlruInternalDeleteSegment(ctl, filename); return false; /* keep going */ } diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index a87f09ee47f..1ac1c0550dd 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -6330,7 +6330,6 @@ StartupXLOG(void) SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB); SetCommitTsLimit(checkPoint.oldestCommitTs, checkPoint.newestCommitTs); - MultiXactSetSafeTruncate(checkPoint.oldestMulti); XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch; XLogCtl->ckptXid = checkPoint.nextXid; @@ -6347,10 +6346,8 @@ StartupXLOG(void) StartupReorderBuffer(); /* - * Startup MultiXact. We need to do this early for two reasons: one is - * that we might try to access multixacts when we do tuple freezing, and - * the other is we need its state initialized because we attempt - * truncation during restartpoints. + * Startup MultiXact. We need to do this early to be able to replay + * truncations. 
*/ StartupMultiXact(); @@ -8507,12 +8504,6 @@ CreateCheckPoint(int flags) */ END_CRIT_SECTION(); - /* - * Now that the checkpoint is safely on disk, we can update the point to - * which multixact can be truncated. - */ - MultiXactSetSafeTruncate(checkPoint.oldestMulti); - /* * Let smgr do post-checkpoint cleanup (eg, deleting old files). */ @@ -8552,11 +8543,6 @@ CreateCheckPoint(int flags) if (!RecoveryInProgress()) TruncateSUBTRANS(GetOldestXmin(NULL, false)); - /* - * Truncate pg_multixact too. - */ - TruncateMultiXact(); - /* Real work is done, but log and update stats before releasing lock. */ LogCheckpointEnd(false); @@ -8886,21 +8872,6 @@ CreateRestartPoint(int flags) ThisTimeLineID = 0; } - /* - * Due to a historical accident multixact truncations are not WAL-logged, - * but just performed everytime the mxact horizon is increased. So, unless - * we explicitly execute truncations on a standby it will never clean out - * /pg_multixact which obviously is bad, both because it uses space and - * because we can wrap around into pre-existing data... - * - * We can only do the truncation here, after the UpdateControlFile() - * above, because we've now safely established a restart point. That - * guarantees we will not need to access those multis. - * - * It's probably worth improving this. - */ - TruncateMultiXact(); - /* * Truncate pg_subtrans if possible. We can throw away all data before * the oldest XMIN of any running transaction. No future transaction will @@ -9261,9 +9232,14 @@ xlog_redo(XLogReaderState *record) LWLockRelease(OidGenLock); MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); + + /* + * NB: This may perform multixact truncation when replaying WAL + * generated by an older primary. 
+ */ + MultiXactAdvanceOldest(checkPoint.oldestMulti, + checkPoint.oldestMultiDB); SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); - SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB); - MultiXactSetSafeTruncate(checkPoint.oldestMulti); /* * If we see a shutdown checkpoint while waiting for an end-of-backup @@ -9353,14 +9329,17 @@ xlog_redo(XLogReaderState *record) LWLockRelease(OidGenLock); MultiXactAdvanceNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); + + /* + * NB: This may perform multixact truncation when replaying WAL + * generated by an older primary. + */ + MultiXactAdvanceOldest(checkPoint.oldestMulti, + checkPoint.oldestMultiDB); if (TransactionIdPrecedes(ShmemVariableCache->oldestXid, checkPoint.oldestXid)) SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); - MultiXactAdvanceOldest(checkPoint.oldestMulti, - checkPoint.oldestMultiDB); - MultiXactSetSafeTruncate(checkPoint.oldestMulti); - /* ControlFile->checkPointCopy always tracks the latest ckpt XID */ ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch; ControlFile->checkPointCopy.nextXid = checkPoint.nextXid; diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 85b04832479..698bb3531cc 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -1137,11 +1137,11 @@ vac_truncate_clog(TransactionId frozenXID, return; /* - * Truncate CLOG and CommitTs to the oldest computed value. Note we don't - * truncate multixacts; that will be done by the next checkpoint. + * Truncate CLOG, multixact and CommitTs to the oldest computed value. 
*/ TruncateCLOG(frozenXID); TruncateCommitTs(frozenXID, true); + TruncateMultiXact(minMulti, minmulti_datoid, false); /* * Update the wrap limit for GetNewTransactionId and creation of new diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt index 96bbfe8fab6..c557cb68d0b 100644 --- a/src/backend/storage/lmgr/lwlocknames.txt +++ b/src/backend/storage/lmgr/lwlocknames.txt @@ -45,3 +45,4 @@ ReplicationSlotControlLock 37 CommitTsControlLock 38 CommitTsLock 39 ReplicationOriginLock 40 +MultiXactTruncationLock 41 diff --git a/src/include/access/multixact.h b/src/include/access/multixact.h index f1448fe0635..9f9df9cda04 100644 --- a/src/include/access/multixact.h +++ b/src/include/access/multixact.h @@ -71,6 +71,7 @@ typedef struct MultiXactMember #define XLOG_MULTIXACT_ZERO_OFF_PAGE 0x00 #define XLOG_MULTIXACT_ZERO_MEM_PAGE 0x10 #define XLOG_MULTIXACT_CREATE_ID 0x20 +#define XLOG_MULTIXACT_TRUNCATE_ID 0x30 typedef struct xl_multixact_create { @@ -82,6 +83,21 @@ typedef struct xl_multixact_create #define SizeOfMultiXactCreate (offsetof(xl_multixact_create, members)) +typedef struct xl_multixact_truncate +{ + Oid oldestMultiDB; + + /* to-be-truncated range of multixact offsets */ + MultiXactId startTruncOff; /* just for completeness' sake */ + MultiXactId endTruncOff; + + /* to-be-truncated range of multixact members */ + MultiXactOffset startTruncMemb; + MultiXactOffset endTruncMemb; +} xl_multixact_truncate; + +#define SizeOfMultiXactTruncate (sizeof(xl_multixact_truncate)) + extern MultiXactId MultiXactIdCreate(TransactionId xid1, MultiXactStatus status1, TransactionId xid2, @@ -119,13 +135,12 @@ extern void MultiXactGetCheckptMulti(bool is_shutdown, Oid *oldestMultiDB); extern void CheckPointMultiXact(void); extern MultiXactId GetOldestMultiXactId(void); -extern void TruncateMultiXact(void); +extern void TruncateMultiXact(MultiXactId oldestMulti, Oid oldestMultiDB, bool in_recovery); extern void 
MultiXactSetNextMXact(MultiXactId nextMulti, MultiXactOffset nextMultiOffset); extern void MultiXactAdvanceNextMXact(MultiXactId minMulti, MultiXactOffset minMultiOffset); extern void MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB); -extern void MultiXactSetSafeTruncate(MultiXactId safeTruncateMulti); extern int MultiXactMemberFreezeThreshold(void); extern void multixact_twophase_recover(TransactionId xid, uint16 info, diff --git a/src/include/access/slru.h b/src/include/access/slru.h index 9c7f01933f7..f60e75b569a 100644 --- a/src/include/access/slru.h +++ b/src/include/access/slru.h @@ -143,14 +143,14 @@ extern int SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok, extern int SimpleLruReadPage_ReadOnly(SlruCtl ctl, int pageno, TransactionId xid); extern void SimpleLruWritePage(SlruCtl ctl, int slotno); -extern void SimpleLruFlush(SlruCtl ctl, bool checkpoint); +extern void SimpleLruFlush(SlruCtl ctl, bool allow_redirtied); extern void SimpleLruTruncate(SlruCtl ctl, int cutoffPage); extern bool SimpleLruDoesPhysicalPageExist(SlruCtl ctl, int pageno); typedef bool (*SlruScanCallback) (SlruCtl ctl, char *filename, int segpage, void *data); extern bool SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data); -extern void SlruDeleteSegment(SlruCtl ctl, char *filename); +extern void SlruDeleteSegment(SlruCtl ctl, int segno); /* SlruScanDirectory public callbacks */ extern bool SlruScanDirCbReportPresence(SlruCtl ctl, char *filename, diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index a037f818acc..0e149ea2f4d 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -2750,6 +2750,7 @@ xl_invalid_page xl_invalid_page_key xl_multi_insert_tuple xl_multixact_create +xl_multixact_truncate xl_parameter_change xl_relmap_update xl_replorigin_drop