diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index b7b47ef076a..eb807df5ba8 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -274,12 +274,6 @@ typedef struct MultiXactStateData /* support for members anti-wraparound measures */ MultiXactOffset offsetStopLimit; /* known if oldestOffsetKnown */ - /* - * This is used to sleep until a multixact offset is written when we want - * to create the next one. - */ - ConditionVariable nextoff_cv; - /* * Per-backend data starts here. We have two arrays stored in the area * immediately following the MultiXactStateData struct. Each is indexed by @@ -384,6 +378,9 @@ static MemoryContext MXactContext = NULL; #define debug_elog6(a,b,c,d,e,f) #endif +/* hack to deal with WAL generated with older minor versions */ +static int64 pre_initialized_offsets_page = -1; + /* internal MultiXactId management */ static void MultiXactIdSetOldestVisible(void); static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, @@ -915,13 +912,65 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, int entryno; int slotno; MultiXactOffset *offptr; - int i; + MultiXactId next; + int64 next_pageno; + int next_entryno; + MultiXactOffset *next_offptr; LWLock *lock; LWLock *prevlock = NULL; + /* position of this multixid in the offsets SLRU area */ pageno = MultiXactIdToOffsetPage(multi); entryno = MultiXactIdToOffsetEntry(multi); + /* position of the next multixid */ + next = multi + 1; + if (next < FirstMultiXactId) + next = FirstMultiXactId; + next_pageno = MultiXactIdToOffsetPage(next); + next_entryno = MultiXactIdToOffsetEntry(next); + + /* + * Older minor versions didn't set the next multixid's offset in this + * function, and therefore didn't initialize the next page until the next + * multixid was assigned. If we're replaying WAL that was generated by + * such a version, the next page might not be initialized yet. Initialize + * it now. + */ + if (InRecovery && + next_pageno != pageno && + pg_atomic_read_u64(&MultiXactOffsetCtl->shared->latest_page_number) == pageno) + { + elog(DEBUG1, "next offsets page is not initialized, initializing it now"); + + lock = SimpleLruGetBankLock(MultiXactOffsetCtl, next_pageno); + LWLockAcquire(lock, LW_EXCLUSIVE); + + /* Create and zero the page */ + slotno = SimpleLruZeroPage(MultiXactOffsetCtl, next_pageno); + + /* Make sure it's written out */ + SimpleLruWritePage(MultiXactOffsetCtl, slotno); + Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]); + + LWLockRelease(lock); + + /* + * Remember that we initialized the page, so that we don't zero it + * again at the XLOG_MULTIXACT_ZERO_OFF_PAGE record. + */ + pre_initialized_offsets_page = next_pageno; + } + + /* + * Set the starting offset of this multixid's members. + * + * In the common case, it was already be set by the previous + * RecordNewMultiXact call, as this was the next multixid of the previous + * multixid. But if multiple backends are generating multixids + * concurrently, we might race ahead and get called before the previous + * multixid. + */ lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno); LWLockAcquire(lock, LW_EXCLUSIVE); @@ -936,22 +985,50 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; offptr += entryno; - *offptr = offset; + if (*offptr != offset) + { + /* should already be set to the correct value, or not at all */ + Assert(*offptr == 0); + *offptr = offset; + MultiXactOffsetCtl->shared->page_dirty[slotno] = true; + } - MultiXactOffsetCtl->shared->page_dirty[slotno] = true; + /* + * Set the next multixid's offset to the end of this multixid's members. + */ + if (next_pageno == pageno) + { + next_offptr = offptr + 1; + } + else + { + /* must be the first entry on the page */ + Assert(next_entryno == 0 || next == FirstMultiXactId); + + /* Swap the lock for a lock on the next page */ + LWLockRelease(lock); + lock = SimpleLruGetBankLock(MultiXactOffsetCtl, next_pageno); + LWLockAcquire(lock, LW_EXCLUSIVE); + + slotno = SimpleLruReadPage(MultiXactOffsetCtl, next_pageno, true, next); + next_offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; + next_offptr += next_entryno; + } + + if (*next_offptr != offset + nmembers) + { + /* should already be set to the correct value, or not at all */ + Assert(*next_offptr == 0); + *next_offptr = offset + nmembers; + MultiXactOffsetCtl->shared->page_dirty[slotno] = true; + } /* Release MultiXactOffset SLRU lock. */ LWLockRelease(lock); - /* - * If anybody was waiting to know the offset of this multixact ID we just - * wrote, they can read it now, so wake them up. - */ - ConditionVariableBroadcast(&MultiXactState->nextoff_cv); - prev_pageno = -1; - for (i = 0; i < nmembers; i++, offset++) + for (int i = 0; i < nmembers; i++, offset++) { TransactionId *memberptr; uint32 *flagsptr; @@ -1141,8 +1218,11 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset) result = FirstMultiXactId; } - /* Make sure there is room for the MXID in the file. */ - ExtendMultiXactOffset(result); + /* + * Make sure there is room for the next MXID in the file. Assigning this + * MXID sets the next MXID's offset already. + */ + ExtendMultiXactOffset(result + 1); /* * Reserve the members space, similarly to above. Also, be careful not to @@ -1307,7 +1387,6 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, MultiXactOffset nextOffset; MultiXactMember *ptr; LWLock *lock; - bool slept = false; debug_elog3(DEBUG2, "GetMembers: asked for %u", multi); @@ -1384,23 +1463,14 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, * one's. However, there are some corner cases to worry about: * * 1. This multixact may be the latest one created, in which case there is - * no next one to look at. In this case the nextOffset value we just - * saved is the correct endpoint. + * no next one to look at. The next multixact's offset should be set + * already, as we set it in RecordNewMultiXact(), but we used to not do + * that in older minor versions. To cope with that case, if this + * multixact is the latest one created, use the nextOffset value we read + * above as the endpoint. * - * 2. The next multixact may still be in process of being filled in: that - * is, another process may have done GetNewMultiXactId but not yet written - * the offset entry for that ID. In that scenario, it is guaranteed that - * the offset entry for that multixact exists (because GetNewMultiXactId - * won't release MultiXactGenLock until it does) but contains zero - * (because we are careful to pre-zero offset pages). Because - * GetNewMultiXactId will never return zero as the starting offset for a - * multixact, when we read zero as the next multixact's offset, we know we - * have this case. We handle this by sleeping on the condition variable - * we have just for this; the process in charge will signal the CV as soon - * as it has finished writing the multixact offset. - * - * 3. Because GetNewMultiXactId increments offset zero to offset one to - * handle case #2, there is an ambiguity near the point of offset + * 2. Because GetNewMultiXactId skips over offset zero, to reserve zero + * for to mean "unset", there is an ambiguity near the point of offset * wraparound. If we see next multixact's offset is one, is that our * multixact's actual endpoint, or did it end at zero with a subsequent * increment? We handle this using the knowledge that if the zero'th @@ -1412,7 +1482,6 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, * cases, so it seems better than holding the MultiXactGenLock for a long * time on every multixact creation. */ -retry: pageno = MultiXactIdToOffsetPage(multi); entryno = MultiXactIdToOffsetEntry(multi); @@ -1475,16 +1544,10 @@ retry: nextMXOffset = *offptr; if (nextMXOffset == 0) - { - /* Corner case 2: next multixact is still being filled in */ - LWLockRelease(lock); - CHECK_FOR_INTERRUPTS(); - - ConditionVariableSleep(&MultiXactState->nextoff_cv, - WAIT_EVENT_MULTIXACT_CREATION); - slept = true; - goto retry; - } + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("MultiXact %u has invalid next offset", + multi))); length = nextMXOffset - offset; } @@ -1492,12 +1555,6 @@ retry: LWLockRelease(lock); lock = NULL; - /* - * If we slept above, clean up state; it's no longer needed. - */ - if (slept) - ConditionVariableCancelSleep(); - ptr = (MultiXactMember *) palloc(length * sizeof(MultiXactMember)); truelength = 0; @@ -1540,7 +1597,7 @@ retry: if (!TransactionIdIsValid(*xactptr)) { - /* Corner case 3: we must be looking at unused slot zero */ + /* Corner case 2: we must be looking at unused slot zero */ Assert(offset == 0); continue; } @@ -1987,7 +2044,6 @@ MultiXactShmemInit(void) /* Make sure we zero out the per-backend state */ MemSet(MultiXactState, 0, SHARED_MULTIXACT_STATE_SIZE); - ConditionVariableInit(&MultiXactState->nextoff_cv); } else Assert(found); @@ -2194,26 +2250,34 @@ TrimMultiXact(void) pageno); /* - * Zero out the remainder of the current offsets page. See notes in - * TrimCLOG() for background. Unlike CLOG, some WAL record covers every - * pg_multixact SLRU mutation. Since, also unlike CLOG, we ignore the WAL - * rule "write xlog before data," nextMXact successors may carry obsolete, - * nonzero offset values. Zero those so case 2 of GetMultiXactIdMembers() - * operates normally. + * Set the offset of nextMXact on the offsets page. This is normally done + * in RecordNewMultiXact() of the previous multixact, but we used to not + * do that in older minor versions. To ensure that the next offset is set + * if the binary was just upgraded from an older minor version, do it now. + * + * Zero out the remainder of the page. See notes in TrimCLOG() for + * background. Unlike CLOG, some WAL record covers every pg_multixact + * SLRU mutation. Since, also unlike CLOG, we ignore the WAL rule "write + * xlog before data," nextMXact successors may carry obsolete, nonzero + * offset values. */ entryno = MultiXactIdToOffsetEntry(nextMXact); - if (entryno != 0) { int slotno; MultiXactOffset *offptr; LWLock *lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno); LWLockAcquire(lock, LW_EXCLUSIVE); - slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact); + if (entryno == 0) + slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno); + else + slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact); offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; offptr += entryno; - MemSet(offptr, 0, BLCKSZ - (entryno * sizeof(MultiXactOffset))); + *offptr = offset; + if (entryno != 0 && (entryno + 1) * sizeof(MultiXactOffset) != BLCKSZ) + MemSet(offptr + 1, 0, BLCKSZ - (entryno + 1) * sizeof(MultiXactOffset)); MultiXactOffsetCtl->shared->page_dirty[slotno] = true; LWLockRelease(lock); @@ -3398,14 +3462,24 @@ multixact_redo(XLogReaderState *record) memcpy(&pageno, XLogRecGetData(record), sizeof(pageno)); - lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno); - LWLockAcquire(lock, LW_EXCLUSIVE); + /* + * Skip the record if we already initialized the page at the previous + * XLOG_MULTIXACT_CREATE_ID record. See RecordNewMultiXact(). + */ + if (pre_initialized_offsets_page != pageno) + { + lock = SimpleLruGetBankLock(MultiXactOffsetCtl, pageno); + LWLockAcquire(lock, LW_EXCLUSIVE); - slotno = ZeroMultiXactOffsetPage(pageno, false); - SimpleLruWritePage(MultiXactOffsetCtl, slotno); - Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]); + slotno = ZeroMultiXactOffsetPage(pageno, false); + SimpleLruWritePage(MultiXactOffsetCtl, slotno); + Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]); - LWLockRelease(lock); + LWLockRelease(lock); + } + else + elog(DEBUG1, "skipping initialization of offsets page " INT64_FORMAT " because it was already initialized on multixid creation", pageno); + pre_initialized_offsets_page = -1; } else if (info == XLOG_MULTIXACT_ZERO_MEM_PAGE) { @@ -3431,6 +3505,22 @@ multixact_redo(XLogReaderState *record) TransactionId max_xid; int i; + if (pre_initialized_offsets_page != -1) + { + /* + * If we implicitly initialized the next offsets page while + * replaying an XLOG_MULTIXACT_CREATE_ID record that was generated + * with an older minor version, we still expect to see an + * XLOG_MULTIXACT_ZERO_OFF_PAGE record for it before any other + * XLOG_MULTIXACT_CREATE_ID records. Therefore this case should + * not happen. If it does, we'll continue with the replay, but + * log a message to note that something's funny. + */ + elog(LOG, "expected to see an XLOG_MULTIXACT_ZERO_OFF_PAGE record for page " INT64_FORMAT " that was implicitly initialized earlier", + pre_initialized_offsets_page); + pre_initialized_offsets_page = -1; + } + /* Store the data back into the SLRU files */ RecordNewMultiXact(xlrec->mid, xlrec->moff, xlrec->nmembers, xlrec->members);