diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index d56269d941c..920ec582b6d 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -337,6 +337,9 @@ static MemoryContext MXactContext = NULL; #define debug_elog6(a,b,c,d,e,f) #endif +/* hack to deal with WAL generated with older minor versions */ +static int pre_initialized_offsets_page = -1; + /* internal MultiXactId management */ static void MultiXactIdSetOldestVisible(void); static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, @@ -868,13 +871,61 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, int entryno; int slotno; MultiXactOffset *offptr; - int i; + MultiXactId next; + int next_pageno; + int next_entryno; + MultiXactOffset *next_offptr; LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE); + /* position of this multixid in the offsets SLRU area */ pageno = MultiXactIdToOffsetPage(multi); entryno = MultiXactIdToOffsetEntry(multi); + /* position of the next multixid */ + next = multi + 1; + if (next < FirstMultiXactId) + next = FirstMultiXactId; + next_pageno = MultiXactIdToOffsetPage(next); + next_entryno = MultiXactIdToOffsetEntry(next); + + /* + * Older minor versions didn't set the next multixid's offset in this + * function, and therefore didn't initialize the next page until the next + * multixid was assigned. If we're replaying WAL that was generated by + * such a version, the next page might not be initialized yet. Initialize + * it now. + */ + if (InRecovery && + next_pageno != pageno && + MultiXactOffsetCtl->shared->latest_page_number == pageno) + { + elog(DEBUG1, "next offsets page is not initialized, initializing it now"); + + /* Create and zero the page */ + slotno = SimpleLruZeroPage(MultiXactOffsetCtl, next_pageno); + + /* Make sure it's written out */ + SimpleLruWritePage(MultiXactOffsetCtl, slotno); + Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]); + + /* + * Remember that we initialized the page, so that we don't zero it + * again at the XLOG_MULTIXACT_ZERO_OFF_PAGE record. + */ + pre_initialized_offsets_page = next_pageno; + } + + /* + * Set the starting offset of this multixid's members. + * + * In the common case, it was already be set by the previous + * RecordNewMultiXact call, as this was the next multixid of the previous + * multixid. But if multiple backends are generating multixids + * concurrently, we might race ahead and get called before the previous + * multixid. + */ + /* * Note: we pass the MultiXactId to SimpleLruReadPage as the "transaction" * to complain about if there's any I/O error. This is kinda bogus, but @@ -886,9 +937,37 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; offptr += entryno; - *offptr = offset; + if (*offptr != offset) + { + /* should already be set to the correct value, or not at all */ + Assert(*offptr == 0); + *offptr = offset; + MultiXactOffsetCtl->shared->page_dirty[slotno] = true; + } - MultiXactOffsetCtl->shared->page_dirty[slotno] = true; + /* + * Set the next multixid's offset to the end of this multixid's members. + */ + if (next_pageno == pageno) + { + next_offptr = offptr + 1; + } + else + { + /* must be the first entry on the page */ + Assert(next_entryno == 0 || next == FirstMultiXactId); + slotno = SimpleLruReadPage(MultiXactOffsetCtl, next_pageno, true, next); + next_offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; + next_offptr += next_entryno; + } + + if (*next_offptr != offset + nmembers) + { + /* should already be set to the correct value, or not at all */ + Assert(*next_offptr == 0); + *next_offptr = offset + nmembers; + MultiXactOffsetCtl->shared->page_dirty[slotno] = true; + } /* Exchange our lock */ LWLockRelease(MultiXactOffsetSLRULock); @@ -897,7 +976,7 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset, prev_pageno = -1; - for (i = 0; i < nmembers; i++, offset++) + for (int i = 0; i < nmembers; i++, offset++) { TransactionId *memberptr; uint32 *flagsptr; @@ -1072,8 +1151,11 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset) result = FirstMultiXactId; } - /* Make sure there is room for the MXID in the file. */ - ExtendMultiXactOffset(result); + /* + * Make sure there is room for the next MXID in the file. Assigning this + * MXID sets the next MXID's offset already. + */ + ExtendMultiXactOffset(result + 1); /* * Reserve the members space, similarly to above. Also, be careful not to @@ -1314,21 +1396,14 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, * one's. However, there are some corner cases to worry about: * * 1. This multixact may be the latest one created, in which case there is - * no next one to look at. In this case the nextOffset value we just - * saved is the correct endpoint. + * no next one to look at. The next multixact's offset should be set + * already, as we set it in RecordNewMultiXact(), but we used to not do + * that in older minor versions. To cope with that case, if this + * multixact is the latest one created, use the nextOffset value we read + * above as the endpoint. * - * 2. The next multixact may still be in process of being filled in: that - * is, another process may have done GetNewMultiXactId but not yet written - * the offset entry for that ID. In that scenario, it is guaranteed that - * the offset entry for that multixact exists (because GetNewMultiXactId - * won't release MultiXactGenLock until it does) but contains zero - * (because we are careful to pre-zero offset pages). Because - * GetNewMultiXactId will never return zero as the starting offset for a - * multixact, when we read zero as the next multixact's offset, we know we - * have this case. We sleep for a bit and try again. - * - * 3. Because GetNewMultiXactId increments offset zero to offset one to - * handle case #2, there is an ambiguity near the point of offset + * 2. Because GetNewMultiXactId skips over offset zero, to reserve zero + * for to mean "unset", there is an ambiguity near the point of offset * wraparound. If we see next multixact's offset is one, is that our * multixact's actual endpoint, or did it end at zero with a subsequent * increment? We handle this using the knowledge that if the zero'th @@ -1340,7 +1415,6 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, * cases, so it seems better than holding the MultiXactGenLock for a long * time on every multixact creation. */ -retry: LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE); pageno = MultiXactIdToOffsetPage(multi); @@ -1385,13 +1459,10 @@ retry: nextMXOffset = *offptr; if (nextMXOffset == 0) - { - /* Corner case 2: next multixact is still being filled in */ - LWLockRelease(MultiXactOffsetSLRULock); - CHECK_FOR_INTERRUPTS(); - pg_usleep(1000L); - goto retry; - } + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("MultiXact %u has invalid next offset", + multi))); length = nextMXOffset - offset; } @@ -1427,7 +1498,7 @@ retry: if (!TransactionIdIsValid(*xactptr)) { - /* Corner case 3: we must be looking at unused slot zero */ + /* Corner case 2: we must be looking at unused slot zero */ Assert(offset == 0); continue; } @@ -2056,24 +2127,32 @@ TrimMultiXact(void) MultiXactOffsetCtl->shared->latest_page_number = pageno; /* - * Zero out the remainder of the current offsets page. See notes in - * TrimCLOG() for background. Unlike CLOG, some WAL record covers every - * pg_multixact SLRU mutation. Since, also unlike CLOG, we ignore the WAL - * rule "write xlog before data," nextMXact successors may carry obsolete, - * nonzero offset values. Zero those so case 2 of GetMultiXactIdMembers() - * operates normally. + * Set the offset of nextMXact on the offsets page. This is normally done + * in RecordNewMultiXact() of the previous multixact, but we used to not + * do that in older minor versions. To ensure that the next offset is set + * if the binary was just upgraded from an older minor version, do it now. + * + * Zero out the remainder of the page. See notes in TrimCLOG() for + * background. Unlike CLOG, some WAL record covers every pg_multixact + * SLRU mutation. Since, also unlike CLOG, we ignore the WAL rule "write + * xlog before data," nextMXact successors may carry obsolete, nonzero + * offset values. */ entryno = MultiXactIdToOffsetEntry(nextMXact); - if (entryno != 0) { int slotno; MultiXactOffset *offptr; - slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact); + if (entryno == 0) + slotno = SimpleLruZeroPage(MultiXactOffsetCtl, pageno); + else + slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, true, nextMXact); offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno]; offptr += entryno; - MemSet(offptr, 0, BLCKSZ - (entryno * sizeof(MultiXactOffset))); + *offptr = offset; + if (entryno != 0 && (entryno + 1) * sizeof(MultiXactOffset) != BLCKSZ) + MemSet(offptr + 1, 0, BLCKSZ - (entryno + 1) * sizeof(MultiXactOffset)); MultiXactOffsetCtl->shared->page_dirty[slotno] = true; } @@ -3255,13 +3334,21 @@ multixact_redo(XLogReaderState *record) memcpy(&pageno, XLogRecGetData(record), sizeof(int)); - LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE); - - slotno = ZeroMultiXactOffsetPage(pageno, false); - SimpleLruWritePage(MultiXactOffsetCtl, slotno); - Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]); - - LWLockRelease(MultiXactOffsetSLRULock); + /* + * Skip the record if we already initialized the page at the previous + * XLOG_MULTIXACT_CREATE_ID record. See RecordNewMultiXact(). + */ + if (pre_initialized_offsets_page != pageno) + { + LWLockAcquire(MultiXactOffsetSLRULock, LW_EXCLUSIVE); + slotno = ZeroMultiXactOffsetPage(pageno, false); + SimpleLruWritePage(MultiXactOffsetCtl, slotno); + Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]); + LWLockRelease(MultiXactOffsetSLRULock); + } + else + elog(DEBUG1, "skipping initialization of offsets page %d because it was already initialized on multixid creation", pageno); + pre_initialized_offsets_page = -1; } else if (info == XLOG_MULTIXACT_ZERO_MEM_PAGE) { @@ -3285,6 +3372,22 @@ multixact_redo(XLogReaderState *record) TransactionId max_xid; int i; + if (pre_initialized_offsets_page != -1) + { + /* + * If we implicitly initialized the next offsets page while + * replaying an XLOG_MULTIXACT_CREATE_ID record that was generated + * with an older minor version, we still expect to see an + * XLOG_MULTIXACT_ZERO_OFF_PAGE record for it before any other + * XLOG_MULTIXACT_CREATE_ID records. Therefore this case should + * not happen. If it does, we'll continue with the replay, but + * log a message to note that something's funny. + */ + elog(LOG, "expected to see an XLOG_MULTIXACT_ZERO_OFF_PAGE record for page %d that was implicitly initialized earlier", + pre_initialized_offsets_page); + pre_initialized_offsets_page = -1; + } + /* Store the data back into the SLRU files */ RecordNewMultiXact(xlrec->mid, xlrec->moff, xlrec->nmembers, xlrec->members);