mirror of
https://github.com/postgres/postgres.git
synced 2025-11-24 00:23:06 +03:00
Fix data loss at inplace update after heap_update().
As previously-added tests demonstrated, heap_inplace_update() could instead update an unrelated tuple of the same catalog. It could lose the update. Losing relhasindex=t was a source of index corruption. Inplace-updating commands like VACUUM will now wait for heap_update() commands like GRANT TABLE and GRANT DATABASE. That isn't ideal, but a long-running GRANT already hurts VACUUM progress more just by keeping an XID running. The VACUUM will behave like a DELETE or UPDATE waiting for the uncommitted change. For implementation details, start at the systable_inplace_update_begin() header comment and README.tuplock. Back-patch to v12 (all supported versions). In back branches, retain a deprecated heap_inplace_update(), for extensions. Reported by Smolkin Grigory. Reviewed by Nitin Motiani, (in earlier versions) Heikki Linnakangas, and (in earlier versions) Alexander Lakhin. Discussion: https://postgr.es/m/CAMp+ueZQz3yDk7qg42hk6-9gxniYbp-=bG2mgqecErqR5gGGOA@mail.gmail.com
This commit is contained in:
@@ -20,6 +20,7 @@
|
||||
#include "postgres.h"
|
||||
|
||||
#include "access/genam.h"
|
||||
#include "access/heapam.h"
|
||||
#include "access/relscan.h"
|
||||
#include "access/tableam.h"
|
||||
#include "access/transam.h"
|
||||
@@ -29,6 +30,7 @@
|
||||
#include "storage/bufmgr.h"
|
||||
#include "storage/procarray.h"
|
||||
#include "utils/acl.h"
|
||||
#include "utils/injection_point.h"
|
||||
#include "utils/lsyscache.h"
|
||||
#include "utils/rel.h"
|
||||
#include "utils/rls.h"
|
||||
@@ -757,3 +759,138 @@ systable_endscan_ordered(SysScanDesc sysscan)
|
||||
UnregisterSnapshot(sysscan->snapshot);
|
||||
pfree(sysscan);
|
||||
}
|
||||
|
||||
/*
|
||||
* systable_inplace_update_begin --- update a row "in place" (overwrite it)
|
||||
*
|
||||
* Overwriting violates both MVCC and transactional safety, so the uses of
|
||||
* this function in Postgres are extremely limited. Nonetheless we find some
|
||||
* places to use it. Standard flow:
|
||||
*
|
||||
* ... [any slow preparation not requiring oldtup] ...
|
||||
* systable_inplace_update_begin([...], &tup, &inplace_state);
|
||||
* if (!HeapTupleIsValid(tup))
|
||||
* elog(ERROR, [...]);
|
||||
* ... [buffer is exclusive-locked; mutate "tup"] ...
|
||||
* if (dirty)
|
||||
* systable_inplace_update_finish(inplace_state, tup);
|
||||
* else
|
||||
* systable_inplace_update_cancel(inplace_state);
|
||||
*
|
||||
* The first several params duplicate the systable_beginscan() param list.
|
||||
* "oldtupcopy" is an output parameter, assigned NULL if the key ceases to
|
||||
* find a live tuple. (In PROC_IN_VACUUM, that is a low-probability transient
|
||||
* condition.) If "oldtupcopy" gets non-NULL, you must pass output parameter
|
||||
* "state" to systable_inplace_update_finish() or
|
||||
* systable_inplace_update_cancel().
|
||||
*/
|
||||
void
|
||||
systable_inplace_update_begin(Relation relation,
|
||||
Oid indexId,
|
||||
bool indexOK,
|
||||
Snapshot snapshot,
|
||||
int nkeys, const ScanKeyData *key,
|
||||
HeapTuple *oldtupcopy,
|
||||
void **state)
|
||||
{
|
||||
int retries = 0;
|
||||
SysScanDesc scan;
|
||||
HeapTuple oldtup;
|
||||
|
||||
/*
|
||||
* For now, we don't allow parallel updates. Unlike a regular update,
|
||||
* this should never create a combo CID, so it might be possible to relax
|
||||
* this restriction, but not without more thought and testing. It's not
|
||||
* clear that it would be useful, anyway.
|
||||
*/
|
||||
if (IsInParallelMode())
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INVALID_TRANSACTION_STATE),
|
||||
errmsg("cannot update tuples during a parallel operation")));
|
||||
|
||||
/*
|
||||
* Accept a snapshot argument, for symmetry, but this function advances
|
||||
* its snapshot as needed to reach the tail of the updated tuple chain.
|
||||
*/
|
||||
Assert(snapshot == NULL);
|
||||
|
||||
Assert(IsInplaceUpdateRelation(relation) || !IsSystemRelation(relation));
|
||||
|
||||
/* Loop for an exclusive-locked buffer of a non-updated tuple. */
|
||||
for (;;)
|
||||
{
|
||||
TupleTableSlot *slot;
|
||||
BufferHeapTupleTableSlot *bslot;
|
||||
|
||||
CHECK_FOR_INTERRUPTS();
|
||||
|
||||
/*
|
||||
* Processes issuing heap_update (e.g. GRANT) at maximum speed could
|
||||
* drive us to this error. A hostile table owner has stronger ways to
|
||||
* damage their own table, so that's minor.
|
||||
*/
|
||||
if (retries++ > 10000)
|
||||
elog(ERROR, "giving up after too many tries to overwrite row");
|
||||
|
||||
INJECTION_POINT("inplace-before-pin");
|
||||
scan = systable_beginscan(relation, indexId, indexOK, snapshot,
|
||||
nkeys, unconstify(ScanKeyData *, key));
|
||||
oldtup = systable_getnext(scan);
|
||||
if (!HeapTupleIsValid(oldtup))
|
||||
{
|
||||
systable_endscan(scan);
|
||||
*oldtupcopy = NULL;
|
||||
return;
|
||||
}
|
||||
|
||||
slot = scan->slot;
|
||||
Assert(TTS_IS_BUFFERTUPLE(slot));
|
||||
bslot = (BufferHeapTupleTableSlot *) slot;
|
||||
if (heap_inplace_lock(scan->heap_rel,
|
||||
bslot->base.tuple, bslot->buffer))
|
||||
break;
|
||||
systable_endscan(scan);
|
||||
};
|
||||
|
||||
*oldtupcopy = heap_copytuple(oldtup);
|
||||
*state = scan;
|
||||
}
|
||||
|
||||
/*
|
||||
* systable_inplace_update_finish --- second phase of inplace update
|
||||
*
|
||||
* The tuple cannot change size, and therefore its header fields and null
|
||||
* bitmap (if any) don't change either.
|
||||
*/
|
||||
void
|
||||
systable_inplace_update_finish(void *state, HeapTuple tuple)
|
||||
{
|
||||
SysScanDesc scan = (SysScanDesc) state;
|
||||
Relation relation = scan->heap_rel;
|
||||
TupleTableSlot *slot = scan->slot;
|
||||
BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
|
||||
HeapTuple oldtup = bslot->base.tuple;
|
||||
Buffer buffer = bslot->buffer;
|
||||
|
||||
heap_inplace_update_and_unlock(relation, oldtup, tuple, buffer);
|
||||
systable_endscan(scan);
|
||||
}
|
||||
|
||||
/*
|
||||
* systable_inplace_update_cancel --- abandon inplace update
|
||||
*
|
||||
* This is an alternative to making a no-op update.
|
||||
*/
|
||||
void
|
||||
systable_inplace_update_cancel(void *state)
|
||||
{
|
||||
SysScanDesc scan = (SysScanDesc) state;
|
||||
Relation relation = scan->heap_rel;
|
||||
TupleTableSlot *slot = scan->slot;
|
||||
BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
|
||||
HeapTuple oldtup = bslot->base.tuple;
|
||||
Buffer buffer = bslot->buffer;
|
||||
|
||||
heap_inplace_unlock(relation, oldtup, buffer);
|
||||
systable_endscan(scan);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user