Implement genuine serializable isolation level.

Until now, our Serializable mode has in fact been what's called Snapshot
Isolation, which allows some anomalies that could not occur in any
serialized ordering of the transactions. This patch fixes that using a
method called Serializable Snapshot Isolation, based on research papers by
Michael J. Cahill (see README-SSI for full references). In Serializable
Snapshot Isolation, transactions run like they do in Snapshot Isolation,
but a predicate lock manager observes the reads and writes performed and
aborts transactions if it detects that an anomaly might occur. This method
produces some false positives, i.e., it sometimes aborts transactions even
though there is no anomaly.
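
As a concrete example, consider the classic "write skew" anomaly (the table
and values below are hypothetical, for illustration only). Under Repeatable
Read both transactions commit, leaving no doctor on call; under the new
Serializable level, one of the two (which one depends on timing) is rolled
back with SQLSTATE 40001:

    -- Setup: CREATE TABLE doctors (name text, on_call bool);
    -- 'alice' and 'bob' both have on_call = true.

    -- Session 1:
    BEGIN ISOLATION LEVEL SERIALIZABLE;
    SELECT count(*) FROM doctors WHERE on_call;               -- sees 2
    UPDATE doctors SET on_call = false WHERE name = 'alice';

    -- Session 2, concurrently:
    BEGIN ISOLATION LEVEL SERIALIZABLE;
    SELECT count(*) FROM doctors WHERE on_call;               -- also sees 2
    UPDATE doctors SET on_call = false WHERE name = 'bob';
    COMMIT;

    -- Session 1:
    COMMIT;
    -- ERROR: could not serialize access due to read/write dependencies
    -- among transactions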

To track reads we implement predicate locking, see storage/lmgr/predicate.c.
Whenever a tuple is read, a predicate lock is acquired on the tuple. Shared
memory is finite, so when a transaction takes many tuple-level locks on a
page, the locks are promoted to a single page-level lock, and further to a
single relation-level lock if necessary. To lock key values with no matching
tuple, a sequential scan always takes a relation-level lock, and an index
scan acquires a page-level lock that covers the search key, whether or not
there are any matching keys at the moment.
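
In PostgreSQL 9.1 as released, predicate locks are visible in pg_locks with
mode SIREAD (the view integration was a follow-up commit), which makes the
promotion behavior observable. An illustrative session; the table name and
the exact granularities shown are hypothetical and depend on the plan chosen
and on the promotion thresholds:

    BEGIN ISOLATION LEVEL SERIALIZABLE;
    SELECT * FROM accounts WHERE id BETWEEN 1 AND 100;

    SELECT locktype, relation::regclass, page, tuple
    FROM pg_locks WHERE mode = 'SIREAD';
    -- hypothetical output:
    --  locktype | relation | page | tuple
    --  tuple    | accounts |    0 |     3    (a few rows read)
    --  page     | accounts |    2 |          (after tuple-to-page promotion)
    --  relation | accounts |      |          (seq scan, or further promotion)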

Predicate locks don't conflict with regular locks or with each other in the
normal sense; they're only used by the predicate lock manager to detect the
danger of anomalies. Only serializable transactions participate in predicate
locking, so there should be no extra overhead for other transactions.

Predicate locks can't be released at commit, but must be remembered until
all the transactions that overlapped with them have completed. That means we
need to remember an unbounded number of predicate locks, so we apply a lossy
but conservative method of tracking locks for committed transactions. If we
run short of shared memory, we overflow to a new "pg_serial" SLRU pool.
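
The in-memory predicate-lock table is sized by a new GUC,
max_pred_locks_per_transaction (default 64); when a transaction runs short
of entries, promotion to coarser lock granularities kicks in sooner. A quick
check of the setting (changing it requires a server restart, since it sizes
shared memory):

    SHOW max_pred_locks_per_transaction;
    -- 64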

We don't currently allow Serializable transactions in Hot Standby mode.
That would be hard, because even read-only transactions can cause anomalies
that wouldn't otherwise occur.

Serializable isolation mode now means the new fully serializable level.
Repeatable Read gives you the old Snapshot Isolation level that we have
always had.
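
For clarity, the standard syntax for requesting each behavior:

    BEGIN ISOLATION LEVEL SERIALIZABLE;      -- new: true serializability (SSI)
    BEGIN ISOLATION LEVEL REPEATABLE READ;   -- old behavior: Snapshot Isolation
    SET default_transaction_isolation = 'serializable';  -- session default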

Kevin Grittner and Dan Ports, reviewed by Jeff Davis, Heikki Linnakangas and
Anssi Kääriäinen
commit dafaa3efb7
parent c18f51da17
Author: Heikki Linnakangas
Date:   2011-02-07 23:46:51 +02:00

90 changed files with 14995 additions and 271 deletions

src/backend/access/heap/heapam.c

@@ -57,6 +57,7 @@
 #include "storage/bufmgr.h"
 #include "storage/freespace.h"
 #include "storage/lmgr.h"
+#include "storage/predicate.h"
 #include "storage/procarray.h"
 #include "storage/smgr.h"
 #include "storage/standby.h"
@@ -261,20 +262,20 @@ heapgetpage(HeapScanDesc scan, BlockNumber page)
     {
         if (ItemIdIsNormal(lpp))
         {
+            HeapTupleData loctup;
             bool        valid;

+            loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
+            loctup.t_len = ItemIdGetLength(lpp);
+            ItemPointerSet(&(loctup.t_self), page, lineoff);
+
             if (all_visible)
                 valid = true;
             else
-            {
-                HeapTupleData loctup;
-
-                loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
-                loctup.t_len = ItemIdGetLength(lpp);
-                ItemPointerSet(&(loctup.t_self), page, lineoff);
-
                 valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
-            }

+            CheckForSerializableConflictOut(valid, scan->rs_rd, &loctup, buffer);
+
             if (valid)
                 scan->rs_vistuples[ntup++] = lineoff;
         }
@@ -468,12 +469,16 @@ heapgettup(HeapScanDesc scan,
                                             snapshot,
                                             scan->rs_cbuf);

+            CheckForSerializableConflictOut(valid, scan->rs_rd, tuple, scan->rs_cbuf);
+
             if (valid && key != NULL)
                 HeapKeyTest(tuple, RelationGetDescr(scan->rs_rd),
                             nkeys, key, valid);

             if (valid)
             {
+                if (!scan->rs_relpredicatelocked)
+                    PredicateLockTuple(scan->rs_rd, tuple);
                 LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
                 return;
             }
@@ -741,12 +746,16 @@ heapgettup_pagemode(HeapScanDesc scan,
                             nkeys, key, valid);
                 if (valid)
                 {
+                    if (!scan->rs_relpredicatelocked)
+                        PredicateLockTuple(scan->rs_rd, tuple);
                     scan->rs_cindex = lineindex;
                     return;
                 }
             }
             else
             {
+                if (!scan->rs_relpredicatelocked)
+                    PredicateLockTuple(scan->rs_rd, tuple);
                 scan->rs_cindex = lineindex;
                 return;
             }
@@ -1213,6 +1222,7 @@ heap_beginscan_internal(Relation relation, Snapshot snapshot,
     scan->rs_strategy = NULL;   /* set in initscan */
     scan->rs_allow_strat = allow_strat;
     scan->rs_allow_sync = allow_sync;
+    scan->rs_relpredicatelocked = false;

     /*
      * we can use page-at-a-time mode if it's an MVCC-safe snapshot
@@ -1459,8 +1469,13 @@ heap_fetch(Relation relation,
      */
     valid = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer);

+    if (valid)
+        PredicateLockTuple(relation, tuple);
+
     LockBuffer(buffer, BUFFER_LOCK_UNLOCK);

+    CheckForSerializableConflictOut(valid, relation, tuple, buffer);
+
     if (valid)
     {
         /*
@@ -1506,13 +1521,15 @@ heap_fetch(Relation relation,
  * heap_fetch, we do not report any pgstats count; caller may do so if wanted.
  */
 bool
-heap_hot_search_buffer(ItemPointer tid, Buffer buffer, Snapshot snapshot,
-                       bool *all_dead)
+heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
+                       Snapshot snapshot, bool *all_dead)
 {
     Page        dp = (Page) BufferGetPage(buffer);
     TransactionId prev_xmax = InvalidTransactionId;
     OffsetNumber offnum;
     bool        at_chain_start;
+    bool        valid;
+    bool        match_found;

     if (all_dead)
         *all_dead = true;
@@ -1522,6 +1539,7 @@ heap_hot_search_buffer(ItemPointer tid, Buffer buffer, Snapshot snapshot,
     Assert(ItemPointerGetBlockNumber(tid) == BufferGetBlockNumber(buffer));
     offnum = ItemPointerGetOffsetNumber(tid);
     at_chain_start = true;
+    match_found = false;

     /* Scan through possible multiple members of HOT-chain */
     for (;;)
@@ -1552,6 +1570,8 @@ heap_hot_search_buffer(ItemPointer tid, Buffer buffer, Snapshot snapshot,
         heapTuple.t_data = (HeapTupleHeader) PageGetItem(dp, lp);
         heapTuple.t_len = ItemIdGetLength(lp);
+        heapTuple.t_tableOid = relation->rd_id;
+        heapTuple.t_self = *tid;

         /*
          * Shouldn't see a HEAP_ONLY tuple at chain start.
@@ -1569,12 +1589,18 @@ heap_hot_search_buffer(ItemPointer tid, Buffer buffer, Snapshot snapshot,
             break;

         /* If it's visible per the snapshot, we must return it */
-        if (HeapTupleSatisfiesVisibility(&heapTuple, snapshot, buffer))
+        valid = HeapTupleSatisfiesVisibility(&heapTuple, snapshot, buffer);
+        CheckForSerializableConflictOut(valid, relation, &heapTuple, buffer);
+        if (valid)
         {
             ItemPointerSetOffsetNumber(tid, offnum);
+            PredicateLockTuple(relation, &heapTuple);
             if (all_dead)
                 *all_dead = false;
-            return true;
+            if (IsolationIsSerializable())
+                match_found = true;
+            else
+                return true;
         }

         /*
@@ -1603,7 +1629,7 @@ heap_hot_search_buffer(ItemPointer tid, Buffer buffer, Snapshot snapshot,
             break;              /* end of chain */
     }

-    return false;
+    return match_found;
 }

 /*
@@ -1622,7 +1648,7 @@ heap_hot_search(ItemPointer tid, Relation relation, Snapshot snapshot,
     buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
     LockBuffer(buffer, BUFFER_LOCK_SHARE);
-    result = heap_hot_search_buffer(tid, buffer, snapshot, all_dead);
+    result = heap_hot_search_buffer(tid, relation, buffer, snapshot, all_dead);
     LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
     ReleaseBuffer(buffer);
     return result;
@@ -1729,6 +1755,7 @@ heap_get_latest_tid(Relation relation,
      * result candidate.
      */
     valid = HeapTupleSatisfiesVisibility(&tp, snapshot, buffer);
+    CheckForSerializableConflictOut(valid, relation, &tp, buffer);
     if (valid)
         *tid = ctid;
@@ -1893,6 +1920,13 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
     buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
                                        InvalidBuffer, options, bistate);

+    /*
+     * We're about to do the actual insert -- check for conflict at the
+     * relation or buffer level first, to avoid possibly having to roll
+     * back work we've just done.
+     */
+    CheckForSerializableConflictIn(relation, NULL, buffer);
+
     /* NO EREPORT(ERROR) from here till changes are logged */
     START_CRIT_SECTION();
@@ -2193,6 +2227,12 @@ l1:
         return result;
     }

+    /*
+     * We're about to do the actual delete -- check for conflict first,
+     * to avoid possibly having to roll back work we've just done.
+     */
+    CheckForSerializableConflictIn(relation, &tp, buffer);
+
     /* replace cid with a combo cid if necessary */
     HeapTupleHeaderAdjustCmax(tp.t_data, &cid, &iscombo);
@@ -2546,6 +2586,12 @@ l2:
         return result;
     }

+    /*
+     * We're about to do the actual update -- check for conflict first,
+     * to avoid possibly having to roll back work we've just done.
+     */
+    CheckForSerializableConflictIn(relation, &oldtup, buffer);
+
     /* Fill in OID and transaction status data for newtup */
     if (relation->rd_rel->relhasoids)
     {
@@ -2690,6 +2736,16 @@ l2:
         heaptup = newtup;
     }

+    /*
+     * We're about to create the new tuple -- check for conflict first,
+     * to avoid possibly having to roll back work we've just done.
+     *
+     * NOTE: For a tuple insert, we only need to check for table locks, since
+     * predicate locking at the index level will cover ranges for anything
+     * except a table scan. Therefore, only provide the relation.
+     */
+    CheckForSerializableConflictIn(relation, NULL, InvalidBuffer);
+
     /*
      * At this point newbuf and buffer are both pinned and locked, and newbuf
      * has enough space for the new tuple. If they are the same buffer, only
@@ -2799,6 +2855,12 @@ l2:
     END_CRIT_SECTION();

+    /*
+     * Any existing SIREAD locks on the old tuple must be linked to the new
+     * tuple for conflict detection purposes.
+     */
+    PredicateLockTupleRowVersionLink(relation, &oldtup, newtup);
+
     if (newbuf != buffer)
         LockBuffer(newbuf, BUFFER_LOCK_UNLOCK);
     LockBuffer(buffer, BUFFER_LOCK_UNLOCK);