mirror of
https://github.com/postgres/postgres.git
synced 2025-10-25 13:17:41 +03:00
to eliminate unnecessary deadlocks. This commit adds SELECT ... FOR SHARE paralleling SELECT ... FOR UPDATE. The implementation uses a new SLRU data structure (managed much like pg_subtrans) to represent multiple- transaction-ID sets. When more than one transaction is holding a shared lock on a particular row, we create a MultiXactId representing that set of transactions and store its ID in the row's XMAX. This scheme allows an effectively unlimited number of row locks, just as we did before, while not costing any extra overhead except when a shared lock actually has to be shared. Still TODO: use the regular lock manager to control the grant order when multiple backends are waiting for a row lock. Alvaro Herrera and Tom Lane.
514 lines
16 KiB
C
514 lines
16 KiB
C
/*-------------------------------------------------------------------------
 *
 * htup.h
 *    POSTGRES heap tuple definitions.
 *
 *
 * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * $PostgreSQL: pgsql/src/include/access/htup.h,v 1.74 2005/04/28 21:47:17 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
#ifndef HTUP_H
#define HTUP_H

#include "storage/bufpage.h"
#include "storage/relfilenode.h"
#include "access/transam.h"

/*
 * MaxTupleAttributeNumber limits the number of (user) columns in a tuple.
 * The key limit on this value is that the size of the fixed overhead for
 * a tuple, plus the size of the null-values bitmap (at 1 bit per column),
 * plus MAXALIGN alignment, must fit into t_hoff which is uint8.  On most
 * machines the upper limit without making t_hoff wider would be a little
 * over 1700.  We use round numbers here and for MaxHeapAttributeNumber
 * so that alterations in HeapTupleHeaderData layout won't change the
 * supported max number of columns.
 */
#define MaxTupleAttributeNumber 1664    /* 8 * 208 */

/*----------
 * MaxHeapAttributeNumber limits the number of (user) columns in a table.
 * This should be somewhat less than MaxTupleAttributeNumber.  It must be
 * at least one less, else we will fail to do UPDATEs on a maximal-width
 * table (because UPDATE has to form working tuples that include CTID).
 * In practice we want some additional daylight so that we can gracefully
 * support operations that add hidden "resjunk" columns, for example
 * SELECT * FROM wide_table ORDER BY foo, bar, baz.
 * In any case, depending on column data types you will likely be running
 * into the disk-block-based limit on overall tuple size if you have more
 * than a thousand or so columns.  TOAST won't help.
 *----------
 */
#define MaxHeapAttributeNumber  1600    /* 8 * 200 */

/*----------
 * Heap tuple header.  To avoid wasting space, the fields should be
 * laid out in such a way as to avoid structure padding.
 *
 * Datums of composite types (row types) share the same general structure
 * as on-disk tuples, so that the same routines can be used to build and
 * examine them.  However the requirements are slightly different: a Datum
 * does not need any transaction visibility information, and it does need
 * a length word and some embedded type information.  We can achieve this
 * by overlaying the xmin/cmin/xmax/cmax/xvac fields of a heap tuple
 * with the fields needed in the Datum case.  Typically, all tuples built
 * in-memory will be initialized with the Datum fields; but when a tuple is
 * about to be inserted in a table, the transaction fields will be filled,
 * overwriting the datum fields.
 *
 * The overall structure of a heap tuple looks like:
 *          fixed fields (HeapTupleHeaderData struct)
 *          nulls bitmap (if HEAP_HASNULL is set in t_infomask)
 *          alignment padding (as needed to make user data MAXALIGN'd)
 *          object ID (if HEAP_HASOID is set in t_infomask)
 *          user data fields
 *
 * We store five "virtual" fields Xmin, Cmin, Xmax, Cmax, and Xvac in four
 * physical fields.  Xmin, Cmin and Xmax are always really stored, but
 * Cmax and Xvac share a field.  This works because we know that there are
 * only a limited number of states that a tuple can be in, and that Cmax
 * is only interesting for the lifetime of the deleting transaction.
 * This assumes that VACUUM FULL never tries to move a tuple whose Cmax
 * is still interesting (ie, delete-in-progress).
 *
 * Note that in 7.3 and 7.4 a similar idea was applied to Xmax and Cmin.
 * However, with the advent of subtransactions, a tuple may need both Xmax
 * and Cmin simultaneously, so this is no longer possible.
 *
 * Following the fixed header fields, the nulls bitmap is stored (beginning
 * at t_bits).  The bitmap is *not* stored if t_infomask shows that there
 * are no nulls in the tuple.  If an OID field is present (as indicated by
 * t_infomask), then it is stored just before the user data, which begins at
 * the offset shown by t_hoff.  Note that t_hoff must be a multiple of
 * MAXALIGN.
 *----------
 */

typedef struct HeapTupleFields
|
|
{
|
|
TransactionId t_xmin; /* inserting xact ID */
|
|
CommandId t_cmin; /* inserting command ID */
|
|
TransactionId t_xmax; /* deleting or locking xact ID */
|
|
|
|
union
|
|
{
|
|
CommandId t_cmax; /* deleting or locking command ID */
|
|
TransactionId t_xvac; /* VACUUM FULL xact ID */
|
|
} t_field4;
|
|
} HeapTupleFields;
|
|
|
|
typedef struct DatumTupleFields
|
|
{
|
|
int32 datum_len; /* required to be a varlena type */
|
|
|
|
int32 datum_typmod; /* -1, or identifier of a record type */
|
|
|
|
Oid datum_typeid; /* composite type OID, or RECORDOID */
|
|
|
|
/*
|
|
* Note: field ordering is chosen with thought that Oid might someday
|
|
* widen to 64 bits.
|
|
*/
|
|
} DatumTupleFields;
|
|
|
|
typedef struct HeapTupleHeaderData
|
|
{
|
|
union
|
|
{
|
|
HeapTupleFields t_heap;
|
|
DatumTupleFields t_datum;
|
|
} t_choice;
|
|
|
|
ItemPointerData t_ctid; /* current TID of this or newer tuple */
|
|
|
|
int16 t_natts; /* number of attributes */
|
|
|
|
uint16 t_infomask; /* various flag bits, see below */
|
|
|
|
uint8 t_hoff; /* sizeof header incl. bitmap, padding */
|
|
|
|
/* ^ - 27 bytes - ^ */
|
|
|
|
bits8 t_bits[1]; /* bitmap of NULLs -- VARIABLE LENGTH */
|
|
|
|
/* MORE DATA FOLLOWS AT END OF STRUCT */
|
|
} HeapTupleHeaderData;
|
|
|
|
typedef HeapTupleHeaderData *HeapTupleHeader;
|
|
|
|
/*
 * information stored in t_infomask:
 */
#define HEAP_HASNULL            0x0001  /* has null attribute(s) */
#define HEAP_HASVARWIDTH        0x0002  /* has variable-width attribute(s) */
#define HEAP_HASEXTERNAL        0x0004  /* has external stored attribute(s) */
#define HEAP_HASCOMPRESSED      0x0008  /* has compressed stored attribute(s) */
#define HEAP_HASEXTENDED        0x000C  /* the two above combined */
#define HEAP_HASOID             0x0010  /* has an object-id field */
/* 0x0020 is presently unused */
#define HEAP_XMAX_EXCL_LOCK     0x0040  /* xmax is exclusive locker */
#define HEAP_XMAX_SHARED_LOCK   0x0080  /* xmax is shared locker */
/* if either LOCK bit is set, xmax hasn't deleted the tuple, only locked it */
#define HEAP_IS_LOCKED  (HEAP_XMAX_EXCL_LOCK | HEAP_XMAX_SHARED_LOCK)
#define HEAP_XMIN_COMMITTED     0x0100  /* t_xmin committed */
#define HEAP_XMIN_INVALID       0x0200  /* t_xmin invalid/aborted */
#define HEAP_XMAX_COMMITTED     0x0400  /* t_xmax committed */
#define HEAP_XMAX_INVALID       0x0800  /* t_xmax invalid/aborted */
#define HEAP_XMAX_IS_MULTI      0x1000  /* t_xmax is a MultiXactId */
#define HEAP_UPDATED            0x2000  /* this is UPDATEd version of row */
#define HEAP_MOVED_OFF          0x4000  /* moved to another place by VACUUM FULL */
#define HEAP_MOVED_IN           0x8000  /* moved from another place by VACUUM FULL */
#define HEAP_MOVED (HEAP_MOVED_OFF | HEAP_MOVED_IN)

/* All bits from 0x0040 upward, i.e. everything except the HEAP_HAS* bits */
#define HEAP_XACT_MASK          0xFFC0  /* visibility-related bits */

/*
 * HeapTupleHeader accessor macros
 *
 * Note: beware of multiple evaluations of "tup" argument.  But the Set
 * macros evaluate their other argument only once.
 *
 * The macros below read and write the always-stored transaction fields
 * t_xmin, t_xmax and t_cmin of the t_heap variant.  Transaction IDs are
 * stored through TransactionIdStore; the command ID is assigned directly.
 */

#define HeapTupleHeaderGetXmin(tup) \
( \
    (tup)->t_choice.t_heap.t_xmin \
)

#define HeapTupleHeaderSetXmin(tup, xid) \
( \
    TransactionIdStore((xid), &(tup)->t_choice.t_heap.t_xmin) \
)

#define HeapTupleHeaderGetXmax(tup) \
( \
    (tup)->t_choice.t_heap.t_xmax \
)

#define HeapTupleHeaderSetXmax(tup, xid) \
( \
    TransactionIdStore((xid), &(tup)->t_choice.t_heap.t_xmax) \
)

#define HeapTupleHeaderGetCmin(tup) \
( \
    (tup)->t_choice.t_heap.t_cmin \
)

#define HeapTupleHeaderSetCmin(tup, cid) \
( \
    (tup)->t_choice.t_heap.t_cmin = (cid) \
)

/*
 * Accessors for t_field4, which is shared by Cmax and Xvac.
 *
 * Note: GetCmax will produce wrong answers after SetXvac has been executed
 * by a transaction other than the inserting one.  We could check
 * HEAP_XMAX_INVALID and return FirstCommandId if it's clear, but since that
 * bit will be set again if the deleting transaction aborts, there'd be no
 * real gain in safety from the extra test.  So, just rely on the caller not
 * to trust the value unless it's meaningful.
 */
#define HeapTupleHeaderGetCmax(tup) \
( \
    (tup)->t_choice.t_heap.t_field4.t_cmax \
)

/* Cmax may only be stored while neither HEAP_MOVED bit is set. */
#define HeapTupleHeaderSetCmax(tup, cid) \
do { \
    Assert(!((tup)->t_infomask & HEAP_MOVED)); \
    (tup)->t_choice.t_heap.t_field4.t_cmax = (cid); \
} while (0)

/* Yields InvalidTransactionId unless a HEAP_MOVED bit is set. */
#define HeapTupleHeaderGetXvac(tup) \
( \
    ((tup)->t_infomask & HEAP_MOVED) ? \
        (tup)->t_choice.t_heap.t_field4.t_xvac \
    : \
        InvalidTransactionId \
)

/* Xvac may only be stored once a HEAP_MOVED bit has been set. */
#define HeapTupleHeaderSetXvac(tup, xid) \
do { \
    Assert((tup)->t_infomask & HEAP_MOVED); \
    TransactionIdStore((xid), &(tup)->t_choice.t_heap.t_field4.t_xvac); \
} while (0)

/*
 * Accessors for the Datum variant (t_datum) of the header union:
 * length word, composite type OID, and typmod.  Each macro evaluates
 * "tup" once; the Set macros are plain assignments.
 */
#define HeapTupleHeaderGetDatumLength(tup) \
( \
    (tup)->t_choice.t_datum.datum_len \
)

#define HeapTupleHeaderSetDatumLength(tup, len) \
( \
    (tup)->t_choice.t_datum.datum_len = (len) \
)

#define HeapTupleHeaderGetTypeId(tup) \
( \
    (tup)->t_choice.t_datum.datum_typeid \
)

#define HeapTupleHeaderSetTypeId(tup, typeid) \
( \
    (tup)->t_choice.t_datum.datum_typeid = (typeid) \
)

#define HeapTupleHeaderGetTypMod(tup) \
( \
    (tup)->t_choice.t_datum.datum_typmod \
)

#define HeapTupleHeaderSetTypMod(tup, typmod) \
( \
    (tup)->t_choice.t_datum.datum_typmod = (typmod) \
)

/*
 * Accessors for the optional OID, which is located sizeof(Oid) bytes
 * before the offset given by t_hoff.  GetOid yields InvalidOid when
 * HEAP_HASOID is not set; SetOid asserts that the flag is set.
 * Beware: "tup" is evaluated more than once.
 */
#define HeapTupleHeaderGetOid(tup) \
( \
    ((tup)->t_infomask & HEAP_HASOID) ? \
        *((Oid *) ((char *)(tup) + (tup)->t_hoff - sizeof(Oid))) \
    : \
        InvalidOid \
)

#define HeapTupleHeaderSetOid(tup, oid) \
do { \
    Assert((tup)->t_infomask & HEAP_HASOID); \
    *((Oid *) ((char *)(tup) + (tup)->t_hoff - sizeof(Oid))) = (oid); \
} while (0)

/*
 * BITMAPLEN(NATTS) -
 *      Computes size of null bitmap given number of data columns.
 *
 * One bit per column, rounded up to whole bytes; NATTS is cast to int
 * before the arithmetic.
 */
#define BITMAPLEN(NATTS)    (((int)(NATTS) + 7) / 8)

/*
 * MaxTupleSize is the maximum allowed size of a tuple, including header and
 * MAXALIGN alignment padding.  Basically it's BLCKSZ minus the other stuff
 * that has to be on a disk page.  The "other stuff" includes access-method-
 * dependent "special space", which we assume will be no more than
 * MaxSpecialSpace bytes (currently, on heap pages it's actually zero).
 *
 * NOTE: we do not need to count an ItemId for the tuple because
 * sizeof(PageHeaderData) includes the first ItemId on the page.
 */
#define MaxSpecialSpace 32

#define MaxTupleSize    \
    (BLCKSZ - MAXALIGN(sizeof(PageHeaderData) + MaxSpecialSpace))

/*
 * MaxAttrSize is a somewhat arbitrary upper limit on the declared size of
 * data fields of char(n) and similar types.  It need not have anything
 * directly to do with the *actual* upper limit of varlena values, which
 * is currently 1Gb (see struct varattrib in postgres.h).  I've set it
 * at 10Mb which seems like a reasonable number --- tgl 8/6/00.
 */
#define MaxAttrSize     (10 * 1024 * 1024)

/*
 * Attribute numbers for the system-defined attributes
 *
 * All of them are negative; FirstLowInvalidHeapAttributeNumber is one
 * below the lowest number assigned here.
 */
#define SelfItemPointerAttributeNumber          (-1)
#define ObjectIdAttributeNumber                 (-2)
#define MinTransactionIdAttributeNumber         (-3)
#define MinCommandIdAttributeNumber             (-4)
#define MaxTransactionIdAttributeNumber         (-5)
#define MaxCommandIdAttributeNumber             (-6)
#define TableOidAttributeNumber                 (-7)
#define FirstLowInvalidHeapAttributeNumber      (-8)

/*
|
|
* HeapTupleData is an in-memory data structure that points to a tuple.
|
|
*
|
|
* This new HeapTuple for version >= 6.5 and this is why it was changed:
|
|
*
|
|
* 1. t_len moved off on-disk tuple data - ItemIdData is used to get len;
|
|
* 2. t_ctid above is not self tuple TID now - it may point to
|
|
* updated version of tuple (required by MVCC);
|
|
* 3. someday someone let tuple to cross block boundaries -
|
|
* he have to add something below...
|
|
*
|
|
* Change for 7.0:
|
|
* Up to now t_data could be NULL, the memory location directly following
|
|
* HeapTupleData, or pointing into a buffer. Now, it could also point to
|
|
* a separate allocation that was done in the t_datamcxt memory context.
|
|
*/
|
|
typedef struct HeapTupleData
|
|
{
|
|
uint32 t_len; /* length of *t_data */
|
|
ItemPointerData t_self; /* SelfItemPointer */
|
|
Oid t_tableOid; /* table the tuple came from */
|
|
MemoryContext t_datamcxt; /* memory context of allocation */
|
|
HeapTupleHeader t_data; /* -> tuple header and data */
|
|
} HeapTupleData;
|
|
|
|
typedef HeapTupleData *HeapTuple;
|
|
|
|
/* MAXALIGN'd size of the HeapTupleData struct itself */
#define HEAPTUPLESIZE   MAXALIGN(sizeof(HeapTupleData))

/*
 * GETSTRUCT - given a HeapTuple pointer, return address of the user data
 *
 * The result is a char * located t_hoff bytes past the start of the
 * tuple header.  Beware: TUP is evaluated twice.
 */
#define GETSTRUCT(TUP) ((char *) ((TUP)->t_data) + (TUP)->t_data->t_hoff)

/*
 * Accessor macros to be used with HeapTuple pointers.
 *
 * The Has* and No* / AllFixed macros test flag bits in
 * tuple->t_data->t_infomask and yield 0 or 1.  HeapTupleGetOid and
 * HeapTupleSetOid forward to the HeapTupleHeader versions.
 */
#define HeapTupleIsValid(tuple) PointerIsValid(tuple)

#define HeapTupleHasNulls(tuple) \
        (((tuple)->t_data->t_infomask & HEAP_HASNULL) != 0)

#define HeapTupleNoNulls(tuple) \
        (!((tuple)->t_data->t_infomask & HEAP_HASNULL))

#define HeapTupleHasVarWidth(tuple) \
        (((tuple)->t_data->t_infomask & HEAP_HASVARWIDTH) != 0)

#define HeapTupleAllFixed(tuple) \
        (!((tuple)->t_data->t_infomask & HEAP_HASVARWIDTH))

#define HeapTupleHasExternal(tuple) \
        (((tuple)->t_data->t_infomask & HEAP_HASEXTERNAL) != 0)

#define HeapTupleHasCompressed(tuple) \
        (((tuple)->t_data->t_infomask & HEAP_HASCOMPRESSED) != 0)

/* True if either the external or the compressed bit is set */
#define HeapTupleHasExtended(tuple) \
        (((tuple)->t_data->t_infomask & HEAP_HASEXTENDED) != 0)

#define HeapTupleGetOid(tuple) \
        HeapTupleHeaderGetOid((tuple)->t_data)

#define HeapTupleSetOid(tuple, oid) \
        HeapTupleHeaderSetOid((tuple)->t_data, (oid))

/*
 * WAL record definitions for heapam.c's WAL operations
 *
 * XLOG allows to store some information in high 4 bits of log
 * record xl_info field.  We use 3 for opcode and one for init bit.
 */
#define XLOG_HEAP_INSERT    0x00
#define XLOG_HEAP_DELETE    0x10
#define XLOG_HEAP_UPDATE    0x20
#define XLOG_HEAP_MOVE      0x30
#define XLOG_HEAP_CLEAN     0x40
#define XLOG_HEAP_NEWPAGE   0x50
#define XLOG_HEAP_LOCK      0x60
/* opcode 0x70 still free */
#define XLOG_HEAP_OPMASK    0x70
/*
 * When we insert 1st item on new page in INSERT/UPDATE
 * we can (and we do) restore entire page in redo
 */
#define XLOG_HEAP_INIT_PAGE 0x80

/*
|
|
* All what we need to find changed tuple
|
|
*
|
|
* NB: on most machines, sizeof(xl_heaptid) will include some trailing pad
|
|
* bytes for alignment. We don't want to store the pad space in the XLOG,
|
|
* so use SizeOfHeapTid for space calculations. Similar comments apply for
|
|
* the other xl_FOO structs.
|
|
*/
|
|
typedef struct xl_heaptid
|
|
{
|
|
RelFileNode node;
|
|
ItemPointerData tid; /* changed tuple id */
|
|
} xl_heaptid;
|
|
|
|
#define SizeOfHeapTid (offsetof(xl_heaptid, tid) + SizeOfIptrData)
|
|
|
|
/* This is what we need to know about delete */
|
|
typedef struct xl_heap_delete
|
|
{
|
|
xl_heaptid target; /* deleted tuple id */
|
|
} xl_heap_delete;
|
|
|
|
#define SizeOfHeapDelete (offsetof(xl_heap_delete, target) + SizeOfHeapTid)
|
|
|
|
/*
|
|
* We don't store the whole fixed part (HeapTupleHeaderData) of an inserted
|
|
* or updated tuple in WAL; we can save a few bytes by reconstructing the
|
|
* fields that are available elsewhere in the WAL record, or perhaps just
|
|
* plain needn't be reconstructed. These are the fields we must store.
|
|
* NOTE: t_hoff could be recomputed, but we may as well store it because
|
|
* it will come for free due to alignment considerations.
|
|
*/
|
|
typedef struct xl_heap_header
|
|
{
|
|
int16 t_natts;
|
|
uint16 t_infomask;
|
|
uint8 t_hoff;
|
|
} xl_heap_header;
|
|
|
|
#define SizeOfHeapHeader (offsetof(xl_heap_header, t_hoff) + sizeof(uint8))
|
|
|
|
/* This is what we need to know about insert */
|
|
typedef struct xl_heap_insert
|
|
{
|
|
xl_heaptid target; /* inserted tuple id */
|
|
/* xl_heap_header & TUPLE DATA FOLLOWS AT END OF STRUCT */
|
|
} xl_heap_insert;
|
|
|
|
#define SizeOfHeapInsert (offsetof(xl_heap_insert, target) + SizeOfHeapTid)
|
|
|
|
/* This is what we need to know about update|move */
|
|
typedef struct xl_heap_update
|
|
{
|
|
xl_heaptid target; /* deleted tuple id */
|
|
ItemPointerData newtid; /* new inserted tuple id */
|
|
/* NEW TUPLE xl_heap_header (PLUS xmax & xmin IF MOVE OP) */
|
|
/* and TUPLE DATA FOLLOWS AT END OF STRUCT */
|
|
} xl_heap_update;
|
|
|
|
#define SizeOfHeapUpdate (offsetof(xl_heap_update, newtid) + SizeOfIptrData)
|
|
|
|
/* This is what we need to know about vacuum page cleanup */
|
|
typedef struct xl_heap_clean
|
|
{
|
|
RelFileNode node;
|
|
BlockNumber block;
|
|
/* UNUSED OFFSET NUMBERS FOLLOW AT THE END */
|
|
} xl_heap_clean;
|
|
|
|
#define SizeOfHeapClean (offsetof(xl_heap_clean, block) + sizeof(BlockNumber))
|
|
|
|
/* This is for replacing a page's contents in toto */
|
|
/* NB: this is used for indexes as well as heaps */
|
|
typedef struct xl_heap_newpage
|
|
{
|
|
RelFileNode node;
|
|
BlockNumber blkno; /* location of new page */
|
|
/* entire page contents follow at end of record */
|
|
} xl_heap_newpage;
|
|
|
|
#define SizeOfHeapNewpage (offsetof(xl_heap_newpage, blkno) + sizeof(BlockNumber))
|
|
|
|
/* This is what we need to know about lock */
|
|
typedef struct xl_heap_lock
|
|
{
|
|
xl_heaptid target; /* locked tuple id */
|
|
bool shared_lock; /* shared or exclusive row lock? */
|
|
} xl_heap_lock;
|
|
|
|
#define SizeOfHeapLock (offsetof(xl_heap_lock, shared_lock) + sizeof(bool))
|
|
|
|
#endif   /* HTUP_H */