Mirror of https://github.com/postgres/postgres.git, synced 2025-11-13 16:22:44 +03:00
pgindent run for 9.4
This includes removing tabs after periods in C comments, which was applied to back branches, so this change should not affect backpatching.
[Diff body not reproduced. The side-by-side diff viewer was captured as interleaved gutter markers and duplicated old/new rows, with no file names or +/- change markers, so the individual hunks cannot be reconstructed here. Every visible change is pgindent reformatting: comment re-wrapping, re-indentation, and whitespace normalization. The captured hunks fall in:
- heap/index tuple and catalog support code: nocachegetattr, heap_getsysattr, heap_form_tuple, index_form_tuple, nocache_index_getattr, printtup_startup, the reloptions code (add_real_reloption, extractRelOptions, default_reloptions), tuple conversion, and TupleDescInitEntryCollation;
- GIN: the array consistent functions, btree insertion and split handling (ginFindParents, ginPlaceToPage, ginFinishSplit), posting-tree leaf code (GinDataPageAddPostingItem, dataPlaceToPageLeaf, addItemsToLeaf, leafRepackItems, createPostingTree), tuple formation (GinFormTuple, ginReadTuple), the pending list (ginHeapTupleFastInsert, ginInsertCleanup), scan code (collectMatchBitmap, startScanKey, entryLoadMoreItems, entryGetItem, keyGetItem, scanGetItem, gingetbitmap), the consistent-function shims, posting-list compression, ginNewScanKey, initGinState, ginExtractEntries, vacuum (ginVacuumItemPointers, ginVacuumPostingTreeLeaves, ginDeletePage, ginVacuumEntryPage), and WAL redo;
- GiST: initGISTstate, gistindex_keytest, GISTSearchTreeItemCombiner, the split-support routines (gistunionsubkeyvec, findDontCares, removeDontCares, supportSecondarySplit, gistUserPicksplit), gistchoose, gistcheckpage, gistvacuumcleanup, and WAL redo (gistRedoPageUpdateRecord, gistXLogSplit);
- hash: hashbuild, hashgettuple, hashbulkdelete.]
|
||||
@@ -524,7 +524,7 @@ hashbulkdelete(PG_FUNCTION_ARGS)
|
||||
/*
|
||||
* Read the metapage to fetch original bucket and tuple counts. Also, we
|
||||
* keep a copy of the last-seen metapage so that we can use its
|
||||
* hashm_spares[] values to compute bucket page addresses. This is a bit
|
||||
* hashm_spares[] values to compute bucket page addresses. This is a bit
|
||||
* hokey but perfectly safe, since the interesting entries in the spares
|
||||
* array cannot change under us; and it beats rereading the metapage for
|
||||
* each bucket.
|
||||
@@ -655,7 +655,7 @@ loop_top:
|
||||
{
|
||||
/*
|
||||
* Otherwise, our count is untrustworthy since we may have
|
||||
* double-scanned tuples in split buckets. Proceed by dead-reckoning.
|
||||
* double-scanned tuples in split buckets. Proceed by dead-reckoning.
|
||||
* (Note: we still return estimated_count = false, because using this
|
||||
* count is better than not updating reltuples at all.)
|
||||
*/
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
* src/backend/access/hash/hashfunc.c
|
||||
*
|
||||
* NOTES
|
||||
* These functions are stored in pg_amproc. For each operator class
|
||||
* These functions are stored in pg_amproc. For each operator class
|
||||
* defined for hash indexes, they compute the hash value of the argument.
|
||||
*
|
||||
* Additional hash functions appear in /utils/adt/ files for various
|
||||
@@ -158,7 +158,7 @@ hashtext(PG_FUNCTION_ARGS)
|
||||
/*
|
||||
* Note: this is currently identical in behavior to hashvarlena, but keep
|
||||
* it as a separate function in case we someday want to do something
|
||||
* different in non-C locales. (See also hashbpchar, if so.)
|
||||
* different in non-C locales. (See also hashbpchar, if so.)
|
||||
*/
|
||||
result = hash_any((unsigned char *) VARDATA_ANY(key),
|
||||
VARSIZE_ANY_EXHDR(key));
|
||||
@@ -236,7 +236,7 @@ hashvarlena(PG_FUNCTION_ARGS)
|
||||
*
|
||||
* This allows some parallelism. Read-after-writes are good at doubling
|
||||
* the number of bits affected, so the goal of mixing pulls in the opposite
|
||||
* direction from the goal of parallelism. I did what I could. Rotates
|
||||
* direction from the goal of parallelism. I did what I could. Rotates
|
||||
* seem to cost as much as shifts on every machine I could lay my hands on,
|
||||
* and rotates are much kinder to the top and bottom bits, so I used rotates.
|
||||
*----------
|
||||
@@ -270,7 +270,7 @@ hashvarlena(PG_FUNCTION_ARGS)
|
||||
* substantial performance increase since final() does not need to
|
||||
* do well in reverse, but is does need to affect all output bits.
|
||||
* mix(), on the other hand, does not need to affect all output
|
||||
* bits (affecting 32 bits is enough). The original hash function had
|
||||
* bits (affecting 32 bits is enough). The original hash function had
|
||||
* a single mixing operation that had to satisfy both sets of requirements
|
||||
* and was slower as a result.
|
||||
*----------
|
||||
@@ -291,7 +291,7 @@ hashvarlena(PG_FUNCTION_ARGS)
|
||||
* k : the key (the unaligned variable-length array of bytes)
|
||||
* len : the length of the key, counting by bytes
|
||||
*
|
||||
* Returns a uint32 value. Every bit of the key affects every bit of
|
||||
* Returns a uint32 value. Every bit of the key affects every bit of
|
||||
* the return value. Every 1-bit and 2-bit delta achieves avalanche.
|
||||
* About 6*len+35 instructions. The best hash table sizes are powers
|
||||
* of 2. There is no need to do mod a prime (mod is sooo slow!).
|
||||
|
||||
@@ -89,7 +89,7 @@ _hash_doinsert(Relation rel, IndexTuple itup)
|
||||
|
||||
/*
|
||||
* If the previous iteration of this loop locked what is still the
|
||||
* correct target bucket, we are done. Otherwise, drop any old lock
|
||||
* correct target bucket, we are done. Otherwise, drop any old lock
|
||||
* and lock what now appears to be the correct bucket.
|
||||
*/
|
||||
if (retry)
|
||||
|
||||
@@ -80,7 +80,7 @@ blkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno)
|
||||
*
|
||||
* Add an overflow page to the bucket whose last page is pointed to by 'buf'.
|
||||
*
|
||||
* On entry, the caller must hold a pin but no lock on 'buf'. The pin is
|
||||
* On entry, the caller must hold a pin but no lock on 'buf'. The pin is
|
||||
* dropped before exiting (we assume the caller is not interested in 'buf'
|
||||
* anymore). The returned overflow page will be pinned and write-locked;
|
||||
* it is guaranteed to be empty.
|
||||
@@ -89,12 +89,12 @@ blkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno)
|
||||
* That buffer is returned in the same state.
|
||||
*
|
||||
* The caller must hold at least share lock on the bucket, to ensure that
|
||||
* no one else tries to compact the bucket meanwhile. This guarantees that
|
||||
* no one else tries to compact the bucket meanwhile. This guarantees that
|
||||
* 'buf' won't stop being part of the bucket while it's unlocked.
|
||||
*
|
||||
* NB: since this could be executed concurrently by multiple processes,
|
||||
* one should not assume that the returned overflow page will be the
|
||||
* immediate successor of the originally passed 'buf'. Additional overflow
|
||||
* immediate successor of the originally passed 'buf'. Additional overflow
|
||||
* pages might have been added to the bucket chain in between.
|
||||
*/
|
||||
Buffer
|
||||
@@ -157,7 +157,7 @@ _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf)
|
||||
/*
|
||||
* _hash_getovflpage()
|
||||
*
|
||||
* Find an available overflow page and return it. The returned buffer
|
||||
* Find an available overflow page and return it. The returned buffer
|
||||
* is pinned and write-locked, and has had _hash_pageinit() applied,
|
||||
* but it is caller's responsibility to fill the special space.
|
||||
*
|
||||
@@ -253,7 +253,7 @@ _hash_getovflpage(Relation rel, Buffer metabuf)
|
||||
* We create the new bitmap page with all pages marked "in use".
|
||||
* Actually two pages in the new bitmap's range will exist
|
||||
* immediately: the bitmap page itself, and the following page which
|
||||
* is the one we return to the caller. Both of these are correctly
|
||||
* is the one we return to the caller. Both of these are correctly
|
||||
* marked "in use". Subsequent pages do not exist yet, but it is
|
||||
* convenient to pre-mark them as "in use" too.
|
||||
*/
|
||||
@@ -284,7 +284,7 @@ _hash_getovflpage(Relation rel, Buffer metabuf)
|
||||
metap->hashm_spares[splitnum]++;
|
||||
|
||||
/*
|
||||
* Adjust hashm_firstfree to avoid redundant searches. But don't risk
|
||||
* Adjust hashm_firstfree to avoid redundant searches. But don't risk
|
||||
* changing it if someone moved it while we were searching bitmap pages.
|
||||
*/
|
||||
if (metap->hashm_firstfree == orig_firstfree)
|
||||
@@ -313,7 +313,7 @@ found:
|
||||
blkno = bitno_to_blkno(metap, bit);
|
||||
|
||||
/*
|
||||
* Adjust hashm_firstfree to avoid redundant searches. But don't risk
|
||||
* Adjust hashm_firstfree to avoid redundant searches. But don't risk
|
||||
* changing it if someone moved it while we were searching bitmap pages.
|
||||
*/
|
||||
if (metap->hashm_firstfree == orig_firstfree)
|
||||
@@ -494,7 +494,7 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf,
|
||||
/*
|
||||
* _hash_initbitmap()
|
||||
*
|
||||
* Initialize a new bitmap page. The metapage has a write-lock upon
|
||||
* Initialize a new bitmap page. The metapage has a write-lock upon
|
||||
* entering the function, and must be written by caller after return.
|
||||
*
|
||||
* 'blkno' is the block number of the new bitmap page.
|
||||
|
||||
@@ -49,7 +49,7 @@ static void _hash_splitbucket(Relation rel, Buffer metabuf,
|
||||
* of the locking rules). However, we can skip taking lmgr locks when the
|
||||
* index is local to the current backend (ie, either temp or new in the
|
||||
* current transaction). No one else can see it, so there's no reason to
|
||||
* take locks. We still take buffer-level locks, but not lmgr locks.
|
||||
* take locks. We still take buffer-level locks, but not lmgr locks.
|
||||
*/
|
||||
#define USELOCKING(rel) (!RELATION_IS_LOCAL(rel))
|
||||
|
||||
@@ -136,7 +136,7 @@ _hash_getbuf(Relation rel, BlockNumber blkno, int access, int flags)
|
||||
*
|
||||
* This must be used only to fetch pages that are known to be before
|
||||
* the index's filesystem EOF, but are to be filled from scratch.
|
||||
* _hash_pageinit() is applied automatically. Otherwise it has
|
||||
* _hash_pageinit() is applied automatically. Otherwise it has
|
||||
* effects similar to _hash_getbuf() with access = HASH_WRITE.
|
||||
*
|
||||
* When this routine returns, a write lock is set on the
|
||||
@@ -344,7 +344,7 @@ _hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum)
|
||||
/*
|
||||
* Determine the target fill factor (in tuples per bucket) for this index.
|
||||
* The idea is to make the fill factor correspond to pages about as full
|
||||
* as the user-settable fillfactor parameter says. We can compute it
|
||||
* as the user-settable fillfactor parameter says. We can compute it
|
||||
* exactly since the index datatype (i.e. uint32 hash key) is fixed-width.
|
||||
*/
|
||||
data_width = sizeof(uint32);
|
||||
@@ -377,7 +377,7 @@ _hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum)
|
||||
/*
|
||||
* We initialize the metapage, the first N bucket pages, and the first
|
||||
* bitmap page in sequence, using _hash_getnewbuf to cause smgrextend()
|
||||
* calls to occur. This ensures that the smgr level has the right idea of
|
||||
* calls to occur. This ensures that the smgr level has the right idea of
|
||||
* the physical index length.
|
||||
*/
|
||||
metabuf = _hash_getnewbuf(rel, HASH_METAPAGE, forkNum);
|
||||
@@ -545,7 +545,7 @@ _hash_expandtable(Relation rel, Buffer metabuf)
|
||||
|
||||
/*
|
||||
* Determine which bucket is to be split, and attempt to lock the old
|
||||
* bucket. If we can't get the lock, give up.
|
||||
* bucket. If we can't get the lock, give up.
|
||||
*
|
||||
* The lock protects us against other backends, but not against our own
|
||||
* backend. Must check for active scans separately.
|
||||
@@ -603,7 +603,7 @@ _hash_expandtable(Relation rel, Buffer metabuf)
|
||||
}
|
||||
|
||||
/*
|
||||
* Okay to proceed with split. Update the metapage bucket mapping info.
|
||||
* Okay to proceed with split. Update the metapage bucket mapping info.
|
||||
*
|
||||
* Since we are scribbling on the metapage data right in the shared
|
||||
* buffer, any failure in this next little bit leaves us with a big
|
||||
@@ -641,7 +641,7 @@ _hash_expandtable(Relation rel, Buffer metabuf)
|
||||
* Copy bucket mapping info now; this saves re-accessing the meta page
|
||||
* inside _hash_splitbucket's inner loop. Note that once we drop the
|
||||
* split lock, other splits could begin, so these values might be out of
|
||||
* date before _hash_splitbucket finishes. That's okay, since all it
|
||||
* date before _hash_splitbucket finishes. That's okay, since all it
|
||||
* needs is to tell which of these two buckets to map hashkeys into.
|
||||
*/
|
||||
maxbucket = metap->hashm_maxbucket;
|
||||
@@ -876,7 +876,7 @@ _hash_splitbucket(Relation rel,
|
||||
|
||||
/*
|
||||
* We're at the end of the old bucket chain, so we're done partitioning
|
||||
* the tuples. Before quitting, call _hash_squeezebucket to ensure the
|
||||
* the tuples. Before quitting, call _hash_squeezebucket to ensure the
|
||||
* tuples remaining in the old bucket (including the overflow pages) are
|
||||
* packed as tightly as possible. The new bucket is already tight.
|
||||
*/
|
||||
|
||||
@@ -210,7 +210,7 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
|
||||
|
||||
/*
|
||||
* If the previous iteration of this loop locked what is still the
|
||||
* correct target bucket, we are done. Otherwise, drop any old lock
|
||||
* correct target bucket, we are done. Otherwise, drop any old lock
|
||||
* and lock what now appears to be the correct bucket.
|
||||
*/
|
||||
if (retry)
|
||||
@@ -269,7 +269,7 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
|
||||
* _hash_step() -- step to the next valid item in a scan in the bucket.
|
||||
*
|
||||
* If no valid record exists in the requested direction, return
|
||||
* false. Else, return true and set the hashso_curpos for the
|
||||
* false. Else, return true and set the hashso_curpos for the
|
||||
* scan to the right thing.
|
||||
*
|
||||
* 'bufP' points to the current buffer, which is pinned and read-locked.
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
* thrashing. We use tuplesort.c to sort the given index tuples into order.
|
||||
*
|
||||
* Note: if the number of rows in the table has been underestimated,
|
||||
* bucket splits may occur during the index build. In that case we'd
|
||||
* bucket splits may occur during the index build. In that case we'd
|
||||
* be inserting into two or more buckets for each possible masked-off
|
||||
* hash code value. That's no big problem though, since we'll still have
|
||||
* plenty of locality of access.
|
||||
@@ -52,7 +52,7 @@ _h_spoolinit(Relation heap, Relation index, uint32 num_buckets)
|
||||
hspool->index = index;
|
||||
|
||||
/*
|
||||
* Determine the bitmask for hash code values. Since there are currently
|
||||
* Determine the bitmask for hash code values. Since there are currently
|
||||
* num_buckets buckets in the index, the appropriate mask can be computed
|
||||
* as follows.
|
||||
*
|
||||
|
||||
@@ -160,7 +160,7 @@ _hash_checkpage(Relation rel, Buffer buf, int flags)
|
||||
/*
|
||||
* ReadBuffer verifies that every newly-read page passes
|
||||
* PageHeaderIsValid, which means it either contains a reasonably sane
|
||||
* page header or is all-zero. We have to defend against the all-zero
|
||||
* page header or is all-zero. We have to defend against the all-zero
|
||||
* case, however.
|
||||
*/
|
||||
if (PageIsNew(page))
|
||||
@@ -280,7 +280,7 @@ _hash_form_tuple(Relation index, Datum *values, bool *isnull)
|
||||
*
|
||||
* Returns the offset of the first index entry having hashkey >= hash_value,
|
||||
* or the page's max offset plus one if hash_value is greater than all
|
||||
* existing hash keys in the page. This is the appropriate place to start
|
||||
* existing hash keys in the page. This is the appropriate place to start
|
||||
* a search, or to insert a new item.
|
||||
*/
|
||||
OffsetNumber
|
||||
|
||||
@@ -88,11 +88,11 @@ static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf,
|
||||
HeapTuple newtup, HeapTuple old_key_tup,
|
||||
bool all_visible_cleared, bool new_all_visible_cleared);
|
||||
static void HeapSatisfiesHOTandKeyUpdate(Relation relation,
|
||||
Bitmapset *hot_attrs,
|
||||
Bitmapset *key_attrs, Bitmapset *id_attrs,
|
||||
bool *satisfies_hot, bool *satisfies_key,
|
||||
bool *satisfies_id,
|
||||
HeapTuple oldtup, HeapTuple newtup);
|
||||
Bitmapset *hot_attrs,
|
||||
Bitmapset *key_attrs, Bitmapset *id_attrs,
|
||||
bool *satisfies_hot, bool *satisfies_key,
|
||||
bool *satisfies_id,
|
||||
HeapTuple oldtup, HeapTuple newtup);
|
||||
static void compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
|
||||
uint16 old_infomask2, TransactionId add_to_xmax,
|
||||
LockTupleMode mode, bool is_update,
|
||||
@@ -113,7 +113,7 @@ static bool ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status
|
||||
XLTW_Oper oper, int *remaining);
|
||||
static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup);
|
||||
static HeapTuple ExtractReplicaIdentity(Relation rel, HeapTuple tup, bool key_modified,
|
||||
bool *copy);
|
||||
bool *copy);
|
||||
|
||||
|
||||
/*
|
||||
@@ -213,7 +213,7 @@ initscan(HeapScanDesc scan, ScanKey key, bool is_rescan)
|
||||
* while the scan is in progress will be invisible to my snapshot anyway.
|
||||
* (That is not true when using a non-MVCC snapshot. However, we couldn't
|
||||
* guarantee to return tuples added after scan start anyway, since they
|
||||
* might go into pages we already scanned. To guarantee consistent
|
||||
* might go into pages we already scanned. To guarantee consistent
|
||||
* results for a non-MVCC snapshot, the caller must hold some higher-level
|
||||
* lock that ensures the interesting tuple(s) won't change.)
|
||||
*/
|
||||
@@ -221,7 +221,7 @@ initscan(HeapScanDesc scan, ScanKey key, bool is_rescan)
|
||||
|
||||
/*
|
||||
* If the table is large relative to NBuffers, use a bulk-read access
|
||||
* strategy and enable synchronized scanning (see syncscan.c). Although
|
||||
* strategy and enable synchronized scanning (see syncscan.c). Although
|
||||
* the thresholds for these features could be different, we make them the
|
||||
* same so that there are only two behaviors to tune rather than four.
|
||||
* (However, some callers need to be able to disable one or both of these
|
||||
@@ -325,7 +325,7 @@ heapgetpage(HeapScanDesc scan, BlockNumber page)
|
||||
}
|
||||
|
||||
/*
|
||||
* Be sure to check for interrupts at least once per page. Checks at
|
||||
* Be sure to check for interrupts at least once per page. Checks at
|
||||
* higher code levels won't be able to stop a seqscan that encounters many
|
||||
* pages' worth of consecutive dead tuples.
|
||||
*/
|
||||
@@ -349,7 +349,7 @@ heapgetpage(HeapScanDesc scan, BlockNumber page)
|
||||
|
||||
/*
|
||||
* We must hold share lock on the buffer content while examining tuple
|
||||
* visibility. Afterwards, however, the tuples we have found to be
|
||||
* visibility. Afterwards, however, the tuples we have found to be
|
||||
* visible are guaranteed good as long as we hold the buffer pin.
|
||||
*/
|
||||
LockBuffer(buffer, BUFFER_LOCK_SHARE);
|
||||
@@ -1126,7 +1126,7 @@ relation_openrv(const RangeVar *relation, LOCKMODE lockmode)
|
||||
*
|
||||
* Same as relation_openrv, but with an additional missing_ok argument
|
||||
* allowing a NULL return rather than an error if the relation is not
|
||||
* found. (Note that some other causes, such as permissions problems,
|
||||
* found. (Note that some other causes, such as permissions problems,
|
||||
* will still result in an ereport.)
|
||||
* ----------------
|
||||
*/
|
||||
@@ -1740,7 +1740,7 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
|
||||
|
||||
/*
|
||||
* When first_call is true (and thus, skip is initially false) we'll
|
||||
* return the first tuple we find. But on later passes, heapTuple
|
||||
* return the first tuple we find. But on later passes, heapTuple
|
||||
* will initially be pointing to the tuple we returned last time.
|
||||
* Returning it again would be incorrect (and would loop forever), so
|
||||
* we skip it and return the next match we find.
|
||||
@@ -1834,7 +1834,7 @@ heap_hot_search(ItemPointer tid, Relation relation, Snapshot snapshot,
|
||||
* possibly uncommitted version.
|
||||
*
|
||||
* *tid is both an input and an output parameter: it is updated to
|
||||
* show the latest version of the row. Note that it will not be changed
|
||||
* show the latest version of the row. Note that it will not be changed
|
||||
* if no version of the row passes the snapshot test.
|
||||
*/
|
||||
void
|
||||
@@ -1955,7 +1955,7 @@ heap_get_latest_tid(Relation relation,
|
||||
*
|
||||
* This is called after we have waited for the XMAX transaction to terminate.
|
||||
* If the transaction aborted, we guarantee the XMAX_INVALID hint bit will
|
||||
* be set on exit. If the transaction committed, we set the XMAX_COMMITTED
|
||||
* be set on exit. If the transaction committed, we set the XMAX_COMMITTED
|
||||
* hint bit if possible --- but beware that that may not yet be possible,
|
||||
* if the transaction committed asynchronously.
|
||||
*
|
||||
@@ -2042,7 +2042,7 @@ FreeBulkInsertState(BulkInsertState bistate)
|
||||
* The return value is the OID assigned to the tuple (either here or by the
|
||||
* caller), or InvalidOid if no OID. The header fields of *tup are updated
|
||||
* to match the stored tuple; in particular tup->t_self receives the actual
|
||||
* TID where the tuple was stored. But note that any toasting of fields
|
||||
* TID where the tuple was stored. But note that any toasting of fields
|
||||
* within the tuple data is NOT reflected into *tup.
|
||||
*/
|
||||
Oid
|
||||
@@ -2071,7 +2071,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
|
||||
* For a heap insert, we only need to check for table-level SSI locks. Our
|
||||
* new tuple can't possibly conflict with existing tuple locks, and heap
|
||||
* page locks are only consolidated versions of tuple locks; they do not
|
||||
* lock "gaps" as index page locks do. So we don't need to identify a
|
||||
* lock "gaps" as index page locks do. So we don't need to identify a
|
||||
* buffer before making the call.
|
||||
*/
|
||||
CheckForSerializableConflictIn(relation, NULL, InvalidBuffer);
|
||||
@@ -2123,8 +2123,8 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
|
||||
bool need_tuple_data;
|
||||
|
||||
/*
|
||||
* For logical decoding, we need the tuple even if we're doing a
|
||||
* full page write, so make sure to log it separately. (XXX We could
|
||||
* For logical decoding, we need the tuple even if we're doing a full
|
||||
* page write, so make sure to log it separately. (XXX We could
|
||||
* alternatively store a pointer into the FPW).
|
||||
*
|
||||
* Also, if this is a catalog, we need to transmit combocids to
|
||||
@@ -2165,9 +2165,9 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
|
||||
rdata[2].next = NULL;
|
||||
|
||||
/*
|
||||
* Make a separate rdata entry for the tuple's buffer if we're
|
||||
* doing logical decoding, so that an eventual FPW doesn't
|
||||
* remove the tuple's data.
|
||||
* Make a separate rdata entry for the tuple's buffer if we're doing
|
||||
* logical decoding, so that an eventual FPW doesn't remove the
|
||||
* tuple's data.
|
||||
*/
|
||||
if (need_tuple_data)
|
||||
{
|
||||
@@ -2248,7 +2248,7 @@ heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid,
|
||||
|
||||
/*
|
||||
* If the object id of this tuple has already been assigned, trust the
|
||||
* caller. There are a couple of ways this can happen. At initial db
|
||||
* caller. There are a couple of ways this can happen. At initial db
|
||||
* creation, the backend program sets oids for tuples. When we define
|
||||
* an index, we set the oid. Finally, in the future, we may allow
|
||||
* users to set their own object ids in order to support a persistent
|
||||
@@ -2342,7 +2342,7 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples,
|
||||
* For a heap insert, we only need to check for table-level SSI locks. Our
|
||||
* new tuple can't possibly conflict with existing tuple locks, and heap
|
||||
* page locks are only consolidated versions of tuple locks; they do not
|
||||
* lock "gaps" as index page locks do. So we don't need to identify a
|
||||
* lock "gaps" as index page locks do. So we don't need to identify a
|
||||
* buffer before making the call.
|
||||
*/
|
||||
CheckForSerializableConflictIn(relation, NULL, InvalidBuffer);
|
||||
@@ -2356,7 +2356,7 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples,
|
||||
int nthispage;
|
||||
|
||||
/*
|
||||
* Find buffer where at least the next tuple will fit. If the page is
|
||||
* Find buffer where at least the next tuple will fit. If the page is
|
||||
* all-visible, this will also pin the requisite visibility map page.
|
||||
*/
|
||||
buffer = RelationGetBufferForTuple(relation, heaptuples[ndone]->t_len,
|
||||
@@ -2487,9 +2487,9 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples,
|
||||
rdata[1].next = NULL;
|
||||
|
||||
/*
|
||||
* Make a separate rdata entry for the tuple's buffer if
|
||||
* we're doing logical decoding, so that an eventual FPW
|
||||
* doesn't remove the tuple's data.
|
||||
* Make a separate rdata entry for the tuple's buffer if we're
|
||||
* doing logical decoding, so that an eventual FPW doesn't remove
|
||||
* the tuple's data.
|
||||
*/
|
||||
if (need_tuple_data)
|
||||
{
|
||||
@@ -2597,8 +2597,8 @@ compute_infobits(uint16 infomask, uint16 infomask2)
|
||||
static inline bool
|
||||
xmax_infomask_changed(uint16 new_infomask, uint16 old_infomask)
|
||||
{
|
||||
const uint16 interesting =
|
||||
HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY | HEAP_LOCK_MASK;
|
||||
const uint16 interesting =
|
||||
HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY | HEAP_LOCK_MASK;
|
||||
|
||||
if ((new_infomask & interesting) != (old_infomask & interesting))
|
||||
return true;
|
||||
@@ -2650,7 +2650,7 @@ heap_delete(Relation relation, ItemPointer tid,
|
||||
bool have_tuple_lock = false;
|
||||
bool iscombo;
|
||||
bool all_visible_cleared = false;
|
||||
HeapTuple old_key_tuple = NULL; /* replica identity of the tuple */
|
||||
HeapTuple old_key_tuple = NULL; /* replica identity of the tuple */
|
||||
bool old_key_copied = false;
|
||||
|
||||
Assert(ItemPointerIsValid(tid));
|
||||
@@ -2751,10 +2751,10 @@ l1:
|
||||
/*
|
||||
* You might think the multixact is necessarily done here, but not
|
||||
* so: it could have surviving members, namely our own xact or
|
||||
* other subxacts of this backend. It is legal for us to delete
|
||||
* other subxacts of this backend. It is legal for us to delete
|
||||
* the tuple in either case, however (the latter case is
|
||||
* essentially a situation of upgrading our former shared lock to
|
||||
* exclusive). We don't bother changing the on-disk hint bits
|
||||
* exclusive). We don't bother changing the on-disk hint bits
|
||||
* since we are about to overwrite the xmax altogether.
|
||||
*/
|
||||
}
|
||||
@@ -2836,7 +2836,7 @@ l1:
|
||||
* If this is the first possibly-multixact-able operation in the current
|
||||
* transaction, set my per-backend OldestMemberMXactId setting. We can be
|
||||
* certain that the transaction will never become a member of any older
|
||||
* MultiXactIds than that. (We have to do this even if we end up just
|
||||
* MultiXactIds than that. (We have to do this even if we end up just
|
||||
* using our own TransactionId below, since some other backend could
|
||||
* incorporate our XID into a MultiXact immediately afterwards.)
|
||||
*/
|
||||
@@ -2852,7 +2852,7 @@ l1:
|
||||
/*
|
||||
* If this transaction commits, the tuple will become DEAD sooner or
|
||||
* later. Set flag that this page is a candidate for pruning once our xid
|
||||
* falls below the OldestXmin horizon. If the transaction finally aborts,
|
||||
* falls below the OldestXmin horizon. If the transaction finally aborts,
|
||||
* the subsequent page pruning will be a no-op and the hint will be
|
||||
* cleared.
|
||||
*/
|
||||
@@ -2919,7 +2919,7 @@ l1:
|
||||
xlhdr.t_hoff = old_key_tuple->t_data->t_hoff;
|
||||
|
||||
rdata[1].next = &(rdata[2]);
|
||||
rdata[2].data = (char*)&xlhdr;
|
||||
rdata[2].data = (char *) &xlhdr;
|
||||
rdata[2].len = SizeOfHeapHeader;
|
||||
rdata[2].buffer = InvalidBuffer;
|
||||
rdata[2].next = NULL;
|
||||
@@ -2994,7 +2994,7 @@ l1:
|
||||
*
|
||||
* This routine may be used to delete a tuple when concurrent updates of
|
||||
* the target tuple are not expected (for example, because we have a lock
|
||||
* on the relation associated with the tuple). Any failure is reported
|
||||
* on the relation associated with the tuple). Any failure is reported
|
||||
* via ereport().
|
||||
*/
|
||||
void
|
||||
@@ -3110,7 +3110,7 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
|
||||
/*
|
||||
* Fetch the list of attributes to be checked for HOT update. This is
|
||||
* wasted effort if we fail to update or have to put the new tuple on a
|
||||
* different page. But we must compute the list before obtaining buffer
|
||||
* different page. But we must compute the list before obtaining buffer
|
||||
* lock --- in the worst case, if we are doing an update on one of the
|
||||
* relevant system catalogs, we could deadlock if we try to fetch the list
|
||||
* later. In any case, the relcache caches the data so this is usually
|
||||
@@ -3122,7 +3122,7 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
|
||||
hot_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_ALL);
|
||||
key_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_KEY);
|
||||
id_attrs = RelationGetIndexAttrBitmap(relation,
|
||||
INDEX_ATTR_BITMAP_IDENTITY_KEY);
|
||||
INDEX_ATTR_BITMAP_IDENTITY_KEY);
|
||||
|
||||
block = ItemPointerGetBlockNumber(otid);
|
||||
buffer = ReadBuffer(relation, block);
|
||||
@@ -3193,7 +3193,7 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
|
||||
* If this is the first possibly-multixact-able operation in the
|
||||
* current transaction, set my per-backend OldestMemberMXactId
|
||||
* setting. We can be certain that the transaction will never become a
|
||||
* member of any older MultiXactIds than that. (We have to do this
|
||||
* member of any older MultiXactIds than that. (We have to do this
|
||||
* even if we end up just using our own TransactionId below, since
|
||||
* some other backend could incorporate our XID into a MultiXact
|
||||
* immediately afterwards.)
|
||||
@@ -3238,7 +3238,7 @@ l2:
|
||||
/*
|
||||
* XXX note that we don't consider the "no wait" case here. This
|
||||
* isn't a problem currently because no caller uses that case, but it
|
||||
* should be fixed if such a caller is introduced. It wasn't a
|
||||
* should be fixed if such a caller is introduced. It wasn't a
|
||||
* problem previously because this code would always wait, but now
|
||||
* that some tuple locks do not conflict with one of the lock modes we
|
||||
* use, it is possible that this case is interesting to handle
|
||||
@@ -3276,7 +3276,7 @@ l2:
|
||||
* it as locker, unless it is gone completely.
|
||||
*
|
||||
* If it's not a multi, we need to check for sleeping conditions
|
||||
* before actually going to sleep. If the update doesn't conflict
|
||||
* before actually going to sleep. If the update doesn't conflict
|
||||
* with the locks, we just continue without sleeping (but making sure
|
||||
* it is preserved).
|
||||
*/
|
||||
@@ -3302,10 +3302,10 @@ l2:
|
||||
goto l2;
|
||||
|
||||
/*
|
||||
* Note that the multixact may not be done by now. It could have
|
||||
* Note that the multixact may not be done by now. It could have
|
||||
* surviving members; our own xact or other subxacts of this
|
||||
* backend, and also any other concurrent transaction that locked
|
||||
* the tuple with KeyShare if we only got TupleLockUpdate. If
|
||||
* the tuple with KeyShare if we only got TupleLockUpdate. If
|
||||
* this is the case, we have to be careful to mark the updated
|
||||
* tuple with the surviving members in Xmax.
|
||||
*
|
||||
@@ -3512,7 +3512,7 @@ l2:
|
||||
* If the toaster needs to be activated, OR if the new tuple will not fit
|
||||
* on the same page as the old, then we need to release the content lock
|
||||
* (but not the pin!) on the old tuple's buffer while we are off doing
|
||||
* TOAST and/or table-file-extension work. We must mark the old tuple to
|
||||
* TOAST and/or table-file-extension work. We must mark the old tuple to
|
||||
* show that it's already being updated, else other processes may try to
|
||||
* update it themselves.
|
||||
*
|
||||
@@ -3578,7 +3578,7 @@ l2:
|
||||
* there's more free now than before.
|
||||
*
|
||||
* What's more, if we need to get a new page, we will need to acquire
|
||||
* buffer locks on both old and new pages. To avoid deadlock against
|
||||
* buffer locks on both old and new pages. To avoid deadlock against
|
||||
* some other backend trying to get the same two locks in the other
|
||||
* order, we must be consistent about the order we get the locks in.
|
||||
* We use the rule "lock the lower-numbered page of the relation
|
||||
@@ -3638,7 +3638,7 @@ l2:
|
||||
|
||||
/*
|
||||
* At this point newbuf and buffer are both pinned and locked, and newbuf
|
||||
* has enough space for the new tuple. If they are the same buffer, only
|
||||
* has enough space for the new tuple. If they are the same buffer, only
|
||||
* one pin is held.
|
||||
*/
|
||||
|
||||
@@ -3646,7 +3646,7 @@ l2:
|
||||
{
|
||||
/*
|
||||
* Since the new tuple is going into the same page, we might be able
|
||||
* to do a HOT update. Check if any of the index columns have been
|
||||
* to do a HOT update. Check if any of the index columns have been
|
||||
* changed. If not, then HOT update is possible.
|
||||
*/
|
||||
if (satisfies_hot)
|
||||
@@ -3672,13 +3672,13 @@ l2:
|
||||
/*
|
||||
* If this transaction commits, the old tuple will become DEAD sooner or
|
||||
* later. Set flag that this page is a candidate for pruning once our xid
|
||||
* falls below the OldestXmin horizon. If the transaction finally aborts,
|
||||
* falls below the OldestXmin horizon. If the transaction finally aborts,
|
||||
* the subsequent page pruning will be a no-op and the hint will be
|
||||
* cleared.
|
||||
*
|
||||
* XXX Should we set hint on newbuf as well? If the transaction aborts,
|
||||
* there would be a prunable tuple in the newbuf; but for now we choose
|
||||
* not to optimize for aborts. Note that heap_xlog_update must be kept in
|
||||
* not to optimize for aborts. Note that heap_xlog_update must be kept in
|
||||
* sync if this decision changes.
|
||||
*/
|
||||
PageSetPrunable(page, xid);
|
||||
@@ -3775,7 +3775,7 @@ l2:
|
||||
* Mark old tuple for invalidation from system caches at next command
|
||||
* boundary, and mark the new tuple for invalidation in case we abort. We
|
||||
* have to do this before releasing the buffer because oldtup is in the
|
||||
* buffer. (heaptup is all in local memory, but it's necessary to process
|
||||
* buffer. (heaptup is all in local memory, but it's necessary to process
|
||||
* both tuple versions in one call to inval.c so we can avoid redundant
|
||||
* sinval messages.)
|
||||
*/
|
||||
@@ -3853,7 +3853,7 @@ heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum,
|
||||
|
||||
/*
|
||||
* Extract the corresponding values. XXX this is pretty inefficient if
|
||||
* there are many indexed columns. Should HeapSatisfiesHOTandKeyUpdate do
|
||||
* there are many indexed columns. Should HeapSatisfiesHOTandKeyUpdate do
|
||||
* a single heap_deform_tuple call on each tuple, instead? But that
|
||||
* doesn't work for system columns ...
|
||||
*/
|
||||
@@ -3876,7 +3876,7 @@ heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum,
|
||||
/*
|
||||
* We do simple binary comparison of the two datums. This may be overly
|
||||
* strict because there can be multiple binary representations for the
|
||||
* same logical value. But we should be OK as long as there are no false
|
||||
* same logical value. But we should be OK as long as there are no false
|
||||
* positives. Using a type-specific equality operator is messy because
|
||||
* there could be multiple notions of equality in different operator
|
||||
* classes; furthermore, we cannot safely invoke user-defined functions
|
||||
@@ -3951,8 +3951,7 @@ HeapSatisfiesHOTandKeyUpdate(Relation relation, Bitmapset *hot_attrs,
|
||||
/*
|
||||
* Since the HOT attributes are a superset of the key attributes and
|
||||
* the key attributes are a superset of the id attributes, this logic
|
||||
* is guaranteed to identify the next column that needs to be
|
||||
* checked.
|
||||
* is guaranteed to identify the next column that needs to be checked.
|
||||
*/
|
||||
if (hot_result && next_hot_attnum > FirstLowInvalidHeapAttributeNumber)
|
||||
check_now = next_hot_attnum;
|
||||
@@ -3981,12 +3980,11 @@ HeapSatisfiesHOTandKeyUpdate(Relation relation, Bitmapset *hot_attrs,
|
||||
}
|
||||
|
||||
/*
|
||||
* Advance the next attribute numbers for the sets that contain
|
||||
* the attribute we just checked. As we work our way through the
|
||||
* columns, the next_attnum values will rise; but when each set
|
||||
* becomes empty, bms_first_member() will return -1 and the attribute
|
||||
* number will end up with a value less than
|
||||
* FirstLowInvalidHeapAttributeNumber.
|
||||
* Advance the next attribute numbers for the sets that contain the
|
||||
* attribute we just checked. As we work our way through the columns,
|
||||
* the next_attnum values will rise; but when each set becomes empty,
|
||||
* bms_first_member() will return -1 and the attribute number will end
|
||||
* up with a value less than FirstLowInvalidHeapAttributeNumber.
|
||||
*/
|
||||
if (hot_result && check_now == next_hot_attnum)
|
||||
{
|
||||
@@ -4015,7 +4013,7 @@ HeapSatisfiesHOTandKeyUpdate(Relation relation, Bitmapset *hot_attrs,
|
||||
*
|
||||
* This routine may be used to update a tuple when concurrent updates of
|
||||
* the target tuple are not expected (for example, because we have a lock
|
||||
* on the relation associated with the tuple). Any failure is reported
|
||||
* on the relation associated with the tuple). Any failure is reported
|
||||
* via ereport().
|
||||
*/
|
||||
void
|
||||
@@ -4057,7 +4055,7 @@ simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup)
|
||||
static MultiXactStatus
|
||||
get_mxact_status_for_lock(LockTupleMode mode, bool is_update)
|
||||
{
|
||||
int retval;
|
||||
int retval;
|
||||
|
||||
if (is_update)
|
||||
retval = tupleLockExtraInfo[mode].updstatus;
|
||||
@@ -4239,15 +4237,15 @@ l3:
|
||||
* However, if there are updates, we need to walk the update chain
|
||||
* to mark future versions of the row as locked, too. That way,
|
||||
* if somebody deletes that future version, we're protected
|
||||
* against the key going away. This locking of future versions
|
||||
* against the key going away. This locking of future versions
|
||||
* could block momentarily, if a concurrent transaction is
|
||||
* deleting a key; or it could return a value to the effect that
|
||||
* the transaction deleting the key has already committed. So we
|
||||
* the transaction deleting the key has already committed. So we
|
||||
* do this before re-locking the buffer; otherwise this would be
|
||||
* prone to deadlocks.
|
||||
*
|
||||
* Note that the TID we're locking was grabbed before we unlocked
|
||||
* the buffer. For it to change while we're not looking, the
|
||||
* the buffer. For it to change while we're not looking, the
|
||||
* other properties we're testing for below after re-locking the
|
||||
* buffer would also change, in which case we would restart this
|
||||
* loop above.
|
||||
@@ -4472,7 +4470,7 @@ l3:
|
||||
* Of course, the multixact might not be done here: if we're
|
||||
* requesting a light lock mode, other transactions with light
|
||||
* locks could still be alive, as well as locks owned by our
|
||||
* own xact or other subxacts of this backend. We need to
|
||||
* own xact or other subxacts of this backend. We need to
|
||||
* preserve the surviving MultiXact members. Note that it
|
||||
* isn't absolutely necessary in the latter case, but doing so
|
||||
* is simpler.
|
||||
@@ -4516,7 +4514,7 @@ l3:
|
||||
/*
|
||||
* xwait is done, but if xwait had just locked the tuple then
|
||||
* some other xact could update this tuple before we get to
|
||||
* this point. Check for xmax change, and start over if so.
|
||||
* this point. Check for xmax change, and start over if so.
|
||||
*/
|
||||
if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
|
||||
!TransactionIdEquals(
|
||||
@@ -4525,7 +4523,7 @@ l3:
|
||||
goto l3;
|
||||
|
||||
/*
|
||||
* Otherwise check if it committed or aborted. Note we cannot
|
||||
* Otherwise check if it committed or aborted. Note we cannot
|
||||
* be here if the tuple was only locked by somebody who didn't
|
||||
* conflict with us; that should have been handled above. So
|
||||
* that transaction must necessarily be gone by now.
|
||||
@@ -4605,7 +4603,7 @@ failed:
|
||||
* If this is the first possibly-multixact-able operation in the current
|
||||
* transaction, set my per-backend OldestMemberMXactId setting. We can be
|
||||
* certain that the transaction will never become a member of any older
|
||||
* MultiXactIds than that. (We have to do this even if we end up just
|
||||
* MultiXactIds than that. (We have to do this even if we end up just
|
||||
* using our own TransactionId below, since some other backend could
|
||||
* incorporate our XID into a MultiXact immediately afterwards.)
|
||||
*/
|
||||
@@ -4641,7 +4639,7 @@ failed:
|
||||
HeapTupleHeaderSetXmax(tuple->t_data, xid);
|
||||
|
||||
/*
|
||||
* Make sure there is no forward chain link in t_ctid. Note that in the
|
||||
* Make sure there is no forward chain link in t_ctid. Note that in the
|
||||
* cases where the tuple has been updated, we must not overwrite t_ctid,
|
||||
* because it was set by the updater. Moreover, if the tuple has been
|
||||
* updated, we need to follow the update chain to lock the new versions of
|
||||
@@ -4653,8 +4651,8 @@ failed:
|
||||
MarkBufferDirty(*buffer);
|
||||
|
||||
/*
|
||||
* XLOG stuff. You might think that we don't need an XLOG record because
|
||||
* there is no state change worth restoring after a crash. You would be
|
||||
* XLOG stuff. You might think that we don't need an XLOG record because
|
||||
* there is no state change worth restoring after a crash. You would be
|
||||
* wrong however: we have just written either a TransactionId or a
|
||||
* MultiXactId that may never have been seen on disk before, and we need
|
||||
* to make sure that there are XLOG entries covering those ID numbers.
|
||||
@@ -4818,7 +4816,7 @@ l5:
|
||||
* If the XMAX is already a MultiXactId, then we need to expand it to
|
||||
* include add_to_xmax; but if all the members were lockers and are
|
||||
* all gone, we can do away with the IS_MULTI bit and just set
|
||||
* add_to_xmax as the only locker/updater. If all lockers are gone
|
||||
* add_to_xmax as the only locker/updater. If all lockers are gone
|
||||
* and we have an updater that aborted, we can also do without a
|
||||
* multi.
|
||||
*
|
||||
@@ -4881,7 +4879,7 @@ l5:
|
||||
*/
|
||||
MultiXactStatus new_status;
|
||||
MultiXactStatus old_status;
|
||||
LockTupleMode old_mode;
|
||||
LockTupleMode old_mode;
|
||||
|
||||
if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))
|
||||
{
|
||||
@@ -4900,8 +4898,8 @@ l5:
|
||||
{
|
||||
/*
|
||||
* LOCK_ONLY can be present alone only when a page has been
|
||||
* upgraded by pg_upgrade. But in that case,
|
||||
* TransactionIdIsInProgress() should have returned false. We
|
||||
* upgraded by pg_upgrade. But in that case,
|
||||
* TransactionIdIsInProgress() should have returned false. We
|
||||
* assume it's no longer locked in this case.
|
||||
*/
|
||||
elog(WARNING, "LOCK_ONLY found for Xid in progress %u", xmax);
|
||||
@@ -4929,12 +4927,13 @@ l5:
|
||||
if (xmax == add_to_xmax)
|
||||
{
|
||||
/*
|
||||
* Note that it's not possible for the original tuple to be updated:
|
||||
* we wouldn't be here because the tuple would have been invisible and
|
||||
* we wouldn't try to update it. As a subtlety, this code can also
|
||||
* run when traversing an update chain to lock future versions of a
|
||||
* tuple. But we wouldn't be here either, because the add_to_xmax
|
||||
* would be different from the original updater.
|
||||
* Note that it's not possible for the original tuple to be
|
||||
* updated: we wouldn't be here because the tuple would have been
|
||||
* invisible and we wouldn't try to update it. As a subtlety,
|
||||
* this code can also run when traversing an update chain to lock
|
||||
* future versions of a tuple. But we wouldn't be here either,
|
||||
* because the add_to_xmax would be different from the original
|
||||
* updater.
|
||||
*/
|
||||
Assert(HEAP_XMAX_IS_LOCKED_ONLY(old_infomask));
|
||||
|
||||
@@ -5013,7 +5012,7 @@ static HTSU_Result
|
||||
test_lockmode_for_conflict(MultiXactStatus status, TransactionId xid,
|
||||
LockTupleMode mode, bool *needwait)
|
||||
{
|
||||
MultiXactStatus wantedstatus;
|
||||
MultiXactStatus wantedstatus;
|
||||
|
||||
*needwait = false;
|
||||
wantedstatus = get_mxact_status_for_lock(mode, false);
|
||||
@@ -5026,18 +5025,18 @@ test_lockmode_for_conflict(MultiXactStatus status, TransactionId xid,
|
||||
if (TransactionIdIsCurrentTransactionId(xid))
|
||||
{
|
||||
/*
|
||||
* Updated by our own transaction? Just return failure. This shouldn't
|
||||
* normally happen.
|
||||
* Updated by our own transaction? Just return failure. This
|
||||
* shouldn't normally happen.
|
||||
*/
|
||||
return HeapTupleSelfUpdated;
|
||||
}
|
||||
else if (TransactionIdIsInProgress(xid))
|
||||
{
|
||||
/*
|
||||
* If the locking transaction is running, what we do depends on whether
|
||||
* the lock modes conflict: if they do, then we must wait for it to
|
||||
* finish; otherwise we can fall through to lock this tuple version
|
||||
* without waiting.
|
||||
* If the locking transaction is running, what we do depends on
|
||||
* whether the lock modes conflict: if they do, then we must wait for
|
||||
* it to finish; otherwise we can fall through to lock this tuple
|
||||
* version without waiting.
|
||||
*/
|
||||
if (DoLockModesConflict(LOCKMODE_from_mxstatus(status),
|
||||
LOCKMODE_from_mxstatus(wantedstatus)))
|
||||
@@ -5046,8 +5045,8 @@ test_lockmode_for_conflict(MultiXactStatus status, TransactionId xid,
|
||||
}
|
||||
|
||||
/*
|
||||
* If we set needwait above, then this value doesn't matter; otherwise,
|
||||
* this value signals to caller that it's okay to proceed.
|
||||
* If we set needwait above, then this value doesn't matter;
|
||||
* otherwise, this value signals to caller that it's okay to proceed.
|
||||
*/
|
||||
return HeapTupleMayBeUpdated;
|
||||
}
|
||||
@@ -5059,7 +5058,7 @@ test_lockmode_for_conflict(MultiXactStatus status, TransactionId xid,
|
||||
* The other transaction committed. If it was only a locker, then the
|
||||
* lock is completely gone now and we can return success; but if it
|
||||
* was an update, then what we do depends on whether the two lock
|
||||
* modes conflict. If they conflict, then we must report error to
|
||||
* modes conflict. If they conflict, then we must report error to
|
||||
* caller. But if they don't, we can fall through to allow the current
|
||||
* transaction to lock the tuple.
|
||||
*
|
||||
@@ -5133,8 +5132,8 @@ l4:
|
||||
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
|
||||
|
||||
/*
|
||||
* Check the tuple XMIN against prior XMAX, if any. If we reached
|
||||
* the end of the chain, we're done, so return success.
|
||||
* Check the tuple XMIN against prior XMAX, if any. If we reached the
|
||||
* end of the chain, we're done, so return success.
|
||||
*/
|
||||
if (TransactionIdIsValid(priorXmax) &&
|
||||
!TransactionIdEquals(HeapTupleHeaderGetXmin(mytup.t_data),
|
||||
@@ -5162,14 +5161,14 @@ l4:
|
||||
rawxmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
|
||||
if (old_infomask & HEAP_XMAX_IS_MULTI)
|
||||
{
|
||||
int nmembers;
|
||||
int i;
|
||||
int nmembers;
|
||||
int i;
|
||||
MultiXactMember *members;
|
||||
|
||||
nmembers = GetMultiXactIdMembers(rawxmax, &members, false);
|
||||
for (i = 0; i < nmembers; i++)
|
||||
{
|
||||
HTSU_Result res;
|
||||
HTSU_Result res;
|
||||
|
||||
res = test_lockmode_for_conflict(members[i].status,
|
||||
members[i].xid,
|
||||
@@ -5196,7 +5195,7 @@ l4:
|
||||
}
|
||||
else
|
||||
{
|
||||
HTSU_Result res;
|
||||
HTSU_Result res;
|
||||
MultiXactStatus status;
|
||||
|
||||
/*
|
||||
@@ -5219,9 +5218,9 @@ l4:
|
||||
else
|
||||
{
|
||||
/*
|
||||
* LOCK_ONLY present alone (a pg_upgraded tuple
|
||||
* marked as share-locked in the old cluster) shouldn't
|
||||
* be seen in the middle of an update chain.
|
||||
* LOCK_ONLY present alone (a pg_upgraded tuple marked
|
||||
* as share-locked in the old cluster) shouldn't be
|
||||
* seen in the middle of an update chain.
|
||||
*/
|
||||
elog(ERROR, "invalid lock status in tuple");
|
||||
}
|
||||
@@ -5323,7 +5322,7 @@ l4:
|
||||
* The initial tuple is assumed to be already locked.
|
||||
*
|
||||
* This function doesn't check visibility, it just inconditionally marks the
|
||||
* tuple(s) as locked. If any tuple in the updated chain is being deleted
|
||||
* tuple(s) as locked. If any tuple in the updated chain is being deleted
|
||||
* concurrently (or updated with the key being modified), sleep until the
|
||||
* transaction doing it is finished.
|
||||
*
|
||||
@@ -5347,7 +5346,7 @@ heap_lock_updated_tuple(Relation rel, HeapTuple tuple, ItemPointer ctid,
|
||||
* If this is the first possibly-multixact-able operation in the
|
||||
* current transaction, set my per-backend OldestMemberMXactId
|
||||
* setting. We can be certain that the transaction will never become a
|
||||
* member of any older MultiXactIds than that. (We have to do this
|
||||
* member of any older MultiXactIds than that. (We have to do this
|
||||
* even if we end up just using our own TransactionId below, since
|
||||
* some other backend could incorporate our XID into a MultiXact
|
||||
* immediately afterwards.)
|
||||
@@ -5366,7 +5365,7 @@ heap_lock_updated_tuple(Relation rel, HeapTuple tuple, ItemPointer ctid,
|
||||
* heap_inplace_update - update a tuple "in place" (ie, overwrite it)
|
||||
*
|
||||
* Overwriting violates both MVCC and transactional safety, so the uses
|
||||
* of this function in Postgres are extremely limited. Nonetheless we
|
||||
* of this function in Postgres are extremely limited. Nonetheless we
|
||||
* find some places to use it.
|
||||
*
|
||||
* The tuple cannot change size, and therefore it's reasonable to assume
|
||||
@@ -5608,7 +5607,7 @@ FreezeMultiXactId(MultiXactId multi, uint16 t_infomask,
|
||||
*/
|
||||
if (ISUPDATE_from_mxstatus(members[i].status))
|
||||
{
|
||||
TransactionId xid = members[i].xid;
|
||||
TransactionId xid = members[i].xid;
|
||||
|
||||
/*
|
||||
* It's an update; should we keep it? If the transaction is known
|
||||
@@ -5728,7 +5727,7 @@ FreezeMultiXactId(MultiXactId multi, uint16 t_infomask,
|
||||
* heap_prepare_freeze_tuple
|
||||
*
|
||||
* Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
|
||||
* are older than the specified cutoff XID and cutoff MultiXactId. If so,
|
||||
* are older than the specified cutoff XID and cutoff MultiXactId. If so,
|
||||
* setup enough state (in the *frz output argument) to later execute and
|
||||
* WAL-log what we would need to do, and return TRUE. Return FALSE if nothing
|
||||
* is to be changed.
|
||||
@@ -5801,11 +5800,11 @@ heap_prepare_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid,
|
||||
else if (flags & FRM_RETURN_IS_XID)
|
||||
{
|
||||
/*
|
||||
* NB -- some of these transformations are only valid because
|
||||
* we know the return Xid is a tuple updater (i.e. not merely a
|
||||
* NB -- some of these transformations are only valid because we
|
||||
* know the return Xid is a tuple updater (i.e. not merely a
|
||||
* locker.) Also note that the only reason we don't explicitely
|
||||
* worry about HEAP_KEYS_UPDATED is because it lives in t_infomask2
|
||||
* rather than t_infomask.
|
||||
* worry about HEAP_KEYS_UPDATED is because it lives in
|
||||
* t_infomask2 rather than t_infomask.
|
||||
*/
|
||||
frz->t_infomask &= ~HEAP_XMAX_BITS;
|
||||
frz->xmax = newxmax;
|
||||
@@ -5815,8 +5814,8 @@ heap_prepare_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid,
|
||||
}
|
||||
else if (flags & FRM_RETURN_IS_MULTI)
|
||||
{
|
||||
uint16 newbits;
|
||||
uint16 newbits2;
|
||||
uint16 newbits;
|
||||
uint16 newbits2;
|
||||
|
||||
/*
|
||||
* We can't use GetMultiXactIdHintBits directly on the new multi
|
||||
@@ -5851,7 +5850,7 @@ heap_prepare_freeze_tuple(HeapTupleHeader tuple, TransactionId cutoff_xid,
|
||||
|
||||
/*
|
||||
* The tuple might be marked either XMAX_INVALID or XMAX_COMMITTED +
|
||||
* LOCKED. Normalize to INVALID just to be sure no one gets confused.
|
||||
* LOCKED. Normalize to INVALID just to be sure no one gets confused.
|
||||
* Also get rid of the HEAP_KEYS_UPDATED bit.
|
||||
*/
|
||||
frz->t_infomask &= ~HEAP_XMAX_BITS;
|
||||
@@ -6111,7 +6110,7 @@ HeapTupleGetUpdateXid(HeapTupleHeader tuple)
|
||||
* used to optimize multixact access in case it's a lock-only multi); 'nowait'
|
||||
* indicates whether to use conditional lock acquisition, to allow callers to
|
||||
* fail if lock is unavailable. 'rel', 'ctid' and 'oper' are used to set up
|
||||
* context information for error messages. 'remaining', if not NULL, receives
|
||||
* context information for error messages. 'remaining', if not NULL, receives
|
||||
* the number of members that are still running, including any (non-aborted)
|
||||
* subtransactions of our own transaction.
|
||||
*
|
||||
@@ -6173,7 +6172,7 @@ Do_MultiXactIdWait(MultiXactId multi, MultiXactStatus status,
* return failure, if asked to avoid waiting.)
*
* Note that we don't set up an error context callback ourselves,
* but instead we pass the info down to XactLockTableWait. This
* but instead we pass the info down to XactLockTableWait. This
* might seem a bit wasteful because the context is set up and
* tore down for each member of the multixact, but in reality it
* should be barely noticeable, and it avoids duplicate code.
@@ -6242,7 +6241,7 @@ ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status,
* heap_tuple_needs_freeze
*
* Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
* are older than the specified cutoff XID or MultiXactId. If so, return TRUE.
* are older than the specified cutoff XID or MultiXactId. If so, return TRUE.
*
* It doesn't matter whether the tuple is alive or dead, we are checking
* to see if a tuple needs to be removed or frozen to avoid wraparound.
@@ -6366,7 +6365,7 @@ heap_restrpos(HeapScanDesc scan)
else
{
/*
* If we reached end of scan, rs_inited will now be false. We must
* If we reached end of scan, rs_inited will now be false. We must
* reset it to true to keep heapgettup from doing the wrong thing.
*/
scan->rs_inited = true;
@@ -6548,7 +6547,7 @@ log_heap_clean(Relation reln, Buffer buffer,
}

/*
* Perform XLogInsert for a heap-freeze operation. Caller must have already
* Perform XLogInsert for a heap-freeze operation. Caller must have already
* modified the buffer and marked it dirty.
*/
XLogRecPtr
@@ -6593,7 +6592,7 @@ log_heap_freeze(Relation reln, Buffer buffer, TransactionId cutoff_xid,
/*
* Perform XLogInsert for a heap-visible operation. 'block' is the block
* being marked all-visible, and vm_buffer is the buffer containing the
* corresponding visibility map block. Both should have already been modified
* corresponding visibility map block. Both should have already been modified
* and dirtied.
*
* If checksums are enabled, we also add the heap_buffer to the chain to
@@ -6642,7 +6641,7 @@ log_heap_visible(RelFileNode rnode, Buffer heap_buffer, Buffer vm_buffer,
}

/*
* Perform XLogInsert for a heap-update operation. Caller must already
* Perform XLogInsert for a heap-update operation. Caller must already
* have modified the buffer(s) and marked them dirty.
*/
static XLogRecPtr
@@ -6674,10 +6673,10 @@ log_heap_update(Relation reln, Buffer oldbuf,
info = XLOG_HEAP_UPDATE;

/*
* If the old and new tuple are on the same page, we only need to log
* the parts of the new tuple that were changed. That saves on the amount
* of WAL we need to write. Currently, we just count any unchanged bytes
* in the beginning and end of the tuple. That's quick to check, and
* If the old and new tuple are on the same page, we only need to log the
* parts of the new tuple that were changed. That saves on the amount of
* WAL we need to write. Currently, we just count any unchanged bytes in
* the beginning and end of the tuple. That's quick to check, and
* perfectly covers the common case that only one field is updated.
*
* We could do this even if the old and new tuple are on different pages,
@@ -6688,10 +6687,10 @@ log_heap_update(Relation reln, Buffer oldbuf,
* updates tend to create the new tuple version on the same page, there
* isn't much to be gained by doing this across pages anyway.
*
* Skip this if we're taking a full-page image of the new page, as we don't
* include the new tuple in the WAL record in that case. Also disable if
* wal_level='logical', as logical decoding needs to be able to read the
* new tuple in whole from the WAL record alone.
* Skip this if we're taking a full-page image of the new page, as we
* don't include the new tuple in the WAL record in that case. Also
* disable if wal_level='logical', as logical decoding needs to be able to
* read the new tuple in whole from the WAL record alone.
*/
if (oldbuf == newbuf && !need_tuple_data &&
!XLogCheckBufferNeedsBackup(newbuf))
@@ -6707,6 +6706,7 @@ log_heap_update(Relation reln, Buffer oldbuf,
if (newp[prefixlen] != oldp[prefixlen])
break;
}

/*
* Storing the length of the prefix takes 2 bytes, so we need to save
* at least 3 bytes or there's no point.
@@ -6793,8 +6793,8 @@ log_heap_update(Relation reln, Buffer oldbuf,
xlhdr.header.t_infomask2 = newtup->t_data->t_infomask2;
xlhdr.header.t_infomask = newtup->t_data->t_infomask;
xlhdr.header.t_hoff = newtup->t_data->t_hoff;
Assert(offsetof(HeapTupleHeaderData, t_bits) + prefixlen + suffixlen <= newtup->t_len);
xlhdr.t_len = newtup->t_len - offsetof(HeapTupleHeaderData, t_bits) - prefixlen - suffixlen;
Assert(offsetof(HeapTupleHeaderData, t_bits) +prefixlen + suffixlen <= newtup->t_len);
xlhdr.t_len = newtup->t_len - offsetof(HeapTupleHeaderData, t_bits) -prefixlen - suffixlen;

/*
* As with insert records, we need not store this rdata segment if we
@@ -6816,7 +6816,7 @@ log_heap_update(Relation reln, Buffer oldbuf,
if (prefixlen == 0)
{
rdata[nr].data = ((char *) newtup->t_data) + offsetof(HeapTupleHeaderData, t_bits);
rdata[nr].len = newtup->t_len - offsetof(HeapTupleHeaderData, t_bits) - suffixlen;
rdata[nr].len = newtup->t_len - offsetof(HeapTupleHeaderData, t_bits) -suffixlen;
rdata[nr].buffer = need_tuple_data ? InvalidBuffer : newbufref;
rdata[nr].buffer_std = true;
rdata[nr].next = NULL;
@@ -6829,7 +6829,7 @@ log_heap_update(Relation reln, Buffer oldbuf,
* two separate rdata entries.
*/
/* bitmap [+ padding] [+ oid] */
if (newtup->t_data->t_hoff - offsetof(HeapTupleHeaderData, t_bits) > 0)
if (newtup->t_data->t_hoff - offsetof(HeapTupleHeaderData, t_bits) >0)
{
rdata[nr - 1].next = &(rdata[nr]);
rdata[nr].data = ((char *) newtup->t_data) + offsetof(HeapTupleHeaderData, t_bits);
@@ -6853,13 +6853,13 @@ log_heap_update(Relation reln, Buffer oldbuf,
/*
* Separate storage for the FPW buffer reference of the new page in the
* wal_level >= logical case.
*/
*/
if (need_tuple_data)
{
rdata[nr - 1].next = &(rdata[nr]);

rdata[nr].data = NULL,
rdata[nr].len = 0;
rdata[nr].len = 0;
rdata[nr].buffer = newbufref;
rdata[nr].buffer_std = true;
rdata[nr].next = NULL;
@@ -6992,8 +6992,8 @@ log_newpage(RelFileNode *rnode, ForkNumber forkNum, BlockNumber blkno,
recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_NEWPAGE, rdata);

/*
* The page may be uninitialized. If so, we can't set the LSN because
* that would corrupt the page.
* The page may be uninitialized. If so, we can't set the LSN because that
* would corrupt the page.
*/
if (!PageIsNew(page))
{
@@ -7173,14 +7173,14 @@ ExtractReplicaIdentity(Relation relation, HeapTuple tp, bool key_changed, bool *
*/
for (natt = 0; natt < idx_desc->natts; natt++)
{
int attno = idx_rel->rd_index->indkey.values[natt];
int attno = idx_rel->rd_index->indkey.values[natt];

if (attno < 0)
{
/*
* The OID column can appear in an index definition, but that's
* OK, becuse we always copy the OID if present (see below).
* Other system columns may not.
* OK, becuse we always copy the OID if present (see below). Other
* system columns may not.
*/
if (attno == ObjectIdAttributeNumber)
continue;
@@ -7210,7 +7210,8 @@ ExtractReplicaIdentity(Relation relation, HeapTuple tp, bool key_changed, bool *
*/
if (HeapTupleHasExternal(key_tuple))
{
HeapTuple oldtup = key_tuple;
HeapTuple oldtup = key_tuple;

key_tuple = toast_flatten_tuple(oldtup, RelationGetDescr(relation));
heap_freetuple(oldtup);
}
@@ -7963,7 +7964,7 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool hot_update)
/*
* In normal operation, it is important to lock the two pages in
* page-number order, to avoid possible deadlocks against other update
* operations going the other way. However, during WAL replay there can
* operations going the other way. However, during WAL replay there can
* be no other update happening, so we don't need to worry about that. But
* we *do* need to worry that we don't expose an inconsistent state to Hot
* Standby queries --- so the original page can't be unlocked before we've
@@ -8169,7 +8170,7 @@ newsame:;
if (suffixlen > 0)
memcpy(newp, (char *) oldtup.t_data + oldtup.t_len - suffixlen, suffixlen);

newlen = offsetof(HeapTupleHeaderData, t_bits) + xlhdr.t_len + prefixlen + suffixlen;
newlen = offsetof(HeapTupleHeaderData, t_bits) +xlhdr.t_len + prefixlen + suffixlen;
htup->t_infomask2 = xlhdr.header.t_infomask2;
htup->t_infomask = xlhdr.header.t_infomask;
htup->t_hoff = xlhdr.header.t_hoff;
@@ -8444,6 +8445,7 @@ heap2_redo(XLogRecPtr lsn, XLogRecord *record)
heap_xlog_lock_updated(lsn, record);
break;
case XLOG_HEAP2_NEW_CID:

/*
* Nothing to do on a real replay, only used during logical
* decoding.

@@ -146,7 +146,7 @@ GetVisibilityMapPins(Relation relation, Buffer buffer1, Buffer buffer2,
|
||||
/*
|
||||
* If there are two buffers involved and we pinned just one of them,
|
||||
* it's possible that the second one became all-visible while we were
|
||||
* busy pinning the first one. If it looks like that's a possible
|
||||
* busy pinning the first one. If it looks like that's a possible
|
||||
* scenario, we'll need to make a second pass through this loop.
|
||||
*/
|
||||
if (buffer2 == InvalidBuffer || buffer1 == buffer2
|
||||
@@ -177,7 +177,7 @@ GetVisibilityMapPins(Relation relation, Buffer buffer1, Buffer buffer2,
|
||||
* NOTE: it is unlikely, but not quite impossible, for otherBuffer to be the
|
||||
* same buffer we select for insertion of the new tuple (this could only
|
||||
* happen if space is freed in that page after heap_update finds there's not
|
||||
* enough there). In that case, the page will be pinned and locked only once.
|
||||
* enough there). In that case, the page will be pinned and locked only once.
|
||||
*
|
||||
* For the vmbuffer and vmbuffer_other arguments, we avoid deadlock by
|
||||
* locking them only after locking the corresponding heap page, and taking
|
||||
@@ -198,7 +198,7 @@ GetVisibilityMapPins(Relation relation, Buffer buffer1, Buffer buffer2,
|
||||
* for additional constraints needed for safe usage of this behavior.)
|
||||
*
|
||||
* The caller can also provide a BulkInsertState object to optimize many
|
||||
* insertions into the same relation. This keeps a pin on the current
|
||||
* insertions into the same relation. This keeps a pin on the current
|
||||
* insertion target page (to save pin/unpin cycles) and also passes a
|
||||
* BULKWRITE buffer selection strategy object to the buffer manager.
|
||||
* Passing NULL for bistate selects the default behavior.
|
||||
@@ -251,7 +251,7 @@ RelationGetBufferForTuple(Relation relation, Size len,
|
||||
|
||||
/*
|
||||
* We first try to put the tuple on the same page we last inserted a tuple
|
||||
* on, as cached in the BulkInsertState or relcache entry. If that
|
||||
* on, as cached in the BulkInsertState or relcache entry. If that
|
||||
* doesn't work, we ask the Free Space Map to locate a suitable page.
|
||||
* Since the FSM's info might be out of date, we have to be prepared to
|
||||
* loop around and retry multiple times. (To insure this isn't an infinite
|
||||
@@ -283,7 +283,7 @@ RelationGetBufferForTuple(Relation relation, Size len,
|
||||
|
||||
/*
|
||||
* If the FSM knows nothing of the rel, try the last page before we
|
||||
* give up and extend. This avoids one-tuple-per-page syndrome during
|
||||
* give up and extend. This avoids one-tuple-per-page syndrome during
|
||||
* bootstrapping or in a recently-started system.
|
||||
*/
|
||||
if (targetBlock == InvalidBlockNumber)
|
||||
@@ -305,7 +305,7 @@ RelationGetBufferForTuple(Relation relation, Size len,
|
||||
* If the page-level all-visible flag is set, caller will need to
|
||||
* clear both that and the corresponding visibility map bit. However,
|
||||
* by the time we return, we'll have x-locked the buffer, and we don't
|
||||
* want to do any I/O while in that state. So we check the bit here
|
||||
* want to do any I/O while in that state. So we check the bit here
|
||||
* before taking the lock, and pin the page if it appears necessary.
|
||||
* Checking without the lock creates a risk of getting the wrong
|
||||
* answer, so we'll have to recheck after acquiring the lock.
|
||||
@@ -347,7 +347,7 @@ RelationGetBufferForTuple(Relation relation, Size len,
|
||||
|
||||
/*
|
||||
* We now have the target page (and the other buffer, if any) pinned
|
||||
* and locked. However, since our initial PageIsAllVisible checks
|
||||
* and locked. However, since our initial PageIsAllVisible checks
|
||||
* were performed before acquiring the lock, the results might now be
|
||||
* out of date, either for the selected victim buffer, or for the
|
||||
* other buffer passed by the caller. In that case, we'll need to
|
||||
@@ -390,7 +390,7 @@ RelationGetBufferForTuple(Relation relation, Size len,
|
||||
|
||||
/*
|
||||
* Not enough space, so we must give up our page locks and pin (if
|
||||
* any) and prepare to look elsewhere. We don't care which order we
|
||||
* any) and prepare to look elsewhere. We don't care which order we
|
||||
* unlock the two buffers in, so this can be slightly simpler than the
|
||||
* code above.
|
||||
*/
|
||||
@@ -432,7 +432,7 @@ RelationGetBufferForTuple(Relation relation, Size len,
|
||||
|
||||
/*
|
||||
* XXX This does an lseek - rather expensive - but at the moment it is the
|
||||
* only way to accurately determine how many blocks are in a relation. Is
|
||||
* only way to accurately determine how many blocks are in a relation. Is
|
||||
* it worth keeping an accurate file length in shared memory someplace,
|
||||
* rather than relying on the kernel to do it for us?
|
||||
*/
|
||||
@@ -452,7 +452,7 @@ RelationGetBufferForTuple(Relation relation, Size len,
|
||||
|
||||
/*
|
||||
* Release the file-extension lock; it's now OK for someone else to extend
|
||||
* the relation some more. Note that we cannot release this lock before
|
||||
* the relation some more. Note that we cannot release this lock before
|
||||
* we have buffer lock on the new page, or we risk a race condition
|
||||
* against vacuumlazy.c --- see comments therein.
|
||||
*/
|
||||
|
||||
@@ -117,7 +117,7 @@ heap_page_prune_opt(Relation relation, Buffer buffer)
|
||||
* Checking free space here is questionable since we aren't holding any
|
||||
* lock on the buffer; in the worst case we could get a bogus answer. It's
|
||||
* unlikely to be *seriously* wrong, though, since reading either pd_lower
|
||||
* or pd_upper is probably atomic. Avoiding taking a lock seems more
|
||||
* or pd_upper is probably atomic. Avoiding taking a lock seems more
|
||||
* important than sometimes getting a wrong answer in what is after all
|
||||
* just a heuristic estimate.
|
||||
*/
|
||||
@@ -332,8 +332,8 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin,
|
||||
* OldestXmin is the cutoff XID used to identify dead tuples.
|
||||
*
|
||||
* We don't actually change the page here, except perhaps for hint-bit updates
|
||||
* caused by HeapTupleSatisfiesVacuum. We just add entries to the arrays in
|
||||
* prstate showing the changes to be made. Items to be redirected are added
|
||||
* caused by HeapTupleSatisfiesVacuum. We just add entries to the arrays in
|
||||
* prstate showing the changes to be made. Items to be redirected are added
|
||||
* to the redirected[] array (two entries per redirection); items to be set to
|
||||
* LP_DEAD state are added to nowdead[]; and items to be set to LP_UNUSED
|
||||
* state are added to nowunused[].
|
||||
@@ -384,7 +384,7 @@ heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum,
|
||||
* We need this primarily to handle aborted HOT updates, that is,
|
||||
* XMIN_INVALID heap-only tuples. Those might not be linked to by
|
||||
* any chain, since the parent tuple might be re-updated before
|
||||
* any pruning occurs. So we have to be able to reap them
|
||||
* any pruning occurs. So we have to be able to reap them
|
||||
* separately from chain-pruning. (Note that
|
||||
* HeapTupleHeaderIsHotUpdated will never return true for an
|
||||
* XMIN_INVALID tuple, so this code will work even when there were
|
||||
@@ -496,9 +496,10 @@ heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum,
|
||||
break;
|
||||
|
||||
case HEAPTUPLE_DELETE_IN_PROGRESS:
|
||||
|
||||
/*
|
||||
* This tuple may soon become DEAD. Update the hint field
|
||||
* so that the page is reconsidered for pruning in future.
|
||||
* This tuple may soon become DEAD. Update the hint field so
|
||||
* that the page is reconsidered for pruning in future.
|
||||
*/
|
||||
heap_prune_record_prunable(prstate,
|
||||
HeapTupleHeaderGetUpdateXid(htup));
|
||||
@@ -574,7 +575,7 @@ heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum,
|
||||
|
||||
/*
|
||||
* If the root entry had been a normal tuple, we are deleting it, so
|
||||
* count it in the result. But changing a redirect (even to DEAD
|
||||
* count it in the result. But changing a redirect (even to DEAD
|
||||
* state) doesn't count.
|
||||
*/
|
||||
if (ItemIdIsNormal(rootlp))
|
||||
@@ -663,7 +664,7 @@ heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum)
|
||||
* buffer, and is inside a critical section.
|
||||
*
|
||||
* This is split out because it is also used by heap_xlog_clean()
|
||||
* to replay the WAL record when needed after a crash. Note that the
|
||||
* to replay the WAL record when needed after a crash. Note that the
|
||||
* arguments are identical to those of log_heap_clean().
|
||||
*/
|
||||
void
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
*
|
||||
* The caller is responsible for creating the new heap, all catalog
|
||||
* changes, supplying the tuples to be written to the new heap, and
|
||||
* rebuilding indexes. The caller must hold AccessExclusiveLock on the
|
||||
* rebuilding indexes. The caller must hold AccessExclusiveLock on the
|
||||
* target table, because we assume no one else is writing into it.
|
||||
*
|
||||
* To use the facility:
|
||||
@@ -43,7 +43,7 @@
|
||||
* to substitute the correct ctid instead.
|
||||
*
|
||||
* For each ctid reference from A -> B, we might encounter either A first
|
||||
* or B first. (Note that a tuple in the middle of a chain is both A and B
|
||||
* or B first. (Note that a tuple in the middle of a chain is both A and B
|
||||
* of different pairs.)
|
||||
*
|
||||
* If we encounter A first, we'll store the tuple in the unresolved_tups
|
||||
@@ -58,11 +58,11 @@
|
||||
* and can write A immediately with the correct ctid.
|
||||
*
|
||||
* Entries in the hash tables can be removed as soon as the later tuple
|
||||
* is encountered. That helps to keep the memory usage down. At the end,
|
||||
* is encountered. That helps to keep the memory usage down. At the end,
|
||||
* both tables are usually empty; we should have encountered both A and B
|
||||
* of each pair. However, it's possible for A to be RECENTLY_DEAD and B
|
||||
* entirely DEAD according to HeapTupleSatisfiesVacuum, because the test
|
||||
* for deadness using OldestXmin is not exact. In such a case we might
|
||||
* for deadness using OldestXmin is not exact. In such a case we might
|
||||
* encounter B first, and skip it, and find A later. Then A would be added
|
||||
* to unresolved_tups, and stay there until end of the rewrite. Since
|
||||
* this case is very unusual, we don't worry about the memory usage.
|
||||
@@ -78,7 +78,7 @@
|
||||
* of CLUSTERing on an unchanging key column, we'll see all the versions
|
||||
* of a given tuple together anyway, and so the peak memory usage is only
|
||||
* proportional to the number of RECENTLY_DEAD versions of a single row, not
|
||||
* in the whole table. Note that if we do fail halfway through a CLUSTER,
|
||||
* in the whole table. Note that if we do fail halfway through a CLUSTER,
|
||||
* the old table is still valid, so failure is not catastrophic.
|
||||
*
|
||||
* We can't use the normal heap_insert function to insert into the new
|
||||
@@ -143,13 +143,13 @@ typedef struct RewriteStateData
|
||||
BlockNumber rs_blockno; /* block where page will go */
|
||||
bool rs_buffer_valid; /* T if any tuples in buffer */
|
||||
bool rs_use_wal; /* must we WAL-log inserts? */
|
||||
bool rs_logical_rewrite; /* do we need to do logical rewriting */
|
||||
bool rs_logical_rewrite; /* do we need to do logical rewriting */
|
||||
TransactionId rs_oldest_xmin; /* oldest xmin used by caller to
|
||||
* determine tuple visibility */
|
||||
TransactionId rs_freeze_xid;/* Xid that will be used as freeze cutoff
|
||||
* point */
|
||||
TransactionId rs_logical_xmin; /* Xid that will be used as cutoff
|
||||
* point for logical rewrites */
|
||||
TransactionId rs_logical_xmin; /* Xid that will be used as cutoff
|
||||
* point for logical rewrites */
|
||||
MultiXactId rs_cutoff_multi;/* MultiXactId that will be used as cutoff
|
||||
* point for multixacts */
|
||||
MemoryContext rs_cxt; /* for hash tables and entries and tuples in
|
||||
@@ -158,7 +158,7 @@ typedef struct RewriteStateData
|
||||
HTAB *rs_unresolved_tups; /* unmatched A tuples */
|
||||
HTAB *rs_old_new_tid_map; /* unmatched B tuples */
|
||||
HTAB *rs_logical_mappings; /* logical remapping files */
|
||||
uint32 rs_num_rewrite_mappings; /* # in memory mappings */
|
||||
uint32 rs_num_rewrite_mappings; /* # in memory mappings */
|
||||
} RewriteStateData;
|
||||
|
||||
/*
|
||||
@@ -199,12 +199,12 @@ typedef OldToNewMappingData *OldToNewMapping;
|
||||
*/
|
||||
typedef struct RewriteMappingFile
|
||||
{
|
||||
TransactionId xid; /* xid that might need to see the row */
|
||||
int vfd; /* fd of mappings file */
|
||||
off_t off; /* how far have we written yet */
|
||||
uint32 num_mappings; /* number of in-memory mappings */
|
||||
dlist_head mappings; /* list of in-memory mappings */
|
||||
char path[MAXPGPATH]; /* path, for error messages */
|
||||
TransactionId xid; /* xid that might need to see the row */
|
||||
int vfd; /* fd of mappings file */
|
||||
off_t off; /* how far have we written yet */
|
||||
uint32 num_mappings; /* number of in-memory mappings */
|
||||
dlist_head mappings; /* list of in-memory mappings */
|
||||
char path[MAXPGPATH]; /* path, for error messages */
|
||||
} RewriteMappingFile;
|
||||
|
||||
/*
|
||||
@@ -213,8 +213,8 @@ typedef struct RewriteMappingFile
|
||||
*/
|
||||
typedef struct RewriteMappingDataEntry
|
||||
{
|
||||
LogicalRewriteMappingData map; /* map between old and new location of
|
||||
* the tuple */
|
||||
LogicalRewriteMappingData map; /* map between old and new location of
|
||||
* the tuple */
|
||||
dlist_node node;
|
||||
} RewriteMappingDataEntry;
|
||||
|
||||
@@ -346,7 +346,7 @@ end_heap_rewrite(RewriteState state)
|
||||
}
|
||||
|
||||
/*
|
||||
* If the rel is WAL-logged, must fsync before commit. We use heap_sync
|
||||
* If the rel is WAL-logged, must fsync before commit. We use heap_sync
|
||||
* to ensure that the toast table gets fsync'd too.
|
||||
*
|
||||
* It's obvious that we must do this when not WAL-logging. It's less
|
||||
@@ -617,7 +617,7 @@ rewrite_heap_dead_tuple(RewriteState state, HeapTuple old_tuple)
|
||||
}
|
||||
|
||||
/*
|
||||
* Insert a tuple to the new relation. This has to track heap_insert
|
||||
* Insert a tuple to the new relation. This has to track heap_insert
|
||||
* and its subsidiary functions!
|
||||
*
|
||||
* t_self of the tuple is set to the new TID of the tuple. If t_ctid of the
|
||||
@@ -866,13 +866,13 @@ logical_heap_rewrite_flush_mappings(RewriteState state)
|
||||
hash_seq_init(&seq_status, state->rs_logical_mappings);
|
||||
while ((src = (RewriteMappingFile *) hash_seq_search(&seq_status)) != NULL)
|
||||
{
|
||||
XLogRecData rdata[2];
|
||||
char *waldata;
|
||||
char *waldata_start;
|
||||
XLogRecData rdata[2];
|
||||
char *waldata;
|
||||
char *waldata_start;
|
||||
xl_heap_rewrite_mapping xlrec;
|
||||
Oid dboid;
|
||||
uint32 len;
|
||||
int written;
|
||||
Oid dboid;
|
||||
uint32 len;
|
||||
int written;
|
||||
|
||||
/* this file hasn't got any new mappings */
|
||||
if (src->num_mappings == 0)
|
||||
@@ -962,14 +962,14 @@ logical_end_heap_rewrite(RewriteState state)
|
||||
return;
|
||||
|
||||
/* writeout remaining in-memory entries */
|
||||
if (state->rs_num_rewrite_mappings > 0 )
|
||||
if (state->rs_num_rewrite_mappings > 0)
|
||||
logical_heap_rewrite_flush_mappings(state);
|
||||
|
||||
/* Iterate over all mappings we have written and fsync the files. */
|
||||
hash_seq_init(&seq_status, state->rs_logical_mappings);
|
||||
while ((src = (RewriteMappingFile *) hash_seq_search(&seq_status)) != NULL)
|
||||
{
|
||||
if(FileSync(src->vfd) != 0)
|
||||
if (FileSync(src->vfd) != 0)
|
||||
ereport(ERROR,
|
||||
(errcode_for_file_access(),
|
||||
errmsg("could not fsync file \"%s\": %m", src->path)));
|
||||
@@ -985,10 +985,10 @@ static void
|
||||
logical_rewrite_log_mapping(RewriteState state, TransactionId xid,
|
||||
LogicalRewriteMappingData *map)
|
||||
{
|
||||
RewriteMappingFile *src;
|
||||
RewriteMappingDataEntry *pmap;
|
||||
Oid relid;
|
||||
bool found;
|
||||
RewriteMappingFile *src;
|
||||
RewriteMappingDataEntry *pmap;
|
||||
Oid relid;
|
||||
bool found;
|
||||
|
||||
relid = RelationGetRelid(state->rs_old_rel);
|
||||
|
||||
@@ -1027,7 +1027,7 @@ logical_rewrite_log_mapping(RewriteState state, TransactionId xid,
|
||||
if (src->vfd < 0)
|
||||
ereport(ERROR,
|
||||
(errcode_for_file_access(),
|
||||
errmsg("could not create file \"%s\": %m", path)));
|
||||
errmsg("could not create file \"%s\": %m", path)));
|
||||
}
|
||||
|
||||
pmap = MemoryContextAlloc(state->rs_cxt,
|
||||
@@ -1041,7 +1041,7 @@ logical_rewrite_log_mapping(RewriteState state, TransactionId xid,
|
||||
* Write out buffer every time we've too many in-memory entries across all
|
||||
* mapping files.
|
||||
*/
|
||||
if (state->rs_num_rewrite_mappings >= 1000 /* arbitrary number */)
|
||||
if (state->rs_num_rewrite_mappings >= 1000 /* arbitrary number */ )
|
||||
logical_heap_rewrite_flush_mappings(state);
|
||||
}
|
||||
|
||||
@@ -1054,11 +1054,11 @@ logical_rewrite_heap_tuple(RewriteState state, ItemPointerData old_tid,
|
||||
HeapTuple new_tuple)
|
||||
{
|
||||
ItemPointerData new_tid = new_tuple->t_self;
|
||||
TransactionId cutoff = state->rs_logical_xmin;
|
||||
TransactionId xmin;
|
||||
TransactionId xmax;
|
||||
bool do_log_xmin = false;
|
||||
bool do_log_xmax = false;
|
||||
TransactionId cutoff = state->rs_logical_xmin;
|
||||
TransactionId xmin;
|
||||
TransactionId xmax;
|
||||
bool do_log_xmin = false;
|
||||
bool do_log_xmax = false;
|
||||
LogicalRewriteMappingData map;
|
||||
|
||||
/* no logical rewrite in progress, we don't need to log anything */
|
||||
@@ -1147,7 +1147,8 @@ heap_xlog_logical_rewrite(XLogRecPtr lsn, XLogRecord *r)
|
||||
if (fd < 0)
|
||||
ereport(ERROR,
|
||||
(errcode_for_file_access(),
|
||||
errmsg("could not create file \"%s\": %m", path)));
|
||||
errmsg("could not create file \"%s\": %m", path)));
|
||||
|
||||
/*
|
||||
* Truncate all data that's not guaranteed to have been safely fsynced (by
|
||||
* previous record or by the last checkpoint).
|
||||
@@ -1174,6 +1175,7 @@ heap_xlog_logical_rewrite(XLogRecPtr lsn, XLogRecord *r)
|
||||
ereport(ERROR,
|
||||
(errcode_for_file_access(),
|
||||
errmsg("could not write to file \"%s\": %m", path)));
|
||||
|
||||
/*
|
||||
* Now fsync all previously written data. We could improve things and only
|
||||
* do this for the last write to a file, but the required bookkeeping
|
||||
@@ -1222,13 +1224,14 @@ CheckPointLogicalRewriteHeap(void)
|
||||
mappings_dir = AllocateDir("pg_llog/mappings");
|
||||
while ((mapping_de = ReadDir(mappings_dir, "pg_llog/mappings")) != NULL)
|
||||
{
|
||||
struct stat statbuf;
|
||||
struct stat statbuf;
|
||||
Oid dboid;
|
||||
Oid relid;
|
||||
XLogRecPtr lsn;
|
||||
TransactionId rewrite_xid;
|
||||
TransactionId create_xid;
|
||||
uint32 hi, lo;
|
||||
uint32 hi,
|
||||
lo;
|
||||
|
||||
if (strcmp(mapping_de->d_name, ".") == 0 ||
|
||||
strcmp(mapping_de->d_name, "..") == 0)
|
||||
@@ -1244,7 +1247,7 @@ CheckPointLogicalRewriteHeap(void)
|
||||
|
||||
if (sscanf(mapping_de->d_name, LOGICAL_REWRITE_FORMAT,
|
||||
&dboid, &relid, &hi, &lo, &rewrite_xid, &create_xid) != 6)
|
||||
elog(ERROR,"could not parse filename \"%s\"", mapping_de->d_name);
|
||||
elog(ERROR, "could not parse filename \"%s\"", mapping_de->d_name);
|
||||
|
||||
lsn = ((uint64) hi) << 32 | lo;
|
||||
|
||||
@@ -1258,7 +1261,7 @@ CheckPointLogicalRewriteHeap(void)
|
||||
}
|
||||
else
|
||||
{
|
||||
int fd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0);
|
||||
int fd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0);
|
||||
|
||||
/*
|
||||
* The file cannot vanish due to concurrency since this function
|
||||
@@ -1269,6 +1272,7 @@ CheckPointLogicalRewriteHeap(void)
|
||||
ereport(ERROR,
|
||||
(errcode_for_file_access(),
|
||||
errmsg("could not open file \"%s\": %m", path)));
|
||||
|
||||
/*
|
||||
* We could try to avoid fsyncing files that either haven't
|
||||
* changed or have only been created since the checkpoint's start,
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
* heap scan synchronization support
|
||||
*
|
||||
* When multiple backends run a sequential scan on the same table, we try
|
||||
* to keep them synchronized to reduce the overall I/O needed. The goal is
|
||||
* to keep them synchronized to reduce the overall I/O needed. The goal is
|
||||
* to read each page into shared buffer cache only once, and let all backends
|
||||
* that take part in the shared scan process the page before it falls out of
|
||||
* the cache.
|
||||
@@ -26,7 +26,7 @@
|
||||
* don't want such queries to slow down others.
|
||||
*
|
||||
* There can realistically only be a few large sequential scans on different
|
||||
* tables in progress at any time. Therefore we just keep the scan positions
|
||||
* tables in progress at any time. Therefore we just keep the scan positions
|
||||
* in a small LRU list which we scan every time we need to look up or update a
|
||||
* scan position. The whole mechanism is only applied for tables exceeding
|
||||
* a threshold size (but that is not the concern of this module).
|
||||
@@ -243,7 +243,7 @@ ss_search(RelFileNode relfilenode, BlockNumber location, bool set)
|
||||
* relation, or 0 if no valid location is found.
|
||||
*
|
||||
* We expect the caller has just done RelationGetNumberOfBlocks(), and
|
||||
* so that number is passed in rather than computing it again. The result
|
||||
* so that number is passed in rather than computing it again. The result
|
||||
* is guaranteed less than relnblocks (assuming that's > 0).
|
||||
*/
|
||||
BlockNumber
|
||||
|
||||
@@ -53,11 +53,11 @@ static struct varlena *toast_fetch_datum(struct varlena * attr);
|
||||
static struct varlena *toast_fetch_datum_slice(struct varlena * attr,
|
||||
int32 sliceoffset, int32 length);
|
||||
static int toast_open_indexes(Relation toastrel,
|
||||
LOCKMODE lock,
|
||||
Relation **toastidxs,
|
||||
int *num_indexes);
|
||||
LOCKMODE lock,
|
||||
Relation **toastidxs,
|
||||
int *num_indexes);
|
||||
static void toast_close_indexes(Relation *toastidxs, int num_indexes,
|
||||
LOCKMODE lock);
|
||||
LOCKMODE lock);
|
||||
|
||||
|
||||
/* ----------
|
||||
@@ -91,8 +91,9 @@ heap_tuple_fetch_attr(struct varlena * attr)
|
||||
* to persist a Datum for unusually long time, like in a HOLD cursor.
|
||||
*/
|
||||
struct varatt_indirect redirect;
|
||||
|
||||
VARATT_EXTERNAL_GET_POINTER(redirect, attr);
|
||||
attr = (struct varlena *)redirect.pointer;
|
||||
attr = (struct varlena *) redirect.pointer;
|
||||
|
||||
/* nested indirect Datums aren't allowed */
|
||||
Assert(!VARATT_IS_EXTERNAL_INDIRECT(attr));
|
||||
@@ -147,8 +148,9 @@ heap_tuple_untoast_attr(struct varlena * attr)
|
||||
else if (VARATT_IS_EXTERNAL_INDIRECT(attr))
|
||||
{
|
||||
struct varatt_indirect redirect;
|
||||
|
||||
VARATT_EXTERNAL_GET_POINTER(redirect, attr);
|
||||
attr = (struct varlena *)redirect.pointer;
|
||||
attr = (struct varlena *) redirect.pointer;
|
||||
|
||||
/* nested indirect Datums aren't allowed */
|
||||
Assert(!VARATT_IS_EXTERNAL_INDIRECT(attr));
|
||||
@@ -217,6 +219,7 @@ heap_tuple_untoast_attr_slice(struct varlena * attr,
|
||||
else if (VARATT_IS_EXTERNAL_INDIRECT(attr))
|
||||
{
|
||||
struct varatt_indirect redirect;
|
||||
|
||||
VARATT_EXTERNAL_GET_POINTER(redirect, attr);
|
||||
|
||||
/* nested indirect Datums aren't allowed */
|
||||
@@ -299,6 +302,7 @@ toast_raw_datum_size(Datum value)
|
||||
else if (VARATT_IS_EXTERNAL_INDIRECT(attr))
|
||||
{
|
||||
struct varatt_indirect toast_pointer;
|
||||
|
||||
VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr);
|
||||
|
||||
/* nested indirect Datums aren't allowed */
|
||||
@@ -354,6 +358,7 @@ toast_datum_size(Datum value)
|
||||
else if (VARATT_IS_EXTERNAL_INDIRECT(attr))
|
||||
{
|
||||
struct varatt_indirect toast_pointer;
|
||||
|
||||
VARATT_EXTERNAL_GET_POINTER(toast_pointer, attr);
|
||||
|
||||
/* nested indirect Datums aren't allowed */
|
||||
@@ -597,7 +602,7 @@ toast_insert_or_update(Relation rel, HeapTuple newtup, HeapTuple oldtup,
|
||||
* We took care of UPDATE above, so any external value we find
|
||||
* still in the tuple must be someone else's we cannot reuse.
|
||||
* Fetch it back (without decompression, unless we are forcing
|
||||
* PLAIN storage). If necessary, we'll push it out as a new
|
||||
* PLAIN storage). If necessary, we'll push it out as a new
|
||||
* external value below.
|
||||
*/
|
||||
if (VARATT_IS_EXTERNAL(new_value))
|
||||
@@ -740,7 +745,7 @@ toast_insert_or_update(Relation rel, HeapTuple newtup, HeapTuple oldtup,
|
||||
|
||||
/*
|
||||
* Second we look for attributes of attstorage 'x' or 'e' that are still
|
||||
* inline. But skip this if there's no toast table to push them to.
|
||||
* inline. But skip this if there's no toast table to push them to.
|
||||
*/
|
||||
while (heap_compute_data_size(tupleDesc,
|
||||
toast_values, toast_isnull) > maxDataLen &&
|
||||
@@ -850,7 +855,7 @@ toast_insert_or_update(Relation rel, HeapTuple newtup, HeapTuple oldtup,
|
||||
}
|
||||
|
||||
/*
|
||||
* Finally we store attributes of type 'm' externally. At this point we
|
||||
* Finally we store attributes of type 'm' externally. At this point we
|
||||
* increase the target tuple size, so that 'm' attributes aren't stored
|
||||
* externally unless really necessary.
|
||||
*/
|
||||
@@ -1438,7 +1443,7 @@ toast_save_datum(Relation rel, Datum value,
|
||||
* those versions could easily reference the same toast value.
|
||||
* When we copy the second or later version of such a row,
|
||||
* reusing the OID will mean we select an OID that's already
|
||||
* in the new toast table. Check for that, and if so, just
|
||||
* in the new toast table. Check for that, and if so, just
|
||||
* fall through without writing the data again.
|
||||
*
|
||||
* While annoying and ugly-looking, this is a good thing
|
||||
@@ -1467,7 +1472,7 @@ toast_save_datum(Relation rel, Datum value,
|
||||
{
|
||||
toast_pointer.va_valueid =
|
||||
GetNewOidWithIndex(toastrel,
|
||||
RelationGetRelid(toastidxs[validIndex]),
|
||||
RelationGetRelid(toastidxs[validIndex]),
|
||||
(AttrNumber) 1);
|
||||
} while (toastid_valueid_exists(rel->rd_toastoid,
|
||||
toast_pointer.va_valueid));
|
||||
@@ -1488,7 +1493,7 @@ toast_save_datum(Relation rel, Datum value,
|
||||
*/
|
||||
while (data_todo > 0)
|
||||
{
|
||||
int i;
|
||||
int i;
|
||||
|
||||
/*
|
||||
* Calculate the size of this chunk
|
||||
@@ -1506,7 +1511,7 @@ toast_save_datum(Relation rel, Datum value,
|
||||
heap_insert(toastrel, toasttup, mycid, options, NULL);
|
||||
|
||||
/*
|
||||
* Create the index entry. We cheat a little here by not using
|
||||
* Create the index entry. We cheat a little here by not using
|
||||
* FormIndexDatum: this relies on the knowledge that the index columns
|
||||
* are the same as the initial columns of the table for all the
|
||||
* indexes.
|
||||
@@ -1656,8 +1661,8 @@ toastrel_valueid_exists(Relation toastrel, Oid valueid)
|
||||
* Is there any such chunk?
|
||||
*/
|
||||
toastscan = systable_beginscan(toastrel,
|
||||
RelationGetRelid(toastidxs[validIndex]),
|
||||
true, SnapshotToast, 1, &toastkey);
|
||||
RelationGetRelid(toastidxs[validIndex]),
|
||||
true, SnapshotToast, 1, &toastkey);
|
||||
|
||||
if (systable_getnext(toastscan) != NULL)
|
||||
result = true;
|
||||
@@ -2126,7 +2131,8 @@ toast_open_indexes(Relation toastrel,
|
||||
/* Fetch the first valid index in list */
|
||||
for (i = 0; i < *num_indexes; i++)
|
||||
{
|
||||
Relation toastidx = (*toastidxs)[i];
|
||||
Relation toastidx = (*toastidxs)[i];
|
||||
|
||||
if (toastidx->rd_index->indisvalid)
|
||||
{
|
||||
res = i;
|
||||
@@ -2136,14 +2142,14 @@ toast_open_indexes(Relation toastrel,
|
||||
}
|
||||
|
||||
/*
|
||||
* Free index list, not necessary anymore as relations are opened
|
||||
* and a valid index has been found.
|
||||
* Free index list, not necessary anymore as relations are opened and a
|
||||
* valid index has been found.
|
||||
*/
|
||||
list_free(indexlist);
|
||||
|
||||
/*
|
||||
* The toast relation should have one valid index, so something is
|
||||
* going wrong if there is nothing.
|
||||
* The toast relation should have one valid index, so something is going
|
||||
* wrong if there is nothing.
|
||||
*/
|
||||
if (!found)
|
||||
elog(ERROR, "no valid index found for toast relation with Oid %d",
|
||||
@@ -2161,7 +2167,7 @@ toast_open_indexes(Relation toastrel,
|
||||
static void
|
||||
toast_close_indexes(Relation *toastidxs, int num_indexes, LOCKMODE lock)
|
||||
{
|
||||
int i;
|
||||
int i;
|
||||
|
||||
/* Close relations and clean up things */
|
||||
for (i = 0; i < num_indexes; i++)
|
||||
|
||||
@@ -27,7 +27,7 @@
|
||||
* the sense that we make sure that whenever a bit is set, we know the
|
||||
* condition is true, but if a bit is not set, it might or might not be true.
|
||||
*
|
||||
* Clearing a visibility map bit is not separately WAL-logged. The callers
|
||||
* Clearing a visibility map bit is not separately WAL-logged. The callers
|
||||
* must make sure that whenever a bit is cleared, the bit is cleared on WAL
|
||||
* replay of the updating operation as well.
|
||||
*
|
||||
@@ -36,9 +36,9 @@
|
||||
* it may still be the case that every tuple on the page is visible to all
|
||||
* transactions; we just don't know that for certain. The difficulty is that
|
||||
* there are two bits which are typically set together: the PD_ALL_VISIBLE bit
|
||||
* on the page itself, and the visibility map bit. If a crash occurs after the
|
||||
* on the page itself, and the visibility map bit. If a crash occurs after the
|
||||
* visibility map page makes it to disk and before the updated heap page makes
|
||||
* it to disk, redo must set the bit on the heap page. Otherwise, the next
|
||||
* it to disk, redo must set the bit on the heap page. Otherwise, the next
|
||||
* insert, update, or delete on the heap page will fail to realize that the
|
||||
* visibility map bit must be cleared, possibly causing index-only scans to
|
||||
* return wrong answers.
|
||||
@@ -59,10 +59,10 @@
|
||||
* the buffer lock over any I/O that may be required to read in the visibility
|
||||
* map page. To avoid this, we examine the heap page before locking it;
|
||||
* if the page-level PD_ALL_VISIBLE bit is set, we pin the visibility map
|
||||
* bit. Then, we lock the buffer. But this creates a race condition: there
|
||||
* bit. Then, we lock the buffer. But this creates a race condition: there
|
||||
* is a possibility that in the time it takes to lock the buffer, the
|
||||
* PD_ALL_VISIBLE bit gets set. If that happens, we have to unlock the
|
||||
* buffer, pin the visibility map page, and relock the buffer. This shouldn't
|
||||
* buffer, pin the visibility map page, and relock the buffer. This shouldn't
|
||||
* happen often, because only VACUUM currently sets visibility map bits,
|
||||
* and the race will only occur if VACUUM processes a given page at almost
|
||||
* exactly the same time that someone tries to further modify it.
|
||||
@@ -227,9 +227,9 @@ visibilitymap_pin_ok(BlockNumber heapBlk, Buffer buf)
|
||||
* visibilitymap_set - set a bit on a previously pinned page
|
||||
*
|
||||
* recptr is the LSN of the XLOG record we're replaying, if we're in recovery,
|
||||
* or InvalidXLogRecPtr in normal running. The page LSN is advanced to the
|
||||
* or InvalidXLogRecPtr in normal running. The page LSN is advanced to the
|
||||
* one provided; in normal running, we generate a new XLOG record and set the
|
||||
* page LSN to that value. cutoff_xid is the largest xmin on the page being
|
||||
* page LSN to that value. cutoff_xid is the largest xmin on the page being
|
||||
* marked all-visible; it is needed for Hot Standby, and can be
|
||||
* InvalidTransactionId if the page contains no tuples.
|
||||
*
|
||||
@@ -320,10 +320,10 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
|
||||
* releasing *buf after it's done testing and setting bits.
|
||||
*
|
||||
* NOTE: This function is typically called without a lock on the heap page,
|
||||
* so somebody else could change the bit just after we look at it. In fact,
|
||||
* so somebody else could change the bit just after we look at it. In fact,
|
||||
* since we don't lock the visibility map page either, it's even possible that
|
||||
* someone else could have changed the bit just before we look at it, but yet
|
||||
* we might see the old value. It is the caller's responsibility to deal with
|
||||
* we might see the old value. It is the caller's responsibility to deal with
|
||||
* all concurrency issues!
|
||||
*/
|
||||
bool
|
||||
@@ -526,7 +526,7 @@ vm_readbuf(Relation rel, BlockNumber blkno, bool extend)
|
||||
|
||||
/*
|
||||
* We might not have opened the relation at the smgr level yet, or we
|
||||
* might have been forced to close it by a sinval message. The code below
|
||||
* might have been forced to close it by a sinval message. The code below
|
||||
* won't necessarily notice relation extension immediately when extend =
|
||||
* false, so we rely on sinval messages to ensure that our ideas about the
|
||||
* size of the map aren't too far out of date.
|
||||
|
||||
@@ -45,7 +45,7 @@
|
||||
*
|
||||
* At the end of a scan, the AM's endscan routine undoes the locking,
|
||||
* but does *not* call IndexScanEnd --- the higher-level index_endscan
|
||||
* routine does that. (We can't do it in the AM because index_endscan
|
||||
* routine does that. (We can't do it in the AM because index_endscan
|
||||
* still needs to touch the IndexScanDesc after calling the AM.)
|
||||
*
|
||||
* Because of this, the AM does not have a choice whether to call
|
||||
@@ -79,7 +79,7 @@ RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys)
|
||||
|
||||
scan->heapRelation = NULL; /* may be set later */
|
||||
scan->indexRelation = indexRelation;
|
||||
scan->xs_snapshot = InvalidSnapshot; /* caller must initialize this */
|
||||
scan->xs_snapshot = InvalidSnapshot; /* caller must initialize this */
|
||||
scan->numberOfKeys = nkeys;
|
||||
scan->numberOfOrderBys = norderbys;
|
||||
|
||||
@@ -188,7 +188,7 @@ BuildIndexValueDescription(Relation indexRelation,
|
||||
* at rd_opcintype not the index tupdesc.
|
||||
*
|
||||
* Note: this is a bit shaky for opclasses that have pseudotype
|
||||
* input types such as ANYARRAY or RECORD. Currently, the
|
||||
* input types such as ANYARRAY or RECORD. Currently, the
|
||||
* typoutput functions associated with the pseudotypes will work
|
||||
* okay, but we might have to try harder in future.
|
||||
*/
|
||||
@@ -269,7 +269,7 @@ systable_beginscan(Relation heapRelation,
|
||||
|
||||
if (snapshot == NULL)
|
||||
{
|
||||
Oid relid = RelationGetRelid(heapRelation);
|
||||
Oid relid = RelationGetRelid(heapRelation);
|
||||
|
||||
snapshot = RegisterSnapshot(GetCatalogSnapshot(relid));
|
||||
sysscan->snapshot = snapshot;
|
||||
@@ -442,7 +442,7 @@ systable_endscan(SysScanDesc sysscan)
|
||||
* index order. Also, for largely historical reasons, the index to use
|
||||
* is opened and locked by the caller, not here.
|
||||
*
|
||||
* Currently we do not support non-index-based scans here. (In principle
|
||||
* Currently we do not support non-index-based scans here. (In principle
|
||||
* we could do a heapscan and sort, but the uses are in places that
|
||||
* probably don't need to still work with corrupted catalog indexes.)
|
||||
* For the moment, therefore, these functions are merely the thinnest of
|
||||
@@ -475,7 +475,7 @@ systable_beginscan_ordered(Relation heapRelation,
|
||||
|
||||
if (snapshot == NULL)
|
||||
{
|
||||
Oid relid = RelationGetRelid(heapRelation);
|
||||
Oid relid = RelationGetRelid(heapRelation);
|
||||
|
||||
snapshot = RegisterSnapshot(GetCatalogSnapshot(relid));
|
||||
sysscan->snapshot = snapshot;
|
||||
|
||||
@@ -84,7 +84,7 @@
|
||||
*
|
||||
* Note: the ReindexIsProcessingIndex() check in RELATION_CHECKS is there
|
||||
* to check that we don't try to scan or do retail insertions into an index
|
||||
* that is currently being rebuilt or pending rebuild. This helps to catch
|
||||
* that is currently being rebuilt or pending rebuild. This helps to catch
|
||||
* things that don't work when reindexing system catalogs. The assertion
|
||||
* doesn't prevent the actual rebuild because we don't use RELATION_CHECKS
|
||||
* when calling the index AM's ambuild routine, and there is no reason for
|
||||
@@ -149,7 +149,7 @@ static IndexScanDesc index_beginscan_internal(Relation indexRelation,
|
||||
* index_open - open an index relation by relation OID
|
||||
*
|
||||
* If lockmode is not "NoLock", the specified kind of lock is
|
||||
* obtained on the index. (Generally, NoLock should only be
|
||||
* obtained on the index. (Generally, NoLock should only be
|
||||
* used if the caller knows it has some appropriate lock on the
|
||||
* index already.)
|
||||
*
|
||||
@@ -414,7 +414,7 @@ index_markpos(IndexScanDesc scan)
|
||||
* returnable tuple in each HOT chain, and so restoring the prior state at the
|
||||
* granularity of the index AM is sufficient. Since the only current user
|
||||
* of mark/restore functionality is nodeMergejoin.c, this effectively means
|
||||
* that merge-join plans only work for MVCC snapshots. This could be fixed
|
||||
* that merge-join plans only work for MVCC snapshots. This could be fixed
|
||||
* if necessary, but for now it seems unimportant.
|
||||
* ----------------
|
||||
*/
|
||||
@@ -553,7 +553,7 @@ index_fetch_heap(IndexScanDesc scan)
|
||||
/*
|
||||
* If we scanned a whole HOT chain and found only dead tuples, tell index
|
||||
* AM to kill its entry for that TID (this will take effect in the next
|
||||
* amgettuple call, in index_getnext_tid). We do not do this when in
|
||||
* amgettuple call, in index_getnext_tid). We do not do this when in
|
||||
* recovery because it may violate MVCC to do so. See comments in
|
||||
* RelationGetIndexScan().
|
||||
*/
|
||||
@@ -590,7 +590,7 @@ index_getnext(IndexScanDesc scan, ScanDirection direction)
|
||||
{
|
||||
/*
|
||||
* We are resuming scan of a HOT chain after having returned an
|
||||
* earlier member. Must still hold pin on current heap page.
|
||||
* earlier member. Must still hold pin on current heap page.
|
||||
*/
|
||||
Assert(BufferIsValid(scan->xs_cbuf));
|
||||
Assert(ItemPointerGetBlockNumber(&scan->xs_ctup.t_self) ==
|
||||
@@ -760,7 +760,7 @@ index_can_return(Relation indexRelation)
|
||||
* particular indexed attribute are those with both types equal to
|
||||
* the index opclass' opcintype (note that this is subtly different
|
||||
* from the indexed attribute's own type: it may be a binary-compatible
|
||||
* type instead). Only the default functions are stored in relcache
|
||||
* type instead). Only the default functions are stored in relcache
|
||||
* entries --- access methods can use the syscache to look up non-default
|
||||
* functions.
|
||||
*
|
||||
@@ -794,7 +794,7 @@ index_getprocid(Relation irel,
|
||||
* index_getprocinfo
|
||||
*
|
||||
* This routine allows index AMs to keep fmgr lookup info for
|
||||
* support procs in the relcache. As above, only the "default"
|
||||
* support procs in the relcache. As above, only the "default"
|
||||
* functions for any particular indexed attribute are cached.
|
||||
*
|
||||
* Note: the return value points into cached data that will be lost during
|
||||
|
||||
@@ -25,7 +25,7 @@
|
||||
* Although any negative int32 (except INT_MIN) is acceptable for reporting
|
||||
* "<", and any positive int32 is acceptable for reporting ">", routines
|
||||
* that work on 32-bit or wider datatypes can't just return "a - b".
|
||||
* That could overflow and give the wrong answer. Also, one must not
|
||||
* That could overflow and give the wrong answer. Also, one must not
|
||||
* return INT_MIN to report "<", since some callers will negate the result.
|
||||
*
|
||||
* NOTE: it is critical that the comparison function impose a total order
|
||||
|
||||
@@ -90,7 +90,7 @@ static void _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel);
|
||||
* By here, itup is filled in, including the TID.
|
||||
*
|
||||
* If checkUnique is UNIQUE_CHECK_NO or UNIQUE_CHECK_PARTIAL, this
|
||||
* will allow duplicates. Otherwise (UNIQUE_CHECK_YES or
|
||||
* will allow duplicates. Otherwise (UNIQUE_CHECK_YES or
|
||||
* UNIQUE_CHECK_EXISTING) it will throw error for a duplicate.
|
||||
* For UNIQUE_CHECK_EXISTING we merely run the duplicate check, and
|
||||
* don't actually insert.
|
||||
@@ -129,7 +129,7 @@ top:
|
||||
* If the page was split between the time that we surrendered our read
|
||||
* lock and acquired our write lock, then this page may no longer be the
|
||||
* right place for the key we want to insert. In this case, we need to
|
||||
* move right in the tree. See Lehman and Yao for an excruciatingly
|
||||
* move right in the tree. See Lehman and Yao for an excruciatingly
|
||||
* precise description.
|
||||
*/
|
||||
buf = _bt_moveright(rel, buf, natts, itup_scankey, false,
|
||||
@@ -211,7 +211,7 @@ top:
|
||||
* is the first tuple on the next page.
|
||||
*
|
||||
* Returns InvalidTransactionId if there is no conflict, else an xact ID
|
||||
* we must wait for to see if it commits a conflicting tuple. If an actual
|
||||
* we must wait for to see if it commits a conflicting tuple. If an actual
|
||||
* conflict is detected, no return --- just ereport().
|
||||
*
|
||||
* However, if checkUnique == UNIQUE_CHECK_PARTIAL, we always return
|
||||
@@ -293,7 +293,7 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
|
||||
|
||||
/*
|
||||
* If we are doing a recheck, we expect to find the tuple we
|
||||
* are rechecking. It's not a duplicate, but we have to keep
|
||||
* are rechecking. It's not a duplicate, but we have to keep
|
||||
* scanning.
|
||||
*/
|
||||
if (checkUnique == UNIQUE_CHECK_EXISTING &&
|
||||
@@ -482,7 +482,7 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
|
||||
* If the new key is equal to one or more existing keys, we can
|
||||
* legitimately place it anywhere in the series of equal keys --- in fact,
|
||||
* if the new key is equal to the page's "high key" we can place it on
|
||||
* the next page. If it is equal to the high key, and there's not room
|
||||
* the next page. If it is equal to the high key, and there's not room
|
||||
* to insert the new tuple on the current page without splitting, then
|
||||
* we can move right hoping to find more free space and avoid a split.
|
||||
* (We should not move right indefinitely, however, since that leads to
|
||||
@@ -494,7 +494,7 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel,
|
||||
* removing any LP_DEAD tuples.
|
||||
*
|
||||
* On entry, *buf and *offsetptr point to the first legal position
|
||||
* where the new tuple could be inserted. The caller should hold an
|
||||
* where the new tuple could be inserted. The caller should hold an
|
||||
* exclusive lock on *buf. *offsetptr can also be set to
|
||||
* InvalidOffsetNumber, in which case the function will search for the
|
||||
* right location within the page if needed. On exit, they point to the
|
||||
@@ -564,7 +564,7 @@ _bt_findinsertloc(Relation rel,
|
||||
* on every insert. We implement "get tired" as a random choice,
|
||||
* since stopping after scanning a fixed number of pages wouldn't work
|
||||
* well (we'd never reach the right-hand side of previously split
|
||||
* pages). Currently the probability of moving right is set at 0.99,
|
||||
* pages). Currently the probability of moving right is set at 0.99,
|
||||
* which may seem too high to change the behavior much, but it does an
|
||||
* excellent job of preventing O(N^2) behavior with many equal keys.
|
||||
*----------
|
||||
@@ -574,7 +574,7 @@ _bt_findinsertloc(Relation rel,
|
||||
while (PageGetFreeSpace(page) < itemsz)
|
||||
{
|
||||
Buffer rbuf;
|
||||
BlockNumber rblkno;
|
||||
BlockNumber rblkno;
|
||||
|
||||
/*
|
||||
* before considering moving right, see if we can obtain enough space
|
||||
@@ -620,10 +620,10 @@ _bt_findinsertloc(Relation rel,
|
||||
lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
|
||||
|
||||
/*
|
||||
* If this page was incompletely split, finish the split now.
|
||||
* We do this while holding a lock on the left sibling, which
|
||||
* is not good because finishing the split could be a fairly
|
||||
* lengthy operation. But this should happen very seldom.
|
||||
* If this page was incompletely split, finish the split now. We
|
||||
* do this while holding a lock on the left sibling, which is not
|
||||
* good because finishing the split could be a fairly lengthy
|
||||
* operation. But this should happen very seldom.
|
||||
*/
|
||||
if (P_INCOMPLETE_SPLIT(lpageop))
|
||||
{
|
||||
@@ -681,7 +681,7 @@ _bt_findinsertloc(Relation rel,
|
||||
* + updates the metapage if a true root or fast root is split.
|
||||
*
|
||||
* On entry, we must have the correct buffer in which to do the
|
||||
* insertion, and the buffer must be pinned and write-locked. On return,
|
||||
* insertion, and the buffer must be pinned and write-locked. On return,
|
||||
* we will have dropped both the pin and the lock on the buffer.
|
||||
*
|
||||
* When inserting to a non-leaf page, 'cbuf' is the left-sibling of the
|
||||
@@ -978,7 +978,7 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
|
||||
* origpage is the original page to be split. leftpage is a temporary
|
||||
* buffer that receives the left-sibling data, which will be copied back
|
||||
* into origpage on success. rightpage is the new page that receives the
|
||||
* right-sibling data. If we fail before reaching the critical section,
|
||||
* right-sibling data. If we fail before reaching the critical section,
|
||||
* origpage hasn't been modified and leftpage is only workspace. In
|
||||
* principle we shouldn't need to worry about rightpage either, because it
|
||||
* hasn't been linked into the btree page structure; but to avoid leaving
|
||||
@@ -1196,7 +1196,7 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
|
||||
* page. If you're confused, imagine that page A splits to A B and
|
||||
* then again, yielding A C B, while vacuum is in progress. Tuples
|
||||
* originally in A could now be in either B or C, hence vacuum must
|
||||
* examine both pages. But if D, our right sibling, has a different
|
||||
* examine both pages. But if D, our right sibling, has a different
|
||||
* cycleid then it could not contain any tuples that were in A when
|
||||
* the vacuum started.
|
||||
*/
|
||||
@@ -1330,11 +1330,10 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
|
||||
lastrdata++;
|
||||
|
||||
/*
|
||||
* Although we don't need to WAL-log anything on the left page,
|
||||
* we still need XLogInsert to consider storing a full-page image
|
||||
* of the left page, so make an empty entry referencing that
|
||||
* buffer. This also ensures that the left page is always backup
|
||||
* block 1.
|
||||
* Although we don't need to WAL-log anything on the left page, we
|
||||
* still need XLogInsert to consider storing a full-page image of
|
||||
* the left page, so make an empty entry referencing that buffer.
|
||||
* This also ensures that the left page is always backup block 1.
|
||||
*/
|
||||
lastrdata->data = NULL;
|
||||
lastrdata->len = 0;
|
||||
@@ -1448,7 +1447,7 @@ _bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
*
* We return the index of the first existing tuple that should go on the
* righthand page, plus a boolean indicating whether the new tuple goes on
* the left or right page. The bool is necessary to disambiguate the case
* the left or right page. The bool is necessary to disambiguate the case
* where firstright == newitemoff.
*/
static OffsetNumber
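To make the two-part return value described above concrete, here is a hedged sketch (the struct and field names are illustrative, not the function's actual interface; OffsetNumber and bool are assumed from the backend headers):

/*
 * When firstright == newitemoff, the offset alone cannot say whether the
 * new tuple becomes the first item of the right page or the last item of
 * the left page, hence the separate flag.
 */
typedef struct SplitChoice
{
    OffsetNumber firstright;     /* first pre-existing tuple that goes right */
    bool         newitemonleft;  /* does the new tuple go on the left page?  */
} SplitChoice;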
@@ -1684,7 +1683,7 @@ _bt_checksplitloc(FindSplitData *state,
|
||||
*
|
||||
* On entry, buf and rbuf are the left and right split pages, which we
|
||||
* still hold write locks on per the L&Y algorithm. We release the
|
||||
* write locks once we have write lock on the parent page. (Any sooner,
|
||||
* write locks once we have write lock on the parent page. (Any sooner,
|
||||
* and it'd be possible for some other process to try to split or delete
|
||||
* one of these pages, and get confused because it cannot find the downlink.)
|
||||
*
|
||||
@@ -1705,7 +1704,7 @@ _bt_insert_parent(Relation rel,
|
||||
* Here we have to do something Lehman and Yao don't talk about: deal with
|
||||
* a root split and construction of a new root. If our stack is empty
|
||||
* then we have just split a node on what had been the root level when we
|
||||
* descended the tree. If it was still the root then we perform a
|
||||
* descended the tree. If it was still the root then we perform a
|
||||
* new-root construction. If it *wasn't* the root anymore, search to find
|
||||
* the next higher level that someone constructed meanwhile, and find the
|
||||
* right place to insert as for the normal case.
|
||||
@@ -1917,7 +1916,7 @@ _bt_getstackbuf(Relation rel, BTStack stack, int access)
|
||||
/*
|
||||
* These loops will check every item on the page --- but in an
|
||||
* order that's attuned to the probability of where it actually
|
||||
* is. Scan to the right first, then to the left.
|
||||
* is. Scan to the right first, then to the left.
|
||||
*/
|
||||
for (offnum = start;
|
||||
offnum <= maxoff;
|
||||
|
||||
@@ -12,7 +12,7 @@
* src/backend/access/nbtree/nbtpage.c
*
* NOTES
* Postgres btree pages look like ordinary relation pages. The opaque
* Postgres btree pages look like ordinary relation pages. The opaque
* data at high addresses includes pointers to left and right siblings
* and flag data describing page state. The first page in a btree, page
* zero, is special -- it stores meta-information describing the tree.
@@ -36,7 +36,7 @@ static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf,
|
||||
static bool _bt_lock_branch_parent(Relation rel, BlockNumber child,
|
||||
BTStack stack, Buffer *topparent, OffsetNumber *topoff,
|
||||
BlockNumber *target, BlockNumber *rightsib);
|
||||
static void _bt_log_reuse_page(Relation rel, BlockNumber blkno,
|
||||
static void _bt_log_reuse_page(Relation rel, BlockNumber blkno,
|
||||
TransactionId latestRemovedXid);
|
||||
|
||||
/*
|
||||
@@ -62,7 +62,7 @@ _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level)
|
||||
metaopaque->btpo_flags = BTP_META;
|
||||
|
||||
/*
|
||||
* Set pd_lower just past the end of the metadata. This is not essential
|
||||
* Set pd_lower just past the end of the metadata. This is not essential
|
||||
* but it makes the page look compressible to xlog.c.
|
||||
*/
|
||||
((PageHeader) page)->pd_lower =
|
||||
@@ -80,7 +80,7 @@ _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level)
|
||||
*
|
||||
* The access type parameter (BT_READ or BT_WRITE) controls whether
|
||||
* a new root page will be created or not. If access = BT_READ,
|
||||
* and no root page exists, we just return InvalidBuffer. For
|
||||
* and no root page exists, we just return InvalidBuffer. For
|
||||
* BT_WRITE, we try to create the root page if it doesn't exist.
|
||||
* NOTE that the returned root page will have only a read lock set
|
||||
* on it even if access = BT_WRITE!
|
||||
@@ -197,7 +197,7 @@ _bt_getroot(Relation rel, int access)
|
||||
/*
|
||||
* Metadata initialized by someone else. In order to guarantee no
|
||||
* deadlocks, we have to release the metadata page and start all
|
||||
* over again. (Is that really true? But it's hardly worth trying
|
||||
* over again. (Is that really true? But it's hardly worth trying
|
||||
* to optimize this case.)
|
||||
*/
|
||||
_bt_relbuf(rel, metabuf);
|
||||
@@ -254,7 +254,7 @@ _bt_getroot(Relation rel, int access)
|
||||
END_CRIT_SECTION();
|
||||
|
||||
/*
|
||||
* swap root write lock for read lock. There is no danger of anyone
|
||||
* swap root write lock for read lock. There is no danger of anyone
|
||||
* else accessing the new root page while it's unlocked, since no one
|
||||
* else knows where it is yet.
|
||||
*/
|
||||
@@ -322,7 +322,7 @@ _bt_getroot(Relation rel, int access)
|
||||
* By the time we acquire lock on the root page, it might have been split and
|
||||
* not be the true root anymore. This is okay for the present uses of this
|
||||
* routine; we only really need to be able to move up at least one tree level
|
||||
* from whatever non-root page we were at. If we ever do need to lock the
|
||||
* from whatever non-root page we were at. If we ever do need to lock the
|
||||
* one true root page, we could loop here, re-reading the metapage on each
|
||||
* failure. (Note that it wouldn't do to hold the lock on the metapage while
|
||||
* moving to the root --- that'd deadlock against any concurrent root split.)
|
||||
@@ -497,7 +497,7 @@ _bt_checkpage(Relation rel, Buffer buf)
|
||||
/*
|
||||
* ReadBuffer verifies that every newly-read page passes
|
||||
* PageHeaderIsValid, which means it either contains a reasonably sane
|
||||
* page header or is all-zero. We have to defend against the all-zero
|
||||
* page header or is all-zero. We have to defend against the all-zero
|
||||
* case, however.
|
||||
*/
|
||||
if (PageIsNew(page))
|
||||
@@ -564,7 +564,7 @@ _bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedX
|
||||
/*
|
||||
* _bt_getbuf() -- Get a buffer by block number for read or write.
|
||||
*
|
||||
* blkno == P_NEW means to get an unallocated index page. The page
|
||||
* blkno == P_NEW means to get an unallocated index page. The page
|
||||
* will be initialized before returning it.
|
||||
*
|
||||
* When this routine returns, the appropriate lock is set on the
|
||||
@@ -595,7 +595,7 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access)
|
||||
* First see if the FSM knows of any free pages.
|
||||
*
|
||||
* We can't trust the FSM's report unreservedly; we have to check that
|
||||
* the page is still free. (For example, an already-free page could
|
||||
* the page is still free. (For example, an already-free page could
|
||||
* have been re-used between the time the last VACUUM scanned it and
|
||||
* the time the VACUUM made its FSM updates.)
|
||||
*
|
||||
@@ -774,7 +774,7 @@ _bt_page_recyclable(Page page)
|
||||
/*
|
||||
* Delete item(s) from a btree page during VACUUM.
|
||||
*
|
||||
* This must only be used for deleting leaf items. Deleting an item on a
|
||||
* This must only be used for deleting leaf items. Deleting an item on a
|
||||
* non-leaf page has to be done as part of an atomic action that includes
|
||||
* deleting the page it points to.
|
||||
*
|
||||
@@ -842,7 +842,7 @@ _bt_delitems_vacuum(Relation rel, Buffer buf,
|
||||
|
||||
/*
|
||||
* The target-offsets array is not in the buffer, but pretend that it
|
||||
* is. When XLogInsert stores the whole buffer, the offsets array
|
||||
* is. When XLogInsert stores the whole buffer, the offsets array
|
||||
* need not be stored too.
|
||||
*/
|
||||
if (nitems > 0)
|
||||
@@ -1049,11 +1049,12 @@ _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack,
|
||||
lbuf = _bt_getbuf(rel, leftsib, BT_READ);
|
||||
lpage = BufferGetPage(lbuf);
|
||||
lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
|
||||
|
||||
/*
|
||||
* If the left sibling was concurrently split, so that its
|
||||
* next-pointer doesn't point to the current page anymore,
|
||||
* the split that created the current page must be completed.
|
||||
* (We don't allow splitting an incompletely split page again
|
||||
* next-pointer doesn't point to the current page anymore, the
|
||||
* split that created the current page must be completed. (We
|
||||
* don't allow splitting an incompletely split page again
|
||||
* until the previous split has been completed)
|
||||
*/
|
||||
if (lopaque->btpo_next == parent &&
|
||||
@@ -1066,7 +1067,7 @@ _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack,
|
||||
}
|
||||
|
||||
return _bt_lock_branch_parent(rel, parent, stack->bts_parent,
|
||||
topparent, topoff, target, rightsib);
|
||||
topparent, topoff, target, rightsib);
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -1112,6 +1113,7 @@ _bt_pagedel(Relation rel, Buffer buf)
|
||||
bool rightsib_empty;
|
||||
Page page;
|
||||
BTPageOpaque opaque;
|
||||
|
||||
/*
|
||||
* "stack" is a search stack leading (approximately) to the target page.
|
||||
* It is initially NULL, but when iterating, we keep it to avoid
|
||||
@@ -1140,24 +1142,24 @@ _bt_pagedel(Relation rel, Buffer buf)
|
||||
* was never supposed to leave half-dead pages in the tree, it was
|
||||
* just a transient state, but it was nevertheless possible in
|
||||
* error scenarios. We don't know how to deal with them here. They
|
||||
* are harmless as far as searches are considered, but inserts into
|
||||
* the deleted keyspace could add out-of-order downlinks in the
|
||||
* upper levels. Log a notice, hopefully the admin will notice and
|
||||
* reindex.
|
||||
* are harmless as far as searches are considered, but inserts
|
||||
* into the deleted keyspace could add out-of-order downlinks in
|
||||
* the upper levels. Log a notice, hopefully the admin will notice
|
||||
* and reindex.
|
||||
*/
|
||||
if (P_ISHALFDEAD(opaque))
ereport(LOG,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("index \"%s\" contains a half-dead internal page",
RelationGetRelationName(rel)),
errmsg("index \"%s\" contains a half-dead internal page",
RelationGetRelationName(rel)),
errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));
_bt_relbuf(rel, buf);
return ndeleted;
}
|
||||
/*
|
||||
* We can never delete rightmost pages nor root pages. While at
|
||||
* it, check that page is not already deleted and is empty.
|
||||
* We can never delete rightmost pages nor root pages. While at it,
|
||||
* check that page is not already deleted and is empty.
|
||||
*
|
||||
* To keep the algorithm simple, we also never delete an incompletely
|
||||
* split page (they should be rare enough that this doesn't make any
|
||||
@@ -1167,10 +1169,10 @@ _bt_pagedel(Relation rel, Buffer buf)
|
||||
* left half of an incomplete split, but ensuring that it's not the
|
||||
* right half is more complicated. For that, we have to check that
|
||||
* the left sibling doesn't have its INCOMPLETE_SPLIT flag set. On
|
||||
* the first iteration, we temporarily release the lock on the
|
||||
* current page, and check the left sibling and also construct a
|
||||
* search stack to. On subsequent iterations, we know we stepped right
|
||||
* from a page that passed these tests, so it's OK.
|
||||
* the first iteration, we temporarily release the lock on the current
|
||||
* page, and check the left sibling and also construct a search stack
|
||||
* to. On subsequent iterations, we know we stepped right from a page
|
||||
* that passed these tests, so it's OK.
|
||||
*/
|
||||
if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) ||
|
||||
P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
|
||||
@@ -1184,9 +1186,9 @@ _bt_pagedel(Relation rel, Buffer buf)
|
||||
}
|
||||
|
||||
/*
|
||||
* First, remove downlink pointing to the page (or a parent of the page,
|
||||
* if we are going to delete a taller branch), and mark the page as
|
||||
* half-dead.
|
||||
* First, remove downlink pointing to the page (or a parent of the
|
||||
* page, if we are going to delete a taller branch), and mark the page
|
||||
* as half-dead.
|
||||
*/
|
||||
if (!P_ISHALFDEAD(opaque))
|
||||
{
|
||||
@@ -1205,7 +1207,7 @@ _bt_pagedel(Relation rel, Buffer buf)
|
||||
ItemId itemid;
|
||||
IndexTuple targetkey;
|
||||
Buffer lbuf;
|
||||
BlockNumber leftsib;
|
||||
BlockNumber leftsib;
|
||||
|
||||
itemid = PageGetItemId(page, P_HIKEY);
|
||||
targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid));
|
||||
@@ -1219,9 +1221,9 @@ _bt_pagedel(Relation rel, Buffer buf)
|
||||
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
|
||||
|
||||
/*
|
||||
* Fetch the left sibling, to check that it's not marked
|
||||
* with INCOMPLETE_SPLIT flag. That would mean that the
|
||||
* page to-be-deleted doesn't have a downlink, and the page
|
||||
* Fetch the left sibling, to check that it's not marked with
|
||||
* INCOMPLETE_SPLIT flag. That would mean that the page
|
||||
* to-be-deleted doesn't have a downlink, and the page
|
||||
* deletion algorithm isn't prepared to handle that.
|
||||
*/
|
||||
if (!P_LEFTMOST(opaque))
|
||||
@@ -1267,7 +1269,7 @@ _bt_pagedel(Relation rel, Buffer buf)
|
||||
|
||||
/*
|
||||
* Then unlink it from its siblings. Each call to
|
||||
*_bt_unlink_halfdead_page unlinks the topmost page from the branch,
|
||||
* _bt_unlink_halfdead_page unlinks the topmost page from the branch,
|
||||
* making it shallower. Iterate until the leaf page is gone.
|
||||
*/
|
||||
rightsib_empty = false;
|
||||
@@ -1291,8 +1293,8 @@ _bt_pagedel(Relation rel, Buffer buf)
|
||||
* is that it was the rightmost child of the parent. Now that we
|
||||
* removed the downlink for this page, the right sibling might now be
|
||||
* the only child of the parent, and could be removed. It would be
|
||||
* picked up by the next vacuum anyway, but might as well try to remove
|
||||
* it now, so loop back to process the right sibling.
|
||||
* picked up by the next vacuum anyway, but might as well try to
|
||||
* remove it now, so loop back to process the right sibling.
|
||||
*/
|
||||
if (!rightsib_empty)
|
||||
break;
|
||||
@@ -1310,9 +1312,9 @@ _bt_pagedel(Relation rel, Buffer buf)
|
||||
static bool
|
||||
_bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
|
||||
{
|
||||
BlockNumber leafblkno;
|
||||
BlockNumber leafblkno;
|
||||
BlockNumber leafrightsib;
|
||||
BlockNumber target;
|
||||
BlockNumber target;
|
||||
BlockNumber rightsib;
|
||||
ItemId itemid;
|
||||
Page page;
|
||||
@@ -1351,7 +1353,7 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
|
||||
|
||||
/*
|
||||
* Check that the parent-page index items we're about to delete/overwrite
|
||||
* contain what we expect. This can fail if the index has become corrupt
|
||||
* contain what we expect. This can fail if the index has become corrupt
|
||||
* for some reason. We want to throw any error before entering the
|
||||
* critical section --- otherwise it'd be a PANIC.
|
||||
*
|
||||
@@ -1490,9 +1492,9 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
|
||||
BlockNumber leafblkno = BufferGetBlockNumber(leafbuf);
|
||||
BlockNumber leafleftsib;
|
||||
BlockNumber leafrightsib;
|
||||
BlockNumber target;
|
||||
BlockNumber leftsib;
|
||||
BlockNumber rightsib;
|
||||
BlockNumber target;
|
||||
BlockNumber leftsib;
|
||||
BlockNumber rightsib;
|
||||
Buffer lbuf = InvalidBuffer;
|
||||
Buffer buf;
|
||||
Buffer rbuf;
|
||||
@@ -1506,7 +1508,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
|
||||
int targetlevel;
|
||||
ItemPointer leafhikey;
|
||||
BlockNumber nextchild;
|
||||
BlockNumber topblkno;
|
||||
BlockNumber topblkno;
|
||||
|
||||
page = BufferGetPage(leafbuf);
|
||||
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
||||
@@ -1596,7 +1598,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
|
||||
lbuf = InvalidBuffer;
|
||||
|
||||
/*
|
||||
* Next write-lock the target page itself. It should be okay to take just
|
||||
* Next write-lock the target page itself. It should be okay to take just
|
||||
* a write lock not a superexclusive lock, since no scans would stop on an
|
||||
* empty page.
|
||||
*/
|
||||
@@ -1605,9 +1607,9 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
|
||||
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
|
||||
|
||||
/*
|
||||
* Check page is still empty etc, else abandon deletion. This is just
|
||||
* for paranoia's sake; a half-dead page cannot resurrect because there
|
||||
* can be only one vacuum process running at a time.
|
||||
* Check page is still empty etc, else abandon deletion. This is just for
|
||||
* paranoia's sake; a half-dead page cannot resurrect because there can be
|
||||
* only one vacuum process running at a time.
|
||||
*/
|
||||
if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque))
|
||||
{
|
||||
@@ -1733,7 +1735,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty)
|
||||
* we're in VACUUM and would not otherwise have an XID. Having already
|
||||
* updated links to the target, ReadNewTransactionId() suffices as an
|
||||
* upper bound. Any scan having retained a now-stale link is advertising
|
||||
* in its PGXACT an xmin less than or equal to the value we read here. It
|
||||
* in its PGXACT an xmin less than or equal to the value we read here. It
|
||||
* will continue to do so, holding back RecentGlobalXmin, for the duration
|
||||
* of that scan.
|
||||
*/
|
||||
|
||||
@@ -208,7 +208,7 @@ btbuildempty(PG_FUNCTION_ARGS)
|
||||
metapage = (Page) palloc(BLCKSZ);
|
||||
_bt_initmetapage(metapage, P_NONE, 0);
|
||||
|
||||
/* Write the page. If archiving/streaming, XLOG it. */
|
||||
/* Write the page. If archiving/streaming, XLOG it. */
|
||||
PageSetChecksumInplace(metapage, BTREE_METAPAGE);
|
||||
smgrwrite(index->rd_smgr, INIT_FORKNUM, BTREE_METAPAGE,
|
||||
(char *) metapage, true);
|
||||
@@ -427,7 +427,7 @@ btbeginscan(PG_FUNCTION_ARGS)
|
||||
|
||||
/*
|
||||
* We don't know yet whether the scan will be index-only, so we do not
|
||||
* allocate the tuple workspace arrays until btrescan. However, we set up
|
||||
* allocate the tuple workspace arrays until btrescan. However, we set up
|
||||
* scan->xs_itupdesc whether we'll need it or not, since that's so cheap.
|
||||
*/
|
||||
so->currTuples = so->markTuples = NULL;
|
||||
@@ -472,7 +472,7 @@ btrescan(PG_FUNCTION_ARGS)
|
||||
|
||||
/*
|
||||
* Allocate tuple workspace arrays, if needed for an index-only scan and
|
||||
* not already done in a previous rescan call. To save on palloc
|
||||
* not already done in a previous rescan call. To save on palloc
|
||||
* overhead, both workspaces are allocated as one palloc block; only this
|
||||
* function and btendscan know that.
|
||||
*
|
||||
@@ -952,7 +952,7 @@ restart:
|
||||
vstate->lastBlockLocked = blkno;
|
||||
|
||||
/*
|
||||
* Check whether we need to recurse back to earlier pages. What we
|
||||
* Check whether we need to recurse back to earlier pages. What we
|
||||
* are concerned about is a page split that happened since we started
|
||||
* the vacuum scan. If the split moved some tuples to a lower page
|
||||
* then we might have missed 'em. If so, set up for tail recursion.
|
||||
|
||||
@@ -50,7 +50,7 @@ static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
|
||||
*
|
||||
* NOTE that the returned buffer is read-locked regardless of the access
|
||||
* parameter. However, access = BT_WRITE will allow an empty root page
|
||||
* to be created and returned. When access = BT_READ, an empty index
|
||||
* to be created and returned. When access = BT_READ, an empty index
|
||||
* will result in *bufP being set to InvalidBuffer. Also, in BT_WRITE mode,
|
||||
* any incomplete splits encountered during the search will be finished.
|
||||
*/
|
||||
@@ -271,7 +271,7 @@ _bt_moveright(Relation rel,
* (or leaf keys > given scankey when nextkey is true).
*
* This procedure is not responsible for walking right, it just examines
* the given page. _bt_binsrch() has no lock or refcount side effects
* the given page. _bt_binsrch() has no lock or refcount side effects
* on the buffer.
*/
OffsetNumber
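As an illustration of the positioning convention mentioned in this hunk (last key < scankey, or <= when nextkey is true), a self-contained sketch over a plain sorted integer array; it is not the backend routine:

#include <stdbool.h>

/*
 * Return the index of the last element < key, or <= key when nextkey is
 * true; -1 means "before the first element".
 */
static int
last_item_before(const int *items, int nitems, int key, bool nextkey)
{
    int low = 0;
    int high = nitems;                 /* search within [low, high) */
    int cmpval = nextkey ? 0 : 1;      /* cmp >= cmpval means "go right" */

    while (low < high)
    {
        int mid = low + (high - low) / 2;
        int cmp = (key > items[mid]) - (key < items[mid]);

        if (cmp >= cmpval)
            low = mid + 1;             /* items[mid] still qualifies */
        else
            high = mid;
    }
    return low - 1;
}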
@@ -403,7 +403,7 @@ _bt_compare(Relation rel,
|
||||
/*
|
||||
* The scan key is set up with the attribute number associated with each
|
||||
* term in the key. It is important that, if the index is multi-key, the
|
||||
* scan contain the first k key attributes, and that they be in order. If
|
||||
* scan contain the first k key attributes, and that they be in order. If
|
||||
* you think about how multi-key ordering works, you'll understand why
|
||||
* this is.
|
||||
*
|
||||
@@ -442,7 +442,7 @@ _bt_compare(Relation rel,
/*
* The sk_func needs to be passed the index value as left arg and
* the sk_argument as right arg (they might be of different
* types). Since it is convenient for callers to think of
* types). Since it is convenient for callers to think of
* _bt_compare as comparing the scankey to the index item, we have
* to flip the sign of the comparison result. (Unless it's a DESC
* column, in which case we *don't* flip the sign.)
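A minimal sketch of the sign handling this hunk describes (illustrative only; the support function's raw result is assumed to be an ordinary <0/0/>0 comparison):

/*
 * cmp is the raw result of calling the support function as
 * support(index_value, key_argument).  Callers want "scankey vs. item",
 * so flip the sign -- except for DESC columns, whose index ordering is
 * already reversed.
 */
static int
scankey_vs_item(int cmp, int is_desc_column)
{
    return is_desc_column ? cmp : -cmp;
}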
@@ -471,7 +471,7 @@ _bt_compare(Relation rel,
|
||||
* _bt_first() -- Find the first item in a scan.
|
||||
*
|
||||
* We need to be clever about the direction of scan, the search
|
||||
* conditions, and the tree ordering. We find the first item (or,
|
||||
* conditions, and the tree ordering. We find the first item (or,
|
||||
* if backwards scan, the last item) in the tree that satisfies the
|
||||
* qualifications in the scan key. On success exit, the page containing
|
||||
* the current index tuple is pinned but not locked, and data about
|
||||
@@ -527,7 +527,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
|
||||
* We want to identify the keys that can be used as starting boundaries;
|
||||
* these are =, >, or >= keys for a forward scan or =, <, <= keys for
|
||||
* a backwards scan. We can use keys for multiple attributes so long as
|
||||
* the prior attributes had only =, >= (resp. =, <=) keys. Once we accept
|
||||
* the prior attributes had only =, >= (resp. =, <=) keys. Once we accept
|
||||
* a > or < boundary or find an attribute with no boundary (which can be
|
||||
* thought of as the same as "> -infinity"), we can't use keys for any
|
||||
* attributes to its right, because it would break our simplistic notion
|
||||
@@ -742,7 +742,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
|
||||
* even if the row comparison is of ">" or "<" type, because the
|
||||
* condition applied to all but the last row member is effectively
|
||||
* ">=" or "<=", and so the extra keys don't break the positioning
|
||||
* scheme. But, by the same token, if we aren't able to use all
|
||||
* scheme. But, by the same token, if we aren't able to use all
|
||||
* the row members, then the part of the row comparison that we
|
||||
* did use has to be treated as just a ">=" or "<=" condition, and
|
||||
* so we'd better adjust strat_total accordingly.
|
||||
@@ -861,7 +861,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
|
||||
|
||||
/*
|
||||
* Find first item >= scankey, then back up one to arrive at last
|
||||
* item < scankey. (Note: this positioning strategy is only used
|
||||
* item < scankey. (Note: this positioning strategy is only used
|
||||
* for a backward scan, so that is always the correct starting
|
||||
* position.)
|
||||
*/
|
||||
@@ -910,7 +910,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
|
||||
case BTGreaterEqualStrategyNumber:
|
||||
|
||||
/*
|
||||
* Find first item >= scankey. (This is only used for forward
|
||||
* Find first item >= scankey. (This is only used for forward
|
||||
* scans.)
|
||||
*/
|
||||
nextkey = false;
|
||||
@@ -988,7 +988,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
|
||||
*
|
||||
* The actually desired starting point is either this item or the prior
|
||||
* one, or in the end-of-page case it's the first item on the next page or
|
||||
* the last item on this page. Adjust the starting offset if needed. (If
|
||||
* the last item on this page. Adjust the starting offset if needed. (If
|
||||
* this results in an offset before the first item or after the last one,
|
||||
* _bt_readpage will report no items found, and then we'll step to the
|
||||
* next page as needed.)
|
||||
@@ -1304,7 +1304,7 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
|
||||
* than the walk-right case because of the possibility that the page
|
||||
* to our left splits while we are in flight to it, plus the
|
||||
* possibility that the page we were on gets deleted after we leave
|
||||
* it. See nbtree/README for details.
|
||||
* it. See nbtree/README for details.
|
||||
*/
|
||||
for (;;)
|
||||
{
|
||||
@@ -1399,7 +1399,7 @@ _bt_walk_left(Relation rel, Buffer buf)
|
||||
* anymore, not that its left sibling got split more than four times.
|
||||
*
|
||||
* Note that it is correct to test P_ISDELETED not P_IGNORE here,
|
||||
* because half-dead pages are still in the sibling chain. Caller
|
||||
* because half-dead pages are still in the sibling chain. Caller
|
||||
* must reject half-dead pages if wanted.
|
||||
*/
|
||||
tries = 0;
|
||||
@@ -1425,7 +1425,7 @@ _bt_walk_left(Relation rel, Buffer buf)
|
||||
if (P_ISDELETED(opaque))
|
||||
{
|
||||
/*
|
||||
* It was deleted. Move right to first nondeleted page (there
|
||||
* It was deleted. Move right to first nondeleted page (there
|
||||
* must be one); that is the page that has acquired the deleted
|
||||
* one's keyspace, so stepping left from it will take us where we
|
||||
* want to be.
|
||||
@@ -1469,7 +1469,7 @@ _bt_walk_left(Relation rel, Buffer buf)
|
||||
* _bt_get_endpoint() -- Find the first or last page on a given tree level
|
||||
*
|
||||
* If the index is empty, we will return InvalidBuffer; any other failure
|
||||
* condition causes ereport(). We will not return a dead page.
|
||||
* condition causes ereport(). We will not return a dead page.
|
||||
*
|
||||
* The returned buffer is pinned and read-locked.
|
||||
*/
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
*
|
||||
* We use tuplesort.c to sort the given index tuples into order.
|
||||
* Then we scan the index tuples in order and build the btree pages
|
||||
* for each level. We load source tuples into leaf-level pages.
|
||||
* for each level. We load source tuples into leaf-level pages.
|
||||
* Whenever we fill a page at one level, we add a link to it to its
|
||||
* parent level (starting a new parent level if necessary). When
|
||||
* done, we write out each final page on each level, adding it to
|
||||
@@ -42,11 +42,11 @@
|
||||
*
|
||||
* Since the index will never be used unless it is completely built,
|
||||
* from a crash-recovery point of view there is no need to WAL-log the
|
||||
* steps of the build. After completing the index build, we can just sync
|
||||
* steps of the build. After completing the index build, we can just sync
|
||||
* the whole file to disk using smgrimmedsync() before exiting this module.
|
||||
* This can be seen to be sufficient for crash recovery by considering that
|
||||
* it's effectively equivalent to what would happen if a CHECKPOINT occurred
|
||||
* just after the index build. However, it is clearly not sufficient if the
|
||||
* just after the index build. However, it is clearly not sufficient if the
|
||||
* DBA is using the WAL log for PITR or replication purposes, since another
|
||||
* machine would not be able to reconstruct the index from WAL. Therefore,
|
||||
* we log the completed index pages to WAL if and only if WAL archiving is
|
||||
@@ -89,7 +89,7 @@ struct BTSpool
|
||||
};
|
||||
|
||||
/*
|
||||
* Status record for a btree page being built. We have one of these
|
||||
* Status record for a btree page being built. We have one of these
|
||||
* for each active tree level.
|
||||
*
|
||||
* The reason we need to store a copy of the minimum key is that we'll
|
||||
@@ -160,7 +160,7 @@ _bt_spoolinit(Relation heap, Relation index, bool isunique, bool isdead)
|
||||
* We size the sort area as maintenance_work_mem rather than work_mem to
|
||||
* speed index creation. This should be OK since a single backend can't
|
||||
* run multiple index creations in parallel. Note that creation of a
|
||||
* unique index actually requires two BTSpool objects. We expect that the
|
||||
* unique index actually requires two BTSpool objects. We expect that the
|
||||
* second one (for dead tuples) won't get very full, so we give it only
|
||||
* work_mem.
|
||||
*/
|
||||
@@ -298,7 +298,7 @@ _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
|
||||
PageSetChecksumInplace(page, blkno);
|
||||
|
||||
/*
|
||||
* Now write the page. There's no need for smgr to schedule an fsync for
|
||||
* Now write the page. There's no need for smgr to schedule an fsync for
|
||||
* this write; we'll do it ourselves before ending the build.
|
||||
*/
|
||||
if (blkno == wstate->btws_pages_written)
|
||||
@@ -423,14 +423,14 @@ _bt_sortaddtup(Page page,
* A leaf page being built looks like:
*
* +----------------+---------------------------------+
* | PageHeaderData | linp0 linp1 linp2 ... |
* | PageHeaderData | linp0 linp1 linp2 ... |
* +-----------+----+---------------------------------+
* | ... linpN | |
* +-----------+--------------------------------------+
* | ^ last |
* | |
* +-------------+------------------------------------+
* | | itemN ... |
* | | itemN ... |
* +-------------+------------------+-----------------+
* | ... item3 item2 item1 | "special space" |
* +--------------------------------+-----------------+
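For orientation, the free space on a page laid out as in the diagram above is the gap between the end of the line-pointer array and the start of the item data. A simplified sketch (assuming the usual pd_lower/pd_upper header fields and ignoring the allowance PageGetFreeSpace makes for a new line pointer):

#include <stddef.h>

/*
 * Line pointers grow up to pd_lower, item data grows down to pd_upper;
 * the hole between them is what is left for new tuples.
 */
static size_t
page_free_space_sketch(size_t pd_lower, size_t pd_upper)
{
    return (pd_upper > pd_lower) ? (pd_upper - pd_lower) : 0;
}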
@@ -492,9 +492,9 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
|
||||
RelationGetRelationName(wstate->index))));
|
||||
|
||||
/*
|
||||
* Check to see if page is "full". It's definitely full if the item won't
|
||||
* Check to see if page is "full". It's definitely full if the item won't
|
||||
* fit. Otherwise, compare to the target freespace derived from the
|
||||
* fillfactor. However, we must put at least two items on each page, so
|
||||
* fillfactor. However, we must put at least two items on each page, so
|
||||
* disregard fillfactor if we don't have that many.
|
||||
*/
|
||||
if (pgspc < itupsz || (pgspc < state->btps_full && last_off > P_FIRSTKEY))
|
||||
@@ -567,7 +567,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
|
||||
}
|
||||
|
||||
/*
|
||||
* Write out the old page. We never need to touch it again, so we can
|
||||
* Write out the old page. We never need to touch it again, so we can
|
||||
* free the opage workspace too.
|
||||
*/
|
||||
_bt_blwritepage(wstate, opage, oblkno);
|
||||
@@ -804,7 +804,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
|
||||
|
||||
/*
|
||||
* If the index is WAL-logged, we must fsync it down to disk before it's
|
||||
* safe to commit the transaction. (For a non-WAL-logged index we don't
|
||||
* safe to commit the transaction. (For a non-WAL-logged index we don't
|
||||
* care since the index will be uninteresting after a crash anyway.)
|
||||
*
|
||||
* It's obvious that we must do this when not WAL-logging the build. It's
|
||||
|
||||
@@ -107,7 +107,7 @@ _bt_mkscankey(Relation rel, IndexTuple itup)
|
||||
* comparison data ultimately used must match the key datatypes.
|
||||
*
|
||||
* The result cannot be used with _bt_compare(), unless comparison
|
||||
* data is first stored into the key entries. Currently this
|
||||
* data is first stored into the key entries. Currently this
|
||||
* routine is only called by nbtsort.c and tuplesort.c, which have
|
||||
* their own comparison routines.
|
||||
*/
|
||||
@@ -269,7 +269,7 @@ _bt_preprocess_array_keys(IndexScanDesc scan)
|
||||
continue;
|
||||
|
||||
/*
|
||||
* First, deconstruct the array into elements. Anything allocated
|
||||
* First, deconstruct the array into elements. Anything allocated
|
||||
* here (including a possibly detoasted array value) is in the
|
||||
* workspace context.
|
||||
*/
|
||||
@@ -283,7 +283,7 @@ _bt_preprocess_array_keys(IndexScanDesc scan)
|
||||
&elem_values, &elem_nulls, &num_elems);
|
||||
|
||||
/*
|
||||
* Compress out any null elements. We can ignore them since we assume
|
||||
* Compress out any null elements. We can ignore them since we assume
|
||||
* all btree operators are strict.
|
||||
*/
|
||||
num_nonnulls = 0;
|
||||
@@ -517,7 +517,7 @@ _bt_compare_array_elements(const void *a, const void *b, void *arg)
|
||||
* _bt_start_array_keys() -- Initialize array keys at start of a scan
|
||||
*
|
||||
* Set up the cur_elem counters and fill in the first sk_argument value for
|
||||
* each array scankey. We can't do this until we know the scan direction.
|
||||
* each array scankey. We can't do this until we know the scan direction.
|
||||
*/
|
||||
void
|
||||
_bt_start_array_keys(IndexScanDesc scan, ScanDirection dir)
|
||||
@@ -670,8 +670,8 @@ _bt_restore_array_keys(IndexScanDesc scan)
|
||||
* so that the index sorts in the desired direction.
|
||||
*
|
||||
* One key purpose of this routine is to discover which scan keys must be
|
||||
* satisfied to continue the scan. It also attempts to eliminate redundant
|
||||
* keys and detect contradictory keys. (If the index opfamily provides
|
||||
* satisfied to continue the scan. It also attempts to eliminate redundant
|
||||
* keys and detect contradictory keys. (If the index opfamily provides
|
||||
* incomplete sets of cross-type operators, we may fail to detect redundant
|
||||
* or contradictory keys, but we can survive that.)
|
||||
*
|
||||
@@ -702,7 +702,7 @@ _bt_restore_array_keys(IndexScanDesc scan)
|
||||
* that's the only one returned. (So, we return either a single = key,
|
||||
* or one or two boundary-condition keys for each attr.) However, if we
|
||||
* cannot compare two keys for lack of a suitable cross-type operator,
|
||||
* we cannot eliminate either. If there are two such keys of the same
|
||||
* we cannot eliminate either. If there are two such keys of the same
|
||||
* operator strategy, the second one is just pushed into the output array
|
||||
* without further processing here. We may also emit both >/>= or both
|
||||
* </<= keys if we can't compare them. The logic about required keys still
|
||||
@@ -737,7 +737,7 @@ _bt_restore_array_keys(IndexScanDesc scan)
|
||||
* Note: the reason we have to copy the preprocessed scan keys into private
|
||||
* storage is that we are modifying the array based on comparisons of the
|
||||
* key argument values, which could change on a rescan or after moving to
|
||||
* new elements of array keys. Therefore we can't overwrite the source data.
|
||||
* new elements of array keys. Therefore we can't overwrite the source data.
|
||||
*/
|
||||
void
|
||||
_bt_preprocess_keys(IndexScanDesc scan)
|
||||
@@ -919,7 +919,7 @@ _bt_preprocess_keys(IndexScanDesc scan)
|
||||
|
||||
/*
|
||||
* Emit the cleaned-up keys into the outkeys[] array, and then
|
||||
* mark them if they are required. They are required (possibly
|
||||
* mark them if they are required. They are required (possibly
|
||||
* only in one direction) if all attrs before this one had "=".
|
||||
*/
|
||||
for (j = BTMaxStrategyNumber; --j >= 0;)
|
||||
@@ -1017,7 +1017,7 @@ _bt_preprocess_keys(IndexScanDesc scan)
|
||||
* and amoplefttype/amoprighttype equal to the two argument datatypes.
|
||||
*
|
||||
* If the opfamily doesn't supply a complete set of cross-type operators we
|
||||
* may not be able to make the comparison. If we can make the comparison
|
||||
* may not be able to make the comparison. If we can make the comparison
|
||||
* we store the operator result in *result and return TRUE. We return FALSE
|
||||
* if the comparison could not be made.
|
||||
*
|
||||
@@ -1043,7 +1043,7 @@ _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op,
|
||||
StrategyNumber strat;
|
||||
|
||||
/*
|
||||
* First, deal with cases where one or both args are NULL. This should
|
||||
* First, deal with cases where one or both args are NULL. This should
|
||||
* only happen when the scankeys represent IS NULL/NOT NULL conditions.
|
||||
*/
|
||||
if ((leftarg->sk_flags | rightarg->sk_flags) & SK_ISNULL)
|
||||
@@ -1183,7 +1183,7 @@ _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op,
|
||||
*
|
||||
* Lastly, for ordinary scankeys (not IS NULL/NOT NULL), we check for a
|
||||
* NULL comparison value. Since all btree operators are assumed strict,
|
||||
* a NULL means that the qual cannot be satisfied. We return TRUE if the
|
||||
* a NULL means that the qual cannot be satisfied. We return TRUE if the
|
||||
* comparison value isn't NULL, or FALSE if the scan should be abandoned.
|
||||
*
|
||||
* This function is applied to the *input* scankey structure; therefore
|
||||
@@ -1212,7 +1212,7 @@ _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption)
|
||||
* --- we can treat IS NULL as an equality operator for purposes of search
|
||||
* strategy.
|
||||
*
|
||||
* Likewise, "x IS NOT NULL" is supported. We treat that as either "less
|
||||
* Likewise, "x IS NOT NULL" is supported. We treat that as either "less
|
||||
* than NULL" in a NULLS LAST index, or "greater than NULL" in a NULLS
|
||||
* FIRST index.
|
||||
*
|
||||
@@ -1284,7 +1284,7 @@ _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption)
|
||||
* Mark a scankey as "required to continue the scan".
|
||||
*
|
||||
* Depending on the operator type, the key may be required for both scan
|
||||
* directions or just one. Also, if the key is a row comparison header,
|
||||
* directions or just one. Also, if the key is a row comparison header,
|
||||
* we have to mark the appropriate subsidiary ScanKeys as required. In
|
||||
* such cases, the first subsidiary key is required, but subsequent ones
|
||||
* are required only as long as they correspond to successive index columns
|
||||
@@ -1296,7 +1296,7 @@ _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption)
|
||||
* scribbling on a data structure belonging to the index AM's caller, not on
|
||||
* our private copy. This should be OK because the marking will not change
|
||||
* from scan to scan within a query, and so we'd just re-mark the same way
|
||||
* anyway on a rescan. Something to keep an eye on though.
|
||||
* anyway on a rescan. Something to keep an eye on though.
|
||||
*/
|
||||
static void
|
||||
_bt_mark_scankey_required(ScanKey skey)
|
||||
@@ -1482,7 +1482,7 @@ _bt_checkkeys(IndexScanDesc scan,
|
||||
/*
|
||||
* Since NULLs are sorted before non-NULLs, we know we have
|
||||
* reached the lower limit of the range of values for this
|
||||
* index attr. On a backward scan, we can stop if this qual
|
||||
* index attr. On a backward scan, we can stop if this qual
|
||||
* is one of the "must match" subset. We can stop regardless
|
||||
* of whether the qual is > or <, so long as it's required,
|
||||
* because it's not possible for any future tuples to pass. On
|
||||
@@ -1498,8 +1498,8 @@ _bt_checkkeys(IndexScanDesc scan,
|
||||
/*
|
||||
* Since NULLs are sorted after non-NULLs, we know we have
|
||||
* reached the upper limit of the range of values for this
|
||||
* index attr. On a forward scan, we can stop if this qual is
|
||||
* one of the "must match" subset. We can stop regardless of
|
||||
* index attr. On a forward scan, we can stop if this qual is
|
||||
* one of the "must match" subset. We can stop regardless of
|
||||
* whether the qual is > or <, so long as it's required,
|
||||
* because it's not possible for any future tuples to pass. On
|
||||
* a backward scan, however, we must keep going, because we
|
||||
@@ -1593,7 +1593,7 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, TupleDesc tupdesc,
|
||||
/*
|
||||
* Since NULLs are sorted before non-NULLs, we know we have
|
||||
* reached the lower limit of the range of values for this
|
||||
* index attr. On a backward scan, we can stop if this qual
|
||||
* index attr. On a backward scan, we can stop if this qual
|
||||
* is one of the "must match" subset. We can stop regardless
|
||||
* of whether the qual is > or <, so long as it's required,
|
||||
* because it's not possible for any future tuples to pass. On
|
||||
@@ -1609,8 +1609,8 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, TupleDesc tupdesc,
|
||||
/*
|
||||
* Since NULLs are sorted after non-NULLs, we know we have
|
||||
* reached the upper limit of the range of values for this
|
||||
* index attr. On a forward scan, we can stop if this qual is
|
||||
* one of the "must match" subset. We can stop regardless of
|
||||
* index attr. On a forward scan, we can stop if this qual is
|
||||
* one of the "must match" subset. We can stop regardless of
|
||||
* whether the qual is > or <, so long as it's required,
|
||||
* because it's not possible for any future tuples to pass. On
|
||||
* a backward scan, however, we must keep going, because we
|
||||
@@ -1631,7 +1631,7 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, TupleDesc tupdesc,
|
||||
{
|
||||
/*
|
||||
* Unlike the simple-scankey case, this isn't a disallowed case.
|
||||
* But it can never match. If all the earlier row comparison
|
||||
* But it can never match. If all the earlier row comparison
|
||||
* columns are required for the scan direction, we can stop the
|
||||
* scan, because there can't be another tuple that will succeed.
|
||||
*/
|
||||
@@ -1696,7 +1696,7 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, TupleDesc tupdesc,
|
||||
/*
|
||||
* Tuple fails this qual. If it's a required qual for the current
|
||||
* scan direction, then we can conclude no further tuples will pass,
|
||||
* either. Note we have to look at the deciding column, not
|
||||
* either. Note we have to look at the deciding column, not
|
||||
* necessarily the first or last column of the row condition.
|
||||
*/
|
||||
if ((subkey->sk_flags & SK_BT_REQFWD) &&
|
||||
@@ -1722,7 +1722,7 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, TupleDesc tupdesc,
|
||||
* is sufficient for setting LP_DEAD status (which is only a hint).
|
||||
*
|
||||
* We match items by heap TID before assuming they are the right ones to
|
||||
* delete. We cope with cases where items have moved right due to insertions.
|
||||
* delete. We cope with cases where items have moved right due to insertions.
|
||||
* If an item has moved off the current page due to a split, we'll fail to
|
||||
* find it and do nothing (this is not an error case --- we assume the item
|
||||
* will eventually get marked in a future indexscan). Note that because we
|
||||
@@ -1806,8 +1806,8 @@ _bt_killitems(IndexScanDesc scan, bool haveLock)
/*
* The following routines manage a shared-memory area in which we track
* assignment of "vacuum cycle IDs" to currently-active btree vacuuming
* operations. There is a single counter which increments each time we
* start a vacuum to assign it a cycle ID. Since multiple vacuums could
* operations. There is a single counter which increments each time we
* start a vacuum to assign it a cycle ID. Since multiple vacuums could
* be active concurrently, we have to track the cycle ID for each active
* vacuum; this requires at most MaxBackends entries (usually far fewer).
* We assume at most one vacuum can be active for a given index.

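The single wrapping counter described above can be sketched as follows (illustrative only: the real counter lives in shared memory under a lock, which is omitted here, and uint16 plus a reserved zero value are assumptions):

typedef uint16 BTCycleId;        /* assumption: 16-bit cycle ids, 0 reserved */

static BTCycleId cycle_ctr = 0;  /* stands in for the shared-memory counter */

static BTCycleId
next_cycle_id(void)
{
    cycle_ctr++;
    if (cycle_ctr == 0)          /* skip the reserved value on wraparound */
        cycle_ctr = 1;
    return cycle_ctr;
}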
@@ -40,9 +40,9 @@ _bt_restore_page(Page page, char *from, int len)
int nitems;

/*
* To get the items back in the original order, we add them to the page
* in reverse. To figure out where one tuple ends and another begins,
* we have to scan them in forward order first.
* To get the items back in the original order, we add them to the page in
* reverse. To figure out where one tuple ends and another begins, we
* have to scan them in forward order first.
*/
i = 0;
while (from < end)
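The forward-then-reverse idea in the hunk above, as a self-contained sketch (plain C with hypothetical tuple_size/add_item callbacks; not the WAL-replay routine itself):

#include <stddef.h>

/*
 * Pass 1: walk the packed buffer forward to find where each tuple starts
 * and how long it is.  Pass 2: add them in reverse, which restores the
 * original on-page order.
 */
static void
restore_in_reverse(const char *from, const char *end,
                   size_t (*tuple_size)(const char *),
                   void (*add_item)(const char *, size_t))
{
    enum { MAX_ITEMS = 1024 };          /* assumption: enough for a sketch */
    const char *starts[MAX_ITEMS];
    size_t      sizes[MAX_ITEMS];
    int         n = 0;

    while (from < end && n < MAX_ITEMS) /* forward pass: find boundaries */
    {
        starts[n] = from;
        sizes[n] = tuple_size(from);
        from += sizes[n];
        n++;
    }
    while (--n >= 0)                    /* reverse pass: insert items */
        add_item(starts[n], sizes[n]);
}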
@@ -97,7 +97,7 @@ _bt_restore_meta(RelFileNode rnode, XLogRecPtr lsn,
|
||||
pageop->btpo_flags = BTP_META;
|
||||
|
||||
/*
|
||||
* Set pd_lower just past the end of the metadata. This is not essential
|
||||
* Set pd_lower just past the end of the metadata. This is not essential
|
||||
* but it makes the page look compressible to xlog.c.
|
||||
*/
|
||||
((PageHeader) metapg)->pd_lower =
|
||||
@@ -118,7 +118,7 @@ static void
|
||||
_bt_clear_incomplete_split(XLogRecPtr lsn, XLogRecord *record,
|
||||
RelFileNode rnode, BlockNumber cblock)
|
||||
{
|
||||
Buffer buf;
|
||||
Buffer buf;
|
||||
|
||||
buf = XLogReadBuffer(rnode, cblock, false);
|
||||
if (BufferIsValid(buf))
|
||||
@@ -128,6 +128,7 @@ _bt_clear_incomplete_split(XLogRecPtr lsn, XLogRecord *record,
|
||||
if (lsn > PageGetLSN(page))
|
||||
{
|
||||
BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page);
|
||||
|
||||
Assert((pageop->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0);
|
||||
pageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
|
||||
|
||||
@@ -153,6 +154,7 @@ btree_xlog_insert(bool isleaf, bool ismeta,
|
||||
|
||||
datapos = (char *) xlrec + SizeOfBtreeInsert;
|
||||
datalen = record->xl_len - SizeOfBtreeInsert;
|
||||
|
||||
/*
|
||||
* if this insert finishes a split at lower level, extract the block
|
||||
* number of the (left) child.
|
||||
@@ -172,10 +174,10 @@ btree_xlog_insert(bool isleaf, bool ismeta,
|
||||
}
|
||||
|
||||
/*
|
||||
* Insertion to an internal page finishes an incomplete split at the
|
||||
* child level. Clear the incomplete-split flag in the child. Note:
|
||||
* during normal operation, the child and parent pages are locked at the
|
||||
* same time, so that clearing the flag and inserting the downlink appear
|
||||
* Insertion to an internal page finishes an incomplete split at the child
|
||||
* level. Clear the incomplete-split flag in the child. Note: during
|
||||
* normal operation, the child and parent pages are locked at the same
|
||||
* time, so that clearing the flag and inserting the downlink appear
|
||||
* atomic to other backends. We don't bother with that during replay,
|
||||
* because readers don't care about the incomplete-split flag and there
|
||||
* cannot be updates happening.
|
||||
@@ -279,9 +281,10 @@ btree_xlog_split(bool onleft, bool isroot,
|
||||
datapos += left_hikeysz;
|
||||
datalen -= left_hikeysz;
|
||||
}
|
||||
|
||||
/*
|
||||
* If this insertion finishes an incomplete split, get the block number
|
||||
* of the child.
|
||||
* If this insertion finishes an incomplete split, get the block number of
|
||||
* the child.
|
||||
*/
|
||||
if (!isleaf && !(record->xl_info & XLR_BKP_BLOCK(1)))
|
||||
{
|
||||
@@ -439,7 +442,7 @@ btree_xlog_split(bool onleft, bool isroot,
|
||||
* the backup block containing right sibling is 2 or 3, depending
|
||||
* whether this was a leaf or internal page.
|
||||
*/
|
||||
int rnext_index = isleaf ? 2 : 3;
|
||||
int rnext_index = isleaf ? 2 : 3;
|
||||
|
||||
if (record->xl_info & XLR_BKP_BLOCK(rnext_index))
|
||||
(void) RestoreBackupBlock(lsn, record, rnext_index, false, false);
|
||||
@@ -620,7 +623,7 @@ btree_xlog_delete_get_latestRemovedXid(xl_btree_delete *xlrec)
|
||||
|
||||
/*
|
||||
* In what follows, we have to examine the previous state of the index
|
||||
* page, as well as the heap page(s) it points to. This is only valid if
|
||||
* page, as well as the heap page(s) it points to. This is only valid if
|
||||
* WAL replay has reached a consistent database state; which means that
|
||||
* the preceding check is not just an optimization, but is *necessary*. We
|
||||
* won't have let in any user sessions before we reach consistency.
|
||||
@@ -629,9 +632,9 @@ btree_xlog_delete_get_latestRemovedXid(xl_btree_delete *xlrec)
|
||||
elog(PANIC, "btree_xlog_delete_get_latestRemovedXid: cannot operate with inconsistent data");
|
||||
|
||||
/*
|
||||
* Get index page. If the DB is consistent, this should not fail, nor
|
||||
* Get index page. If the DB is consistent, this should not fail, nor
|
||||
* should any of the heap page fetches below. If one does, we return
|
||||
* InvalidTransactionId to cancel all HS transactions. That's probably
|
||||
* InvalidTransactionId to cancel all HS transactions. That's probably
|
||||
* overkill, but it's safe, and certainly better than panicking here.
|
||||
*/
|
||||
ibuffer = XLogReadBuffer(xlrec->node, xlrec->block, false);
|
||||
@@ -716,9 +719,9 @@ btree_xlog_delete_get_latestRemovedXid(xl_btree_delete *xlrec)
|
||||
/*
|
||||
* If all heap tuples were LP_DEAD then we will be returning
|
||||
* InvalidTransactionId here, which avoids conflicts. This matches
|
||||
* existing logic which assumes that LP_DEAD tuples must already be
|
||||
* older than the latestRemovedXid on the cleanup record that
|
||||
* set them as LP_DEAD, hence must already have generated a conflict.
|
||||
* existing logic which assumes that LP_DEAD tuples must already be older
|
||||
* than the latestRemovedXid on the cleanup record that set them as
|
||||
* LP_DEAD, hence must already have generated a conflict.
|
||||
*/
|
||||
return latestRemovedXid;
|
||||
}
|
||||
@@ -735,7 +738,7 @@ btree_xlog_delete(XLogRecPtr lsn, XLogRecord *record)
|
||||
* If we have any conflict processing to do, it must happen before we
|
||||
* update the page.
|
||||
*
|
||||
* Btree delete records can conflict with standby queries. You might
|
||||
* Btree delete records can conflict with standby queries. You might
|
||||
* think that vacuum records would conflict as well, but we've handled
|
||||
* that already. XLOG_HEAP2_CLEANUP_INFO records provide the highest xid
|
||||
* cleaned by the vacuum of the heap and so we can resolve any conflicts
|
||||
@@ -828,7 +831,7 @@ btree_xlog_mark_page_halfdead(uint8 info, XLogRecPtr lsn, XLogRecord *record)
|
||||
ItemId itemid;
|
||||
IndexTuple itup;
|
||||
OffsetNumber nextoffset;
|
||||
BlockNumber rightsib;
|
||||
BlockNumber rightsib;
|
||||
|
||||
poffset = ItemPointerGetOffsetNumber(&(xlrec->target.tid));
|
||||
|
||||
|
||||
@@ -54,7 +54,7 @@ desc_recompress_leaf(StringInfo buf, ginxlogRecompressDataLeaf *insertData)
|
||||
walbuf += nitems * sizeof(ItemPointerData);
|
||||
}
|
||||
|
||||
switch(a_action)
|
||||
switch (a_action)
|
||||
{
|
||||
case GIN_SEGMENT_ADDITEMS:
|
||||
appendStringInfo(buf, " %d (add %d items)", a_segno, nitems);
|
||||
@@ -94,13 +94,13 @@ gin_desc(StringInfo buf, uint8 xl_info, char *rec)
|
||||
case XLOG_GIN_INSERT:
|
||||
{
|
||||
ginxlogInsert *xlrec = (ginxlogInsert *) rec;
|
||||
char *payload = rec + sizeof(ginxlogInsert);
|
||||
char *payload = rec + sizeof(ginxlogInsert);
|
||||
|
||||
appendStringInfoString(buf, "Insert item, ");
|
||||
desc_node(buf, xlrec->node, xlrec->blkno);
|
||||
appendStringInfo(buf, " isdata: %c isleaf: %c",
|
||||
(xlrec->flags & GIN_INSERT_ISDATA) ? 'T' : 'F',
|
||||
(xlrec->flags & GIN_INSERT_ISLEAF) ? 'T' : 'F');
|
||||
(xlrec->flags & GIN_INSERT_ISDATA) ? 'T' : 'F',
|
||||
(xlrec->flags & GIN_INSERT_ISLEAF) ? 'T' : 'F');
|
||||
if (!(xlrec->flags & GIN_INSERT_ISLEAF))
|
||||
{
|
||||
BlockNumber leftChildBlkno;
|
||||
@@ -115,11 +115,11 @@ gin_desc(StringInfo buf, uint8 xl_info, char *rec)
|
||||
}
|
||||
if (!(xlrec->flags & GIN_INSERT_ISDATA))
|
||||
appendStringInfo(buf, " isdelete: %c",
|
||||
(((ginxlogInsertEntry *) payload)->isDelete) ? 'T' : 'F');
|
||||
(((ginxlogInsertEntry *) payload)->isDelete) ? 'T' : 'F');
|
||||
else if (xlrec->flags & GIN_INSERT_ISLEAF)
|
||||
{
|
||||
ginxlogRecompressDataLeaf *insertData =
|
||||
(ginxlogRecompressDataLeaf *) payload;
|
||||
(ginxlogRecompressDataLeaf *) payload;
|
||||
|
||||
if (xl_info & XLR_BKP_BLOCK(0))
|
||||
appendStringInfo(buf, " (full page image)");
|
||||
@@ -129,10 +129,11 @@ gin_desc(StringInfo buf, uint8 xl_info, char *rec)
|
||||
else
|
||||
{
|
||||
ginxlogInsertDataInternal *insertData = (ginxlogInsertDataInternal *) payload;
|
||||
|
||||
appendStringInfo(buf, " pitem: %u-%u/%u",
|
||||
PostingItemGetBlockNumber(&insertData->newitem),
|
||||
ItemPointerGetBlockNumber(&insertData->newitem.key),
|
||||
ItemPointerGetOffsetNumber(&insertData->newitem.key));
|
||||
PostingItemGetBlockNumber(&insertData->newitem),
|
||||
ItemPointerGetBlockNumber(&insertData->newitem.key),
|
||||
ItemPointerGetOffsetNumber(&insertData->newitem.key));
|
||||
}
|
||||
}
|
||||
break;
|
||||
@@ -144,8 +145,8 @@ gin_desc(StringInfo buf, uint8 xl_info, char *rec)
|
||||
desc_node(buf, ((ginxlogSplit *) rec)->node, ((ginxlogSplit *) rec)->lblkno);
|
||||
appendStringInfo(buf, " isrootsplit: %c", (((ginxlogSplit *) rec)->flags & GIN_SPLIT_ROOT) ? 'T' : 'F');
|
||||
appendStringInfo(buf, " isdata: %c isleaf: %c",
|
||||
(xlrec->flags & GIN_INSERT_ISDATA) ? 'T' : 'F',
|
||||
(xlrec->flags & GIN_INSERT_ISLEAF) ? 'T' : 'F');
|
||||
(xlrec->flags & GIN_INSERT_ISDATA) ? 'T' : 'F',
|
||||
(xlrec->flags & GIN_INSERT_ISLEAF) ? 'T' : 'F');
|
||||
}
|
||||
break;
|
||||
case XLOG_GIN_VACUUM_PAGE:
|
||||
@@ -155,6 +156,7 @@ gin_desc(StringInfo buf, uint8 xl_info, char *rec)
|
||||
case XLOG_GIN_VACUUM_DATA_LEAF_PAGE:
|
||||
{
|
||||
ginxlogVacuumDataLeafPage *xlrec = (ginxlogVacuumDataLeafPage *) rec;
|
||||
|
||||
appendStringInfoString(buf, "Vacuum data leaf page, ");
|
||||
desc_node(buf, xlrec->node, xlrec->blkno);
|
||||
if (xl_info & XLR_BKP_BLOCK(0))
|
||||
|
||||
@@ -140,7 +140,7 @@ btree_desc(StringInfo buf, uint8 xl_info, char *rec)
|
||||
xl_btree_unlink_page *xlrec = (xl_btree_unlink_page *) rec;
|
||||
|
||||
appendStringInfo(buf, "unlink_page: rel %u/%u/%u; ",
|
||||
xlrec->node.spcNode, xlrec->node.dbNode, xlrec->node.relNode);
|
||||
xlrec->node.spcNode, xlrec->node.dbNode, xlrec->node.relNode);
|
||||
appendStringInfo(buf, "dead %u; left %u; right %u; btpo_xact %u; ",
|
||||
xlrec->deadblk, xlrec->leftsib, xlrec->rightsib, xlrec->btpo_xact);
|
||||
appendStringInfo(buf, "leaf %u; leafleft %u; leafright %u; topparent %u",
|
||||
|
||||
@@ -25,7 +25,7 @@
|
||||
/*
|
||||
* SPPageDesc tracks all info about a page we are inserting into. In some
|
||||
* situations it actually identifies a tuple, or even a specific node within
|
||||
* an inner tuple. But any of the fields can be invalid. If the buffer
|
||||
* an inner tuple. But any of the fields can be invalid. If the buffer
|
||||
* field is valid, it implies we hold pin and exclusive lock on that buffer.
|
||||
* page pointer should be valid exactly when buffer is.
|
||||
*/
|
||||
@@ -249,7 +249,7 @@ addLeafTuple(Relation index, SpGistState *state, SpGistLeafTuple leafTuple,
|
||||
else
|
||||
{
|
||||
/*
|
||||
* Tuple must be inserted into existing chain. We mustn't change the
|
||||
* Tuple must be inserted into existing chain. We mustn't change the
|
||||
* chain's head address, but we don't need to chase the entire chain
|
||||
* to put the tuple at the end; we can insert it second.
|
||||
*
|
||||
@@ -814,7 +814,7 @@ doPickSplit(Relation index, SpGistState *state,
|
||||
* We may not actually insert new tuple because another picksplit may be
|
||||
* necessary due to too large value, but we will try to allocate enough
|
||||
* space to include it; and in any case it has to be included in the input
|
||||
* for the picksplit function. So don't increment nToInsert yet.
|
||||
* for the picksplit function. So don't increment nToInsert yet.
|
||||
*/
|
||||
in.datums[in.nTuples] = SGLTDATUM(newLeafTuple, state);
|
||||
heapPtrs[in.nTuples] = newLeafTuple->heapPtr;
|
||||
@@ -872,7 +872,7 @@ doPickSplit(Relation index, SpGistState *state,
|
||||
/*
|
||||
* Check to see if the picksplit function failed to separate the values,
|
||||
* ie, it put them all into the same child node. If so, select allTheSame
|
||||
* mode and create a random split instead. See comments for
|
||||
* mode and create a random split instead. See comments for
|
||||
* checkAllTheSame as to why we need to know if the new leaf tuples could
|
||||
* fit on one page.
|
||||
*/
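A minimal sketch of the kind of check described here, assuming the picksplit output is represented as an array mapping each input tuple to a node number (names below are illustrative, not the actual opclass API structs):

    #include <stdbool.h>

    /* Return true when every input tuple was assigned to the same node,
     * i.e. the picksplit function failed to discriminate at all. */
    static bool
    picksplit_all_the_same(const int *mapping, int nTuples)
    {
        int         i;

        for (i = 1; i < nTuples; i++)
        {
            if (mapping[i] != mapping[0])
                return false;
        }
        return nTuples > 0;
    }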
|
||||
@@ -1037,7 +1037,7 @@ doPickSplit(Relation index, SpGistState *state,
|
||||
&xlrec.initDest);
|
||||
|
||||
/*
|
||||
* Attempt to assign node groups to the two pages. We might fail to
|
||||
* Attempt to assign node groups to the two pages. We might fail to
|
||||
* do so, even if totalLeafSizes is less than the available space,
|
||||
* because we can't split a group across pages.
|
||||
*/
|
||||
@@ -1917,7 +1917,7 @@ spgdoinsert(Relation index, SpGistState *state,
|
||||
if (current.blkno == InvalidBlockNumber)
|
||||
{
|
||||
/*
|
||||
* Create a leaf page. If leafSize is too large to fit on a page,
|
||||
* Create a leaf page. If leafSize is too large to fit on a page,
|
||||
* we won't actually use the page yet, but it simplifies the API
|
||||
* for doPickSplit to always have a leaf page at hand; so just
|
||||
* quietly limit our request to a page size.
|
||||
@@ -2120,7 +2120,7 @@ spgdoinsert(Relation index, SpGistState *state,
|
||||
out.result.addNode.nodeLabel);
|
||||
|
||||
/*
|
||||
* Retry insertion into the enlarged node. We assume that
|
||||
* Retry insertion into the enlarged node. We assume that
|
||||
* we'll get a MatchNode result this time.
|
||||
*/
|
||||
goto process_inner_tuple;
|
||||
|
||||
@@ -163,7 +163,7 @@ spgbuildempty(PG_FUNCTION_ARGS)
|
||||
page = (Page) palloc(BLCKSZ);
|
||||
SpGistInitMetapage(page);
|
||||
|
||||
/* Write the page. If archiving/streaming, XLOG it. */
|
||||
/* Write the page. If archiving/streaming, XLOG it. */
|
||||
PageSetChecksumInplace(page, SPGIST_METAPAGE_BLKNO);
|
||||
smgrwrite(index->rd_smgr, INIT_FORKNUM, SPGIST_METAPAGE_BLKNO,
|
||||
(char *) page, true);
|
||||
@@ -232,7 +232,7 @@ spginsert(PG_FUNCTION_ARGS)
|
||||
/*
|
||||
* We might have to repeat spgdoinsert() multiple times, if conflicts
|
||||
* occur with concurrent insertions. If so, reset the insertCtx each time
|
||||
* to avoid cumulative memory consumption. That means we also have to
|
||||
* to avoid cumulative memory consumption. That means we also have to
|
||||
* redo initSpGistState(), but it's cheap enough not to matter.
|
||||
*/
|
||||
while (!spgdoinsert(index, &spgstate, ht_ctid, *values, *isnull))
|
||||
|
||||
@@ -103,7 +103,7 @@ resetSpGistScanOpaque(SpGistScanOpaque so)
|
||||
* Sets searchNulls, searchNonNulls, numberOfKeys, keyData fields of *so.
|
||||
*
|
||||
* The point here is to eliminate null-related considerations from what the
|
||||
* opclass consistent functions need to deal with. We assume all SPGiST-
|
||||
* opclass consistent functions need to deal with. We assume all SPGiST-
|
||||
* indexable operators are strict, so any null RHS value makes the scan
|
||||
* condition unsatisfiable. We also pull out any IS NULL/IS NOT NULL
|
||||
* conditions; their effect is reflected into searchNulls/searchNonNulls.
|
||||
@@ -600,7 +600,7 @@ storeGettuple(SpGistScanOpaque so, ItemPointer heapPtr,
|
||||
if (so->want_itup)
|
||||
{
|
||||
/*
|
||||
* Reconstruct desired IndexTuple. We have to copy the datum out of
|
||||
* Reconstruct desired IndexTuple. We have to copy the datum out of
|
||||
* the temp context anyway, so we may as well create the tuple here.
|
||||
*/
|
||||
so->indexTups[so->nPtrs] = index_form_tuple(so->indexTupDesc,
|
||||
|
||||
@@ -26,11 +26,11 @@
|
||||
* In the worst case, an inner tuple in a text radix tree could have as many
|
||||
* as 256 nodes (one for each possible byte value). Each node can take 16
|
||||
* bytes on MAXALIGN=8 machines. The inner tuple must fit on an index page
|
||||
* of size BLCKSZ. Rather than assuming we know the exact amount of overhead
|
||||
* of size BLCKSZ. Rather than assuming we know the exact amount of overhead
|
||||
* imposed by page headers, tuple headers, etc, we leave 100 bytes for that
|
||||
* (the actual overhead should be no more than 56 bytes at this writing, so
|
||||
* there is slop in this number). So we can safely create prefixes up to
|
||||
* BLCKSZ - 256 * 16 - 100 bytes long. Unfortunately, because 256 * 16 is
|
||||
* BLCKSZ - 256 * 16 - 100 bytes long. Unfortunately, because 256 * 16 is
|
||||
* already 4K, there is no safe prefix length when BLCKSZ is less than 8K;
|
||||
* it is always possible to get "SPGiST inner tuple size exceeds maximum"
|
||||
* if there are too many distinct next-byte values at a given place in the
|
||||
@@ -327,7 +327,7 @@ spg_text_picksplit(PG_FUNCTION_ARGS)
|
||||
}
|
||||
|
||||
/*
|
||||
* Sort by label bytes so that we can group the values into nodes. This
|
||||
* Sort by label bytes so that we can group the values into nodes. This
|
||||
* also ensures that the nodes are ordered by label value, allowing the
|
||||
* use of binary search in searchChar.
|
||||
*/
|
||||
@@ -377,7 +377,7 @@ spg_text_inner_consistent(PG_FUNCTION_ARGS)
|
||||
|
||||
/*
|
||||
* Reconstruct values represented at this tuple, including parent data,
|
||||
* prefix of this tuple if any, and the node label if any. in->level
|
||||
* prefix of this tuple if any, and the node label if any. in->level
|
||||
* should be the length of the previously reconstructed value, and the
|
||||
* number of bytes added here is prefixSize or prefixSize + 1.
|
||||
*
|
||||
|
||||
@@ -235,7 +235,7 @@ SpGistUpdateMetaPage(Relation index)
|
||||
*
|
||||
* When requesting an inner page, if we get one with the wrong parity,
|
||||
* we just release the buffer and try again. We will get a different page
|
||||
* because GetFreeIndexPage will have marked the page used in FSM. The page
|
||||
* because GetFreeIndexPage will have marked the page used in FSM. The page
|
||||
* is entered in our local lastUsedPages cache, so there's some hope of
|
||||
* making use of it later in this session, but otherwise we rely on VACUUM
|
||||
* to eventually re-enter the page in FSM, making it available for recycling.
|
||||
@@ -245,7 +245,7 @@ SpGistUpdateMetaPage(Relation index)
|
||||
*
|
||||
* When we return a buffer to the caller, the page is *not* entered into
|
||||
* the lastUsedPages cache; we expect the caller will do so after it's taken
|
||||
* whatever space it will use. This is because after the caller has used up
|
||||
* whatever space it will use. This is because after the caller has used up
|
||||
* some space, the page might have less space than whatever was cached already
|
||||
* so we'd rather not trash the old cache entry.
|
||||
*/
|
||||
@@ -317,7 +317,7 @@ SpGistGetBuffer(Relation index, int flags, int needSpace, bool *isNew)
|
||||
|
||||
/*
|
||||
* If possible, increase the space request to include relation's
|
||||
* fillfactor. This ensures that when we add unrelated tuples to a page,
|
||||
* fillfactor. This ensures that when we add unrelated tuples to a page,
|
||||
* we try to keep 100-fillfactor% available for adding tuples that are
|
||||
* related to the ones already on it. But fillfactor mustn't cause an
|
||||
* error for requests that would otherwise be legal.
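A hedged sketch of the adjustment being described: add roughly (100 - fillfactor)% of the page to the request, then clamp so that requests which were legal to begin with cannot start failing. The helper name and parameters are illustrative, not the actual SP-GiST code:

    static int
    apply_fillfactor(int needSpace, int fillfactor, int pageCapacity)
    {
        /* reserve about (100 - fillfactor)% of the page on top of the request */
        needSpace += pageCapacity * (100 - fillfactor) / 100;

        /* ...but never demand more than a page can hold, so a request that was
         * legal without fillfactor still succeeds */
        if (needSpace > pageCapacity)
            needSpace = pageCapacity;
        return needSpace;
    }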
|
||||
|
||||
@@ -211,7 +211,7 @@ vacuumLeafPage(spgBulkDeleteState *bds, Relation index, Buffer buffer,
|
||||
* Figure out exactly what we have to do. We do this separately from
|
||||
* actually modifying the page, mainly so that we have a representation
|
||||
* that can be dumped into WAL and then the replay code can do exactly
|
||||
* the same thing. The output of this step consists of six arrays
|
||||
* the same thing. The output of this step consists of six arrays
|
||||
* describing four kinds of operations, to be performed in this order:
|
||||
*
|
||||
* toDead[]: tuple numbers to be replaced with DEAD tuples
|
||||
@@ -287,7 +287,7 @@ vacuumLeafPage(spgBulkDeleteState *bds, Relation index, Buffer buffer,
|
||||
else
|
||||
{
|
||||
/*
|
||||
* Second or later live tuple. Arrange to re-chain it to the
|
||||
* Second or later live tuple. Arrange to re-chain it to the
|
||||
* previous live one, if there was a gap.
|
||||
*/
|
||||
if (interveningDeletable)
|
||||
|
||||
@@ -41,7 +41,7 @@ fillFakeState(SpGistState *state, spgxlogState stateSrc)
|
||||
}
|
||||
|
||||
/*
|
||||
* Add a leaf tuple, or replace an existing placeholder tuple. This is used
|
||||
* Add a leaf tuple, or replace an existing placeholder tuple. This is used
|
||||
* to replay SpGistPageAddNewItem() operations. If the offset points at an
|
||||
* existing tuple, it had better be a placeholder tuple.
|
||||
*/
|
||||
@@ -462,7 +462,7 @@ spgRedoAddNode(XLogRecPtr lsn, XLogRecord *record)
|
||||
}
|
||||
|
||||
/*
|
||||
* Update parent downlink. Since parent could be in either of the
|
||||
* Update parent downlink. Since parent could be in either of the
|
||||
* previous two buffers, it's a bit tricky to determine which BKP bit
|
||||
* applies.
|
||||
*/
|
||||
@@ -799,7 +799,7 @@ spgRedoPickSplit(XLogRecPtr lsn, XLogRecord *record)
|
||||
bbi++;
|
||||
|
||||
/*
|
||||
* Now we can release the leaf-page locks. It's okay to do this before
|
||||
* Now we can release the leaf-page locks. It's okay to do this before
|
||||
* updating the parent downlink.
|
||||
*/
|
||||
if (BufferIsValid(srcBuffer))
|
||||
|
||||
@@ -11,15 +11,15 @@
|
||||
* log can be broken into relatively small, independent segments.
|
||||
*
|
||||
* XLOG interactions: this module generates an XLOG record whenever a new
|
||||
* CLOG page is initialized to zeroes. Other writes of CLOG come from
|
||||
* CLOG page is initialized to zeroes. Other writes of CLOG come from
|
||||
* recording of transaction commit or abort in xact.c, which generates its
|
||||
* own XLOG records for these events and will re-perform the status update
|
||||
* on redo; so we need make no additional XLOG entry here. For synchronous
|
||||
* on redo; so we need make no additional XLOG entry here. For synchronous
|
||||
* transaction commits, the XLOG is guaranteed flushed through the XLOG commit
|
||||
* record before we are called to log a commit, so the WAL rule "write xlog
|
||||
* before data" is satisfied automatically. However, for async commits we
|
||||
* must track the latest LSN affecting each CLOG page, so that we can flush
|
||||
* XLOG that far and satisfy the WAL rule. We don't have to worry about this
|
||||
* XLOG that far and satisfy the WAL rule. We don't have to worry about this
|
||||
* for aborts (whether sync or async), since the post-crash assumption would
|
||||
* be that such transactions failed anyway.
|
||||
*
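A simplified sketch of the bookkeeping this paragraph describes for async commits: remember the highest commit-record LSN that touched each CLOG page, and flush WAL at least that far before the page itself goes to disk. Types, names, and granularity here are stand-ins, not the real SLRU structures:

    #include <stdint.h>

    typedef uint64_t XLogRecPtr;            /* stand-in for the real type */

    #define NUM_PAGES 32                    /* illustrative buffer count */

    static XLogRecPtr page_lsn[NUM_PAGES];  /* highest async-commit LSN per page */

    /* Called when an async commit updates a status bit on a buffered page. */
    static void
    remember_page_lsn(int slot, XLogRecPtr commit_lsn)
    {
        if (commit_lsn > page_lsn[slot])
            page_lsn[slot] = commit_lsn;
    }

    /* Called before writing the page out: XLOG must be flushed at least this
     * far first, which is what satisfies the "write xlog before data" rule. */
    static XLogRecPtr
    lsn_to_flush_first(int slot)
    {
        return page_lsn[slot];
    }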
|
||||
@@ -105,7 +105,7 @@ static void set_status_by_pages(int nsubxids, TransactionId *subxids,
|
||||
* in the tree of xid. In various cases nsubxids may be zero.
|
||||
*
|
||||
* lsn must be the WAL location of the commit record when recording an async
|
||||
* commit. For a synchronous commit it can be InvalidXLogRecPtr, since the
|
||||
* commit. For a synchronous commit it can be InvalidXLogRecPtr, since the
|
||||
* caller guarantees the commit record is already flushed in that case. It
|
||||
* should be InvalidXLogRecPtr for abort cases, too.
|
||||
*
|
||||
@@ -417,7 +417,7 @@ TransactionIdGetStatus(TransactionId xid, XLogRecPtr *lsn)
|
||||
* Testing during the PostgreSQL 9.2 development cycle revealed that on a
|
||||
* large multi-processor system, it was possible to have more CLOG page
|
||||
* requests in flight at one time than the number of CLOG buffers which existed
|
||||
* at that time, which was hardcoded to 8. Further testing revealed that
|
||||
* at that time, which was hardcoded to 8. Further testing revealed that
|
||||
* performance dropped off with more than 32 CLOG buffers, possibly because
|
||||
* the linear buffer search algorithm doesn't scale well.
|
||||
*
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
*
|
||||
* The pg_multixact manager is a pg_clog-like manager that stores an array of
|
||||
* MultiXactMember for each MultiXactId. It is a fundamental part of the
|
||||
* shared-row-lock implementation. Each MultiXactMember is comprised of a
|
||||
* shared-row-lock implementation. Each MultiXactMember is comprised of a
|
||||
* TransactionId and a set of flag bits. The name is a bit historical:
|
||||
* originally, a MultiXactId consisted of more than one TransactionId (except
|
||||
* in rare corner cases), hence "multi". Nowadays, however, it's perfectly
|
||||
@@ -18,7 +18,7 @@
|
||||
*
|
||||
* We use two SLRU areas, one for storing the offsets at which the data
|
||||
* starts for each MultiXactId in the other one. This trick allows us to
|
||||
* store variable length arrays of TransactionIds. (We could alternatively
|
||||
* store variable length arrays of TransactionIds. (We could alternatively
|
||||
* use one area containing counts and TransactionIds, with valid MultiXactId
|
||||
* values pointing at slots containing counts; but that way seems less robust
|
||||
* since it would get completely confused if someone inquired about a bogus
|
||||
@@ -38,7 +38,7 @@
|
||||
*
|
||||
* Like clog.c, and unlike subtrans.c, we have to preserve state across
|
||||
* crashes and ensure that MXID and offset numbering increases monotonically
|
||||
* across a crash. We do this in the same way as it's done for transaction
|
||||
* across a crash. We do this in the same way as it's done for transaction
|
||||
* IDs: the WAL record is guaranteed to contain evidence of every MXID we
|
||||
* could need to worry about, and we just make sure that at the end of
|
||||
* replay, the next-MXID and next-offset counters are at least as large as
|
||||
@@ -50,7 +50,7 @@
|
||||
* The minimum value in each database is stored in pg_database, and the
|
||||
* global minimum is part of pg_control. Any vacuum that is able to
|
||||
* advance its database's minimum value also computes a new global minimum,
|
||||
* and uses this value to truncate older segments. When new multixactid
|
||||
* and uses this value to truncate older segments. When new multixactid
|
||||
* values are to be created, care is taken that the counter does not
|
||||
* fall within the wraparound horizon considering the global minimum value.
|
||||
*
|
||||
@@ -85,13 +85,13 @@
|
||||
|
||||
|
||||
/*
|
||||
* Defines for MultiXactOffset page sizes. A page is the same BLCKSZ as is
|
||||
* Defines for MultiXactOffset page sizes. A page is the same BLCKSZ as is
|
||||
* used everywhere else in Postgres.
|
||||
*
|
||||
* Note: because MultiXactOffsets are 32 bits and wrap around at 0xFFFFFFFF,
|
||||
* MultiXact page numbering also wraps around at
|
||||
* 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE, and segment numbering at
|
||||
* 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE/SLRU_SEGMENTS_PER_PAGE. We need
|
||||
* 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE/SLRU_SEGMENTS_PER_PAGE. We need
|
||||
* take no explicit notice of that fact in this module, except when comparing
|
||||
* segment and page numbers in TruncateMultiXact (see
|
||||
* MultiXactOffsetPagePrecedes).
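Worked out concretely (assuming the default 8 kB BLCKSZ and a 4-byte MultiXactOffset, both stated explicitly below), the page arithmetic looks like this:

    #include <stdint.h>
    #include <stdio.h>

    typedef uint32_t MultiXactId;
    typedef uint32_t MultiXactOffset;   /* 4 bytes, wraps at 0xFFFFFFFF */

    #define BLCKSZ 8192
    #define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset))  /* 2048 */

    int
    main(void)
    {
        MultiXactId multi = 123456789;

        /* Page and in-page entry for a given MultiXactId; page numbering wraps
         * around at 0xFFFFFFFF / MULTIXACT_OFFSETS_PER_PAGE, as noted above. */
        printf("page %u, entry %u\n",
               (unsigned) (multi / MULTIXACT_OFFSETS_PER_PAGE),
               (unsigned) (multi % MULTIXACT_OFFSETS_PER_PAGE));
        return 0;
    }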
|
||||
@@ -110,7 +110,7 @@
|
||||
* additional flag bits for each TransactionId. To do this without getting
|
||||
* into alignment issues, we store four bytes of flags, and then the
|
||||
* corresponding 4 Xids. Each such 5-word (20-byte) set we call a "group", and
|
||||
* are stored as a whole in pages. Thus, with 8kB BLCKSZ, we keep 409 groups
|
||||
* are stored as a whole in pages. Thus, with 8kB BLCKSZ, we keep 409 groups
|
||||
* per page. This wastes 12 bytes per page, but that's OK -- simplicity (and
|
||||
* performance) trumps space efficiency here.
|
||||
*
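The group layout above as plain arithmetic: a 20-byte group holds 4 flag bytes plus 4 XIDs, so an 8 kB page fits 409 groups and wastes 12 bytes, exactly as the comment says:

    #include <stdio.h>

    #define BLCKSZ 8192

    int
    main(void)
    {
        int         group_size = 4 + 4 * 4;                      /* 20 bytes */
        int         groups_per_page = BLCKSZ / group_size;       /* 409 */
        int         wasted = BLCKSZ - groups_per_page * group_size;  /* 12 */

        printf("%d groups per page, %d bytes wasted\n", groups_per_page, wasted);
        return 0;
    }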
|
||||
@@ -161,7 +161,7 @@ static SlruCtlData MultiXactMemberCtlData;
|
||||
#define MultiXactMemberCtl (&MultiXactMemberCtlData)
|
||||
|
||||
/*
|
||||
* MultiXact state shared across all backends. All this state is protected
|
||||
* MultiXact state shared across all backends. All this state is protected
|
||||
* by MultiXactGenLock. (We also use MultiXactOffsetControlLock and
|
||||
* MultiXactMemberControlLock to guard accesses to the two sets of SLRU
|
||||
* buffers. For concurrency's sake, we avoid holding more than one of these
|
||||
@@ -179,7 +179,7 @@ typedef struct MultiXactStateData
|
||||
MultiXactId lastTruncationPoint;
|
||||
|
||||
/*
|
||||
* oldest multixact that is still on disk. Anything older than this
|
||||
* oldest multixact that is still on disk. Anything older than this
|
||||
* should not be consulted.
|
||||
*/
|
||||
MultiXactId oldestMultiXactId;
|
||||
@@ -269,8 +269,8 @@ typedef struct mXactCacheEnt
|
||||
} mXactCacheEnt;
|
||||
|
||||
#define MAX_CACHE_ENTRIES 256
|
||||
static dlist_head MXactCache = DLIST_STATIC_INIT(MXactCache);
|
||||
static int MXactCacheMembers = 0;
|
||||
static dlist_head MXactCache = DLIST_STATIC_INIT(MXactCache);
|
||||
static int MXactCacheMembers = 0;
|
||||
static MemoryContext MXactContext = NULL;
|
||||
|
||||
#ifdef MULTIXACT_DEBUG
|
||||
@@ -528,7 +528,7 @@ MultiXactIdIsRunning(MultiXactId multi)
|
||||
|
||||
/*
|
||||
* This could be made faster by having another entry point in procarray.c,
|
||||
* walking the PGPROC array only once for all the members. But in most
|
||||
* walking the PGPROC array only once for all the members. But in most
|
||||
* cases nmembers should be small enough that it doesn't much matter.
|
||||
*/
|
||||
for (i = 0; i < nmembers; i++)
|
||||
@@ -579,9 +579,9 @@ MultiXactIdSetOldestMember(void)
|
||||
* back. Which would be wrong.
|
||||
*
|
||||
* Note that a shared lock is sufficient, because it's enough to stop
|
||||
* someone from advancing nextMXact; and nobody else could be trying to
|
||||
* write to our OldestMember entry, only reading (and we assume storing
|
||||
* it is atomic.)
|
||||
* someone from advancing nextMXact; and nobody else could be trying
|
||||
* to write to our OldestMember entry, only reading (and we assume
|
||||
* storing it is atomic.)
|
||||
*/
|
||||
LWLockAcquire(MultiXactGenLock, LW_SHARED);
|
||||
|
||||
@@ -615,7 +615,7 @@ MultiXactIdSetOldestMember(void)
|
||||
* The value to set is the oldest of nextMXact and all the valid per-backend
|
||||
* OldestMemberMXactId[] entries. Because of the locking we do, we can be
|
||||
* certain that no subsequent call to MultiXactIdSetOldestMember can set
|
||||
* an OldestMemberMXactId[] entry older than what we compute here. Therefore
|
||||
* an OldestMemberMXactId[] entry older than what we compute here. Therefore
|
||||
* there is no live transaction, now or later, that can be a member of any
|
||||
* MultiXactId older than the OldestVisibleMXactId we compute here.
|
||||
*/
|
||||
@@ -751,7 +751,7 @@ MultiXactIdCreateFromMembers(int nmembers, MultiXactMember *members)
|
||||
* heap_lock_tuple() to have put it there, and heap_lock_tuple() generates
|
||||
* an XLOG record that must follow ours. The normal LSN interlock between
|
||||
* the data page and that XLOG record will ensure that our XLOG record
|
||||
* reaches disk first. If the SLRU members/offsets data reaches disk
|
||||
* reaches disk first. If the SLRU members/offsets data reaches disk
|
||||
* sooner than the XLOG record, we do not care because we'll overwrite it
|
||||
* with zeroes unless the XLOG record is there too; see notes at top of
|
||||
* this file.
|
||||
@@ -882,7 +882,7 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
|
||||
* GetNewMultiXactId
|
||||
* Get the next MultiXactId.
|
||||
*
|
||||
* Also, reserve the needed amount of space in the "members" area. The
|
||||
* Also, reserve the needed amount of space in the "members" area. The
|
||||
* starting offset of the reserved space is returned in *offset.
|
||||
*
|
||||
* This may generate XLOG records for expansion of the offsets and/or members
|
||||
@@ -916,7 +916,7 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset)
|
||||
|
||||
/*----------
|
||||
* Check to see if it's safe to assign another MultiXactId. This protects
|
||||
* against catastrophic data loss due to multixact wraparound. The basic
|
||||
* against catastrophic data loss due to multixact wraparound. The basic
|
||||
* rules are:
|
||||
*
|
||||
* If we're past multiVacLimit, start trying to force autovacuum cycles.
|
||||
@@ -930,7 +930,7 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset)
|
||||
{
|
||||
/*
|
||||
* For safety's sake, we release MultiXactGenLock while sending
|
||||
* signals, warnings, etc. This is not so much because we care about
|
||||
* signals, warnings, etc. This is not so much because we care about
|
||||
* preserving concurrency in this situation, as to avoid any
|
||||
* possibility of deadlock while doing get_database_name(). First,
|
||||
* copy all the shared values we'll need in this path.
|
||||
@@ -981,8 +981,8 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset)
|
||||
(errmsg_plural("database \"%s\" must be vacuumed before %u more MultiXactId is used",
|
||||
"database \"%s\" must be vacuumed before %u more MultiXactIds are used",
|
||||
multiWrapLimit - result,
|
||||
oldest_datname,
|
||||
multiWrapLimit - result),
|
||||
oldest_datname,
|
||||
multiWrapLimit - result),
|
||||
errhint("Execute a database-wide VACUUM in that database.\n"
|
||||
"You might also need to commit or roll back old prepared transactions.")));
|
||||
else
|
||||
@@ -990,8 +990,8 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset)
|
||||
(errmsg_plural("database with OID %u must be vacuumed before %u more MultiXactId is used",
|
||||
"database with OID %u must be vacuumed before %u more MultiXactIds are used",
|
||||
multiWrapLimit - result,
|
||||
oldest_datoid,
|
||||
multiWrapLimit - result),
|
||||
oldest_datoid,
|
||||
multiWrapLimit - result),
|
||||
errhint("Execute a database-wide VACUUM in that database.\n"
|
||||
"You might also need to commit or roll back old prepared transactions.")));
|
||||
}
|
||||
@@ -1036,7 +1036,7 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset)
|
||||
* until after file extension has succeeded!
|
||||
*
|
||||
* We don't care about MultiXactId wraparound here; it will be handled by
|
||||
* the next iteration. But note that nextMXact may be InvalidMultiXactId
|
||||
* the next iteration. But note that nextMXact may be InvalidMultiXactId
|
||||
* or the first value on a segment-beginning page after this routine
|
||||
* exits, so anyone else looking at the variable must be prepared to deal
|
||||
* with either case. Similarly, nextOffset may be zero, but we won't use
|
||||
@@ -1114,16 +1114,16 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
|
||||
* need to allow an empty set to be returned regardless, if the caller is
|
||||
* willing to accept it; the caller is expected to check that it's an
|
||||
* allowed condition (such as ensuring that the infomask bits set on the
|
||||
* tuple are consistent with the pg_upgrade scenario). If the caller is
|
||||
* tuple are consistent with the pg_upgrade scenario). If the caller is
|
||||
* expecting this to be called only on recently created multis, then we
|
||||
* raise an error.
|
||||
*
|
||||
* Conversely, an ID >= nextMXact shouldn't ever be seen here; if it is
|
||||
* seen, it implies undetected ID wraparound has occurred. This raises a
|
||||
* seen, it implies undetected ID wraparound has occurred. This raises a
|
||||
* hard error.
|
||||
*
|
||||
* Shared lock is enough here since we aren't modifying any global state.
|
||||
* Acquire it just long enough to grab the current counter values. We may
|
||||
* Acquire it just long enough to grab the current counter values. We may
|
||||
* need both nextMXact and nextOffset; see below.
|
||||
*/
|
||||
LWLockAcquire(MultiXactGenLock, LW_SHARED);
|
||||
@@ -1151,12 +1151,12 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
|
||||
|
||||
/*
|
||||
* Find out the offset at which we need to start reading MultiXactMembers
|
||||
* and the number of members in the multixact. We determine the latter as
|
||||
* and the number of members in the multixact. We determine the latter as
|
||||
* the difference between this multixact's starting offset and the next
|
||||
* one's. However, there are some corner cases to worry about:
|
||||
*
|
||||
* 1. This multixact may be the latest one created, in which case there is
|
||||
* no next one to look at. In this case the nextOffset value we just
|
||||
* no next one to look at. In this case the nextOffset value we just
|
||||
* saved is the correct endpoint.
|
||||
*
|
||||
* 2. The next multixact may still be in process of being filled in: that
|
||||
@@ -1167,11 +1167,11 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
|
||||
* (because we are careful to pre-zero offset pages). Because
|
||||
* GetNewMultiXactId will never return zero as the starting offset for a
|
||||
* multixact, when we read zero as the next multixact's offset, we know we
|
||||
* have this case. We sleep for a bit and try again.
|
||||
* have this case. We sleep for a bit and try again.
|
||||
*
|
||||
* 3. Because GetNewMultiXactId increments offset zero to offset one to
|
||||
* handle case #2, there is an ambiguity near the point of offset
|
||||
* wraparound. If we see next multixact's offset is one, is that our
|
||||
* wraparound. If we see next multixact's offset is one, is that our
|
||||
* multixact's actual endpoint, or did it end at zero with a subsequent
|
||||
* increment? We handle this using the knowledge that if the zero'th
|
||||
* member slot wasn't filled, it'll contain zero, and zero isn't a valid
|
||||
@@ -1297,8 +1297,8 @@ retry:
|
||||
|
||||
/*
|
||||
* MultiXactHasRunningRemoteMembers
|
||||
* Does the given multixact have still-live members from
|
||||
* transactions other than our own?
|
||||
* Does the given multixact have still-live members from
|
||||
* transactions other than our own?
|
||||
*/
|
||||
bool
|
||||
MultiXactHasRunningRemoteMembers(MultiXactId multi)
|
||||
@@ -1694,7 +1694,7 @@ multixact_twophase_postabort(TransactionId xid, uint16 info,
|
||||
|
||||
/*
|
||||
* Initialization of shared memory for MultiXact. We use two SLRU areas,
|
||||
* thus double memory. Also, reserve space for the shared MultiXactState
|
||||
* thus double memory. Also, reserve space for the shared MultiXactState
|
||||
* struct and the per-backend MultiXactId arrays (two of those, too).
|
||||
*/
|
||||
Size
|
||||
@@ -1754,7 +1754,7 @@ MultiXactShmemInit(void)
|
||||
|
||||
/*
|
||||
* This func must be called ONCE on system install. It creates the initial
|
||||
* MultiXact segments. (The MultiXacts directories are assumed to have been
|
||||
* MultiXact segments. (The MultiXacts directories are assumed to have been
|
||||
* created by initdb, and MultiXactShmemInit must have been called already.)
|
||||
*/
|
||||
void
|
||||
@@ -1849,7 +1849,7 @@ MaybeExtendOffsetSlru(void)
|
||||
|
||||
if (!SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, pageno))
|
||||
{
|
||||
int slotno;
|
||||
int slotno;
|
||||
|
||||
/*
|
||||
* Fortunately for us, SimpleLruWritePage is already prepared to deal
|
||||
@@ -1925,7 +1925,7 @@ TrimMultiXact(void)
|
||||
MultiXactOffsetCtl->shared->latest_page_number = pageno;
|
||||
|
||||
/*
|
||||
* Zero out the remainder of the current offsets page. See notes in
|
||||
* Zero out the remainder of the current offsets page. See notes in
|
||||
* StartupCLOG() for motivation.
|
||||
*/
|
||||
entryno = MultiXactIdToOffsetEntry(multi);
|
||||
@@ -1955,7 +1955,7 @@ TrimMultiXact(void)
|
||||
MultiXactMemberCtl->shared->latest_page_number = pageno;
|
||||
|
||||
/*
|
||||
* Zero out the remainder of the current members page. See notes in
|
||||
* Zero out the remainder of the current members page. See notes in
|
||||
* TrimCLOG() for motivation.
|
||||
*/
|
||||
flagsoff = MXOffsetToFlagsOffset(offset);
|
||||
@@ -2097,7 +2097,7 @@ SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid)
|
||||
|
||||
/*
|
||||
* We'll start complaining loudly when we get within 10M multis of the
|
||||
* stop point. This is kind of arbitrary, but if you let your gas gauge
|
||||
* stop point. This is kind of arbitrary, but if you let your gas gauge
|
||||
* get down to 1% of full, would you be looking for the next gas station?
|
||||
* We need to be fairly liberal about this number because there are lots
|
||||
* of scenarios where most transactions are done by automatic clients that
|
||||
@@ -2172,8 +2172,8 @@ SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid)
|
||||
(errmsg_plural("database \"%s\" must be vacuumed before %u more MultiXactId is used",
|
||||
"database \"%s\" must be vacuumed before %u more MultiXactIds are used",
|
||||
multiWrapLimit - curMulti,
|
||||
oldest_datname,
|
||||
multiWrapLimit - curMulti),
|
||||
oldest_datname,
|
||||
multiWrapLimit - curMulti),
|
||||
errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n"
|
||||
"You might also need to commit or roll back old prepared transactions.")));
|
||||
else
|
||||
@@ -2181,8 +2181,8 @@ SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid)
|
||||
(errmsg_plural("database with OID %u must be vacuumed before %u more MultiXactId is used",
|
||||
"database with OID %u must be vacuumed before %u more MultiXactIds are used",
|
||||
multiWrapLimit - curMulti,
|
||||
oldest_datoid,
|
||||
multiWrapLimit - curMulti),
|
||||
oldest_datoid,
|
||||
multiWrapLimit - curMulti),
|
||||
errhint("To avoid a database shutdown, execute a database-wide VACUUM in that database.\n"
|
||||
"You might also need to commit or roll back old prepared transactions.")));
|
||||
}
|
||||
@@ -2375,16 +2375,16 @@ GetOldestMultiXactId(void)
|
||||
|
||||
/*
|
||||
* SlruScanDirectory callback.
|
||||
* This callback deletes segments that are outside the range determined by
|
||||
* the given page numbers.
|
||||
* This callback deletes segments that are outside the range determined by
|
||||
* the given page numbers.
|
||||
*
|
||||
* Both range endpoints are exclusive (that is, segments containing any of
|
||||
* those pages are kept.)
|
||||
*/
|
||||
typedef struct MembersLiveRange
|
||||
{
|
||||
int rangeStart;
|
||||
int rangeEnd;
|
||||
int rangeStart;
|
||||
int rangeEnd;
|
||||
} MembersLiveRange;
|
||||
|
||||
static bool
|
||||
@@ -2392,15 +2392,15 @@ SlruScanDirCbRemoveMembers(SlruCtl ctl, char *filename, int segpage,
|
||||
void *data)
|
||||
{
|
||||
MembersLiveRange *range = (MembersLiveRange *) data;
|
||||
MultiXactOffset nextOffset;
|
||||
MultiXactOffset nextOffset;
|
||||
|
||||
if ((segpage == range->rangeStart) ||
|
||||
(segpage == range->rangeEnd))
|
||||
return false; /* easy case out */
|
||||
return false; /* easy case out */
|
||||
|
||||
/*
|
||||
* To ensure that no segment is spuriously removed, we must keep track
|
||||
* of new segments added since the start of the directory scan; to do this,
|
||||
* To ensure that no segment is spuriously removed, we must keep track of
|
||||
* new segments added since the start of the directory scan; to do this,
|
||||
* we update our end-of-range point as we run.
|
||||
*
|
||||
* As an optimization, we can skip looking at shared memory if we know for
|
||||
@@ -2473,10 +2473,10 @@ void
|
||||
TruncateMultiXact(MultiXactId oldestMXact)
|
||||
{
|
||||
MultiXactOffset oldestOffset;
|
||||
MultiXactOffset nextOffset;
|
||||
MultiXactOffset nextOffset;
|
||||
mxtruncinfo trunc;
|
||||
MultiXactId earliest;
|
||||
MembersLiveRange range;
|
||||
MembersLiveRange range;
|
||||
|
||||
/*
|
||||
* Note we can't just plow ahead with the truncation; it's possible that
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
*
|
||||
* We use a control LWLock to protect the shared data structures, plus
|
||||
* per-buffer LWLocks that synchronize I/O for each buffer. The control lock
|
||||
* must be held to examine or modify any shared state. A process that is
|
||||
* must be held to examine or modify any shared state. A process that is
|
||||
* reading in or writing out a page buffer does not hold the control lock,
|
||||
* only the per-buffer lock for the buffer it is working on.
|
||||
*
|
||||
@@ -34,7 +34,7 @@
|
||||
* could have happened while we didn't have the lock).
|
||||
*
|
||||
* As with the regular buffer manager, it is possible for another process
|
||||
* to re-dirty a page that is currently being written out. This is handled
|
||||
* to re-dirty a page that is currently being written out. This is handled
|
||||
* by re-setting the page's page_dirty flag.
|
||||
*
|
||||
*
|
||||
@@ -96,7 +96,7 @@ typedef struct SlruFlushData *SlruFlush;
|
||||
* page_lru_count entries to be "reset" to lower values than they should have,
|
||||
* in case a process is delayed while it executes this macro. With care in
|
||||
* SlruSelectLRUPage(), this does little harm, and in any case the absolute
|
||||
* worst possible consequence is a nonoptimal choice of page to evict. The
|
||||
* worst possible consequence is a nonoptimal choice of page to evict. The
|
||||
* gain from allowing concurrent reads of SLRU pages seems worth it.
|
||||
*/
|
||||
#define SlruRecentlyUsed(shared, slotno) \
|
||||
@@ -481,7 +481,7 @@ SimpleLruReadPage_ReadOnly(SlruCtl ctl, int pageno, TransactionId xid)
|
||||
*
|
||||
* NOTE: only one write attempt is made here. Hence, it is possible that
|
||||
* the page is still dirty at exit (if someone else re-dirtied it during
|
||||
* the write). However, we *do* attempt a fresh write even if the page
|
||||
* the write). However, we *do* attempt a fresh write even if the page
|
||||
* is already being written; this is for checkpoints.
|
||||
*
|
||||
* Control lock must be held at entry, and will be held at exit.
|
||||
@@ -634,7 +634,7 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno)
|
||||
* In a crash-and-restart situation, it's possible for us to receive
|
||||
* commands to set the commit status of transactions whose bits are in
|
||||
* already-truncated segments of the commit log (see notes in
|
||||
* SlruPhysicalWritePage). Hence, if we are InRecovery, allow the case
|
||||
* SlruPhysicalWritePage). Hence, if we are InRecovery, allow the case
|
||||
* where the file doesn't exist, and return zeroes instead.
|
||||
*/
|
||||
fd = OpenTransientFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
|
||||
@@ -964,9 +964,9 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno)
|
||||
|
||||
/*
|
||||
* If we find any EMPTY slot, just select that one. Else choose a
|
||||
* victim page to replace. We normally take the least recently used
|
||||
* victim page to replace. We normally take the least recently used
|
||||
* valid page, but we will never take the slot containing
|
||||
* latest_page_number, even if it appears least recently used. We
|
||||
* latest_page_number, even if it appears least recently used. We
|
||||
* will select a slot that is already I/O busy only if there is no
|
||||
* other choice: a read-busy slot will not be least recently used once
|
||||
* the read finishes, and waiting for an I/O on a write-busy slot is
|
||||
@@ -1041,7 +1041,7 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno)
|
||||
|
||||
/*
|
||||
* If all pages (except possibly the latest one) are I/O busy, we'll
|
||||
* have to wait for an I/O to complete and then retry. In that
|
||||
* have to wait for an I/O to complete and then retry. In that
|
||||
* unhappy case, we choose to wait for the I/O on the least recently
|
||||
* used slot, on the assumption that it was likely initiated first of
|
||||
* all the I/Os in progress and may therefore finish first.
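A hedged sketch of the victim-selection policy described in the last two hunks: take any EMPTY slot immediately, otherwise the least recently used valid slot that does not hold latest_page_number, and fall back to waiting when only busy or protected slots remain. Field and type names here are illustrative, not the real SLRU shared state:

    typedef enum { SLOT_EMPTY, SLOT_VALID, SLOT_IO_BUSY } SlotState;

    typedef struct
    {
        SlotState   state;
        int         pageno;
        int         lru_count;      /* smaller value = less recently used */
    } Slot;

    /* Pick a slot to evict into; -1 means every candidate is busy or is the
     * latest page, so the caller must wait for an I/O and retry. */
    static int
    select_victim(const Slot *slots, int nslots, int latest_page_number)
    {
        int         best = -1;
        int         i;

        for (i = 0; i < nslots; i++)
        {
            if (slots[i].state == SLOT_EMPTY)
                return i;                       /* free slot: done */
            if (slots[i].pageno == latest_page_number)
                continue;                       /* never evict the latest page */
            if (slots[i].state == SLOT_VALID &&
                (best < 0 || slots[i].lru_count < slots[best].lru_count))
                best = i;
        }
        return best;
    }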
|
||||
@@ -1193,7 +1193,7 @@ restart:;
|
||||
/*
|
||||
* Hmm, we have (or may have) I/O operations acting on the page, so
|
||||
* we've got to wait for them to finish and then start again. This is
|
||||
* the same logic as in SlruSelectLRUPage. (XXX if page is dirty,
|
||||
* the same logic as in SlruSelectLRUPage. (XXX if page is dirty,
|
||||
* wouldn't it be OK to just discard it without writing it? For now,
|
||||
* keep the logic the same as it was.)
|
||||
*/
|
||||
@@ -1293,7 +1293,7 @@ SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data)
|
||||
cldir = AllocateDir(ctl->Dir);
|
||||
while ((clde = ReadDir(cldir, ctl->Dir)) != NULL)
|
||||
{
|
||||
size_t len;
|
||||
size_t len;
|
||||
|
||||
len = strlen(clde->d_name);
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
*
|
||||
* The pg_subtrans manager is a pg_clog-like manager that stores the parent
|
||||
* transaction Id for each transaction. It is a fundamental part of the
|
||||
* nested transactions implementation. A main transaction has a parent
|
||||
* nested transactions implementation. A main transaction has a parent
|
||||
* of InvalidTransactionId, and each subtransaction has its immediate parent.
|
||||
* The tree can easily be walked from child to parent, but not in the
|
||||
* opposite direction.
|
||||
@@ -191,7 +191,7 @@ SUBTRANSShmemInit(void)
|
||||
* must have been called already.)
|
||||
*
|
||||
* Note: it's not really necessary to create the initial segment now,
|
||||
* since slru.c would create it on first write anyway. But we may as well
|
||||
* since slru.c would create it on first write anyway. But we may as well
|
||||
* do it to be sure the directory is set up correctly.
|
||||
*/
|
||||
void
|
||||
|
||||
@@ -66,7 +66,7 @@ restoreTimeLineHistoryFiles(TimeLineID begin, TimeLineID end)
|
||||
* Try to read a timeline's history file.
|
||||
*
|
||||
* If successful, return the list of component TLIs (the given TLI followed by
|
||||
* its ancestor TLIs). If we can't find the history file, assume that the
|
||||
* its ancestor TLIs). If we can't find the history file, assume that the
|
||||
* timeline has no parents, and return a list of just the specified timeline
|
||||
* ID.
|
||||
*/
|
||||
@@ -150,7 +150,7 @@ readTimeLineHistory(TimeLineID targetTLI)
|
||||
if (nfields != 3)
|
||||
ereport(FATAL,
|
||||
(errmsg("syntax error in history file: %s", fline),
|
||||
errhint("Expected a transaction log switchpoint location.")));
|
||||
errhint("Expected a transaction log switchpoint location.")));
|
||||
|
||||
if (result && tli <= lasttli)
|
||||
ereport(FATAL,
|
||||
@@ -281,7 +281,7 @@ findNewestTimeLine(TimeLineID startTLI)
|
||||
* reason: human-readable explanation of why the timeline was switched
|
||||
*
|
||||
* Currently this is only used at the end of recovery, and so there are no locking
|
||||
* considerations. But we should be just as tense as XLogFileInit to avoid
|
||||
* considerations. But we should be just as tense as XLogFileInit to avoid
|
||||
* emplacing a bogus file.
|
||||
*/
|
||||
void
|
||||
@@ -418,7 +418,7 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
|
||||
|
||||
/*
|
||||
* Prefer link() to rename() here just to be really sure that we don't
|
||||
* overwrite an existing file. However, there shouldn't be one, so
|
||||
* overwrite an existing file. However, there shouldn't be one, so
|
||||
* rename() is an acceptable substitute except for the truly paranoid.
|
||||
*/
|
||||
#if HAVE_WORKING_LINK
|
||||
|
||||
@@ -145,7 +145,7 @@ TransactionIdDidCommit(TransactionId transactionId)
|
||||
* be a window just after database startup where we do not have complete
|
||||
* knowledge in pg_subtrans of the transactions after TransactionXmin.
|
||||
* StartupSUBTRANS() has ensured that any missing information will be
|
||||
* zeroed. Since this case should not happen under normal conditions, it
|
||||
* zeroed. Since this case should not happen under normal conditions, it
|
||||
* seems reasonable to emit a WARNING for it.
|
||||
*/
|
||||
if (xidstatus == TRANSACTION_STATUS_SUB_COMMITTED)
|
||||
@@ -301,7 +301,7 @@ TransactionIdPrecedes(TransactionId id1, TransactionId id2)
|
||||
{
|
||||
/*
|
||||
* If either ID is a permanent XID then we can just do unsigned
|
||||
* comparison. If both are normal, do a modulo-2^32 comparison.
|
||||
* comparison. If both are normal, do a modulo-2^32 comparison.
|
||||
*/
|
||||
int32 diff;
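The comparison rule stated in this comment, as a self-contained function: permanent (special) XIDs compare as plain unsigned values, while two normal XIDs compare modulo 2^32 via a signed difference. The constant for the first normal XID is my assumption of the usual value, not a quote of the header:

    #include <stdbool.h>
    #include <stdint.h>

    typedef uint32_t TransactionId;

    #define FirstNormalTransactionId ((TransactionId) 3)
    #define TransactionIdIsNormal(xid) ((xid) >= FirstNormalTransactionId)

    /* Does id1 logically precede id2? */
    static bool
    xid_precedes(TransactionId id1, TransactionId id2)
    {
        int32_t     diff;

        if (!TransactionIdIsNormal(id1) || !TransactionIdIsNormal(id2))
            return id1 < id2;               /* unsigned comparison */

        diff = (int32_t) (id1 - id2);       /* modulo-2^32 comparison */
        return diff < 0;
    }

With this rule, an XID just past the wraparound point still compares as following an XID just before it, which is the point of the modulo comparison.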
|
||||
|
||||
|
||||
@@ -443,7 +443,7 @@ LockGXact(const char *gid, Oid user)
|
||||
/*
|
||||
* Note: it probably would be possible to allow committing from
|
||||
* another database; but at the moment NOTIFY is known not to work and
|
||||
* there may be some other issues as well. Hence disallow until
|
||||
* there may be some other issues as well. Hence disallow until
|
||||
* someone gets motivated to make it work.
|
||||
*/
|
||||
if (MyDatabaseId != proc->databaseId)
|
||||
@@ -1031,7 +1031,7 @@ EndPrepare(GlobalTransaction gxact)
|
||||
* out the correct state file CRC, we have an inconsistency: the xact is
|
||||
* prepared according to WAL but not according to our on-disk state. We
|
||||
* use a critical section to force a PANIC if we are unable to complete
|
||||
* the write --- then, WAL replay should repair the inconsistency. The
|
||||
* the write --- then, WAL replay should repair the inconsistency. The
|
||||
* odds of a PANIC actually occurring should be very tiny given that we
|
||||
* were able to write the bogus CRC above.
|
||||
*
|
||||
@@ -1069,7 +1069,7 @@ EndPrepare(GlobalTransaction gxact)
|
||||
errmsg("could not close two-phase state file: %m")));
|
||||
|
||||
/*
|
||||
* Mark the prepared transaction as valid. As soon as xact.c marks
|
||||
* Mark the prepared transaction as valid. As soon as xact.c marks
|
||||
* MyPgXact as not running our XID (which it will do immediately after
|
||||
* this function returns), others can commit/rollback the xact.
|
||||
*
|
||||
@@ -1336,7 +1336,7 @@ FinishPreparedTransaction(const char *gid, bool isCommit)
|
||||
/*
|
||||
* In case we fail while running the callbacks, mark the gxact invalid so
|
||||
* no one else will try to commit/rollback, and so it can be recycled
|
||||
* properly later. It is still locked by our XID so it won't go away yet.
|
||||
* properly later. It is still locked by our XID so it won't go away yet.
|
||||
*
|
||||
* (We assume it's safe to do this without taking TwoPhaseStateLock.)
|
||||
*/
|
||||
@@ -1540,7 +1540,7 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon)
|
||||
*
|
||||
* This approach creates a race condition: someone else could delete a
|
||||
* GXACT between the time we release TwoPhaseStateLock and the time we try
|
||||
* to open its state file. We handle this by special-casing ENOENT
|
||||
* to open its state file. We handle this by special-casing ENOENT
|
||||
* failures: if we see that, we verify that the GXACT is no longer valid,
|
||||
* and if so ignore the failure.
|
||||
*/
|
||||
@@ -1621,7 +1621,7 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon)
|
||||
*
|
||||
* We throw away any prepared xacts with main XID beyond nextXid --- if any
|
||||
* are present, it suggests that the DBA has done a PITR recovery to an
|
||||
* earlier point in time without cleaning out pg_twophase. We dare not
|
||||
* earlier point in time without cleaning out pg_twophase. We dare not
|
||||
* try to recover such prepared xacts since they likely depend on database
|
||||
* state that doesn't exist now.
|
||||
*
|
||||
@@ -1713,7 +1713,7 @@ PrescanPreparedTransactions(TransactionId **xids_p, int *nxids_p)
|
||||
* XID, and they may force us to advance nextXid.
|
||||
*
|
||||
* We don't expect anyone else to modify nextXid, hence we don't
|
||||
* need to hold a lock while examining it. We still acquire the
|
||||
* need to hold a lock while examining it. We still acquire the
|
||||
* lock to modify it, though.
|
||||
*/
|
||||
subxids = (TransactionId *)
|
||||
|
||||
@@ -39,7 +39,7 @@ VariableCache ShmemVariableCache = NULL;
|
||||
*
|
||||
* Note: when this is called, we are actually already inside a valid
|
||||
* transaction, since XIDs are now not allocated until the transaction
|
||||
* does something. So it is safe to do a database lookup if we want to
|
||||
* does something. So it is safe to do a database lookup if we want to
|
||||
* issue a warning about XID wrap.
|
||||
*/
|
||||
TransactionId
|
||||
@@ -165,20 +165,20 @@ GetNewTransactionId(bool isSubXact)
|
||||
/*
|
||||
* Now advance the nextXid counter. This must not happen until after we
|
||||
* have successfully completed ExtendCLOG() --- if that routine fails, we
|
||||
* want the next incoming transaction to try it again. We cannot assign
|
||||
* want the next incoming transaction to try it again. We cannot assign
|
||||
* more XIDs until there is CLOG space for them.
|
||||
*/
|
||||
TransactionIdAdvance(ShmemVariableCache->nextXid);
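A sketch of what the advance step has to do, to the best of my understanding: bump the counter and skip the reserved special XIDs when the 32-bit value wraps around. The constant and the function name are assumptions, not a quote of the real macro:

    #include <stdint.h>

    typedef uint32_t TransactionId;

    #define FirstNormalTransactionId ((TransactionId) 3)

    /* Advance a transaction-ID counter, skipping the reserved permanent XIDs
     * (0, 1, 2) when the 32-bit counter wraps around. */
    static TransactionId
    advance_xid(TransactionId xid)
    {
        xid++;
        if (xid < FirstNormalTransactionId)
            xid = FirstNormalTransactionId;
        return xid;
    }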
|
||||
|
||||
/*
|
||||
* We must store the new XID into the shared ProcArray before releasing
|
||||
* XidGenLock. This ensures that every active XID older than
|
||||
* XidGenLock. This ensures that every active XID older than
|
||||
* latestCompletedXid is present in the ProcArray, which is essential for
|
||||
* correct OldestXmin tracking; see src/backend/access/transam/README.
|
||||
*
|
||||
* XXX by storing xid into MyPgXact without acquiring ProcArrayLock, we
|
||||
* are relying on fetch/store of an xid to be atomic, else other backends
|
||||
* might see a partially-set xid here. But holding both locks at once
|
||||
* might see a partially-set xid here. But holding both locks at once
|
||||
* would be a nasty concurrency hit. So for now, assume atomicity.
|
||||
*
|
||||
* Note that readers of PGXACT xid fields should be careful to fetch the
|
||||
@@ -289,7 +289,7 @@ SetTransactionIdLimit(TransactionId oldest_datfrozenxid, Oid oldest_datoid)
|
||||
|
||||
/*
|
||||
* We'll start complaining loudly when we get within 10M transactions of
|
||||
* the stop point. This is kind of arbitrary, but if you let your gas
|
||||
* the stop point. This is kind of arbitrary, but if you let your gas
|
||||
* gauge get down to 1% of full, would you be looking for the next gas
|
||||
* station? We need to be fairly liberal about this number because there
|
||||
* are lots of scenarios where most transactions are done by automatic
|
||||
@@ -390,7 +390,7 @@ SetTransactionIdLimit(TransactionId oldest_datfrozenxid, Oid oldest_datoid)
|
||||
* We primarily check whether oldestXidDB is valid. The cases we have in
|
||||
* mind are that that database was dropped, or the field was reset to zero
|
||||
* by pg_resetxlog. In either case we should force recalculation of the
|
||||
* wrap limit. Also do it if oldestXid is old enough to be forcing
|
||||
* wrap limit. Also do it if oldestXid is old enough to be forcing
|
||||
* autovacuums or other actions; this ensures we update our state as soon
|
||||
* as possible once extra overhead is being incurred.
|
||||
*/
|
||||
|
||||
@@ -270,7 +270,7 @@ static void CallSubXactCallbacks(SubXactEvent event,
|
||||
SubTransactionId parentSubid);
|
||||
static void CleanupTransaction(void);
|
||||
static void CheckTransactionChain(bool isTopLevel, bool throwError,
|
||||
const char *stmtType);
|
||||
const char *stmtType);
|
||||
static void CommitTransaction(void);
|
||||
static TransactionId RecordTransactionAbort(bool isSubXact);
|
||||
static void StartTransaction(void);
|
||||
@@ -450,7 +450,7 @@ AssignTransactionId(TransactionState s)
|
||||
{
|
||||
bool isSubXact = (s->parent != NULL);
|
||||
ResourceOwner currentOwner;
|
||||
bool log_unknown_top = false;
|
||||
bool log_unknown_top = false;
|
||||
|
||||
/* Assert that caller didn't screw up */
|
||||
Assert(!TransactionIdIsValid(s->transactionId));
|
||||
@@ -487,8 +487,8 @@ AssignTransactionId(TransactionState s)
|
||||
|
||||
/*
|
||||
* When wal_level=logical, guarantee that a subtransaction's xid can only
|
||||
* be seen in the WAL stream if its toplevel xid has been logged
|
||||
* before. If necessary we log a xact_assignment record with fewer than
|
||||
* be seen in the WAL stream if its toplevel xid has been logged before.
|
||||
* If necessary we log a xact_assignment record with fewer than
|
||||
* PGPROC_MAX_CACHED_SUBXIDS. Note that it is fine if didLogXid isn't set
|
||||
* for a transaction even though it appears in a WAL record, we just might
|
||||
* superfluously log something. That can happen when an xid is included
|
||||
@@ -637,7 +637,7 @@ SubTransactionIsActive(SubTransactionId subxid)
|
||||
*
|
||||
* "used" must be TRUE if the caller intends to use the command ID to mark
|
||||
* inserted/updated/deleted tuples. FALSE means the ID is being fetched
|
||||
* for read-only purposes (ie, as a snapshot validity cutoff). See
|
||||
* for read-only purposes (ie, as a snapshot validity cutoff). See
|
||||
* CommandCounterIncrement() for discussion.
|
||||
*/
|
||||
CommandId
|
||||
@@ -724,7 +724,7 @@ TransactionIdIsCurrentTransactionId(TransactionId xid)
|
||||
|
||||
/*
|
||||
* We always say that BootstrapTransactionId is "not my transaction ID"
|
||||
* even when it is (ie, during bootstrap). Along with the fact that
|
||||
* even when it is (ie, during bootstrap). Along with the fact that
|
||||
* transam.c always treats BootstrapTransactionId as already committed,
|
||||
* this causes the tqual.c routines to see all tuples as committed, which
|
||||
* is what we need during bootstrap. (Bootstrap mode only inserts tuples,
|
||||
@@ -866,7 +866,7 @@ AtStart_Memory(void)
|
||||
/*
|
||||
* If this is the first time through, create a private context for
|
||||
* AbortTransaction to work in. By reserving some space now, we can
|
||||
* insulate AbortTransaction from out-of-memory scenarios. Like
|
||||
* insulate AbortTransaction from out-of-memory scenarios. Like
|
||||
* ErrorContext, we set it up with slow growth rate and a nonzero minimum
|
||||
* size, so that space will be reserved immediately.
|
||||
*/
|
||||
@@ -969,7 +969,7 @@ AtSubStart_ResourceOwner(void)
|
||||
Assert(s->parent != NULL);
|
||||
|
||||
/*
|
||||
* Create a resource owner for the subtransaction. We make it a child of
|
||||
* Create a resource owner for the subtransaction. We make it a child of
|
||||
* the immediate parent's resource owner.
|
||||
*/
|
||||
s->curTransactionOwner =
|
||||
@@ -989,7 +989,7 @@ AtSubStart_ResourceOwner(void)
|
||||
* RecordTransactionCommit
|
||||
*
|
||||
* Returns latest XID among xact and its children, or InvalidTransactionId
|
||||
* if the xact has no XID. (We compute that here just because it's easier.)
|
||||
* if the xact has no XID. (We compute that here just because it's easier.)
|
||||
*/
|
||||
static TransactionId
|
||||
RecordTransactionCommit(void)
|
||||
@@ -1034,7 +1034,7 @@ RecordTransactionCommit(void)
|
||||
|
||||
/*
|
||||
* If we didn't create XLOG entries, we're done here; otherwise we
|
||||
* should flush those entries the same as a commit record. (An
|
||||
* should flush those entries the same as a commit record. (An
|
||||
* example of a possible record that wouldn't cause an XID to be
|
||||
* assigned is a sequence advance record due to nextval() --- we want
|
||||
* to flush that to disk before reporting commit.)
|
||||
@@ -1051,7 +1051,7 @@ RecordTransactionCommit(void)
|
||||
BufmgrCommit();
|
||||
|
||||
/*
|
||||
* Mark ourselves as within our "commit critical section". This
|
||||
* Mark ourselves as within our "commit critical section". This
|
||||
* forces any concurrent checkpoint to wait until we've updated
|
||||
* pg_clog. Without this, it is possible for the checkpoint to set
|
||||
* REDO after the XLOG record but fail to flush the pg_clog update to
|
||||
@@ -1059,7 +1059,7 @@ RecordTransactionCommit(void)
|
||||
* crashes a little later.
|
||||
*
|
||||
* Note: we could, but don't bother to, set this flag in
|
||||
* RecordTransactionAbort. That's because loss of a transaction abort
|
||||
* RecordTransactionAbort. That's because loss of a transaction abort
|
||||
* is noncritical; the presumption would be that it aborted, anyway.
|
||||
*
|
||||
* It's safe to change the delayChkpt flag of our own backend without
|
||||
@@ -1168,15 +1168,15 @@ RecordTransactionCommit(void)
|
||||
/*
|
||||
* Check if we want to commit asynchronously. We can allow the XLOG flush
|
||||
* to happen asynchronously if synchronous_commit=off, or if the current
|
||||
* transaction has not performed any WAL-logged operation. The latter
|
||||
* transaction has not performed any WAL-logged operation. The latter
|
||||
* case can arise if the current transaction wrote only to temporary
|
||||
* and/or unlogged tables. In case of a crash, the loss of such a
|
||||
* and/or unlogged tables. In case of a crash, the loss of such a
|
||||
* transaction will be irrelevant since temp tables will be lost anyway,
|
||||
* and unlogged tables will be truncated. (Given the foregoing, you might
|
||||
* think that it would be unnecessary to emit the XLOG record at all in
|
||||
* this case, but we don't currently try to do that. It would certainly
|
||||
* cause problems at least in Hot Standby mode, where the
|
||||
* KnownAssignedXids machinery requires tracking every XID assignment. It
|
||||
* KnownAssignedXids machinery requires tracking every XID assignment. It
|
||||
* might be OK to skip it only when wal_level < hot_standby, but for now
|
||||
* we don't.)
|
||||
*
|
||||
@@ -1423,7 +1423,7 @@ AtSubCommit_childXids(void)
|
||||
* RecordTransactionAbort
|
||||
*
|
||||
* Returns latest XID among xact and its children, or InvalidTransactionId
|
||||
* if the xact has no XID. (We compute that here just because it's easier.)
|
||||
* if the xact has no XID. (We compute that here just because it's easier.)
|
||||
*/
|
||||
static TransactionId
|
||||
RecordTransactionAbort(bool isSubXact)
|
||||
@@ -1440,7 +1440,7 @@ RecordTransactionAbort(bool isSubXact)
|
||||
|
||||
/*
|
||||
* If we haven't been assigned an XID, nobody will care whether we aborted
|
||||
* or not. Hence, we're done in that case. It does not matter if we have
|
||||
* or not. Hence, we're done in that case. It does not matter if we have
|
||||
* rels to delete (note that this routine is not responsible for actually
|
||||
* deleting 'em). We cannot have any child XIDs, either.
|
||||
*/
|
||||
@@ -1456,7 +1456,7 @@ RecordTransactionAbort(bool isSubXact)
|
||||
* We have a valid XID, so we should write an ABORT record for it.
|
||||
*
|
||||
* We do not flush XLOG to disk here, since the default assumption after a
|
||||
* crash would be that we aborted, anyway. For the same reason, we don't
|
||||
* crash would be that we aborted, anyway. For the same reason, we don't
|
||||
* need to worry about interlocking against checkpoint start.
|
||||
*/
|
||||
|
||||
@@ -1624,7 +1624,7 @@ AtSubAbort_childXids(void)
|
||||
|
||||
/*
|
||||
* We keep the child-XID arrays in TopTransactionContext (see
|
||||
* AtSubCommit_childXids). This means we'd better free the array
|
||||
* AtSubCommit_childXids). This means we'd better free the array
|
||||
* explicitly at abort to avoid leakage.
|
||||
*/
|
||||
if (s->childXids != NULL)
|
||||
@@ -1802,7 +1802,7 @@ StartTransaction(void)
|
||||
VirtualXactLockTableInsert(vxid);
|
||||
|
||||
/*
|
||||
* Advertise it in the proc array. We assume assignment of
|
||||
* Advertise it in the proc array. We assume assignment of
|
||||
* LocalTransactionID is atomic, and the backendId should be set already.
|
||||
*/
|
||||
Assert(MyProc->backendId == vxid.backendId);
|
||||
@@ -1899,7 +1899,7 @@ CommitTransaction(void)
|
||||
|
||||
/*
|
||||
* The remaining actions cannot call any user-defined code, so it's safe
|
||||
* to start shutting down within-transaction services. But note that most
|
||||
* to start shutting down within-transaction services. But note that most
|
||||
* of this stuff could still throw an error, which would switch us into
|
||||
* the transaction-abort path.
|
||||
*/
|
||||
@@ -2104,7 +2104,7 @@ PrepareTransaction(void)
|
||||
|
||||
/*
|
||||
* The remaining actions cannot call any user-defined code, so it's safe
|
||||
* to start shutting down within-transaction services. But note that most
|
||||
* to start shutting down within-transaction services. But note that most
|
||||
* of this stuff could still throw an error, which would switch us into
|
||||
* the transaction-abort path.
|
||||
*/
|
||||
@@ -2224,7 +2224,7 @@ PrepareTransaction(void)
|
||||
XactLastRecEnd = 0;
|
||||
|
||||
/*
|
||||
* Let others know about no transaction in progress by me. This has to be
|
||||
* Let others know about no transaction in progress by me. This has to be
|
||||
* done *after* the prepared transaction has been marked valid, else
|
||||
* someone may think it is unlocked and recyclable.
|
||||
*/
|
||||
@@ -2233,7 +2233,7 @@ PrepareTransaction(void)
|
||||
/*
|
||||
* This is all post-transaction cleanup. Note that if an error is raised
|
||||
* here, it's too late to abort the transaction. This should be just
|
||||
* noncritical resource releasing. See notes in CommitTransaction.
|
||||
* noncritical resource releasing. See notes in CommitTransaction.
|
||||
*/
|
||||
|
||||
CallXactCallbacks(XACT_EVENT_PREPARE);
|
||||
@@ -2411,7 +2411,7 @@ AbortTransaction(void)
|
||||
ProcArrayEndTransaction(MyProc, latestXid);
|
||||
|
||||
/*
|
||||
* Post-abort cleanup. See notes in CommitTransaction() concerning
|
||||
* Post-abort cleanup. See notes in CommitTransaction() concerning
|
||||
* ordering. We can skip all of it if the transaction failed before
|
||||
* creating a resource owner.
|
||||
*/
|
||||
@@ -2646,7 +2646,7 @@ CommitTransactionCommand(void)
|
||||
|
||||
/*
|
||||
* Here we were in a perfectly good transaction block but the user
|
||||
* told us to ROLLBACK anyway. We have to abort the transaction
|
||||
* told us to ROLLBACK anyway. We have to abort the transaction
|
||||
* and then clean up.
|
||||
*/
|
||||
case TBLOCK_ABORT_PENDING:
|
||||
@@ -2666,7 +2666,7 @@ CommitTransactionCommand(void)
|
||||
|
||||
/*
|
||||
* We were just issued a SAVEPOINT inside a transaction block.
|
||||
* Start a subtransaction. (DefineSavepoint already did
|
||||
* Start a subtransaction. (DefineSavepoint already did
|
||||
* PushTransaction, so as to have someplace to put the SUBBEGIN
|
||||
* state.)
|
||||
*/
|
||||
@@ -2870,7 +2870,7 @@ AbortCurrentTransaction(void)
|
||||
break;
|
||||
|
||||
/*
|
||||
* Here, we failed while trying to COMMIT. Clean up the
|
||||
* Here, we failed while trying to COMMIT. Clean up the
|
||||
* transaction and return to idle state (we do not want to stay in
|
||||
* the transaction).
|
||||
*/
|
||||
@@ -2932,7 +2932,7 @@ AbortCurrentTransaction(void)
|
||||
|
||||
/*
|
||||
* If we failed while trying to create a subtransaction, clean up
|
||||
* the broken subtransaction and abort the parent. The same
|
||||
* the broken subtransaction and abort the parent. The same
|
||||
* applies if we get a failure while ending a subtransaction.
|
||||
*/
|
||||
case TBLOCK_SUBBEGIN:
|
||||
@@ -3485,7 +3485,7 @@ UserAbortTransactionBlock(void)
|
||||
break;
|
||||
|
||||
/*
|
||||
* We are inside a subtransaction. Mark everything up to top
|
||||
* We are inside a subtransaction. Mark everything up to top
|
||||
* level as exitable.
|
||||
*/
|
||||
case TBLOCK_SUBINPROGRESS:
|
||||
@@ -3619,7 +3619,7 @@ ReleaseSavepoint(List *options)
|
||||
break;
|
||||
|
||||
/*
|
||||
* We are in a non-aborted subtransaction. This is the only valid
|
||||
* We are in a non-aborted subtransaction. This is the only valid
|
||||
* case.
|
||||
*/
|
||||
case TBLOCK_SUBINPROGRESS:
|
||||
@@ -3676,7 +3676,7 @@ ReleaseSavepoint(List *options)
|
||||
|
||||
/*
|
||||
* Mark "commit pending" all subtransactions up to the target
|
||||
* subtransaction. The actual commits will happen when control gets to
|
||||
* subtransaction. The actual commits will happen when control gets to
|
||||
* CommitTransactionCommand.
|
||||
*/
|
||||
xact = CurrentTransactionState;
|
||||
@@ -3775,7 +3775,7 @@ RollbackToSavepoint(List *options)
|
||||
|
||||
/*
|
||||
* Mark "abort pending" all subtransactions up to the target
|
||||
* subtransaction. The actual aborts will happen when control gets to
|
||||
* subtransaction. The actual aborts will happen when control gets to
|
||||
* CommitTransactionCommand.
|
||||
*/
|
||||
xact = CurrentTransactionState;
|
||||
@@ -4182,7 +4182,7 @@ CommitSubTransaction(void)
|
||||
CommandCounterIncrement();
|
||||
|
||||
/*
|
||||
* Prior to 8.4 we marked subcommit in clog at this point. We now only
|
||||
* Prior to 8.4 we marked subcommit in clog at this point. We now only
|
||||
* perform that step, if required, as part of the atomic update of the
|
||||
* whole transaction tree at top level commit or abort.
|
||||
*/
|
||||
@@ -4641,7 +4641,7 @@ TransStateAsString(TransState state)
|
||||
/*
|
||||
* xactGetCommittedChildren
|
||||
*
|
||||
* Gets the list of committed children of the current transaction. The return
|
||||
* Gets the list of committed children of the current transaction. The return
|
||||
* value is the number of child transactions. *ptr is set to point to an
|
||||
* array of TransactionIds. The array is allocated in TopTransactionContext;
|
||||
* the caller should *not* pfree() it (this is a change from pre-8.4 code!).
|
||||
|
||||
@@ -101,7 +101,7 @@ bool XLOG_DEBUG = false;
|
||||
* future XLOG segment as long as there aren't already XLOGfileslop future
|
||||
* segments; else we'll delete it. This could be made a separate GUC
|
||||
* variable, but at present I think it's sufficient to hardwire it as
|
||||
* 2*CheckPointSegments+1. Under normal conditions, a checkpoint will free
|
||||
* 2*CheckPointSegments+1. Under normal conditions, a checkpoint will free
|
||||
* no more than 2*CheckPointSegments log segments, and we want to recycle all
|
||||
* of them; the +1 allows boundary cases to happen without wasting a
|
||||
* delete/create-segment cycle.
|
||||
@@ -190,7 +190,7 @@ static bool LocalHotStandbyActive = false;
|
||||
* 0: unconditionally not allowed to insert XLOG
|
||||
* -1: must check RecoveryInProgress(); disallow until it is false
|
||||
* Most processes start with -1 and transition to 1 after seeing that recovery
|
||||
* is not in progress. But we can also force the value for special cases.
|
||||
* is not in progress. But we can also force the value for special cases.
|
||||
* The coding in XLogInsertAllowed() depends on the first two of these states
|
||||
* being numerically the same as bool true and false.
|
||||
*/
|
||||
@@ -223,7 +223,7 @@ static bool recoveryPauseAtTarget = true;
|
||||
static TransactionId recoveryTargetXid;
|
||||
static TimestampTz recoveryTargetTime;
|
||||
static char *recoveryTargetName;
|
||||
static int min_recovery_apply_delay = 0;
|
||||
static int min_recovery_apply_delay = 0;
|
||||
static TimestampTz recoveryDelayUntilTime;
|
||||
|
||||
/* options taken from recovery.conf for XLOG streaming */
|
||||
@@ -261,7 +261,7 @@ static bool recoveryStopAfter;
|
||||
*
|
||||
* expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and the timelines of
|
||||
* its known parents, newest first (so recoveryTargetTLI is always the
|
||||
* first list member). Only these TLIs are expected to be seen in the WAL
|
||||
* first list member). Only these TLIs are expected to be seen in the WAL
|
||||
* segments we read, and indeed only these TLIs will be considered as
|
||||
* candidate WAL files to open at all.
|
||||
*
|
||||
@@ -290,7 +290,7 @@ XLogRecPtr XactLastRecEnd = InvalidXLogRecPtr;
|
||||
/*
|
||||
* RedoRecPtr is this backend's local copy of the REDO record pointer
|
||||
* (which is almost but not quite the same as a pointer to the most recent
|
||||
* CHECKPOINT record). We update this from the shared-memory copy,
|
||||
* CHECKPOINT record). We update this from the shared-memory copy,
|
||||
* XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
|
||||
* hold an insertion lock). See XLogInsert for details. We are also allowed
|
||||
* to update from XLogCtl->RedoRecPtr if we hold the info_lck;
|
||||
@@ -418,11 +418,11 @@ typedef struct XLogCtlInsert
|
||||
slock_t insertpos_lck; /* protects CurrBytePos and PrevBytePos */
|
||||
|
||||
/*
|
||||
* CurrBytePos is the end of reserved WAL. The next record will be inserted
|
||||
* at that position. PrevBytePos is the start position of the previously
|
||||
* inserted (or rather, reserved) record - it is copied to the prev-link
|
||||
* of the next record. These are stored as "usable byte positions" rather
|
||||
* than XLogRecPtrs (see XLogBytePosToRecPtr()).
|
||||
* CurrBytePos is the end of reserved WAL. The next record will be
|
||||
* inserted at that position. PrevBytePos is the start position of the
|
||||
* previously inserted (or rather, reserved) record - it is copied to the
|
||||
* prev-link of the next record. These are stored as "usable byte
|
||||
* positions" rather than XLogRecPtrs (see XLogBytePosToRecPtr()).
|
||||
*/
|
||||
uint64 CurrBytePos;
|
||||
uint64 PrevBytePos;
|
||||
@@ -464,7 +464,7 @@ typedef struct XLogCtlInsert
|
||||
/*
|
||||
* WAL insertion locks.
|
||||
*/
|
||||
WALInsertLockPadded *WALInsertLocks;
|
||||
WALInsertLockPadded *WALInsertLocks;
|
||||
LWLockTranche WALInsertLockTranche;
|
||||
int WALInsertLockTrancheId;
|
||||
} XLogCtlInsert;
|
||||
@@ -504,10 +504,11 @@ typedef struct XLogCtlData
|
||||
* Latest initialized page in the cache (last byte position + 1).
|
||||
*
|
||||
* To change the identity of a buffer (and InitializedUpTo), you need to
|
||||
* hold WALBufMappingLock. To change the identity of a buffer that's still
|
||||
* dirty, the old page needs to be written out first, and for that you
|
||||
* need WALWriteLock, and you need to ensure that there are no in-progress
|
||||
* insertions to the page by calling WaitXLogInsertionsToFinish().
|
||||
* hold WALBufMappingLock. To change the identity of a buffer that's
|
||||
* still dirty, the old page needs to be written out first, and for that
|
||||
* you need WALWriteLock, and you need to ensure that there are no
|
||||
* in-progress insertions to the page by calling
|
||||
* WaitXLogInsertionsToFinish().
|
||||
*/
|
||||
XLogRecPtr InitializedUpTo;
|
||||
|
||||
@@ -799,8 +800,8 @@ static void rm_redo_error_callback(void *arg);
|
||||
static int get_sync_bit(int method);
|
||||
|
||||
static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch,
|
||||
XLogRecData *rdata,
|
||||
XLogRecPtr StartPos, XLogRecPtr EndPos);
|
||||
XLogRecData *rdata,
|
||||
XLogRecPtr StartPos, XLogRecPtr EndPos);
|
||||
static void ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos,
|
||||
XLogRecPtr *EndPos, XLogRecPtr *PrevPtr);
|
||||
static bool ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos,
|
||||
@@ -860,6 +861,7 @@ XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
|
||||
if (rechdr == NULL)
|
||||
{
|
||||
static char rechdrbuf[SizeOfXLogRecord + MAXIMUM_ALIGNOF];
|
||||
|
||||
rechdr = (XLogRecord *) MAXALIGN(&rechdrbuf);
|
||||
MemSet(rechdr, 0, SizeOfXLogRecord);
|
||||
}
|
||||
@@ -1075,12 +1077,12 @@ begin:;
* record to the shared WAL buffer cache is a two-step process:
*
* 1. Reserve the right amount of space from the WAL. The current head of
* reserved space is kept in Insert->CurrBytePos, and is protected by
* insertpos_lck.
* reserved space is kept in Insert->CurrBytePos, and is protected by
* insertpos_lck.
*
* 2. Copy the record to the reserved WAL space. This involves finding the
* correct WAL buffer containing the reserved space, and copying the
* record in place. This can be done concurrently in multiple processes.
* correct WAL buffer containing the reserved space, and copying the
* record in place. This can be done concurrently in multiple processes.
*
* To keep track of which insertions are still in-progress, each concurrent
* inserter acquires an insertion lock. In addition to just indicating that
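The two-step scheme described in this hunk can be pictured with a minimal sketch, assuming invented names (reserve_insert_location, copy_record, log_buffer) and a pthread mutex standing in for the insertpos_lck spinlock; it illustrates the idea only and is not PostgreSQL code:

#include <assert.h>
#include <pthread.h>
#include <stdint.h>
#include <string.h>

#define LOG_BUFFER_SIZE (1024 * 1024)

static pthread_mutex_t insertpos_lck = PTHREAD_MUTEX_INITIALIZER; /* protects the positions below */
static uint64_t curr_byte_pos = 0;           /* head of reserved log space */
static uint64_t prev_byte_pos = 0;           /* start of previously reserved record */
static char     log_buffer[LOG_BUFFER_SIZE]; /* stand-in for the shared WAL buffers */

/* Step 1: reserve 'size' bytes; cheap and serialized, returns the start offset. */
static uint64_t
reserve_insert_location(uint64_t size, uint64_t *prev)
{
	uint64_t start;

	pthread_mutex_lock(&insertpos_lck);
	start = curr_byte_pos;
	*prev = prev_byte_pos;
	prev_byte_pos = start;
	curr_byte_pos = start + size;
	pthread_mutex_unlock(&insertpos_lck);

	return start;
}

/* Step 2: copy the record into the reserved space, outside the lock; many
 * processes can run this part concurrently for different reservations.
 * Wrap-around bookkeeping of a real ring buffer is ignored here. */
static void
copy_record(uint64_t start, const char *data, uint64_t size)
{
	assert(start % LOG_BUFFER_SIZE + size <= LOG_BUFFER_SIZE);
	memcpy(log_buffer + (start % LOG_BUFFER_SIZE), data, size);
}

The point of the split is that the serialized part only advances two integers, so the lock is held very briefly while the potentially large memcpy runs in parallel.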
@@ -1232,6 +1234,7 @@ begin:;
|
||||
{
|
||||
TRACE_POSTGRESQL_XLOG_SWITCH();
|
||||
XLogFlush(EndPos);
|
||||
|
||||
/*
|
||||
* Even though we reserved the rest of the segment for us, which is
|
||||
* reflected in EndPos, we return a pointer to just the end of the
|
||||
@@ -1272,7 +1275,7 @@ begin:;
|
||||
rdt_lastnormal->next = NULL;
|
||||
|
||||
initStringInfo(&recordbuf);
|
||||
for (;rdata != NULL; rdata = rdata->next)
|
||||
for (; rdata != NULL; rdata = rdata->next)
|
||||
appendBinaryStringInfo(&recordbuf, rdata->data, rdata->len);
|
||||
|
||||
appendStringInfoString(&buf, " - ");
|
||||
@@ -1514,8 +1517,8 @@ CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata,
|
||||
|
||||
/*
|
||||
* If this was an xlog-switch, it's not enough to write the switch record,
|
||||
* we also have to consume all the remaining space in the WAL segment.
|
||||
* We have already reserved it for us, but we still need to make sure it's
|
||||
* we also have to consume all the remaining space in the WAL segment. We
|
||||
* have already reserved it for us, but we still need to make sure it's
|
||||
* allocated and zeroed in the WAL buffers so that when the caller (or
|
||||
* someone else) does XLogWrite(), it can really write out all the zeros.
|
||||
*/
|
||||
@@ -1556,14 +1559,14 @@ WALInsertLockAcquire(void)

/*
* It doesn't matter which of the WAL insertion locks we acquire, so try
* the one we used last time. If the system isn't particularly busy,
* it's a good bet that it's still available, and it's good to have some
* the one we used last time. If the system isn't particularly busy, it's
* a good bet that it's still available, and it's good to have some
* affinity to a particular lock so that you don't unnecessarily bounce
* cache lines between processes when there's no contention.
*
* If this is the first time through in this backend, pick a lock
* (semi-)randomly. This allows the locks to be used evenly if you have
* a lot of very short connections.
* (semi-)randomly. This allows the locks to be used evenly if you have a
* lot of very short connections.
*/
static int lockToTry = -1;

@@ -1583,10 +1586,10 @@ WALInsertLockAcquire(void)
/*
* If we couldn't get the lock immediately, try another lock next
* time. On a system with more insertion locks than concurrent
* inserters, this causes all the inserters to eventually migrate
* to a lock that no-one else is using. On a system with more
* inserters than locks, it still helps to distribute the inserters
* evenly across the locks.
* inserters, this causes all the inserters to eventually migrate to a
* lock that no-one else is using. On a system with more inserters
* than locks, it still helps to distribute the inserters evenly
* across the locks.
*/
lockToTry = (lockToTry + 1) % num_xloginsert_locks;
}
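The affinity-plus-rotation behaviour described in the two hunks above can be sketched as follows, assuming hypothetical names (NUM_INSERT_LOCKS, insert_locks, acquire_insert_lock) and pthread mutexes in place of LWLocks; this is an illustration, not the real implementation:

#include <pthread.h>
#include <stdlib.h>

#define NUM_INSERT_LOCKS 8

static pthread_mutex_t insert_locks[NUM_INSERT_LOCKS];
static _Thread_local int lock_to_try = -1;	/* per-backend affinity */

/* Call once at startup, before any acquire_insert_lock(). */
static void
init_insert_locks(void)
{
	for (int i = 0; i < NUM_INSERT_LOCKS; i++)
		pthread_mutex_init(&insert_locks[i], NULL);
}

/* Returns the index of the lock we now hold. */
static int
acquire_insert_lock(void)
{
	int idx;

	if (lock_to_try == -1)
		lock_to_try = rand() % NUM_INSERT_LOCKS;	/* first time: pick one semi-randomly */

	idx = lock_to_try;
	if (pthread_mutex_trylock(&insert_locks[idx]) != 0)
	{
		/*
		 * Contended: wait for this lock anyway, but remember to try a
		 * different one next time so inserters spread out over the locks.
		 */
		pthread_mutex_lock(&insert_locks[idx]);
		lock_to_try = (idx + 1) % NUM_INSERT_LOCKS;
	}
	return idx;
}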
@@ -1604,8 +1607,8 @@ WALInsertLockAcquireExclusive(void)
|
||||
/*
|
||||
* When holding all the locks, we only update the last lock's insertingAt
|
||||
* indicator. The others are set to 0xFFFFFFFFFFFFFFFF, which is higher
|
||||
* than any real XLogRecPtr value, to make sure that no-one blocks
|
||||
* waiting on those.
|
||||
* than any real XLogRecPtr value, to make sure that no-one blocks waiting
|
||||
* on those.
|
||||
*/
|
||||
for (i = 0; i < num_xloginsert_locks - 1; i++)
|
||||
{
|
||||
@@ -1655,7 +1658,7 @@ WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt)
|
||||
* WALInsertLockAcquireExclusive.
|
||||
*/
|
||||
LWLockUpdateVar(&WALInsertLocks[num_xloginsert_locks - 1].l.lock,
|
||||
&WALInsertLocks[num_xloginsert_locks - 1].l.insertingAt,
|
||||
&WALInsertLocks[num_xloginsert_locks - 1].l.insertingAt,
|
||||
insertingAt);
|
||||
}
|
||||
else
|
||||
@@ -1716,15 +1719,16 @@ WaitXLogInsertionsToFinish(XLogRecPtr upto)
|
||||
* Loop through all the locks, sleeping on any in-progress insert older
|
||||
* than 'upto'.
|
||||
*
|
||||
* finishedUpto is our return value, indicating the point upto which
|
||||
* all the WAL insertions have been finished. Initialize it to the head
|
||||
* of reserved WAL, and as we iterate through the insertion locks, back it
|
||||
* finishedUpto is our return value, indicating the point upto which all
|
||||
* the WAL insertions have been finished. Initialize it to the head of
|
||||
* reserved WAL, and as we iterate through the insertion locks, back it
|
||||
* out for any insertion that's still in progress.
|
||||
*/
|
||||
finishedUpto = reservedUpto;
|
||||
for (i = 0; i < num_xloginsert_locks; i++)
|
||||
{
|
||||
XLogRecPtr insertingat = InvalidXLogRecPtr;
|
||||
XLogRecPtr insertingat = InvalidXLogRecPtr;
|
||||
|
||||
do
|
||||
{
|
||||
/*
|
||||
@@ -1797,9 +1801,9 @@ GetXLogBuffer(XLogRecPtr ptr)
}

/*
* The XLog buffer cache is organized so that a page is always loaded
* to a particular buffer. That way we can easily calculate the buffer
* a given page must be loaded into, from the XLogRecPtr alone.
* The XLog buffer cache is organized so that a page is always loaded to a
* particular buffer. That way we can easily calculate the buffer a given
* page must be loaded into, from the XLogRecPtr alone.
*/
idx = XLogRecPtrToBufIdx(ptr);

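A sketch of the fixed page-to-buffer mapping mentioned above, with assumed constants BLOCK_SIZE and N_BUFFERS and an invented helper lsn_to_buf_idx (the real code uses XLogRecPtrToBufIdx):

#include <stdint.h>

#define BLOCK_SIZE 8192		/* one log page */
#define N_BUFFERS  64		/* number of in-memory page slots */

/*
 * Map a log position to its page slot: page number modulo the number of
 * slots.  Because the mapping is fixed, any process can compute where a
 * given page must live without consulting shared state.
 */
static inline int
lsn_to_buf_idx(uint64_t lsn)
{
	return (int) ((lsn / BLOCK_SIZE) % N_BUFFERS);
}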
@@ -1827,8 +1831,8 @@ GetXLogBuffer(XLogRecPtr ptr)
|
||||
if (expectedEndPtr != endptr)
|
||||
{
|
||||
/*
|
||||
* Let others know that we're finished inserting the record up
|
||||
* to the page boundary.
|
||||
* Let others know that we're finished inserting the record up to the
|
||||
* page boundary.
|
||||
*/
|
||||
WALInsertLockUpdateInsertingAt(expectedEndPtr - XLOG_BLCKSZ);
|
||||
|
||||
@@ -1837,7 +1841,7 @@ GetXLogBuffer(XLogRecPtr ptr)
|
||||
|
||||
if (expectedEndPtr != endptr)
|
||||
elog(PANIC, "could not find WAL buffer for %X/%X",
|
||||
(uint32) (ptr >> 32) , (uint32) ptr);
|
||||
(uint32) (ptr >> 32), (uint32) ptr);
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -1974,8 +1978,8 @@ XLogRecPtrToBytePos(XLogRecPtr ptr)
|
||||
else
|
||||
{
|
||||
result = fullsegs * UsableBytesInSegment +
|
||||
(XLOG_BLCKSZ - SizeOfXLogLongPHD) + /* account for first page */
|
||||
(fullpages - 1) * UsableBytesInPage; /* full pages */
|
||||
(XLOG_BLCKSZ - SizeOfXLogLongPHD) + /* account for first page */
|
||||
(fullpages - 1) * UsableBytesInPage; /* full pages */
|
||||
if (offset > 0)
|
||||
{
|
||||
Assert(offset >= SizeOfXLogShortPHD);
|
||||
@@ -2170,8 +2174,8 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic)
|
||||
}
|
||||
|
||||
/*
|
||||
* Now the next buffer slot is free and we can set it up to be the next
|
||||
* output page.
|
||||
* Now the next buffer slot is free and we can set it up to be the
|
||||
* next output page.
|
||||
*/
|
||||
NewPageBeginPtr = XLogCtl->InitializedUpTo;
|
||||
NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;
|
||||
@@ -2194,7 +2198,8 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic)
|
||||
/* NewPage->xlp_info = 0; */ /* done by memset */
|
||||
NewPage ->xlp_tli = ThisTimeLineID;
|
||||
NewPage ->xlp_pageaddr = NewPageBeginPtr;
|
||||
/* NewPage->xlp_rem_len = 0; */ /* done by memset */
|
||||
|
||||
/* NewPage->xlp_rem_len = 0; */ /* done by memset */
|
||||
|
||||
/*
|
||||
* If online backup is not in progress, mark the header to indicate
|
||||
@@ -2202,12 +2207,12 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic)
|
||||
* blocks. This allows the WAL archiver to know whether it is safe to
|
||||
* compress archived WAL data by transforming full-block records into
|
||||
* the non-full-block format. It is sufficient to record this at the
|
||||
* page level because we force a page switch (in fact a segment switch)
|
||||
* when starting a backup, so the flag will be off before any records
|
||||
* can be written during the backup. At the end of a backup, the last
|
||||
* page will be marked as all unsafe when perhaps only part is unsafe,
|
||||
* but at worst the archiver would miss the opportunity to compress a
|
||||
* few records.
|
||||
* page level because we force a page switch (in fact a segment
|
||||
* switch) when starting a backup, so the flag will be off before any
|
||||
* records can be written during the backup. At the end of a backup,
|
||||
* the last page will be marked as all unsafe when perhaps only part
|
||||
* is unsafe, but at worst the archiver would miss the opportunity to
|
||||
* compress a few records.
|
||||
*/
|
||||
if (!Insert->forcePageWrites)
|
||||
NewPage ->xlp_info |= XLP_BKP_REMOVABLE;
|
||||
@@ -2329,7 +2334,8 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
|
||||
* if we're passed a bogus WriteRqst.Write that is past the end of the
|
||||
* last page that's been initialized by AdvanceXLInsertBuffer.
|
||||
*/
|
||||
XLogRecPtr EndPtr = XLogCtl->xlblocks[curridx];
|
||||
XLogRecPtr EndPtr = XLogCtl->xlblocks[curridx];
|
||||
|
||||
if (LogwrtResult.Write >= EndPtr)
|
||||
elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
|
||||
(uint32) (LogwrtResult.Write >> 32),
|
||||
@@ -2413,7 +2419,7 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
do
{
errno = 0;
written = write(openLogFile, from, nleft);
written = write(openLogFile, from, nleft);
if (written <= 0)
{
if (errno == EINTR)
@@ -2422,7 +2428,7 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
(errcode_for_file_access(),
errmsg("could not write to log file %s "
"at offset %u, length %zu: %m",
XLogFileNameP(ThisTimeLineID, openLogSegNo),
XLogFileNameP(ThisTimeLineID, openLogSegNo),
openLogOff, nbytes)));
}
nleft -= written;
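The surrounding write loop follows the usual POSIX pattern of retrying on EINTR and continuing after partial writes; a self-contained sketch of that pattern, using an invented helper write_all, looks like this:

#include <errno.h>
#include <unistd.h>

/*
 * Write 'count' bytes to 'fd', retrying on EINTR and on partial writes.
 * Returns 0 on success, -1 on error (errno set by write).
 */
static int
write_all(int fd, const char *buf, size_t count)
{
	while (count > 0)
	{
		ssize_t written;

		errno = 0;
		written = write(fd, buf, count);
		if (written <= 0)
		{
			if (errno == EINTR)
				continue;		/* interrupted by a signal: just retry */
			return -1;			/* real error (or zero-length write) */
		}
		buf += written;
		count -= (size_t) written;
	}
	return 0;
}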
@@ -2500,7 +2506,7 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
|
||||
{
|
||||
/*
|
||||
* Could get here without iterating above loop, in which case we might
|
||||
* have no open file or the wrong one. However, we do not need to
|
||||
* have no open file or the wrong one. However, we do not need to
|
||||
* fsync more than one file.
|
||||
*/
|
||||
if (sync_method != SYNC_METHOD_OPEN &&
|
||||
@@ -2569,7 +2575,7 @@ XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
|
||||
|
||||
/*
|
||||
* If the WALWriter is sleeping, we should kick it to make it come out of
|
||||
* low-power mode. Otherwise, determine whether there's a full page of
|
||||
* low-power mode. Otherwise, determine whether there's a full page of
|
||||
* WAL available to write.
|
||||
*/
|
||||
if (!sleeping)
|
||||
@@ -2616,7 +2622,8 @@ XLogGetReplicationSlotMinimumLSN(void)
|
||||
{
|
||||
/* use volatile pointer to prevent code rearrangement */
|
||||
volatile XLogCtlData *xlogctl = XLogCtl;
|
||||
XLogRecPtr retval;
|
||||
XLogRecPtr retval;
|
||||
|
||||
SpinLockAcquire(&xlogctl->info_lck);
|
||||
retval = xlogctl->replicationSlotMinLSN;
|
||||
SpinLockRelease(&xlogctl->info_lck);
|
||||
@@ -2883,9 +2890,9 @@ XLogFlush(XLogRecPtr record)
|
||||
* We normally flush only completed blocks; but if there is nothing to do on
|
||||
* that basis, we check for unflushed async commits in the current incomplete
|
||||
* block, and flush through the latest one of those. Thus, if async commits
|
||||
* are not being used, we will flush complete blocks only. We can guarantee
|
||||
* are not being used, we will flush complete blocks only. We can guarantee
|
||||
* that async commits reach disk after at most three cycles; normally only
|
||||
* one or two. (When flushing complete blocks, we allow XLogWrite to write
|
||||
* one or two. (When flushing complete blocks, we allow XLogWrite to write
|
||||
* "flexibly", meaning it can stop at the end of the buffer ring; this makes a
|
||||
* difference only with very high load or long wal_writer_delay, but imposes
|
||||
* one extra cycle for the worst case for async commits.)
|
||||
@@ -3060,7 +3067,7 @@ XLogNeedsFlush(XLogRecPtr record)
|
||||
* log, seg: identify segment to be created/opened.
|
||||
*
|
||||
* *use_existent: if TRUE, OK to use a pre-existing file (else, any
|
||||
* pre-existing file will be deleted). On return, TRUE if a pre-existing
|
||||
* pre-existing file will be deleted). On return, TRUE if a pre-existing
|
||||
* file was used.
|
||||
*
|
||||
* use_lock: if TRUE, acquire ControlFileLock while moving file into
|
||||
@@ -3127,11 +3134,11 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
errmsg("could not create file \"%s\": %m", tmppath)));

/*
* Zero-fill the file. We have to do this the hard way to ensure that all
* Zero-fill the file. We have to do this the hard way to ensure that all
* the file space has really been allocated --- on platforms that allow
* "holes" in files, just seeking to the end doesn't allocate intermediate
* space. This way, we know that we have all the space and (after the
* fsync below) that all the indirect blocks are down on disk. Therefore,
* fsync below) that all the indirect blocks are down on disk. Therefore,
* fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the
* log file.
*
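A condensed sketch of the zero-filling approach described above, assuming an invented helper zero_fill_segment and illustrative constants SEG_SIZE and FILL_BLOCK; error handling is reduced to a return code:

#include <errno.h>
#include <string.h>
#include <unistd.h>

#define SEG_SIZE   (16 * 1024 * 1024)	/* size of one log segment */
#define FILL_BLOCK 8192

/*
 * Preallocate a segment by physically writing zeros, then fsync, so that
 * later writes only overwrite already-allocated blocks.  Returns 0 on
 * success, -1 on error (caller inspects errno and removes the file).
 */
static int
zero_fill_segment(int fd)
{
	char	zeroes[FILL_BLOCK];
	size_t	done = 0;

	memset(zeroes, 0, sizeof(zeroes));
	while (done < SEG_SIZE)
	{
		ssize_t n = write(fd, zeroes, sizeof(zeroes));

		if (n < 0 && errno == EINTR)
			continue;			/* interrupted: retry the same block */
		if (n <= 0)
			return -1;
		done += (size_t) n;
	}
	if (fsync(fd) != 0)
		return -1;
	return 0;
}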
@@ -3223,7 +3230,7 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
|
||||
* a different timeline)
|
||||
*
|
||||
* Currently this is only used during recovery, and so there are no locking
|
||||
* considerations. But we should be just as tense as XLogFileInit to avoid
|
||||
* considerations. But we should be just as tense as XLogFileInit to avoid
|
||||
* emplacing a bogus file.
|
||||
*/
|
||||
static void
|
||||
@@ -3434,7 +3441,7 @@ XLogFileOpen(XLogSegNo segno)
|
||||
if (fd < 0)
|
||||
ereport(PANIC,
|
||||
(errcode_for_file_access(),
|
||||
errmsg("could not open transaction log file \"%s\": %m", path)));
|
||||
errmsg("could not open transaction log file \"%s\": %m", path)));
|
||||
|
||||
return fd;
|
||||
}
|
||||
@@ -3541,13 +3548,13 @@ XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source)
|
||||
* the timelines listed in expectedTLEs.
|
||||
*
|
||||
* We expect curFileTLI on entry to be the TLI of the preceding file in
|
||||
* sequence, or 0 if there was no predecessor. We do not allow curFileTLI
|
||||
* sequence, or 0 if there was no predecessor. We do not allow curFileTLI
|
||||
* to go backwards; this prevents us from picking up the wrong file when a
|
||||
* parent timeline extends to higher segment numbers than the child we
|
||||
* want to read.
|
||||
*
|
||||
* If we haven't read the timeline history file yet, read it now, so that
|
||||
* we know which TLIs to scan. We don't save the list in expectedTLEs,
|
||||
* we know which TLIs to scan. We don't save the list in expectedTLEs,
|
||||
* however, unless we actually find a valid segment. That way if there is
|
||||
* neither a timeline history file nor a WAL segment in the archive, and
|
||||
* streaming replication is set up, we'll read the timeline history file
|
||||
@@ -3611,7 +3618,7 @@ XLogFileClose(void)
|
||||
|
||||
/*
|
||||
* WAL segment files will not be re-read in normal operation, so we advise
|
||||
* the OS to release any cached pages. But do not do so if WAL archiving
|
||||
* the OS to release any cached pages. But do not do so if WAL archiving
|
||||
* or streaming is active, because archiver and walsender process could
|
||||
* use the cache to read the WAL segment.
|
||||
*/
|
||||
@@ -3777,7 +3784,7 @@ RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr endptr)
|
||||
{
|
||||
/*
|
||||
* We ignore the timeline part of the XLOG segment identifiers in
|
||||
* deciding whether a segment is still needed. This ensures that we
|
||||
* deciding whether a segment is still needed. This ensures that we
|
||||
* won't prematurely remove a segment from a parent timeline. We could
|
||||
* probably be a little more proactive about removing segments of
|
||||
* non-parent timelines, but that would be a whole lot more
|
||||
@@ -3828,6 +3835,7 @@ RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr endptr)
|
||||
xlde->d_name)));
|
||||
|
||||
#ifdef WIN32
|
||||
|
||||
/*
|
||||
* On Windows, if another process (e.g another backend)
|
||||
* holds the file open in FILE_SHARE_DELETE mode, unlink
|
||||
@@ -4310,7 +4318,7 @@ rescanLatestTimeLine(void)
|
||||
* I/O routines for pg_control
|
||||
*
|
||||
* *ControlFile is a buffer in shared memory that holds an image of the
|
||||
* contents of pg_control. WriteControlFile() initializes pg_control
|
||||
* contents of pg_control. WriteControlFile() initializes pg_control
|
||||
* given a preloaded buffer, ReadControlFile() loads the buffer from
|
||||
* the pg_control file (during postmaster or standalone-backend startup),
|
||||
* and UpdateControlFile() rewrites pg_control after we modify xlog state.
|
||||
@@ -4715,7 +4723,7 @@ check_wal_buffers(int *newval, void **extra, GucSource source)
|
||||
{
|
||||
/*
|
||||
* If we haven't yet changed the boot_val default of -1, just let it
|
||||
* be. We'll fix it when XLOGShmemSize is called.
|
||||
* be. We'll fix it when XLOGShmemSize is called.
|
||||
*/
|
||||
if (XLOGbuffers == -1)
|
||||
return true;
|
||||
@@ -4815,7 +4823,7 @@ XLOGShmemInit(void)
|
||||
|
||||
/* WAL insertion locks. Ensure they're aligned to the full padded size */
|
||||
allocptr += sizeof(WALInsertLockPadded) -
|
||||
((uintptr_t) allocptr) % sizeof(WALInsertLockPadded);
|
||||
((uintptr_t) allocptr) %sizeof(WALInsertLockPadded);
|
||||
WALInsertLocks = XLogCtl->Insert.WALInsertLocks =
|
||||
(WALInsertLockPadded *) allocptr;
|
||||
allocptr += sizeof(WALInsertLockPadded) * num_xloginsert_locks;
|
||||
@@ -4836,8 +4844,8 @@ XLOGShmemInit(void)
|
||||
|
||||
/*
|
||||
* Align the start of the page buffers to a full xlog block size boundary.
|
||||
* This simplifies some calculations in XLOG insertion. It is also required
|
||||
* for O_DIRECT.
|
||||
* This simplifies some calculations in XLOG insertion. It is also
|
||||
* required for O_DIRECT.
|
||||
*/
|
||||
allocptr = (char *) TYPEALIGN(XLOG_BLCKSZ, allocptr);
|
||||
XLogCtl->pages = allocptr;
|
||||
@@ -5233,7 +5241,7 @@ readRecoveryCommandFile(void)
|
||||
const char *hintmsg;
|
||||
|
||||
if (!parse_int(item->value, &min_recovery_apply_delay, GUC_UNIT_MS,
|
||||
&hintmsg))
|
||||
&hintmsg))
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
||||
errmsg("parameter \"%s\" requires a temporal value", "min_recovery_apply_delay"),
|
||||
@@ -5271,7 +5279,7 @@ readRecoveryCommandFile(void)
|
||||
|
||||
/*
|
||||
* If user specified recovery_target_timeline, validate it or compute the
|
||||
* "latest" value. We can't do this until after we've gotten the restore
|
||||
* "latest" value. We can't do this until after we've gotten the restore
|
||||
* command and set InArchiveRecovery, because we need to fetch timeline
|
||||
* history files from the archive.
|
||||
*/
|
||||
@@ -5464,8 +5472,8 @@ recoveryStopsBefore(XLogRecord *record)
|
||||
*
|
||||
* when testing for an xid, we MUST test for equality only, since
|
||||
* transactions are numbered in the order they start, not the order
|
||||
* they complete. A higher numbered xid will complete before you
|
||||
* about 50% of the time...
|
||||
* they complete. A higher numbered xid will complete before you about
|
||||
* 50% of the time...
|
||||
*/
|
||||
stopsHere = (record->xl_xid == recoveryTargetXid);
|
||||
}
|
||||
@@ -5525,8 +5533,8 @@ recoveryStopsAfter(XLogRecord *record)
|
||||
record_info = record->xl_info & ~XLR_INFO_MASK;
|
||||
|
||||
/*
|
||||
* There can be many restore points that share the same name; we stop
|
||||
* at the first one.
|
||||
* There can be many restore points that share the same name; we stop at
|
||||
* the first one.
|
||||
*/
|
||||
if (recoveryTarget == RECOVERY_TARGET_NAME &&
|
||||
record->xl_rmid == RM_XLOG_ID && record_info == XLOG_RESTORE_POINT)
|
||||
@@ -5543,9 +5551,9 @@ recoveryStopsAfter(XLogRecord *record)
|
||||
strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
|
||||
|
||||
ereport(LOG,
|
||||
(errmsg("recovery stopping at restore point \"%s\", time %s",
|
||||
recoveryStopName,
|
||||
timestamptz_to_str(recoveryStopTime))));
|
||||
(errmsg("recovery stopping at restore point \"%s\", time %s",
|
||||
recoveryStopName,
|
||||
timestamptz_to_str(recoveryStopTime))));
|
||||
return true;
|
||||
}
|
||||
}
|
||||
@@ -5688,10 +5696,10 @@ recoveryApplyDelay(XLogRecord *record)
|
||||
/*
|
||||
* Is it a COMMIT record?
|
||||
*
|
||||
* We deliberately choose not to delay aborts since they have no effect
|
||||
* on MVCC. We already allow replay of records that don't have a
|
||||
* timestamp, so there is already opportunity for issues caused by early
|
||||
* conflicts on standbys.
|
||||
* We deliberately choose not to delay aborts since they have no effect on
|
||||
* MVCC. We already allow replay of records that don't have a timestamp,
|
||||
* so there is already opportunity for issues caused by early conflicts on
|
||||
* standbys.
|
||||
*/
|
||||
record_info = record->xl_info & ~XLR_INFO_MASK;
|
||||
if (!(record->xl_rmid == RM_XACT_ID &&
|
||||
@@ -5711,7 +5719,7 @@ recoveryApplyDelay(XLogRecord *record)
*/
TimestampDifference(GetCurrentTimestamp(), recoveryDelayUntilTime,
&secs, &microsecs);
if (secs <= 0 && microsecs <=0)
if (secs <= 0 && microsecs <= 0)
return false;

while (true)
@@ -5731,15 +5739,15 @@ recoveryApplyDelay(XLogRecord *record)
TimestampDifference(GetCurrentTimestamp(), recoveryDelayUntilTime,
&secs, &microsecs);

if (secs <= 0 && microsecs <=0)
if (secs <= 0 && microsecs <= 0)
break;

elog(DEBUG2, "recovery apply delay %ld seconds, %d milliseconds",
secs, microsecs / 1000);
secs, microsecs / 1000);

WaitLatch(&XLogCtl->recoveryWakeupLatch,
WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
secs * 1000L + microsecs / 1000);
WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
secs * 1000L + microsecs / 1000);
}
return true;
}
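The delay loop around this code re-computes the remaining wait and sleeps in bounded slices so it can re-check its conditions; a rough stand-alone sketch, with invented helpers ms_until and delay_until and plain nanosleep in place of WaitLatch, assuming CLOCK_MONOTONIC timestamps:

#include <stdint.h>
#include <time.h>

/* Milliseconds until 'target' (CLOCK_MONOTONIC), or 0 if it has passed. */
static int64_t
ms_until(const struct timespec *target)
{
	struct timespec now;
	int64_t diff_ms;

	clock_gettime(CLOCK_MONOTONIC, &now);
	diff_ms = (int64_t) (target->tv_sec - now.tv_sec) * 1000
		+ (target->tv_nsec - now.tv_nsec) / 1000000;
	return diff_ms > 0 ? diff_ms : 0;
}

/* Sleep until 'target', waking at least every 100 ms so the caller could
 * re-check pause/abort conditions between naps. */
static void
delay_until(const struct timespec *target)
{
	for (;;)
	{
		int64_t remaining = ms_until(target);
		struct timespec nap;

		if (remaining <= 0)
			break;
		if (remaining > 100)
			remaining = 100;
		nap.tv_sec = remaining / 1000;
		nap.tv_nsec = (remaining % 1000) * 1000000L;
		nanosleep(&nap, NULL);
	}
}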
@@ -5978,7 +5986,7 @@ StartupXLOG(void)
|
||||
ValidateXLOGDirectoryStructure();
|
||||
|
||||
/*
|
||||
* Clear out any old relcache cache files. This is *necessary* if we do
|
||||
* Clear out any old relcache cache files. This is *necessary* if we do
|
||||
* any WAL replay, since that would probably result in the cache files
|
||||
* being out of sync with database reality. In theory we could leave them
|
||||
* in place if the database had been cleanly shut down, but it seems
|
||||
@@ -6050,7 +6058,7 @@ StartupXLOG(void)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_OUT_OF_MEMORY),
|
||||
errmsg("out of memory"),
|
||||
errdetail("Failed while allocating an XLog reading processor.")));
|
||||
errdetail("Failed while allocating an XLog reading processor.")));
|
||||
xlogreader->system_identifier = ControlFile->system_identifier;
|
||||
|
||||
if (read_backup_label(&checkPointLoc, &backupEndRequired,
|
||||
@@ -6261,9 +6269,9 @@ StartupXLOG(void)
|
||||
StartupReorderBuffer();
|
||||
|
||||
/*
|
||||
* Startup MultiXact. We need to do this early for two reasons: one
|
||||
* is that we might try to access multixacts when we do tuple freezing,
|
||||
* and the other is we need its state initialized because we attempt
|
||||
* Startup MultiXact. We need to do this early for two reasons: one is
|
||||
* that we might try to access multixacts when we do tuple freezing, and
|
||||
* the other is we need its state initialized because we attempt
|
||||
* truncation during restartpoints.
|
||||
*/
|
||||
StartupMultiXact();
|
||||
@@ -6517,9 +6525,9 @@ StartupXLOG(void)
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialize shared variables for tracking progress of WAL replay,
|
||||
* as if we had just replayed the record before the REDO location
|
||||
* (or the checkpoint record itself, if it's a shutdown checkpoint).
|
||||
* Initialize shared variables for tracking progress of WAL replay, as
|
||||
* if we had just replayed the record before the REDO location (or the
|
||||
* checkpoint record itself, if it's a shutdown checkpoint).
|
||||
*/
|
||||
SpinLockAcquire(&xlogctl->info_lck);
|
||||
if (checkPoint.redo < RecPtr)
|
||||
@@ -6646,17 +6654,17 @@ StartupXLOG(void)
|
||||
}
|
||||
|
||||
/*
|
||||
* If we've been asked to lag the master, wait on
|
||||
* latch until enough time has passed.
|
||||
* If we've been asked to lag the master, wait on latch until
|
||||
* enough time has passed.
|
||||
*/
|
||||
if (recoveryApplyDelay(record))
|
||||
{
|
||||
/*
|
||||
* We test for paused recovery again here. If
|
||||
* user sets delayed apply, it may be because
|
||||
* they expect to pause recovery in case of
|
||||
* problems, so we must test again here otherwise
|
||||
* pausing during the delay-wait wouldn't work.
|
||||
* We test for paused recovery again here. If user sets
|
||||
* delayed apply, it may be because they expect to pause
|
||||
* recovery in case of problems, so we must test again
|
||||
* here otherwise pausing during the delay-wait wouldn't
|
||||
* work.
|
||||
*/
|
||||
if (xlogctl->recoveryPause)
|
||||
recoveryPausesHere();
|
||||
@@ -6893,8 +6901,8 @@ StartupXLOG(void)
|
||||
/*
|
||||
* Consider whether we need to assign a new timeline ID.
|
||||
*
|
||||
* If we are doing an archive recovery, we always assign a new ID. This
|
||||
* handles a couple of issues. If we stopped short of the end of WAL
|
||||
* If we are doing an archive recovery, we always assign a new ID. This
|
||||
* handles a couple of issues. If we stopped short of the end of WAL
|
||||
* during recovery, then we are clearly generating a new timeline and must
|
||||
* assign it a unique new ID. Even if we ran to the end, modifying the
|
||||
* current last segment is problematic because it may result in trying to
|
||||
@@ -6969,7 +6977,7 @@ StartupXLOG(void)
|
||||
|
||||
/*
|
||||
* Tricky point here: readBuf contains the *last* block that the LastRec
|
||||
* record spans, not the one it starts in. The last block is indeed the
|
||||
* record spans, not the one it starts in. The last block is indeed the
|
||||
* one we want to use.
|
||||
*/
|
||||
if (EndOfLog % XLOG_BLCKSZ != 0)
|
||||
@@ -6996,9 +7004,9 @@ StartupXLOG(void)
|
||||
else
|
||||
{
|
||||
/*
|
||||
* There is no partial block to copy. Just set InitializedUpTo,
|
||||
* and let the first attempt to insert a log record to initialize
|
||||
* the next buffer.
|
||||
* There is no partial block to copy. Just set InitializedUpTo, and
|
||||
* let the first attempt to insert a log record to initialize the next
|
||||
* buffer.
|
||||
*/
|
||||
XLogCtl->InitializedUpTo = EndOfLog;
|
||||
}
|
||||
@@ -7162,7 +7170,7 @@ StartupXLOG(void)
|
||||
XLogReportParameters();
|
||||
|
||||
/*
|
||||
* All done. Allow backends to write WAL. (Although the bool flag is
|
||||
* All done. Allow backends to write WAL. (Although the bool flag is
|
||||
* probably atomic in itself, we use the info_lck here to ensure that
|
||||
* there are no race conditions concerning visibility of other recent
|
||||
* updates to shared memory.)
|
||||
@@ -7200,7 +7208,7 @@ StartupXLOG(void)
|
||||
static void
|
||||
CheckRecoveryConsistency(void)
|
||||
{
|
||||
XLogRecPtr lastReplayedEndRecPtr;
|
||||
XLogRecPtr lastReplayedEndRecPtr;
|
||||
|
||||
/*
|
||||
* During crash recovery, we don't reach a consistent state until we've
|
||||
@@ -7322,7 +7330,7 @@ RecoveryInProgress(void)
|
||||
/*
|
||||
* Initialize TimeLineID and RedoRecPtr when we discover that recovery
|
||||
* is finished. InitPostgres() relies upon this behaviour to ensure
|
||||
* that InitXLOGAccess() is called at backend startup. (If you change
|
||||
* that InitXLOGAccess() is called at backend startup. (If you change
|
||||
* this, see also LocalSetXLogInsertAllowed.)
|
||||
*/
|
||||
if (!LocalRecoveryInProgress)
|
||||
@@ -7335,6 +7343,7 @@ RecoveryInProgress(void)
|
||||
pg_memory_barrier();
|
||||
InitXLOGAccess();
|
||||
}
|
||||
|
||||
/*
|
||||
* Note: We don't need a memory barrier when we're still in recovery.
|
||||
* We might exit recovery immediately after return, so the caller
|
||||
@@ -7594,7 +7603,7 @@ GetRedoRecPtr(void)
|
||||
{
|
||||
/* use volatile pointer to prevent code rearrangement */
|
||||
volatile XLogCtlData *xlogctl = XLogCtl;
|
||||
XLogRecPtr ptr;
|
||||
XLogRecPtr ptr;
|
||||
|
||||
/*
|
||||
* The possibly not up-to-date copy in XlogCtl is enough. Even if we
|
||||
@@ -7983,7 +7992,7 @@ CreateCheckPoint(int flags)
|
||||
/*
|
||||
* If this isn't a shutdown or forced checkpoint, and we have not inserted
|
||||
* any XLOG records since the start of the last checkpoint, skip the
|
||||
* checkpoint. The idea here is to avoid inserting duplicate checkpoints
|
||||
* checkpoint. The idea here is to avoid inserting duplicate checkpoints
|
||||
* when the system is idle. That wastes log space, and more importantly it
|
||||
* exposes us to possible loss of both current and previous checkpoint
|
||||
* records if the machine crashes just as we're writing the update.
|
||||
@@ -8120,7 +8129,7 @@ CreateCheckPoint(int flags)
|
||||
* performing those groups of actions.
|
||||
*
|
||||
* One example is end of transaction, so we must wait for any transactions
|
||||
* that are currently in commit critical sections. If an xact inserted
|
||||
* that are currently in commit critical sections. If an xact inserted
|
||||
* its commit record into XLOG just before the REDO point, then a crash
|
||||
* restart from the REDO point would not replay that record, which means
|
||||
* that our flushing had better include the xact's update of pg_clog. So
|
||||
@@ -8131,9 +8140,8 @@ CreateCheckPoint(int flags)
|
||||
* fuzzy: it is possible that we will wait for xacts we didn't really need
|
||||
* to wait for. But the delay should be short and it seems better to make
|
||||
* checkpoint take a bit longer than to hold off insertions longer than
|
||||
* necessary.
|
||||
* (In fact, the whole reason we have this issue is that xact.c does
|
||||
* commit record XLOG insertion and clog update as two separate steps
|
||||
* necessary. (In fact, the whole reason we have this issue is that xact.c
|
||||
* does commit record XLOG insertion and clog update as two separate steps
|
||||
* protected by different locks, but again that seems best on grounds of
|
||||
* minimizing lock contention.)
|
||||
*
|
||||
@@ -8280,9 +8288,9 @@ CreateCheckPoint(int flags)
|
||||
|
||||
/*
|
||||
* Truncate pg_subtrans if possible. We can throw away all data before
|
||||
* the oldest XMIN of any running transaction. No future transaction will
|
||||
* the oldest XMIN of any running transaction. No future transaction will
|
||||
* attempt to reference any pg_subtrans entry older than that (see Asserts
|
||||
* in subtrans.c). During recovery, though, we mustn't do this because
|
||||
* in subtrans.c). During recovery, though, we mustn't do this because
|
||||
* StartupSUBTRANS hasn't been called yet.
|
||||
*/
|
||||
if (!RecoveryInProgress())
|
||||
@@ -8600,11 +8608,11 @@ CreateRestartPoint(int flags)
|
||||
_logSegNo--;
|
||||
|
||||
/*
|
||||
* Try to recycle segments on a useful timeline. If we've been promoted
|
||||
* since the beginning of this restartpoint, use the new timeline
|
||||
* chosen at end of recovery (RecoveryInProgress() sets ThisTimeLineID
|
||||
* in that case). If we're still in recovery, use the timeline we're
|
||||
* currently replaying.
|
||||
* Try to recycle segments on a useful timeline. If we've been
|
||||
* promoted since the beginning of this restartpoint, use the new
|
||||
* timeline chosen at end of recovery (RecoveryInProgress() sets
|
||||
* ThisTimeLineID in that case). If we're still in recovery, use the
|
||||
* timeline we're currently replaying.
|
||||
*
|
||||
* There is no guarantee that the WAL segments will be useful on the
|
||||
* current timeline; if recovery proceeds to a new timeline right
|
||||
@@ -8636,9 +8644,9 @@ CreateRestartPoint(int flags)
|
||||
|
||||
/*
|
||||
* Truncate pg_subtrans if possible. We can throw away all data before
|
||||
* the oldest XMIN of any running transaction. No future transaction will
|
||||
* the oldest XMIN of any running transaction. No future transaction will
|
||||
* attempt to reference any pg_subtrans entry older than that (see Asserts
|
||||
* in subtrans.c). When hot standby is disabled, though, we mustn't do
|
||||
* in subtrans.c). When hot standby is disabled, though, we mustn't do
|
||||
* this because StartupSUBTRANS hasn't been called yet.
|
||||
*/
|
||||
if (EnableHotStandby)
|
||||
@@ -8697,7 +8705,7 @@ KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)
|
||||
/* then check whether slots limit removal further */
|
||||
if (max_replication_slots > 0 && keep != InvalidXLogRecPtr)
|
||||
{
|
||||
XLogRecPtr slotSegNo;
|
||||
XLogRecPtr slotSegNo;
|
||||
|
||||
XLByteToSeg(keep, slotSegNo);
|
||||
|
||||
@@ -8730,7 +8738,7 @@ XLogPutNextOid(Oid nextOid)
|
||||
* We need not flush the NEXTOID record immediately, because any of the
|
||||
* just-allocated OIDs could only reach disk as part of a tuple insert or
|
||||
* update that would have its own XLOG record that must follow the NEXTOID
|
||||
* record. Therefore, the standard buffer LSN interlock applied to those
|
||||
* record. Therefore, the standard buffer LSN interlock applied to those
|
||||
* records will ensure no such OID reaches disk before the NEXTOID record
|
||||
* does.
|
||||
*
|
||||
@@ -8859,8 +8867,9 @@ XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
|
||||
* lsn updates. We assume pd_lower/upper cannot be changed without an
|
||||
* exclusive lock, so the contents bkp are not racy.
|
||||
*
|
||||
* With buffer_std set to false, XLogCheckBuffer() sets hole_length and
|
||||
* hole_offset to 0; so the following code is safe for either case.
|
||||
* With buffer_std set to false, XLogCheckBuffer() sets hole_length
|
||||
* and hole_offset to 0; so the following code is safe for either
|
||||
* case.
|
||||
*/
|
||||
memcpy(copied_buffer, origdata, bkpb.hole_offset);
|
||||
memcpy(copied_buffer + bkpb.hole_offset,
|
||||
@@ -9072,7 +9081,7 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
|
||||
/*
|
||||
* We used to try to take the maximum of ShmemVariableCache->nextOid
|
||||
* and the recorded nextOid, but that fails if the OID counter wraps
|
||||
* around. Since no OID allocation should be happening during replay
|
||||
* around. Since no OID allocation should be happening during replay
|
||||
* anyway, better to just believe the record exactly. We still take
|
||||
* OidGenLock while setting the variable, just in case.
|
||||
*/
|
||||
@@ -9262,10 +9271,10 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
|
||||
BkpBlock bkpb;
|
||||
|
||||
/*
|
||||
* Full-page image (FPI) records contain a backup block stored "inline"
|
||||
* in the normal data since the locking when writing hint records isn't
|
||||
* sufficient to use the normal backup block mechanism, which assumes
|
||||
* exclusive lock on the buffer supplied.
|
||||
* Full-page image (FPI) records contain a backup block stored
|
||||
* "inline" in the normal data since the locking when writing hint
|
||||
* records isn't sufficient to use the normal backup block mechanism,
|
||||
* which assumes exclusive lock on the buffer supplied.
|
||||
*
|
||||
* Since the only change in these backup block are hint bits, there
|
||||
* are no recovery conflicts generated.
|
||||
@@ -9415,7 +9424,7 @@ get_sync_bit(int method)
|
||||
|
||||
/*
|
||||
* Optimize writes by bypassing kernel cache with O_DIRECT when using
|
||||
* O_SYNC/O_FSYNC and O_DSYNC. But only if archiving and streaming are
|
||||
* O_SYNC/O_FSYNC and O_DSYNC. But only if archiving and streaming are
|
||||
* disabled, otherwise the archive command or walsender process will read
|
||||
* the WAL soon after writing it, which is guaranteed to cause a physical
|
||||
* read if we bypassed the kernel cache. We also skip the
|
||||
@@ -9619,7 +9628,7 @@ do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
|
||||
* during an on-line backup even if not doing so at other times, because
|
||||
* it's quite possible for the backup dump to obtain a "torn" (partially
|
||||
* written) copy of a database page if it reads the page concurrently with
|
||||
* our write to the same page. This can be fixed as long as the first
|
||||
* our write to the same page. This can be fixed as long as the first
|
||||
* write to the page in the WAL sequence is a full-page write. Hence, we
|
||||
* turn on forcePageWrites and then force a CHECKPOINT, to ensure there
|
||||
* are no dirty pages in shared memory that might get dumped while the
|
||||
@@ -9663,7 +9672,7 @@ do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
|
||||
* old timeline IDs. That would otherwise happen if you called
|
||||
* pg_start_backup() right after restoring from a PITR archive: the
|
||||
* first WAL segment containing the startup checkpoint has pages in
|
||||
* the beginning with the old timeline ID. That can cause trouble at
|
||||
* the beginning with the old timeline ID. That can cause trouble at
|
||||
* recovery: we won't have a history file covering the old timeline if
|
||||
* pg_xlog directory was not included in the base backup and the WAL
|
||||
* archive was cleared too before starting the backup.
|
||||
@@ -9686,7 +9695,7 @@ do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
|
||||
bool checkpointfpw;
|
||||
|
||||
/*
|
||||
* Force a CHECKPOINT. Aside from being necessary to prevent torn
|
||||
* Force a CHECKPOINT. Aside from being necessary to prevent torn
|
||||
* page problems, this guarantees that two successive backup runs
|
||||
* will have different checkpoint positions and hence different
|
||||
* history file names, even if nothing happened in between.
|
||||
@@ -10339,7 +10348,7 @@ GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli)
|
||||
*
|
||||
* If we see a backup_label during recovery, we assume that we are recovering
|
||||
* from a backup dump file, and we therefore roll forward from the checkpoint
|
||||
* identified by the label file, NOT what pg_control says. This avoids the
|
||||
* identified by the label file, NOT what pg_control says. This avoids the
|
||||
* problem that pg_control might have been archived one or more checkpoints
|
||||
* later than the start of the dump, and so if we rely on it as the start
|
||||
* point, we will fail to restore a consistent database state.
|
||||
@@ -10686,7 +10695,7 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
* Standby mode is implemented by a state machine:
*
* 1. Read from either archive or pg_xlog (XLOG_FROM_ARCHIVE), or just
* pg_xlog (XLOG_FROM_XLOG)
* pg_xlog (XLOG_FROM_XLOG)
* 2. Check trigger file
* 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
* 4. Rescan timelines
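The state machine enumerated above can be reduced to a skeleton like the following, with an invented WalSource enum and next_source helper; trigger-file checks and timeline rescans are left out of the sketch:

typedef enum
{
	SOURCE_ARCHIVE,				/* restore_command or pg_xlog */
	SOURCE_PG_XLOG,				/* pg_xlog only */
	SOURCE_STREAM				/* walreceiver */
} WalSource;

/*
 * On failure from one source, fall through to the next, and wrap back to
 * the archive after streaming fails, so the loop keeps cycling until WAL
 * becomes available somewhere.
 */
static WalSource
next_source(WalSource failed)
{
	switch (failed)
	{
		case SOURCE_ARCHIVE:
			return SOURCE_PG_XLOG;
		case SOURCE_PG_XLOG:
			return SOURCE_STREAM;
		case SOURCE_STREAM:
		default:
			return SOURCE_ARCHIVE;
	}
}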
@@ -10887,8 +10896,8 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
|
||||
* file from pg_xlog.
|
||||
*/
|
||||
readFile = XLogFileReadAnyTLI(readSegNo, DEBUG2,
|
||||
currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY :
|
||||
currentSource);
|
||||
currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY :
|
||||
currentSource);
|
||||
if (readFile >= 0)
|
||||
return true; /* success! */
|
||||
|
||||
@@ -10945,11 +10954,11 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
|
||||
if (havedata)
|
||||
{
|
||||
/*
|
||||
* Great, streamed far enough. Open the file if it's
|
||||
* Great, streamed far enough. Open the file if it's
|
||||
* not open already. Also read the timeline history
|
||||
* file if we haven't initialized timeline history
|
||||
* yet; it should be streamed over and present in
|
||||
* pg_xlog by now. Use XLOG_FROM_STREAM so that
|
||||
* pg_xlog by now. Use XLOG_FROM_STREAM so that
|
||||
* source info is set correctly and XLogReceiptTime
|
||||
* isn't changed.
|
||||
*/
|
||||
@@ -11014,7 +11023,7 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
|
||||
HandleStartupProcInterrupts();
|
||||
}
|
||||
|
||||
return false; /* not reached */
|
||||
return false; /* not reached */
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -11022,9 +11031,9 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
|
||||
* in the current WAL page, previously read by XLogPageRead().
|
||||
*
|
||||
* 'emode' is the error mode that would be used to report a file-not-found
|
||||
* or legitimate end-of-WAL situation. Generally, we use it as-is, but if
|
||||
* or legitimate end-of-WAL situation. Generally, we use it as-is, but if
|
||||
* we're retrying the exact same record that we've tried previously, only
|
||||
* complain the first time to keep the noise down. However, we only do when
|
||||
* complain the first time to keep the noise down. However, we only do when
|
||||
* reading from pg_xlog, because we don't expect any invalid records in archive
|
||||
* or in records streamed from master. Files in the archive should be complete,
|
||||
* and we should never hit the end of WAL because we stop and wait for more WAL
|
||||
|
||||
@@ -300,8 +300,8 @@ RestoreArchivedFile(char *path, const char *xlogfname,
|
||||
signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;
|
||||
|
||||
ereport(signaled ? FATAL : DEBUG2,
|
||||
(errmsg("could not restore file \"%s\" from archive: %s",
|
||||
xlogfname, wait_result_to_str(rc))));
|
||||
(errmsg("could not restore file \"%s\" from archive: %s",
|
||||
xlogfname, wait_result_to_str(rc))));
|
||||
|
||||
not_available:
|
||||
|
||||
|
||||
@@ -429,7 +429,7 @@ pg_is_in_recovery(PG_FUNCTION_ARGS)
|
||||
Datum
|
||||
pg_xlog_location_diff(PG_FUNCTION_ARGS)
|
||||
{
|
||||
Datum result;
|
||||
Datum result;
|
||||
|
||||
result = DirectFunctionCall2(pg_lsn_mi,
|
||||
PG_GETARG_DATUM(0),
|
||||
|
||||
@@ -199,7 +199,7 @@ XLogReadRecord(XLogReaderState *state, XLogRecPtr RecPtr, char **errormsg)
|
||||
randAccess = true;
|
||||
|
||||
/*
|
||||
* RecPtr is pointing to end+1 of the previous WAL record. If we're
|
||||
* RecPtr is pointing to end+1 of the previous WAL record. If we're
|
||||
* at a page boundary, no more records can fit on the current page. We
|
||||
* must skip over the page header, but we can't do that until we've
|
||||
* read in the page, since the header size is variable.
|
||||
@@ -277,7 +277,7 @@ XLogReadRecord(XLogReaderState *state, XLogRecPtr RecPtr, char **errormsg)
|
||||
/*
|
||||
* If the whole record header is on this page, validate it immediately.
|
||||
* Otherwise do just a basic sanity check on xl_tot_len, and validate the
|
||||
* rest of the header after reading it from the next page. The xl_tot_len
|
||||
* rest of the header after reading it from the next page. The xl_tot_len
|
||||
* check is necessary here to ensure that we enter the "Need to reassemble
|
||||
* record" code path below; otherwise we might fail to apply
|
||||
* ValidXLogRecordHeader at all.
|
||||
@@ -572,7 +572,7 @@ err:
|
||||
* Validate an XLOG record header.
|
||||
*
|
||||
* This is just a convenience subroutine to avoid duplicated code in
|
||||
* XLogReadRecord. It's not intended for use from anywhere else.
|
||||
* XLogReadRecord. It's not intended for use from anywhere else.
|
||||
*/
|
||||
static bool
|
||||
ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr,
|
||||
@@ -661,7 +661,7 @@ ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr,
|
||||
* data to read in) until we've checked the CRCs.
|
||||
*
|
||||
* We assume all of the record (that is, xl_tot_len bytes) has been read
|
||||
* into memory at *record. Also, ValidXLogRecordHeader() has accepted the
|
||||
* into memory at *record. Also, ValidXLogRecordHeader() has accepted the
|
||||
* record's header, which means in particular that xl_tot_len is at least
|
||||
* SizeOfXlogRecord, so it is safe to fetch xl_len.
|
||||
*/
*/