Standard pgindent run for 8.1.
@@ -6,7 +6,7 @@
 *
 * NOTE: there is massive duplication of code in this module to
 * support both the convention that a null is marked by a bool TRUE,
-* and the convention that a null is marked by a char 'n'. The latter
+* and the convention that a null is marked by a char 'n'. The latter
 * convention is deprecated but it'll probably be a long time before
 * we can get rid of it entirely.
 *
@@ -16,7 +16,7 @@
 *
 *
 * IDENTIFICATION
-* $PostgreSQL: pgsql/src/backend/access/common/heaptuple.c,v 1.99 2005/03/21 01:23:55 tgl Exp $
+* $PostgreSQL: pgsql/src/backend/access/common/heaptuple.c,v 1.100 2005/10/15 02:49:08 momjian Exp $
 *
 *-------------------------------------------------------------------------
 */
@@ -452,8 +452,8 @@ nocachegetattr(HeapTuple tuple,
 int j;
 
 /*
-* In for(), we test <= and not < because we want to see if we
-* can go past it in initializing offsets.
+* In for(), we test <= and not < because we want to see if we can
+* go past it in initializing offsets.
 */
 for (j = 0; j <= attnum; j++)
 {
@@ -467,10 +467,9 @@ nocachegetattr(HeapTuple tuple,
 }
 
 /*
-* If slow is false, and we got here, we know that we have a tuple
-* with no nulls or var-widths before the target attribute. If
-* possible, we also want to initialize the remainder of the attribute
-* cached offset values.
+* If slow is false, and we got here, we know that we have a tuple with no
+* nulls or var-widths before the target attribute. If possible, we also
+* want to initialize the remainder of the attribute cached offset values.
 */
 if (!slow)
 {
@@ -513,11 +512,11 @@ nocachegetattr(HeapTuple tuple,
 /*
 * Now we know that we have to walk the tuple CAREFULLY.
 *
-* Note - This loop is a little tricky. For each non-null attribute,
-* we have to first account for alignment padding before the attr,
-* then advance over the attr based on its length. Nulls have no
-* storage and no alignment padding either. We can use/set attcacheoff
-* until we pass either a null or a var-width attribute.
+* Note - This loop is a little tricky. For each non-null attribute, we
+* have to first account for alignment padding before the attr, then
+* advance over the attr based on its length. Nulls have no storage
+* and no alignment padding either. We can use/set attcacheoff until
+* we pass either a null or a var-width attribute.
 */
 
 for (i = 0; i < attnum; i++)
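The comment above describes the careful walk: pad before each non-null attribute, advance by its length, and stop trusting attcacheoff once a null or variable-width attribute has been passed. A minimal standalone sketch of that rule, using toy descriptors and a 1-byte length header in place of varlena (an illustration only, not PostgreSQL's actual nocachegetattr or Form_pg_attribute code):

#include <stdbool.h>
#include <stddef.h>

/* Toy attribute descriptor: attlen > 0 is a fixed-width value, attlen == -1
 * means the value starts with a 1-byte length header (stand-in for varlena). */
typedef struct
{
    int     attlen;
    size_t  attalign;       /* power of two */
    long    attcacheoff;    /* cached offset, or -1 if not known */
} ToyAttr;

#define ALIGN_UP(off, a)  (((off) + (a) - 1) & ~((a) - 1))

/*
 * Return the data offset of attribute 'target', caching offsets only while
 * no null or variable-width attribute has been passed.  Returns (size_t) -1
 * if the target attribute is null (nulls have no storage at all).
 */
static size_t
toy_attr_offset(ToyAttr *att, const bool *isnull,
                const unsigned char *data, int target)
{
    size_t  off = 0;
    bool    slow = false;   /* true once cached offsets become unreliable */
    int     i;

    for (i = 0; i <= target; i++)
    {
        if (isnull[i])
        {
            if (i == target)
                return (size_t) -1;
            slow = true;    /* nulls have no storage and no padding */
            continue;
        }
        off = ALIGN_UP(off, att[i].attalign);   /* padding before the attr */
        if (!slow)
            att[i].attcacheoff = (long) off;    /* still safe to remember */
        if (i == target)
            return off;
        if (att[i].attlen > 0)
            off += (size_t) att[i].attlen;      /* fixed width: just advance */
        else
        {
            off += 1 + (size_t) data[off];      /* length header + payload */
            slow = true;    /* var-width: later offsets depend on the data */
        }
    }
    return off;             /* not reached */
}

In this model, while slow stays false every offset is a pure function of the descriptors, which is what makes caching it safe; the first null or variable-width value makes all later offsets data-dependent.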
@@ -597,15 +596,13 @@ heap_getsysattr(HeapTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull)
 break;
 
 /*
-* If the attribute number is 0, then we are supposed to
-* return the entire tuple as a row-type Datum. (Using zero
-* for this purpose is unclean since it risks confusion with
-* "invalid attr" result codes, but it's not worth changing
-* now.)
+* If the attribute number is 0, then we are supposed to return
+* the entire tuple as a row-type Datum. (Using zero for this
+* purpose is unclean since it risks confusion with "invalid attr"
+* result codes, but it's not worth changing now.)
 *
-* We have to make a copy of the tuple so we can safely insert
-* the Datum overhead fields, which are not set in on-disk
-* tuples.
+* We have to make a copy of the tuple so we can safely insert the
+* Datum overhead fields, which are not set in on-disk tuples.
 */
 case InvalidAttrNumber:
 {
@@ -708,15 +705,15 @@ heap_form_tuple(TupleDesc tupleDescriptor,
 numberOfAttributes, MaxTupleAttributeNumber)));
 
 /*
-* Check for nulls and embedded tuples; expand any toasted attributes
-* in embedded tuples. This preserves the invariant that toasting can
-* only go one level deep.
+* Check for nulls and embedded tuples; expand any toasted attributes in
+* embedded tuples. This preserves the invariant that toasting can only
+* go one level deep.
 *
 * We can skip calling toast_flatten_tuple_attribute() if the attribute
 * couldn't possibly be of composite type. All composite datums are
-* varlena and have alignment 'd'; furthermore they aren't arrays.
-* Also, if an attribute is already toasted, it must have been sent to
-* disk already and so cannot contain toasted attributes.
+* varlena and have alignment 'd'; furthermore they aren't arrays. Also,
+* if an attribute is already toasted, it must have been sent to disk
+* already and so cannot contain toasted attributes.
 */
 for (i = 0; i < numberOfAttributes; i++)
 {
@@ -757,8 +754,8 @@ heap_form_tuple(TupleDesc tupleDescriptor,
 tuple->t_data = td = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE);
 
 /*
-* And fill in the information. Note we fill the Datum fields even
-* though this tuple may never become a Datum.
+* And fill in the information. Note we fill the Datum fields even though
+* this tuple may never become a Datum.
 */
 tuple->t_len = len;
 ItemPointerSetInvalid(&(tuple->t_self));
@@ -816,15 +813,15 @@ heap_formtuple(TupleDesc tupleDescriptor,
 numberOfAttributes, MaxTupleAttributeNumber)));
 
 /*
-* Check for nulls and embedded tuples; expand any toasted attributes
-* in embedded tuples. This preserves the invariant that toasting can
-* only go one level deep.
+* Check for nulls and embedded tuples; expand any toasted attributes in
+* embedded tuples. This preserves the invariant that toasting can only
+* go one level deep.
 *
 * We can skip calling toast_flatten_tuple_attribute() if the attribute
 * couldn't possibly be of composite type. All composite datums are
-* varlena and have alignment 'd'; furthermore they aren't arrays.
-* Also, if an attribute is already toasted, it must have been sent to
-* disk already and so cannot contain toasted attributes.
+* varlena and have alignment 'd'; furthermore they aren't arrays. Also,
+* if an attribute is already toasted, it must have been sent to disk
+* already and so cannot contain toasted attributes.
 */
 for (i = 0; i < numberOfAttributes; i++)
 {
@@ -865,8 +862,8 @@ heap_formtuple(TupleDesc tupleDescriptor,
 tuple->t_data = td = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE);
 
 /*
-* And fill in the information. Note we fill the Datum fields even
-* though this tuple may never become a Datum.
+* And fill in the information. Note we fill the Datum fields even though
+* this tuple may never become a Datum.
 */
 tuple->t_len = len;
 ItemPointerSetInvalid(&(tuple->t_self));
@@ -917,15 +914,15 @@ heap_modify_tuple(HeapTuple tuple,
 HeapTuple newTuple;
 
 /*
-* allocate and fill values and isnull arrays from either the tuple or
-* the repl information, as appropriate.
+* allocate and fill values and isnull arrays from either the tuple or the
+* repl information, as appropriate.
 *
 * NOTE: it's debatable whether to use heap_deform_tuple() here or just
-* heap_getattr() only the non-replaced colums. The latter could win
-* if there are many replaced columns and few non-replaced ones.
-* However, heap_deform_tuple costs only O(N) while the heap_getattr
-* way would cost O(N^2) if there are many non-replaced columns, so it
-* seems better to err on the side of linear cost.
+* heap_getattr() only the non-replaced colums. The latter could win if
+* there are many replaced columns and few non-replaced ones. However,
+* heap_deform_tuple costs only O(N) while the heap_getattr way would cost
+* O(N^2) if there are many non-replaced columns, so it seems better to
+* err on the side of linear cost.
 */
 values = (Datum *) palloc(numberOfAttributes * sizeof(Datum));
 isnull = (bool *) palloc(numberOfAttributes * sizeof(bool));
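The O(N) versus O(N^2) trade-off in the comment above follows from the fact that, once offsets can no longer be cached, locating column k means walking past the k columns in front of it; fetching every column individually therefore costs roughly N^2/2 steps, while a single deforming pass costs roughly N. A rough standalone cost model (an illustration of the argument, not the real heap_deform_tuple or heap_getattr code):

#include <stdio.h>

/* Cost model only: walking to column k costs k steps once offsets can no
 * longer be cached (nulls or var-width columns present). */
static long
cost_getattr_per_column(int natts)
{
    long    steps = 0;
    int     k;

    for (k = 0; k < natts; k++)
        steps += k;             /* re-walk from the start for each column */
    return steps;               /* ~N^2/2 */
}

static long
cost_deform_once(int natts)
{
    return natts;               /* one pass over the whole tuple: ~N */
}

int
main(void)
{
    int     n;

    for (n = 4; n <= 256; n *= 4)
        printf("natts=%3d  per-column=%6ld  deform-once=%4ld\n",
               n, cost_getattr_per_column(n), cost_deform_once(n));
    return 0;
}

Even when many columns are being replaced, the single linear pass is at worst slightly more work than strictly needed, which is the "err on the side of linear cost" choice the comment settles on.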
@@ -950,8 +947,8 @@ heap_modify_tuple(HeapTuple tuple,
 pfree(isnull);
 
 /*
-* copy the identification info of the old tuple: t_ctid, t_self, and
-* OID (if any)
+* copy the identification info of the old tuple: t_ctid, t_self, and OID
+* (if any)
 */
 newTuple->t_data->t_ctid = tuple->t_data->t_ctid;
 newTuple->t_self = tuple->t_self;
@@ -986,15 +983,15 @@ heap_modifytuple(HeapTuple tuple,
 HeapTuple newTuple;
 
 /*
-* allocate and fill values and nulls arrays from either the tuple or
-* the repl information, as appropriate.
+* allocate and fill values and nulls arrays from either the tuple or the
+* repl information, as appropriate.
 *
 * NOTE: it's debatable whether to use heap_deformtuple() here or just
-* heap_getattr() only the non-replaced colums. The latter could win
-* if there are many replaced columns and few non-replaced ones.
-* However, heap_deformtuple costs only O(N) while the heap_getattr
-* way would cost O(N^2) if there are many non-replaced columns, so it
-* seems better to err on the side of linear cost.
+* heap_getattr() only the non-replaced colums. The latter could win if
+* there are many replaced columns and few non-replaced ones. However,
+* heap_deformtuple costs only O(N) while the heap_getattr way would cost
+* O(N^2) if there are many non-replaced columns, so it seems better to
+* err on the side of linear cost.
 */
 values = (Datum *) palloc(numberOfAttributes * sizeof(Datum));
 nulls = (char *) palloc(numberOfAttributes * sizeof(char));
@@ -1022,8 +1019,8 @@ heap_modifytuple(HeapTuple tuple,
 pfree(nulls);
 
 /*
-* copy the identification info of the old tuple: t_ctid, t_self, and
-* OID (if any)
+* copy the identification info of the old tuple: t_ctid, t_self, and OID
+* (if any)
 */
 newTuple->t_data->t_ctid = tuple->t_data->t_ctid;
 newTuple->t_self = tuple->t_self;
@@ -1068,9 +1065,9 @@ heap_deform_tuple(HeapTuple tuple, TupleDesc tupleDesc,
 natts = tup->t_natts;
 
 /*
-* In inheritance situations, it is possible that the given tuple
-* actually has more fields than the caller is expecting. Don't run
-* off the end of the caller's arrays.
+* In inheritance situations, it is possible that the given tuple actually
+* has more fields than the caller is expecting. Don't run off the end of
+* the caller's arrays.
 */
 natts = Min(natts, tdesc_natts);
 
@@ -1161,9 +1158,9 @@ heap_deformtuple(HeapTuple tuple,
 natts = tup->t_natts;
 
 /*
-* In inheritance situations, it is possible that the given tuple
-* actually has more fields than the caller is expecting. Don't run
-* off the end of the caller's arrays.
+* In inheritance situations, it is possible that the given tuple actually
+* has more fields than the caller is expecting. Don't run off the end of
+* the caller's arrays.
 */
 natts = Min(natts, tdesc_natts);
 
@@ -1228,22 +1225,22 @@ heap_deformtuple(HeapTuple tuple,
|
||||
static void
|
||||
slot_deform_tuple(TupleTableSlot *slot, int natts)
|
||||
{
|
||||
HeapTuple tuple = slot->tts_tuple;
|
||||
TupleDesc tupleDesc = slot->tts_tupleDescriptor;
|
||||
HeapTuple tuple = slot->tts_tuple;
|
||||
TupleDesc tupleDesc = slot->tts_tupleDescriptor;
|
||||
Datum *values = slot->tts_values;
|
||||
bool *isnull = slot->tts_isnull;
|
||||
HeapTupleHeader tup = tuple->t_data;
|
||||
HeapTupleHeader tup = tuple->t_data;
|
||||
bool hasnulls = HeapTupleHasNulls(tuple);
|
||||
Form_pg_attribute *att = tupleDesc->attrs;
|
||||
int attnum;
|
||||
char *tp; /* ptr to tuple data */
|
||||
long off; /* offset in tuple data */
|
||||
bits8 *bp = tup->t_bits; /* ptr to null bitmap in tuple */
|
||||
bool slow; /* can we use/set attcacheoff? */
|
||||
char *tp; /* ptr to tuple data */
|
||||
long off; /* offset in tuple data */
|
||||
bits8 *bp = tup->t_bits; /* ptr to null bitmap in tuple */
|
||||
bool slow; /* can we use/set attcacheoff? */
|
||||
|
||||
/*
|
||||
* Check whether the first call for this tuple, and initialize or
|
||||
* restore loop state.
|
||||
* Check whether the first call for this tuple, and initialize or restore
|
||||
* loop state.
|
||||
*/
|
||||
attnum = slot->tts_nvalid;
|
||||
if (attnum == 0)
|
||||
@@ -1269,7 +1266,7 @@ slot_deform_tuple(TupleTableSlot *slot, int natts)
|
||||
{
|
||||
values[attnum] = (Datum) 0;
|
||||
isnull[attnum] = true;
|
||||
slow = true; /* can't use attcacheoff anymore */
|
||||
slow = true; /* can't use attcacheoff anymore */
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -1290,7 +1287,7 @@ slot_deform_tuple(TupleTableSlot *slot, int natts)
|
||||
off = att_addlength(off, thisatt->attlen, tp + off);
|
||||
|
||||
if (thisatt->attlen <= 0)
|
||||
slow = true; /* can't use attcacheoff anymore */
|
||||
slow = true; /* can't use attcacheoff anymore */
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1316,9 +1313,9 @@ slot_deform_tuple(TupleTableSlot *slot, int natts)
|
||||
Datum
|
||||
slot_getattr(TupleTableSlot *slot, int attnum, bool *isnull)
|
||||
{
|
||||
HeapTuple tuple = slot->tts_tuple;
|
||||
TupleDesc tupleDesc = slot->tts_tupleDescriptor;
|
||||
HeapTupleHeader tup;
|
||||
HeapTuple tuple = slot->tts_tuple;
|
||||
TupleDesc tupleDesc = slot->tts_tupleDescriptor;
|
||||
HeapTupleHeader tup;
|
||||
|
||||
/*
|
||||
* system attributes are handled by heap_getsysattr
|
||||
@@ -1349,18 +1346,18 @@ slot_getattr(TupleTableSlot *slot, int attnum, bool *isnull)
|
||||
}
|
||||
|
||||
/*
|
||||
* otherwise we had better have a physical tuple (tts_nvalid should
|
||||
* equal natts in all virtual-tuple cases)
|
||||
* otherwise we had better have a physical tuple (tts_nvalid should equal
|
||||
* natts in all virtual-tuple cases)
|
||||
*/
|
||||
if (tuple == NULL) /* internal error */
|
||||
if (tuple == NULL) /* internal error */
|
||||
elog(ERROR, "cannot extract attribute from empty tuple slot");
|
||||
|
||||
/*
|
||||
* return NULL if attnum is out of range according to the tuple
|
||||
*
|
||||
* (We have to check this separately because of various inheritance
|
||||
* and table-alteration scenarios: the tuple could be either longer
|
||||
* or shorter than the tupdesc.)
|
||||
* (We have to check this separately because of various inheritance and
|
||||
* table-alteration scenarios: the tuple could be either longer or shorter
|
||||
* than the tupdesc.)
|
||||
*/
|
||||
tup = tuple->t_data;
|
||||
if (attnum > tup->t_natts)
|
||||
@@ -1379,10 +1376,9 @@ slot_getattr(TupleTableSlot *slot, int attnum, bool *isnull)
|
||||
}
|
||||
|
||||
/*
|
||||
* If the attribute's column has been dropped, we force a NULL
|
||||
* result. This case should not happen in normal use, but it could
|
||||
* happen if we are executing a plan cached before the column was
|
||||
* dropped.
|
||||
* If the attribute's column has been dropped, we force a NULL result.
|
||||
* This case should not happen in normal use, but it could happen if we
|
||||
* are executing a plan cached before the column was dropped.
|
||||
*/
|
||||
if (tupleDesc->attrs[attnum - 1]->attisdropped)
|
||||
{
|
||||
@@ -1420,11 +1416,11 @@ slot_getallattrs(TupleTableSlot *slot)
|
||||
return;
|
||||
|
||||
/*
|
||||
* otherwise we had better have a physical tuple (tts_nvalid should
|
||||
* equal natts in all virtual-tuple cases)
|
||||
* otherwise we had better have a physical tuple (tts_nvalid should equal
|
||||
* natts in all virtual-tuple cases)
|
||||
*/
|
||||
tuple = slot->tts_tuple;
|
||||
if (tuple == NULL) /* internal error */
|
||||
if (tuple == NULL) /* internal error */
|
||||
elog(ERROR, "cannot extract attribute from empty tuple slot");
|
||||
|
||||
/*
|
||||
@@ -1467,11 +1463,11 @@ slot_getsomeattrs(TupleTableSlot *slot, int attnum)
|
||||
elog(ERROR, "invalid attribute number %d", attnum);
|
||||
|
||||
/*
|
||||
* otherwise we had better have a physical tuple (tts_nvalid should
|
||||
* equal natts in all virtual-tuple cases)
|
||||
* otherwise we had better have a physical tuple (tts_nvalid should equal
|
||||
* natts in all virtual-tuple cases)
|
||||
*/
|
||||
tuple = slot->tts_tuple;
|
||||
if (tuple == NULL) /* internal error */
|
||||
if (tuple == NULL) /* internal error */
|
||||
elog(ERROR, "cannot extract attribute from empty tuple slot");
|
||||
|
||||
/*
|
||||
@@ -1502,8 +1498,8 @@ slot_getsomeattrs(TupleTableSlot *slot, int attnum)
|
||||
bool
|
||||
slot_attisnull(TupleTableSlot *slot, int attnum)
|
||||
{
|
||||
HeapTuple tuple = slot->tts_tuple;
|
||||
TupleDesc tupleDesc = slot->tts_tupleDescriptor;
|
||||
HeapTuple tuple = slot->tts_tuple;
|
||||
TupleDesc tupleDesc = slot->tts_tupleDescriptor;
|
||||
|
||||
/*
|
||||
* system attributes are handled by heap_attisnull
|
||||
@@ -1528,10 +1524,10 @@ slot_attisnull(TupleTableSlot *slot, int attnum)
|
||||
return true;
|
||||
|
||||
/*
|
||||
* otherwise we had better have a physical tuple (tts_nvalid should
|
||||
* equal natts in all virtual-tuple cases)
|
||||
* otherwise we had better have a physical tuple (tts_nvalid should equal
|
||||
* natts in all virtual-tuple cases)
|
||||
*/
|
||||
if (tuple == NULL) /* internal error */
|
||||
if (tuple == NULL) /* internal error */
|
||||
elog(ERROR, "cannot extract attribute from empty tuple slot");
|
||||
|
||||
/* and let the tuple tell it */
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/access/common/indextuple.c,v 1.74 2005/03/27 18:38:26 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/access/common/indextuple.c,v 1.75 2005/10/15 02:49:08 momjian Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@@ -70,20 +70,20 @@ index_form_tuple(TupleDesc tupleDescriptor,
|
||||
continue;
|
||||
|
||||
/*
|
||||
* If value is stored EXTERNAL, must fetch it so we are not
|
||||
* depending on outside storage. This should be improved someday.
|
||||
* If value is stored EXTERNAL, must fetch it so we are not depending
|
||||
* on outside storage. This should be improved someday.
|
||||
*/
|
||||
if (VARATT_IS_EXTERNAL(values[i]))
|
||||
{
|
||||
untoasted_values[i] = PointerGetDatum(
|
||||
heap_tuple_fetch_attr(
|
||||
(varattrib *) DatumGetPointer(values[i])));
|
||||
heap_tuple_fetch_attr(
|
||||
(varattrib *) DatumGetPointer(values[i])));
|
||||
untoasted_free[i] = true;
|
||||
}
|
||||
|
||||
/*
|
||||
* If value is above size target, and is of a compressible
|
||||
* datatype, try to compress it in-line.
|
||||
* If value is above size target, and is of a compressible datatype,
|
||||
* try to compress it in-line.
|
||||
*/
|
||||
if (VARATT_SIZE(untoasted_values[i]) > TOAST_INDEX_TARGET &&
|
||||
!VARATT_IS_EXTENDED(untoasted_values[i]) &&
|
||||
@@ -149,23 +149,23 @@ index_form_tuple(TupleDesc tupleDescriptor,
|
||||
|
||||
/*
|
||||
* We do this because heap_fill_tuple wants to initialize a "tupmask"
|
||||
* which is used for HeapTuples, but we want an indextuple infomask.
|
||||
* The only relevant info is the "has variable attributes" field.
|
||||
* We have already set the hasnull bit above.
|
||||
* which is used for HeapTuples, but we want an indextuple infomask. The
|
||||
* only relevant info is the "has variable attributes" field. We have
|
||||
* already set the hasnull bit above.
|
||||
*/
|
||||
if (tupmask & HEAP_HASVARWIDTH)
|
||||
infomask |= INDEX_VAR_MASK;
|
||||
|
||||
/*
|
||||
* Here we make sure that the size will fit in the field reserved for
|
||||
* it in t_info.
|
||||
* Here we make sure that the size will fit in the field reserved for it
|
||||
* in t_info.
|
||||
*/
|
||||
if ((size & INDEX_SIZE_MASK) != size)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
|
||||
errmsg("index row requires %lu bytes, maximum size is %lu",
|
||||
(unsigned long) size,
|
||||
(unsigned long) INDEX_SIZE_MASK)));
|
||||
errmsg("index row requires %lu bytes, maximum size is %lu",
|
||||
(unsigned long) size,
|
||||
(unsigned long) INDEX_SIZE_MASK)));
|
||||
|
||||
infomask |= size;
|
||||
|
||||
@@ -322,10 +322,9 @@ nocache_index_getattr(IndexTuple tup,
|
||||
}
|
||||
|
||||
/*
|
||||
* If slow is false, and we got here, we know that we have a tuple
|
||||
* with no nulls or var-widths before the target attribute. If
|
||||
* possible, we also want to initialize the remainder of the attribute
|
||||
* cached offset values.
|
||||
* If slow is false, and we got here, we know that we have a tuple with no
|
||||
* nulls or var-widths before the target attribute. If possible, we also
|
||||
* want to initialize the remainder of the attribute cached offset values.
|
||||
*/
|
||||
if (!slow)
|
||||
{
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/access/common/printtup.c,v 1.91 2005/06/22 17:45:45 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/access/common/printtup.c,v 1.92 2005/10/15 02:49:08 momjian Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@@ -78,9 +78,9 @@ printtup_create_DR(CommandDest dest, Portal portal)
|
||||
else
|
||||
{
|
||||
/*
|
||||
* In protocol 2.0 the Bind message does not exist, so there is no
|
||||
* way for the columns to have different print formats; it's
|
||||
* sufficient to look at the first one.
|
||||
* In protocol 2.0 the Bind message does not exist, so there is no way
|
||||
* for the columns to have different print formats; it's sufficient to
|
||||
* look at the first one.
|
||||
*/
|
||||
if (portal->formats && portal->formats[0] != 0)
|
||||
self->pub.receiveSlot = printtup_internal_20;
|
||||
@@ -113,8 +113,7 @@ printtup_startup(DestReceiver *self, int operation, TupleDesc typeinfo)
|
||||
if (PG_PROTOCOL_MAJOR(FrontendProtocol) < 3)
|
||||
{
|
||||
/*
|
||||
* Send portal name to frontend (obsolete cruft, gone in proto
|
||||
* 3.0)
|
||||
* Send portal name to frontend (obsolete cruft, gone in proto 3.0)
|
||||
*
|
||||
* If portal name not specified, use "blank" portal.
|
||||
*/
|
||||
@@ -127,8 +126,8 @@ printtup_startup(DestReceiver *self, int operation, TupleDesc typeinfo)
|
||||
}
|
||||
|
||||
/*
|
||||
* If this is a retrieve, and we are supposed to emit row
|
||||
* descriptions, then we send back the tuple descriptor of the tuples.
|
||||
* If this is a retrieve, and we are supposed to emit row descriptions,
|
||||
* then we send back the tuple descriptor of the tuples.
|
||||
*/
|
||||
if (operation == CMD_SELECT && myState->sendDescrip)
|
||||
SendRowDescriptionMessage(typeinfo,
|
||||
@@ -280,7 +279,7 @@ printtup_prepare_info(DR_printtup *myState, TupleDesc typeinfo, int numAttrs)
|
||||
static void
|
||||
printtup(TupleTableSlot *slot, DestReceiver *self)
|
||||
{
|
||||
TupleDesc typeinfo = slot->tts_tupleDescriptor;
|
||||
TupleDesc typeinfo = slot->tts_tupleDescriptor;
|
||||
DR_printtup *myState = (DR_printtup *) self;
|
||||
StringInfoData buf;
|
||||
int natts = typeinfo->natts;
|
||||
@@ -363,7 +362,7 @@ printtup(TupleTableSlot *slot, DestReceiver *self)
|
||||
static void
|
||||
printtup_20(TupleTableSlot *slot, DestReceiver *self)
|
||||
{
|
||||
TupleDesc typeinfo = slot->tts_tupleDescriptor;
|
||||
TupleDesc typeinfo = slot->tts_tupleDescriptor;
|
||||
DR_printtup *myState = (DR_printtup *) self;
|
||||
StringInfoData buf;
|
||||
int natts = typeinfo->natts;
|
||||
@@ -566,7 +565,7 @@ debugtup(TupleTableSlot *slot, DestReceiver *self)
|
||||
static void
|
||||
printtup_internal_20(TupleTableSlot *slot, DestReceiver *self)
|
||||
{
|
||||
TupleDesc typeinfo = slot->tts_tupleDescriptor;
|
||||
TupleDesc typeinfo = slot->tts_tupleDescriptor;
|
||||
DR_printtup *myState = (DR_printtup *) self;
|
||||
StringInfoData buf;
|
||||
int natts = typeinfo->natts;
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/access/common/tupdesc.c,v 1.111 2005/04/14 22:34:48 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/access/common/tupdesc.c,v 1.112 2005/10/15 02:49:08 momjian Exp $
|
||||
*
|
||||
* NOTES
|
||||
* some of the executor utility code such as "ExecTypeFromTL" should be
|
||||
@@ -49,10 +49,10 @@ CreateTemplateTupleDesc(int natts, bool hasoid)
|
||||
* Allocate enough memory for the tuple descriptor, including the
|
||||
* attribute rows, and set up the attribute row pointers.
|
||||
*
|
||||
* Note: we assume that sizeof(struct tupleDesc) is a multiple of
|
||||
* the struct pointer alignment requirement, and hence we don't need
|
||||
* to insert alignment padding between the struct and the array of
|
||||
* attribute row pointers.
|
||||
* Note: we assume that sizeof(struct tupleDesc) is a multiple of the struct
|
||||
* pointer alignment requirement, and hence we don't need to insert
|
||||
* alignment padding between the struct and the array of attribute row
|
||||
* pointers.
|
||||
*/
|
||||
attroffset = sizeof(struct tupleDesc) + natts * sizeof(Form_pg_attribute);
|
||||
attroffset = MAXALIGN(attroffset);
|
||||
@@ -273,16 +273,16 @@ equalTupleDescs(TupleDesc tupdesc1, TupleDesc tupdesc2)
|
||||
Form_pg_attribute attr2 = tupdesc2->attrs[i];
|
||||
|
||||
/*
|
||||
* We do not need to check every single field here: we can
|
||||
* disregard attrelid and attnum (which were used to place the row
|
||||
* in the attrs array in the first place). It might look like we
|
||||
* could dispense with checking attlen/attbyval/attalign, since these
|
||||
* are derived from atttypid; but in the case of dropped columns
|
||||
* we must check them (since atttypid will be zero for all dropped
|
||||
* columns) and in general it seems safer to check them always.
|
||||
* We do not need to check every single field here: we can disregard
|
||||
* attrelid and attnum (which were used to place the row in the attrs
|
||||
* array in the first place). It might look like we could dispense
|
||||
* with checking attlen/attbyval/attalign, since these are derived
|
||||
* from atttypid; but in the case of dropped columns we must check
|
||||
* them (since atttypid will be zero for all dropped columns) and in
|
||||
* general it seems safer to check them always.
|
||||
*
|
||||
* attcacheoff must NOT be checked since it's possibly not set
|
||||
* in both copies.
|
||||
* attcacheoff must NOT be checked since it's possibly not set in both
|
||||
* copies.
|
||||
*/
|
||||
if (strcmp(NameStr(attr1->attname), NameStr(attr2->attname)) != 0)
|
||||
return false;
|
||||
@@ -332,9 +332,9 @@ equalTupleDescs(TupleDesc tupdesc1, TupleDesc tupdesc2)
|
||||
AttrDefault *defval2 = constr2->defval;
|
||||
|
||||
/*
|
||||
* We can't assume that the items are always read from the
|
||||
* system catalogs in the same order; so use the adnum field
|
||||
* to identify the matching item to compare.
|
||||
* We can't assume that the items are always read from the system
|
||||
* catalogs in the same order; so use the adnum field to identify
|
||||
* the matching item to compare.
|
||||
*/
|
||||
for (j = 0; j < n; defval2++, j++)
|
||||
{
|
||||
@@ -355,9 +355,9 @@ equalTupleDescs(TupleDesc tupdesc1, TupleDesc tupdesc2)
|
||||
ConstrCheck *check2 = constr2->check;
|
||||
|
||||
/*
|
||||
* Similarly, don't assume that the checks are always read in
|
||||
* the same order; match them up by name and contents. (The
|
||||
* name *should* be unique, but...)
|
||||
* Similarly, don't assume that the checks are always read in the
|
||||
* same order; match them up by name and contents. (The name
|
||||
* *should* be unique, but...)
|
||||
*/
|
||||
for (j = 0; j < n; check2++, j++)
|
||||
{
|
||||
@@ -407,8 +407,8 @@ TupleDescInitEntry(TupleDesc desc,
|
||||
|
||||
/*
|
||||
* Note: attributeName can be NULL, because the planner doesn't always
|
||||
* fill in valid resname values in targetlists, particularly for
|
||||
* resjunk attributes.
|
||||
* fill in valid resname values in targetlists, particularly for resjunk
|
||||
* attributes.
|
||||
*/
|
||||
if (attributeName != NULL)
|
||||
namestrcpy(&(att->attname), attributeName);
|
||||
@@ -482,8 +482,8 @@ BuildDescForRelation(List *schema)
|
||||
ColumnDef *entry = lfirst(l);
|
||||
|
||||
/*
|
||||
* for each entry in the list, get the name and type information
|
||||
* from the list and have TupleDescInitEntry fill in the attribute
|
||||
* for each entry in the list, get the name and type information from
|
||||
* the list and have TupleDescInitEntry fill in the attribute
|
||||
* information we need.
|
||||
*/
|
||||
attnum++;
|
||||
@@ -508,8 +508,8 @@ BuildDescForRelation(List *schema)
|
||||
desc->attrs[attnum - 1]->attnotnull = entry->is_not_null;
|
||||
|
||||
/*
|
||||
* Note we copy only pre-cooked default expressions. Digestion of
|
||||
* raw ones is someone else's problem.
|
||||
* Note we copy only pre-cooked default expressions. Digestion of raw
|
||||
* ones is someone else's problem.
|
||||
*/
|
||||
if (entry->cooked_default != NULL)
|
||||
{
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/access/gist/gistproc.c,v 1.2 2005/09/22 20:44:36 momjian Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/access/gist/gistproc.c,v 1.3 2005/10/15 02:49:08 momjian Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@@ -26,7 +26,7 @@ typedef struct
|
||||
{
|
||||
BOX *key;
|
||||
int pos;
|
||||
} KBsort;
|
||||
} KBsort;
|
||||
|
||||
static int compare_KB(const void *a, const void *b);
|
||||
static bool gist_box_leaf_consistent(BOX *key, BOX *query,
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/access/hash/hash.c,v 1.80 2005/06/06 17:01:21 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/access/hash/hash.c,v 1.81 2005/10/15 02:49:08 momjian Exp $
|
||||
*
|
||||
* NOTES
|
||||
* This file contains only the public interface routines.
|
||||
@@ -55,8 +55,8 @@ hashbuild(PG_FUNCTION_ARGS)
|
||||
HashBuildState buildstate;
|
||||
|
||||
/*
|
||||
* We expect to be called exactly once for any index relation. If
|
||||
* that's not the case, big trouble's what we have.
|
||||
* We expect to be called exactly once for any index relation. If that's
|
||||
* not the case, big trouble's what we have.
|
||||
*/
|
||||
if (RelationGetNumberOfBlocks(index) != 0)
|
||||
elog(ERROR, "index \"%s\" already contains data",
|
||||
@@ -70,7 +70,7 @@ hashbuild(PG_FUNCTION_ARGS)
|
||||
|
||||
/* do the heap scan */
|
||||
reltuples = IndexBuildHeapScan(heap, index, indexInfo,
|
||||
hashbuildCallback, (void *) &buildstate);
|
||||
hashbuildCallback, (void *) &buildstate);
|
||||
|
||||
/* since we just counted the # of tuples, may as well update stats */
|
||||
IndexCloseAndUpdateStats(heap, reltuples, index, buildstate.indtuples);
|
||||
@@ -141,12 +141,12 @@ hashinsert(PG_FUNCTION_ARGS)
|
||||
|
||||
/*
|
||||
* If the single index key is null, we don't insert it into the index.
|
||||
* Hash tables support scans on '='. Relational algebra says that A =
|
||||
* B returns null if either A or B is null. This means that no
|
||||
* qualification used in an index scan could ever return true on a
|
||||
* null attribute. It also means that indices can't be used by ISNULL
|
||||
* or NOTNULL scans, but that's an artifact of the strategy map
|
||||
* architecture chosen in 1986, not of the way nulls are handled here.
|
||||
* Hash tables support scans on '='. Relational algebra says that A = B
|
||||
* returns null if either A or B is null. This means that no
|
||||
* qualification used in an index scan could ever return true on a null
|
||||
* attribute. It also means that indices can't be used by ISNULL or
|
||||
* NOTNULL scans, but that's an artifact of the strategy map architecture
|
||||
* chosen in 1986, not of the way nulls are handled here.
|
||||
*/
|
||||
if (IndexTupleHasNulls(itup))
|
||||
{
|
||||
@@ -180,16 +180,16 @@ hashgettuple(PG_FUNCTION_ARGS)
|
||||
bool res;
|
||||
|
||||
/*
|
||||
* We hold pin but not lock on current buffer while outside the hash
|
||||
* AM. Reacquire the read lock here.
|
||||
* We hold pin but not lock on current buffer while outside the hash AM.
|
||||
* Reacquire the read lock here.
|
||||
*/
|
||||
if (BufferIsValid(so->hashso_curbuf))
|
||||
_hash_chgbufaccess(rel, so->hashso_curbuf, HASH_NOLOCK, HASH_READ);
|
||||
|
||||
/*
|
||||
* If we've already initialized this scan, we can just advance it in
|
||||
* the appropriate direction. If we haven't done so yet, we call a
|
||||
* routine to get the first item in the scan.
|
||||
* If we've already initialized this scan, we can just advance it in the
|
||||
* appropriate direction. If we haven't done so yet, we call a routine to
|
||||
* get the first item in the scan.
|
||||
*/
|
||||
if (ItemPointerIsValid(&(scan->currentItemData)))
|
||||
{
|
||||
@@ -199,17 +199,16 @@ hashgettuple(PG_FUNCTION_ARGS)
|
||||
if (scan->kill_prior_tuple)
|
||||
{
|
||||
/*
|
||||
* Yes, so mark it by setting the LP_DELETE bit in the item
|
||||
* flags.
|
||||
* Yes, so mark it by setting the LP_DELETE bit in the item flags.
|
||||
*/
|
||||
offnum = ItemPointerGetOffsetNumber(&(scan->currentItemData));
|
||||
page = BufferGetPage(so->hashso_curbuf);
|
||||
PageGetItemId(page, offnum)->lp_flags |= LP_DELETE;
|
||||
|
||||
/*
|
||||
* Since this can be redone later if needed, it's treated the
|
||||
* same as a commit-hint-bit status update for heap tuples: we
|
||||
* mark the buffer dirty but don't make a WAL log entry.
|
||||
* Since this can be redone later if needed, it's treated the same
|
||||
* as a commit-hint-bit status update for heap tuples: we mark the
|
||||
* buffer dirty but don't make a WAL log entry.
|
||||
*/
|
||||
SetBufferCommitInfoNeedsSave(so->hashso_curbuf);
|
||||
}
|
||||
@@ -256,7 +255,7 @@ Datum
|
||||
hashgetmulti(PG_FUNCTION_ARGS)
|
||||
{
|
||||
IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
|
||||
ItemPointer tids = (ItemPointer) PG_GETARG_POINTER(1);
|
||||
ItemPointer tids = (ItemPointer) PG_GETARG_POINTER(1);
|
||||
int32 max_tids = PG_GETARG_INT32(2);
|
||||
int32 *returned_tids = (int32 *) PG_GETARG_POINTER(3);
|
||||
HashScanOpaque so = (HashScanOpaque) scan->opaque;
|
||||
@@ -265,8 +264,8 @@ hashgetmulti(PG_FUNCTION_ARGS)
|
||||
int32 ntids = 0;
|
||||
|
||||
/*
|
||||
* We hold pin but not lock on current buffer while outside the hash
|
||||
* AM. Reacquire the read lock here.
|
||||
* We hold pin but not lock on current buffer while outside the hash AM.
|
||||
* Reacquire the read lock here.
|
||||
*/
|
||||
if (BufferIsValid(so->hashso_curbuf))
|
||||
_hash_chgbufaccess(rel, so->hashso_curbuf, HASH_NOLOCK, HASH_READ);
|
||||
@@ -280,6 +279,7 @@ hashgetmulti(PG_FUNCTION_ARGS)
|
||||
res = _hash_next(scan, ForwardScanDirection);
|
||||
else
|
||||
res = _hash_first(scan, ForwardScanDirection);
|
||||
|
||||
/*
|
||||
* Skip killed tuples if asked to.
|
||||
*/
|
||||
@@ -505,12 +505,12 @@ hashbulkdelete(PG_FUNCTION_ARGS)
|
||||
num_index_tuples = 0;
|
||||
|
||||
/*
|
||||
* Read the metapage to fetch original bucket and tuple counts. Also,
|
||||
* we keep a copy of the last-seen metapage so that we can use its
|
||||
* hashm_spares[] values to compute bucket page addresses. This is a
|
||||
* bit hokey but perfectly safe, since the interesting entries in the
|
||||
* spares array cannot change under us; and it beats rereading the
|
||||
* metapage for each bucket.
|
||||
* Read the metapage to fetch original bucket and tuple counts. Also, we
|
||||
* keep a copy of the last-seen metapage so that we can use its
|
||||
* hashm_spares[] values to compute bucket page addresses. This is a bit
|
||||
* hokey but perfectly safe, since the interesting entries in the spares
|
||||
* array cannot change under us; and it beats rereading the metapage for
|
||||
* each bucket.
|
||||
*/
|
||||
metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ);
|
||||
metap = (HashMetaPage) BufferGetPage(metabuf);
|
||||
@@ -569,7 +569,7 @@ loop_top:
|
||||
ItemPointer htup;
|
||||
|
||||
hitem = (HashItem) PageGetItem(page,
|
||||
PageGetItemId(page, offno));
|
||||
PageGetItemId(page, offno));
|
||||
htup = &(hitem->hash_itup.t_tid);
|
||||
if (callback(htup, callback_state))
|
||||
{
|
||||
@@ -641,8 +641,7 @@ loop_top:
|
||||
{
|
||||
/*
|
||||
* Otherwise, our count is untrustworthy since we may have
|
||||
* double-scanned tuples in split buckets. Proceed by
|
||||
* dead-reckoning.
|
||||
* double-scanned tuples in split buckets. Proceed by dead-reckoning.
|
||||
*/
|
||||
if (metap->hashm_ntuples > tuples_removed)
|
||||
metap->hashm_ntuples -= tuples_removed;
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/access/hash/hashfunc.c,v 1.44 2005/05/25 21:40:40 momjian Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/access/hash/hashfunc.c,v 1.45 2005/10/15 02:49:08 momjian Exp $
|
||||
*
|
||||
* NOTES
|
||||
* These functions are stored in pg_amproc. For each operator class
|
||||
@@ -46,11 +46,11 @@ hashint8(PG_FUNCTION_ARGS)
|
||||
{
|
||||
/*
|
||||
* The idea here is to produce a hash value compatible with the values
|
||||
* produced by hashint4 and hashint2 for logically equivalent inputs;
|
||||
* this is necessary if we ever hope to support cross-type hash joins
|
||||
* across these input types. Since all three types are signed, we can
|
||||
* xor the high half of the int8 value if the sign is positive, or the
|
||||
* complement of the high half when the sign is negative.
|
||||
* produced by hashint4 and hashint2 for logically equivalent inputs; this
|
||||
* is necessary if we ever hope to support cross-type hash joins across
|
||||
* these input types. Since all three types are signed, we can xor the
|
||||
* high half of the int8 value if the sign is positive, or the complement
|
||||
* of the high half when the sign is negative.
|
||||
*/
|
||||
#ifndef INT64_IS_BUSTED
|
||||
int64 val = PG_GETARG_INT64(0);
|
||||
@@ -78,9 +78,9 @@ hashfloat4(PG_FUNCTION_ARGS)
|
||||
float4 key = PG_GETARG_FLOAT4(0);
|
||||
|
||||
/*
|
||||
* On IEEE-float machines, minus zero and zero have different bit
|
||||
* patterns but should compare as equal. We must ensure that they
|
||||
* have the same hash value, which is most easily done this way:
|
||||
* On IEEE-float machines, minus zero and zero have different bit patterns
|
||||
* but should compare as equal. We must ensure that they have the same
|
||||
* hash value, which is most easily done this way:
|
||||
*/
|
||||
if (key == (float4) 0)
|
||||
PG_RETURN_UINT32(0);
|
||||
@@ -94,9 +94,9 @@ hashfloat8(PG_FUNCTION_ARGS)
|
||||
float8 key = PG_GETARG_FLOAT8(0);
|
||||
|
||||
/*
|
||||
* On IEEE-float machines, minus zero and zero have different bit
|
||||
* patterns but should compare as equal. We must ensure that they
|
||||
* have the same hash value, which is most easily done this way:
|
||||
* On IEEE-float machines, minus zero and zero have different bit patterns
|
||||
* but should compare as equal. We must ensure that they have the same
|
||||
* hash value, which is most easily done this way:
|
||||
*/
|
||||
if (key == (float8) 0)
|
||||
PG_RETURN_UINT32(0);
|
||||
@@ -126,8 +126,7 @@ hashname(PG_FUNCTION_ARGS)
|
||||
char *key = NameStr(*PG_GETARG_NAME(0));
|
||||
int keylen = strlen(key);
|
||||
|
||||
Assert(keylen < NAMEDATALEN); /* else it's not truncated
|
||||
* correctly */
|
||||
Assert(keylen < NAMEDATALEN); /* else it's not truncated correctly */
|
||||
|
||||
return hash_any((unsigned char *) key, keylen);
|
||||
}
|
||||
@@ -139,8 +138,8 @@ hashtext(PG_FUNCTION_ARGS)
|
||||
Datum result;
|
||||
|
||||
/*
|
||||
* Note: this is currently identical in behavior to hashvarlena, but
|
||||
* it seems likely that we may need to do something different in non-C
|
||||
* Note: this is currently identical in behavior to hashvarlena, but it
|
||||
* seems likely that we may need to do something different in non-C
|
||||
* locales. (See also hashbpchar, if so.)
|
||||
*/
|
||||
result = hash_any((unsigned char *) VARDATA(key),
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/access/hash/hashinsert.c,v 1.37 2005/08/10 21:36:45 momjian Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/access/hash/hashinsert.c,v 1.38 2005/10/15 02:49:08 momjian Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@@ -50,8 +50,8 @@ _hash_doinsert(Relation rel, HashItem hitem)
|
||||
bool isnull;
|
||||
|
||||
/*
|
||||
* Compute the hash key for the item. We do this first so as not to
|
||||
* need to hold any locks while running the hash function.
|
||||
* Compute the hash key for the item. We do this first so as not to need
|
||||
* to hold any locks while running the hash function.
|
||||
*/
|
||||
itup = &(hitem->hash_itup);
|
||||
if (rel->rd_rel->relnatts != 1)
|
||||
@@ -64,12 +64,12 @@ _hash_doinsert(Relation rel, HashItem hitem)
|
||||
itemsz = IndexTupleDSize(hitem->hash_itup)
|
||||
+ (sizeof(HashItemData) - sizeof(IndexTupleData));
|
||||
|
||||
itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but
|
||||
* we need to be consistent */
|
||||
itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we
|
||||
* need to be consistent */
|
||||
|
||||
/*
|
||||
* Acquire shared split lock so we can compute the target bucket
|
||||
* safely (see README).
|
||||
* Acquire shared split lock so we can compute the target bucket safely
|
||||
* (see README).
|
||||
*/
|
||||
_hash_getlock(rel, 0, HASH_SHARE);
|
||||
|
||||
@@ -79,9 +79,9 @@ _hash_doinsert(Relation rel, HashItem hitem)
|
||||
_hash_checkpage(rel, (Page) metap, LH_META_PAGE);
|
||||
|
||||
/*
|
||||
* Check whether the item can fit on a hash page at all. (Eventually,
|
||||
* we ought to try to apply TOAST methods if not.) Note that at this
|
||||
* point, itemsz doesn't include the ItemId.
|
||||
* Check whether the item can fit on a hash page at all. (Eventually, we
|
||||
* ought to try to apply TOAST methods if not.) Note that at this point,
|
||||
* itemsz doesn't include the ItemId.
|
||||
*/
|
||||
if (itemsz > HashMaxItemSize((Page) metap))
|
||||
ereport(ERROR,
|
||||
@@ -89,7 +89,7 @@ _hash_doinsert(Relation rel, HashItem hitem)
|
||||
errmsg("index row size %lu exceeds hash maximum %lu",
|
||||
(unsigned long) itemsz,
|
||||
(unsigned long) HashMaxItemSize((Page) metap)),
|
||||
errhint("Values larger than a buffer page cannot be indexed.")));
|
||||
errhint("Values larger than a buffer page cannot be indexed.")));
|
||||
|
||||
/*
|
||||
* Compute the target bucket number, and convert to block number.
|
||||
@@ -105,8 +105,7 @@ _hash_doinsert(Relation rel, HashItem hitem)
|
||||
_hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);
|
||||
|
||||
/*
|
||||
* Acquire share lock on target bucket; then we can release split
|
||||
* lock.
|
||||
* Acquire share lock on target bucket; then we can release split lock.
|
||||
*/
|
||||
_hash_getlock(rel, blkno, HASH_SHARE);
|
||||
|
||||
@@ -130,8 +129,8 @@ _hash_doinsert(Relation rel, HashItem hitem)
|
||||
if (BlockNumberIsValid(nextblkno))
|
||||
{
|
||||
/*
|
||||
* ovfl page exists; go get it. if it doesn't have room,
|
||||
* we'll find out next pass through the loop test above.
|
||||
* ovfl page exists; go get it. if it doesn't have room, we'll
|
||||
* find out next pass through the loop test above.
|
||||
*/
|
||||
_hash_relbuf(rel, buf);
|
||||
buf = _hash_getbuf(rel, nextblkno, HASH_WRITE);
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/access/hash/hashovfl.c,v 1.46 2005/05/11 01:26:01 neilc Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/access/hash/hashovfl.c,v 1.47 2005/10/15 02:49:08 momjian Exp $
|
||||
*
|
||||
* NOTES
|
||||
* Overflow pages look like ordinary relation pages.
|
||||
@@ -44,8 +44,8 @@ bitno_to_blkno(HashMetaPage metap, uint32 ovflbitnum)
|
||||
/* loop */ ;
|
||||
|
||||
/*
|
||||
* Convert to absolute page number by adding the number of bucket
|
||||
* pages that exist before this split point.
|
||||
* Convert to absolute page number by adding the number of bucket pages
|
||||
* that exist before this split point.
|
||||
*/
|
||||
return (BlockNumber) ((1 << i) + ovflbitnum);
|
||||
}
|
||||
@@ -252,10 +252,10 @@ _hash_getovflpage(Relation rel, Buffer metabuf)
|
||||
/*
|
||||
* We create the new bitmap page with all pages marked "in use".
|
||||
* Actually two pages in the new bitmap's range will exist
|
||||
* immediately: the bitmap page itself, and the following page
|
||||
* which is the one we return to the caller. Both of these are
|
||||
* correctly marked "in use". Subsequent pages do not exist yet,
|
||||
* but it is convenient to pre-mark them as "in use" too.
|
||||
* immediately: the bitmap page itself, and the following page which
|
||||
* is the one we return to the caller. Both of these are correctly
|
||||
* marked "in use". Subsequent pages do not exist yet, but it is
|
||||
* convenient to pre-mark them as "in use" too.
|
||||
*/
|
||||
_hash_initbitmap(rel, metap, bitno_to_blkno(metap, bit));
|
||||
|
||||
@@ -265,8 +265,8 @@ _hash_getovflpage(Relation rel, Buffer metabuf)
|
||||
else
|
||||
{
|
||||
/*
|
||||
* Nothing to do here; since the page was past the last used page,
|
||||
* we know its bitmap bit was preinitialized to "in use".
|
||||
* Nothing to do here; since the page was past the last used page, we
|
||||
* know its bitmap bit was preinitialized to "in use".
|
||||
*/
|
||||
}
|
||||
|
||||
@@ -275,8 +275,7 @@ _hash_getovflpage(Relation rel, Buffer metabuf)
|
||||
|
||||
/*
|
||||
* Adjust hashm_firstfree to avoid redundant searches. But don't risk
|
||||
* changing it if someone moved it while we were searching bitmap
|
||||
* pages.
|
||||
* changing it if someone moved it while we were searching bitmap pages.
|
||||
*/
|
||||
if (metap->hashm_firstfree == orig_firstfree)
|
||||
metap->hashm_firstfree = bit + 1;
|
||||
@@ -305,8 +304,7 @@ found:
|
||||
|
||||
/*
|
||||
* Adjust hashm_firstfree to avoid redundant searches. But don't risk
|
||||
* changing it if someone moved it while we were searching bitmap
|
||||
* pages.
|
||||
* changing it if someone moved it while we were searching bitmap pages.
|
||||
*/
|
||||
if (metap->hashm_firstfree == orig_firstfree)
|
||||
{
|
||||
@@ -394,10 +392,10 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf)
|
||||
_hash_wrtbuf(rel, ovflbuf);
|
||||
|
||||
/*
|
||||
* Fix up the bucket chain. this is a doubly-linked list, so we must
|
||||
* fix up the bucket chain members behind and ahead of the overflow
|
||||
* page being deleted. No concurrency issues since we hold exclusive
|
||||
* lock on the entire bucket.
|
||||
* Fix up the bucket chain. this is a doubly-linked list, so we must fix
|
||||
* up the bucket chain members behind and ahead of the overflow page being
|
||||
* deleted. No concurrency issues since we hold exclusive lock on the
|
||||
* entire bucket.
|
||||
*/
|
||||
if (BlockNumberIsValid(prevblkno))
|
||||
{
|
||||
@@ -488,12 +486,11 @@ _hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno)
|
||||
|
||||
/*
|
||||
* It is okay to write-lock the new bitmap page while holding metapage
|
||||
* write lock, because no one else could be contending for the new
|
||||
* page.
|
||||
* write lock, because no one else could be contending for the new page.
|
||||
*
|
||||
* There is some loss of concurrency in possibly doing I/O for the new
|
||||
* page while holding the metapage lock, but this path is taken so
|
||||
* seldom that it's not worth worrying about.
|
||||
* There is some loss of concurrency in possibly doing I/O for the new page
|
||||
* while holding the metapage lock, but this path is taken so seldom that
|
||||
* it's not worth worrying about.
|
||||
*/
|
||||
buf = _hash_getbuf(rel, blkno, HASH_WRITE);
|
||||
pg = BufferGetPage(buf);
|
||||
@@ -586,8 +583,8 @@ _hash_squeezebucket(Relation rel,
|
||||
}
|
||||
|
||||
/*
|
||||
* find the last page in the bucket chain by starting at the base
|
||||
* bucket page and working forward.
|
||||
* find the last page in the bucket chain by starting at the base bucket
|
||||
* page and working forward.
|
||||
*/
|
||||
ropaque = wopaque;
|
||||
do
|
||||
@@ -655,22 +652,21 @@ _hash_squeezebucket(Relation rel,
|
||||
|
||||
/*
|
||||
* delete the tuple from the "read" page. PageIndexTupleDelete
|
||||
* repacks the ItemId array, so 'roffnum' will be "advanced"
|
||||
* to the "next" ItemId.
|
||||
* repacks the ItemId array, so 'roffnum' will be "advanced" to
|
||||
* the "next" ItemId.
|
||||
*/
|
||||
PageIndexTupleDelete(rpage, roffnum);
|
||||
}
|
||||
|
||||
/*
|
||||
* if the "read" page is now empty because of the deletion (or
|
||||
* because it was empty when we got to it), free it.
|
||||
* if the "read" page is now empty because of the deletion (or because
|
||||
* it was empty when we got to it), free it.
|
||||
*
|
||||
* Tricky point here: if our read and write pages are adjacent in the
|
||||
* bucket chain, our write lock on wbuf will conflict with
|
||||
* _hash_freeovflpage's attempt to update the sibling links of the
|
||||
* removed page. However, in that case we are done anyway, so we
|
||||
* can simply drop the write lock before calling
|
||||
* _hash_freeovflpage.
|
||||
* removed page. However, in that case we are done anyway, so we can
|
||||
* simply drop the write lock before calling _hash_freeovflpage.
|
||||
*/
|
||||
if (PageIsEmpty(rpage))
|
||||
{
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.51 2005/06/09 21:01:25 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.52 2005/10/15 02:49:08 momjian Exp $
|
||||
*
|
||||
* NOTES
|
||||
* Postgres hash pages look like ordinary relation pages. The opaque
|
||||
@@ -240,13 +240,13 @@ _hash_metapinit(Relation rel)
|
||||
RelationGetRelationName(rel));
|
||||
|
||||
/*
|
||||
* Determine the target fill factor (tuples per bucket) for this
|
||||
* index. The idea is to make the fill factor correspond to pages
|
||||
* about 3/4ths full. We can compute it exactly if the index datatype
|
||||
* is fixed-width, but for var-width there's some guessing involved.
|
||||
* Determine the target fill factor (tuples per bucket) for this index.
|
||||
* The idea is to make the fill factor correspond to pages about 3/4ths
|
||||
* full. We can compute it exactly if the index datatype is fixed-width,
|
||||
* but for var-width there's some guessing involved.
|
||||
*/
|
||||
data_width = get_typavgwidth(RelationGetDescr(rel)->attrs[0]->atttypid,
|
||||
RelationGetDescr(rel)->attrs[0]->atttypmod);
|
||||
RelationGetDescr(rel)->attrs[0]->atttypmod);
|
||||
item_width = MAXALIGN(sizeof(HashItemData)) + MAXALIGN(data_width) +
|
||||
sizeof(ItemIdData); /* include the line pointer */
|
||||
ffactor = (BLCKSZ * 3 / 4) / item_width;
|
||||
@@ -289,9 +289,8 @@ _hash_metapinit(Relation rel)
|
||||
metap->hashm_procid = index_getprocid(rel, 1, HASHPROC);
|
||||
|
||||
/*
|
||||
* We initialize the index with two buckets, 0 and 1, occupying
|
||||
* physical blocks 1 and 2. The first freespace bitmap page is in
|
||||
* block 3.
|
||||
* We initialize the index with two buckets, 0 and 1, occupying physical
|
||||
* blocks 1 and 2. The first freespace bitmap page is in block 3.
|
||||
*/
|
||||
metap->hashm_maxbucket = metap->hashm_lowmask = 1; /* nbuckets - 1 */
|
||||
metap->hashm_highmask = 3; /* (nbuckets << 1) - 1 */
|
||||
@@ -321,8 +320,8 @@ _hash_metapinit(Relation rel)
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialize first bitmap page. Can't do this until we create the
|
||||
* first two buckets, else smgr will complain.
|
||||
* Initialize first bitmap page. Can't do this until we create the first
|
||||
* two buckets, else smgr will complain.
|
||||
*/
|
||||
_hash_initbitmap(rel, metap, 3);
|
||||
|
||||
@@ -367,15 +366,14 @@ _hash_expandtable(Relation rel, Buffer metabuf)
|
||||
* Obtain the page-zero lock to assert the right to begin a split (see
|
||||
* README).
|
||||
*
|
||||
* Note: deadlock should be impossible here. Our own backend could only
* be holding bucket sharelocks due to stopped indexscans; those will
* not block other holders of the page-zero lock, who are only
* interested in acquiring bucket sharelocks themselves. Exclusive
* bucket locks are only taken here and in hashbulkdelete, and neither
* of these operations needs any additional locks to complete. (If,
* due to some flaw in this reasoning, we manage to deadlock anyway,
* it's okay to error out; the index will be left in a consistent
* state.)
* Note: deadlock should be impossible here. Our own backend could only be
* holding bucket sharelocks due to stopped indexscans; those will not
* block other holders of the page-zero lock, who are only interested in
* acquiring bucket sharelocks themselves. Exclusive bucket locks are
* only taken here and in hashbulkdelete, and neither of these operations
* needs any additional locks to complete. (If, due to some flaw in this
* reasoning, we manage to deadlock anyway, it's okay to error out; the
* index will be left in a consistent state.)
*/
_hash_getlock(rel, 0, HASH_EXCLUSIVE);

@@ -386,8 +384,8 @@ _hash_expandtable(Relation rel, Buffer metabuf)
_hash_checkpage(rel, (Page) metap, LH_META_PAGE);

/*
* Check to see if split is still needed; someone else might have
* already done one while we waited for the lock.
* Check to see if split is still needed; someone else might have already
* done one while we waited for the lock.
*
* Make sure this stays in sync with _hash_doinsert()
*/
@@ -402,11 +400,11 @@ _hash_expandtable(Relation rel, Buffer metabuf)
* The lock protects us against other backends, but not against our own
* backend. Must check for active scans separately.
*
* Ideally we would lock the new bucket too before proceeding, but if we
* are about to cross a splitpoint then the BUCKET_TO_BLKNO mapping
* isn't correct yet. For simplicity we update the metapage first and
* then lock. This should be okay because no one else should be
* trying to lock the new bucket yet...
* Ideally we would lock the new bucket too before proceeding, but if we are
* about to cross a splitpoint then the BUCKET_TO_BLKNO mapping isn't
* correct yet. For simplicity we update the metapage first and then
* lock. This should be okay because no one else should be trying to lock
* the new bucket yet...
*/
new_bucket = metap->hashm_maxbucket + 1;
old_bucket = (new_bucket & metap->hashm_lowmask);
@@ -420,14 +418,13 @@ _hash_expandtable(Relation rel, Buffer metabuf)
goto fail;

/*
* Okay to proceed with split. Update the metapage bucket mapping
* info.
* Okay to proceed with split. Update the metapage bucket mapping info.
*
* Since we are scribbling on the metapage data right in the shared
* buffer, any failure in this next little bit leaves us with a big
* problem: the metapage is effectively corrupt but could get written
* back to disk. We don't really expect any failure, but just to be
* sure, establish a critical section.
* Since we are scribbling on the metapage data right in the shared buffer,
* any failure in this next little bit leaves us with a big problem: the
* metapage is effectively corrupt but could get written back to disk. We
* don't really expect any failure, but just to be sure, establish a
* critical section.
*/
START_CRIT_SECTION();

@@ -443,8 +440,8 @@ _hash_expandtable(Relation rel, Buffer metabuf)
/*
* If the split point is increasing (hashm_maxbucket's log base 2
* increases), we need to adjust the hashm_spares[] array and
* hashm_ovflpoint so that future overflow pages will be created
* beyond this new batch of bucket pages.
* hashm_ovflpoint so that future overflow pages will be created beyond
* this new batch of bucket pages.
*
* XXX should initialize new bucket pages to prevent out-of-order page
* creation? Don't wanna do it right here though.
@@ -471,10 +468,9 @@ _hash_expandtable(Relation rel, Buffer metabuf)
/*
* Copy bucket mapping info now; this saves re-accessing the meta page
* inside _hash_splitbucket's inner loop. Note that once we drop the
* split lock, other splits could begin, so these values might be out
* of date before _hash_splitbucket finishes. That's okay, since all
* it needs is to tell which of these two buckets to map hashkeys
* into.
* split lock, other splits could begin, so these values might be out of
* date before _hash_splitbucket finishes. That's okay, since all it
* needs is to tell which of these two buckets to map hashkeys into.
*/
maxbucket = metap->hashm_maxbucket;
highmask = metap->hashm_highmask;
@@ -554,9 +550,9 @@ _hash_splitbucket(Relation rel,
TupleDesc itupdesc = RelationGetDescr(rel);

/*
* It should be okay to simultaneously write-lock pages from each
* bucket, since no one else can be trying to acquire buffer lock on
* pages of either bucket.
* It should be okay to simultaneously write-lock pages from each bucket,
* since no one else can be trying to acquire buffer lock on pages of
* either bucket.
*/
oblkno = start_oblkno;
nblkno = start_nblkno;
@@ -578,17 +574,17 @@ _hash_splitbucket(Relation rel,
nopaque->hasho_filler = HASHO_FILL;

/*
* Partition the tuples in the old bucket between the old bucket and
* the new bucket, advancing along the old bucket's overflow bucket
* chain and adding overflow pages to the new bucket as needed.
* Partition the tuples in the old bucket between the old bucket and the
* new bucket, advancing along the old bucket's overflow bucket chain and
* adding overflow pages to the new bucket as needed.
*/
ooffnum = FirstOffsetNumber;
omaxoffnum = PageGetMaxOffsetNumber(opage);
for (;;)
{
/*
* at each iteration through this loop, each of these variables
* should be up-to-date: obuf opage oopaque ooffnum omaxoffnum
* at each iteration through this loop, each of these variables should
* be up-to-date: obuf opage oopaque ooffnum omaxoffnum
*/

/* check if we're at the end of the page */
@@ -600,8 +596,8 @@ _hash_splitbucket(Relation rel,
break;

/*
* we ran out of tuples on this particular page, but we have
* more overflow pages; advance to next page.
* we ran out of tuples on this particular page, but we have more
* overflow pages; advance to next page.
*/
_hash_wrtbuf(rel, obuf);

@@ -618,8 +614,7 @@ _hash_splitbucket(Relation rel,
* Re-hash the tuple to determine which bucket it now belongs in.
*
* It is annoying to call the hash function while holding locks, but
* releasing and relocking the page for each tuple is unappealing
* too.
* releasing and relocking the page for each tuple is unappealing too.
*/
hitem = (HashItem) PageGetItem(opage, PageGetItemId(opage, ooffnum));
itup = &(hitem->hash_itup);
@@ -632,9 +627,9 @@ _hash_splitbucket(Relation rel,
if (bucket == nbucket)
{
/*
* insert the tuple into the new bucket. if it doesn't fit on
* the current page in the new bucket, we must allocate a new
* overflow page and place the tuple on that page instead.
* insert the tuple into the new bucket. if it doesn't fit on the
* current page in the new bucket, we must allocate a new overflow
* page and place the tuple on that page instead.
*/
itemsz = IndexTupleDSize(hitem->hash_itup)
+ (sizeof(HashItemData) - sizeof(IndexTupleData));
@@ -659,13 +654,13 @@ _hash_splitbucket(Relation rel,
RelationGetRelationName(rel));

/*
* now delete the tuple from the old bucket. after this
* section of code, 'ooffnum' will actually point to the
* ItemId to which we would point if we had advanced it before
* the deletion (PageIndexTupleDelete repacks the ItemId
* array). this also means that 'omaxoffnum' is exactly one
* less than it used to be, so we really can just decrement it
* instead of calling PageGetMaxOffsetNumber.
* now delete the tuple from the old bucket. after this section
* of code, 'ooffnum' will actually point to the ItemId to which
* we would point if we had advanced it before the deletion
* (PageIndexTupleDelete repacks the ItemId array). this also
* means that 'omaxoffnum' is exactly one less than it used to be,
* so we really can just decrement it instead of calling
* PageGetMaxOffsetNumber.
*/
PageIndexTupleDelete(opage, ooffnum);
omaxoffnum = OffsetNumberPrev(omaxoffnum);
@@ -673,9 +668,9 @@ _hash_splitbucket(Relation rel,
else
{
/*
* the tuple stays on this page. we didn't move anything, so
* we didn't delete anything and therefore we don't have to
* change 'omaxoffnum'.
* the tuple stays on this page. we didn't move anything, so we
* didn't delete anything and therefore we don't have to change
* 'omaxoffnum'.
*/
Assert(bucket == obucket);
ooffnum = OffsetNumberNext(ooffnum);
@@ -683,11 +678,10 @@ _hash_splitbucket(Relation rel,
}

/*
* We're at the end of the old bucket chain, so we're done
* partitioning the tuples. Before quitting, call _hash_squeezebucket
* to ensure the tuples remaining in the old bucket (including the
* overflow pages) are packed as tightly as possible. The new bucket
* is already tight.
* We're at the end of the old bucket chain, so we're done partitioning
* the tuples. Before quitting, call _hash_squeezebucket to ensure the
* tuples remaining in the old bucket (including the overflow pages) are
* packed as tightly as possible. The new bucket is already tight.
*/
_hash_wrtbuf(rel, obuf);
_hash_wrtbuf(rel, nbuf);

@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/hash/hashscan.c,v 1.38 2004/12/31 21:59:13 pgsql Exp $
* $PostgreSQL: pgsql/src/backend/access/hash/hashscan.c,v 1.39 2005/10/15 02:49:08 momjian Exp $
*
*-------------------------------------------------------------------------
*/
@@ -44,9 +44,9 @@ ReleaseResources_hash(void)
HashScanList next;

/*
* Note: this should be a no-op during normal query shutdown. However,
* in an abort situation ExecutorEnd is not called and so there may be
* open index scans to clean up.
* Note: this should be a no-op during normal query shutdown. However, in
* an abort situation ExecutorEnd is not called and so there may be open
* index scans to clean up.
*/
prev = NULL;

@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/hash/hashsearch.c,v 1.39 2005/10/06 02:29:08 tgl Exp $
* $PostgreSQL: pgsql/src/backend/access/hash/hashsearch.c,v 1.40 2005/10/15 02:49:08 momjian Exp $
*
*-------------------------------------------------------------------------
*/
@@ -137,33 +137,32 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
ItemPointerSetInvalid(current);

/*
* We do not support hash scans with no index qualification, because
* we would have to read the whole index rather than just one bucket.
* That creates a whole raft of problems, since we haven't got a
* practical way to lock all the buckets against splits or
* compactions.
* We do not support hash scans with no index qualification, because we
* would have to read the whole index rather than just one bucket. That
* creates a whole raft of problems, since we haven't got a practical way
* to lock all the buckets against splits or compactions.
*/
if (scan->numberOfKeys < 1)
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("hash indexes do not support whole-index scans")));
errmsg("hash indexes do not support whole-index scans")));

/*
* If the constant in the index qual is NULL, assume it cannot match
* any items in the index.
* If the constant in the index qual is NULL, assume it cannot match any
* items in the index.
*/
if (scan->keyData[0].sk_flags & SK_ISNULL)
return false;

/*
* Okay to compute the hash key. We want to do this before acquiring
* any locks, in case a user-defined hash function happens to be slow.
* Okay to compute the hash key. We want to do this before acquiring any
* locks, in case a user-defined hash function happens to be slow.
*/
hashkey = _hash_datum2hashkey(rel, scan->keyData[0].sk_argument);

/*
* Acquire shared split lock so we can compute the target bucket
* safely (see README).
* Acquire shared split lock so we can compute the target bucket safely
* (see README).
*/
_hash_getlock(rel, 0, HASH_SHARE);

@@ -186,8 +185,7 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
_hash_relbuf(rel, metabuf);

/*
* Acquire share lock on target bucket; then we can release split
* lock.
* Acquire share lock on target bucket; then we can release split lock.
*/
_hash_getlock(rel, blkno, HASH_SHARE);

@@ -263,9 +261,9 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
bucket = opaque->hasho_bucket;

/*
* If _hash_step is called from _hash_first, current will not be
* valid, so we can't dereference it. However, in that case, we
* presumably want to start at the beginning/end of the page...
* If _hash_step is called from _hash_first, current will not be valid, so
* we can't dereference it. However, in that case, we presumably want to
* start at the beginning/end of the page...
*/
maxoff = PageGetMaxOffsetNumber(page);
if (ItemPointerIsValid(current))
@@ -276,8 +274,8 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
/*
* 'offnum' now points to the last tuple we have seen (if any).
*
* continue to step through tuples until: 1) we get to the end of the
* bucket chain or 2) we find a valid tuple.
* continue to step through tuples until: 1) we get to the end of the bucket
* chain or 2) we find a valid tuple.
*/
do
{

@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.199 2005/10/06 02:29:10 tgl Exp $
* $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.200 2005/10/15 02:49:08 momjian Exp $
*
*
* INTERFACE ROUTINES
@@ -54,7 +54,7 @@

static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf,
ItemPointerData from, Buffer newbuf, HeapTuple newtup, bool move);
ItemPointerData from, Buffer newbuf, HeapTuple newtup, bool move);

/* ----------------------------------------------------------------
@@ -272,8 +272,8 @@ heapgettup(Relation relation,
/* 'dir' is now non-zero */

/*
* calculate line pointer and number of remaining items to check on
* this page.
* calculate line pointer and number of remaining items to check on this
* page.
*/
lpp = PageGetItemId(dp, lineoff);
if (dir < 0)
@@ -282,8 +282,8 @@ heapgettup(Relation relation,
linesleft = lines - lineoff;

/*
* advance the scan until we find a qualifying tuple or run out of
* stuff to scan
* advance the scan until we find a qualifying tuple or run out of stuff
* to scan
*/
for (;;)
{
@@ -321,15 +321,14 @@ heapgettup(Relation relation,
}
else
{
++lpp; /* move forward in this page's ItemId
* array */
++lpp; /* move forward in this page's ItemId array */
++lineoff;
}
}

/*
* if we get here, it means we've exhausted the items on this page
* and it's time to move to the next.
* if we get here, it means we've exhausted the items on this page and
* it's time to move to the next.
*/
LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);

@@ -506,15 +505,15 @@ relation_openrv(const RangeVar *relation, LOCKMODE lockmode)

/*
* Check for shared-cache-inval messages before trying to open the
* relation. This is needed to cover the case where the name
* identifies a rel that has been dropped and recreated since the
* start of our transaction: if we don't flush the old syscache entry
* then we'll latch onto that entry and suffer an error when we do
* LockRelation. Note that relation_open does not need to do this,
* since a relation's OID never changes.
* relation. This is needed to cover the case where the name identifies a
* rel that has been dropped and recreated since the start of our
* transaction: if we don't flush the old syscache entry then we'll latch
* onto that entry and suffer an error when we do LockRelation. Note that
* relation_open does not need to do this, since a relation's OID never
* changes.
*
* We skip this if asked for NoLock, on the assumption that the caller
* has already ensured some appropriate lock is held.
* We skip this if asked for NoLock, on the assumption that the caller has
* already ensured some appropriate lock is held.
*/
if (lockmode != NoLock)
AcceptInvalidationMessages();
@@ -633,9 +632,9 @@ heap_beginscan(Relation relation, Snapshot snapshot,
/*
* increment relation ref count while scanning relation
*
* This is just to make really sure the relcache entry won't go away
* while the scan has a pointer to it. Caller should be holding the
* rel open anyway, so this is redundant in all normal scenarios...
* This is just to make really sure the relcache entry won't go away while
* the scan has a pointer to it. Caller should be holding the rel open
* anyway, so this is redundant in all normal scenarios...
*/
RelationIncrementReferenceCount(relation);

@@ -649,8 +648,8 @@ heap_beginscan(Relation relation, Snapshot snapshot,
scan->rs_nkeys = nkeys;

/*
* we do this here instead of in initscan() because heap_rescan also
* calls initscan() and we don't want to allocate memory again
* we do this here instead of in initscan() because heap_rescan also calls
* initscan() and we don't want to allocate memory again
*/
if (nkeys > 0)
scan->rs_key = (ScanKey) palloc(sizeof(ScanKeyData) * nkeys);
@@ -763,8 +762,8 @@ heap_getnext(HeapScanDesc scan, ScanDirection direction)
}

/*
* if we get here it means we have a new current scan tuple, so point
* to the proper return buffer and return the tuple.
* if we get here it means we have a new current scan tuple, so point to
* the proper return buffer and return the tuple.
*/

HEAPDEBUG_3; /* heap_getnext returning tuple */
@@ -859,8 +858,8 @@ heap_release_fetch(Relation relation,
dp = (PageHeader) BufferGetPage(buffer);

/*
* We'd better check for out-of-range offnum in case of VACUUM since
* the TID was obtained.
* We'd better check for out-of-range offnum in case of VACUUM since the
* TID was obtained.
*/
offnum = ItemPointerGetOffsetNumber(tid);
if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(dp))
@@ -952,7 +951,7 @@ heap_release_fetch(Relation relation,
* possibly uncommitted version.
*
* *tid is both an input and an output parameter: it is updated to
* show the latest version of the row. Note that it will not be changed
* show the latest version of the row. Note that it will not be changed
* if no version of the row passes the snapshot test.
*/
void
@@ -960,7 +959,7 @@ heap_get_latest_tid(Relation relation,
Snapshot snapshot,
ItemPointer tid)
{
BlockNumber blk;
BlockNumber blk;
ItemPointerData ctid;
TransactionId priorXmax;

@@ -969,10 +968,10 @@ heap_get_latest_tid(Relation relation,
return;

/*
* Since this can be called with user-supplied TID, don't trust the
* input too much. (RelationGetNumberOfBlocks is an expensive check,
* so we don't check t_ctid links again this way. Note that it would
* not do to call it just once and save the result, either.)
* Since this can be called with user-supplied TID, don't trust the input
* too much. (RelationGetNumberOfBlocks is an expensive check, so we
* don't check t_ctid links again this way. Note that it would not do to
* call it just once and save the result, either.)
*/
blk = ItemPointerGetBlockNumber(tid);
if (blk >= RelationGetNumberOfBlocks(relation))
@@ -980,9 +979,9 @@ heap_get_latest_tid(Relation relation,
blk, RelationGetRelationName(relation));

/*
* Loop to chase down t_ctid links. At top of loop, ctid is the
* tuple we need to examine, and *tid is the TID we will return if
* ctid turns out to be bogus.
* Loop to chase down t_ctid links. At top of loop, ctid is the tuple we
* need to examine, and *tid is the TID we will return if ctid turns out
* to be bogus.
*
* Note that we will loop until we reach the end of the t_ctid chain.
* Depending on the snapshot passed, there might be at most one visible
@@ -1008,8 +1007,8 @@ heap_get_latest_tid(Relation relation,

/*
* Check for bogus item number. This is not treated as an error
* condition because it can happen while following a t_ctid link.
* We just assume that the prior tid is OK and return it unchanged.
* condition because it can happen while following a t_ctid link. We
* just assume that the prior tid is OK and return it unchanged.
*/
offnum = ItemPointerGetOffsetNumber(&ctid);
if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(dp))
@@ -1037,7 +1036,7 @@ heap_get_latest_tid(Relation relation,
* tuple. Check for XMIN match.
*/
if (TransactionIdIsValid(priorXmax) &&
!TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(tp.t_data)))
!TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(tp.t_data)))
{
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
ReleaseBuffer(buffer);
@@ -1068,7 +1067,7 @@ heap_get_latest_tid(Relation relation,
priorXmax = HeapTupleHeaderGetXmax(tp.t_data);
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
ReleaseBuffer(buffer);
} /* end of loop */
} /* end of loop */
}

/*
@@ -1102,13 +1101,12 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
#endif

/*
* If the object id of this tuple has already been assigned, trust
* the caller. There are a couple of ways this can happen. At
* initial db creation, the backend program sets oids for tuples.
* When we define an index, we set the oid. Finally, in the
* future, we may allow users to set their own object ids in order
* to support a persistent object store (objects need to contain
* pointers to one another).
* If the object id of this tuple has already been assigned, trust the
* caller. There are a couple of ways this can happen. At initial db
* creation, the backend program sets oids for tuples. When we define
* an index, we set the oid. Finally, in the future, we may allow
* users to set their own object ids in order to support a persistent
* object store (objects need to contain pointers to one another).
*/
if (!OidIsValid(HeapTupleGetOid(tup)))
HeapTupleSetOid(tup, GetNewOid(relation));
@@ -1129,8 +1127,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,

/*
* If the new tuple is too big for storage or contains already toasted
* out-of-line attributes from some other relation, invoke the
* toaster.
* out-of-line attributes from some other relation, invoke the toaster.
*/
if (HeapTupleHasExternal(tup) ||
(MAXALIGN(tup->t_len) > TOAST_TUPLE_THRESHOLD))
@@ -1172,9 +1169,9 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
xlhdr.t_hoff = tup->t_data->t_hoff;

/*
* note we mark rdata[1] as belonging to buffer; if XLogInsert
* decides to write the whole page to the xlog, we don't need to
* store xl_heap_header in the xlog.
* note we mark rdata[1] as belonging to buffer; if XLogInsert decides
* to write the whole page to the xlog, we don't need to store
* xl_heap_header in the xlog.
*/
rdata[1].data = (char *) &xlhdr;
rdata[1].len = SizeOfHeapHeader;
@@ -1190,9 +1187,9 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
rdata[2].next = NULL;

/*
* If this is the single and first tuple on page, we can reinit
* the page instead of restoring the whole thing. Set flag, and
* hide buffer references from XLogInsert.
* If this is the single and first tuple on page, we can reinit the
* page instead of restoring the whole thing. Set flag, and hide
* buffer references from XLogInsert.
*/
if (ItemPointerGetOffsetNumber(&(tup->t_self)) == FirstOffsetNumber &&
PageGetMaxOffsetNumber(page) == FirstOffsetNumber)
@@ -1213,10 +1210,10 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
WriteBuffer(buffer);

/*
* If tuple is cachable, mark it for invalidation from the caches in
* case we abort. Note it is OK to do this after WriteBuffer releases
* the buffer, because the "tup" data structure is all in local
* memory, not in the shared buffer.
* If tuple is cachable, mark it for invalidation from the caches in case
* we abort. Note it is OK to do this after WriteBuffer releases the
* buffer, because the "tup" data structure is all in local memory, not in
* the shared buffer.
*/
CacheInvalidateHeapTuple(relation, tup);

@@ -1268,7 +1265,7 @@ heap_delete(Relation relation, ItemPointer tid,
ItemPointer ctid, TransactionId *update_xmax,
CommandId cid, Snapshot crosscheck, bool wait)
{
HTSU_Result result;
HTSU_Result result;
TransactionId xid = GetCurrentTransactionId();
ItemId lp;
HeapTupleData tp;
@@ -1301,7 +1298,7 @@ l1:
else if (result == HeapTupleBeingUpdated && wait)
{
TransactionId xwait;
uint16 infomask;
uint16 infomask;

/* must copy state data before unlocking buffer */
xwait = HeapTupleHeaderGetXmax(tp.t_data);
@@ -1310,13 +1307,13 @@ l1:
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);

/*
* Acquire tuple lock to establish our priority for the tuple
* (see heap_lock_tuple). LockTuple will release us when we are
* Acquire tuple lock to establish our priority for the tuple (see
* heap_lock_tuple). LockTuple will release us when we are
* next-in-line for the tuple.
*
* If we are forced to "start over" below, we keep the tuple lock;
* this arranges that we stay at the head of the line while
* rechecking tuple state.
* If we are forced to "start over" below, we keep the tuple lock; this
* arranges that we stay at the head of the line while rechecking
* tuple state.
*/
if (!have_tuple_lock)
{
@@ -1347,12 +1344,12 @@ l1:
goto l1;

/*
* You might think the multixact is necessarily done here, but
* not so: it could have surviving members, namely our own xact
* or other subxacts of this backend. It is legal for us to
* delete the tuple in either case, however (the latter case is
* essentially a situation of upgrading our former shared lock
* to exclusive). We don't bother changing the on-disk hint bits
* You might think the multixact is necessarily done here, but not
* so: it could have surviving members, namely our own xact or
* other subxacts of this backend. It is legal for us to delete
* the tuple in either case, however (the latter case is
* essentially a situation of upgrading our former shared lock to
* exclusive). We don't bother changing the on-disk hint bits
* since we are about to overwrite the xmax altogether.
*/
}
@@ -1385,8 +1382,8 @@ l1:
}

/*
* We may overwrite if previous xmax aborted, or if it committed
* but only locked the tuple without updating it.
* We may overwrite if previous xmax aborted, or if it committed but
* only locked the tuple without updating it.
*/
if (tp.t_data->t_infomask & (HEAP_XMAX_INVALID |
HEAP_IS_LOCKED))
@@ -1467,18 +1464,18 @@ l1:

/*
* If the tuple has toasted out-of-line attributes, we need to delete
* those items too. We have to do this before WriteBuffer because we
* need to look at the contents of the tuple, but it's OK to release
* the context lock on the buffer first.
* those items too. We have to do this before WriteBuffer because we need
* to look at the contents of the tuple, but it's OK to release the
* context lock on the buffer first.
*/
if (HeapTupleHasExternal(&tp))
heap_tuple_toast_attrs(relation, NULL, &tp);

/*
* Mark tuple for invalidation from system caches at next command
* boundary. We have to do this before WriteBuffer because we need to
* look at the contents of the tuple, so we need to hold our refcount
* on the buffer.
* boundary. We have to do this before WriteBuffer because we need to look
* at the contents of the tuple, so we need to hold our refcount on the
* buffer.
*/
CacheInvalidateHeapTuple(relation, &tp);

@@ -1506,7 +1503,7 @@ l1:
void
simple_heap_delete(Relation relation, ItemPointer tid)
{
HTSU_Result result;
HTSU_Result result;
ItemPointerData update_ctid;
TransactionId update_xmax;

@@ -1569,7 +1566,7 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
ItemPointer ctid, TransactionId *update_xmax,
CommandId cid, Snapshot crosscheck, bool wait)
{
HTSU_Result result;
HTSU_Result result;
TransactionId xid = GetCurrentTransactionId();
ItemId lp;
HeapTupleData oldtup;
@@ -1598,8 +1595,8 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
/*
* Note: beyond this point, use oldtup not otid to refer to old tuple.
* otid may very well point at newtup->t_self, which we will overwrite
* with the new tuple's location, so there's great risk of confusion
* if we use otid anymore.
* with the new tuple's location, so there's great risk of confusion if we
* use otid anymore.
*/

l2:
@@ -1614,7 +1611,7 @@ l2:
else if (result == HeapTupleBeingUpdated && wait)
{
TransactionId xwait;
uint16 infomask;
uint16 infomask;

/* must copy state data before unlocking buffer */
xwait = HeapTupleHeaderGetXmax(oldtup.t_data);
@@ -1623,13 +1620,13 @@ l2:
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);

/*
* Acquire tuple lock to establish our priority for the tuple
* (see heap_lock_tuple). LockTuple will release us when we are
* Acquire tuple lock to establish our priority for the tuple (see
* heap_lock_tuple). LockTuple will release us when we are
* next-in-line for the tuple.
*
* If we are forced to "start over" below, we keep the tuple lock;
* this arranges that we stay at the head of the line while
* rechecking tuple state.
* If we are forced to "start over" below, we keep the tuple lock; this
* arranges that we stay at the head of the line while rechecking
* tuple state.
*/
if (!have_tuple_lock)
{
@@ -1660,12 +1657,12 @@ l2:
goto l2;

/*
* You might think the multixact is necessarily done here, but
* not so: it could have surviving members, namely our own xact
* or other subxacts of this backend. It is legal for us to
* update the tuple in either case, however (the latter case is
* essentially a situation of upgrading our former shared lock
* to exclusive). We don't bother changing the on-disk hint bits
* You might think the multixact is necessarily done here, but not
* so: it could have surviving members, namely our own xact or
* other subxacts of this backend. It is legal for us to update
* the tuple in either case, however (the latter case is
* essentially a situation of upgrading our former shared lock to
* exclusive). We don't bother changing the on-disk hint bits
* since we are about to overwrite the xmax altogether.
*/
}
@@ -1698,8 +1695,8 @@ l2:
}

/*
* We may overwrite if previous xmax aborted, or if it committed
* but only locked the tuple without updating it.
* We may overwrite if previous xmax aborted, or if it committed but
* only locked the tuple without updating it.
*/
if (oldtup.t_data->t_infomask & (HEAP_XMAX_INVALID |
HEAP_IS_LOCKED))
@@ -1753,15 +1750,15 @@ l2:
HeapTupleHeaderSetCmax(newtup->t_data, 0); /* for cleanliness */

/*
* If the toaster needs to be activated, OR if the new tuple will not
* fit on the same page as the old, then we need to release the
* context lock (but not the pin!) on the old tuple's buffer while we
* are off doing TOAST and/or table-file-extension work. We must mark
* the old tuple to show that it's already being updated, else other
* processes may try to update it themselves.
* If the toaster needs to be activated, OR if the new tuple will not fit
* on the same page as the old, then we need to release the context lock
* (but not the pin!) on the old tuple's buffer while we are off doing
* TOAST and/or table-file-extension work. We must mark the old tuple to
* show that it's already being updated, else other processes may try to
* update it themselves.
*
* We need to invoke the toaster if there are already any out-of-line
* toasted values present, or if the new tuple is over-threshold.
* We need to invoke the toaster if there are already any out-of-line toasted
* values present, or if the new tuple is over-threshold.
*/
need_toast = (HeapTupleHasExternal(&oldtup) ||
HeapTupleHasExternal(newtup) ||
@@ -1790,22 +1787,21 @@ l2:
}

/*
* Now, do we need a new page for the tuple, or not? This is a
* bit tricky since someone else could have added tuples to the
* page while we weren't looking. We have to recheck the
* available space after reacquiring the buffer lock. But don't
* bother to do that if the former amount of free space is still
* not enough; it's unlikely there's more free now than before.
* Now, do we need a new page for the tuple, or not? This is a bit
* tricky since someone else could have added tuples to the page while
* we weren't looking. We have to recheck the available space after
* reacquiring the buffer lock. But don't bother to do that if the
* former amount of free space is still not enough; it's unlikely
* there's more free now than before.
*
* What's more, if we need to get a new page, we will need to acquire
* buffer locks on both old and new pages. To avoid deadlock
* against some other backend trying to get the same two locks in
* the other order, we must be consistent about the order we get
* the locks in. We use the rule "lock the lower-numbered page of
* the relation first". To implement this, we must do
* RelationGetBufferForTuple while not holding the lock on the old
* page, and we must rely on it to get the locks on both pages in
* the correct order.
* buffer locks on both old and new pages. To avoid deadlock against
* some other backend trying to get the same two locks in the other
* order, we must be consistent about the order we get the locks in.
* We use the rule "lock the lower-numbered page of the relation
* first". To implement this, we must do RelationGetBufferForTuple
* while not holding the lock on the old page, and we must rely on it
* to get the locks on both pages in the correct order.
*/
if (newtupsize > pagefree)
{
@@ -1823,8 +1819,8 @@ l2:
{
/*
* Rats, it doesn't fit anymore. We must now unlock and
* relock to avoid deadlock. Fortunately, this path
* should seldom be taken.
* relock to avoid deadlock. Fortunately, this path should
* seldom be taken.
*/
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
newbuf = RelationGetBufferForTuple(relation, newtup->t_len,
@@ -1845,9 +1841,9 @@ l2:
}

/*
* At this point newbuf and buffer are both pinned and locked, and
* newbuf has enough space for the new tuple. If they are the same
* buffer, only one pin is held.
* At this point newbuf and buffer are both pinned and locked, and newbuf
* has enough space for the new tuple. If they are the same buffer, only
* one pin is held.
*/

/* NO EREPORT(ERROR) from here till changes are logged */
@@ -1897,8 +1893,8 @@ l2:

/*
* Mark old tuple for invalidation from system caches at next command
* boundary. We have to do this before WriteBuffer because we need to
* look at the contents of the tuple, so we need to hold our refcount.
* boundary. We have to do this before WriteBuffer because we need to look
* at the contents of the tuple, so we need to hold our refcount.
*/
CacheInvalidateHeapTuple(relation, &oldtup);

@@ -1907,10 +1903,10 @@ l2:
WriteBuffer(buffer);

/*
* If new tuple is cachable, mark it for invalidation from the caches
* in case we abort. Note it is OK to do this after WriteBuffer
* releases the buffer, because the "newtup" data structure is all in
* local memory, not in the shared buffer.
* If new tuple is cachable, mark it for invalidation from the caches in
* case we abort. Note it is OK to do this after WriteBuffer releases the
* buffer, because the "newtup" data structure is all in local memory, not
* in the shared buffer.
*/
CacheInvalidateHeapTuple(relation, newtup);

@@ -1936,7 +1932,7 @@ l2:
void
simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup)
{
HTSU_Result result;
HTSU_Result result;
ItemPointerData update_ctid;
TransactionId update_xmax;

@@ -2012,7 +2008,7 @@ simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup)
* waiter gets the tuple, potentially leading to indefinite starvation of
* some waiters. The possibility of share-locking makes the problem much
* worse --- a steady stream of share-lockers can easily block an exclusive
* locker forever. To provide more reliable semantics about who gets a
* locker forever. To provide more reliable semantics about who gets a
* tuple-level lock first, we use the standard lock manager. The protocol
* for waiting for a tuple-level lock is really
* LockTuple()
@@ -2020,7 +2016,7 @@ simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup)
* mark tuple as locked by me
* UnlockTuple()
* When there are multiple waiters, arbitration of who is to get the lock next
* is provided by LockTuple(). However, at most one tuple-level lock will
* is provided by LockTuple(). However, at most one tuple-level lock will
* be held or awaited per backend at any time, so we don't risk overflow
* of the lock table. Note that incoming share-lockers are required to
* do LockTuple as well, if there is any conflict, to ensure that they don't
@@ -2032,11 +2028,11 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, Buffer *buffer,
ItemPointer ctid, TransactionId *update_xmax,
CommandId cid, LockTupleMode mode, bool nowait)
{
HTSU_Result result;
HTSU_Result result;
ItemPointer tid = &(tuple->t_self);
ItemId lp;
PageHeader dp;
TransactionId xid;
TransactionId xid;
uint16 new_infomask;
LOCKMODE tuple_lock_type;
bool have_tuple_lock = false;
@@ -2067,7 +2063,7 @@ l3:
else if (result == HeapTupleBeingUpdated)
{
TransactionId xwait;
uint16 infomask;
uint16 infomask;

/* must copy state data before unlocking buffer */
xwait = HeapTupleHeaderGetXmax(tuple->t_data);
@@ -2077,12 +2073,12 @@ l3:

/*
* Acquire tuple lock to establish our priority for the tuple.
* LockTuple will release us when we are next-in-line for the
* tuple. We must do this even if we are share-locking.
* LockTuple will release us when we are next-in-line for the tuple.
* We must do this even if we are share-locking.
*
* If we are forced to "start over" below, we keep the tuple lock;
* this arranges that we stay at the head of the line while
* rechecking tuple state.
* If we are forced to "start over" below, we keep the tuple lock; this
* arranges that we stay at the head of the line while rechecking
* tuple state.
*/
if (!have_tuple_lock)
{
@@ -2091,8 +2087,8 @@ l3:
if (!ConditionalLockTuple(relation, tid, tuple_lock_type))
ereport(ERROR,
(errcode(ERRCODE_LOCK_NOT_AVAILABLE),
errmsg("could not obtain lock on row in relation \"%s\"",
RelationGetRelationName(relation))));
errmsg("could not obtain lock on row in relation \"%s\"",
RelationGetRelationName(relation))));
}
else
LockTuple(relation, tid, tuple_lock_type);
@@ -2108,8 +2104,8 @@ l3:
LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);

/*
* Make sure it's still a shared lock, else start over. (It's
* OK if the ownership of the shared lock has changed, though.)
* Make sure it's still a shared lock, else start over. (It's OK
* if the ownership of the shared lock has changed, though.)
*/
if (!(tuple->t_data->t_infomask & HEAP_XMAX_SHARED_LOCK))
goto l3;
@@ -2122,8 +2118,8 @@ l3:
if (!ConditionalMultiXactIdWait((MultiXactId) xwait))
ereport(ERROR,
(errcode(ERRCODE_LOCK_NOT_AVAILABLE),
errmsg("could not obtain lock on row in relation \"%s\"",
RelationGetRelationName(relation))));
errmsg("could not obtain lock on row in relation \"%s\"",
RelationGetRelationName(relation))));
}
else
MultiXactIdWait((MultiXactId) xwait);
@@ -2131,9 +2127,9 @@ l3:
LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);

/*
* If xwait had just locked the tuple then some other xact
* could update this tuple before we get to this point.
* Check for xmax change, and start over if so.
* If xwait had just locked the tuple then some other xact could
* update this tuple before we get to this point. Check for xmax
* change, and start over if so.
*/
if (!(tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
!TransactionIdEquals(HeapTupleHeaderGetXmax(tuple->t_data),
@@ -2141,12 +2137,12 @@ l3:
goto l3;

/*
* You might think the multixact is necessarily done here, but
* not so: it could have surviving members, namely our own xact
* or other subxacts of this backend. It is legal for us to
* lock the tuple in either case, however. We don't bother
* changing the on-disk hint bits since we are about to
* overwrite the xmax altogether.
* You might think the multixact is necessarily done here, but not
* so: it could have surviving members, namely our own xact or
* other subxacts of this backend. It is legal for us to lock the
* tuple in either case, however. We don't bother changing the
* on-disk hint bits since we are about to overwrite the xmax
* altogether.
*/
}
else
@@ -2157,8 +2153,8 @@ l3:
if (!ConditionalXactLockTableWait(xwait))
ereport(ERROR,
(errcode(ERRCODE_LOCK_NOT_AVAILABLE),
errmsg("could not obtain lock on row in relation \"%s\"",
RelationGetRelationName(relation))));
errmsg("could not obtain lock on row in relation \"%s\"",
RelationGetRelationName(relation))));
}
else
XactLockTableWait(xwait);
@@ -2166,9 +2162,9 @@ l3:
LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);

/*
* xwait is done, but if xwait had just locked the tuple then
* some other xact could update this tuple before we get to
* this point. Check for xmax change, and start over if so.
* xwait is done, but if xwait had just locked the tuple then some
* other xact could update this tuple before we get to this point.
* Check for xmax change, and start over if so.
*/
if ((tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI) ||
!TransactionIdEquals(HeapTupleHeaderGetXmax(tuple->t_data),
@@ -2188,10 +2184,10 @@ l3:
}

/*
* We may lock if previous xmax aborted, or if it committed
* but only locked the tuple without updating it. The case where
* we didn't wait because we are joining an existing shared lock
* is correctly handled, too.
* We may lock if previous xmax aborted, or if it committed but only
* locked the tuple without updating it. The case where we didn't
* wait because we are joining an existing shared lock is correctly
* handled, too.
*/
if (tuple->t_data->t_infomask & (HEAP_XMAX_INVALID |
HEAP_IS_LOCKED))
@@ -2213,9 +2209,9 @@ l3:
}

/*
* Compute the new xmax and infomask to store into the tuple. Note we
* do not modify the tuple just yet, because that would leave it in the
* wrong state if multixact.c elogs.
* Compute the new xmax and infomask to store into the tuple. Note we do
* not modify the tuple just yet, because that would leave it in the wrong
* state if multixact.c elogs.
*/
xid = GetCurrentTransactionId();

@@ -2229,17 +2225,16 @@ l3:

if (mode == LockTupleShared)
{
TransactionId xmax = HeapTupleHeaderGetXmax(tuple->t_data);
TransactionId xmax = HeapTupleHeaderGetXmax(tuple->t_data);
uint16 old_infomask = tuple->t_data->t_infomask;

/*
* If this is the first acquisition of a shared lock in the current
* transaction, set my per-backend OldestMemberMXactId setting.
* We can be certain that the transaction will never become a
* member of any older MultiXactIds than that. (We have to do this
* even if we end up just using our own TransactionId below, since
* some other backend could incorporate our XID into a MultiXact
* immediately afterwards.)
* transaction, set my per-backend OldestMemberMXactId setting. We can
* be certain that the transaction will never become a member of any
* older MultiXactIds than that. (We have to do this even if we end
* up just using our own TransactionId below, since some other backend
* could incorporate our XID into a MultiXact immediately afterwards.)
*/
MultiXactIdSetOldestMember();

@@ -2249,14 +2244,14 @@ l3:
* Check to see if we need a MultiXactId because there are multiple
* lockers.
*
* HeapTupleSatisfiesUpdate will have set the HEAP_XMAX_INVALID
* bit if the xmax was a MultiXactId but it was not running anymore.
* There is a race condition, which is that the MultiXactId may have
* finished since then, but that uncommon case is handled within
* HeapTupleSatisfiesUpdate will have set the HEAP_XMAX_INVALID bit if
* the xmax was a MultiXactId but it was not running anymore. There is
* a race condition, which is that the MultiXactId may have finished
* since then, but that uncommon case is handled within
* MultiXactIdExpand.
*
* There is a similar race condition possible when the old xmax was
* a regular TransactionId. We test TransactionIdIsInProgress again
* There is a similar race condition possible when the old xmax was a
* regular TransactionId. We test TransactionIdIsInProgress again
* just to narrow the window, but it's still possible to end up
* creating an unnecessary MultiXactId. Fortunately this is harmless.
*/
@@ -2277,10 +2272,10 @@ l3:
{
/*
* If the old locker is ourselves, we'll just mark the
* tuple again with our own TransactionId. However we
* have to consider the possibility that we had
* exclusive rather than shared lock before --- if so,
* be careful to preserve the exclusivity of the lock.
* tuple again with our own TransactionId. However we
* have to consider the possibility that we had exclusive
* rather than shared lock before --- if so, be careful to
* preserve the exclusivity of the lock.
*/
if (!(old_infomask & HEAP_XMAX_SHARED_LOCK))
{
@@ -2303,9 +2298,9 @@ l3:
else
{
/*
* Can get here iff HeapTupleSatisfiesUpdate saw the old
* xmax as running, but it finished before
* TransactionIdIsInProgress() got to run. Treat it like
* Can get here iff HeapTupleSatisfiesUpdate saw the old xmax
* as running, but it finished before
* TransactionIdIsInProgress() got to run. Treat it like
* there's no locker in the tuple.
*/
}
@@ -2329,8 +2324,8 @@ l3:
/*
* Store transaction information of xact locking the tuple.
*
* Note: our CID is meaningless if storing a MultiXactId, but no harm
* in storing it anyway.
* Note: our CID is meaningless if storing a MultiXactId, but no harm in
* storing it anyway.
*/
tuple->t_data->t_infomask = new_infomask;
HeapTupleHeaderSetXmax(tuple->t_data, xid);
@@ -2339,8 +2334,8 @@ l3:
tuple->t_data->t_ctid = *tid;

/*
* XLOG stuff. You might think that we don't need an XLOG record because
* there is no state change worth restoring after a crash. You would be
* XLOG stuff. You might think that we don't need an XLOG record because
* there is no state change worth restoring after a crash. You would be
* wrong however: we have just written either a TransactionId or a
* MultiXactId that may never have been seen on disk before, and we need
* to make sure that there are XLOG entries covering those ID numbers.
@@ -2473,8 +2468,8 @@ log_heap_clean(Relation reln, Buffer buffer, OffsetNumber *unused, int uncnt)

/*
* The unused-offsets array is not actually in the buffer, but pretend
* that it is. When XLogInsert stores the whole buffer, the offsets
* array need not be stored too.
* that it is. When XLogInsert stores the whole buffer, the offsets array
* need not be stored too.
*/
if (uncnt > 0)
{
@@ -2500,11 +2495,10 @@ log_heap_update(Relation reln, Buffer oldbuf, ItemPointerData from,
Buffer newbuf, HeapTuple newtup, bool move)
{
/*
* Note: xlhdr is declared to have adequate size and correct alignment
* for an xl_heap_header. However the two tids, if present at all,
* will be packed in with no wasted space after the xl_heap_header;
* they aren't necessarily aligned as implied by this struct
* declaration.
* Note: xlhdr is declared to have adequate size and correct alignment for
* an xl_heap_header. However the two tids, if present at all, will be
* packed in with no wasted space after the xl_heap_header; they aren't
* necessarily aligned as implied by this struct declaration.
*/
struct
{
@@ -2555,8 +2549,8 @@ log_heap_update(Relation reln, Buffer oldbuf, ItemPointerData from,
}

/*
* As with insert records, we need not store the rdata[2] segment if
* we decide to store the whole buffer instead.
* As with insert records, we need not store the rdata[2] segment if we
* decide to store the whole buffer instead.
*/
rdata[2].data = (char *) &xlhdr;
rdata[2].len = hsize;
@@ -2655,8 +2649,8 @@ heap_xlog_newpage(XLogRecPtr lsn, XLogRecord *record)
Page page;

/*
* Note: the NEWPAGE log record is used for both heaps and indexes, so
* do not do anything that assumes we are touching a heap.
* Note: the NEWPAGE log record is used for both heaps and indexes, so do
* not do anything that assumes we are touching a heap.
*/

if (record->xl_info & XLR_BKP_BLOCK_1)
@@ -2699,7 +2693,7 @@ heap_xlog_delete(XLogRecPtr lsn, XLogRecord *record)
return;

buffer = XLogReadBuffer(false, reln,
ItemPointerGetBlockNumber(&(xlrec->target.tid)));
ItemPointerGetBlockNumber(&(xlrec->target.tid)));
if (!BufferIsValid(buffer))
elog(PANIC, "heap_delete_redo: no block");

@@ -2707,7 +2701,7 @@ heap_xlog_delete(XLogRecPtr lsn, XLogRecord *record)
if (PageIsNew((PageHeader) page))
elog(PANIC, "heap_delete_redo: uninitialized page");

if (XLByteLE(lsn, PageGetLSN(page))) /* changes are applied */
if (XLByteLE(lsn, PageGetLSN(page))) /* changes are applied */
{
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
ReleaseBuffer(buffer);
@@ -2749,7 +2743,7 @@ heap_xlog_insert(XLogRecPtr lsn, XLogRecord *record)
struct
{
HeapTupleHeaderData hdr;
char data[MaxTupleSize];
char data[MaxTupleSize];
} tbuf;
HeapTupleHeader htup;
xl_heap_header xlhdr;
@@ -2764,7 +2758,7 @@ heap_xlog_insert(XLogRecPtr lsn, XLogRecord *record)
return;

buffer = XLogReadBuffer(true, reln,
ItemPointerGetBlockNumber(&(xlrec->target.tid)));
ItemPointerGetBlockNumber(&(xlrec->target.tid)));
if (!BufferIsValid(buffer))
return;

@@ -2776,7 +2770,7 @@ heap_xlog_insert(XLogRecPtr lsn, XLogRecord *record)
if (record->xl_info & XLOG_HEAP_INIT_PAGE)
PageInit(page, BufferGetPageSize(buffer), 0);

if (XLByteLE(lsn, PageGetLSN(page))) /* changes are applied */
if (XLByteLE(lsn, PageGetLSN(page))) /* changes are applied */
{
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
ReleaseBuffer(buffer);
@@ -2835,7 +2829,7 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool move)
struct
{
HeapTupleHeaderData hdr;
char data[MaxTupleSize];
char data[MaxTupleSize];
} tbuf;
xl_heap_header xlhdr;
int hsize;
@@ -2850,7 +2844,7 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool move)
/* Deal with old tuple version */

buffer = XLogReadBuffer(false, reln,
ItemPointerGetBlockNumber(&(xlrec->target.tid)));
ItemPointerGetBlockNumber(&(xlrec->target.tid)));
if (!BufferIsValid(buffer))
elog(PANIC, "heap_update_redo: no block");

@@ -2858,7 +2852,7 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool move)
if (PageIsNew((PageHeader) page))
elog(PANIC, "heap_update_redo: uninitialized old page");

if (XLByteLE(lsn, PageGetLSN(page))) /* changes are applied */
if (XLByteLE(lsn, PageGetLSN(page))) /* changes are applied */
{
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
ReleaseBuffer(buffer);
@@ -2928,7 +2922,7 @@ newsame:;
if (record->xl_info & XLOG_HEAP_INIT_PAGE)
PageInit(page, BufferGetPageSize(buffer), 0);

if (XLByteLE(lsn, PageGetLSN(page))) /* changes are applied */
if (XLByteLE(lsn, PageGetLSN(page))) /* changes are applied */
{
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
ReleaseBuffer(buffer);
@@ -2961,7 +2955,7 @@ newsame:;

if (move)
{
TransactionId xid[2]; /* xmax, xmin */
TransactionId xid[2]; /* xmax, xmin */

memcpy((char *) xid,
(char *) xlrec + SizeOfHeapUpdate + SizeOfHeapHeader,
@@ -3008,7 +3002,7 @@ heap_xlog_lock(XLogRecPtr lsn, XLogRecord *record)
return;

buffer = XLogReadBuffer(false, reln,
ItemPointerGetBlockNumber(&(xlrec->target.tid)));
ItemPointerGetBlockNumber(&(xlrec->target.tid)));
if (!BufferIsValid(buffer))
elog(PANIC, "heap_lock_redo: no block");

@@ -3016,7 +3010,7 @@ heap_xlog_lock(XLogRecPtr lsn, XLogRecord *record)
if (PageIsNew((PageHeader) page))
elog(PANIC, "heap_lock_redo: uninitialized page");

if (XLByteLE(lsn, PageGetLSN(page))) /* changes are applied */
if (XLByteLE(lsn, PageGetLSN(page))) /* changes are applied */
{
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
ReleaseBuffer(buffer);
@@ -3081,7 +3075,7 @@ static void
out_target(char *buf, xl_heaptid *target)
{
sprintf(buf + strlen(buf), "rel %u/%u/%u; tid %u/%u",
target->node.spcNode, target->node.dbNode, target->node.relNode,
target->node.spcNode, target->node.dbNode, target->node.relNode,
ItemPointerGetBlockNumber(&(target->tid)),
ItemPointerGetOffsetNumber(&(target->tid)));
}

@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/heap/hio.c,v 1.57 2005/06/20 18:37:01 tgl Exp $
* $PostgreSQL: pgsql/src/backend/access/heap/hio.c,v 1.58 2005/10/15 02:49:08 momjian Exp $
*
*-------------------------------------------------------------------------
*/
@@ -80,7 +80,7 @@ RelationPutHeapTuple(Relation relation,
* enough there). In that case, the page will be pinned and locked only once.
*
* If use_fsm is true (the normal case), we use FSM to help us find free
* space. If use_fsm is false, we always append a new empty page to the
* space. If use_fsm is false, we always append a new empty page to the
* end of the relation if the tuple won't fit on the current target page.
* This can save some cycles when we know the relation is new and doesn't
* contain useful amounts of free space.
||||
@@ -122,22 +122,20 @@ RelationGetBufferForTuple(Relation relation, Size len,
|
||||
if (otherBuffer != InvalidBuffer)
|
||||
otherBlock = BufferGetBlockNumber(otherBuffer);
|
||||
else
|
||||
otherBlock = InvalidBlockNumber; /* just to keep compiler
|
||||
* quiet */
|
||||
otherBlock = InvalidBlockNumber; /* just to keep compiler quiet */
|
||||
|
||||
/*
* We first try to put the tuple on the same page we last inserted a
* tuple on, as cached in the relcache entry. If that doesn't work,
* we ask the shared Free Space Map to locate a suitable page. Since
* the FSM's info might be out of date, we have to be prepared to loop
* around and retry multiple times. (To insure this isn't an infinite
* loop, we must update the FSM with the correct amount of free space
* on each page that proves not to be suitable.) If the FSM has no
* record of a page with enough free space, we give up and extend the
* relation.
* We first try to put the tuple on the same page we last inserted a tuple
* on, as cached in the relcache entry. If that doesn't work, we ask the
* shared Free Space Map to locate a suitable page. Since the FSM's info
* might be out of date, we have to be prepared to loop around and retry
* multiple times. (To insure this isn't an infinite loop, we must update
* the FSM with the correct amount of free space on each page that proves
* not to be suitable.) If the FSM has no record of a page with enough
* free space, we give up and extend the relation.
*
* When use_fsm is false, we either put the tuple onto the existing
* target page or extend the relation.
* When use_fsm is false, we either put the tuple onto the existing target
* page or extend the relation.
*/

targetBlock = relation->rd_targblock;
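(Illustrative aside, not part of this commit.) The comment above describes the target-page selection loop: reuse the cached target block, fall back to the Free Space Map, record the true free space of any page that turns out to be too full so the loop cannot spin forever, and extend the relation only as a last resort. Below is a minimal standalone C sketch of that control flow; page_free, fsm_lookup, fsm_record and get_page_for_tuple are invented stand-ins, not the real PostgreSQL APIs.

/* Minimal sketch of the cached-target / FSM / extend loop described above.
 * All names and types here are simplified stand-ins, not PostgreSQL code. */
#include <stdio.h>

#define INVALID_BLOCK   (-1)
#define PAGE_SIZE       8192

static int page_free[8] = {100, 300, 8000, 50, 0, 0, 0, 0};    /* toy "relation" */
static int nblocks = 4;
static int cached_target = 1;           /* plays the role of rd_targblock */
static int fsm_hint = 2;                /* toy free-space-map answer */

static int fsm_lookup(int needed)       /* stand-in for a GetPageWithFreeSpace-style call */
{
    return (fsm_hint >= 0 && page_free[fsm_hint] >= needed) ? fsm_hint : INVALID_BLOCK;
}

static void fsm_record(int blk, int freespace)  /* keep the map honest so the loop terminates */
{
    page_free[blk] = freespace;
    fsm_hint = INVALID_BLOCK;           /* toy FSM has nothing better to offer */
}

static int get_page_for_tuple(int len)
{
    int blk = cached_target;

    for (;;)
    {
        if (blk == INVALID_BLOCK)
            blk = fsm_lookup(len);      /* ask the FSM when no cached target */
        if (blk == INVALID_BLOCK)
            break;                      /* nothing suitable: fall through and extend */
        if (page_free[blk] >= len)
            return blk;                 /* enough room here, done */
        fsm_record(blk, page_free[blk]);    /* page unsuitable; update FSM, retry */
        blk = INVALID_BLOCK;
    }

    blk = nblocks++;                    /* "extend the relation" with a fresh page */
    page_free[blk] = PAGE_SIZE;
    cached_target = blk;                /* remember it for future insertions */
    return blk;
}

int main(void)
{
    printf("tuple of 200 bytes  -> block %d\n", get_page_for_tuple(200));
    printf("tuple of 8100 bytes -> block %d\n", get_page_for_tuple(8100));
    return 0;
}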
|
||||
@@ -151,9 +149,9 @@ RelationGetBufferForTuple(Relation relation, Size len,
|
||||
targetBlock = GetPageWithFreeSpace(&relation->rd_node, len);
|
||||
|
||||
/*
|
||||
* If the FSM knows nothing of the rel, try the last page before
|
||||
* we give up and extend. This avoids one-tuple-per-page syndrome
|
||||
* during bootstrapping or in a recently-started system.
|
||||
* If the FSM knows nothing of the rel, try the last page before we
|
||||
* give up and extend. This avoids one-tuple-per-page syndrome during
|
||||
* bootstrapping or in a recently-started system.
|
||||
*/
|
||||
if (targetBlock == InvalidBlockNumber)
|
||||
{
|
||||
@@ -168,8 +166,8 @@ RelationGetBufferForTuple(Relation relation, Size len,
|
||||
{
|
||||
/*
|
||||
* Read and exclusive-lock the target block, as well as the other
|
||||
* block if one was given, taking suitable care with lock ordering
|
||||
* and the possibility they are the same block.
|
||||
* block if one was given, taking suitable care with lock ordering and
|
||||
* the possibility they are the same block.
|
||||
*/
|
||||
if (otherBuffer == InvalidBuffer)
|
||||
{
|
||||
@@ -199,8 +197,8 @@ RelationGetBufferForTuple(Relation relation, Size len,
|
||||
}
|
||||
|
||||
/*
|
||||
* Now we can check to see if there's enough free space here. If
|
||||
* so, we're done.
|
||||
* Now we can check to see if there's enough free space here. If so,
|
||||
* we're done.
|
||||
*/
|
||||
pageHeader = (Page) BufferGetPage(buffer);
|
||||
pageFreeSpace = PageGetFreeSpace(pageHeader);
|
||||
@@ -213,9 +211,9 @@ RelationGetBufferForTuple(Relation relation, Size len,
|
||||
|
||||
/*
|
||||
* Not enough space, so we must give up our page locks and pin (if
|
||||
* any) and prepare to look elsewhere. We don't care which order
|
||||
* we unlock the two buffers in, so this can be slightly simpler
|
||||
* than the code above.
|
||||
* any) and prepare to look elsewhere. We don't care which order we
|
||||
* unlock the two buffers in, so this can be slightly simpler than the
|
||||
* code above.
|
||||
*/
|
||||
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
|
||||
if (otherBuffer == InvalidBuffer)
|
||||
@@ -231,8 +229,8 @@ RelationGetBufferForTuple(Relation relation, Size len,
|
||||
break;
|
||||
|
||||
/*
|
||||
* Update FSM as to condition of this page, and ask for another
|
||||
* page to try.
|
||||
* Update FSM as to condition of this page, and ask for another page
|
||||
* to try.
|
||||
*/
|
||||
targetBlock = RecordAndGetPageWithFreeSpace(&relation->rd_node,
|
||||
targetBlock,
|
||||
@@ -243,10 +241,10 @@ RelationGetBufferForTuple(Relation relation, Size len,
|
||||
/*
|
||||
* Have to extend the relation.
|
||||
*
|
||||
* We have to use a lock to ensure no one else is extending the rel at
|
||||
* the same time, else we will both try to initialize the same new
|
||||
* page. We can skip locking for new or temp relations, however,
|
||||
* since no one else could be accessing them.
|
||||
* We have to use a lock to ensure no one else is extending the rel at the
|
||||
* same time, else we will both try to initialize the same new page. We
|
||||
* can skip locking for new or temp relations, however, since no one else
|
||||
* could be accessing them.
|
||||
*/
|
||||
needLock = !RELATION_IS_LOCAL(relation);
|
||||
|
||||
@@ -254,17 +252,16 @@ RelationGetBufferForTuple(Relation relation, Size len,
|
||||
LockRelationForExtension(relation, ExclusiveLock);
|
||||
|
||||
/*
|
||||
* XXX This does an lseek - rather expensive - but at the moment it is
|
||||
* the only way to accurately determine how many blocks are in a
|
||||
* relation. Is it worth keeping an accurate file length in shared
|
||||
* memory someplace, rather than relying on the kernel to do it for
|
||||
* us?
|
||||
* XXX This does an lseek - rather expensive - but at the moment it is the
|
||||
* only way to accurately determine how many blocks are in a relation. Is
|
||||
* it worth keeping an accurate file length in shared memory someplace,
|
||||
* rather than relying on the kernel to do it for us?
|
||||
*/
|
||||
buffer = ReadBuffer(relation, P_NEW);
|
||||
|
||||
/*
|
||||
* We can be certain that locking the otherBuffer first is OK, since
|
||||
* it must have a lower page number.
|
||||
* We can be certain that locking the otherBuffer first is OK, since it
|
||||
* must have a lower page number.
|
||||
*/
|
||||
if (otherBuffer != InvalidBuffer)
|
||||
LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
|
||||
@@ -275,10 +272,10 @@ RelationGetBufferForTuple(Relation relation, Size len,
|
||||
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
|
||||
|
||||
/*
|
||||
* Release the file-extension lock; it's now OK for someone else to
|
||||
* extend the relation some more. Note that we cannot release this
|
||||
* lock before we have buffer lock on the new page, or we risk a
|
||||
* race condition against vacuumlazy.c --- see comments therein.
|
||||
* Release the file-extension lock; it's now OK for someone else to extend
|
||||
* the relation some more. Note that we cannot release this lock before
|
||||
* we have buffer lock on the new page, or we risk a race condition
|
||||
* against vacuumlazy.c --- see comments therein.
|
||||
*/
|
||||
if (needLock)
|
||||
UnlockRelationForExtension(relation, ExclusiveLock);
|
||||
@@ -299,11 +296,11 @@ RelationGetBufferForTuple(Relation relation, Size len,
|
||||
/*
|
||||
* Remember the new page as our target for future insertions.
|
||||
*
|
||||
* XXX should we enter the new page into the free space map immediately,
|
||||
* or just keep it for this backend's exclusive use in the short run
|
||||
* (until VACUUM sees it)? Seems to depend on whether you expect the
|
||||
* current backend to make more insertions or not, which is probably a
|
||||
* good bet most of the time. So for now, don't add it to FSM yet.
|
||||
* XXX should we enter the new page into the free space map immediately, or
|
||||
* just keep it for this backend's exclusive use in the short run (until
|
||||
* VACUUM sees it)? Seems to depend on whether you expect the current
|
||||
* backend to make more insertions or not, which is probably a good bet
|
||||
* most of the time. So for now, don't add it to FSM yet.
|
||||
*/
|
||||
relation->rd_targblock = BufferGetBlockNumber(buffer);
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/access/heap/tuptoaster.c,v 1.52 2005/08/12 01:35:54 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/access/heap/tuptoaster.c,v 1.53 2005/10/15 02:49:09 momjian Exp $
|
||||
*
|
||||
*
|
||||
* INTERFACE ROUTINES
|
||||
@@ -90,8 +90,7 @@ heap_tuple_fetch_attr(varattrib *attr)
|
||||
else
|
||||
{
|
||||
/*
|
||||
* This is a plain value inside of the main tuple - why am I
|
||||
* called?
|
||||
* This is a plain value inside of the main tuple - why am I called?
|
||||
*/
|
||||
result = attr;
|
||||
}
|
||||
@@ -154,8 +153,7 @@ heap_tuple_untoast_attr(varattrib *attr)
|
||||
else
|
||||
|
||||
/*
|
||||
* This is a plain value inside of the main tuple - why am I
|
||||
* called?
|
||||
* This is a plain value inside of the main tuple - why am I called?
|
||||
*/
|
||||
return attr;
|
||||
|
||||
@@ -255,8 +253,8 @@ toast_raw_datum_size(Datum value)
|
||||
else if (VARATT_IS_EXTERNAL(attr))
|
||||
{
|
||||
/*
|
||||
* an uncompressed external attribute has rawsize including the
|
||||
* header (not too consistent!)
|
||||
* an uncompressed external attribute has rawsize including the header
|
||||
* (not too consistent!)
|
||||
*/
|
||||
result = attr->va_content.va_external.va_rawsize;
|
||||
}
|
||||
@@ -274,26 +272,26 @@ toast_raw_datum_size(Datum value)
|
||||
* Return the physical storage size (possibly compressed) of a varlena datum
|
||||
* ----------
|
||||
*/
|
||||
Size
|
||||
Size
|
||||
toast_datum_size(Datum value)
|
||||
{
|
||||
varattrib *attr = (varattrib *) DatumGetPointer(value);
|
||||
varattrib *attr = (varattrib *) DatumGetPointer(value);
|
||||
Size result;
|
||||
|
||||
if (VARATT_IS_EXTERNAL(attr))
|
||||
{
|
||||
/*
|
||||
* Attribute is stored externally - return the extsize whether
|
||||
* compressed or not. We do not count the size of the toast
|
||||
* pointer ... should we?
|
||||
* compressed or not. We do not count the size of the toast pointer
|
||||
* ... should we?
|
||||
*/
|
||||
result = attr->va_content.va_external.va_extsize;
|
||||
}
|
||||
else
|
||||
{
|
||||
/*
|
||||
* Attribute is stored inline either compressed or not, just
|
||||
* calculate the size of the datum in either case.
|
||||
* Attribute is stored inline either compressed or not, just calculate
|
||||
* the size of the datum in either case.
|
||||
*/
|
||||
result = VARSIZE(attr);
|
||||
}
|
||||
@@ -321,12 +319,12 @@ toast_delete(Relation rel, HeapTuple oldtup)
|
||||
* Get the tuple descriptor and break down the tuple into fields.
|
||||
*
|
||||
* NOTE: it's debatable whether to use heap_deformtuple() here or just
|
||||
* heap_getattr() only the varlena columns. The latter could win if
|
||||
* there are few varlena columns and many non-varlena ones. However,
|
||||
* heap_deformtuple costs only O(N) while the heap_getattr way would
|
||||
* cost O(N^2) if there are many varlena columns, so it seems better
|
||||
* to err on the side of linear cost. (We won't even be here unless
|
||||
* there's at least one varlena column, by the way.)
|
||||
* heap_getattr() only the varlena columns. The latter could win if there
|
||||
* are few varlena columns and many non-varlena ones. However,
|
||||
* heap_deformtuple costs only O(N) while the heap_getattr way would cost
|
||||
* O(N^2) if there are many varlena columns, so it seems better to err on
|
||||
* the side of linear cost. (We won't even be here unless there's at
|
||||
* least one varlena column, by the way.)
|
||||
*/
|
||||
tupleDesc = rel->rd_att;
|
||||
att = tupleDesc->attrs;
|
||||
@@ -336,8 +334,8 @@ toast_delete(Relation rel, HeapTuple oldtup)
|
||||
heap_deform_tuple(oldtup, tupleDesc, toast_values, toast_isnull);
|
||||
|
||||
/*
|
||||
* Check for external stored attributes and delete them from the
|
||||
* secondary relation.
|
||||
* Check for external stored attributes and delete them from the secondary
|
||||
* relation.
|
||||
*/
|
||||
for (i = 0; i < numAttrs; i++)
|
||||
{
|
||||
@@ -447,9 +445,9 @@ toast_insert_or_update(Relation rel, HeapTuple newtup, HeapTuple oldtup)
|
||||
else
|
||||
{
|
||||
/*
|
||||
* This attribute isn't changed by this update so we
|
||||
* reuse the original reference to the old value in
|
||||
* the new tuple.
|
||||
* This attribute isn't changed by this update so we reuse
|
||||
* the original reference to the old value in the new
|
||||
* tuple.
|
||||
*/
|
||||
toast_action[i] = 'p';
|
||||
toast_sizes[i] = VARATT_SIZE(toast_values[i]);
|
||||
@@ -582,16 +580,15 @@ toast_insert_or_update(Relation rel, HeapTuple newtup, HeapTuple oldtup)
|
||||
else
|
||||
{
|
||||
/*
|
||||
* incompressible data, ignore on subsequent compression
|
||||
* passes
|
||||
* incompressible data, ignore on subsequent compression passes
|
||||
*/
|
||||
toast_action[i] = 'x';
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Second we look for attributes of attstorage 'x' or 'e' that are
|
||||
* still inline.
|
||||
* Second we look for attributes of attstorage 'x' or 'e' that are still
|
||||
* inline.
|
||||
*/
|
||||
while (MAXALIGN(heap_compute_data_size(tupleDesc,
|
||||
toast_values, toast_isnull)) >
|
||||
@@ -696,8 +693,7 @@ toast_insert_or_update(Relation rel, HeapTuple newtup, HeapTuple oldtup)
|
||||
else
|
||||
{
|
||||
/*
|
||||
* incompressible data, ignore on subsequent compression
|
||||
* passes
|
||||
* incompressible data, ignore on subsequent compression passes
|
||||
*/
|
||||
toast_action[i] = 'x';
|
||||
}
|
||||
@@ -755,8 +751,8 @@ toast_insert_or_update(Relation rel, HeapTuple newtup, HeapTuple oldtup)
|
||||
}
|
||||
|
||||
/*
|
||||
* In the case we toasted any values, we need to build a new heap
|
||||
* tuple with the changed values.
|
||||
* In the case we toasted any values, we need to build a new heap tuple
|
||||
* with the changed values.
|
||||
*/
|
||||
if (need_change)
|
||||
{
|
||||
@@ -798,8 +794,8 @@ toast_insert_or_update(Relation rel, HeapTuple newtup, HeapTuple oldtup)
|
||||
has_nulls ? newtup->t_data->t_bits : NULL);
|
||||
|
||||
/*
|
||||
* In the case we modified a previously modified tuple again, free
|
||||
* the memory from the previous run
|
||||
* In the case we modified a previously modified tuple again, free the
|
||||
* memory from the previous run
|
||||
*/
|
||||
if ((char *) olddata != ((char *) newtup + HEAPTUPLESIZE))
|
||||
pfree(olddata);
|
||||
@@ -906,8 +902,8 @@ toast_flatten_tuple_attribute(Datum value,
|
||||
return value;
|
||||
|
||||
/*
|
||||
* Calculate the new size of the tuple. Header size should not
|
||||
* change, but data size might.
|
||||
* Calculate the new size of the tuple. Header size should not change,
|
||||
* but data size might.
|
||||
*/
|
||||
new_len = offsetof(HeapTupleHeaderData, t_bits);
|
||||
if (has_nulls)
|
||||
@@ -1007,9 +1003,9 @@ toast_save_datum(Relation rel, Datum value)
|
||||
int32 data_todo;
|
||||
|
||||
/*
|
||||
* Open the toast relation and its index. We can use the index to
|
||||
* check uniqueness of the OID we assign to the toasted item, even
|
||||
* though it has additional columns besides OID.
|
||||
* Open the toast relation and its index. We can use the index to check
|
||||
* uniqueness of the OID we assign to the toasted item, even though it has
|
||||
* additional columns besides OID.
|
||||
*/
|
||||
toastrel = heap_open(rel->rd_rel->reltoastrelid, RowExclusiveLock);
|
||||
toasttupDesc = toastrel->rd_att;
|
||||
@@ -1082,11 +1078,11 @@ toast_save_datum(Relation rel, Datum value)
|
||||
|
||||
/*
|
||||
* Create the index entry. We cheat a little here by not using
|
||||
* FormIndexDatum: this relies on the knowledge that the index
|
||||
* columns are the same as the initial columns of the table.
|
||||
* FormIndexDatum: this relies on the knowledge that the index columns
|
||||
* are the same as the initial columns of the table.
|
||||
*
|
||||
* Note also that there had better not be any user-created index on
|
||||
* the TOAST table, since we don't bother to update anything else.
|
||||
* Note also that there had better not be any user-created index on the
|
||||
* TOAST table, since we don't bother to update anything else.
|
||||
*/
|
||||
index_insert(toastidx, t_values, t_isnull,
|
||||
&(toasttup->t_self),
|
||||
@@ -1148,7 +1144,7 @@ toast_delete_datum(Relation rel, Datum value)
|
||||
ScanKeyInit(&toastkey,
|
||||
(AttrNumber) 1,
|
||||
BTEqualStrategyNumber, F_OIDEQ,
|
||||
ObjectIdGetDatum(attr->va_content.va_external.va_valueid));
|
||||
ObjectIdGetDatum(attr->va_content.va_external.va_valueid));
|
||||
|
||||
/*
|
||||
* Find the chunks by index
|
||||
@@ -1219,14 +1215,14 @@ toast_fetch_datum(varattrib *attr)
ScanKeyInit(&toastkey,
(AttrNumber) 1,
BTEqualStrategyNumber, F_OIDEQ,
ObjectIdGetDatum(attr->va_content.va_external.va_valueid));
ObjectIdGetDatum(attr->va_content.va_external.va_valueid));

/*
* Read the chunks by index
*
* Note that because the index is actually on (valueid, chunkidx) we will
* see the chunks in chunkidx order, even though we didn't explicitly
* ask for it.
* Note that because the index is actually on (valueid, chunkidx) we will see
* the chunks in chunkidx order, even though we didn't explicitly ask for
* it.
*/
nextidx = 0;
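(Illustrative aside, not part of this commit.) Because the toast index is on (valueid, chunkidx), the scan hands chunks back already in chunkidx order, so the caller only has to verify the sequence and concatenate. A toy standalone C sketch of that reassembly idea follows; struct chunk and the data in it are made up for illustration, not real toast tuples.

/* Sketch of reassembling a toasted value from chunks that arrive in
 * chunkidx order, as the index scan above guarantees.  Toy data only. */
#include <stdio.h>
#include <string.h>

#define CHUNK_SIZE 4

struct chunk { int chunk_idx; char data[CHUNK_SIZE + 1]; };

int main(void)
{
    /* pretend these came back from an index scan on (valueid, chunkidx) */
    struct chunk chunks[] = { {0, "post"}, {1, "gres"}, {2, "ql"} };
    int   nchunks = 3;
    char  result[32] = "";
    int   nextidx = 0;                      /* expected next chunk number */

    for (int i = 0; i < nchunks; i++)
    {
        if (chunks[i].chunk_idx != nextidx) /* corruption check, as in the real code */
        {
            fprintf(stderr, "unexpected chunk number %d (expected %d)\n",
                    chunks[i].chunk_idx, nextidx);
            return 1;
        }
        strcat(result, chunks[i].data);     /* append this chunk's bytes */
        nextidx++;
    }
    printf("reassembled value: %s\n", result);
    return 0;
}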
|
||||
|
||||
@@ -1367,13 +1363,13 @@ toast_fetch_datum_slice(varattrib *attr, int32 sliceoffset, int32 length)
|
||||
toastidx = index_open(toastrel->rd_rel->reltoastidxid);
|
||||
|
||||
/*
|
||||
* Setup a scan key to fetch from the index. This is either two keys
|
||||
* or three depending on the number of chunks.
|
||||
* Setup a scan key to fetch from the index. This is either two keys or
|
||||
* three depending on the number of chunks.
|
||||
*/
|
||||
ScanKeyInit(&toastkey[0],
|
||||
(AttrNumber) 1,
|
||||
BTEqualStrategyNumber, F_OIDEQ,
|
||||
ObjectIdGetDatum(attr->va_content.va_external.va_valueid));
|
||||
ObjectIdGetDatum(attr->va_content.va_external.va_valueid));
|
||||
|
||||
/*
|
||||
* Use equality condition for one chunk, a range condition otherwise:
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/access/index/genam.c,v 1.48 2005/05/27 23:31:20 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/access/index/genam.c,v 1.49 2005/10/15 02:49:09 momjian Exp $
|
||||
*
|
||||
* NOTES
|
||||
* many of the old access method routines have been turned into
|
||||
@@ -78,15 +78,15 @@ RelationGetIndexScan(Relation indexRelation,
|
||||
scan->numberOfKeys = nkeys;
|
||||
|
||||
/*
|
||||
* We allocate the key space here, but the AM is responsible for
|
||||
* actually filling it from the passed key array.
|
||||
* We allocate the key space here, but the AM is responsible for actually
|
||||
* filling it from the passed key array.
|
||||
*/
|
||||
if (nkeys > 0)
|
||||
scan->keyData = (ScanKey) palloc(sizeof(ScanKeyData) * nkeys);
|
||||
else
|
||||
scan->keyData = NULL;
|
||||
|
||||
scan->is_multiscan = false; /* caller may change this */
|
||||
scan->is_multiscan = false; /* caller may change this */
|
||||
scan->kill_prior_tuple = false;
|
||||
scan->ignore_killed_tuples = true; /* default setting */
|
||||
scan->keys_are_unique = false; /* may be set by index AM */
|
||||
@@ -203,8 +203,8 @@ systable_beginscan(Relation heapRelation,
|
||||
/*
|
||||
* Change attribute numbers to be index column numbers.
|
||||
*
|
||||
* This code could be generalized to search for the index key numbers
|
||||
* to substitute, but for now there's no need.
|
||||
* This code could be generalized to search for the index key numbers to
|
||||
* substitute, but for now there's no need.
|
||||
*/
|
||||
for (i = 0; i < nkeys; i++)
|
||||
{
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/access/index/indexam.c,v 1.85 2005/10/06 02:29:11 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/access/index/indexam.c,v 1.86 2005/10/15 02:49:09 momjian Exp $
|
||||
*
|
||||
* INTERFACE ROUTINES
|
||||
* index_open - open an index relation by relation OID
|
||||
@@ -111,7 +111,7 @@ do { \
|
||||
} while(0)
|
||||
|
||||
static IndexScanDesc index_beginscan_internal(Relation indexRelation,
|
||||
int nkeys, ScanKey key);
|
||||
int nkeys, ScanKey key);
|
||||
|
||||
|
||||
/* ----------------------------------------------------------------
|
||||
@@ -122,14 +122,14 @@ static IndexScanDesc index_beginscan_internal(Relation indexRelation,
|
||||
/* ----------------
|
||||
* index_open - open an index relation by relation OID
|
||||
*
|
||||
* Note: we acquire no lock on the index. A lock is not needed when
|
||||
* Note: we acquire no lock on the index. A lock is not needed when
|
||||
* simply examining the index reldesc; the index's schema information
|
||||
* is considered to be protected by the lock that the caller had better
|
||||
* be holding on the parent relation. Some type of lock should be
|
||||
* be holding on the parent relation. Some type of lock should be
|
||||
* obtained on the index before physically accessing it, however.
|
||||
* This is handled automatically for most uses by index_beginscan
|
||||
* and index_endscan for scan cases, or by ExecOpenIndices and
|
||||
* ExecCloseIndices for update cases. Other callers will need to
|
||||
* ExecCloseIndices for update cases. Other callers will need to
|
||||
* obtain their own locks.
|
||||
*
|
||||
* This is a convenience routine adapted for indexscan use.
|
||||
@@ -241,8 +241,8 @@ index_beginscan(Relation heapRelation,
|
||||
scan = index_beginscan_internal(indexRelation, nkeys, key);
|
||||
|
||||
/*
|
||||
* Save additional parameters into the scandesc. Everything else was
|
||||
* set up by RelationGetIndexScan.
|
||||
* Save additional parameters into the scandesc. Everything else was set
|
||||
* up by RelationGetIndexScan.
|
||||
*/
|
||||
scan->is_multiscan = false;
|
||||
scan->heapRelation = heapRelation;
|
||||
@@ -267,8 +267,8 @@ index_beginscan_multi(Relation indexRelation,
|
||||
scan = index_beginscan_internal(indexRelation, nkeys, key);
|
||||
|
||||
/*
|
||||
* Save additional parameters into the scandesc. Everything else was
|
||||
* set up by RelationGetIndexScan.
|
||||
* Save additional parameters into the scandesc. Everything else was set
|
||||
* up by RelationGetIndexScan.
|
||||
*/
|
||||
scan->is_multiscan = true;
|
||||
scan->xs_snapshot = snapshot;
|
||||
@@ -294,14 +294,14 @@ index_beginscan_internal(Relation indexRelation,
|
||||
* Acquire AccessShareLock for the duration of the scan
|
||||
*
|
||||
* Note: we could get an SI inval message here and consequently have to
|
||||
* rebuild the relcache entry. The refcount increment above ensures
|
||||
* that we will rebuild it and not just flush it...
|
||||
* rebuild the relcache entry. The refcount increment above ensures that
|
||||
* we will rebuild it and not just flush it...
|
||||
*/
|
||||
LockRelation(indexRelation, AccessShareLock);
|
||||
|
||||
/*
|
||||
* LockRelation can clean rd_aminfo structure, so fill procedure
|
||||
* after LockRelation
|
||||
* LockRelation can clean rd_aminfo structure, so fill procedure after
|
||||
* LockRelation
|
||||
*/
|
||||
|
||||
GET_REL_PROCEDURE(ambeginscan);
|
||||
@@ -425,8 +425,8 @@ index_restrpos(IndexScanDesc scan)
|
||||
|
||||
/*
|
||||
* We do not reset got_tuple; so if the scan is actually being
|
||||
* short-circuited by index_getnext, the effective position
|
||||
* restoration is done by restoring unique_tuple_pos.
|
||||
* short-circuited by index_getnext, the effective position restoration is
|
||||
* done by restoring unique_tuple_pos.
|
||||
*/
|
||||
scan->unique_tuple_pos = scan->unique_tuple_mark;
|
||||
|
||||
@@ -454,19 +454,19 @@ index_getnext(IndexScanDesc scan, ScanDirection direction)
|
||||
|
||||
/*
|
||||
* If we already got a tuple and it must be unique, there's no need to
|
||||
* make the index AM look through any additional tuples. (This can
|
||||
* save a useful amount of work in scenarios where there are many dead
|
||||
* tuples due to heavy update activity.)
|
||||
* make the index AM look through any additional tuples. (This can save a
|
||||
* useful amount of work in scenarios where there are many dead tuples due
|
||||
* to heavy update activity.)
|
||||
*
|
||||
* To do this we must keep track of the logical scan position
|
||||
* (before/on/after tuple). Also, we have to be sure to release scan
|
||||
* resources before returning NULL; if we fail to do so then a
|
||||
* multi-index scan can easily run the system out of free buffers. We
|
||||
* can release index-level resources fairly cheaply by calling
|
||||
* index_rescan. This means there are two persistent states as far as
|
||||
* the index AM is concerned: on-tuple and rescanned. If we are
|
||||
* actually asked to re-fetch the single tuple, we have to go through
|
||||
* a fresh indexscan startup, which penalizes that (infrequent) case.
|
||||
* resources before returning NULL; if we fail to do so then a multi-index
|
||||
* scan can easily run the system out of free buffers. We can release
|
||||
* index-level resources fairly cheaply by calling index_rescan. This
|
||||
* means there are two persistent states as far as the index AM is
|
||||
* concerned: on-tuple and rescanned. If we are actually asked to
|
||||
* re-fetch the single tuple, we have to go through a fresh indexscan
|
||||
* startup, which penalizes that (infrequent) case.
|
||||
*/
|
||||
if (scan->keys_are_unique && scan->got_tuple)
|
||||
{
|
||||
@@ -485,19 +485,18 @@ index_getnext(IndexScanDesc scan, ScanDirection direction)
|
||||
if (new_tuple_pos == 0)
|
||||
{
|
||||
/*
|
||||
* We are moving onto the unique tuple from having been off
|
||||
* it. We just fall through and let the index AM do the work.
|
||||
* Note we should get the right answer regardless of scan
|
||||
* direction.
|
||||
* We are moving onto the unique tuple from having been off it. We
|
||||
* just fall through and let the index AM do the work. Note we
|
||||
* should get the right answer regardless of scan direction.
|
||||
*/
|
||||
scan->unique_tuple_pos = 0; /* need to update position */
|
||||
}
|
||||
else
|
||||
{
|
||||
/*
|
||||
* Moving off the tuple; must do amrescan to release
|
||||
* index-level pins before we return NULL. Since index_rescan
|
||||
* will reset my state, must save and restore...
|
||||
* Moving off the tuple; must do amrescan to release index-level
|
||||
* pins before we return NULL. Since index_rescan will reset my
|
||||
* state, must save and restore...
|
||||
*/
|
||||
int unique_tuple_mark = scan->unique_tuple_mark;
|
||||
|
||||
@@ -520,8 +519,7 @@ index_getnext(IndexScanDesc scan, ScanDirection direction)
|
||||
bool found;
|
||||
|
||||
/*
|
||||
* The AM's gettuple proc finds the next tuple matching the scan
|
||||
* keys.
|
||||
* The AM's gettuple proc finds the next tuple matching the scan keys.
|
||||
*/
|
||||
found = DatumGetBool(FunctionCall2(procedure,
|
||||
PointerGetDatum(scan),
|
||||
@@ -556,9 +554,9 @@ index_getnext(IndexScanDesc scan, ScanDirection direction)
|
||||
continue;
|
||||
|
||||
/*
|
||||
* If we can't see it, maybe no one else can either. Check to see
|
||||
* if the tuple is dead to all transactions. If so, signal the
|
||||
* index AM to not return it on future indexscans.
|
||||
* If we can't see it, maybe no one else can either. Check to see if
|
||||
* the tuple is dead to all transactions. If so, signal the index AM
|
||||
* to not return it on future indexscans.
|
||||
*
|
||||
* We told heap_release_fetch to keep a pin on the buffer, so we can
|
||||
* re-access the tuple here. But we must re-lock the buffer first.
|
||||
@@ -576,8 +574,8 @@ index_getnext(IndexScanDesc scan, ScanDirection direction)
|
||||
scan->got_tuple = true;
|
||||
|
||||
/*
|
||||
* If we just fetched a known-unique tuple, then subsequent calls will
|
||||
* go through the short-circuit code above. unique_tuple_pos has been
|
||||
* If we just fetched a known-unique tuple, then subsequent calls will go
|
||||
* through the short-circuit code above. unique_tuple_pos has been
|
||||
* initialized to 0, which is the correct state ("on row").
|
||||
*/
|
||||
|
||||
@@ -805,11 +803,10 @@ index_getprocinfo(Relation irel,
|
||||
procId = loc[procindex];
|
||||
|
||||
/*
|
||||
* Complain if function was not found during
|
||||
* IndexSupportInitialize. This should not happen unless the
|
||||
* system tables contain bogus entries for the index opclass. (If
|
||||
* an AM wants to allow a support function to be optional, it can
|
||||
* use index_getprocid.)
|
||||
* Complain if function was not found during IndexSupportInitialize.
|
||||
* This should not happen unless the system tables contain bogus
|
||||
* entries for the index opclass. (If an AM wants to allow a support
|
||||
* function to be optional, it can use index_getprocid.)
|
||||
*/
|
||||
if (!RegProcedureIsValid(procId))
|
||||
elog(ERROR, "missing support function %d for attribute %d of index \"%s\"",
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.126 2005/10/12 17:18:03 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.127 2005/10/15 02:49:09 momjian Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@@ -93,30 +93,29 @@ top:

/*
* If the page was split between the time that we surrendered our read
* lock and acquired our write lock, then this page may no longer be
* the right place for the key we want to insert. In this case, we
* need to move right in the tree. See Lehman and Yao for an
* excruciatingly precise description.
* lock and acquired our write lock, then this page may no longer be the
* right place for the key we want to insert. In this case, we need to
* move right in the tree. See Lehman and Yao for an excruciatingly
* precise description.
*/
buf = _bt_moveright(rel, buf, natts, itup_scankey, false, BT_WRITE);
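(Illustrative aside, not part of this commit.) The move-right step compensates for concurrent splits: if the key being inserted now exceeds this page's high key, the correct page must lie somewhere to the right. A toy standalone C sketch of that walk follows; toy_page and move_right are invented names over a made-up page layout, not the real btree structures or _bt_moveright.

/* Toy sketch of the Lehman & Yao "move right" step: follow right links
 * while the search key exceeds the page's high key.  The page layout is
 * invented for illustration only. */
#include <stdio.h>

#define NO_RIGHT_LINK (-1)

struct toy_page { int high_key; int right_link; };  /* high_key bounds the keys stored here */

static int move_right(const struct toy_page *pages, int blk, int search_key)
{
    while (pages[blk].right_link != NO_RIGHT_LINK && search_key > pages[blk].high_key)
        blk = pages[blk].right_link;    /* a split moved our key range to the right */
    return blk;
}

int main(void)
{
    /* page 0 once covered everything, but splits pushed keys > 10 to page 1,
     * and keys > 20 further right to page 2 (the rightmost page, no high key) */
    struct toy_page pages[] = { {10, 1}, {20, 2}, {0, NO_RIGHT_LINK} };

    printf("key 7  belongs on page %d\n", move_right(pages, 0, 7));
    printf("key 25 belongs on page %d\n", move_right(pages, 0, 25));
    return 0;
}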
|
||||
|
||||
/*
|
||||
* If we're not allowing duplicates, make sure the key isn't already
|
||||
* in the index.
|
||||
* If we're not allowing duplicates, make sure the key isn't already in
|
||||
* the index.
|
||||
*
|
||||
* NOTE: obviously, _bt_check_unique can only detect keys that are
|
||||
* already in the index; so it cannot defend against concurrent
|
||||
* insertions of the same key. We protect against that by means of
|
||||
* holding a write lock on the target page. Any other would-be
|
||||
* inserter of the same key must acquire a write lock on the same
|
||||
* target page, so only one would-be inserter can be making the check
|
||||
* at one time. Furthermore, once we are past the check we hold write
|
||||
* locks continuously until we have performed our insertion, so no
|
||||
* later inserter can fail to see our insertion. (This requires some
|
||||
* care in _bt_insertonpg.)
|
||||
* NOTE: obviously, _bt_check_unique can only detect keys that are already in
|
||||
* the index; so it cannot defend against concurrent insertions of the
|
||||
* same key. We protect against that by means of holding a write lock on
|
||||
* the target page. Any other would-be inserter of the same key must
|
||||
* acquire a write lock on the same target page, so only one would-be
|
||||
* inserter can be making the check at one time. Furthermore, once we are
|
||||
* past the check we hold write locks continuously until we have performed
|
||||
* our insertion, so no later inserter can fail to see our insertion.
|
||||
* (This requires some care in _bt_insertonpg.)
|
||||
*
|
||||
* If we must wait for another xact, we release the lock while waiting,
|
||||
* and then must start over completely.
|
||||
* If we must wait for another xact, we release the lock while waiting, and
|
||||
* then must start over completely.
|
||||
*/
|
||||
if (index_is_unique)
|
||||
{
|
||||
@@ -167,8 +166,8 @@ _bt_check_unique(Relation rel, BTItem btitem, Relation heapRel,
|
||||
maxoff = PageGetMaxOffsetNumber(page);
|
||||
|
||||
/*
|
||||
* Find first item >= proposed new item. Note we could also get a
|
||||
* pointer to end-of-page here.
|
||||
* Find first item >= proposed new item. Note we could also get a pointer
|
||||
* to end-of-page here.
|
||||
*/
|
||||
offset = _bt_binsrch(rel, buf, natts, itup_scankey, false);
|
||||
|
||||
@@ -194,24 +193,24 @@ _bt_check_unique(Relation rel, BTItem btitem, Relation heapRel,
|
||||
/*
|
||||
* We can skip items that are marked killed.
|
||||
*
|
||||
* Formerly, we applied _bt_isequal() before checking the kill
|
||||
* flag, so as to fall out of the item loop as soon as
|
||||
* possible. However, in the presence of heavy update activity
|
||||
* an index may contain many killed items with the same key;
|
||||
* running _bt_isequal() on each killed item gets expensive.
|
||||
* Furthermore it is likely that the non-killed version of
|
||||
* each key appears first, so that we didn't actually get to
|
||||
* exit any sooner anyway. So now we just advance over killed
|
||||
* items as quickly as we can. We only apply _bt_isequal()
|
||||
* when we get to a non-killed item or the end of the page.
|
||||
* Formerly, we applied _bt_isequal() before checking the kill flag,
|
||||
* so as to fall out of the item loop as soon as possible.
|
||||
* However, in the presence of heavy update activity an index may
|
||||
* contain many killed items with the same key; running
|
||||
* _bt_isequal() on each killed item gets expensive. Furthermore
|
||||
* it is likely that the non-killed version of each key appears
|
||||
* first, so that we didn't actually get to exit any sooner
|
||||
* anyway. So now we just advance over killed items as quickly as
|
||||
* we can. We only apply _bt_isequal() when we get to a non-killed
|
||||
* item or the end of the page.
|
||||
*/
|
||||
if (!ItemIdDeleted(curitemid))
|
||||
{
|
||||
/*
|
||||
* _bt_compare returns 0 for (1,NULL) and (1,NULL) -
|
||||
* this's how we handling NULLs - and so we must not use
|
||||
* _bt_compare in real comparison, but only for
|
||||
* ordering/finding items on pages. - vadim 03/24/97
|
||||
* _bt_compare returns 0 for (1,NULL) and (1,NULL) - this's
|
||||
* how we handling NULLs - and so we must not use _bt_compare
|
||||
* in real comparison, but only for ordering/finding items on
|
||||
* pages. - vadim 03/24/97
|
||||
*/
|
||||
if (!_bt_isequal(itupdesc, page, offset, natts, itup_scankey))
|
||||
break; /* we're past all the equal tuples */
|
||||
@@ -246,15 +245,15 @@ _bt_check_unique(Relation rel, BTItem btitem, Relation heapRel,
|
||||
*/
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_UNIQUE_VIOLATION),
|
||||
errmsg("duplicate key violates unique constraint \"%s\"",
|
||||
RelationGetRelationName(rel))));
|
||||
errmsg("duplicate key violates unique constraint \"%s\"",
|
||||
RelationGetRelationName(rel))));
|
||||
}
|
||||
else if (htup.t_data != NULL)
|
||||
{
|
||||
/*
|
||||
* Hmm, if we can't see the tuple, maybe it can be
|
||||
* marked killed. This logic should match
|
||||
* index_getnext and btgettuple.
|
||||
* Hmm, if we can't see the tuple, maybe it can be marked
|
||||
* killed. This logic should match index_getnext and
|
||||
* btgettuple.
|
||||
*/
|
||||
LockBuffer(hbuffer, BUFFER_LOCK_SHARE);
|
||||
if (HeapTupleSatisfiesVacuum(htup.t_data, RecentGlobalXmin,
|
||||
@@ -377,15 +376,15 @@ _bt_insertonpg(Relation rel,
|
||||
itemsz = IndexTupleDSize(btitem->bti_itup)
|
||||
+ (sizeof(BTItemData) - sizeof(IndexTupleData));
|
||||
|
||||
itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but
|
||||
* we need to be consistent */
|
||||
itemsz = MAXALIGN(itemsz); /* be safe, PageAddItem will do this but we
|
||||
* need to be consistent */
|
||||
|
||||
/*
|
||||
* Check whether the item can fit on a btree page at all. (Eventually,
|
||||
* we ought to try to apply TOAST methods if not.) We actually need to
|
||||
* be able to fit three items on every page, so restrict any one item
|
||||
* to 1/3 the per-page available space. Note that at this point,
|
||||
* itemsz doesn't include the ItemId.
|
||||
* Check whether the item can fit on a btree page at all. (Eventually, we
|
||||
* ought to try to apply TOAST methods if not.) We actually need to be
|
||||
* able to fit three items on every page, so restrict any one item to 1/3
|
||||
* the per-page available space. Note that at this point, itemsz doesn't
|
||||
* include the ItemId.
|
||||
*/
|
||||
if (itemsz > BTMaxItemSize(page))
|
||||
ereport(ERROR,
|
||||
@@ -393,9 +392,9 @@ _bt_insertonpg(Relation rel,
|
||||
errmsg("index row size %lu exceeds btree maximum, %lu",
|
||||
(unsigned long) itemsz,
|
||||
(unsigned long) BTMaxItemSize(page)),
|
||||
errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
|
||||
"Consider a function index of an MD5 hash of the value, "
|
||||
"or use full text indexing.")));
|
||||
errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
|
||||
"Consider a function index of an MD5 hash of the value, "
|
||||
"or use full text indexing.")));
|
||||
|
||||
/*
|
||||
* Determine exactly where new item will go.
|
||||
@@ -432,11 +431,11 @@ _bt_insertonpg(Relation rel,
|
||||
/*
|
||||
* step right to next non-dead page
|
||||
*
|
||||
* must write-lock that page before releasing write lock on
|
||||
* current page; else someone else's _bt_check_unique scan
|
||||
* could fail to see our insertion. write locks on
|
||||
* intermediate dead pages won't do because we don't know when
|
||||
* they will get de-linked from the tree.
|
||||
* must write-lock that page before releasing write lock on current
|
||||
* page; else someone else's _bt_check_unique scan could fail to
|
||||
* see our insertion. write locks on intermediate dead pages
|
||||
* won't do because we don't know when they will get de-linked
|
||||
* from the tree.
|
||||
*/
|
||||
Buffer rbuf = InvalidBuffer;
|
||||
|
||||
@@ -459,9 +458,9 @@ _bt_insertonpg(Relation rel,
|
||||
}
|
||||
|
||||
/*
|
||||
* Now we are on the right page, so find the insert position. If
|
||||
* we moved right at all, we know we should insert at the start of
|
||||
* the page, else must find the position by searching.
|
||||
* Now we are on the right page, so find the insert position. If we
|
||||
* moved right at all, we know we should insert at the start of the
|
||||
* page, else must find the position by searching.
|
||||
*/
|
||||
if (movedright)
|
||||
newitemoff = P_FIRSTDATAKEY(lpageop);
|
||||
@@ -472,9 +471,9 @@ _bt_insertonpg(Relation rel,
|
||||
/*
|
||||
* Do we need to split the page to fit the item on it?
|
||||
*
|
||||
* Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
|
||||
* so this comparison is correct even though we appear to be
|
||||
* accounting only for the item and not for its line pointer.
|
||||
* Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result, so
|
||||
* this comparison is correct even though we appear to be accounting only
|
||||
* for the item and not for its line pointer.
|
||||
*/
|
||||
if (PageGetFreeSpace(page) < itemsz)
|
||||
{
|
||||
@@ -522,12 +521,11 @@ _bt_insertonpg(Relation rel,
|
||||
itup_blkno = BufferGetBlockNumber(buf);
|
||||
|
||||
/*
|
||||
* If we are doing this insert because we split a page that was
|
||||
* the only one on its tree level, but was not the root, it may
|
||||
* have been the "fast root". We need to ensure that the fast
|
||||
* root link points at or above the current page. We can safely
|
||||
* acquire a lock on the metapage here --- see comments for
|
||||
* _bt_newroot().
|
||||
* If we are doing this insert because we split a page that was the
|
||||
* only one on its tree level, but was not the root, it may have been
|
||||
* the "fast root". We need to ensure that the fast root link points
|
||||
* at or above the current page. We can safely acquire a lock on the
|
||||
* metapage here --- see comments for _bt_newroot().
|
||||
*/
|
||||
if (split_only_page)
|
||||
{
|
||||
@@ -692,11 +690,11 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
|
||||
lopaque->btpo.level = ropaque->btpo.level = oopaque->btpo.level;
|
||||
|
||||
/*
|
||||
* If the page we're splitting is not the rightmost page at its level
|
||||
* in the tree, then the first entry on the page is the high key for
|
||||
* the page. We need to copy that to the right half. Otherwise
|
||||
* (meaning the rightmost page case), all the items on the right half
|
||||
* will be user data.
|
||||
* If the page we're splitting is not the rightmost page at its level in
|
||||
* the tree, then the first entry on the page is the high key for the
|
||||
* page. We need to copy that to the right half. Otherwise (meaning the
|
||||
* rightmost page case), all the items on the right half will be user
|
||||
* data.
|
||||
*/
|
||||
rightoff = P_HIKEY;
|
||||
|
||||
@@ -712,9 +710,9 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
|
||||
}
|
||||
|
||||
/*
|
||||
* The "high key" for the new left page will be the first key that's
|
||||
* going to go into the new right page. This might be either the
|
||||
* existing data item at position firstright, or the incoming tuple.
|
||||
* The "high key" for the new left page will be the first key that's going
|
||||
* to go into the new right page. This might be either the existing data
|
||||
* item at position firstright, or the incoming tuple.
|
||||
*/
|
||||
leftoff = P_HIKEY;
|
||||
if (!newitemonleft && newitemoff == firstright)
|
||||
@@ -806,8 +804,8 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
|
||||
/*
|
||||
* We have to grab the right sibling (if any) and fix the prev pointer
|
||||
* there. We are guaranteed that this is deadlock-free since no other
|
||||
* writer will be holding a lock on that page and trying to move left,
|
||||
* and all readers release locks on a page before trying to fetch its
|
||||
* writer will be holding a lock on that page and trying to move left, and
|
||||
* all readers release locks on a page before trying to fetch its
|
||||
* neighbors.
|
||||
*/
|
||||
|
||||
@@ -821,8 +819,8 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
|
||||
}
|
||||
|
||||
/*
|
||||
* Right sibling is locked, new siblings are prepared, but original
|
||||
* page is not updated yet. Log changes before continuing.
|
||||
* Right sibling is locked, new siblings are prepared, but original page
|
||||
* is not updated yet. Log changes before continuing.
|
||||
*
|
||||
* NO EREPORT(ERROR) till right sibling is updated.
|
||||
*/
|
||||
@@ -850,10 +848,10 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
|
||||
xlrec.level = lopaque->btpo.level;
|
||||
|
||||
/*
|
||||
* Direct access to page is not good but faster - we should
|
||||
* implement some new func in page API. Note we only store the
|
||||
* tuples themselves, knowing that the item pointers are in the
|
||||
* same order and can be reconstructed by scanning the tuples.
|
||||
* Direct access to page is not good but faster - we should implement
|
||||
* some new func in page API. Note we only store the tuples
|
||||
* themselves, knowing that the item pointers are in the same order
|
||||
* and can be reconstructed by scanning the tuples.
|
||||
*/
|
||||
xlrec.leftlen = ((PageHeader) leftpage)->pd_special -
|
||||
((PageHeader) leftpage)->pd_upper;
|
||||
@@ -903,13 +901,13 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
|
||||
}
|
||||
|
||||
/*
|
||||
* By here, the original data page has been split into two new halves,
|
||||
* and these are correct. The algorithm requires that the left page
|
||||
* never move during a split, so we copy the new left page back on top
|
||||
* of the original. Note that this is not a waste of time, since we
|
||||
* also require (in the page management code) that the center of a
|
||||
* page always be clean, and the most efficient way to guarantee this
|
||||
* is just to compact the data by reinserting it into a new left page.
|
||||
* By here, the original data page has been split into two new halves, and
|
||||
* these are correct. The algorithm requires that the left page never
|
||||
* move during a split, so we copy the new left page back on top of the
|
||||
* original. Note that this is not a waste of time, since we also require
|
||||
* (in the page management code) that the center of a page always be
|
||||
* clean, and the most efficient way to guarantee this is just to compact
|
||||
* the data by reinserting it into a new left page.
|
||||
*/
|
||||
|
||||
PageRestoreTempPage(leftpage, origpage);
|
||||
@@ -984,13 +982,13 @@ _bt_findsplitloc(Relation rel,
MAXALIGN(sizeof(BTPageOpaqueData));

/*
* Finding the best possible split would require checking all the
* possible split points, because of the high-key and left-key special
* cases. That's probably more work than it's worth; instead, stop as
* soon as we find a "good-enough" split, where good-enough is defined
* as an imbalance in free space of no more than pagesize/16
* (arbitrary...) This should let us stop near the middle on most
* pages, instead of plowing to the end.
* Finding the best possible split would require checking all the possible
* split points, because of the high-key and left-key special cases.
* That's probably more work than it's worth; instead, stop as soon as we
* find a "good-enough" split, where good-enough is defined as an
* imbalance in free space of no more than pagesize/16 (arbitrary...) This
* should let us stop near the middle on most pages, instead of plowing to
* the end.
*/
goodenough = leftspace / 16;
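(Illustrative aside, not part of this commit.) The heuristic above accepts the first split point whose free-space imbalance falls within leftspace/16 instead of evaluating every possible position. A toy standalone C example of that early-exit test follows; the page and item sizes are invented and none of the real _bt_findsplitloc bookkeeping (high key, left key, best-seen fallback) is modelled.

/* Toy illustration of the "good enough" split test described above:
 * accept the first candidate split whose left/right free-space imbalance
 * is within leftspace/16.  Numbers are made up, not real page accounting. */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    int leftspace  = 8000;              /* usable bytes on the would-be left page */
    int rightspace = 8000;              /* usable bytes on the would-be right page */
    int goodenough = leftspace / 16;    /* 500 bytes of allowed imbalance */
    int item_sizes[] = {900, 700, 1200, 800, 600, 1000, 900, 1100};
    int nitems = 8, total = 0, tosplit = 0;

    for (int i = 0; i < nitems; i++)
        total += item_sizes[i];

    for (int i = 0; i < nitems; i++)
    {
        tosplit += item_sizes[i];       /* bytes that would stay on the left page */
        int leftfree  = leftspace - tosplit;
        int rightfree = rightspace - (total - tosplit);
        int delta     = abs(leftfree - rightfree);

        if (delta <= goodenough)
        {
            printf("split after item %d: leftfree=%d rightfree=%d delta=%d\n",
                   i, leftfree, rightfree, delta);
            return 0;                   /* stop early instead of scanning every position */
        }
    }
    printf("no split within the imbalance budget; would fall back to best seen\n");
    return 0;
}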
|
||||
|
||||
@@ -1006,8 +1004,8 @@ _bt_findsplitloc(Relation rel,
|
||||
dataitemtotal = rightspace - (int) PageGetFreeSpace(page);
|
||||
|
||||
/*
|
||||
* Scan through the data items and calculate space usage for a split
|
||||
* at each possible position.
|
||||
* Scan through the data items and calculate space usage for a split at
|
||||
* each possible position.
|
||||
*/
|
||||
dataitemstoleft = 0;
|
||||
maxoff = PageGetMaxOffsetNumber(page);
|
||||
@@ -1024,9 +1022,9 @@ _bt_findsplitloc(Relation rel,
|
||||
itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
|
||||
|
||||
/*
|
||||
* We have to allow for the current item becoming the high key of
|
||||
* the left page; therefore it counts against left space as well
|
||||
* as right space.
|
||||
* We have to allow for the current item becoming the high key of the
|
||||
* left page; therefore it counts against left space as well as right
|
||||
* space.
|
||||
*/
|
||||
leftfree = leftspace - dataitemstoleft - (int) itemsz;
|
||||
rightfree = rightspace - (dataitemtotal - dataitemstoleft);
|
||||
@@ -1058,8 +1056,8 @@ _bt_findsplitloc(Relation rel,
|
||||
}
|
||||
|
||||
/*
|
||||
* I believe it is not possible to fail to find a feasible split, but
|
||||
* just in case ...
|
||||
* I believe it is not possible to fail to find a feasible split, but just
|
||||
* in case ...
|
||||
*/
|
||||
if (!state.have_split)
|
||||
elog(ERROR, "could not find a feasible split point for \"%s\"",
|
||||
@@ -1105,8 +1103,7 @@ _bt_checksplitloc(FindSplitData *state, OffsetNumber firstright,
|
||||
{
|
||||
/*
|
||||
* On a rightmost page, try to equalize right free space with
|
||||
* twice the left free space. See comments for
|
||||
* _bt_findsplitloc.
|
||||
* twice the left free space. See comments for _bt_findsplitloc.
|
||||
*/
|
||||
delta = (2 * leftfree) - rightfree;
|
||||
}
|
||||
@@ -1153,19 +1150,18 @@ _bt_insert_parent(Relation rel,
|
||||
bool is_only)
|
||||
{
|
||||
/*
|
||||
* Here we have to do something Lehman and Yao don't talk about: deal
|
||||
* with a root split and construction of a new root. If our stack is
|
||||
* empty then we have just split a node on what had been the root
|
||||
* level when we descended the tree. If it was still the root then we
|
||||
* perform a new-root construction. If it *wasn't* the root anymore,
|
||||
* search to find the next higher level that someone constructed
|
||||
* meanwhile, and find the right place to insert as for the normal
|
||||
* case.
|
||||
* Here we have to do something Lehman and Yao don't talk about: deal with
|
||||
* a root split and construction of a new root. If our stack is empty
|
||||
* then we have just split a node on what had been the root level when we
|
||||
* descended the tree. If it was still the root then we perform a
|
||||
* new-root construction. If it *wasn't* the root anymore, search to find
|
||||
* the next higher level that someone constructed meanwhile, and find the
|
||||
* right place to insert as for the normal case.
|
||||
*
|
||||
* If we have to search for the parent level, we do so by re-descending
|
||||
* from the root. This is not super-efficient, but it's rare enough
|
||||
* not to matter. (This path is also taken when called from WAL
|
||||
* recovery --- we have no stack in that case.)
|
||||
* If we have to search for the parent level, we do so by re-descending from
|
||||
* the root. This is not super-efficient, but it's rare enough not to
|
||||
* matter. (This path is also taken when called from WAL recovery --- we
|
||||
* have no stack in that case.)
|
||||
*/
|
||||
if (is_root)
|
||||
{
|
||||
@@ -1219,9 +1215,9 @@ _bt_insert_parent(Relation rel,
|
||||
/*
|
||||
* Find the parent buffer and get the parent page.
|
||||
*
|
||||
* Oops - if we were moved right then we need to change stack item!
|
||||
* We want to find parent pointing to where we are, right ? -
|
||||
* vadim 05/27/97
|
||||
* Oops - if we were moved right then we need to change stack item! We
|
||||
* want to find parent pointing to where we are, right ? - vadim
|
||||
* 05/27/97
|
||||
*/
|
||||
ItemPointerSet(&(stack->bts_btitem.bti_itup.t_tid),
|
||||
bknum, P_HIKEY);
|
||||
@@ -1291,9 +1287,9 @@ _bt_getstackbuf(Relation rel, BTStack stack, int access)
|
||||
maxoff = PageGetMaxOffsetNumber(page);
|
||||
|
||||
/*
|
||||
* start = InvalidOffsetNumber means "search the whole page".
|
||||
* We need this test anyway due to possibility that page has a
|
||||
* high key now when it didn't before.
|
||||
* start = InvalidOffsetNumber means "search the whole page". We
|
||||
* need this test anyway due to possibility that page has a high
|
||||
* key now when it didn't before.
|
||||
*/
|
||||
if (start < minoff)
|
||||
start = minoff;
|
||||
@@ -1307,8 +1303,8 @@ _bt_getstackbuf(Relation rel, BTStack stack, int access)
|
||||
|
||||
/*
|
||||
* These loops will check every item on the page --- but in an
|
||||
* order that's attuned to the probability of where it
|
||||
* actually is. Scan to the right first, then to the left.
|
||||
* order that's attuned to the probability of where it actually
|
||||
* is. Scan to the right first, then to the left.
|
||||
*/
|
||||
for (offnum = start;
|
||||
offnum <= maxoff;
|
||||
@@ -1424,9 +1420,9 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
|
||||
metad->btm_fastlevel = rootopaque->btpo.level;
|
||||
|
||||
/*
|
||||
* Create downlink item for left page (old root). Since this will be
|
||||
* the first item in a non-leaf page, it implicitly has minus-infinity
|
||||
* key value, so we need not store any actual key in it.
|
||||
* Create downlink item for left page (old root). Since this will be the
|
||||
* first item in a non-leaf page, it implicitly has minus-infinity key
|
||||
* value, so we need not store any actual key in it.
|
||||
*/
|
||||
itemsz = sizeof(BTItemData);
|
||||
new_item = (BTItem) palloc(itemsz);
|
||||
@@ -1434,17 +1430,17 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
|
||||
ItemPointerSet(&(new_item->bti_itup.t_tid), lbkno, P_HIKEY);
|
||||
|
||||
/*
|
||||
* Insert the left page pointer into the new root page. The root page
|
||||
* is the rightmost page on its level so there is no "high key" in it;
|
||||
* the two items will go into positions P_HIKEY and P_FIRSTKEY.
|
||||
* Insert the left page pointer into the new root page. The root page is
|
||||
* the rightmost page on its level so there is no "high key" in it; the
|
||||
* two items will go into positions P_HIKEY and P_FIRSTKEY.
|
||||
*/
|
||||
if (PageAddItem(rootpage, (Item) new_item, itemsz, P_HIKEY, LP_USED) == InvalidOffsetNumber)
|
||||
elog(PANIC, "failed to add leftkey to new root page");
|
||||
pfree(new_item);
|
||||
|
||||
/*
|
||||
* Create downlink item for right page. The key for it is obtained
|
||||
* from the "high key" position in the left page.
|
||||
* Create downlink item for right page. The key for it is obtained from
|
||||
* the "high key" position in the left page.
|
||||
*/
|
||||
itemid = PageGetItemId(lpage, P_HIKEY);
|
||||
itemsz = ItemIdGetLength(itemid);
|
||||
@@ -1476,8 +1472,8 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
|
||||
rdata[0].next = &(rdata[1]);
|
||||
|
||||
/*
|
||||
* Direct access to page is not good but faster - we should
|
||||
* implement some new func in page API.
|
||||
* Direct access to page is not good but faster - we should implement
|
||||
* some new func in page API.
|
||||
*/
|
||||
rdata[1].data = (char *) rootpage + ((PageHeader) rootpage)->pd_upper;
|
||||
rdata[1].len = ((PageHeader) rootpage)->pd_special -
|
||||
|
||||
@@ -9,7 +9,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.87 2005/08/12 14:34:14 tgl Exp $
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.88 2005/10/15 02:49:09 momjian Exp $
*
* NOTES
* Postgres btree pages look like ordinary relation pages. The opaque
@@ -115,8 +115,8 @@ _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level)
metaopaque->btpo_flags = BTP_META;

/*
* Set pd_lower just past the end of the metadata. This is not
* essential but it makes the page look compressible to xlog.c.
* Set pd_lower just past the end of the metadata. This is not essential
* but it makes the page look compressible to xlog.c.
*/
((PageHeader) page)->pd_lower =
((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
@@ -198,26 +198,26 @@ _bt_getroot(Relation rel, int access)
LockBuffer(metabuf, BT_WRITE);

/*
* Race condition: if someone else initialized the metadata
* between the time we released the read lock and acquired the
* write lock, we must avoid doing it again.
* Race condition: if someone else initialized the metadata between
* the time we released the read lock and acquired the write lock, we
* must avoid doing it again.
*/
if (metad->btm_root != P_NONE)
{
/*
* Metadata initialized by someone else. In order to
* guarantee no deadlocks, we have to release the metadata
* page and start all over again. (Is that really true? But
* it's hardly worth trying to optimize this case.)
* Metadata initialized by someone else. In order to guarantee no
* deadlocks, we have to release the metadata page and start all
* over again. (Is that really true? But it's hardly worth trying
* to optimize this case.)
*/
_bt_relbuf(rel, metabuf);
return _bt_getroot(rel, access);
}

/*
* Get, initialize, write, and leave a lock of the appropriate
* type on the new root page. Since this is the first page in the
* tree, it's a leaf as well as the root.
* Get, initialize, write, and leave a lock of the appropriate type on
* the new root page. Since this is the first page in the tree, it's
* a leaf as well as the root.
*/
rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
rootblkno = BufferGetBlockNumber(rootbuf);
@@ -266,9 +266,9 @@ _bt_getroot(Relation rel, int access)
_bt_wrtnorelbuf(rel, rootbuf);

/*
* swap root write lock for read lock. There is no danger of
* anyone else accessing the new root page while it's unlocked,
* since no one else knows where it is yet.
* swap root write lock for read lock. There is no danger of anyone
* else accessing the new root page while it's unlocked, since no one
* else knows where it is yet.
*/
LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
LockBuffer(rootbuf, BT_READ);
@@ -312,8 +312,8 @@ _bt_getroot(Relation rel, int access)
}

/*
* By here, we have a pin and read lock on the root page, and no lock
* set on the metadata page. Return the root page's buffer.
* By here, we have a pin and read lock on the root page, and no lock set
* on the metadata page. Return the root page's buffer.
*/
return rootbuf;
}
@@ -435,27 +435,26 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access)
/*
* First see if the FSM knows of any free pages.
*
* We can't trust the FSM's report unreservedly; we have to check
* that the page is still free. (For example, an already-free
* page could have been re-used between the time the last VACUUM
* scanned it and the time the VACUUM made its FSM updates.)
* We can't trust the FSM's report unreservedly; we have to check that
* the page is still free. (For example, an already-free page could
* have been re-used between the time the last VACUUM scanned it and
* the time the VACUUM made its FSM updates.)
*
* In fact, it's worse than that: we can't even assume that it's safe
* to take a lock on the reported page. If somebody else has a
* lock on it, or even worse our own caller does, we could
* deadlock. (The own-caller scenario is actually not improbable.
* Consider an index on a serial or timestamp column. Nearly all
* splits will be at the rightmost page, so it's entirely likely
* that _bt_split will call us while holding a lock on the page
* most recently acquired from FSM. A VACUUM running concurrently
* with the previous split could well have placed that page back
* in FSM.)
* In fact, it's worse than that: we can't even assume that it's safe to
* take a lock on the reported page. If somebody else has a lock on
* it, or even worse our own caller does, we could deadlock. (The
* own-caller scenario is actually not improbable. Consider an index
* on a serial or timestamp column. Nearly all splits will be at the
* rightmost page, so it's entirely likely that _bt_split will call us
* while holding a lock on the page most recently acquired from FSM.
* A VACUUM running concurrently with the previous split could well
* have placed that page back in FSM.)
*
* To get around that, we ask for only a conditional lock on the
* reported page. If we fail, then someone else is using the
* page, and we may reasonably assume it's not free. (If we
* happen to be wrong, the worst consequence is the page will be
* lost to use till the next VACUUM, which is no big problem.)
* To get around that, we ask for only a conditional lock on the reported
* page. If we fail, then someone else is using the page, and we may
* reasonably assume it's not free. (If we happen to be wrong, the
* worst consequence is the page will be lost to use till the next
* VACUUM, which is no big problem.)
*/
for (;;)
{
@@ -486,10 +485,10 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access)
/*
* Extend the relation by one page.
*
* We have to use a lock to ensure no one else is extending the rel
* at the same time, else we will both try to initialize the same
* new page. We can skip locking for new or temp relations,
* however, since no one else could be accessing them.
* We have to use a lock to ensure no one else is extending the rel at
* the same time, else we will both try to initialize the same new
* page. We can skip locking for new or temp relations, however,
* since no one else could be accessing them.
*/
needLock = !RELATION_IS_LOCAL(rel);

@@ -504,8 +503,8 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access)
/*
* Release the file-extension lock; it's now OK for someone else to
* extend the relation some more. Note that we cannot release this
* lock before we have buffer lock on the new page, or we risk a
* race condition against btvacuumcleanup --- see comments therein.
* lock before we have buffer lock on the new page, or we risk a race
* condition against btvacuumcleanup --- see comments therein.
*/
if (needLock)
UnlockRelationForExtension(rel, ExclusiveLock);
@@ -614,10 +613,10 @@ _bt_page_recyclable(Page page)
BTPageOpaque opaque;

/*
* It's possible to find an all-zeroes page in an index --- for
* example, a backend might successfully extend the relation one page
* and then crash before it is able to make a WAL entry for adding the
* page. If we find a zeroed page then reclaim it.
* It's possible to find an all-zeroes page in an index --- for example, a
* backend might successfully extend the relation one page and then crash
* before it is able to make a WAL entry for adding the page. If we find a
* zeroed page then reclaim it.
*/
if (PageIsNew(page))
return true;
@@ -672,9 +671,9 @@ _bt_delitems(Relation rel, Buffer buf,
rdata[0].next = &(rdata[1]);

/*
* The target-offsets array is not in the buffer, but pretend that
* it is. When XLogInsert stores the whole buffer, the offsets
* array need not be stored too.
* The target-offsets array is not in the buffer, but pretend that it
* is. When XLogInsert stores the whole buffer, the offsets array
* need not be stored too.
*/
if (nitems > 0)
{
@@ -747,8 +746,8 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full)
BTPageOpaque opaque;

/*
* We can never delete rightmost pages nor root pages. While at it,
* check that page is not already deleted and is empty.
* We can never delete rightmost pages nor root pages. While at it, check
* that page is not already deleted and is empty.
*/
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
@@ -760,8 +759,8 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full)
}

/*
* Save info about page, including a copy of its high key (it must
* have one, being non-rightmost).
* Save info about page, including a copy of its high key (it must have
* one, being non-rightmost).
*/
target = BufferGetBlockNumber(buf);
targetlevel = opaque->btpo.level;
@@ -770,11 +769,11 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full)
targetkey = CopyBTItem((BTItem) PageGetItem(page, itemid));

/*
* We need to get an approximate pointer to the page's parent page.
* Use the standard search mechanism to search for the page's high
* key; this will give us a link to either the current parent or
* someplace to its left (if there are multiple equal high keys). To
* avoid deadlocks, we'd better drop the target page lock first.
* We need to get an approximate pointer to the page's parent page. Use
* the standard search mechanism to search for the page's high key; this
* will give us a link to either the current parent or someplace to its
* left (if there are multiple equal high keys). To avoid deadlocks, we'd
* better drop the target page lock first.
*/
_bt_relbuf(rel, buf);
/* we need a scan key to do our search, so build one */
@@ -786,9 +785,8 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full)
_bt_relbuf(rel, lbuf);

/*
* If we are trying to delete an interior page, _bt_search did more
* than we needed. Locate the stack item pointing to our parent
* level.
* If we are trying to delete an interior page, _bt_search did more than
* we needed. Locate the stack item pointing to our parent level.
*/
ilevel = 0;
for (;;)
@@ -803,16 +801,15 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full)

/*
* We have to lock the pages we need to modify in the standard order:
* moving right, then up. Else we will deadlock against other
* writers.
* moving right, then up. Else we will deadlock against other writers.
*
* So, we need to find and write-lock the current left sibling of the
* target page. The sibling that was current a moment ago could have
* split, so we may have to move right. This search could fail if
* either the sibling or the target page was deleted by someone else
* meanwhile; if so, give up. (Right now, that should never happen,
* since page deletion is only done in VACUUM and there shouldn't be
* multiple VACUUMs concurrently on the same table.)
* So, we need to find and write-lock the current left sibling of the target
* page. The sibling that was current a moment ago could have split, so
* we may have to move right. This search could fail if either the
* sibling or the target page was deleted by someone else meanwhile; if
* so, give up. (Right now, that should never happen, since page deletion
* is only done in VACUUM and there shouldn't be multiple VACUUMs
* concurrently on the same table.)
*/
if (leftsib != P_NONE)
{
@@ -839,19 +836,18 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full)
lbuf = InvalidBuffer;

/*
* Next write-lock the target page itself. It should be okay to take
* just a write lock not a superexclusive lock, since no scans would
* stop on an empty page.
* Next write-lock the target page itself. It should be okay to take just
* a write lock not a superexclusive lock, since no scans would stop on an
* empty page.
*/
buf = _bt_getbuf(rel, target, BT_WRITE);
page = BufferGetPage(buf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);

/*
* Check page is still empty etc, else abandon deletion. The empty
* check is necessary since someone else might have inserted into it
* while we didn't have it locked; the others are just for paranoia's
* sake.
* Check page is still empty etc, else abandon deletion. The empty check
* is necessary since someone else might have inserted into it while we
* didn't have it locked; the others are just for paranoia's sake.
*/
if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) ||
P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page))
@@ -872,9 +868,8 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full)
rbuf = _bt_getbuf(rel, rightsib, BT_WRITE);

/*
* Next find and write-lock the current parent of the target page.
* This is essentially the same as the corresponding step of
* splitting.
* Next find and write-lock the current parent of the target page. This is
* essentially the same as the corresponding step of splitting.
*/
ItemPointerSet(&(stack->bts_btitem.bti_itup.t_tid),
target, P_HIKEY);
@@ -887,8 +882,8 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full)

/*
* If the target is the rightmost child of its parent, then we can't
* delete, unless it's also the only child --- in which case the
* parent changes to half-dead status.
* delete, unless it's also the only child --- in which case the parent
* changes to half-dead status.
*/
page = BufferGetPage(pbuf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
@@ -917,11 +912,10 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full)
}

/*
* If we are deleting the next-to-last page on the target's level,
* then the rightsib is a candidate to become the new fast root. (In
* theory, it might be possible to push the fast root even further
* down, but the odds of doing so are slim, and the locking
* considerations daunting.)
* If we are deleting the next-to-last page on the target's level, then
* the rightsib is a candidate to become the new fast root. (In theory, it
* might be possible to push the fast root even further down, but the odds
* of doing so are slim, and the locking considerations daunting.)
*
* We can safely acquire a lock on the metapage here --- see comments for
* _bt_newroot().
@@ -939,9 +933,9 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full)
metad = BTPageGetMeta(metapg);

/*
* The expected case here is btm_fastlevel == targetlevel+1;
* if the fastlevel is <= targetlevel, something is wrong, and
* we choose to overwrite it to fix it.
* The expected case here is btm_fastlevel == targetlevel+1; if
* the fastlevel is <= targetlevel, something is wrong, and we
* choose to overwrite it to fix it.
*/
if (metad->btm_fastlevel > targetlevel + 1)
{
@@ -961,9 +955,9 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full)

/*
* Update parent. The normal case is a tad tricky because we want to
* delete the target's downlink and the *following* key. Easiest way
* is to copy the right sibling's downlink over the target downlink,
* and then delete the following item.
* delete the target's downlink and the *following* key. Easiest way is
* to copy the right sibling's downlink over the target downlink, and then
* delete the following item.
*/
page = BufferGetPage(pbuf);
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
@@ -992,8 +986,8 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full)
}

/*
* Update siblings' side-links. Note the target page's side-links
* will continue to point to the siblings.
* Update siblings' side-links. Note the target page's side-links will
* continue to point to the siblings.
*/
if (BufferIsValid(lbuf))
{
@@ -1123,10 +1117,10 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full)
_bt_wrtbuf(rel, lbuf);

/*
* If parent became half dead, recurse to try to delete it. Otherwise,
* if right sibling is empty and is now the last child of the parent,
* recurse to try to delete it. (These cases cannot apply at the same
* time, though the second case might itself recurse to the first.)
* If parent became half dead, recurse to try to delete it. Otherwise, if
* right sibling is empty and is now the last child of the parent, recurse
* to try to delete it. (These cases cannot apply at the same time,
* though the second case might itself recurse to the first.)
*/
if (parent_half_dead)
{
@@ -12,7 +12,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.131 2005/09/02 19:02:19 tgl Exp $
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.132 2005/10/15 02:49:09 momjian Exp $
*
*-------------------------------------------------------------------------
*/
@@ -39,9 +39,9 @@ typedef struct
BTSpool *spool;

/*
* spool2 is needed only when the index is an unique index. Dead
* tuples are put into spool2 instead of spool in order to avoid
* uniqueness check.
* spool2 is needed only when the index is an unique index. Dead tuples
* are put into spool2 instead of spool in order to avoid uniqueness
* check.
*/
BTSpool *spool2;
double indtuples;
@@ -72,10 +72,10 @@ btbuild(PG_FUNCTION_ARGS)
BTBuildState buildstate;

/*
* bootstrap processing does something strange, so don't use
* sort/build for initial catalog indices. at some point i need to
* look harder at this. (there is some kind of incremental processing
* going on there.) -- pma 08/29/95
* bootstrap processing does something strange, so don't use sort/build
* for initial catalog indices. at some point i need to look harder at
* this. (there is some kind of incremental processing going on there.)
* -- pma 08/29/95
*/
buildstate.usefast = (FastBuild && IsNormalProcessingMode());
buildstate.isUnique = indexInfo->ii_Unique;
@@ -91,8 +91,8 @@ btbuild(PG_FUNCTION_ARGS)
#endif /* BTREE_BUILD_STATS */

/*
* We expect to be called exactly once for any index relation. If
* that's not the case, big trouble's what we have.
* We expect to be called exactly once for any index relation. If that's
* not the case, big trouble's what we have.
*/
if (RelationGetNumberOfBlocks(index) != 0)
elog(ERROR, "index \"%s\" already contains data",
@@ -103,8 +103,8 @@ btbuild(PG_FUNCTION_ARGS)
buildstate.spool = _bt_spoolinit(index, indexInfo->ii_Unique, false);

/*
* If building a unique index, put dead tuples in a second spool
* to keep them out of the uniqueness check.
* If building a unique index, put dead tuples in a second spool to
* keep them out of the uniqueness check.
*/
if (indexInfo->ii_Unique)
buildstate.spool2 = _bt_spoolinit(index, false, true);
@@ -129,8 +129,8 @@ btbuild(PG_FUNCTION_ARGS)

/*
* if we are doing bottom-up btree build, finish the build by (1)
* completing the sort of the spool file, (2) inserting the sorted
* tuples into btree pages and (3) building the upper levels.
* completing the sort of the spool file, (2) inserting the sorted tuples
* into btree pages and (3) building the upper levels.
*/
if (buildstate.usefast)
{
@@ -176,9 +176,8 @@ btbuildCallback(Relation index,
btitem = _bt_formitem(itup);

/*
* if we are doing bottom-up btree build, we insert the index into a
* spool file for subsequent processing. otherwise, we insert into
* the btree.
* if we are doing bottom-up btree build, we insert the index into a spool
* file for subsequent processing. otherwise, we insert into the btree.
*/
if (buildstate->usefast)
{
@@ -248,16 +247,16 @@ btgettuple(PG_FUNCTION_ARGS)
bool res;

/*
* If we've already initialized this scan, we can just advance it in
* the appropriate direction. If we haven't done so yet, we call a
* routine to get the first item in the scan.
* If we've already initialized this scan, we can just advance it in the
* appropriate direction. If we haven't done so yet, we call a routine to
* get the first item in the scan.
*/
if (ItemPointerIsValid(&(scan->currentItemData)))
{
/*
* Restore scan position using heap TID returned by previous call
* to btgettuple(). _bt_restscan() re-grabs the read lock on the
* buffer, too.
* Restore scan position using heap TID returned by previous call to
* btgettuple(). _bt_restscan() re-grabs the read lock on the buffer,
* too.
*/
_bt_restscan(scan);

@@ -267,17 +266,16 @@ btgettuple(PG_FUNCTION_ARGS)
if (scan->kill_prior_tuple)
{
/*
* Yes, so mark it by setting the LP_DELETE bit in the item
* flags.
* Yes, so mark it by setting the LP_DELETE bit in the item flags.
*/
offnum = ItemPointerGetOffsetNumber(&(scan->currentItemData));
page = BufferGetPage(so->btso_curbuf);
PageGetItemId(page, offnum)->lp_flags |= LP_DELETE;

/*
* Since this can be redone later if needed, it's treated the
* same as a commit-hint-bit status update for heap tuples: we
* mark the buffer dirty but don't make a WAL log entry.
* Since this can be redone later if needed, it's treated the same
* as a commit-hint-bit status update for heap tuples: we mark the
* buffer dirty but don't make a WAL log entry.
*/
SetBufferCommitInfoNeedsSave(so->btso_curbuf);
}
@@ -306,11 +304,11 @@ btgettuple(PG_FUNCTION_ARGS)
}

/*
* Save heap TID to use it in _bt_restscan. Then release the read
* lock on the buffer so that we aren't blocking other backends.
* Save heap TID to use it in _bt_restscan. Then release the read lock on
* the buffer so that we aren't blocking other backends.
*
* NOTE: we do keep the pin on the buffer! This is essential to ensure
* that someone else doesn't delete the index entry we are stopped on.
* NOTE: we do keep the pin on the buffer! This is essential to ensure that
* someone else doesn't delete the index entry we are stopped on.
*/
if (res)
{
@@ -333,7 +331,7 @@ Datum
btgetmulti(PG_FUNCTION_ARGS)
{
IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
ItemPointer tids = (ItemPointer) PG_GETARG_POINTER(1);
ItemPointer tids = (ItemPointer) PG_GETARG_POINTER(1);
int32 max_tids = PG_GETARG_INT32(2);
int32 *returned_tids = (int32 *) PG_GETARG_POINTER(3);
BTScanOpaque so = (BTScanOpaque) scan->opaque;
@@ -355,6 +353,7 @@ btgetmulti(PG_FUNCTION_ARGS)
res = _bt_next(scan, ForwardScanDirection);
else
res = _bt_first(scan, ForwardScanDirection);

/*
* Skip killed tuples if asked to.
*/
@@ -381,8 +380,8 @@ btgetmulti(PG_FUNCTION_ARGS)
}

/*
* Save heap TID to use it in _bt_restscan. Then release the read
* lock on the buffer so that we aren't blocking other backends.
* Save heap TID to use it in _bt_restscan. Then release the read lock on
* the buffer so that we aren't blocking other backends.
*/
if (res)
{
@@ -456,8 +455,8 @@ btrescan(PG_FUNCTION_ARGS)
}

/*
* Reset the scan keys. Note that keys ordering stuff moved to
* _bt_first. - vadim 05/05/97
* Reset the scan keys. Note that keys ordering stuff moved to _bt_first.
* - vadim 05/05/97
*/
if (scankey && scan->numberOfKeys > 0)
memmove(scan->keyData,
@@ -593,21 +592,20 @@ btbulkdelete(PG_FUNCTION_ARGS)
num_index_tuples = 0;

/*
* The outer loop iterates over index leaf pages, the inner over items
* on a leaf page. We issue just one _bt_delitems() call per page, so
* as to minimize WAL traffic.
* The outer loop iterates over index leaf pages, the inner over items on
* a leaf page. We issue just one _bt_delitems() call per page, so as to
* minimize WAL traffic.
*
* Note that we exclusive-lock every leaf page containing data items, in
* sequence left to right. It sounds attractive to only
* exclusive-lock those containing items we need to delete, but
* unfortunately that is not safe: we could then pass a stopped
* indexscan, which could in rare cases lead to deleting the item it
* needs to find when it resumes. (See _bt_restscan --- this could
* only happen if an indexscan stops on a deletable item and then a
* page split moves that item into a page further to its right, which
* the indexscan will have no pin on.) We can skip obtaining
* exclusive lock on empty pages though, since no indexscan could be
* stopped on those.
* sequence left to right. It sounds attractive to only exclusive-lock
* those containing items we need to delete, but unfortunately that is not
* safe: we could then pass a stopped indexscan, which could in rare cases
* lead to deleting the item it needs to find when it resumes. (See
* _bt_restscan --- this could only happen if an indexscan stops on a
* deletable item and then a page split moves that item into a page
* further to its right, which the indexscan will have no pin on.) We can
* skip obtaining exclusive lock on empty pages though, since no indexscan
* could be stopped on those.
*/
buf = _bt_get_endpoint(rel, 0, false);
if (BufferIsValid(buf)) /* check for empty index */
@@ -632,15 +630,15 @@ btbulkdelete(PG_FUNCTION_ARGS)
if (minoff <= maxoff && !P_ISDELETED(opaque))
{
/*
* Trade in the initial read lock for a super-exclusive
* write lock on this page.
* Trade in the initial read lock for a super-exclusive write
* lock on this page.
*/
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
LockBufferForCleanup(buf);

/*
* Recompute minoff/maxoff, both of which could have
* changed while we weren't holding the lock.
* Recompute minoff/maxoff, both of which could have changed
* while we weren't holding the lock.
*/
minoff = P_FIRSTDATAKEY(opaque);
maxoff = PageGetMaxOffsetNumber(page);
@@ -657,7 +655,7 @@ btbulkdelete(PG_FUNCTION_ARGS)
ItemPointer htup;

btitem = (BTItem) PageGetItem(page,
PageGetItemId(page, offnum));
PageGetItemId(page, offnum));
htup = &(btitem->bti_itup.t_tid);
if (callback(htup, callback_state))
{
@@ -670,8 +668,8 @@ btbulkdelete(PG_FUNCTION_ARGS)
}

/*
* If we need to delete anything, do it and write the buffer;
* else just release the buffer.
* If we need to delete anything, do it and write the buffer; else
* just release the buffer.
*/
nextpage = opaque->btpo_next;
if (ndeletable > 0)
@@ -725,19 +723,19 @@ btvacuumcleanup(PG_FUNCTION_ARGS)
Assert(stats != NULL);

/*
* First find out the number of pages in the index. We must acquire
* the relation-extension lock while doing this to avoid a race
* condition: if someone else is extending the relation, there is
* a window where bufmgr/smgr have created a new all-zero page but
* it hasn't yet been write-locked by _bt_getbuf(). If we manage to
* scan such a page here, we'll improperly assume it can be recycled.
* Taking the lock synchronizes things enough to prevent a problem:
* either num_pages won't include the new page, or _bt_getbuf already
* has write lock on the buffer and it will be fully initialized before
* we can examine it. (See also vacuumlazy.c, which has the same issue.)
* First find out the number of pages in the index. We must acquire the
* relation-extension lock while doing this to avoid a race condition: if
* someone else is extending the relation, there is a window where
* bufmgr/smgr have created a new all-zero page but it hasn't yet been
* write-locked by _bt_getbuf(). If we manage to scan such a page here,
* we'll improperly assume it can be recycled. Taking the lock
* synchronizes things enough to prevent a problem: either num_pages won't
* include the new page, or _bt_getbuf already has write lock on the
* buffer and it will be fully initialized before we can examine it. (See
* also vacuumlazy.c, which has the same issue.)
*
* We can skip locking for new or temp relations,
* however, since no one else could be accessing them.
* We can skip locking for new or temp relations, however, since no one else
* could be accessing them.
*/
needLock = !RELATION_IS_LOCAL(rel);

@@ -807,12 +805,12 @@ btvacuumcleanup(PG_FUNCTION_ARGS)

/*
* During VACUUM FULL it's okay to recycle deleted pages
* immediately, since there can be no other transactions
* scanning the index. Note that we will only recycle the
* current page and not any parent pages that _bt_pagedel
* might have recursed to; this seems reasonable in the name
* of simplicity. (Trying to do otherwise would mean we'd
* have to sort the list of recyclable pages we're building.)
* immediately, since there can be no other transactions scanning
* the index. Note that we will only recycle the current page and
* not any parent pages that _bt_pagedel might have recursed to;
* this seems reasonable in the name of simplicity. (Trying to do
* otherwise would mean we'd have to sort the list of recyclable
* pages we're building.)
*/
if (ndel && info->vacuum_full)
{
@@ -827,10 +825,10 @@ btvacuumcleanup(PG_FUNCTION_ARGS)
}

/*
* During VACUUM FULL, we truncate off any recyclable pages at the end
* of the index. In a normal vacuum it'd be unsafe to do this except
* by acquiring exclusive lock on the index and then rechecking all
* the pages; doesn't seem worth it.
* During VACUUM FULL, we truncate off any recyclable pages at the end of
* the index. In a normal vacuum it'd be unsafe to do this except by
* acquiring exclusive lock on the index and then rechecking all the
* pages; doesn't seem worth it.
*/
if (info->vacuum_full && nFreePages > 0)
{
@@ -857,9 +855,9 @@ btvacuumcleanup(PG_FUNCTION_ARGS)
}

/*
* Update the shared Free Space Map with the info we now have about
* free pages in the index, discarding any old info the map may have.
* We do not need to sort the page numbers; they're in order already.
* Update the shared Free Space Map with the info we now have about free
* pages in the index, discarding any old info the map may have. We do not
* need to sort the page numbers; they're in order already.
*/
RecordIndexFreeSpace(&rel->rd_node, nFreePages, freePages);

@@ -915,15 +913,15 @@ _bt_restscan(IndexScanDesc scan)
opaque = (BTPageOpaque) PageGetSpecialPointer(page);

/*
* We use this as flag when first index tuple on page is deleted but
* we do not move left (this would slowdown vacuum) - so we set
* We use this as flag when first index tuple on page is deleted but we do
* not move left (this would slowdown vacuum) - so we set
* current->ip_posid before first index tuple on the current page
* (_bt_step will move it right)... XXX still needed?
*/
if (!ItemPointerIsValid(target))
{
ItemPointerSetOffsetNumber(current,
OffsetNumberPrev(P_FIRSTDATAKEY(opaque)));
OffsetNumberPrev(P_FIRSTDATAKEY(opaque)));
return;
}

@@ -948,12 +946,12 @@ _bt_restscan(IndexScanDesc scan)
}

/*
* The item we're looking for moved right at least one page, so
* move right. We are careful here to pin and read-lock the next
* non-dead page before releasing the current one. This ensures
* that a concurrent btbulkdelete scan cannot pass our position
* --- if it did, it might be able to reach and delete our target
* item before we can find it again.
* The item we're looking for moved right at least one page, so move
* right. We are careful here to pin and read-lock the next non-dead
* page before releasing the current one. This ensures that a
* concurrent btbulkdelete scan cannot pass our position --- if it
* did, it might be able to reach and delete our target item before we
* can find it again.
*/
if (P_RIGHTMOST(opaque))
elog(ERROR, "failed to re-find previous key in \"%s\"",
@@ -8,7 +8,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.94 2005/10/06 02:29:12 tgl Exp $
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.95 2005/10/15 02:49:09 momjian Exp $
*
*-------------------------------------------------------------------------
*/
@@ -69,9 +69,9 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
BTStack new_stack;

/*
* Race -- the page we just grabbed may have split since we read
* its pointer in the parent (or metapage). If it has, we may
* need to move right to its new sibling. Do that.
* Race -- the page we just grabbed may have split since we read its
* pointer in the parent (or metapage). If it has, we may need to
* move right to its new sibling. Do that.
*/
*bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey, BT_READ);

@@ -82,8 +82,8 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
break;

/*
* Find the appropriate item on the internal page, and get the
* child page that it points to.
* Find the appropriate item on the internal page, and get the child
* page that it points to.
*/
offnum = _bt_binsrch(rel, *bufP, keysz, scankey, nextkey);
itemid = PageGetItemId(page, offnum);
@@ -94,13 +94,13 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,

/*
* We need to save the location of the index entry we chose in the
* parent page on a stack. In case we split the tree, we'll use
* the stack to work back up to the parent page. We also save the
* actual downlink (TID) to uniquely identify the index entry, in
* case it moves right while we're working lower in the tree. See
* the paper by Lehman and Yao for how this is detected and
* handled. (We use the child link to disambiguate duplicate keys
* in the index -- Lehman and Yao disallow duplicate keys.)
* parent page on a stack. In case we split the tree, we'll use the
* stack to work back up to the parent page. We also save the actual
* downlink (TID) to uniquely identify the index entry, in case it
* moves right while we're working lower in the tree. See the paper
* by Lehman and Yao for how this is detected and handled. (We use the
* child link to disambiguate duplicate keys in the index -- Lehman
* and Yao disallow duplicate keys.)
*/
new_stack = (BTStack) palloc(sizeof(BTStackData));
new_stack->bts_blkno = par_blkno;
@@ -156,19 +156,18 @@ _bt_moveright(Relation rel,
opaque = (BTPageOpaque) PageGetSpecialPointer(page);

/*
* When nextkey = false (normal case): if the scan key that brought us
* to this page is > the high key stored on the page, then the page
* has split and we need to move right. (If the scan key is equal to
* the high key, we might or might not need to move right; have to
* scan the page first anyway.)
* When nextkey = false (normal case): if the scan key that brought us to
* this page is > the high key stored on the page, then the page has split
* and we need to move right. (If the scan key is equal to the high key,
* we might or might not need to move right; have to scan the page first
* anyway.)
*
* When nextkey = true: move right if the scan key is >= page's high key.
*
* The page could even have split more than once, so scan as far as
* needed.
* The page could even have split more than once, so scan as far as needed.
*
* We also have to move right if we followed a link that brought us to a
* dead page.
* We also have to move right if we followed a link that brought us to a dead
* page.
*/
cmpval = nextkey ? 0 : 1;

@@ -242,24 +241,24 @@ _bt_binsrch(Relation rel,
high = PageGetMaxOffsetNumber(page);

/*
* If there are no keys on the page, return the first available slot.
* Note this covers two cases: the page is really empty (no keys), or
* it contains only a high key. The latter case is possible after
* vacuuming. This can never happen on an internal page, however,
* since they are never empty (an internal page must have children).
* If there are no keys on the page, return the first available slot. Note
* this covers two cases: the page is really empty (no keys), or it
* contains only a high key. The latter case is possible after vacuuming.
* This can never happen on an internal page, however, since they are
* never empty (an internal page must have children).
*/
if (high < low)
return low;

/*
* Binary search to find the first key on the page >= scan key, or
* first key > scankey when nextkey is true.
* Binary search to find the first key on the page >= scan key, or first
* key > scankey when nextkey is true.
*
* For nextkey=false (cmpval=1), the loop invariant is: all slots before
* 'low' are < scan key, all slots at or after 'high' are >= scan key.
*
* For nextkey=true (cmpval=0), the loop invariant is: all slots before
* 'low' are <= scan key, all slots at or after 'high' are > scan key.
* For nextkey=true (cmpval=0), the loop invariant is: all slots before 'low'
* are <= scan key, all slots at or after 'high' are > scan key.
*
* We can fall out when high == low.
*/
@@ -285,15 +284,15 @@ _bt_binsrch(Relation rel,
* At this point we have high == low, but be careful: they could point
* past the last slot on the page.
*
* On a leaf page, we always return the first key >= scan key (resp. >
* scan key), which could be the last slot + 1.
* On a leaf page, we always return the first key >= scan key (resp. > scan
* key), which could be the last slot + 1.
*/
if (P_ISLEAF(opaque))
return low;

/*
* On a non-leaf page, return the last key < scan key (resp. <= scan
* key). There must be one if _bt_compare() is playing by the rules.
* On a non-leaf page, return the last key < scan key (resp. <= scan key).
* There must be one if _bt_compare() is playing by the rules.
*/
Assert(low > P_FIRSTDATAKEY(opaque));

@@ -337,8 +336,8 @@ _bt_compare(Relation rel,
int i;

/*
* Force result ">" if target item is first data item on an internal
* page --- see NOTE above.
* Force result ">" if target item is first data item on an internal page
* --- see NOTE above.
*/
if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque))
return 1;
@@ -347,15 +346,15 @@ _bt_compare(Relation rel,
itup = &(btitem->bti_itup);

/*
* The scan key is set up with the attribute number associated with
* each term in the key. It is important that, if the index is
* multi-key, the scan contain the first k key attributes, and that
* they be in order. If you think about how multi-key ordering works,
* you'll understand why this is.
* The scan key is set up with the attribute number associated with each
* term in the key. It is important that, if the index is multi-key, the
* scan contain the first k key attributes, and that they be in order. If
* you think about how multi-key ordering works, you'll understand why
* this is.
*
* We don't test for violation of this condition here, however. The
* initial setup for the index scan had better have gotten it right
* (see _bt_first).
* We don't test for violation of this condition here, however. The initial
* setup for the index scan had better have gotten it right (see
* _bt_first).
*/

for (i = 1; i <= keysz; i++)
@@ -381,15 +380,15 @@ _bt_compare(Relation rel,
else
{
/*
* The sk_func needs to be passed the index value as left arg
* and the sk_argument as right arg (they might be of
* different types). Since it is convenient for callers to
* think of _bt_compare as comparing the scankey to the index
* item, we have to flip the sign of the comparison result.
* The sk_func needs to be passed the index value as left arg and
* the sk_argument as right arg (they might be of different
* types). Since it is convenient for callers to think of
* _bt_compare as comparing the scankey to the index item, we have
* to flip the sign of the comparison result.
*
* Note: curious-looking coding is to avoid overflow if
* comparison function returns INT_MIN. There is no risk of
* overflow for positive results.
* Note: curious-looking coding is to avoid overflow if comparison
* function returns INT_MIN. There is no risk of overflow for
* positive results.
*/
result = DatumGetInt32(FunctionCall2(&scankey->sk_func,
datum,
@@ -497,7 +496,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
bool goback;
bool continuescan;
ScanKey startKeys[INDEX_MAX_KEYS];
ScanKeyData scankeys[INDEX_MAX_KEYS];
ScanKeyData scankeys[INDEX_MAX_KEYS];
int keysCount = 0;
int i;
StrategyNumber strat_total;
@@ -505,8 +504,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
pgstat_count_index_scan(&scan->xs_pgstat_info);

/*
* Examine the scan keys and eliminate any redundant keys; also
* discover how many keys must be matched to continue the scan.
* Examine the scan keys and eliminate any redundant keys; also discover
* how many keys must be matched to continue the scan.
*/
_bt_preprocess_keys(scan);

@@ -556,9 +555,9 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
ScanKey cur;

/*
* chosen is the so-far-chosen key for the current attribute, if
* any. We don't cast the decision in stone until we reach keys
* for the next attribute.
* chosen is the so-far-chosen key for the current attribute, if any.
* We don't cast the decision in stone until we reach keys for the
* next attribute.
*/
curattr = 1;
chosen = NULL;
@@ -595,9 +594,9 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
}

/*
* Done if that was the last attribute, or if next key
* is not in sequence (implying no boundary key is available
* for the next attribute).
* Done if that was the last attribute, or if next key is not
* in sequence (implying no boundary key is available for the
* next attribute).
*/
if (i >= so->numberOfKeys ||
cur->sk_attno != curattr + 1)
@@ -632,17 +631,17 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
}

/*
* If we found no usable boundary keys, we have to start from one end
* of the tree. Walk down that edge to the first or last key, and
* scan from there.
* If we found no usable boundary keys, we have to start from one end of
* the tree. Walk down that edge to the first or last key, and scan from
* there.
*/
if (keysCount == 0)
return _bt_endpoint(scan, dir);

/*
* We want to start the scan somewhere within the index. Set up a
* 3-way-comparison scankey we can use to search for the boundary
* point we identified above.
* 3-way-comparison scankey we can use to search for the boundary point we
* identified above.
*/
Assert(keysCount <= INDEX_MAX_KEYS);
for (i = 0; i < keysCount; i++)
@@ -650,16 +649,15 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
ScanKey cur = startKeys[i];

/*
* _bt_preprocess_keys disallows it, but it's place to add some
* code later
* _bt_preprocess_keys disallows it, but it's place to add some code
* later
*/
if (cur->sk_flags & SK_ISNULL)
elog(ERROR, "btree doesn't support is(not)null, yet");

/*
* If scankey operator is of default subtype, we can use the
* cached comparison procedure; otherwise gotta look it up in the
* catalogs.
* If scankey operator is of default subtype, we can use the cached
* comparison procedure; otherwise gotta look it up in the catalogs.
*/
if (cur->sk_subtype == InvalidOid)
{
@@ -692,13 +690,13 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
}

/*
* Examine the selected initial-positioning strategy to determine
* exactly where we need to start the scan, and set flag variables to
* control the code below.
* Examine the selected initial-positioning strategy to determine exactly
* where we need to start the scan, and set flag variables to control the
* code below.
*
* If nextkey = false, _bt_search and _bt_binsrch will locate the first
* item >= scan key. If nextkey = true, they will locate the first
* item > scan key.
* If nextkey = false, _bt_search and _bt_binsrch will locate the first item
* >= scan key. If nextkey = true, they will locate the first item > scan
* key.
*
* If goback = true, we will then step back one item, while if goback =
* false, we will start the scan on the located item.
@@ -710,10 +708,10 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
case BTLessStrategyNumber:

/*
* Find first item >= scankey, then back up one to arrive at
* last item < scankey. (Note: this positioning strategy is
* only used for a backward scan, so that is always the
* correct starting position.)
* Find first item >= scankey, then back up one to arrive at last
* item < scankey. (Note: this positioning strategy is only used
* for a backward scan, so that is always the correct starting
* position.)
*/
nextkey = false;
goback = true;
@@ -722,10 +720,10 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
case BTLessEqualStrategyNumber:

/*
* Find first item > scankey, then back up one to arrive at
* last item <= scankey. (Note: this positioning strategy is
* only used for a backward scan, so that is always the
* correct starting position.)
* Find first item > scankey, then back up one to arrive at last
* item <= scankey. (Note: this positioning strategy is only used
* for a backward scan, so that is always the correct starting
* position.)
*/
nextkey = true;
goback = true;
@@ -734,14 +732,14 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
case BTEqualStrategyNumber:

/*
* If a backward scan was specified, need to start with last
* equal item not first one.
* If a backward scan was specified, need to start with last equal
* item not first one.
*/
if (ScanDirectionIsBackward(dir))
{
/*
* This is the same as the <= strategy. We will check at
* the end whether the found item is actually =.
* This is the same as the <= strategy. We will check at the
* end whether the found item is actually =.
*/
nextkey = true;
goback = true;
@@ -749,8 +747,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
else
{
/*
* This is the same as the >= strategy. We will check at
* the end whether the found item is actually =.
* This is the same as the >= strategy. We will check at the
* end whether the found item is actually =.
*/
nextkey = false;
goback = false;
@@ -813,24 +811,24 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
ItemPointerSet(current, blkno, offnum);

/*
* If nextkey = false, we are positioned at the first item >= scan
* key, or possibly at the end of a page on which all the existing
* items are less than the scan key and we know that everything on
* later pages is greater than or equal to scan key.
* If nextkey = false, we are positioned at the first item >= scan key, or
* possibly at the end of a page on which all the existing items are less
* than the scan key and we know that everything on later pages is greater
* than or equal to scan key.
*
* If nextkey = true, we are positioned at the first item > scan key, or
* possibly at the end of a page on which all the existing items are
* less than or equal to the scan key and we know that everything on
* later pages is greater than scan key.
* possibly at the end of a page on which all the existing items are less
* than or equal to the scan key and we know that everything on later
* pages is greater than scan key.
*
* The actually desired starting point is either this item or the prior
* one, or in the end-of-page case it's the first item on the next
* page or the last item on this page. We apply _bt_step if needed to
* get to the right place.
* The actually desired starting point is either this item or the prior one,
* or in the end-of-page case it's the first item on the next page or the
* last item on this page. We apply _bt_step if needed to get to the
* right place.
*
* If _bt_step fails (meaning we fell off the end of the index in one
* direction or the other), then there are no matches so we just
* return false.
* direction or the other), then there are no matches so we just return
* false.
*/
if (goback)
{
@@ -902,8 +900,8 @@ _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
BlockNumber blkno;

/*
* Don't use ItemPointerGetOffsetNumber or you risk to get assertion
* due to ability of ip_posid to be equal 0.
* Don't use ItemPointerGetOffsetNumber or you risk to get assertion due
* to ability of ip_posid to be equal 0.
*/
offnum = current->ip_posid;

@@ -954,9 +952,9 @@ _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
/*
* Walk left to the next page with data. This is much more
* complex than the walk-right case because of the possibility
* that the page to our left splits while we are in flight to
* it, plus the possibility that the page we were on gets
* deleted after we leave it. See nbtree/README for details.
* that the page to our left splits while we are in flight to it,
* plus the possibility that the page we were on gets deleted
* after we leave it. See nbtree/README for details.
*/
for (;;)
{
@@ -973,9 +971,9 @@ _bt_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
opaque = (BTPageOpaque) PageGetSpecialPointer(page);

/*
* Okay, we managed to move left to a non-deleted page.
* Done if it's not half-dead and not empty. Else loop
* back and do it all again.
* Okay, we managed to move left to a non-deleted page. Done
* if it's not half-dead and not empty. Else loop back and do
* it all again.
*/
if (!P_IGNORE(opaque))
{
@@ -1043,15 +1041,14 @@ _bt_walk_left(Relation rel, Buffer buf)

/*
* If this isn't the page we want, walk right till we find what we
* want --- but go no more than four hops (an arbitrary limit). If
* we don't find the correct page by then, the most likely bet is
* that the original page got deleted and isn't in the sibling
* chain at all anymore, not that its left sibling got split more
* than four times.
* want --- but go no more than four hops (an arbitrary limit). If we
* don't find the correct page by then, the most likely bet is that
* the original page got deleted and isn't in the sibling chain at all
* anymore, not that its left sibling got split more than four times.
*
* Note that it is correct to test P_ISDELETED not P_IGNORE here,
* because half-dead pages are still in the sibling chain. Caller
* must reject half-dead pages if wanted.
* Note that it is correct to test P_ISDELETED not P_IGNORE here, because
* half-dead pages are still in the sibling chain. Caller must reject
* half-dead pages if wanted.
*/
tries = 0;
for (;;)
@@ -1077,9 +1074,9 @@ _bt_walk_left(Relation rel, Buffer buf)
{
/*
* It was deleted. Move right to first nondeleted page (there
* must be one); that is the page that has acquired the
* deleted one's keyspace, so stepping left from it will take
* us where we want to be.
* must be one); that is the page that has acquired the deleted
* one's keyspace, so stepping left from it will take us where we
* want to be.
*/
for (;;)
{
@@ -1095,16 +1092,16 @@ _bt_walk_left(Relation rel, Buffer buf)
}

/*
* Now return to top of loop, resetting obknum to point to
* this nondeleted page, and try again.
* Now return to top of loop, resetting obknum to point to this
* nondeleted page, and try again.
*/
}
else
{
/*
* It wasn't deleted; the explanation had better be that the
* page to the left got split or deleted. Without this check,
* we'd go into an infinite loop if there's anything wrong.
* It wasn't deleted; the explanation had better be that the page
* to the left got split or deleted. Without this check, we'd go
* into an infinite loop if there's anything wrong.
*/
if (opaque->btpo_prev == lblkno)
elog(ERROR, "could not find left sibling in \"%s\"",
@@ -1137,8 +1134,8 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)

/*
* If we are looking for a leaf page, okay to descend from fast root;
* otherwise better descend from true root. (There is no point in
* being smarter about intermediate levels.)
* otherwise better descend from true root. (There is no point in being
* smarter about intermediate levels.)
*/
if (level == 0)
buf = _bt_getroot(rel, BT_READ);
@@ -1159,8 +1156,8 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost)
/*
* If we landed on a deleted page, step right to find a live page
* (there must be one). Also, if we want the rightmost page, step
* right if needed to get to it (this could happen if the page
* split since we obtained a pointer to it).
* right if needed to get to it (this could happen if the page split
* since we obtained a pointer to it).
*/
while (P_IGNORE(opaque) ||
(rightmost && !P_RIGHTMOST(opaque)))
@@ -1228,9 +1225,9 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
so = (BTScanOpaque) scan->opaque;

/*
* Scan down to the leftmost or rightmost leaf page. This is a
* simplified version of _bt_search(). We don't maintain a stack
* since we know we won't need it.
* Scan down to the leftmost or rightmost leaf page. This is a simplified
* version of _bt_search(). We don't maintain a stack since we know we
* won't need it.
*/
buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir));

@@ -1261,8 +1258,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
Assert(P_RIGHTMOST(opaque));

start = PageGetMaxOffsetNumber(page);
if (start < P_FIRSTDATAKEY(opaque)) /* watch out for empty
* page */
if (start < P_FIRSTDATAKEY(opaque)) /* watch out for empty page */
start = P_FIRSTDATAKEY(opaque);
}
else
@@ -1276,8 +1272,8 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
so->btso_curbuf = buf;

/*
* Left/rightmost page could be empty due to deletions, if so step
* till we find a nonempty page.
* Left/rightmost page could be empty due to deletions, if so step till we
* find a nonempty page.
*/
if (start > maxoff)
{
@@ -1291,8 +1287,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
itup = &(btitem->bti_itup);

/*
* Okay, we are on the first or last tuple. Does it pass all the
* quals?
* Okay, we are on the first or last tuple. Does it pass all the quals?
*/
if (_bt_checkkeys(scan, itup, dir, &continuescan))
{
@@ -56,7 +56,7 @@
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsort.c,v 1.94 2005/08/11 13:22:33 momjian Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsort.c,v 1.95 2005/10/15 02:49:09 momjian Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@@ -99,12 +99,10 @@ typedef struct BTPageState
|
||||
{
|
||||
Page btps_page; /* workspace for page building */
|
||||
BlockNumber btps_blkno; /* block # to write this page at */
|
||||
BTItem btps_minkey; /* copy of minimum key (first item) on
|
||||
* page */
|
||||
BTItem btps_minkey; /* copy of minimum key (first item) on page */
|
||||
OffsetNumber btps_lastoff; /* last item offset loaded */
|
||||
uint32 btps_level; /* tree level (0 = leaf) */
|
||||
Size btps_full; /* "full" if less than this much free
|
||||
* space */
|
||||
Size btps_full; /* "full" if less than this much free space */
|
||||
struct BTPageState *btps_next; /* link to parent level, if any */
|
||||
} BTPageState;
|
||||
|
||||
@@ -157,21 +155,21 @@ _bt_spoolinit(Relation index, bool isunique, bool isdead)
|
||||
btspool->isunique = isunique;
|
||||
|
||||
/*
|
||||
* We size the sort area as maintenance_work_mem rather than work_mem
|
||||
* to speed index creation. This should be OK since a single backend
|
||||
* can't run multiple index creations in parallel. Note that creation
|
||||
* of a unique index actually requires two BTSpool objects. We expect
|
||||
* that the second one (for dead tuples) won't get very full, so we
|
||||
* give it only work_mem.
|
||||
* We size the sort area as maintenance_work_mem rather than work_mem to
|
||||
* speed index creation. This should be OK since a single backend can't
|
||||
* run multiple index creations in parallel. Note that creation of a
|
||||
* unique index actually requires two BTSpool objects. We expect that the
|
||||
* second one (for dead tuples) won't get very full, so we give it only
|
||||
* work_mem.
|
||||
*/
|
||||
btKbytes = isdead ? work_mem : maintenance_work_mem;
|
||||
btspool->sortstate = tuplesort_begin_index(index, isunique,
|
||||
btKbytes, false);
|
||||
|
||||
/*
|
||||
* Currently, tuplesort provides sort functions on IndexTuples. If we
|
||||
* kept anything in a BTItem other than a regular IndexTuple, we'd
|
||||
* need to modify tuplesort to understand BTItems as such.
|
||||
* Currently, tuplesort provides sort functions on IndexTuples. If we kept
|
||||
* anything in a BTItem other than a regular IndexTuple, we'd need to
|
||||
* modify tuplesort to understand BTItems as such.
|
||||
*/
|
||||
Assert(sizeof(BTItemData) == sizeof(IndexTupleData));
|
||||
|
||||
@@ -222,8 +220,8 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
|
||||
wstate.index = btspool->index;
|
||||
|
||||
/*
|
||||
* We need to log index creation in WAL iff WAL archiving is enabled
|
||||
* AND it's not a temp index.
|
||||
* We need to log index creation in WAL iff WAL archiving is enabled AND
|
||||
* it's not a temp index.
|
||||
*/
|
||||
wstate.btws_use_wal = XLogArchivingActive() && !wstate.index->rd_istemp;
|
||||
|
||||
@@ -313,9 +311,9 @@ _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
/*
* If we have to write pages nonsequentially, fill in the space with
* zeroes until we come back and overwrite. This is not logically
* necessary on standard Unix filesystems (unwritten space will read
* as zeroes anyway), but it should help to avoid fragmentation. The
* dummy pages aren't WAL-logged though.
* necessary on standard Unix filesystems (unwritten space will read as
* zeroes anyway), but it should help to avoid fragmentation. The dummy
* pages aren't WAL-logged though.
*/
while (blkno > wstate->btws_pages_written)
{
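[Editor's aside: a minimal standalone sketch of the zero-fill strategy, with an assumed file name, struct, and page size rather than the backend's smgr layer: a page written past the current end of the file is preceded by zeroed dummy pages, so the file always grows append-only at the OS level.]

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 8192

typedef struct
{
    FILE *fp;
    long  pages_written;    /* analogue of btws_pages_written */
} WriteState;

static void
write_page(WriteState *ws, const char *page, long blkno)
{
    static const char zeropage[PAGE_SIZE];   /* zero-initialized dummy page */

    /* fill any hole before blkno with dummy pages */
    while (blkno > ws->pages_written)
    {
        fwrite(zeropage, PAGE_SIZE, 1, ws->fp);
        ws->pages_written++;
    }

    /* now either append the real page or overwrite a dummy one */
    fseek(ws->fp, blkno * (long) PAGE_SIZE, SEEK_SET);
    fwrite(page, PAGE_SIZE, 1, ws->fp);
    if (blkno == ws->pages_written)
        ws->pages_written++;
    fseek(ws->fp, 0, SEEK_END);
}

int main(void)
{
    WriteState ws = {fopen("demo_index", "wb"), 0};
    char page[PAGE_SIZE];

    if (ws.fp == NULL)
        return 1;
    memset(page, 'x', sizeof(page));
    write_page(&ws, page, 3);    /* blocks 0-2 get zero-filled first */
    write_page(&ws, page, 1);    /* overwrites one of the dummy pages */
    fclose(ws.fp);
    return 0;
}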
@@ -328,8 +326,8 @@ _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
|
||||
|
||||
/*
|
||||
* Now write the page. We say isTemp = true even if it's not a temp
|
||||
* index, because there's no need for smgr to schedule an fsync for
|
||||
* this write; we'll do it ourselves before ending the build.
|
||||
* index, because there's no need for smgr to schedule an fsync for this
|
||||
* write; we'll do it ourselves before ending the build.
|
||||
*/
|
||||
smgrwrite(wstate->index->rd_smgr, blkno, (char *) page, true);
|
||||
|
||||
@@ -483,15 +481,15 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, BTItem bti)
btisz = MAXALIGN(btisz);

/*
* Check whether the item can fit on a btree page at all. (Eventually,
* we ought to try to apply TOAST methods if not.) We actually need to
* be able to fit three items on every page, so restrict any one item
* to 1/3 the per-page available space. Note that at this point, btisz
* doesn't include the ItemId.
* Check whether the item can fit on a btree page at all. (Eventually, we
* ought to try to apply TOAST methods if not.) We actually need to be
* able to fit three items on every page, so restrict any one item to 1/3
* the per-page available space. Note that at this point, btisz doesn't
* include the ItemId.
*
* NOTE: similar code appears in _bt_insertonpg() to defend against
* oversize items being inserted into an already-existing index. But
* during creation of an index, we don't go through there.
* NOTE: similar code appears in _bt_insertonpg() to defend against oversize
* items being inserted into an already-existing index. But during
* creation of an index, we don't go through there.
*/
if (btisz > BTMaxItemSize(npage))
ereport(ERROR,
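[Editor's aside: the arithmetic behind the 1/3 restriction can be shown with a small standalone sketch. The overhead constants below are assumed round numbers for illustration, not the real page-header macros or BTMaxItemSize().]

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define BLCKSZ            8192
#define PAGE_HEADER_SIZE  24      /* assumed fixed page-header overhead */
#define SPECIAL_SIZE      16      /* assumed btree special-space size */
#define ITEMID_SIZE       4       /* one line pointer per stored item */

/* largest single item such that three of them (with ItemIds) still fit */
static size_t
max_item_size(void)
{
    size_t avail = BLCKSZ - PAGE_HEADER_SIZE - SPECIAL_SIZE;

    return avail / 3 - ITEMID_SIZE;
}

static bool
check_item(size_t itemsz)
{
    if (itemsz > max_item_size())
    {
        fprintf(stderr, "index row size %zu exceeds maximum %zu\n",
                itemsz, max_item_size());
        return false;
    }
    return true;
}

int main(void)
{
    check_item(200);                 /* fine */
    check_item(BLCKSZ / 2);          /* rejected: more than 1/3 of a page */
    return 0;
}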
@@ -499,9 +497,9 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, BTItem bti)
|
||||
errmsg("index row size %lu exceeds btree maximum, %lu",
|
||||
(unsigned long) btisz,
|
||||
(unsigned long) BTMaxItemSize(npage)),
|
||||
errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
|
||||
"Consider a function index of an MD5 hash of the value, "
|
||||
"or use full text indexing.")));
|
||||
errhint("Values larger than 1/3 of a buffer page cannot be indexed.\n"
|
||||
"Consider a function index of an MD5 hash of the value, "
|
||||
"or use full text indexing.")));
|
||||
|
||||
if (pgspc < btisz || pgspc < state->btps_full)
|
||||
{
|
||||
@@ -523,11 +521,11 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, BTItem bti)
|
||||
|
||||
/*
|
||||
* We copy the last item on the page into the new page, and then
|
||||
* rearrange the old page so that the 'last item' becomes its high
|
||||
* key rather than a true data item. There had better be at least
|
||||
* two items on the page already, else the page would be empty of
|
||||
* useful data. (Hence, we must allow pages to be packed at least
|
||||
* 2/3rds full; the 70% figure used above is close to minimum.)
|
||||
* rearrange the old page so that the 'last item' becomes its high key
|
||||
* rather than a true data item. There had better be at least two
|
||||
* items on the page already, else the page would be empty of useful
|
||||
* data. (Hence, we must allow pages to be packed at least 2/3rds
|
||||
* full; the 70% figure used above is close to minimum.)
|
||||
*/
|
||||
Assert(last_off > P_FIRSTKEY);
|
||||
ii = PageGetItemId(opage, last_off);
|
||||
@@ -544,8 +542,8 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, BTItem bti)
|
||||
|
||||
/*
|
||||
* Link the old page into its parent, using its minimum key. If we
|
||||
* don't have a parent, we have to create one; this adds a new
|
||||
* btree level.
|
||||
* don't have a parent, we have to create one; this adds a new btree
|
||||
* level.
|
||||
*/
|
||||
if (state->btps_next == NULL)
|
||||
state->btps_next = _bt_pagestate(wstate, state->btps_level + 1);
|
||||
@@ -557,9 +555,9 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, BTItem bti)
|
||||
pfree(state->btps_minkey);
|
||||
|
||||
/*
|
||||
* Save a copy of the minimum key for the new page. We have to
|
||||
* copy it off the old page, not the new one, in case we are not
|
||||
* at leaf level.
|
||||
* Save a copy of the minimum key for the new page. We have to copy
|
||||
* it off the old page, not the new one, in case we are not at leaf
|
||||
* level.
|
||||
*/
|
||||
state->btps_minkey = _bt_formitem(&(obti->bti_itup));
|
||||
|
||||
@@ -576,8 +574,8 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, BTItem bti)
|
||||
}
|
||||
|
||||
/*
|
||||
* Write out the old page. We never need to touch it again, so we
|
||||
* can free the opage workspace too.
|
||||
* Write out the old page. We never need to touch it again, so we can
|
||||
* free the opage workspace too.
|
||||
*/
|
||||
_bt_blwritepage(wstate, opage, oblkno);
|
||||
|
||||
@@ -588,10 +586,10 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, BTItem bti)
|
||||
}
|
||||
|
||||
/*
|
||||
* If the new item is the first for its page, stash a copy for later.
|
||||
* Note this will only happen for the first item on a level; on later
|
||||
* pages, the first item for a page is copied from the prior page in
|
||||
* the code above.
|
||||
* If the new item is the first for its page, stash a copy for later. Note
|
||||
* this will only happen for the first item on a level; on later pages,
|
||||
* the first item for a page is copied from the prior page in the code
|
||||
* above.
|
||||
*/
|
||||
if (last_off == P_HIKEY)
|
||||
{
|
||||
@@ -636,9 +634,9 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
|
||||
* We have to link the last page on this level to somewhere.
|
||||
*
|
||||
* If we're at the top, it's the root, so attach it to the metapage.
|
||||
* Otherwise, add an entry for it to its parent using its minimum
|
||||
* key. This may cause the last page of the parent level to
|
||||
* split, but that's not a problem -- we haven't gotten to it yet.
|
||||
* Otherwise, add an entry for it to its parent using its minimum key.
|
||||
* This may cause the last page of the parent level to split, but
|
||||
* that's not a problem -- we haven't gotten to it yet.
|
||||
*/
|
||||
if (s->btps_next == NULL)
|
||||
{
|
||||
@@ -657,8 +655,8 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
|
||||
}
|
||||
|
||||
/*
|
||||
* This is the rightmost page, so the ItemId array needs to be
|
||||
* slid back one slot. Then we can dump out the page.
|
||||
* This is the rightmost page, so the ItemId array needs to be slid
|
||||
* back one slot. Then we can dump out the page.
|
||||
*/
|
||||
_bt_slideleft(s->btps_page);
|
||||
_bt_blwritepage(wstate, s->btps_page, s->btps_blkno);
|
||||
@@ -667,9 +665,9 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
|
||||
|
||||
/*
|
||||
* As the last step in the process, construct the metapage and make it
|
||||
* point to the new root (unless we had no data at all, in which case
|
||||
* it's set to point to "P_NONE"). This changes the index to the
|
||||
* "valid" state by filling in a valid magic number in the metapage.
|
||||
* point to the new root (unless we had no data at all, in which case it's
|
||||
* set to point to "P_NONE"). This changes the index to the "valid" state
|
||||
* by filling in a valid magic number in the metapage.
|
||||
*/
|
||||
metapage = (Page) palloc(BLCKSZ);
|
||||
_bt_initmetapage(metapage, rootblkno, rootlevel);
|
||||
@@ -748,7 +746,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
|
||||
|
||||
compare = DatumGetInt32(FunctionCall2(&entry->sk_func,
|
||||
attrDatum1,
|
||||
attrDatum2));
|
||||
attrDatum2));
|
||||
if (compare > 0)
|
||||
{
|
||||
load1 = false;
|
||||
@@ -772,7 +770,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
|
||||
if (should_free)
|
||||
pfree(bti);
|
||||
bti = (BTItem) tuplesort_getindextuple(btspool->sortstate,
|
||||
true, &should_free);
|
||||
true, &should_free);
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -780,7 +778,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
|
||||
if (should_free2)
|
||||
pfree(bti2);
|
||||
bti2 = (BTItem) tuplesort_getindextuple(btspool2->sortstate,
|
||||
true, &should_free2);
|
||||
true, &should_free2);
|
||||
}
|
||||
}
|
||||
_bt_freeskey(indexScanKey);
|
||||
@@ -789,7 +787,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
|
||||
{
|
||||
/* merge is unnecessary */
|
||||
while ((bti = (BTItem) tuplesort_getindextuple(btspool->sortstate,
|
||||
true, &should_free)) != NULL)
|
||||
true, &should_free)) != NULL)
|
||||
{
|
||||
/* When we see first tuple, create first index page */
|
||||
if (state == NULL)
|
||||
@@ -805,19 +803,19 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
_bt_uppershutdown(wstate, state);

/*
* If the index isn't temp, we must fsync it down to disk before it's
* safe to commit the transaction. (For a temp index we don't care
* since the index will be uninteresting after a crash anyway.)
* If the index isn't temp, we must fsync it down to disk before it's safe
* to commit the transaction. (For a temp index we don't care since the
* index will be uninteresting after a crash anyway.)
*
* It's obvious that we must do this when not WAL-logging the build. It's
* less obvious that we have to do it even if we did WAL-log the index
* pages. The reason is that since we're building outside shared
* buffers, a CHECKPOINT occurring during the build has no way to
* flush the previously written data to disk (indeed it won't know the
* index even exists). A crash later on would replay WAL from the
* checkpoint, therefore it wouldn't replay our earlier WAL entries.
* If we do not fsync those pages here, they might still not be on
* disk when the crash occurs.
* pages. The reason is that since we're building outside shared buffers,
* a CHECKPOINT occurring during the build has no way to flush the
* previously written data to disk (indeed it won't know the index even
* exists). A crash later on would replay WAL from the checkpoint,
* therefore it wouldn't replay our earlier WAL entries. If we do not
* fsync those pages here, they might still not be on disk when the crash
* occurs.
*/
if (!wstate->index->rd_istemp)
smgrimmedsync(wstate->index->rd_smgr);

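[Editor's aside: a minimal POSIX sketch of the flush-before-commit rule follows. The file name and error handling are illustrative only, and this is not the smgrimmedsync() implementation: the point is simply that data written through the filesystem cache must be forced to disk before success is reported, because nothing later will rewrite it.]

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    const char buf[] = "freshly built index pages";
    int fd = open("demo_index", O_WRONLY | O_CREAT | O_TRUNC, 0600);

    if (fd < 0)
        return 1;
    if (write(fd, buf, sizeof(buf)) != (ssize_t) sizeof(buf))
        return 1;

    /*
     * Without this, a crash after "commit" could leave the file contents
     * only in the OS cache; no later WAL replay would recreate them.
     */
    if (fsync(fd) != 0)
        return 1;
    close(fd);

    puts("safe to commit");
    return 0;
}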
@@ -8,7 +8,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtutils.c,v 1.63 2005/06/13 23:14:48 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtutils.c,v 1.64 2005/10/15 02:49:09 momjian Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@@ -48,8 +48,8 @@ _bt_mkscankey(Relation rel, IndexTuple itup)
|
||||
bool null;
|
||||
|
||||
/*
|
||||
* We can use the cached (default) support procs since no
|
||||
* cross-type comparison can be needed.
|
||||
* We can use the cached (default) support procs since no cross-type
|
||||
* comparison can be needed.
|
||||
*/
|
||||
procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC);
|
||||
arg = index_getattr(itup, i + 1, itupdesc, &null);
|
||||
@@ -93,8 +93,8 @@ _bt_mkscankey_nodata(Relation rel)
|
||||
FmgrInfo *procinfo;
|
||||
|
||||
/*
|
||||
* We can use the cached (default) support procs since no
|
||||
* cross-type comparison can be needed.
|
||||
* We can use the cached (default) support procs since no cross-type
|
||||
* comparison can be needed.
|
||||
*/
|
||||
procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC);
|
||||
ScanKeyEntryInitializeWithInfo(&skey[i],
|
||||
@@ -257,9 +257,9 @@ _bt_preprocess_keys(IndexScanDesc scan)
|
||||
if (numberOfKeys == 1)
|
||||
{
|
||||
/*
|
||||
* We don't use indices for 'A is null' and 'A is not null'
|
||||
* currently and 'A < = > <> NULL' will always fail - so qual is
|
||||
* not OK if comparison value is NULL. - vadim 03/21/97
|
||||
* We don't use indices for 'A is null' and 'A is not null' currently
|
||||
* and 'A < = > <> NULL' will always fail - so qual is not OK if
|
||||
* comparison value is NULL. - vadim 03/21/97
|
||||
*/
|
||||
if (cur->sk_flags & SK_ISNULL)
|
||||
so->qual_ok = false;
|
||||
@@ -286,20 +286,20 @@ _bt_preprocess_keys(IndexScanDesc scan)
|
||||
/*
|
||||
* Initialize for processing of keys for attr 1.
|
||||
*
|
||||
* xform[i] points to the currently best scan key of strategy type i+1,
|
||||
* if any is found with a default operator subtype; it is NULL if we
|
||||
* haven't yet found such a key for this attr. Scan keys of
|
||||
* nondefault subtypes are transferred to the output with no
|
||||
* processing except for noting if they are of "=" type.
|
||||
* xform[i] points to the currently best scan key of strategy type i+1, if
|
||||
* any is found with a default operator subtype; it is NULL if we haven't
|
||||
* yet found such a key for this attr. Scan keys of nondefault subtypes
|
||||
* are transferred to the output with no processing except for noting if
|
||||
* they are of "=" type.
|
||||
*/
|
||||
attno = 1;
|
||||
memset(xform, 0, sizeof(xform));
|
||||
hasOtherTypeEqual = false;
|
||||
|
||||
/*
|
||||
* Loop iterates from 0 to numberOfKeys inclusive; we use the last
|
||||
* pass to handle after-last-key processing. Actual exit from the
|
||||
* loop is at the "break" statement below.
|
||||
* Loop iterates from 0 to numberOfKeys inclusive; we use the last pass to
|
||||
* handle after-last-key processing. Actual exit from the loop is at the
|
||||
* "break" statement below.
|
||||
*/
|
||||
for (i = 0;; cur++, i++)
|
||||
{
|
||||
@@ -319,8 +319,8 @@ _bt_preprocess_keys(IndexScanDesc scan)
|
||||
}
|
||||
|
||||
/*
|
||||
* If we are at the end of the keys for a particular attr, finish
|
||||
* up processing and emit the cleaned-up keys.
|
||||
* If we are at the end of the keys for a particular attr, finish up
|
||||
* processing and emit the cleaned-up keys.
|
||||
*/
|
||||
if (i == numberOfKeys || cur->sk_attno != attno)
|
||||
{
|
||||
@@ -331,9 +331,9 @@ _bt_preprocess_keys(IndexScanDesc scan)
|
||||
elog(ERROR, "btree index keys must be ordered by attribute");
|
||||
|
||||
/*
|
||||
* If = has been specified, no other key will be used. In case
|
||||
* of key > 2 && key == 1 and so on we have to set qual_ok to
|
||||
* false before discarding the other keys.
|
||||
* If = has been specified, no other key will be used. In case of
|
||||
* key > 2 && key == 1 and so on we have to set qual_ok to false
|
||||
* before discarding the other keys.
|
||||
*/
|
||||
if (xform[BTEqualStrategyNumber - 1])
|
||||
{
|
||||
@@ -411,8 +411,8 @@ _bt_preprocess_keys(IndexScanDesc scan)
|
||||
}
|
||||
|
||||
/*
|
||||
* If all attrs before this one had "=", include these keys
|
||||
* into the required-keys count.
|
||||
* If all attrs before this one had "=", include these keys into
|
||||
* the required-keys count.
|
||||
*/
|
||||
if (priorNumberOfEqualCols == attno - 1)
|
||||
so->numberOfRequiredKeys = new_numberOfKeys;
|
||||
@@ -526,11 +526,11 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple,
|
||||
if (isNull)
|
||||
{
|
||||
/*
|
||||
* Since NULLs are sorted after non-NULLs, we know we have
|
||||
* reached the upper limit of the range of values for this
|
||||
* index attr. On a forward scan, we can stop if this qual is
|
||||
* one of the "must match" subset. On a backward scan,
|
||||
* however, we should keep going.
|
||||
* Since NULLs are sorted after non-NULLs, we know we have reached
|
||||
* the upper limit of the range of values for this index attr. On
|
||||
* a forward scan, we can stop if this qual is one of the "must
|
||||
* match" subset. On a backward scan, however, we should keep
|
||||
* going.
|
||||
*/
|
||||
if (ikey < so->numberOfRequiredKeys &&
|
||||
ScanDirectionIsForward(dir))
|
||||
@@ -547,24 +547,22 @@ _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple,
if (!DatumGetBool(test))
{
/*
* Tuple fails this qual. If it's a required qual, then we
* may be able to conclude no further tuples will pass,
* either. We have to look at the scan direction and the qual
* type.
* Tuple fails this qual. If it's a required qual, then we may be
* able to conclude no further tuples will pass, either. We have
* to look at the scan direction and the qual type.
*
* Note: the only case in which we would keep going after failing
* a required qual is if there are partially-redundant quals
* that _bt_preprocess_keys() was unable to eliminate. For
* example, given "x > 4 AND x > 10" where both are cross-type
* comparisons and so not removable, we might start the scan
* at the x = 4 boundary point. The "x > 10" condition will
* fail until we pass x = 10, but we must not stop the scan on
* its account.
* Note: the only case in which we would keep going after failing a
* required qual is if there are partially-redundant quals that
* _bt_preprocess_keys() was unable to eliminate. For example,
* given "x > 4 AND x > 10" where both are cross-type comparisons
* and so not removable, we might start the scan at the x = 4
* boundary point. The "x > 10" condition will fail until we pass
* x = 10, but we must not stop the scan on its account.
*
* Note: because we stop the scan as soon as any required
* equality qual fails, it is critical that equality quals be
* used for the initial positioning in _bt_first() when they
* are available. See comments in _bt_first().
* Note: because we stop the scan as soon as any required equality
* qual fails, it is critical that equality quals be used for the
* initial positioning in _bt_first() when they are available. See
* comments in _bt_first().
*/
if (ikey < so->numberOfRequiredKeys)
{

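[Editor's aside: the stop-versus-skip distinction above can be illustrated with a standalone sketch, assuming a forward scan over ascending values and an invented qual structure rather than the real ScanKey machinery: failing a required "v <= limit" qual ends the whole scan, while failing a non-required one only skips the current value.]

#include <stdbool.h>
#include <stdio.h>

typedef struct
{
    int  limit;       /* qual is "value <= limit" */
    bool required;    /* required quals bound the scan; others just filter */
} Qual;

int main(void)
{
    int  values[] = {2, 5, 8, 11, 14};          /* index (ascending) order */
    Qual quals[]  = {{12, true}, {7, false}};   /* "v <= 12" and "v <= 7" */
    int  nquals = 2;

    for (int i = 0; i < 5; i++)
    {
        bool matches = true;
        bool continuescan = true;

        for (int k = 0; k < nquals; k++)
        {
            if (values[i] > quals[k].limit)
            {
                matches = false;
                if (quals[k].required)
                    continuescan = false;   /* nothing further can match */
            }
        }

        if (matches)
            printf("return %d\n", values[i]);
        if (!continuescan)
        {
            printf("stop scan at %d\n", values[i]);
            break;
        }
    }
    return 0;
}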
@@ -8,7 +8,7 @@
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.22 2005/06/06 17:01:22 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.23 2005/10/15 02:49:09 momjian Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@@ -101,7 +101,7 @@ _bt_restore_page(Page page, char *from, int len)
|
||||
(sizeof(BTItemData) - sizeof(IndexTupleData));
|
||||
itemsz = MAXALIGN(itemsz);
|
||||
if (PageAddItem(page, (Item) from, itemsz,
|
||||
FirstOffsetNumber, LP_USED) == InvalidOffsetNumber)
|
||||
FirstOffsetNumber, LP_USED) == InvalidOffsetNumber)
|
||||
elog(PANIC, "_bt_restore_page: can't add item to page");
|
||||
from += itemsz;
|
||||
}
|
||||
@@ -136,8 +136,8 @@ _bt_restore_meta(Relation reln, XLogRecPtr lsn,
|
||||
pageop->btpo_flags = BTP_META;
|
||||
|
||||
/*
|
||||
* Set pd_lower just past the end of the metadata. This is not
|
||||
* essential but it makes the page look compressible to xlog.c.
|
||||
* Set pd_lower just past the end of the metadata. This is not essential
|
||||
* but it makes the page look compressible to xlog.c.
|
||||
*/
|
||||
((PageHeader) metapg)->pd_lower =
|
||||
((char *) md + sizeof(BTMetaPageData)) - (char *) metapg;
|
||||
@@ -181,7 +181,7 @@ btree_xlog_insert(bool isleaf, bool ismeta,
|
||||
if (!(record->xl_info & XLR_BKP_BLOCK_1))
|
||||
{
|
||||
buffer = XLogReadBuffer(false, reln,
|
||||
ItemPointerGetBlockNumber(&(xlrec->target.tid)));
|
||||
ItemPointerGetBlockNumber(&(xlrec->target.tid)));
|
||||
if (!BufferIsValid(buffer))
|
||||
elog(PANIC, "btree_insert_redo: block unfound");
|
||||
page = (Page) BufferGetPage(buffer);
|
||||
@@ -217,8 +217,8 @@ btree_xlog_insert(bool isleaf, bool ismeta,
|
||||
if (!isleaf && incomplete_splits != NIL)
|
||||
{
|
||||
forget_matching_split(reln, xlrec->target.node,
|
||||
ItemPointerGetBlockNumber(&(xlrec->target.tid)),
|
||||
ItemPointerGetOffsetNumber(&(xlrec->target.tid)),
|
||||
ItemPointerGetBlockNumber(&(xlrec->target.tid)),
|
||||
ItemPointerGetOffsetNumber(&(xlrec->target.tid)),
|
||||
false);
|
||||
}
|
||||
}
|
||||
@@ -325,8 +325,8 @@ btree_xlog_split(bool onleft, bool isroot,
|
||||
if (xlrec->level > 0 && incomplete_splits != NIL)
|
||||
{
|
||||
forget_matching_split(reln, xlrec->target.node,
|
||||
ItemPointerGetBlockNumber(&(xlrec->target.tid)),
|
||||
ItemPointerGetOffsetNumber(&(xlrec->target.tid)),
|
||||
ItemPointerGetBlockNumber(&(xlrec->target.tid)),
|
||||
ItemPointerGetOffsetNumber(&(xlrec->target.tid)),
|
||||
false);
|
||||
}
|
||||
|
||||
@@ -655,7 +655,7 @@ static void
|
||||
out_target(char *buf, xl_btreetid *target)
|
||||
{
|
||||
sprintf(buf + strlen(buf), "rel %u/%u/%u; tid %u/%u",
|
||||
target->node.spcNode, target->node.dbNode, target->node.relNode,
|
||||
target->node.spcNode, target->node.dbNode, target->node.relNode,
|
||||
ItemPointerGetBlockNumber(&(target->tid)),
|
||||
ItemPointerGetOffsetNumber(&(target->tid)));
|
||||
}
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/access/rtree/rtget.c,v 1.36 2005/10/06 02:29:14 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/access/rtree/rtget.c,v 1.37 2005/10/15 02:49:09 momjian Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@@ -32,12 +32,12 @@ rtgettuple(PG_FUNCTION_ARGS)
|
||||
IndexScanDesc s = (IndexScanDesc) PG_GETARG_POINTER(0);
|
||||
ScanDirection dir = (ScanDirection) PG_GETARG_INT32(1);
|
||||
RTreeScanOpaque so = (RTreeScanOpaque) s->opaque;
|
||||
Page page;
|
||||
Page page;
|
||||
OffsetNumber offnum;
|
||||
|
||||
/*
|
||||
* If we've already produced a tuple and the executor has informed
|
||||
* us that it should be marked "killed", do so now.
|
||||
* If we've already produced a tuple and the executor has informed us that
|
||||
* it should be marked "killed", do so now.
|
||||
*/
|
||||
if (s->kill_prior_tuple && ItemPointerIsValid(&(s->currentItemData)))
|
||||
{
|
||||
@@ -48,14 +48,13 @@ rtgettuple(PG_FUNCTION_ARGS)
|
||||
}
|
||||
|
||||
/*
|
||||
* Get the next tuple that matches the search key; if asked to
|
||||
* skip killed tuples, find the first non-killed tuple that
|
||||
* matches. Return as soon as we've run out of matches or we've
|
||||
* found an acceptable match.
|
||||
* Get the next tuple that matches the search key; if asked to skip killed
|
||||
* tuples, find the first non-killed tuple that matches. Return as soon as
|
||||
* we've run out of matches or we've found an acceptable match.
|
||||
*/
|
||||
for (;;)
|
||||
{
|
||||
bool res = rtnext(s, dir);
|
||||
bool res = rtnext(s, dir);
|
||||
|
||||
if (res && s->ignore_killed_tuples)
|
||||
{
|
||||
@@ -73,7 +72,7 @@ Datum
|
||||
rtgetmulti(PG_FUNCTION_ARGS)
|
||||
{
|
||||
IndexScanDesc s = (IndexScanDesc) PG_GETARG_POINTER(0);
|
||||
ItemPointer tids = (ItemPointer) PG_GETARG_POINTER(1);
|
||||
ItemPointer tids = (ItemPointer) PG_GETARG_POINTER(1);
|
||||
int32 max_tids = PG_GETARG_INT32(2);
|
||||
int32 *returned_tids = (int32 *) PG_GETARG_POINTER(3);
|
||||
RTreeScanOpaque so = (RTreeScanOpaque) s->opaque;
|
||||
@@ -86,7 +85,7 @@ rtgetmulti(PG_FUNCTION_ARGS)
|
||||
res = rtnext(s, ForwardScanDirection);
|
||||
if (res && s->ignore_killed_tuples)
|
||||
{
|
||||
Page page;
|
||||
Page page;
|
||||
OffsetNumber offnum;
|
||||
|
||||
offnum = ItemPointerGetOffsetNumber(&(s->currentItemData));
|
||||
@@ -201,12 +200,11 @@ rtnext(IndexScanDesc s, ScanDirection dir)
|
||||
blk = ItemPointerGetBlockNumber(&(it->t_tid));
|
||||
|
||||
/*
|
||||
* Note that we release the pin on the page as we descend
|
||||
* down the tree, even though there's a good chance we'll
|
||||
* eventually need to re-read the buffer later in this
|
||||
* scan. This may or may not be optimal, but it doesn't
|
||||
* seem likely to make a huge performance difference
|
||||
* either way.
|
||||
* Note that we release the pin on the page as we descend down the
|
||||
* tree, even though there's a good chance we'll eventually need
|
||||
* to re-read the buffer later in this scan. This may or may not
|
||||
* be optimal, but it doesn't seem likely to make a huge
|
||||
* performance difference either way.
|
||||
*/
|
||||
so->curbuf = ReleaseAndReadBuffer(so->curbuf, s->indexRelation, blk);
|
||||
p = BufferGetPage(so->curbuf);
|
||||
@@ -233,7 +231,7 @@ findnext(IndexScanDesc s, OffsetNumber n, ScanDirection dir)
|
||||
IndexTuple it;
|
||||
RTreePageOpaque po;
|
||||
RTreeScanOpaque so;
|
||||
Page p;
|
||||
Page p;
|
||||
|
||||
so = (RTreeScanOpaque) s->opaque;
|
||||
p = BufferGetPage(so->curbuf);
|
||||
@@ -242,8 +240,8 @@ findnext(IndexScanDesc s, OffsetNumber n, ScanDirection dir)
|
||||
po = (RTreePageOpaque) PageGetSpecialPointer(p);
|
||||
|
||||
/*
|
||||
* If we modified the index during the scan, we may have a pointer to
|
||||
* a ghost tuple, before the scan. If this is the case, back up one.
|
||||
* If we modified the index during the scan, we may have a pointer to a
|
||||
* ghost tuple, before the scan. If this is the case, back up one.
|
||||
*/
|
||||
|
||||
if (so->s_flags & RTS_CURBEFORE)
|
||||
@@ -277,7 +275,7 @@ findnext(IndexScanDesc s, OffsetNumber n, ScanDirection dir)
|
||||
}
|
||||
|
||||
if (n >= FirstOffsetNumber && n <= maxoff)
|
||||
return n; /* found a match on this page */
|
||||
return n; /* found a match on this page */
|
||||
else
|
||||
return InvalidOffsetNumber; /* no match, go to next page */
|
||||
}
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/access/rtree/rtproc.c,v 1.42 2004/12/31 21:59:26 pgsql Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/access/rtree/rtproc.c,v 1.43 2005/10/15 02:49:09 momjian Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@@ -146,8 +146,8 @@ rt_poly_size(PG_FUNCTION_ARGS)
|
||||
ydim;
|
||||
|
||||
/*
|
||||
* Can't just use GETARG because of possibility that input is NULL;
|
||||
* since POLYGON is toastable, GETARG will try to inspect its value
|
||||
* Can't just use GETARG because of possibility that input is NULL; since
|
||||
* POLYGON is toastable, GETARG will try to inspect its value
|
||||
*/
|
||||
if (aptr == NULL)
|
||||
{
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/access/rtree/rtree.c,v 1.91 2005/08/10 21:36:46 momjian Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/access/rtree/rtree.c,v 1.92 2005/10/15 02:49:09 momjian Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@@ -121,8 +121,8 @@ rtbuild(PG_FUNCTION_ARGS)
|
||||
initRtstate(&buildstate.rtState, index);
|
||||
|
||||
/*
|
||||
* We expect to be called exactly once for any index relation. If
|
||||
* that's not the case, big trouble's what we have.
|
||||
* We expect to be called exactly once for any index relation. If that's
|
||||
* not the case, big trouble's what we have.
|
||||
*/
|
||||
if (RelationGetNumberOfBlocks(index) != 0)
|
||||
elog(ERROR, "index \"%s\" already contains data",
|
||||
@@ -175,10 +175,10 @@ rtbuildCallback(Relation index,
|
||||
|
||||
/*
|
||||
* Since we already have the index relation locked, we call rtdoinsert
|
||||
* directly. Normal access method calls dispatch through rtinsert,
|
||||
* which locks the relation for write. This is the right thing to do
|
||||
* if you're inserting single tups, but not when you're initializing
|
||||
* the whole index at once.
|
||||
* directly. Normal access method calls dispatch through rtinsert, which
|
||||
* locks the relation for write. This is the right thing to do if you're
|
||||
* inserting single tups, but not when you're initializing the whole index
|
||||
* at once.
|
||||
*/
|
||||
rtdoinsert(index, itup, &buildstate->rtState);
|
||||
|
||||
@@ -226,9 +226,8 @@ rtinsert(PG_FUNCTION_ARGS)
|
||||
initRtstate(&rtState, r);
|
||||
|
||||
/*
|
||||
* Since rtree is not marked "amconcurrent" in pg_am, caller should
|
||||
* have acquired exclusive lock on index relation. We need no locking
|
||||
* here.
|
||||
* Since rtree is not marked "amconcurrent" in pg_am, caller should have
|
||||
* acquired exclusive lock on index relation. We need no locking here.
|
||||
*/
|
||||
rtdoinsert(r, itup, &rtState);
|
||||
|
||||
@@ -331,7 +330,7 @@ rttighten(Relation r,
|
||||
p = BufferGetPage(b);
|
||||
|
||||
oldud = IndexTupleGetDatum(PageGetItem(p,
|
||||
PageGetItemId(p, stk->rts_child)));
|
||||
PageGetItemId(p, stk->rts_child)));
|
||||
|
||||
FunctionCall2(&rtstate->sizeFn, oldud,
|
||||
PointerGetDatum(&old_size));
|
||||
@@ -342,8 +341,8 @@ rttighten(Relation r,
|
||||
PointerGetDatum(&newd_size));
|
||||
|
||||
/*
|
||||
* If newd_size == 0 we have degenerate rectangles, so we don't know
|
||||
* if there was any change, so we have to assume there was.
|
||||
* If newd_size == 0 we have degenerate rectangles, so we don't know if
|
||||
* there was any change, so we have to assume there was.
|
||||
*/
|
||||
if ((newd_size == 0) || (newd_size != old_size))
|
||||
{
|
||||
@@ -370,8 +369,8 @@ rttighten(Relation r,
|
||||
/*
|
||||
* The user may be defining an index on variable-sized data (like
|
||||
* polygons). If so, we need to get a constant-sized datum for
|
||||
* insertion on the internal page. We do this by calling the
|
||||
* union proc, which is required to return a rectangle.
|
||||
* insertion on the internal page. We do this by calling the union
|
||||
* proc, which is required to return a rectangle.
|
||||
*/
|
||||
tdatum = FunctionCall2(&rtstate->unionFn, datum, datum);
|
||||
|
||||
@@ -428,8 +427,8 @@ rtdosplit(Relation r,
|
||||
|
||||
/*
|
||||
* The root of the tree is the first block in the relation. If we're
|
||||
* about to split the root, we need to do some hocus-pocus to enforce
|
||||
* this guarantee.
|
||||
* about to split the root, we need to do some hocus-pocus to enforce this
|
||||
* guarantee.
|
||||
*/
|
||||
|
||||
if (BufferGetBlockNumber(buffer) == P_ROOT)
|
||||
@@ -459,10 +458,9 @@ rtdosplit(Relation r,
|
||||
newitemoff = OffsetNumberNext(maxoff);
|
||||
|
||||
/*
|
||||
* spl_left contains a list of the offset numbers of the tuples that
|
||||
* will go to the left page. For each offset number, get the tuple
|
||||
* item, then add the item to the left page. Similarly for the right
|
||||
* side.
|
||||
* spl_left contains a list of the offset numbers of the tuples that will
|
||||
* go to the left page. For each offset number, get the tuple item, then
|
||||
* add the item to the left page. Similarly for the right side.
|
||||
*/
|
||||
|
||||
/* fill left node */
|
||||
@@ -525,13 +523,13 @@ rtdosplit(Relation r,
|
||||
* introduced in its structure by splitting this page.
|
||||
*
|
||||
* 2) "Tighten" the bounding box of the pointer to the left page in the
|
||||
* parent node in the tree, if any. Since we moved a bunch of stuff
|
||||
* off the left page, we expect it to get smaller. This happens in
|
||||
* the internal insertion routine.
|
||||
* parent node in the tree, if any. Since we moved a bunch of stuff off
|
||||
* the left page, we expect it to get smaller. This happens in the
|
||||
* internal insertion routine.
|
||||
*
|
||||
* 3) Insert a pointer to the right page in the parent. This may cause
|
||||
* the parent to split. If it does, we need to repeat steps one and
|
||||
* two for each split node in the tree.
|
||||
* 3) Insert a pointer to the right page in the parent. This may cause the
|
||||
* parent to split. If it does, we need to repeat steps one and two for
|
||||
* each split node in the tree.
|
||||
*/
|
||||
|
||||
/* adjust active scans */
|
||||
@@ -583,10 +581,10 @@ rtintinsert(Relation r,
|
||||
old = (IndexTuple) PageGetItem(p, PageGetItemId(p, stk->rts_child));
|
||||
|
||||
/*
|
||||
* This is a hack. Right now, we force rtree internal keys to be
|
||||
* constant size. To fix this, need delete the old key and add both
|
||||
* left and right for the two new pages. The insertion of left may
|
||||
* force a split if the new left key is bigger than the old key.
|
||||
* This is a hack. Right now, we force rtree internal keys to be constant
|
||||
* size. To fix this, need delete the old key and add both left and right
|
||||
* for the two new pages. The insertion of left may force a split if the
|
||||
* new left key is bigger than the old key.
|
||||
*/
|
||||
|
||||
if (IndexTupleSize(old) != IndexTupleSize(ltup))
|
||||
@@ -603,8 +601,7 @@ rtintinsert(Relation r,
|
||||
rttighten(r, stk->rts_parent, newdatum,
|
||||
IndexTupleAttSize(ltup), rtstate);
|
||||
rtdosplit(r, b, stk->rts_parent, rtup, rtstate);
|
||||
WriteBuffer(b); /* don't forget to release buffer! -
|
||||
* 01/31/94 */
|
||||
WriteBuffer(b); /* don't forget to release buffer! - 01/31/94 */
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -716,16 +713,15 @@ rtpicksplit(Relation r,
|
||||
int total_num_tuples,
|
||||
num_tuples_without_seeds,
|
||||
max_after_split; /* in Guttman's lingo, (M - m) */
|
||||
float diff; /* diff between cost of putting tuple left
|
||||
* or right */
|
||||
float diff; /* diff between cost of putting tuple left or
|
||||
* right */
|
||||
SPLITCOST *cost_vector;
|
||||
int n;
|
||||
|
||||
/*
|
||||
* First, make sure the new item is not so large that we can't
|
||||
* possibly fit it on a page, even by itself. (It's sufficient to
|
||||
* make this test here, since any oversize tuple must lead to a page
|
||||
* split attempt.)
|
||||
* First, make sure the new item is not so large that we can't possibly
|
||||
* fit it on a page, even by itself. (It's sufficient to make this test
|
||||
* here, since any oversize tuple must lead to a page split attempt.)
|
||||
*/
|
||||
newitemsz = IndexTupleTotalSize(itup);
|
||||
if (newitemsz > RTPageAvailSpace)
|
||||
@@ -734,11 +730,10 @@ rtpicksplit(Relation r,
|
||||
errmsg("index row size %lu exceeds rtree maximum, %lu",
|
||||
(unsigned long) newitemsz,
|
||||
(unsigned long) RTPageAvailSpace),
|
||||
errhint("Values larger than a buffer page cannot be indexed.")));
|
||||
errhint("Values larger than a buffer page cannot be indexed.")));
|
||||
|
||||
maxoff = PageGetMaxOffsetNumber(page);
|
||||
newitemoff = OffsetNumberNext(maxoff); /* phony index for new
|
||||
* item */
|
||||
newitemoff = OffsetNumberNext(maxoff); /* phony index for new item */
|
||||
total_num_tuples = newitemoff;
|
||||
num_tuples_without_seeds = total_num_tuples - 2;
|
||||
max_after_split = total_num_tuples / 2; /* works for m = M/2 */
|
||||
@@ -793,8 +788,7 @@ rtpicksplit(Relation r,
|
||||
pfree(DatumGetPointer(inter_d));
|
||||
|
||||
/*
|
||||
* are these a more promising split that what we've already
|
||||
* seen?
|
||||
* are these a more promising split that what we've already seen?
|
||||
*/
|
||||
if (size_waste > waste || firsttime)
|
||||
{
|
||||
@@ -809,10 +803,10 @@ rtpicksplit(Relation r,
|
||||
if (firsttime)
|
||||
{
|
||||
/*
|
||||
* There is no possible split except to put the new item on its
|
||||
* own page. Since we still have to compute the union rectangles,
|
||||
* we play dumb and run through the split algorithm anyway,
|
||||
* setting seed_1 = first item on page and seed_2 = new item.
|
||||
* There is no possible split except to put the new item on its own
|
||||
* page. Since we still have to compute the union rectangles, we play
|
||||
* dumb and run through the split algorithm anyway, setting seed_1 =
|
||||
* first item on page and seed_2 = new item.
|
||||
*/
|
||||
seed_1 = FirstOffsetNumber;
|
||||
seed_2 = newitemoff;
|
||||
@@ -840,25 +834,23 @@ rtpicksplit(Relation r,
|
||||
/*
|
||||
* Now split up the regions between the two seeds.
|
||||
*
|
||||
* The cost_vector array will contain hints for determining where each
|
||||
* tuple should go. Each record in the array will contain a boolean,
|
||||
* choose_left, that indicates which node the tuple prefers to be on,
|
||||
* and the absolute difference in cost between putting the tuple in
|
||||
* its favored node and in the other node.
|
||||
* The cost_vector array will contain hints for determining where each tuple
|
||||
* should go. Each record in the array will contain a boolean,
|
||||
* choose_left, that indicates which node the tuple prefers to be on, and
|
||||
* the absolute difference in cost between putting the tuple in its
|
||||
* favored node and in the other node.
|
||||
*
|
||||
* Later, we will sort the cost_vector in descending order by cost
|
||||
* difference, and consider the tuples in that order for placement.
|
||||
* That way, the tuples that *really* want to be in one node or the
|
||||
* other get to choose first, and the tuples that don't really care
|
||||
* choose last.
|
||||
* difference, and consider the tuples in that order for placement. That
|
||||
* way, the tuples that *really* want to be in one node or the other get
|
||||
* to choose first, and the tuples that don't really care choose last.
|
||||
*
|
||||
* First, build the cost_vector array. The new index tuple will also be
|
||||
* handled in this loop, and represented in the array, with
|
||||
* i==newitemoff.
|
||||
* handled in this loop, and represented in the array, with i==newitemoff.
|
||||
*
|
||||
* In the case of variable size tuples it is possible that we only have
|
||||
* the two seeds and no other tuples, in which case we don't do any of
|
||||
* this cost_vector stuff.
|
||||
* In the case of variable size tuples it is possible that we only have the
|
||||
* two seeds and no other tuples, in which case we don't do any of this
|
||||
* cost_vector stuff.
|
||||
*/
|
||||
|
||||
/* to keep compiler quiet */
|
||||
@@ -908,13 +900,13 @@ rtpicksplit(Relation r,
|
||||
}
|
||||
|
||||
/*
|
||||
* Now make the final decisions about where each tuple will go, and
|
||||
* build the vectors to return in the SPLITVEC record.
|
||||
* Now make the final decisions about where each tuple will go, and build
|
||||
* the vectors to return in the SPLITVEC record.
|
||||
*
|
||||
* The cost_vector array contains (descriptions of) all the tuples, in
|
||||
* the order that we want to consider them, so we we just iterate
|
||||
* through it and place each tuple in left or right nodes, according
|
||||
* to the criteria described below.
|
||||
* The cost_vector array contains (descriptions of) all the tuples, in the
|
||||
* order that we want to consider them, so we we just iterate through it
|
||||
* and place each tuple in left or right nodes, according to the criteria
|
||||
* described below.
|
||||
*/
|
||||
|
||||
left = v->spl_left;
|
||||
@@ -923,8 +915,8 @@ rtpicksplit(Relation r,
|
||||
v->spl_nright = 0;
|
||||
|
||||
/*
|
||||
* Place the seeds first. left avail space, left union, right avail
|
||||
* space, and right union have already been adjusted for the seeds.
|
||||
* Place the seeds first. left avail space, left union, right avail space,
|
||||
* and right union have already been adjusted for the seeds.
|
||||
*/
|
||||
|
||||
*left++ = seed_1;
|
||||
@@ -966,32 +958,30 @@ rtpicksplit(Relation r,
|
||||
PointerGetDatum(&size_beta));
|
||||
|
||||
/*
|
||||
* We prefer the page that shows smaller enlargement of its union
|
||||
* area (Guttman's algorithm), but we must take care that at least
|
||||
* one page will still have room for the new item after this one
|
||||
* is added.
|
||||
* We prefer the page that shows smaller enlargement of its union area
|
||||
* (Guttman's algorithm), but we must take care that at least one page
|
||||
* will still have room for the new item after this one is added.
|
||||
*
|
||||
* (We know that all the old items together can fit on one page, so
|
||||
* we need not worry about any other problem than failing to fit
|
||||
* the new item.)
|
||||
* (We know that all the old items together can fit on one page, so we
|
||||
* need not worry about any other problem than failing to fit the new
|
||||
* item.)
|
||||
*
|
||||
* Guttman's algorithm actually has two factors to consider (in
|
||||
* order): 1. if one node has so many tuples already assigned to
|
||||
* it that the other needs all the rest in order to satisfy the
|
||||
* condition that neither node has fewer than m tuples, then that
|
||||
* is decisive; 2. otherwise, choose the page that shows the
|
||||
* smaller enlargement of its union area.
|
||||
* Guttman's algorithm actually has two factors to consider (in order):
|
||||
* 1. if one node has so many tuples already assigned to it that the
|
||||
* other needs all the rest in order to satisfy the condition that
|
||||
* neither node has fewer than m tuples, then that is decisive; 2.
|
||||
* otherwise, choose the page that shows the smaller enlargement of
|
||||
* its union area.
|
||||
*
|
||||
* I have chosen m = M/2, where M is the maximum number of tuples on
|
||||
* a page. (Actually, this is only strictly true for fixed size
|
||||
* tuples. For variable size tuples, there still might have to be
|
||||
* only one tuple on a page, if it is really big. But even with
|
||||
* variable size tuples we still try to get m as close as possible
|
||||
* to M/2.)
|
||||
* I have chosen m = M/2, where M is the maximum number of tuples on a
|
||||
* page. (Actually, this is only strictly true for fixed size tuples.
|
||||
* For variable size tuples, there still might have to be only one
|
||||
* tuple on a page, if it is really big. But even with variable size
|
||||
* tuples we still try to get m as close as possible to M/2.)
|
||||
*
|
||||
* The question of which page shows the smaller enlargement of its
|
||||
* union area has already been answered, and the answer stored in
|
||||
* the choose_left field of the SPLITCOST record.
|
||||
* The question of which page shows the smaller enlargement of its union
|
||||
* area has already been answered, and the answer stored in the
|
||||
* choose_left field of the SPLITCOST record.
|
||||
*/
|
||||
left_feasible = (left_avail_space >= item_1_sz &&
|
||||
((left_avail_space - item_1_sz) >= newitemsz ||
|
||||
@@ -1003,9 +993,8 @@ rtpicksplit(Relation r,
{
/*
* Both feasible, use Guttman's algorithm. First check the m
* condition described above, and if that doesn't apply,
* choose the page with the smaller enlargement of its union
* area.
* condition described above, and if that doesn't apply, choose
* the page with the smaller enlargement of its union area.
*/
if (v->spl_nleft > max_after_split)
choose_left = false;
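[Editor's aside: a one-dimensional analogue of this placement decision, with invented names and the space-feasibility checks omitted, might look like the sketch below: check the m condition first, otherwise send the key to the side whose union interval grows least.]

#include <stdio.h>

typedef struct
{
    int lo, hi;     /* current union interval of this side */
    int count;      /* tuples already placed on this side */
} Side;

static int
enlargement(const Side *s, int key)
{
    int lo = key < s->lo ? key : s->lo;
    int hi = key > s->hi ? key : s->hi;

    return (hi - lo) - (s->hi - s->lo);
}

static void
place(Side *left, Side *right, int key, int max_after_split)
{
    Side *target;

    if (left->count > max_after_split)
        target = right;             /* right must take all the rest */
    else if (right->count > max_after_split)
        target = left;              /* left must take all the rest */
    else if (enlargement(left, key) < enlargement(right, key))
        target = left;              /* smaller growth of union "area" */
    else
        target = right;

    if (key < target->lo)
        target->lo = key;
    if (key > target->hi)
        target->hi = key;
    target->count++;
}

int main(void)
{
    Side left = {1, 3, 1};          /* seeded with key range [1,3] */
    Side right = {90, 95, 1};       /* seeded with key range [90,95] */
    int  keys[] = {4, 97, 50, 2};
    int  max_after_split = 3;       /* m = M/2 for a six-tuple page */

    for (int i = 0; i < 4; i++)
        place(&left, &right, keys[i], max_after_split);

    printf("left: [%d,%d] x%d  right: [%d,%d] x%d\n",
           left.lo, left.hi, left.count, right.lo, right.hi, right.count);
    return 0;
}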
@@ -1153,9 +1142,8 @@ rtbulkdelete(PG_FUNCTION_ARGS)
|
||||
num_index_tuples = 0;
|
||||
|
||||
/*
|
||||
* Since rtree is not marked "amconcurrent" in pg_am, caller should
|
||||
* have acquired exclusive lock on index relation. We need no locking
|
||||
* here.
|
||||
* Since rtree is not marked "amconcurrent" in pg_am, caller should have
|
||||
* acquired exclusive lock on index relation. We need no locking here.
|
||||
*/
|
||||
|
||||
/*
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/access/rtree/rtscan.c,v 1.59 2005/06/24 00:18:52 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/access/rtree/rtscan.c,v 1.60 2005/10/15 02:49:09 momjian Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@@ -123,11 +123,11 @@ rtrescan(PG_FUNCTION_ARGS)
|
||||
|
||||
/*
|
||||
* Scans on internal pages use different operators than they do on
|
||||
* leaf pages. For example, if the user wants all boxes that
|
||||
* exactly match (x1,y1,x2,y2), then on internal pages we need to
|
||||
* find all boxes that contain (x1,y1,x2,y2). rtstrat.c knows
|
||||
* how to pick the opclass member to use for internal pages.
|
||||
* In some cases we need to negate the result of the opclass member.
|
||||
* leaf pages. For example, if the user wants all boxes that exactly
|
||||
* match (x1,y1,x2,y2), then on internal pages we need to find all
|
||||
* boxes that contain (x1,y1,x2,y2). rtstrat.c knows how to pick the
|
||||
* opclass member to use for internal pages. In some cases we need to
|
||||
* negate the result of the opclass member.
|
||||
*/
|
||||
for (i = 0; i < s->numberOfKeys; i++)
|
||||
{
|
||||
@@ -333,9 +333,9 @@ ReleaseResources_rtree(void)
|
||||
RTScanList next;
|
||||
|
||||
/*
|
||||
* Note: this should be a no-op during normal query shutdown. However,
|
||||
* in an abort situation ExecutorEnd is not called and so there may be
|
||||
* open index scans to clean up.
|
||||
* Note: this should be a no-op during normal query shutdown. However, in
|
||||
* an abort situation ExecutorEnd is not called and so there may be open
|
||||
* index scans to clean up.
|
||||
*/
|
||||
prev = NULL;
|
||||
|
||||
@@ -440,8 +440,7 @@ adjustiptr(IndexScanDesc s,
|
||||
else
|
||||
{
|
||||
/*
|
||||
* remember that we're before the current
|
||||
* tuple
|
||||
* remember that we're before the current tuple
|
||||
*/
|
||||
ItemPointerSet(iptr, blkno, FirstOffsetNumber);
|
||||
if (iptr == &(s->currentItemData))
|
||||
|
||||
@@ -24,7 +24,7 @@
|
||||
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* $PostgreSQL: pgsql/src/backend/access/transam/clog.c,v 1.32 2005/08/20 23:26:08 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/access/transam/clog.c,v 1.33 2005/10/15 02:49:09 momjian Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@@ -222,14 +222,14 @@ StartupCLOG(void)
|
||||
/*
|
||||
* Zero out the remainder of the current clog page. Under normal
|
||||
* circumstances it should be zeroes already, but it seems at least
|
||||
* theoretically possible that XLOG replay will have settled on a
|
||||
* nextXID value that is less than the last XID actually used and
|
||||
* marked by the previous database lifecycle (since subtransaction
|
||||
* commit writes clog but makes no WAL entry). Let's just be safe.
|
||||
* (We need not worry about pages beyond the current one, since those
|
||||
* will be zeroed when first used. For the same reason, there is no
|
||||
* need to do anything when nextXid is exactly at a page boundary; and
|
||||
* it's likely that the "current" page doesn't exist yet in that case.)
|
||||
* theoretically possible that XLOG replay will have settled on a nextXID
|
||||
* value that is less than the last XID actually used and marked by the
|
||||
* previous database lifecycle (since subtransaction commit writes clog
|
||||
* but makes no WAL entry). Let's just be safe. (We need not worry about
|
||||
* pages beyond the current one, since those will be zeroed when first
|
||||
* used. For the same reason, there is no need to do anything when
|
||||
* nextXid is exactly at a page boundary; and it's likely that the
|
||||
* "current" page doesn't exist yet in that case.)
|
||||
*/
|
||||
if (TransactionIdToPgIndex(xid) != 0)
|
||||
{
|
||||
@@ -325,8 +325,8 @@ TruncateCLOG(TransactionId oldestXact)
|
||||
int cutoffPage;
|
||||
|
||||
/*
|
||||
* The cutoff point is the start of the segment containing oldestXact.
|
||||
* We pass the *page* containing oldestXact to SimpleLruTruncate.
|
||||
* The cutoff point is the start of the segment containing oldestXact. We
|
||||
* pass the *page* containing oldestXact to SimpleLruTruncate.
|
||||
*/
|
||||
cutoffPage = TransactionIdToPage(oldestXact);
|
||||
|
||||
|
||||
@@ -4,15 +4,15 @@
* PostgreSQL multi-transaction-log manager
*
* The pg_multixact manager is a pg_clog-like manager that stores an array
* of TransactionIds for each MultiXactId. It is a fundamental part of the
* shared-row-lock implementation. A share-locked tuple stores a
* of TransactionIds for each MultiXactId. It is a fundamental part of the
* shared-row-lock implementation. A share-locked tuple stores a
* MultiXactId in its Xmax, and a transaction that needs to wait for the
* tuple to be unlocked can sleep on the potentially-several TransactionIds
* that compose the MultiXactId.
*
* We use two SLRU areas, one for storing the offsets at which the data
* starts for each MultiXactId in the other one. This trick allows us to
* store variable length arrays of TransactionIds. (We could alternatively
* store variable length arrays of TransactionIds. (We could alternatively
* use one area containing counts and TransactionIds, with valid MultiXactId
* values pointing at slots containing counts; but that way seems less robust
* since it would get completely confused if someone inquired about a bogus
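[Editor's aside: the offsets-plus-members layout can be illustrated with plain arrays standing in for the two SLRU areas. This is a deliberately simplified sketch; in the real code the end of a member list is found from the next MultiXactId's offset or the global next-offset counter, which the extra trailing entry stands in for here.]

#include <stdio.h>

int main(void)
{
    /* members of id 0: {100,101}; id 1: {102,103,104}; id 2: {105} */
    int members[] = {100, 101, 102, 103, 104, 105};
    int offsets[] = {0, 2, 5, 6};   /* one extra entry marks the end */
    int nids = 3;

    for (int id = 0; id < nids; id++)
    {
        printf("id %d:", id);
        for (int k = offsets[id]; k < offsets[id + 1]; k++)
            printf(" %d", members[k]);   /* no per-entry count is stored */
        printf("\n");
    }
    return 0;
}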
@@ -32,7 +32,7 @@
|
||||
*
|
||||
* Like clog.c, and unlike subtrans.c, we have to preserve state across
|
||||
* crashes and ensure that MXID and offset numbering increases monotonically
|
||||
* across a crash. We do this in the same way as it's done for transaction
|
||||
* across a crash. We do this in the same way as it's done for transaction
|
||||
* IDs: the WAL record is guaranteed to contain evidence of every MXID we
|
||||
* could need to worry about, and we just make sure that at the end of
|
||||
* replay, the next-MXID and next-offset counters are at least as large as
|
||||
@@ -42,7 +42,7 @@
|
||||
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* $PostgreSQL: pgsql/src/backend/access/transam/multixact.c,v 1.8 2005/08/20 23:26:08 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/access/transam/multixact.c,v 1.9 2005/10/15 02:49:09 momjian Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@@ -59,13 +59,13 @@
|
||||
|
||||
|
||||
/*
|
||||
* Defines for MultiXactOffset page sizes. A page is the same BLCKSZ as is
|
||||
* Defines for MultiXactOffset page sizes. A page is the same BLCKSZ as is
|
||||
* used everywhere else in Postgres.
|
||||
*
|
||||
* Note: because both MultiXactOffsets and TransactionIds are 32 bits and
|
||||
* wrap around at 0xFFFFFFFF, MultiXact page numbering also wraps around at
|
||||
* 0xFFFFFFFF/MULTIXACT_*_PER_PAGE, and segment numbering at
|
||||
* 0xFFFFFFFF/MULTIXACT_*_PER_PAGE/SLRU_SEGMENTS_PER_PAGE. We need take no
|
||||
* 0xFFFFFFFF/MULTIXACT_*_PER_PAGE/SLRU_SEGMENTS_PER_PAGE. We need take no
|
||||
* explicit notice of that fact in this module, except when comparing segment
|
||||
* and page numbers in TruncateMultiXact
|
||||
* (see MultiXact{Offset,Member}PagePrecedes).
|
||||
@@ -92,11 +92,11 @@
|
||||
static SlruCtlData MultiXactOffsetCtlData;
|
||||
static SlruCtlData MultiXactMemberCtlData;
|
||||
|
||||
#define MultiXactOffsetCtl (&MultiXactOffsetCtlData)
|
||||
#define MultiXactMemberCtl (&MultiXactMemberCtlData)
|
||||
#define MultiXactOffsetCtl (&MultiXactOffsetCtlData)
|
||||
#define MultiXactMemberCtl (&MultiXactMemberCtlData)
|
||||
|
||||
/*
|
||||
* MultiXact state shared across all backends. All this state is protected
|
||||
* MultiXact state shared across all backends. All this state is protected
|
||||
* by MultiXactGenLock. (We also use MultiXactOffsetControlLock and
|
||||
* MultiXactMemberControlLock to guard accesses to the two sets of SLRU
|
||||
* buffers. For concurrency's sake, we avoid holding more than one of these
|
||||
@@ -105,50 +105,48 @@ static SlruCtlData MultiXactMemberCtlData;
|
||||
typedef struct MultiXactStateData
|
||||
{
|
||||
/* next-to-be-assigned MultiXactId */
|
||||
MultiXactId nextMXact;
|
||||
MultiXactId nextMXact;
|
||||
|
||||
/* next-to-be-assigned offset */
|
||||
MultiXactOffset nextOffset;
|
||||
MultiXactOffset nextOffset;
|
||||
|
||||
/* the Offset SLRU area was last truncated at this MultiXactId */
|
||||
MultiXactId lastTruncationPoint;
|
||||
MultiXactId lastTruncationPoint;
|
||||
|
||||
/*
|
||||
* Per-backend data starts here. We have two arrays stored in
|
||||
* the area immediately following the MultiXactStateData struct.
|
||||
* Each is indexed by BackendId. (Note: valid BackendIds run from 1 to
|
||||
* MaxBackends; element zero of each array is never used.)
|
||||
* Per-backend data starts here. We have two arrays stored in the area
|
||||
* immediately following the MultiXactStateData struct. Each is indexed by
|
||||
* BackendId. (Note: valid BackendIds run from 1 to MaxBackends; element
|
||||
* zero of each array is never used.)
|
||||
*
|
||||
* OldestMemberMXactId[k] is the oldest MultiXactId each backend's
|
||||
* current transaction(s) could possibly be a member of, or
|
||||
* InvalidMultiXactId when the backend has no live transaction that
|
||||
* could possibly be a member of a MultiXact. Each backend sets its
|
||||
* entry to the current nextMXact counter just before first acquiring a
|
||||
* shared lock in a given transaction, and clears it at transaction end.
|
||||
* (This works because only during or after acquiring a shared lock
|
||||
* could an XID possibly become a member of a MultiXact, and that
|
||||
* MultiXact would have to be created during or after the lock
|
||||
* acquisition.)
|
||||
* OldestMemberMXactId[k] is the oldest MultiXactId each backend's current
|
||||
* transaction(s) could possibly be a member of, or InvalidMultiXactId
|
||||
* when the backend has no live transaction that could possibly be a
|
||||
* member of a MultiXact. Each backend sets its entry to the current
|
||||
* nextMXact counter just before first acquiring a shared lock in a given
|
||||
* transaction, and clears it at transaction end. (This works because only
|
||||
* during or after acquiring a shared lock could an XID possibly become a
|
||||
* member of a MultiXact, and that MultiXact would have to be created
|
||||
* during or after the lock acquisition.)
|
||||
*
|
||||
* OldestVisibleMXactId[k] is the oldest MultiXactId each backend's
|
||||
* current transaction(s) think is potentially live, or InvalidMultiXactId
|
||||
* when not in a transaction or not in a transaction that's paid any
|
||||
* attention to MultiXacts yet. This is computed when first needed in
|
||||
* a given transaction, and cleared at transaction end. We can compute
|
||||
* it as the minimum of the valid OldestMemberMXactId[] entries at the
|
||||
* time we compute it (using nextMXact if none are valid). Each backend
|
||||
* is required not to attempt to access any SLRU data for MultiXactIds
|
||||
* older than its own OldestVisibleMXactId[] setting; this is necessary
|
||||
* because the checkpointer could truncate away such data at any instant.
|
||||
* OldestVisibleMXactId[k] is the oldest MultiXactId each backend's current
|
||||
* transaction(s) think is potentially live, or InvalidMultiXactId when
|
||||
* not in a transaction or not in a transaction that's paid any attention
|
||||
* to MultiXacts yet. This is computed when first needed in a given
|
||||
* transaction, and cleared at transaction end. We can compute it as the
|
||||
* minimum of the valid OldestMemberMXactId[] entries at the time we
|
||||
* compute it (using nextMXact if none are valid). Each backend is
|
||||
* required not to attempt to access any SLRU data for MultiXactIds older
|
||||
* than its own OldestVisibleMXactId[] setting; this is necessary because
|
||||
* the checkpointer could truncate away such data at any instant.
|
||||
*
|
||||
* The checkpointer can compute the safe truncation point as the oldest
|
||||
* valid value among all the OldestMemberMXactId[] and
|
||||
* OldestVisibleMXactId[] entries, or nextMXact if none are valid.
|
||||
* Clearly, it is not possible for any later-computed OldestVisibleMXactId
|
||||
* value to be older than this, and so there is no risk of truncating
|
||||
* data that is still needed.
|
||||
* The checkpointer can compute the safe truncation point as the oldest valid
|
||||
* value among all the OldestMemberMXactId[] and OldestVisibleMXactId[]
|
||||
* entries, or nextMXact if none are valid. Clearly, it is not possible
|
||||
* for any later-computed OldestVisibleMXactId value to be older than
|
||||
* this, and so there is no risk of truncating data that is still needed.
|
||||
*/
|
||||
MultiXactId perBackendXactIds[1]; /* VARIABLE LENGTH ARRAY */
|
||||
MultiXactId perBackendXactIds[1]; /* VARIABLE LENGTH ARRAY */
|
||||
} MultiXactStateData;
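For illustration only, a compressed sketch of how the two per-backend arrays described in the comment above are carved out of the flexible array member at the end of the struct: OldestVisibleMXactId simply begins MaxBackends entries after OldestMemberMXactId, and element 0 of each array is never used. The MaxBackends value and the allocation size here are placeholders, not the backend's shared-memory sizing.

#include <stdlib.h>

typedef unsigned int MultiXactId;

typedef struct
{
    MultiXactId nextMXact;
    MultiXactId perBackendXactIds[1];   /* VARIABLE LENGTH ARRAY */
} MultiXactStateSketch;

int
main(void)
{
    int         MaxBackends = 100;      /* placeholder setting */
    MultiXactStateSketch *state;
    MultiXactId *OldestMemberMXactId;
    MultiXactId *OldestVisibleMXactId;

    /* generous allocation: two arrays, element 0 of each never used */
    state = calloc(1, sizeof(MultiXactStateSketch) +
                   2 * (MaxBackends + 1) * sizeof(MultiXactId));
    if (state == NULL)
        return 1;

    OldestMemberMXactId = state->perBackendXactIds;
    OldestVisibleMXactId = OldestMemberMXactId + MaxBackends;

    OldestMemberMXactId[1] = 42;            /* entry for BackendId 1 */
    OldestVisibleMXactId[MaxBackends] = 42; /* last valid BackendId */

    free(state);
    return 0;
}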
/* Pointers to the state data in shared memory */
|
||||
@@ -176,13 +174,13 @@ static MultiXactId *OldestVisibleMXactId;
|
||||
typedef struct mXactCacheEnt
|
||||
{
|
||||
struct mXactCacheEnt *next;
|
||||
MultiXactId multi;
|
||||
int nxids;
|
||||
TransactionId xids[1]; /* VARIABLE LENGTH ARRAY */
|
||||
MultiXactId multi;
|
||||
int nxids;
|
||||
TransactionId xids[1]; /* VARIABLE LENGTH ARRAY */
|
||||
} mXactCacheEnt;
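A hypothetical, self-contained miniature of the cache these fields describe: a singly linked list of entries keyed by MultiXactId, searched the way mXactCacheGetById does later in this diff. The fixed-size xids array stands in for the real variable-length member purely to keep the sketch short.

#include <stdio.h>

typedef unsigned int MultiXactId;
typedef unsigned int TransactionId;

typedef struct CacheEnt
{
    struct CacheEnt *next;
    MultiXactId multi;
    int         nxids;
    TransactionId xids[4];      /* fixed size, unlike the real entry */
} CacheEnt;

static int
cache_get_by_id(CacheEnt *cache, MultiXactId multi, TransactionId **xids)
{
    CacheEnt   *entry;

    for (entry = cache; entry != NULL; entry = entry->next)
    {
        if (entry->multi == multi)
        {
            *xids = entry->xids;
            return entry->nxids;    /* number of members */
        }
    }
    return -1;                      /* not in cache */
}

int
main(void)
{
    CacheEnt    e2 = {NULL, 8, 2, {100, 101}};
    CacheEnt    e1 = {&e2, 7, 1, {99}};
    TransactionId *xids;
    int         n = cache_get_by_id(&e1, 8, &xids);

    printf("multi 8 has %d members, first xid %u\n", n, xids[0]);
    return 0;
}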
static mXactCacheEnt *MXactCache = NULL;
|
||||
static MemoryContext MXactContext = NULL;
|
||||
static mXactCacheEnt *MXactCache = NULL;
|
||||
static MemoryContext MXactContext = NULL;
|
||||
|
||||
|
||||
#ifdef MULTIXACT_DEBUG
|
||||
@@ -201,14 +199,15 @@ static MemoryContext MXactContext = NULL;
|
||||
static void MultiXactIdSetOldestVisible(void);
|
||||
static MultiXactId CreateMultiXactId(int nxids, TransactionId *xids);
|
||||
static void RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
|
||||
int nxids, TransactionId *xids);
|
||||
int nxids, TransactionId *xids);
|
||||
static MultiXactId GetNewMultiXactId(int nxids, MultiXactOffset *offset);
|
||||
|
||||
/* MultiXact cache management */
|
||||
static MultiXactId mXactCacheGetBySet(int nxids, TransactionId *xids);
|
||||
static int mXactCacheGetById(MultiXactId multi, TransactionId **xids);
|
||||
static int mXactCacheGetById(MultiXactId multi, TransactionId **xids);
|
||||
static void mXactCachePut(MultiXactId multi, int nxids, TransactionId *xids);
|
||||
static int xidComparator(const void *arg1, const void *arg2);
|
||||
static int xidComparator(const void *arg1, const void *arg2);
|
||||
|
||||
#ifdef MULTIXACT_DEBUG
|
||||
static char *mxid_to_string(MultiXactId multi, int nxids, TransactionId *xids);
|
||||
#endif
|
||||
@@ -220,7 +219,7 @@ static bool MultiXactOffsetPagePrecedes(int page1, int page2);
|
||||
static bool MultiXactMemberPagePrecedes(int page1, int page2);
|
||||
static bool MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2);
|
||||
static bool MultiXactOffsetPrecedes(MultiXactOffset offset1,
|
||||
MultiXactOffset offset2);
|
||||
MultiXactOffset offset2);
|
||||
static void ExtendMultiXactOffset(MultiXactId multi);
|
||||
static void ExtendMultiXactMember(MultiXactOffset offset, int nmembers);
|
||||
static void TruncateMultiXact(void);
|
||||
@@ -239,8 +238,8 @@ static void WriteMZeroPageXlogRec(int pageno, uint8 info);
|
||||
MultiXactId
|
||||
MultiXactIdCreate(TransactionId xid1, TransactionId xid2)
|
||||
{
|
||||
MultiXactId newMulti;
|
||||
TransactionId xids[2];
|
||||
MultiXactId newMulti;
|
||||
TransactionId xids[2];
|
||||
|
||||
AssertArg(TransactionIdIsValid(xid1));
|
||||
AssertArg(TransactionIdIsValid(xid2));
|
||||
@@ -248,9 +247,9 @@ MultiXactIdCreate(TransactionId xid1, TransactionId xid2)
|
||||
Assert(!TransactionIdEquals(xid1, xid2));
|
||||
|
||||
/*
|
||||
* Note: unlike MultiXactIdExpand, we don't bother to check that both
|
||||
* XIDs are still running. In typical usage, xid2 will be our own XID
|
||||
* and the caller just did a check on xid1, so it'd be wasted effort.
|
||||
* Note: unlike MultiXactIdExpand, we don't bother to check that both XIDs
|
||||
* are still running. In typical usage, xid2 will be our own XID and the
|
||||
* caller just did a check on xid1, so it'd be wasted effort.
|
||||
*/
|
||||
|
||||
xids[0] = xid1;
|
||||
@@ -281,12 +280,12 @@ MultiXactIdCreate(TransactionId xid1, TransactionId xid2)
|
||||
MultiXactId
|
||||
MultiXactIdExpand(MultiXactId multi, TransactionId xid)
|
||||
{
|
||||
MultiXactId newMulti;
|
||||
TransactionId *members;
|
||||
TransactionId *newMembers;
|
||||
int nmembers;
|
||||
int i;
|
||||
int j;
|
||||
MultiXactId newMulti;
|
||||
TransactionId *members;
|
||||
TransactionId *newMembers;
|
||||
int nmembers;
|
||||
int i;
|
||||
int j;
|
||||
|
||||
AssertArg(MultiXactIdIsValid(multi));
|
||||
AssertArg(TransactionIdIsValid(xid));
|
||||
@@ -313,8 +312,8 @@ MultiXactIdExpand(MultiXactId multi, TransactionId xid)
|
||||
}
|
||||
|
||||
/*
|
||||
* If the TransactionId is already a member of the MultiXactId,
|
||||
* just return the existing MultiXactId.
|
||||
* If the TransactionId is already a member of the MultiXactId, just
|
||||
* return the existing MultiXactId.
|
||||
*/
|
||||
for (i = 0; i < nmembers; i++)
|
||||
{
|
||||
@@ -329,9 +328,9 @@ MultiXactIdExpand(MultiXactId multi, TransactionId xid)
|
||||
|
||||
/*
|
||||
* Determine which of the members of the MultiXactId are still running,
|
||||
* and use them to create a new one. (Removing dead members is just
|
||||
* an optimization, but a useful one. Note we have the same race
|
||||
* condition here as above: j could be 0 at the end of the loop.)
|
||||
* and use them to create a new one. (Removing dead members is just an
|
||||
* optimization, but a useful one. Note we have the same race condition
|
||||
* here as above: j could be 0 at the end of the loop.)
|
||||
*/
|
||||
newMembers = (TransactionId *)
|
||||
palloc(sizeof(TransactionId) * (nmembers + 1));
|
||||
@@ -355,7 +354,7 @@ MultiXactIdExpand(MultiXactId multi, TransactionId xid)
|
||||
|
||||
/*
|
||||
* MultiXactIdIsRunning
|
||||
* Returns whether a MultiXactId is "running".
|
||||
* Returns whether a MultiXactId is "running".
|
||||
*
|
||||
* We return true if at least one member of the given MultiXactId is still
|
||||
* running. Note that a "false" result is certain not to change,
|
||||
@@ -365,9 +364,9 @@ bool
|
||||
MultiXactIdIsRunning(MultiXactId multi)
|
||||
{
|
||||
TransactionId *members;
|
||||
TransactionId myXid;
|
||||
int nmembers;
|
||||
int i;
|
||||
TransactionId myXid;
|
||||
int nmembers;
|
||||
int i;
|
||||
|
||||
debug_elog3(DEBUG2, "IsRunning %u?", multi);
|
||||
|
||||
@@ -394,7 +393,7 @@ MultiXactIdIsRunning(MultiXactId multi)
|
||||
|
||||
/*
|
||||
* This could be made faster by having another entry point in procarray.c,
|
||||
* walking the PGPROC array only once for all the members. But in most
|
||||
* walking the PGPROC array only once for all the members. But in most
|
||||
* cases nmembers should be small enough that it doesn't much matter.
|
||||
*/
|
||||
for (i = 0; i < nmembers; i++)
|
||||
@@ -436,19 +435,19 @@ MultiXactIdSetOldestMember(void)
|
||||
|
||||
/*
|
||||
* You might think we don't need to acquire a lock here, since
|
||||
* fetching and storing of TransactionIds is probably atomic,
|
||||
* but in fact we do: suppose we pick up nextMXact and then
|
||||
* lose the CPU for a long time. Someone else could advance
|
||||
* nextMXact, and then another someone else could compute an
|
||||
* OldestVisibleMXactId that would be after the value we are
|
||||
* going to store when we get control back. Which would be wrong.
|
||||
* fetching and storing of TransactionIds is probably atomic, but in
|
||||
* fact we do: suppose we pick up nextMXact and then lose the CPU for
|
||||
* a long time. Someone else could advance nextMXact, and then
|
||||
* another someone else could compute an OldestVisibleMXactId that
|
||||
* would be after the value we are going to store when we get control
|
||||
* back. Which would be wrong.
|
||||
*/
|
||||
LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
|
||||
|
||||
/*
|
||||
* We have to beware of the possibility that nextMXact is in the
|
||||
* wrapped-around state. We don't fix the counter itself here,
|
||||
* but we must be sure to store a valid value in our array entry.
|
||||
* wrapped-around state. We don't fix the counter itself here, but we
|
||||
* must be sure to store a valid value in our array entry.
|
||||
*/
|
||||
nextMXact = MultiXactState->nextMXact;
|
||||
if (nextMXact < FirstMultiXactId)
|
||||
@@ -475,7 +474,7 @@ MultiXactIdSetOldestMember(void)
|
||||
* The value to set is the oldest of nextMXact and all the valid per-backend
|
||||
* OldestMemberMXactId[] entries. Because of the locking we do, we can be
|
||||
* certain that no subsequent call to MultiXactIdSetOldestMember can set
|
||||
* an OldestMemberMXactId[] entry older than what we compute here. Therefore
|
||||
* an OldestMemberMXactId[] entry older than what we compute here. Therefore
|
||||
* there is no live transaction, now or later, that can be a member of any
|
||||
* MultiXactId older than the OldestVisibleMXactId we compute here.
|
||||
*/
|
||||
@@ -485,14 +484,14 @@ MultiXactIdSetOldestVisible(void)
|
||||
if (!MultiXactIdIsValid(OldestVisibleMXactId[MyBackendId]))
|
||||
{
|
||||
MultiXactId oldestMXact;
|
||||
int i;
|
||||
int i;
|
||||
|
||||
LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
|
||||
|
||||
/*
|
||||
* We have to beware of the possibility that nextMXact is in the
|
||||
* wrapped-around state. We don't fix the counter itself here,
|
||||
* but we must be sure to store a valid value in our array entry.
|
||||
* wrapped-around state. We don't fix the counter itself here, but we
|
||||
* must be sure to store a valid value in our array entry.
|
||||
*/
|
||||
oldestMXact = MultiXactState->nextMXact;
|
||||
if (oldestMXact < FirstMultiXactId)
|
||||
@@ -535,17 +534,17 @@ void
|
||||
MultiXactIdWait(MultiXactId multi)
|
||||
{
|
||||
TransactionId *members;
|
||||
int nmembers;
|
||||
int nmembers;
|
||||
|
||||
nmembers = GetMultiXactIdMembers(multi, &members);
|
||||
|
||||
if (nmembers >= 0)
|
||||
{
|
||||
int i;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < nmembers; i++)
|
||||
{
|
||||
TransactionId member = members[i];
|
||||
TransactionId member = members[i];
|
||||
|
||||
debug_elog4(DEBUG2, "MultiXactIdWait: waiting for %d (%u)",
|
||||
i, member);
|
||||
@@ -564,19 +563,19 @@ MultiXactIdWait(MultiXactId multi)
|
||||
bool
|
||||
ConditionalMultiXactIdWait(MultiXactId multi)
|
||||
{
|
||||
bool result = true;
|
||||
bool result = true;
|
||||
TransactionId *members;
|
||||
int nmembers;
|
||||
int nmembers;
|
||||
|
||||
nmembers = GetMultiXactIdMembers(multi, &members);
|
||||
|
||||
if (nmembers >= 0)
|
||||
{
|
||||
int i;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < nmembers; i++)
|
||||
{
|
||||
TransactionId member = members[i];
|
||||
TransactionId member = members[i];
|
||||
|
||||
debug_elog4(DEBUG2, "ConditionalMultiXactIdWait: trying %d (%u)",
|
||||
i, member);
|
||||
@@ -596,7 +595,7 @@ ConditionalMultiXactIdWait(MultiXactId multi)
|
||||
|
||||
/*
|
||||
* CreateMultiXactId
|
||||
* Make a new MultiXactId
|
||||
* Make a new MultiXactId
|
||||
*
|
||||
* Make XLOG, SLRU and cache entries for a new MultiXactId, recording the
|
||||
* given TransactionIds as members. Returns the newly created MultiXactId.
|
||||
@@ -606,7 +605,7 @@ ConditionalMultiXactIdWait(MultiXactId multi)
|
||||
static MultiXactId
|
||||
CreateMultiXactId(int nxids, TransactionId *xids)
|
||||
{
|
||||
MultiXactId multi;
|
||||
MultiXactId multi;
|
||||
MultiXactOffset offset;
|
||||
XLogRecData rdata[2];
|
||||
xl_multixact_create xlrec;
|
||||
@@ -641,15 +640,15 @@ CreateMultiXactId(int nxids, TransactionId *xids)
|
||||
/*
|
||||
* Make an XLOG entry describing the new MXID.
|
||||
*
|
||||
* Note: we need not flush this XLOG entry to disk before proceeding.
|
||||
* The only way for the MXID to be referenced from any data page is
|
||||
* for heap_lock_tuple() to have put it there, and heap_lock_tuple()
|
||||
* generates an XLOG record that must follow ours. The normal LSN
|
||||
* interlock between the data page and that XLOG record will ensure
|
||||
* that our XLOG record reaches disk first. If the SLRU members/offsets
|
||||
* data reaches disk sooner than the XLOG record, we do not care because
|
||||
* we'll overwrite it with zeroes unless the XLOG record is there too;
|
||||
* see notes at top of this file.
|
||||
* Note: we need not flush this XLOG entry to disk before proceeding. The
|
||||
* only way for the MXID to be referenced from any data page is for
|
||||
* heap_lock_tuple() to have put it there, and heap_lock_tuple() generates
|
||||
* an XLOG record that must follow ours. The normal LSN interlock between
|
||||
* the data page and that XLOG record will ensure that our XLOG record
|
||||
* reaches disk first. If the SLRU members/offsets data reaches disk
|
||||
* sooner than the XLOG record, we do not care because we'll overwrite it
|
||||
* with zeroes unless the XLOG record is there too; see notes at top of
|
||||
* this file.
|
||||
*/
|
||||
xlrec.mid = multi;
|
||||
xlrec.moff = offset;
|
||||
@@ -702,9 +701,9 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
|
||||
/*
|
||||
* Note: we pass the MultiXactId to SimpleLruReadPage as the "transaction"
|
||||
* to complain about if there's any I/O error. This is kinda bogus, but
|
||||
* since the errors will always give the full pathname, it should be
|
||||
* clear enough that a MultiXactId is really involved. Perhaps someday
|
||||
* we'll take the trouble to generalize the slru.c error reporting code.
|
||||
* since the errors will always give the full pathname, it should be clear
|
||||
* enough that a MultiXactId is really involved. Perhaps someday we'll
|
||||
* take the trouble to generalize the slru.c error reporting code.
|
||||
*/
|
||||
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, multi);
|
||||
offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
|
||||
@@ -750,7 +749,7 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
|
||||
* GetNewMultiXactId
|
||||
* Get the next MultiXactId.
|
||||
*
|
||||
* Also, reserve the needed amount of space in the "members" area. The
|
||||
* Also, reserve the needed amount of space in the "members" area. The
|
||||
* starting offset of the reserved space is returned in *offset.
|
||||
*
|
||||
* This may generate XLOG records for expansion of the offsets and/or members
|
||||
@@ -761,7 +760,7 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
|
||||
static MultiXactId
|
||||
GetNewMultiXactId(int nxids, MultiXactOffset *offset)
|
||||
{
|
||||
MultiXactId result;
|
||||
MultiXactId result;
|
||||
|
||||
debug_elog3(DEBUG2, "GetNew: for %d xids", nxids);
|
||||
|
||||
@@ -785,8 +784,8 @@ GetNewMultiXactId(int nxids, MultiXactOffset *offset)
|
||||
* Advance counter. As in GetNewTransactionId(), this must not happen
|
||||
* until after ExtendMultiXactOffset has succeeded!
|
||||
*
|
||||
* We don't care about MultiXactId wraparound here; it will be handled by
|
||||
* the next iteration. But note that nextMXact may be InvalidMultiXactId
|
||||
* We don't care about MultiXactId wraparound here; it will be handled by the
|
||||
* next iteration. But note that nextMXact may be InvalidMultiXactId
|
||||
* after this routine exits, so anyone else looking at the variable must
|
||||
* be prepared to deal with that.
|
||||
*/
|
||||
@@ -809,7 +808,7 @@ GetNewMultiXactId(int nxids, MultiXactOffset *offset)
|
||||
|
||||
/*
|
||||
* GetMultiXactIdMembers
|
||||
* Returns the set of TransactionIds that make up a MultiXactId
|
||||
* Returns the set of TransactionIds that make up a MultiXactId
|
||||
*
|
||||
* We return -1 if the MultiXactId is too old to possibly have any members
|
||||
* still running; in that case we have not actually looked them up, and
|
||||
@@ -822,13 +821,13 @@ GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids)
|
||||
int prev_pageno;
|
||||
int entryno;
|
||||
int slotno;
|
||||
MultiXactOffset *offptr;
|
||||
MultiXactOffset offset;
|
||||
MultiXactOffset *offptr;
|
||||
MultiXactOffset offset;
|
||||
int length;
|
||||
int i;
|
||||
MultiXactId nextMXact;
|
||||
MultiXactId tmpMXact;
|
||||
MultiXactOffset nextOffset;
|
||||
MultiXactId nextMXact;
|
||||
MultiXactId tmpMXact;
|
||||
MultiXactOffset nextOffset;
|
||||
TransactionId *ptr;
|
||||
|
||||
debug_elog3(DEBUG2, "GetMembers: asked for %u", multi);
|
||||
@@ -850,13 +849,13 @@ GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids)
|
||||
/*
|
||||
* We check known limits on MultiXact before resorting to the SLRU area.
|
||||
*
|
||||
* An ID older than our OldestVisibleMXactId[] entry can't possibly still
|
||||
* be running, and we'd run the risk of trying to read already-truncated
|
||||
* SLRU data if we did try to examine it.
|
||||
* An ID older than our OldestVisibleMXactId[] entry can't possibly still be
|
||||
* running, and we'd run the risk of trying to read already-truncated SLRU
|
||||
* data if we did try to examine it.
|
||||
*
|
||||
* Conversely, an ID >= nextMXact shouldn't ever be seen here; if it is
|
||||
* seen, it implies undetected ID wraparound has occurred. We just
|
||||
* silently assume that such an ID is no longer running.
|
||||
* Conversely, an ID >= nextMXact shouldn't ever be seen here; if it is seen,
|
||||
* it implies undetected ID wraparound has occurred. We just silently
|
||||
* assume that such an ID is no longer running.
|
||||
*
|
||||
* Shared lock is enough here since we aren't modifying any global state.
|
||||
* Also, we can examine our own OldestVisibleMXactId without the lock,
|
||||
@@ -880,9 +879,9 @@ GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids)
|
||||
}
|
||||
|
||||
/*
|
||||
* Before releasing the lock, save the current counter values, because
|
||||
* the target MultiXactId may be just one less than nextMXact. We will
|
||||
* need to use nextOffset as the endpoint if so.
|
||||
* Before releasing the lock, save the current counter values, because the
|
||||
* target MultiXactId may be just one less than nextMXact. We will need
|
||||
* to use nextOffset as the endpoint if so.
|
||||
*/
|
||||
nextMXact = MultiXactState->nextMXact;
|
||||
nextOffset = MultiXactState->nextOffset;
|
||||
@@ -902,11 +901,11 @@ GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids)
|
||||
|
||||
/*
|
||||
* How many members do we need to read? If we are at the end of the
|
||||
* assigned MultiXactIds, use the offset just saved above. Else we
|
||||
* need to check the MultiXactId following ours.
|
||||
* assigned MultiXactIds, use the offset just saved above. Else we need
|
||||
* to check the MultiXactId following ours.
|
||||
*
|
||||
* Use the same increment rule as GetNewMultiXactId(), that is, don't
|
||||
* handle wraparound explicitly until needed.
|
||||
* Use the same increment rule as GetNewMultiXactId(), that is, don't handle
|
||||
* wraparound explicitly until needed.
|
||||
*/
|
||||
tmpMXact = multi + 1;
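As a rough illustration of the length computation described in the comment above: a MultiXactId's members run from its own start offset up to the start offset of the following MultiXactId, and for the most recently assigned one the saved nextOffset is used as the endpoint instead. The arrays and values below are invented for the example.

#include <stdio.h>

typedef unsigned int MultiXactId;
typedef unsigned int MultiXactOffset;

int
main(void)
{
    /* start offsets indexed by MultiXactId; invented values */
    MultiXactOffset startOffset[] = {0, 0, 3, 5, 9};
    MultiXactId nextMXact = 5;          /* next unassigned MultiXactId */
    MultiXactOffset nextOffset = 12;    /* next unassigned member slot */
    MultiXactId multi = 4;

    MultiXactOffset end = (multi + 1 == nextMXact) ?
        nextOffset : startOffset[multi + 1];
    int         length = (int) (end - startOffset[multi]);

    printf("multi %u has %d member(s)\n", multi, length);
    return 0;
}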
@@ -974,9 +973,9 @@ GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids)
|
||||
|
||||
/*
|
||||
* mXactCacheGetBySet
|
||||
* returns a MultiXactId from the cache based on the set of
|
||||
* TransactionIds that compose it, or InvalidMultiXactId if
|
||||
* none matches.
|
||||
* returns a MultiXactId from the cache based on the set of
|
||||
* TransactionIds that compose it, or InvalidMultiXactId if
|
||||
* none matches.
|
||||
*
|
||||
* This is helpful, for example, if two transactions want to lock a huge
|
||||
* table. By using the cache, the second will use the same MultiXactId
|
||||
@@ -988,7 +987,7 @@ GetMultiXactIdMembers(MultiXactId multi, TransactionId **xids)
|
||||
static MultiXactId
|
||||
mXactCacheGetBySet(int nxids, TransactionId *xids)
|
||||
{
|
||||
mXactCacheEnt *entry;
|
||||
mXactCacheEnt *entry;
|
||||
|
||||
debug_elog3(DEBUG2, "CacheGet: looking for %s",
|
||||
mxid_to_string(InvalidMultiXactId, nxids, xids));
|
||||
@@ -1015,8 +1014,8 @@ mXactCacheGetBySet(int nxids, TransactionId *xids)
|
||||
|
||||
/*
|
||||
* mXactCacheGetById
|
||||
* returns the composing TransactionId set from the cache for a
|
||||
* given MultiXactId, if present.
|
||||
* returns the composing TransactionId set from the cache for a
|
||||
* given MultiXactId, if present.
|
||||
*
|
||||
* If successful, *xids is set to the address of a palloc'd copy of the
|
||||
* TransactionId set. Return value is number of members, or -1 on failure.
|
||||
@@ -1024,7 +1023,7 @@ mXactCacheGetBySet(int nxids, TransactionId *xids)
|
||||
static int
|
||||
mXactCacheGetById(MultiXactId multi, TransactionId **xids)
|
||||
{
|
||||
mXactCacheEnt *entry;
|
||||
mXactCacheEnt *entry;
|
||||
|
||||
debug_elog3(DEBUG2, "CacheGet: looking for %u", multi);
|
||||
|
||||
@@ -1032,7 +1031,7 @@ mXactCacheGetById(MultiXactId multi, TransactionId **xids)
|
||||
{
|
||||
if (entry->multi == multi)
|
||||
{
|
||||
TransactionId *ptr;
|
||||
TransactionId *ptr;
|
||||
Size size;
|
||||
|
||||
size = sizeof(TransactionId) * entry->nxids;
|
||||
@@ -1042,7 +1041,7 @@ mXactCacheGetById(MultiXactId multi, TransactionId **xids)
|
||||
memcpy(ptr, entry->xids, size);
|
||||
|
||||
debug_elog3(DEBUG2, "CacheGet: found %s",
|
||||
mxid_to_string(multi, entry->nxids, entry->xids));
|
||||
mxid_to_string(multi, entry->nxids, entry->xids));
|
||||
return entry->nxids;
|
||||
}
|
||||
}
|
||||
@@ -1053,12 +1052,12 @@ mXactCacheGetById(MultiXactId multi, TransactionId **xids)
|
||||
|
||||
/*
|
||||
* mXactCachePut
|
||||
* Add a new MultiXactId and its composing set into the local cache.
|
||||
* Add a new MultiXactId and its composing set into the local cache.
|
||||
*/
|
||||
static void
|
||||
mXactCachePut(MultiXactId multi, int nxids, TransactionId *xids)
|
||||
{
|
||||
mXactCacheEnt *entry;
|
||||
mXactCacheEnt *entry;
|
||||
|
||||
debug_elog3(DEBUG2, "CachePut: storing %s",
|
||||
mxid_to_string(multi, nxids, xids));
|
||||
@@ -1092,7 +1091,7 @@ mXactCachePut(MultiXactId multi, int nxids, TransactionId *xids)
|
||||
|
||||
/*
|
||||
* xidComparator
|
||||
* qsort comparison function for XIDs
|
||||
* qsort comparison function for XIDs
|
||||
*
|
||||
* We don't need to use wraparound comparison for XIDs, and indeed must
|
||||
* not do so since that does not respect the triangle inequality! Any
|
||||
@@ -1101,8 +1100,8 @@ mXactCachePut(MultiXactId multi, int nxids, TransactionId *xids)
|
||||
static int
|
||||
xidComparator(const void *arg1, const void *arg2)
|
||||
{
|
||||
TransactionId xid1 = * (const TransactionId *) arg1;
|
||||
TransactionId xid2 = * (const TransactionId *) arg2;
|
||||
TransactionId xid1 = *(const TransactionId *) arg1;
|
||||
TransactionId xid2 = *(const TransactionId *) arg2;
|
||||
|
||||
if (xid1 > xid2)
|
||||
return 1;
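For context, a small usage sketch of a plain-integer comparator of this shape with qsort; as the comment above notes, a wraparound-aware comparison would violate the transitivity qsort depends on, so straight unsigned comparison is the right choice when sorting XID arrays. The array contents are arbitrary.

#include <stdio.h>
#include <stdlib.h>

typedef unsigned int TransactionId;

static int
xid_cmp(const void *arg1, const void *arg2)
{
    TransactionId xid1 = *(const TransactionId *) arg1;
    TransactionId xid2 = *(const TransactionId *) arg2;

    if (xid1 > xid2)
        return 1;
    if (xid1 < xid2)
        return -1;
    return 0;
}

int
main(void)
{
    TransactionId xids[] = {900, 5, 4000000000u, 37};
    int         i;

    qsort(xids, 4, sizeof(TransactionId), xid_cmp);
    for (i = 0; i < 4; i++)
        printf("%u\n", xids[i]);
    return 0;
}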
@@ -1115,8 +1114,9 @@ xidComparator(const void *arg1, const void *arg2)
|
||||
static char *
|
||||
mxid_to_string(MultiXactId multi, int nxids, TransactionId *xids)
|
||||
{
|
||||
char *str = palloc(15 * (nxids + 1) + 4);
|
||||
int i;
|
||||
char *str = palloc(15 * (nxids + 1) + 4);
|
||||
int i;
|
||||
|
||||
snprintf(str, 47, "%u %d[%u", multi, nxids, xids[0]);
|
||||
|
||||
for (i = 1; i < nxids; i++)
|
||||
@@ -1137,18 +1137,18 @@ void
|
||||
AtEOXact_MultiXact(void)
|
||||
{
|
||||
/*
|
||||
* Reset our OldestMemberMXactId and OldestVisibleMXactId values,
|
||||
* both of which should only be valid while within a transaction.
|
||||
* Reset our OldestMemberMXactId and OldestVisibleMXactId values, both of
|
||||
* which should only be valid while within a transaction.
|
||||
*
|
||||
* We assume that storing a MultiXactId is atomic and so we need
|
||||
* not take MultiXactGenLock to do this.
|
||||
* We assume that storing a MultiXactId is atomic and so we need not take
|
||||
* MultiXactGenLock to do this.
|
||||
*/
|
||||
OldestMemberMXactId[MyBackendId] = InvalidMultiXactId;
|
||||
OldestVisibleMXactId[MyBackendId] = InvalidMultiXactId;
|
||||
|
||||
/*
|
||||
* Discard the local MultiXactId cache. Since MXactContext was created
|
||||
* as a child of TopTransactionContext, we needn't delete it explicitly.
|
||||
* Discard the local MultiXactId cache. Since MXactContext was created as
|
||||
* a child of TopTransactionContext, we needn't delete it explicitly.
|
||||
*/
|
||||
MXactContext = NULL;
|
||||
MXactCache = NULL;
|
||||
@@ -1156,7 +1156,7 @@ AtEOXact_MultiXact(void)
|
||||
|
||||
/*
|
||||
* Initialization of shared memory for MultiXact. We use two SLRU areas,
|
||||
* thus double memory. Also, reserve space for the shared MultiXactState
|
||||
* thus double memory. Also, reserve space for the shared MultiXactState
|
||||
* struct and the per-backend MultiXactId arrays (two of those, too).
|
||||
*/
|
||||
Size
|
||||
@@ -1178,7 +1178,7 @@ MultiXactShmemSize(void)
|
||||
void
|
||||
MultiXactShmemInit(void)
|
||||
{
|
||||
bool found;
|
||||
bool found;
|
||||
|
||||
debug_elog2(DEBUG2, "Shared Memory Init for MultiXact");
|
||||
|
||||
@@ -1205,8 +1205,8 @@ MultiXactShmemInit(void)
|
||||
Assert(found);
|
||||
|
||||
/*
|
||||
* Set up array pointers. Note that perBackendXactIds[0] is wasted
|
||||
* space since we only use indexes 1..MaxBackends in each array.
|
||||
* Set up array pointers. Note that perBackendXactIds[0] is wasted space
|
||||
* since we only use indexes 1..MaxBackends in each array.
|
||||
*/
|
||||
OldestMemberMXactId = MultiXactState->perBackendXactIds;
|
||||
OldestVisibleMXactId = OldestMemberMXactId + MaxBackends;
|
||||
@@ -1214,7 +1214,7 @@ MultiXactShmemInit(void)
|
||||
|
||||
/*
|
||||
* This func must be called ONCE on system install. It creates the initial
|
||||
* MultiXact segments. (The MultiXacts directories are assumed to have been
|
||||
* MultiXact segments. (The MultiXacts directories are assumed to have been
|
||||
* created by initdb, and MultiXactShmemInit must have been called already.)
|
||||
*/
|
||||
void
|
||||
@@ -1287,7 +1287,7 @@ ZeroMultiXactMemberPage(int pageno, bool writeXlog)
|
||||
* This must be called ONCE during postmaster or standalone-backend startup.
|
||||
*
|
||||
* StartupXLOG has already established nextMXact/nextOffset by calling
|
||||
* MultiXactSetNextMXact and/or MultiXactAdvanceNextMXact. Note that we
|
||||
* MultiXactSetNextMXact and/or MultiXactAdvanceNextMXact. Note that we
|
||||
* may already have replayed WAL data into the SLRU files.
|
||||
*
|
||||
* We don't need any locks here, really; the SLRU locks are taken
|
||||
@@ -1311,14 +1311,14 @@ StartupMultiXact(void)
|
||||
MultiXactOffsetCtl->shared->latest_page_number = pageno;
|
||||
|
||||
/*
|
||||
* Zero out the remainder of the current offsets page. See notes
|
||||
* in StartupCLOG() for motivation.
|
||||
* Zero out the remainder of the current offsets page. See notes in
|
||||
* StartupCLOG() for motivation.
|
||||
*/
|
||||
entryno = MultiXactIdToOffsetEntry(multi);
|
||||
if (entryno != 0)
|
||||
{
|
||||
int slotno;
|
||||
MultiXactOffset *offptr;
|
||||
MultiXactOffset *offptr;
|
||||
|
||||
slotno = SimpleLruReadPage(MultiXactOffsetCtl, pageno, multi);
|
||||
offptr = (MultiXactOffset *) MultiXactOffsetCtl->shared->page_buffer[slotno];
|
||||
@@ -1341,14 +1341,14 @@ StartupMultiXact(void)
|
||||
MultiXactMemberCtl->shared->latest_page_number = pageno;
|
||||
|
||||
/*
|
||||
* Zero out the remainder of the current members page. See notes
|
||||
* in StartupCLOG() for motivation.
|
||||
* Zero out the remainder of the current members page. See notes in
|
||||
* StartupCLOG() for motivation.
|
||||
*/
|
||||
entryno = MXOffsetToMemberEntry(offset);
|
||||
if (entryno != 0)
|
||||
{
|
||||
int slotno;
|
||||
TransactionId *xidptr;
|
||||
TransactionId *xidptr;
|
||||
|
||||
slotno = SimpleLruReadPage(MultiXactMemberCtl, pageno, offset);
|
||||
xidptr = (TransactionId *) MultiXactMemberCtl->shared->page_buffer[slotno];
|
||||
@@ -1499,14 +1499,14 @@ static void
|
||||
ExtendMultiXactMember(MultiXactOffset offset, int nmembers)
|
||||
{
|
||||
/*
|
||||
* It's possible that the members span more than one page of the
|
||||
* members file, so we loop to ensure we consider each page. The
|
||||
* coding is not optimal if the members span several pages, but
|
||||
* that seems unusual enough to not worry much about.
|
||||
* It's possible that the members span more than one page of the members
|
||||
* file, so we loop to ensure we consider each page. The coding is not
|
||||
* optimal if the members span several pages, but that seems unusual
|
||||
* enough to not worry much about.
|
||||
*/
|
||||
while (nmembers > 0)
|
||||
{
|
||||
int entryno;
|
||||
int entryno;
|
||||
|
||||
/*
|
||||
* Only zero when at first entry of a page.
|
||||
@@ -1514,7 +1514,7 @@ ExtendMultiXactMember(MultiXactOffset offset, int nmembers)
|
||||
entryno = MXOffsetToMemberEntry(offset);
|
||||
if (entryno == 0)
|
||||
{
|
||||
int pageno;
|
||||
int pageno;
|
||||
|
||||
pageno = MXOffsetToMemberPage(offset);
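A toy rendering of the page-walking idea in this loop, with a made-up members-per-page constant standing in for the real one: advance through the member slots a page at a time, and zero a page only when the running offset lands on its first entry.

#include <stdio.h>

#define MEMBERS_PER_PAGE 2048   /* placeholder, not the backend's constant */

typedef unsigned int MultiXactOffset;

int
main(void)
{
    MultiXactOffset offset = 4090;  /* near the end of a member page */
    int         nmembers = 10;

    while (nmembers > 0)
    {
        int         entryno = (int) (offset % MEMBERS_PER_PAGE);
        int         onThisPage = MEMBERS_PER_PAGE - entryno;

        if (entryno == 0)
            printf("would zero member page %u\n",
                   (unsigned) (offset / MEMBERS_PER_PAGE));

        if (onThisPage > nmembers)
            onThisPage = nmembers;
        nmembers -= onThisPage;
        offset += (MultiXactOffset) onThisPage;
    }
    return 0;
}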
@@ -1536,7 +1536,7 @@ ExtendMultiXactMember(MultiXactOffset offset, int nmembers)
|
||||
* Remove all MultiXactOffset and MultiXactMember segments before the oldest
|
||||
* ones still of interest.
|
||||
*
|
||||
* This is called only during checkpoints. We assume no more than one
|
||||
* This is called only during checkpoints. We assume no more than one
|
||||
* backend does this at a time.
|
||||
*
|
||||
* XXX do we have any issues with needing to checkpoint here?
|
||||
@@ -1545,23 +1545,23 @@ static void
|
||||
TruncateMultiXact(void)
|
||||
{
|
||||
MultiXactId nextMXact;
|
||||
MultiXactOffset nextOffset;
|
||||
MultiXactOffset nextOffset;
|
||||
MultiXactId oldestMXact;
|
||||
MultiXactOffset oldestOffset;
|
||||
MultiXactOffset oldestOffset;
|
||||
int cutoffPage;
|
||||
int i;
|
||||
|
||||
/*
|
||||
* First, compute where we can safely truncate. Per notes above,
|
||||
* this is the oldest valid value among all the OldestMemberMXactId[] and
|
||||
* First, compute where we can safely truncate. Per notes above, this is
|
||||
* the oldest valid value among all the OldestMemberMXactId[] and
|
||||
* OldestVisibleMXactId[] entries, or nextMXact if none are valid.
|
||||
*/
|
||||
LWLockAcquire(MultiXactGenLock, LW_SHARED);
|
||||
|
||||
/*
|
||||
* We have to beware of the possibility that nextMXact is in the
|
||||
* wrapped-around state. We don't fix the counter itself here,
|
||||
* but we must be sure to use a valid value in our calculation.
|
||||
* wrapped-around state. We don't fix the counter itself here, but we
|
||||
* must be sure to use a valid value in our calculation.
|
||||
*/
|
||||
nextMXact = MultiXactState->nextMXact;
|
||||
if (nextMXact < FirstMultiXactId)
|
||||
@@ -1597,9 +1597,9 @@ TruncateMultiXact(void)
|
||||
return;
|
||||
|
||||
/*
|
||||
* We need to determine where to truncate MultiXactMember. If we
|
||||
* found a valid oldest MultiXactId, read its starting offset;
|
||||
* otherwise we use the nextOffset value we saved above.
|
||||
* We need to determine where to truncate MultiXactMember. If we found a
|
||||
* valid oldest MultiXactId, read its starting offset; otherwise we use
|
||||
* the nextOffset value we saved above.
|
||||
*/
|
||||
if (oldestMXact == nextMXact)
|
||||
oldestOffset = nextOffset;
|
||||
@@ -1608,7 +1608,7 @@ TruncateMultiXact(void)
|
||||
int pageno;
|
||||
int slotno;
|
||||
int entryno;
|
||||
MultiXactOffset *offptr;
|
||||
MultiXactOffset *offptr;
|
||||
|
||||
LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE);
|
||||
|
||||
@@ -1624,8 +1624,8 @@ TruncateMultiXact(void)
|
||||
}
|
||||
|
||||
/*
|
||||
* The cutoff point is the start of the segment containing oldestMXact.
|
||||
* We pass the *page* containing oldestMXact to SimpleLruTruncate.
|
||||
* The cutoff point is the start of the segment containing oldestMXact. We
|
||||
* pass the *page* containing oldestMXact to SimpleLruTruncate.
|
||||
*/
|
||||
cutoffPage = MultiXactIdToOffsetPage(oldestMXact);
|
||||
|
||||
@@ -1677,8 +1677,8 @@ MultiXactOffsetPagePrecedes(int page1, int page2)
|
||||
static bool
|
||||
MultiXactMemberPagePrecedes(int page1, int page2)
|
||||
{
|
||||
MultiXactOffset offset1;
|
||||
MultiXactOffset offset2;
|
||||
MultiXactOffset offset1;
|
||||
MultiXactOffset offset2;
|
||||
|
||||
offset1 = ((MultiXactOffset) page1) * MULTIXACT_MEMBERS_PER_PAGE;
|
||||
offset2 = ((MultiXactOffset) page2) * MULTIXACT_MEMBERS_PER_PAGE;
|
||||
@@ -1695,7 +1695,7 @@ MultiXactMemberPagePrecedes(int page1, int page2)
|
||||
static bool
|
||||
MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2)
|
||||
{
|
||||
int32 diff = (int32) (multi1 - multi2);
|
||||
int32 diff = (int32) (multi1 - multi2);
|
||||
|
||||
return (diff < 0);
|
||||
}
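A standalone illustration of the signed-difference trick used here for circular 32-bit comparison: an ID "precedes" another when it is less than 2^31 behind it, which is also exactly why (per the xidComparator comment earlier in this diff) such a comparison cannot be handed to qsort. The values are arbitrary.

#include <stdio.h>

typedef unsigned int MultiXactId;

static int
precedes(MultiXactId a, MultiXactId b)
{
    int         diff = (int) (a - b);

    return diff < 0;
}

int
main(void)
{
    /* 4000000000 is less than 2^31 behind 5 in circular order */
    printf("%d\n", precedes(4000000000u, 5));   /* prints 1 */
    printf("%d\n", precedes(5, 4000000000u));   /* prints 0 */
    return 0;
}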
@@ -1706,7 +1706,7 @@ MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2)
|
||||
static bool
|
||||
MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2)
|
||||
{
|
||||
int32 diff = (int32) (offset1 - offset2);
|
||||
int32 diff = (int32) (offset1 - offset2);
|
||||
|
||||
return (diff < 0);
|
||||
}
|
||||
@@ -1783,9 +1783,9 @@ multixact_redo(XLogRecPtr lsn, XLogRecord *record)
|
||||
MultiXactAdvanceNextMXact(xlrec->mid + 1, xlrec->moff + xlrec->nxids);
|
||||
|
||||
/*
|
||||
* Make sure nextXid is beyond any XID mentioned in the record.
|
||||
* This should be unnecessary, since any XID found here ought to
|
||||
* have other evidence in the XLOG, but let's be safe.
|
||||
* Make sure nextXid is beyond any XID mentioned in the record. This
|
||||
* should be unnecessary, since any XID found here ought to have other
|
||||
* evidence in the XLOG, but let's be safe.
|
||||
*/
|
||||
max_xid = record->xl_xid;
|
||||
for (i = 0; i < xlrec->nxids; i++)
|
||||
|
||||
@@ -48,7 +48,7 @@
|
||||
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* $PostgreSQL: pgsql/src/backend/access/transam/slru.c,v 1.27 2005/08/20 23:26:08 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/access/transam/slru.c,v 1.28 2005/10/15 02:49:09 momjian Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@@ -186,8 +186,8 @@ SimpleLruInit(SlruCtl ctl, const char *name,
|
||||
Assert(found);
|
||||
|
||||
/*
|
||||
* Initialize the unshared control struct, including directory path.
|
||||
* We assume caller set PagePrecedes.
|
||||
* Initialize the unshared control struct, including directory path. We
|
||||
* assume caller set PagePrecedes.
|
||||
*/
|
||||
ctl->shared = shared;
|
||||
ctl->do_fsync = true; /* default behavior */
|
||||
@@ -351,11 +351,11 @@ SimpleLruWritePage(SlruCtl ctl, int slotno, SlruFlush fdata)
|
||||
LWLockAcquire(shared->buffer_locks[slotno], LW_EXCLUSIVE);
|
||||
|
||||
/*
|
||||
* Check to see if someone else already did the write, or took the
|
||||
* buffer away from us. If so, do nothing. NOTE: we really should
|
||||
* never see WRITE_IN_PROGRESS here, since that state should only
|
||||
* occur while the writer is holding the buffer lock. But accept it
|
||||
* so that we have a recovery path if a writer aborts.
|
||||
* Check to see if someone else already did the write, or took the buffer
|
||||
* away from us. If so, do nothing. NOTE: we really should never see
|
||||
* WRITE_IN_PROGRESS here, since that state should only occur while the
|
||||
* writer is holding the buffer lock. But accept it so that we have a
|
||||
* recovery path if a writer aborts.
|
||||
*/
|
||||
if (shared->page_number[slotno] != pageno ||
|
||||
(shared->page_status[slotno] != SLRU_PAGE_DIRTY &&
|
||||
@@ -368,15 +368,14 @@ SimpleLruWritePage(SlruCtl ctl, int slotno, SlruFlush fdata)
|
||||
|
||||
/*
|
||||
* Mark the slot write-busy. After this point, a transaction status
|
||||
* update on this page will mark it dirty again. NB: we are assuming
|
||||
* that read/write of the page status field is atomic, since we change
|
||||
* the state while not holding control lock. However, we cannot set
|
||||
* this state any sooner, or we'd possibly fool a previous writer into
|
||||
* thinking he's successfully dumped the page when he hasn't.
|
||||
* (Scenario: other writer starts, page is redirtied, we come along
|
||||
* and set WRITE_IN_PROGRESS again, other writer completes and sets
|
||||
* CLEAN because redirty info has been lost, then we think it's clean
|
||||
* too.)
|
||||
* update on this page will mark it dirty again. NB: we are assuming that
|
||||
* read/write of the page status field is atomic, since we change the
|
||||
* state while not holding control lock. However, we cannot set this
|
||||
* state any sooner, or we'd possibly fool a previous writer into thinking
|
||||
* he's successfully dumped the page when he hasn't. (Scenario: other
|
||||
* writer starts, page is redirtied, we come along and set
|
||||
* WRITE_IN_PROGRESS again, other writer completes and sets CLEAN because
|
||||
* redirty info has been lost, then we think it's clean too.)
|
||||
*/
|
||||
shared->page_status[slotno] = SLRU_PAGE_WRITE_IN_PROGRESS;
|
||||
|
||||
@@ -436,8 +435,8 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno)
|
||||
* In a crash-and-restart situation, it's possible for us to receive
|
||||
* commands to set the commit status of transactions whose bits are in
|
||||
* already-truncated segments of the commit log (see notes in
|
||||
* SlruPhysicalWritePage). Hence, if we are InRecovery, allow the
|
||||
* case where the file doesn't exist, and return zeroes instead.
|
||||
* SlruPhysicalWritePage). Hence, if we are InRecovery, allow the case
|
||||
* where the file doesn't exist, and return zeroes instead.
|
||||
*/
|
||||
fd = BasicOpenFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
|
||||
if (fd < 0)
|
||||
@@ -528,17 +527,16 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata)
|
||||
{
|
||||
/*
|
||||
* If the file doesn't already exist, we should create it. It is
|
||||
* possible for this to need to happen when writing a page that's
|
||||
* not first in its segment; we assume the OS can cope with that.
|
||||
* (Note: it might seem that it'd be okay to create files only
|
||||
* when SimpleLruZeroPage is called for the first page of a
|
||||
* segment. However, if after a crash and restart the REDO logic
|
||||
* elects to replay the log from a checkpoint before the latest
|
||||
* one, then it's possible that we will get commands to set
|
||||
* transaction status of transactions that have already been
|
||||
* truncated from the commit log. Easiest way to deal with that is
|
||||
* to accept references to nonexistent files here and in
|
||||
* SlruPhysicalReadPage.)
|
||||
* possible for this to need to happen when writing a page that's not
|
||||
* first in its segment; we assume the OS can cope with that. (Note:
|
||||
* it might seem that it'd be okay to create files only when
|
||||
* SimpleLruZeroPage is called for the first page of a segment.
|
||||
* However, if after a crash and restart the REDO logic elects to
|
||||
* replay the log from a checkpoint before the latest one, then it's
|
||||
* possible that we will get commands to set transaction status of
|
||||
* transactions that have already been truncated from the commit log.
|
||||
* Easiest way to deal with that is to accept references to
|
||||
* nonexistent files here and in SlruPhysicalReadPage.)
|
||||
*/
|
||||
SlruFileName(ctl, path, segno);
|
||||
fd = BasicOpenFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
|
||||
@@ -635,49 +633,49 @@ SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid)
|
||||
case SLRU_OPEN_FAILED:
|
||||
ereport(ERROR,
|
||||
(errcode_for_file_access(),
|
||||
errmsg("could not access status of transaction %u", xid),
|
||||
errmsg("could not access status of transaction %u", xid),
|
||||
errdetail("could not open file \"%s\": %m",
|
||||
path)));
|
||||
break;
|
||||
case SLRU_CREATE_FAILED:
|
||||
ereport(ERROR,
|
||||
(errcode_for_file_access(),
|
||||
errmsg("could not access status of transaction %u", xid),
|
||||
errmsg("could not access status of transaction %u", xid),
|
||||
errdetail("could not create file \"%s\": %m",
|
||||
path)));
|
||||
break;
|
||||
case SLRU_SEEK_FAILED:
|
||||
ereport(ERROR,
|
||||
(errcode_for_file_access(),
|
||||
errmsg("could not access status of transaction %u", xid),
|
||||
errdetail("could not seek in file \"%s\" to offset %u: %m",
|
||||
path, offset)));
|
||||
errmsg("could not access status of transaction %u", xid),
|
||||
errdetail("could not seek in file \"%s\" to offset %u: %m",
|
||||
path, offset)));
|
||||
break;
|
||||
case SLRU_READ_FAILED:
|
||||
ereport(ERROR,
|
||||
(errcode_for_file_access(),
|
||||
errmsg("could not access status of transaction %u", xid),
|
||||
errdetail("could not read from file \"%s\" at offset %u: %m",
|
||||
path, offset)));
|
||||
errmsg("could not access status of transaction %u", xid),
|
||||
errdetail("could not read from file \"%s\" at offset %u: %m",
|
||||
path, offset)));
|
||||
break;
|
||||
case SLRU_WRITE_FAILED:
|
||||
ereport(ERROR,
|
||||
(errcode_for_file_access(),
|
||||
errmsg("could not access status of transaction %u", xid),
|
||||
errdetail("could not write to file \"%s\" at offset %u: %m",
|
||||
path, offset)));
|
||||
errmsg("could not access status of transaction %u", xid),
|
||||
errdetail("could not write to file \"%s\" at offset %u: %m",
|
||||
path, offset)));
|
||||
break;
|
||||
case SLRU_FSYNC_FAILED:
|
||||
ereport(ERROR,
|
||||
(errcode_for_file_access(),
|
||||
errmsg("could not access status of transaction %u", xid),
|
||||
errmsg("could not access status of transaction %u", xid),
|
||||
errdetail("could not fsync file \"%s\": %m",
|
||||
path)));
|
||||
break;
|
||||
case SLRU_CLOSE_FAILED:
|
||||
ereport(ERROR,
|
||||
(errcode_for_file_access(),
|
||||
errmsg("could not access status of transaction %u", xid),
|
||||
errmsg("could not access status of transaction %u", xid),
|
||||
errdetail("could not close file \"%s\": %m",
|
||||
path)));
|
||||
break;
|
||||
@@ -723,8 +721,8 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno)
|
||||
}
|
||||
|
||||
/*
|
||||
* If we find any EMPTY slot, just select that one. Else locate
|
||||
* the least-recently-used slot that isn't the latest page.
|
||||
* If we find any EMPTY slot, just select that one. Else locate the
|
||||
* least-recently-used slot that isn't the latest page.
|
||||
*/
|
||||
for (slotno = 0; slotno < NUM_SLRU_BUFFERS; slotno++)
|
||||
{
|
||||
@@ -745,10 +743,10 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno)
|
||||
return bestslot;
|
||||
|
||||
/*
|
||||
* We need to do I/O. Normal case is that we have to write it
|
||||
* out, but it's possible in the worst case to have selected a
|
||||
* read-busy page. In that case we use SimpleLruReadPage to wait
|
||||
* for the read to complete.
|
||||
* We need to do I/O. Normal case is that we have to write it out,
|
||||
* but it's possible in the worst case to have selected a read-busy
|
||||
* page. In that case we use SimpleLruReadPage to wait for the read
|
||||
* to complete.
|
||||
*/
|
||||
if (shared->page_status[bestslot] == SLRU_PAGE_READ_IN_PROGRESS)
|
||||
(void) SimpleLruReadPage(ctl, shared->page_number[bestslot],
|
||||
@@ -757,9 +755,9 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno)
|
||||
SimpleLruWritePage(ctl, bestslot, NULL);
|
||||
|
||||
/*
|
||||
* Now loop back and try again. This is the easiest way of
|
||||
* dealing with corner cases such as the victim page being
|
||||
* re-dirtied while we wrote it.
|
||||
* Now loop back and try again. This is the easiest way of dealing
|
||||
* with corner cases such as the victim page being re-dirtied while we
|
||||
* wrote it.
|
||||
*/
|
||||
}
|
||||
}
|
||||
@@ -789,9 +787,9 @@ SimpleLruFlush(SlruCtl ctl, bool checkpoint)
|
||||
SimpleLruWritePage(ctl, slotno, &fdata);
|
||||
|
||||
/*
|
||||
* When called during a checkpoint, we cannot assert that the slot
|
||||
* is clean now, since another process might have re-dirtied it
|
||||
* already. That's okay.
|
||||
* When called during a checkpoint, we cannot assert that the slot is
|
||||
* clean now, since another process might have re-dirtied it already.
|
||||
* That's okay.
|
||||
*/
|
||||
Assert(checkpoint ||
|
||||
shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
|
||||
@@ -841,10 +839,10 @@ SimpleLruTruncate(SlruCtl ctl, int cutoffPage)
|
||||
cutoffPage -= cutoffPage % SLRU_PAGES_PER_SEGMENT;
|
||||
|
||||
/*
|
||||
* Scan shared memory and remove any pages preceding the cutoff page,
|
||||
* to ensure we won't rewrite them later. (Since this is normally
|
||||
* called in or just after a checkpoint, any dirty pages should have
|
||||
* been flushed already ... we're just being extra careful here.)
|
||||
* Scan shared memory and remove any pages preceding the cutoff page, to
|
||||
* ensure we won't rewrite them later. (Since this is normally called in
|
||||
* or just after a checkpoint, any dirty pages should have been flushed
|
||||
* already ... we're just being extra careful here.)
|
||||
*/
|
||||
LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
|
||||
|
||||
@@ -852,16 +850,16 @@ restart:;
|
||||
|
||||
/*
|
||||
* While we are holding the lock, make an important safety check: the
|
||||
* planned cutoff point must be <= the current endpoint page.
|
||||
* Otherwise we have already wrapped around, and proceeding with the
|
||||
* truncation would risk removing the current segment.
|
||||
* planned cutoff point must be <= the current endpoint page. Otherwise we
|
||||
* have already wrapped around, and proceeding with the truncation would
|
||||
* risk removing the current segment.
|
||||
*/
|
||||
if (ctl->PagePrecedes(shared->latest_page_number, cutoffPage))
|
||||
{
|
||||
LWLockRelease(shared->ControlLock);
|
||||
ereport(LOG,
|
||||
(errmsg("could not truncate directory \"%s\": apparent wraparound",
|
||||
ctl->Dir)));
|
||||
(errmsg("could not truncate directory \"%s\": apparent wraparound",
|
||||
ctl->Dir)));
|
||||
return;
|
||||
}
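A minimal sketch of the shape of this safety check, with an invented page_precedes function standing in for the per-SLRU PagePrecedes callback: truncation is refused whenever the cutoff does not precede the latest page in circular page-number order.

#include <stdio.h>

static int
page_precedes(unsigned int page1, unsigned int page2)
{
    /* stand-in for ctl->PagePrecedes */
    return (int) (page1 - page2) < 0;
}

int
main(void)
{
    int         latest_page_number = 100;
    int         cutoffPage = 250;   /* apparently past the endpoint */

    if (page_precedes(latest_page_number, cutoffPage))
    {
        printf("apparent wraparound, refusing to truncate\n");
        return 0;
    }
    printf("would remove segments before page %d\n", cutoffPage);
    return 0;
}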
@@ -882,9 +880,9 @@ restart:;
|
||||
}
|
||||
|
||||
/*
|
||||
* Hmm, we have (or may have) I/O operations acting on the page,
|
||||
* so we've got to wait for them to finish and then start again.
|
||||
* This is the same logic as in SlruSelectLRUPage.
|
||||
* Hmm, we have (or may have) I/O operations acting on the page, so
|
||||
* we've got to wait for them to finish and then start again. This is
|
||||
* the same logic as in SlruSelectLRUPage.
|
||||
*/
|
||||
if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS)
|
||||
(void) SimpleLruReadPage(ctl, shared->page_number[slotno],
|
||||
|
||||
@@ -22,7 +22,7 @@
|
||||
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* $PostgreSQL: pgsql/src/backend/access/transam/subtrans.c,v 1.10 2005/08/20 23:26:08 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/access/transam/subtrans.c,v 1.11 2005/10/15 02:49:09 momjian Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@@ -234,9 +234,8 @@ StartupSUBTRANS(TransactionId oldestActiveXID)
|
||||
/*
|
||||
* Since we don't expect pg_subtrans to be valid across crashes, we
|
||||
* initialize the currently-active page(s) to zeroes during startup.
|
||||
* Whenever we advance into a new page, ExtendSUBTRANS will likewise
|
||||
* zero the new page without regard to whatever was previously on
|
||||
* disk.
|
||||
* Whenever we advance into a new page, ExtendSUBTRANS will likewise zero
|
||||
* the new page without regard to whatever was previously on disk.
|
||||
*/
|
||||
LWLockAcquire(SubtransControlLock, LW_EXCLUSIVE);
|
||||
|
||||
@@ -262,8 +261,8 @@ ShutdownSUBTRANS(void)
|
||||
/*
|
||||
* Flush dirty SUBTRANS pages to disk
|
||||
*
|
||||
* This is not actually necessary from a correctness point of view. We do
|
||||
* it merely as a debugging aid.
|
||||
* This is not actually necessary from a correctness point of view. We do it
|
||||
* merely as a debugging aid.
|
||||
*/
|
||||
SimpleLruFlush(SubTransCtl, false);
|
||||
}
|
||||
@@ -277,9 +276,9 @@ CheckPointSUBTRANS(void)
|
||||
/*
|
||||
* Flush dirty SUBTRANS pages to disk
|
||||
*
|
||||
* This is not actually necessary from a correctness point of view. We do
|
||||
* it merely to improve the odds that writing of dirty pages is done
|
||||
* by the checkpoint process and not by backends.
|
||||
* This is not actually necessary from a correctness point of view. We do it
|
||||
* merely to improve the odds that writing of dirty pages is done by the
|
||||
* checkpoint process and not by backends.
|
||||
*/
|
||||
SimpleLruFlush(SubTransCtl, true);
|
||||
}
|
||||
@@ -329,8 +328,8 @@ TruncateSUBTRANS(TransactionId oldestXact)
|
||||
int cutoffPage;
|
||||
|
||||
/*
|
||||
* The cutoff point is the start of the segment containing oldestXact.
|
||||
* We pass the *page* containing oldestXact to SimpleLruTruncate.
|
||||
* The cutoff point is the start of the segment containing oldestXact. We
|
||||
* pass the *page* containing oldestXact to SimpleLruTruncate.
|
||||
*/
|
||||
cutoffPage = TransactionIdToPage(oldestXact);
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/access/transam/transam.c,v 1.65 2005/06/17 22:32:42 tgl Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/access/transam/transam.c,v 1.66 2005/10/15 02:49:09 momjian Exp $
|
||||
*
|
||||
* NOTES
|
||||
* This file contains the high level access-method interface to the
|
||||
@@ -54,8 +54,8 @@ TransactionLogFetch(TransactionId transactionId)
|
||||
XidStatus xidstatus;
|
||||
|
||||
/*
|
||||
* Before going to the commit log manager, check our single item cache
|
||||
* to see if we didn't just check the transaction status a moment ago.
|
||||
* Before going to the commit log manager, check our single item cache to
|
||||
* see if we didn't just check the transaction status a moment ago.
|
||||
*/
|
||||
if (TransactionIdEquals(transactionId, cachedFetchXid))
|
||||
return cachedFetchXidStatus;
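A compact illustration of a one-entry status cache like the one used here, together with the rule (spelled out a few lines further down in this diff) that in-progress and sub-committed results are never cached because they can still change. The status codes and the slow lookup function are placeholders, not the real clog interface.

#include <stdio.h>

typedef unsigned int TransactionId;
typedef int XidStatus;

#define TRANSACTION_STATUS_IN_PROGRESS   0
#define TRANSACTION_STATUS_COMMITTED     1
#define TRANSACTION_STATUS_ABORTED       2
#define TRANSACTION_STATUS_SUB_COMMITTED 3

static TransactionId cachedFetchXid = 0;
static XidStatus cachedFetchXidStatus;

static XidStatus
slow_lookup(TransactionId xid)      /* stand-in for TransactionIdGetStatus */
{
    (void) xid;
    return TRANSACTION_STATUS_COMMITTED;
}

static XidStatus
fetch_status(TransactionId xid)
{
    XidStatus   st;

    if (xid == cachedFetchXid)
        return cachedFetchXidStatus;

    st = slow_lookup(xid);

    /* only cache results that can never change later */
    if (st != TRANSACTION_STATUS_IN_PROGRESS &&
        st != TRANSACTION_STATUS_SUB_COMMITTED)
    {
        cachedFetchXid = xid;
        cachedFetchXidStatus = st;
    }
    return st;
}

int
main(void)
{
    printf("%d\n", fetch_status(1234));
    printf("%d\n", fetch_status(1234));     /* second call hits the cache */
    return 0;
}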
@@ -78,8 +78,8 @@ TransactionLogFetch(TransactionId transactionId)
|
||||
xidstatus = TransactionIdGetStatus(transactionId);
|
||||
|
||||
/*
|
||||
* DO NOT cache status for unfinished or sub-committed transactions!
|
||||
* We only cache status that is guaranteed not to change.
|
||||
* DO NOT cache status for unfinished or sub-committed transactions! We
|
||||
* only cache status that is guaranteed not to change.
|
||||
*/
|
||||
if (xidstatus != TRANSACTION_STATUS_IN_PROGRESS &&
|
||||
xidstatus != TRANSACTION_STATUS_SUB_COMMITTED)
|
||||
@@ -169,18 +169,18 @@ TransactionIdDidCommit(TransactionId transactionId)
return true;

/*
* If it's marked subcommitted, we have to check the parent
* recursively. However, if it's older than TransactionXmin, we can't
* look at pg_subtrans; instead assume that the parent crashed without
* cleaning up its children.
* If it's marked subcommitted, we have to check the parent recursively.
* However, if it's older than TransactionXmin, we can't look at
* pg_subtrans; instead assume that the parent crashed without cleaning up
* its children.
*
* Originally we Assert'ed that the result of SubTransGetParent was
* not zero. However with the introduction of prepared transactions,
* there can be a window just after database startup where we do not
* have complete knowledge in pg_subtrans of the transactions after
* TransactionXmin. StartupSUBTRANS() has ensured that any missing
* information will be zeroed. Since this case should not happen under
* normal conditions, it seems reasonable to emit a WARNING for it.
* Originally we Assert'ed that the result of SubTransGetParent was not zero.
* However with the introduction of prepared transactions, there can be a
* window just after database startup where we do not have complete
* knowledge in pg_subtrans of the transactions after TransactionXmin.
* StartupSUBTRANS() has ensured that any missing information will be
* zeroed. Since this case should not happen under normal conditions, it
* seems reasonable to emit a WARNING for it.
*/
if (xidstatus == TRANSACTION_STATUS_SUB_COMMITTED)
{
@@ -225,10 +225,10 @@ TransactionIdDidAbort(TransactionId transactionId)
return true;

/*
* If it's marked subcommitted, we have to check the parent
* recursively. However, if it's older than TransactionXmin, we can't
* look at pg_subtrans; instead assume that the parent crashed without
* cleaning up its children.
* If it's marked subcommitted, we have to check the parent recursively.
* However, if it's older than TransactionXmin, we can't look at
* pg_subtrans; instead assume that the parent crashed without cleaning up
* its children.
*/
if (xidstatus == TRANSACTION_STATUS_SUB_COMMITTED)
{

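The sub-committed handling above amounts to walking parent links until a top-level transaction is reached and then asking the commit log about that one. A toy sketch with in-memory arrays standing in for pg_subtrans and pg_clog:

/* Illustrative sketch only. */
#include <stdio.h>
#include <stdint.h>

typedef uint32_t TransactionId;

/* Toy stand-in for pg_subtrans: parent_of[x] is x's parent, 0 means none known. */
static TransactionId parent_of[16];

static int committed[16];                 /* toy stand-in for the clog */

/* Walk up the parent chain until a top-level transaction is reached. */
static int
did_commit(TransactionId xid)
{
    while (parent_of[xid] != 0)
        xid = parent_of[xid];             /* sub-committed: ask the parent */
    return committed[xid];
}

int
main(void)
{
    parent_of[5] = 3;                     /* 5 is a subxact of 3 */
    committed[3] = 1;                     /* the parent committed */
    printf("xid 5 committed? %d\n", did_commit(5));
    return 0;
}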
@@ -7,7 +7,7 @@
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.14 2005/10/13 22:55:55 momjian Exp $
* $PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.15 2005/10/15 02:49:09 momjian Exp $
*
* NOTES
* Each global transaction is associated with a global transaction
@@ -64,7 +64,7 @@
#define TWOPHASE_DIR "pg_twophase"

/* GUC variable, can't be changed after startup */
int max_prepared_xacts = 5;
int max_prepared_xacts = 5;

/*
* This struct describes one global transaction that is in prepared state
@@ -97,7 +97,7 @@ int max_prepared_xacts = 5;
* entry will remain in prepXacts until recycled. We can detect recyclable
* entries by checking for valid = false and locking_xid no longer active.
*
* typedef struct GlobalTransactionData *GlobalTransaction appears in
* typedef struct GlobalTransactionData *GlobalTransaction appears in
* twophase.h
*/
#define GIDSIZE 200
@@ -105,12 +105,12 @@ int max_prepared_xacts = 5;
typedef struct GlobalTransactionData
{
PGPROC proc; /* dummy proc */
TimestampTz prepared_at; /* time of preparation */
TimestampTz prepared_at; /* time of preparation */
XLogRecPtr prepare_lsn; /* XLOG offset of prepare record */
Oid owner; /* ID of user that executed the xact */
TransactionId locking_xid; /* top-level XID of backend working on xact */
bool valid; /* TRUE if fully prepared */
char gid[GIDSIZE]; /* The GID assigned to the prepared xact */
char gid[GIDSIZE]; /* The GID assigned to the prepared xact */
} GlobalTransactionData;

/*
@@ -123,30 +123,30 @@ typedef struct TwoPhaseStateData
SHMEM_OFFSET freeGXacts;

/* Number of valid prepXacts entries. */
int numPrepXacts;
int numPrepXacts;

/*
* There are max_prepared_xacts items in this array, but C wants a
* fixed-size array.
*/
GlobalTransaction prepXacts[1]; /* VARIABLE LENGTH ARRAY */
GlobalTransaction prepXacts[1]; /* VARIABLE LENGTH ARRAY */
} TwoPhaseStateData; /* VARIABLE LENGTH STRUCT */

static TwoPhaseStateData *TwoPhaseState;


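The prepXacts[1] trick relies on allocating offsetof(struct, array) plus n * sizeof(element) bytes and then treating the one-element array as n elements, exactly as TwoPhaseShmemInit does below. A minimal sketch of that sizing pattern, with ALIGN8 standing in for MAXALIGN and hypothetical type names:

/* Illustrative sketch only. */
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

typedef struct Slot { int payload; } Slot;

typedef struct VarState
{
    int   nslots;
    Slot *slots[1];          /* VARIABLE LENGTH ARRAY -- really nslots entries */
} VarState;

/* Round a request up to 8-byte alignment, in the spirit of MAXALIGN. */
#define ALIGN8(len) (((len) + 7) & ~(size_t) 7)

static size_t
var_state_size(int nslots)
{
    return ALIGN8(offsetof(VarState, slots) + sizeof(Slot *) * nslots);
}

int
main(void)
{
    int       n = 5;
    VarState *state = calloc(1, var_state_size(n));  /* shared memory in the real code */

    if (state == NULL)
        return 1;
    state->nslots = n;
    printf("allocated %zu bytes for %d slots\n", var_state_size(n), n);
    free(state);
    return 0;
}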
static void RecordTransactionCommitPrepared(TransactionId xid,
int nchildren,
TransactionId *children,
int nrels,
RelFileNode *rels);
int nchildren,
TransactionId *children,
int nrels,
RelFileNode *rels);
static void RecordTransactionAbortPrepared(TransactionId xid,
int nchildren,
TransactionId *children,
int nrels,
RelFileNode *rels);
int nchildren,
TransactionId *children,
int nrels,
RelFileNode *rels);
static void ProcessRecords(char *bufptr, TransactionId xid,
const TwoPhaseCallback callbacks[]);
const TwoPhaseCallback callbacks[]);


/*
@@ -171,7 +171,7 @@ TwoPhaseShmemSize(void)
void
TwoPhaseShmemInit(void)
{
bool found;
bool found;

TwoPhaseState = ShmemInitStruct("Prepared Transaction Table",
TwoPhaseShmemSize(),
@@ -190,7 +190,7 @@ TwoPhaseShmemInit(void)
*/
gxacts = (GlobalTransaction)
((char *) TwoPhaseState +
MAXALIGN(offsetof(TwoPhaseStateData, prepXacts) +
MAXALIGN(offsetof(TwoPhaseStateData, prepXacts) +
sizeof(GlobalTransaction) * max_prepared_xacts));
for (i = 0; i < max_prepared_xacts; i++)
{
@@ -205,7 +205,7 @@ TwoPhaseShmemInit(void)

/*
* MarkAsPreparing
* Reserve the GID for the given transaction.
* Reserve the GID for the given transaction.
*
* Internally, this creates a gxact struct and puts it into the active array.
* NOTE: this is also used when reloading a gxact after a crash; so avoid
@@ -215,8 +215,8 @@ GlobalTransaction
MarkAsPreparing(TransactionId xid, const char *gid,
TimestampTz prepared_at, Oid owner, Oid databaseid)
{
GlobalTransaction gxact;
int i;
GlobalTransaction gxact;
int i;

if (strlen(gid) >= GIDSIZE)
ereport(ERROR,
@@ -227,10 +227,9 @@ MarkAsPreparing(TransactionId xid, const char *gid,
LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);

/*
* First, find and recycle any gxacts that failed during prepare.
* We do this partly to ensure we don't mistakenly say their GIDs
* are still reserved, and partly so we don't fail on out-of-slots
* unnecessarily.
* First, find and recycle any gxacts that failed during prepare. We do
* this partly to ensure we don't mistakenly say their GIDs are still
* reserved, and partly so we don't fail on out-of-slots unnecessarily.
*/
for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
{
@@ -360,13 +359,13 @@ MarkAsPrepared(GlobalTransaction gxact)
static GlobalTransaction
LockGXact(const char *gid, Oid user)
{
int i;
int i;

LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);

for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
{
GlobalTransaction gxact = TwoPhaseState->prepXacts[i];
GlobalTransaction gxact = TwoPhaseState->prepXacts[i];

/* Ignore not-yet-valid GIDs */
if (!gxact->valid)
@@ -380,15 +379,15 @@ LockGXact(const char *gid, Oid user)
if (TransactionIdIsActive(gxact->locking_xid))
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("prepared transaction with identifier \"%s\" is busy",
gid)));
errmsg("prepared transaction with identifier \"%s\" is busy",
gid)));
gxact->locking_xid = InvalidTransactionId;
}

if (user != gxact->owner && !superuser_arg(user))
ereport(ERROR,
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
errmsg("permission denied to finish prepared transaction"),
errmsg("permission denied to finish prepared transaction"),
errhint("Must be superuser or the user that prepared the transaction.")));

/* OK for me to lock it */
@@ -403,8 +402,8 @@ LockGXact(const char *gid, Oid user)

ereport(ERROR,
(errcode(ERRCODE_UNDEFINED_OBJECT),
errmsg("prepared transaction with identifier \"%s\" does not exist",
gid)));
errmsg("prepared transaction with identifier \"%s\" does not exist",
gid)));

/* NOTREACHED */
return NULL;
@@ -419,7 +418,7 @@ LockGXact(const char *gid, Oid user)
static void
RemoveGXact(GlobalTransaction gxact)
{
int i;
int i;

LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE);

@@ -449,7 +448,7 @@ RemoveGXact(GlobalTransaction gxact)
/*
* TransactionIdIsPrepared
* True iff transaction associated with the identifier is prepared
* for two-phase commit
* for two-phase commit
*
* Note: only gxacts marked "valid" are considered; but notice we do not
* check the locking status.
@@ -459,14 +458,14 @@ RemoveGXact(GlobalTransaction gxact)
static bool
TransactionIdIsPrepared(TransactionId xid)
{
bool result = false;
int i;
bool result = false;
int i;

LWLockAcquire(TwoPhaseStateLock, LW_SHARED);

for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
{
GlobalTransaction gxact = TwoPhaseState->prepXacts[i];
GlobalTransaction gxact = TwoPhaseState->prepXacts[i];

if (gxact->valid && gxact->proc.xid == xid)
{
@@ -496,8 +495,8 @@ static int
GetPreparedTransactionList(GlobalTransaction *gxacts)
{
GlobalTransaction array;
int num;
int i;
int num;
int i;

LWLockAcquire(TwoPhaseStateLock, LW_SHARED);

@@ -526,13 +525,13 @@ GetPreparedTransactionList(GlobalTransaction *gxacts)
typedef struct
{
GlobalTransaction array;
int ngxacts;
int currIdx;
int ngxacts;
int currIdx;
} Working_State;

/*
* pg_prepared_xact
* Produce a view with one row per prepared transaction.
* Produce a view with one row per prepared transaction.
*
* This function is here so we don't have to export the
* GlobalTransactionData struct definition.
@@ -552,8 +551,7 @@ pg_prepared_xact(PG_FUNCTION_ARGS)
funcctx = SRF_FIRSTCALL_INIT();

/*
* Switch to memory context appropriate for multiple function
* calls
* Switch to memory context appropriate for multiple function calls
*/
oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);

@@ -574,8 +572,8 @@ pg_prepared_xact(PG_FUNCTION_ARGS)
funcctx->tuple_desc = BlessTupleDesc(tupdesc);

/*
* Collect all the 2PC status information that we will format and
* send out as a result set.
* Collect all the 2PC status information that we will format and send
* out as a result set.
*/
status = (Working_State *) palloc(sizeof(Working_State));
funcctx->user_fctx = (void *) status;
@@ -644,7 +642,7 @@ TwoPhaseGetDummyProc(TransactionId xid)

for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
{
GlobalTransaction gxact = TwoPhaseState->prepXacts[i];
GlobalTransaction gxact = TwoPhaseState->prepXacts[i];

if (gxact->proc.xid == xid)
{
@@ -665,7 +663,7 @@ TwoPhaseGetDummyProc(TransactionId xid)
}

/************************************************************************/
/* State file support */
/* State file support */
/************************************************************************/

#define TwoPhaseFilePath(path, xid) \
@@ -674,14 +672,14 @@ TwoPhaseGetDummyProc(TransactionId xid)
/*
* 2PC state file format:
*
* 1. TwoPhaseFileHeader
* 2. TransactionId[] (subtransactions)
* 1. TwoPhaseFileHeader
* 2. TransactionId[] (subtransactions)
* 3. RelFileNode[] (files to be deleted at commit)
* 4. RelFileNode[] (files to be deleted at abort)
* 5. TwoPhaseRecordOnDisk
* 6. ...
* 7. TwoPhaseRecordOnDisk (end sentinel, rmid == TWOPHASE_RM_END_ID)
* 8. CRC32
* 5. TwoPhaseRecordOnDisk
* 6. ...
* 7. TwoPhaseRecordOnDisk (end sentinel, rmid == TWOPHASE_RM_END_ID)
* 8. CRC32
*
* Each segment except the final CRC32 is MAXALIGN'd.
*/
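Given that layout, the total state-file length is just the sum of the MAXALIGN'd segments plus the unaligned trailing CRC. A rough sketch of that accounting; the header and RelFileNode sizes used here are assumptions for illustration, not the real struct sizes:

/* Illustrative sketch only. */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

#define ALIGN8(len) (((len) + 7) & ~(size_t) 7)   /* stand-in for MAXALIGN */

/* Rough total for a file laid out as header, subxact XIDs, two RelFileNode
 * lists, a run of 2PC records plus end sentinel, then a trailing CRC32. */
static size_t
state_file_size(int nsubxacts, int ncommitrels, int nabortrels,
                size_t record_bytes)
{
    size_t len = 0;

    len += ALIGN8(64);                        /* assumed header size */
    len += ALIGN8(nsubxacts * sizeof(uint32_t));
    len += ALIGN8(ncommitrels * 12);          /* assumed RelFileNode size */
    len += ALIGN8(nabortrels * 12);
    len += ALIGN8(record_bytes);              /* records incl. end sentinel */
    len += sizeof(uint32_t);                  /* CRC32 is not aligned */
    return len;
}

int
main(void)
{
    printf("example file length: %zu bytes\n",
           state_file_size(3, 1, 0, 40));
    return 0;
}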
@@ -693,16 +691,16 @@ TwoPhaseGetDummyProc(TransactionId xid)

typedef struct TwoPhaseFileHeader
{
uint32 magic; /* format identifier */
uint32 total_len; /* actual file length */
TransactionId xid; /* original transaction XID */
Oid database; /* OID of database it was in */
TimestampTz prepared_at; /* time of preparation */
Oid owner; /* user running the transaction */
int32 nsubxacts; /* number of following subxact XIDs */
int32 ncommitrels; /* number of delete-on-commit rels */
int32 nabortrels; /* number of delete-on-abort rels */
char gid[GIDSIZE]; /* GID for transaction */
uint32 magic; /* format identifier */
uint32 total_len; /* actual file length */
TransactionId xid; /* original transaction XID */
Oid database; /* OID of database it was in */
TimestampTz prepared_at; /* time of preparation */
Oid owner; /* user running the transaction */
int32 nsubxacts; /* number of following subxact XIDs */
int32 ncommitrels; /* number of delete-on-commit rels */
int32 nabortrels; /* number of delete-on-abort rels */
char gid[GIDSIZE]; /* GID for transaction */
} TwoPhaseFileHeader;

/*
@@ -713,9 +711,9 @@ typedef struct TwoPhaseFileHeader
*/
typedef struct TwoPhaseRecordOnDisk
{
uint32 len; /* length of rmgr data */
TwoPhaseRmgrId rmid; /* resource manager for this record */
uint16 info; /* flag bits for use by rmgr */
uint32 len; /* length of rmgr data */
TwoPhaseRmgrId rmid; /* resource manager for this record */
uint16 info; /* flag bits for use by rmgr */
} TwoPhaseRecordOnDisk;

/*
@@ -728,9 +726,9 @@ static struct xllist
{
XLogRecData *head; /* first data block in the chain */
XLogRecData *tail; /* last block in chain */
uint32 bytes_free; /* free bytes left in tail block */
uint32 total_len; /* total data bytes in chain */
} records;
uint32 bytes_free; /* free bytes left in tail block */
uint32 total_len; /* total data bytes in chain */
} records;


/*
@@ -744,7 +742,7 @@ static struct xllist
static void
save_state_data(const void *data, uint32 len)
{
uint32 padlen = MAXALIGN(len);
uint32 padlen = MAXALIGN(len);

if (padlen > records.bytes_free)
{
@@ -772,7 +770,7 @@ save_state_data(const void *data, uint32 len)
void
StartPrepare(GlobalTransaction gxact)
{
TransactionId xid = gxact->proc.xid;
TransactionId xid = gxact->proc.xid;
TwoPhaseFileHeader hdr;
TransactionId *children;
RelFileNode *commitrels;
@@ -833,13 +831,13 @@ StartPrepare(GlobalTransaction gxact)
void
EndPrepare(GlobalTransaction gxact)
{
TransactionId xid = gxact->proc.xid;
TransactionId xid = gxact->proc.xid;
TwoPhaseFileHeader *hdr;
char path[MAXPGPATH];
XLogRecData *record;
pg_crc32 statefile_crc;
pg_crc32 bogus_crc;
int fd;
char path[MAXPGPATH];
XLogRecData *record;
pg_crc32 statefile_crc;
pg_crc32 bogus_crc;
int fd;

/* Add the end sentinel to the list of 2PC records */
RegisterTwoPhaseRecord(TWOPHASE_RM_END_ID, 0,
@@ -853,10 +851,10 @@ EndPrepare(GlobalTransaction gxact)
/*
* Create the 2PC state file.
*
* Note: because we use BasicOpenFile(), we are responsible for ensuring
* the FD gets closed in any error exit path. Once we get into the
* critical section, though, it doesn't matter since any failure causes
* PANIC anyway.
* Note: because we use BasicOpenFile(), we are responsible for ensuring the
* FD gets closed in any error exit path. Once we get into the critical
* section, though, it doesn't matter since any failure causes PANIC
* anyway.
*/
TwoPhaseFilePath(path, xid);

@@ -887,11 +885,10 @@ EndPrepare(GlobalTransaction gxact)
FIN_CRC32(statefile_crc);

/*
* Write a deliberately bogus CRC to the state file; this is just
* paranoia to catch the case where four more bytes will run us out of
* disk space.
* Write a deliberately bogus CRC to the state file; this is just paranoia
* to catch the case where four more bytes will run us out of disk space.
*/
bogus_crc = ~ statefile_crc;
bogus_crc = ~statefile_crc;

if ((write(fd, &bogus_crc, sizeof(pg_crc32))) != sizeof(pg_crc32))
{
@@ -914,11 +911,11 @@ EndPrepare(GlobalTransaction gxact)
* The state file isn't valid yet, because we haven't written the correct
* CRC yet. Before we do that, insert entry in WAL and flush it to disk.
*
* Between the time we have written the WAL entry and the time we write
* out the correct state file CRC, we have an inconsistency: the xact is
* prepared according to WAL but not according to our on-disk state.
* We use a critical section to force a PANIC if we are unable to complete
* the write --- then, WAL replay should repair the inconsistency. The
* Between the time we have written the WAL entry and the time we write out
* the correct state file CRC, we have an inconsistency: the xact is
* prepared according to WAL but not according to our on-disk state. We
* use a critical section to force a PANIC if we are unable to complete
* the write --- then, WAL replay should repair the inconsistency. The
* odds of a PANIC actually occurring should be very tiny given that we
* were able to write the bogus CRC above.
*
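The bogus-CRC dance described above can be sketched with plain POSIX calls: reserve the CRC slot with a deliberately wrong value, make the WAL durable, then overwrite the placeholder with the real CRC. flush_wal() below is a stand-in for the XLOG insert and flush, not a real API:

/* Illustrative sketch only -- plain POSIX, not the backend's own I/O layer. */
#include <stdint.h>
#include <sys/types.h>
#include <unistd.h>

/* Stand-in: the real code writes a PREPARE WAL record and flushes it here. */
static void
flush_wal(void)
{
}

int
finish_state_file(int fd, uint32_t crc)
{
    uint32_t bogus = ~crc;                    /* deliberately wrong value */

    /* 1. Reserve the CRC slot with a bogus value: catches out-of-space now. */
    if (write(fd, &bogus, sizeof(bogus)) != sizeof(bogus))
        return -1;

    /* 2. Make the prepare durable in WAL first. */
    flush_wal();

    /* 3. Overwrite the placeholder with the real CRC and fsync. A failure
     *    here is a PANIC in the real code, and WAL replay repairs the
     *    half-written file. */
    if (lseek(fd, -(off_t) sizeof(crc), SEEK_CUR) < 0)
        return -1;
    if (write(fd, &crc, sizeof(crc)) != sizeof(crc))
        return -1;
    return fsync(fd);
}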
@@ -956,16 +953,16 @@ EndPrepare(GlobalTransaction gxact)
errmsg("could not close twophase state file: %m")));

/*
* Mark the prepared transaction as valid. As soon as xact.c marks
* MyProc as not running our XID (which it will do immediately after
* this function returns), others can commit/rollback the xact.
* Mark the prepared transaction as valid. As soon as xact.c marks MyProc
* as not running our XID (which it will do immediately after this
* function returns), others can commit/rollback the xact.
*
* NB: a side effect of this is to make a dummy ProcArray entry for the
* prepared XID. This must happen before we clear the XID from MyProc,
* else there is a window where the XID is not running according to
* TransactionIdInProgress, and onlookers would be entitled to assume
* the xact crashed. Instead we have a window where the same XID
* appears twice in ProcArray, which is OK.
* TransactionIdInProgress, and onlookers would be entitled to assume the
* xact crashed. Instead we have a window where the same XID appears
* twice in ProcArray, which is OK.
*/
MarkAsPrepared(gxact);

@@ -1011,9 +1008,10 @@ ReadTwoPhaseFile(TransactionId xid)
char *buf;
TwoPhaseFileHeader *hdr;
int fd;
struct stat stat;
struct stat stat;
uint32 crc_offset;
pg_crc32 calc_crc, file_crc;
pg_crc32 calc_crc,
file_crc;

TwoPhaseFilePath(path, xid);

@@ -1028,9 +1026,8 @@ ReadTwoPhaseFile(TransactionId xid)
}

/*
* Check file length. We can determine a lower bound pretty easily.
* We set an upper bound mainly to avoid palloc() failure on a corrupt
* file.
* Check file length. We can determine a lower bound pretty easily. We
* set an upper bound mainly to avoid palloc() failure on a corrupt file.
*/
if (fstat(fd, &stat))
{
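The length check described here is just a pair of bounds applied to fstat() output before anything is allocated. A sketch with assumed minimum and maximum sizes:

/* Illustrative sketch only; the limits are assumptions, not the real ones. */
#include <sys/types.h>
#include <sys/stat.h>

#define MIN_STATE_FILE  (64 + 4)            /* at least a header plus CRC */
#define MAX_STATE_FILE  (10 * 1024 * 1024)  /* refuse absurd sizes before allocating */

static int
state_file_length_ok(int fd, off_t *len)
{
    struct stat st;

    if (fstat(fd, &st) != 0)
        return 0;
    if (st.st_size < MIN_STATE_FILE || st.st_size > MAX_STATE_FILE)
        return 0;                           /* corrupt or truncated file */
    *len = st.st_size;
    return 1;
}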
@@ -1107,17 +1104,17 @@ FinishPreparedTransaction(const char *gid, bool isCommit)
{
GlobalTransaction gxact;
TransactionId xid;
char *buf;
char *bufptr;
char *buf;
char *bufptr;
TwoPhaseFileHeader *hdr;
TransactionId *children;
RelFileNode *commitrels;
RelFileNode *abortrels;
int i;
int i;

/*
* Validate the GID, and lock the GXACT to ensure that two backends
* do not try to commit the same GID at once.
* Validate the GID, and lock the GXACT to ensure that two backends do not
* try to commit the same GID at once.
*/
gxact = LockGXact(gid, GetUserId());
xid = gxact->proc.xid;
@@ -1148,10 +1145,10 @@ FinishPreparedTransaction(const char *gid, bool isCommit)
/*
* The order of operations here is critical: make the XLOG entry for
* commit or abort, then mark the transaction committed or aborted in
* pg_clog, then remove its PGPROC from the global ProcArray (which
* means TransactionIdIsInProgress will stop saying the prepared xact
* is in progress), then run the post-commit or post-abort callbacks.
* The callbacks will release the locks the transaction held.
* pg_clog, then remove its PGPROC from the global ProcArray (which means
* TransactionIdIsInProgress will stop saying the prepared xact is in
* progress), then run the post-commit or post-abort callbacks. The
* callbacks will release the locks the transaction held.
*/
if (isCommit)
RecordTransactionCommitPrepared(xid,
@@ -1165,18 +1162,18 @@ FinishPreparedTransaction(const char *gid, bool isCommit)
ProcArrayRemove(&gxact->proc);

/*
* In case we fail while running the callbacks, mark the gxact invalid
* so no one else will try to commit/rollback, and so it can be recycled
* properly later. It is still locked by our XID so it won't go away yet.
* In case we fail while running the callbacks, mark the gxact invalid so
* no one else will try to commit/rollback, and so it can be recycled
* properly later. It is still locked by our XID so it won't go away yet.
*
* (We assume it's safe to do this without taking TwoPhaseStateLock.)
*/
gxact->valid = false;

/*
* We have to remove any files that were supposed to be dropped.
* For consistency with the regular xact.c code paths, must do this
* before releasing locks, so do it before running the callbacks.
* We have to remove any files that were supposed to be dropped. For
* consistency with the regular xact.c code paths, must do this before
* releasing locks, so do it before running the callbacks.
*
* NB: this code knows that we couldn't be dropping any temp rels ...
*/
@@ -1228,8 +1225,8 @@ ProcessRecords(char *bufptr, TransactionId xid,
bufptr += MAXALIGN(sizeof(TwoPhaseRecordOnDisk));

if (callbacks[record->rmid] != NULL)
callbacks[record->rmid](xid, record->info,
(void *) bufptr, record->len);
callbacks[record->rmid] (xid, record->info,
(void *) bufptr, record->len);

bufptr += MAXALIGN(record->len);
}
@@ -1244,15 +1241,15 @@ ProcessRecords(char *bufptr, TransactionId xid,
void
RemoveTwoPhaseFile(TransactionId xid, bool giveWarning)
{
char path[MAXPGPATH];
char path[MAXPGPATH];

TwoPhaseFilePath(path, xid);
if (unlink(path))
if (errno != ENOENT || giveWarning)
ereport(WARNING,
(errcode_for_file_access(),
errmsg("could not remove two-phase state file \"%s\": %m",
path)));
errmsg("could not remove two-phase state file \"%s\": %m",
path)));
}

/*
@@ -1300,8 +1297,8 @@ RecreateTwoPhaseFile(TransactionId xid, void *content, int len)
}

/*
* We must fsync the file because the end-of-replay checkpoint will
* not do so, there being no GXACT in shared memory yet to tell it to.
* We must fsync the file because the end-of-replay checkpoint will not do
* so, there being no GXACT in shared memory yet to tell it to.
*/
if (pg_fsync(fd) != 0)
{
@@ -1343,15 +1340,15 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon)
int i;

/*
* We don't want to hold the TwoPhaseStateLock while doing I/O,
* so we grab it just long enough to make a list of the XIDs that
* require fsyncing, and then do the I/O afterwards.
* We don't want to hold the TwoPhaseStateLock while doing I/O, so we grab
* it just long enough to make a list of the XIDs that require fsyncing,
* and then do the I/O afterwards.
*
* This approach creates a race condition: someone else could delete
* a GXACT between the time we release TwoPhaseStateLock and the time
* we try to open its state file. We handle this by special-casing
* ENOENT failures: if we see that, we verify that the GXACT is no
* longer valid, and if so ignore the failure.
* This approach creates a race condition: someone else could delete a GXACT
* between the time we release TwoPhaseStateLock and the time we try to
* open its state file. We handle this by special-casing ENOENT failures:
* if we see that, we verify that the GXACT is no longer valid, and if so
* ignore the failure.
*/
if (max_prepared_xacts <= 0)
return; /* nothing to do */
@@ -1362,9 +1359,9 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon)

for (i = 0; i < TwoPhaseState->numPrepXacts; i++)
{
GlobalTransaction gxact = TwoPhaseState->prepXacts[i];
GlobalTransaction gxact = TwoPhaseState->prepXacts[i];

if (gxact->valid &&
if (gxact->valid &&
XLByteLE(gxact->prepare_lsn, redo_horizon))
xids[nxids++] = gxact->proc.xid;
}
@@ -1374,7 +1371,7 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon)
for (i = 0; i < nxids; i++)
{
TransactionId xid = xids[i];
int fd;
int fd;

TwoPhaseFilePath(path, xid);

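The checkpoint pattern above — copy the interesting XIDs while holding the lock, then do the slow fsyncs with the lock released, treating ENOENT as "already gone" — looks roughly like this standalone sketch; the pthread mutex and file naming are stand-ins, not backend APIs:

/* Illustrative sketch only. */
#include <errno.h>
#include <fcntl.h>
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

void
checkpoint_fsync(const unsigned *pending, int npending)
{
    unsigned xids[64];
    int nxids = 0, i;

    /* Copy the XIDs while holding the lock, but do no I/O yet. */
    pthread_mutex_lock(&table_lock);
    for (i = 0; i < npending && nxids < 64; i++)
        xids[nxids++] = pending[i];
    pthread_mutex_unlock(&table_lock);

    /* Do the fsyncs without the lock; tolerate files that vanished. */
    for (i = 0; i < nxids; i++)
    {
        char path[64];
        int  fd;

        snprintf(path, sizeof(path), "pg_twophase/%08X", xids[i]);
        fd = open(path, O_RDWR);
        if (fd < 0)
        {
            if (errno == ENOENT)
                continue;           /* someone finished the xact meanwhile */
            perror(path);
            continue;
        }
        if (fsync(fd) != 0)
            perror(path);
        close(fd);
    }
}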
@@ -1424,7 +1421,7 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon)
*
* We throw away any prepared xacts with main XID beyond nextXid --- if any
* are present, it suggests that the DBA has done a PITR recovery to an
* earlier point in time without cleaning out pg_twophase. We dare not
* earlier point in time without cleaning out pg_twophase. We dare not
* try to recover such prepared xacts since they likely depend on database
* state that doesn't exist now.
*
@@ -1442,7 +1439,7 @@ PrescanPreparedTransactions(void)
{
TransactionId origNextXid = ShmemVariableCache->nextXid;
TransactionId result = origNextXid;
DIR *cldir;
DIR *cldir;
struct dirent *clde;

cldir = AllocateDir(TWOPHASE_DIR);
@@ -1452,10 +1449,10 @@ PrescanPreparedTransactions(void)
strspn(clde->d_name, "0123456789ABCDEF") == 8)
{
TransactionId xid;
char *buf;
TwoPhaseFileHeader *hdr;
char *buf;
TwoPhaseFileHeader *hdr;
TransactionId *subxids;
int i;
int i;

xid = (TransactionId) strtoul(clde->d_name, NULL, 16);

@@ -1541,8 +1538,8 @@ PrescanPreparedTransactions(void)
void
RecoverPreparedTransactions(void)
{
char dir[MAXPGPATH];
DIR *cldir;
char dir[MAXPGPATH];
DIR *cldir;
struct dirent *clde;

snprintf(dir, MAXPGPATH, "%s", TWOPHASE_DIR);
@@ -1554,12 +1551,12 @@ RecoverPreparedTransactions(void)
strspn(clde->d_name, "0123456789ABCDEF") == 8)
{
TransactionId xid;
char *buf;
char *bufptr;
TwoPhaseFileHeader *hdr;
char *buf;
char *bufptr;
TwoPhaseFileHeader *hdr;
TransactionId *subxids;
GlobalTransaction gxact;
int i;
GlobalTransaction gxact;
int i;

xid = (TransactionId) strtoul(clde->d_name, NULL, 16);

@@ -1598,8 +1595,8 @@ RecoverPreparedTransactions(void)

/*
* Reconstruct subtrans state for the transaction --- needed
* because pg_subtrans is not preserved over a restart. Note
* that we are linking all the subtransactions directly to the
* because pg_subtrans is not preserved over a restart. Note that
* we are linking all the subtransactions directly to the
* top-level XID; there may originally have been a more complex
* hierarchy, but there's no need to restore that exactly.
*/
@@ -1609,12 +1606,12 @@ RecoverPreparedTransactions(void)
/*
* Recreate its GXACT and dummy PGPROC
*
* Note: since we don't have the PREPARE record's WAL location
* at hand, we leave prepare_lsn zeroes. This means the GXACT
* will be fsync'd on every future checkpoint. We assume this
* Note: since we don't have the PREPARE record's WAL location at
* hand, we leave prepare_lsn zeroes. This means the GXACT will
* be fsync'd on every future checkpoint. We assume this
* situation is infrequent enough that the performance cost is
* negligible (especially since we know the state file has
* already been fsynced).
* negligible (especially since we know the state file has already
* been fsynced).
*/
gxact = MarkAsPreparing(xid, hdr->gid,
hdr->prepared_at,
@@ -1773,12 +1770,11 @@ RecordTransactionAbortPrepared(TransactionId xid,
XLogFlush(recptr);

/*
* Mark the transaction aborted in clog. This is not absolutely
* necessary but we may as well do it while we are here.
* Mark the transaction aborted in clog. This is not absolutely necessary
* but we may as well do it while we are here.
*/
TransactionIdAbort(xid);
TransactionIdAbortTree(nchildren, children);

END_CRIT_SECTION();
}


@@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/transam/twophase_rmgr.c,v 1.1 2005/06/17 22:32:42 tgl Exp $
* $PostgreSQL: pgsql/src/backend/access/transam/twophase_rmgr.c,v 1.2 2005/10/15 02:49:09 momjian Exp $
*
*-------------------------------------------------------------------------
*/
@@ -21,29 +21,29 @@
#include "utils/inval.h"


const TwoPhaseCallback twophase_recover_callbacks[TWOPHASE_RM_MAX_ID + 1] =
const TwoPhaseCallback twophase_recover_callbacks[TWOPHASE_RM_MAX_ID + 1] =
{
NULL, /* END ID */
lock_twophase_recover, /* Lock */
NULL, /* Inval */
NULL, /* flat file update */
NULL /* notify/listen */
NULL, /* END ID */
lock_twophase_recover, /* Lock */
NULL, /* Inval */
NULL, /* flat file update */
NULL /* notify/listen */
};

const TwoPhaseCallback twophase_postcommit_callbacks[TWOPHASE_RM_MAX_ID + 1] =
const TwoPhaseCallback twophase_postcommit_callbacks[TWOPHASE_RM_MAX_ID + 1] =
{
NULL, /* END ID */
lock_twophase_postcommit, /* Lock */
inval_twophase_postcommit, /* Inval */
flatfile_twophase_postcommit, /* flat file update */
notify_twophase_postcommit /* notify/listen */
NULL, /* END ID */
lock_twophase_postcommit, /* Lock */
inval_twophase_postcommit, /* Inval */
flatfile_twophase_postcommit, /* flat file update */
notify_twophase_postcommit /* notify/listen */
};

const TwoPhaseCallback twophase_postabort_callbacks[TWOPHASE_RM_MAX_ID + 1] =
const TwoPhaseCallback twophase_postabort_callbacks[TWOPHASE_RM_MAX_ID + 1] =
{
NULL, /* END ID */
lock_twophase_postabort, /* Lock */
NULL, /* Inval */
NULL, /* flat file update */
NULL /* notify/listen */
NULL, /* END ID */
lock_twophase_postabort, /* Lock */
NULL, /* Inval */
NULL, /* flat file update */
NULL /* notify/listen */
};

@@ -6,7 +6,7 @@
* Copyright (c) 2000-2005, PostgreSQL Global Development Group
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/access/transam/varsup.c,v 1.66 2005/08/22 16:59:47 momjian Exp $
* $PostgreSQL: pgsql/src/backend/access/transam/varsup.c,v 1.67 2005/10/15 02:49:09 momjian Exp $
*
*-------------------------------------------------------------------------
*/
@@ -49,21 +49,21 @@ GetNewTransactionId(bool isSubXact)
xid = ShmemVariableCache->nextXid;

/*
* Check to see if it's safe to assign another XID. This protects
* against catastrophic data loss due to XID wraparound. The basic
* rules are: warn if we're past xidWarnLimit, and refuse to execute
* transactions if we're past xidStopLimit, unless we are running in
* a standalone backend (which gives an escape hatch to the DBA who
* ignored all those warnings).
* Check to see if it's safe to assign another XID. This protects against
* catastrophic data loss due to XID wraparound. The basic rules are:
* warn if we're past xidWarnLimit, and refuse to execute transactions if
* we're past xidStopLimit, unless we are running in a standalone backend
* (which gives an escape hatch to the DBA who ignored all those
* warnings).
*
* Test is coded to fall out as fast as possible during normal operation,
* ie, when the warn limit is set and we haven't violated it.
* Test is coded to fall out as fast as possible during normal operation, ie,
* when the warn limit is set and we haven't violated it.
*/
if (TransactionIdFollowsOrEquals(xid, ShmemVariableCache->xidWarnLimit) &&
TransactionIdIsValid(ShmemVariableCache->xidWarnLimit))
{
if (IsUnderPostmaster &&
TransactionIdFollowsOrEquals(xid, ShmemVariableCache->xidStopLimit))
TransactionIdFollowsOrEquals(xid, ShmemVariableCache->xidStopLimit))
ereport(ERROR,
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
errmsg("database is not accepting queries to avoid wraparound data loss in database \"%s\"",
@@ -72,20 +72,19 @@ GetNewTransactionId(bool isSubXact)
NameStr(ShmemVariableCache->limit_datname))));
else
ereport(WARNING,
(errmsg("database \"%s\" must be vacuumed within %u transactions",
NameStr(ShmemVariableCache->limit_datname),
ShmemVariableCache->xidWrapLimit - xid),
errhint("To avoid a database shutdown, execute a full-database VACUUM in \"%s\".",
NameStr(ShmemVariableCache->limit_datname))));
(errmsg("database \"%s\" must be vacuumed within %u transactions",
NameStr(ShmemVariableCache->limit_datname),
ShmemVariableCache->xidWrapLimit - xid),
errhint("To avoid a database shutdown, execute a full-database VACUUM in \"%s\".",
NameStr(ShmemVariableCache->limit_datname))));
}

/*
* If we are allocating the first XID of a new page of the commit log,
* zero out that commit-log page before returning. We must do this
* while holding XidGenLock, else another xact could acquire and
* commit a later XID before we zero the page. Fortunately, a page of
* the commit log holds 32K or more transactions, so we don't have to
* do this very often.
* zero out that commit-log page before returning. We must do this while
* holding XidGenLock, else another xact could acquire and commit a later
* XID before we zero the page. Fortunately, a page of the commit log
* holds 32K or more transactions, so we don't have to do this very often.
*
* Extend pg_subtrans too.
*/
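Two small pieces of arithmetic underlie this hunk: an XID starts a new clog page exactly when it is a multiple of the per-page transaction count, and "follows or equals" has to be computed modulo 2^32. A sketch with an assumed page size:

/* Illustrative sketch only; the page size constant is an assumption. */
#include <stdio.h>
#include <stdint.h>

typedef uint32_t TransactionId;

#define CLOG_XACTS_PER_PAGE 32768     /* "32K or more transactions" per page */

/* Wraparound-aware "a follows or equals b", comparing modulo 2^32. */
static int
xid_follows_or_equals(TransactionId a, TransactionId b)
{
    return (int32_t) (a - b) >= 0;
}

int
main(void)
{
    TransactionId xid = 3 * CLOG_XACTS_PER_PAGE;   /* first XID on its page */

    if (xid % CLOG_XACTS_PER_PAGE == 0)
        printf("xid %u starts a new clog page: zero it before handing it out\n",
               (unsigned) xid);

    printf("past limit? %d\n", xid_follows_or_equals(xid, 100000));
    return 0;
}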
@@ -93,45 +92,43 @@ GetNewTransactionId(bool isSubXact)
ExtendSUBTRANS(xid);

/*
* Now advance the nextXid counter. This must not happen until after
* we have successfully completed ExtendCLOG() --- if that routine
* fails, we want the next incoming transaction to try it again. We
* cannot assign more XIDs until there is CLOG space for them.
* Now advance the nextXid counter. This must not happen until after we
* have successfully completed ExtendCLOG() --- if that routine fails, we
* want the next incoming transaction to try it again. We cannot assign
* more XIDs until there is CLOG space for them.
*/
TransactionIdAdvance(ShmemVariableCache->nextXid);

/*
* We must store the new XID into the shared PGPROC array before
* releasing XidGenLock. This ensures that when GetSnapshotData calls
* We must store the new XID into the shared PGPROC array before releasing
* XidGenLock. This ensures that when GetSnapshotData calls
* ReadNewTransactionId, all active XIDs before the returned value of
* nextXid are already present in PGPROC. Else we have a race
* condition.
* nextXid are already present in PGPROC. Else we have a race condition.
*
* XXX by storing xid into MyProc without acquiring ProcArrayLock, we are
* relying on fetch/store of an xid to be atomic, else other backends
* might see a partially-set xid here. But holding both locks at once
* would be a nasty concurrency hit (and in fact could cause a
* deadlock against GetSnapshotData). So for now, assume atomicity.
* Note that readers of PGPROC xid field should be careful to fetch
* the value only once, rather than assume they can read it multiple
* times and get the same answer each time.
* would be a nasty concurrency hit (and in fact could cause a deadlock
* against GetSnapshotData). So for now, assume atomicity. Note that
* readers of PGPROC xid field should be careful to fetch the value only
* once, rather than assume they can read it multiple times and get the
* same answer each time.
*
* The same comments apply to the subxact xid count and overflow fields.
*
* A solution to the atomic-store problem would be to give each PGPROC
* its own spinlock used only for fetching/storing that PGPROC's xid
* and related fields.
* A solution to the atomic-store problem would be to give each PGPROC its
* own spinlock used only for fetching/storing that PGPROC's xid and
* related fields.
*
* If there's no room to fit a subtransaction XID into PGPROC, set the
* cache-overflowed flag instead. This forces readers to look in
* pg_subtrans to map subtransaction XIDs up to top-level XIDs. There
* is a race-condition window, in that the new XID will not appear as
* running until its parent link has been placed into pg_subtrans.
* However, that will happen before anyone could possibly have a
* reason to inquire about the status of the XID, so it seems OK.
* (Snapshots taken during this window *will* include the parent XID,
* so they will deliver the correct answer later on when someone does
* have a reason to inquire.)
* pg_subtrans to map subtransaction XIDs up to top-level XIDs. There is a
* race-condition window, in that the new XID will not appear as running
* until its parent link has been placed into pg_subtrans. However, that
* will happen before anyone could possibly have a reason to inquire about
* the status of the XID, so it seems OK. (Snapshots taken during this
* window *will* include the parent XID, so they will deliver the correct
* answer later on when someone does have a reason to inquire.)
*/
if (MyProc != NULL)
{
@@ -197,27 +194,26 @@ SetTransactionIdLimit(TransactionId oldest_datfrozenxid,
xidWrapLimit += FirstNormalTransactionId;

/*
* We'll refuse to continue assigning XIDs in interactive mode once
* we get within 1M transactions of data loss. This leaves lots
* of room for the DBA to fool around fixing things in a standalone
* backend, while not being significant compared to total XID space.
* (Note that since vacuuming requires one transaction per table
* cleaned, we had better be sure there's lots of XIDs left...)
* We'll refuse to continue assigning XIDs in interactive mode once we get
* within 1M transactions of data loss. This leaves lots of room for the
* DBA to fool around fixing things in a standalone backend, while not
* being significant compared to total XID space. (Note that since
* vacuuming requires one transaction per table cleaned, we had better be
* sure there's lots of XIDs left...)
*/
xidStopLimit = xidWrapLimit - 1000000;
if (xidStopLimit < FirstNormalTransactionId)
xidStopLimit -= FirstNormalTransactionId;

/*
* We'll start complaining loudly when we get within 10M transactions
* of the stop point. This is kind of arbitrary, but if you let your
* gas gauge get down to 1% of full, would you be looking for the
* next gas station? We need to be fairly liberal about this number
* because there are lots of scenarios where most transactions are
* done by automatic clients that won't pay attention to warnings.
* (No, we're not gonna make this configurable. If you know enough to
* configure it, you know enough to not get in this kind of trouble in
* the first place.)
* We'll start complaining loudly when we get within 10M transactions of
* the stop point. This is kind of arbitrary, but if you let your gas
* gauge get down to 1% of full, would you be looking for the next gas
* station? We need to be fairly liberal about this number because there
* are lots of scenarios where most transactions are done by automatic
* clients that won't pay attention to warnings. (No, we're not gonna make
* this configurable. If you know enough to configure it, you know enough
* to not get in this kind of trouble in the first place.)
*/
xidWarnLimit = xidStopLimit - 10000000;
if (xidWarnLimit < FirstNormalTransactionId)
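The limit arithmetic in this hunk is straightforward unsigned math: stop one million XIDs before the wrap point, warn ten million before that, skipping over the reserved low XIDs when the subtraction wraps. A sketch with a made-up wrap limit; FIRST_NORMAL_XID mirrors FirstNormalTransactionId:

/* Illustrative sketch only. */
#include <stdio.h>
#include <stdint.h>

#define FIRST_NORMAL_XID 3u

int
main(void)
{
    uint32_t wrap_limit = 4000000000u;              /* pretend value for the example */
    uint32_t stop_limit, warn_limit;

    stop_limit = wrap_limit - 1000000;              /* refuse new XIDs 1M before loss */
    if (stop_limit < FIRST_NORMAL_XID)              /* skip the special XIDs on wrap */
        stop_limit -= FIRST_NORMAL_XID;

    warn_limit = stop_limit - 10000000;             /* start warning 10M earlier */
    if (warn_limit < FIRST_NORMAL_XID)
        warn_limit -= FIRST_NORMAL_XID;

    printf("wrap=%u stop=%u warn=%u\n", wrap_limit, stop_limit, warn_limit);
    return 0;
}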
@@ -234,16 +230,16 @@ SetTransactionIdLimit(TransactionId oldest_datfrozenxid,

/* Log the info */
ereport(LOG,
(errmsg("transaction ID wrap limit is %u, limited by database \"%s\"",
xidWrapLimit, NameStr(*oldest_datname))));
(errmsg("transaction ID wrap limit is %u, limited by database \"%s\"",
xidWrapLimit, NameStr(*oldest_datname))));
/* Give an immediate warning if past the wrap warn point */
if (TransactionIdFollowsOrEquals(curXid, xidWarnLimit))
ereport(WARNING,
(errmsg("database \"%s\" must be vacuumed within %u transactions",
NameStr(*oldest_datname),
xidWrapLimit - curXid),
errhint("To avoid a database shutdown, execute a full-database VACUUM in \"%s\".",
NameStr(*oldest_datname))));
(errmsg("database \"%s\" must be vacuumed within %u transactions",
NameStr(*oldest_datname),
xidWrapLimit - curXid),
errhint("To avoid a database shutdown, execute a full-database VACUUM in \"%s\".",
NameStr(*oldest_datname))));
}


@@ -272,11 +268,11 @@ GetNewObjectId(void)
* right after a wrap occurs, so as to avoid a possibly large number of
* iterations in GetNewOid.) Note we are relying on unsigned comparison.
*
* During initdb, we start the OID generator at FirstBootstrapObjectId,
* so we only enforce wrapping to that point when in bootstrap or
* standalone mode. The first time through this routine after normal
* postmaster start, the counter will be forced up to FirstNormalObjectId.
* This mechanism leaves the OIDs between FirstBootstrapObjectId and
* During initdb, we start the OID generator at FirstBootstrapObjectId, so we
* only enforce wrapping to that point when in bootstrap or standalone
* mode. The first time through this routine after normal postmaster
* start, the counter will be forced up to FirstNormalObjectId. This
* mechanism leaves the OIDs between FirstBootstrapObjectId and
* FirstNormalObjectId available for automatic assignment during initdb,
* while ensuring they will never conflict with user-assigned OIDs.
*/

File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -11,7 +11,7 @@
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/backend/access/transam/xlogutils.c,v 1.38 2005/06/06 17:01:23 tgl Exp $
* $PostgreSQL: pgsql/src/backend/access/transam/xlogutils.c,v 1.39 2005/10/15 02:49:11 momjian Exp $
*
*-------------------------------------------------------------------------
*/
@@ -121,7 +121,7 @@ _xl_remove_hash_entry(XLogRelDesc *rdesc)
rdesc->moreRecently->lessRecently = rdesc->lessRecently;

hentry = (XLogRelCacheEntry *) hash_search(_xlrelcache,
(void *) &(rdesc->reldata.rd_node), HASH_REMOVE, NULL);
(void *) &(rdesc->reldata.rd_node), HASH_REMOVE, NULL);
if (hentry == NULL)
elog(PANIC, "_xl_remove_hash_entry: file was not found in cache");

@@ -211,11 +211,11 @@ XLogOpenRelation(RelFileNode rnode)
res->reldata.rd_node = rnode;

/*
* We set up the lockRelId in case anything tries to lock the
* dummy relation. Note that this is fairly bogus since relNode
* may be different from the relation's OID. It shouldn't really
* matter though, since we are presumably running by ourselves and
* can't have any lock conflicts ...
* We set up the lockRelId in case anything tries to lock the dummy
* relation. Note that this is fairly bogus since relNode may be
* different from the relation's OID. It shouldn't really matter
* though, since we are presumably running by ourselves and can't have
* any lock conflicts ...
*/
res->reldata.rd_lockInfo.lockRelId.dbId = rnode.dbNode;
res->reldata.rd_lockInfo.lockRelId.relId = rnode.relNode;
@@ -233,13 +233,13 @@ XLogOpenRelation(RelFileNode rnode)
RelationOpenSmgr(&(res->reldata));

/*
* Create the target file if it doesn't already exist. This lets
* us cope if the replay sequence contains writes to a relation
* that is later deleted. (The original coding of this routine
* would instead return NULL, causing the writes to be suppressed.
* But that seems like it risks losing valuable data if the
* filesystem loses an inode during a crash. Better to write the
* data until we are actually told to delete the file.)
* Create the target file if it doesn't already exist. This lets us
* cope if the replay sequence contains writes to a relation that is
* later deleted. (The original coding of this routine would instead
* return NULL, causing the writes to be suppressed. But that seems
* like it risks losing valuable data if the filesystem loses an inode
* during a crash. Better to write the data until we are actually
* told to delete the file.)
*/
smgrcreate(res->reldata.rd_smgr, res->reldata.rd_istemp, true);
}
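The create-if-missing behaviour described in that last hunk maps onto an ordinary open() with O_CREAT during replay: better to write data into a file that will later be deleted than to drop it. A standalone sketch, not the smgr code itself:

/* Illustrative sketch only -- plain POSIX standing in for smgrcreate()'s behaviour. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* During replay, writes may target a file that was later deleted; creating an
 * empty file and writing anyway is safer than silently dropping the data. */
int
open_for_replay(const char *path)
{
    int fd = open(path, O_RDWR | O_CREAT, 0600);

    if (fd < 0)
        perror(path);
    return fd;
}

int
main(void)
{
    int fd = open_for_replay("replay_target.dat");

    if (fd >= 0)
        close(fd);
    return 0;
}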
|
||||