Robert Haas bbe0a81db6 Allow configurable LZ4 TOAST compression.
There is now a per-column COMPRESSION option which can be set to pglz
(the default, and the only option up until now) or lz4. Or, if you
like, you can set the new default_toast_compression GUC to lz4, and
then that will be the default for new table columns for which no value
is specified. The PostgreSQL source tree does not bundle lz4 itself, so
to use lz4 compression, PostgreSQL must be built --with-lz4.
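
As an illustration of the build dependency (this helper is invented for
this sketch, not code from the commit): the choice of compressor ends up
conditional on the configure flag, since --with-lz4 defines USE_LZ4, and
the catalog records the method as a one-letter code ('p' for pglz, 'l'
for lz4).

    #include <stdbool.h>

    /*
     * Hypothetical helper (not from this commit): report whether a given
     * one-letter compression method code is usable in this build.  'p'
     * (pglz) is always available; 'l' (lz4) only when PostgreSQL was
     * configured --with-lz4, which defines USE_LZ4.
     */
    static bool
    demo_method_available(char method)
    {
        switch (method)
        {
            case 'p':               /* pglz, built in */
                return true;
            case 'l':               /* lz4, optional */
    #ifdef USE_LZ4
                return true;
    #else
                return false;
    #endif
            default:
                return false;
        }
    }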

In general, TOAST compression means compression of individual column
values, not the whole tuple, and those values can either be compressed
inline within the tuple or compressed and then stored externally in
the TOAST table, so those properties also apply to this feature.

Prior to this commit, a TOAST pointer has two unused bits as part of
the va_extsize field, and a compressed datum has two unused bits as
part of the va_rawsize field. These bits are unused because the length
of a varlena is limited to 1GB; we now use them to indicate the
compression type that was used. This means we only have bit space for
2 more built-in compression types, but we could work around that
problem, if necessary, by introducing a new vartag_external value for
any further types we end up wanting to add. Hopefully, it won't be
too important to offer a wide selection of algorithms here, since
each one we add not only takes more coding but also adds a build
dependency for every packager. Nevertheless, it seems worth doing
at least this much, because LZ4 gets better compression than PGLZ
with less CPU usage.
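
To make the bit arithmetic concrete, here is a standalone sketch (the
macro names are invented for illustration, not the ones the tree uses)
of packing a 30-bit length and a 2-bit method ID into one 32-bit word;
since 1GB is 2^30 bytes, the top two bits are otherwise always zero,
and with two method IDs already assigned, two remain, matching the
count above.

    #include <stdint.h>
    #include <assert.h>

    /* Illustrative only: 30 bits of size, 2 bits of method ID. */
    #define DEMO_SIZE_BITS   30
    #define DEMO_SIZE_MASK   ((1U << DEMO_SIZE_BITS) - 1)   /* 0x3FFFFFFF */

    #define DEMO_PACK(size, method) \
        (((uint32_t) (size) & DEMO_SIZE_MASK) | \
         ((uint32_t) (method) << DEMO_SIZE_BITS))
    #define DEMO_GET_SIZE(word)     ((word) & DEMO_SIZE_MASK)
    #define DEMO_GET_METHOD(word)   ((word) >> DEMO_SIZE_BITS)

    int
    main(void)
    {
        uint32_t    word = DEMO_PACK(123456, 1);    /* method ID 1 */

        assert(DEMO_GET_SIZE(word) == 123456);
        assert(DEMO_GET_METHOD(word) == 1);
        return 0;
    }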

It's possible for LZ4-compressed datums to leak into composite type
values stored on disk, just as it is for PGLZ. It's also possible for
LZ4-compressed attributes to be copied into a different table via SQL
commands such as CREATE TABLE AS or INSERT .. SELECT.  It would be
expensive to force such values to be decompressed, so PostgreSQL has
never done so. For the same reasons, we also don't force recompression
of already-compressed values even if the target table prefers a
different compression method than was used for the source data.  These
architectural decisions are perhaps arguable but revisiting them is
well beyond the scope of what seemed possible to do as part of this
project.  However, it's relatively cheap to recompress as part of
VACUUM FULL or CLUSTER, so this commit adjusts those commands to do
so, if the configured compression method of the table happens not to
match what was used for some column value stored therein.
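
Sketched below (the DemoValue struct and helpers are stand-ins invented
for this illustration, not the actual TOAST machinery), the rewrite-time
rule is simply: recompress only when a value is compressed with a method
other than the column's configured one.

    /* stub: would decompress with v.method, recompress with target_method */
    typedef struct DemoValue
    {
        int         is_compressed;  /* nonzero if bytes are compressed */
        char        method;         /* 'p' = pglz, 'l' = lz4 */
    } DemoValue;

    static DemoValue demo_recompress(DemoValue v, char target_method);

    static DemoValue
    demo_rewrite_value(DemoValue v, char target_method)
    {
        /* VACUUM FULL / CLUSTER: recompress only on a method mismatch */
        if (v.is_compressed && v.method != target_method)
            return demo_recompress(v, target_method);

        /* Otherwise copy the value verbatim; recompression would be wasted */
        return v;
    }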

Dilip Kumar. The original patches on which this work was based were
written by Ildus Kurbangaliev, and those patches were based on even
earlier work by Nikita Glukhov, but the design has since changed very
substantially, since allowing a potentially large number of
compression methods to be added and dropped on a running system
proved too problematic given some of the architectural issues
mentioned above; the choice of which specific compression method to
add first is now different; and a lot of the code has been heavily
refactored.  More recently, Justin Pryzby helped quite a bit with
testing and reviewing and this version also includes some code
contributions from him. Other design input and review from Tomas
Vondra, Álvaro Herrera, Andres Freund, Oleg Bartunov, Alexander
Korotkov, and me.

Discussion: http://postgr.es/m/20170907194236.4cefce96%40wp.localdomain
Discussion: http://postgr.es/m/CAFiTN-uUpX3ck%3DK0mLEk-G_kUQY%3DSNOTeqdaNRR9FMdQrHKebw%40mail.gmail.com
2021-03-19 15:10:38 -04:00

/*-------------------------------------------------------------------------
 *
 * indextuple.c
 *     This file contains index tuple accessor and mutator routines,
 *     as well as various tuple utilities.
 *
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *     src/backend/access/common/indextuple.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/detoast.h"
#include "access/heaptoast.h"
#include "access/htup_details.h"
#include "access/itup.h"
#include "access/toast_internals.h"

/*
 * This enables de-toasting of index entries.  Needed until VACUUM is
 * smart enough to rebuild indexes from scratch.
 */
#define TOAST_INDEX_HACK
/* ----------------------------------------------------------------
 *                index_ tuple interface routines
 * ----------------------------------------------------------------
 */

/* ----------------
 *        index_form_tuple
 *
 *        This shouldn't leak any memory; otherwise, callers such as
 *        tuplesort_putindextuplevalues() will be very unhappy.
 *
 *        This shouldn't perform external table access provided caller
 *        does not pass values that are stored EXTERNAL.
 * ----------------
 */
IndexTuple
index_form_tuple(TupleDesc tupleDescriptor,
                 Datum *values,
                 bool *isnull)
{
    char       *tp;             /* tuple pointer */
    IndexTuple  tuple;          /* return tuple */
    Size        size,
                data_size,
                hoff;
    int         i;
    unsigned short infomask = 0;
    bool        hasnull = false;
    uint16      tupmask = 0;
    int         numberOfAttributes = tupleDescriptor->natts;

#ifdef TOAST_INDEX_HACK
    Datum       untoasted_values[INDEX_MAX_KEYS];
    bool        untoasted_free[INDEX_MAX_KEYS];
#endif

    if (numberOfAttributes > INDEX_MAX_KEYS)
        ereport(ERROR,
                (errcode(ERRCODE_TOO_MANY_COLUMNS),
                 errmsg("number of index columns (%d) exceeds limit (%d)",
                        numberOfAttributes, INDEX_MAX_KEYS)));

#ifdef TOAST_INDEX_HACK
    for (i = 0; i < numberOfAttributes; i++)
    {
        Form_pg_attribute att = TupleDescAttr(tupleDescriptor, i);

        untoasted_values[i] = values[i];
        untoasted_free[i] = false;

        /* Do nothing if value is NULL or not of varlena type */
        if (isnull[i] || att->attlen != -1)
            continue;

        /*
         * If value is stored EXTERNAL, must fetch it so we are not depending
         * on outside storage.  This should be improved someday.
         */
        if (VARATT_IS_EXTERNAL(DatumGetPointer(values[i])))
        {
            untoasted_values[i] =
                PointerGetDatum(detoast_external_attr((struct varlena *)
                                                      DatumGetPointer(values[i])));
            untoasted_free[i] = true;
        }

        /*
         * If value is above size target, and is of a compressible datatype,
         * try to compress it in-line.
         */
        if (!VARATT_IS_EXTENDED(DatumGetPointer(untoasted_values[i])) &&
            VARSIZE(DatumGetPointer(untoasted_values[i])) > TOAST_INDEX_TARGET &&
            (att->attstorage == TYPSTORAGE_EXTENDED ||
             att->attstorage == TYPSTORAGE_MAIN))
        {
            Datum       cvalue = toast_compress_datum(untoasted_values[i],
                                                      att->attcompression);

            if (DatumGetPointer(cvalue) != NULL)
            {
                /* successful compression */
                if (untoasted_free[i])
                    pfree(DatumGetPointer(untoasted_values[i]));
                untoasted_values[i] = cvalue;
                untoasted_free[i] = true;
            }
        }
    }
#endif

    for (i = 0; i < numberOfAttributes; i++)
    {
        if (isnull[i])
        {
            hasnull = true;
            break;
        }
    }

    if (hasnull)
        infomask |= INDEX_NULL_MASK;

    hoff = IndexInfoFindDataOffset(infomask);
#ifdef TOAST_INDEX_HACK
    data_size = heap_compute_data_size(tupleDescriptor,
                                       untoasted_values, isnull);
#else
    data_size = heap_compute_data_size(tupleDescriptor,
                                       values, isnull);
#endif
    size = hoff + data_size;
    size = MAXALIGN(size);      /* be conservative */

    tp = (char *) palloc0(size);
    tuple = (IndexTuple) tp;

    heap_fill_tuple(tupleDescriptor,
#ifdef TOAST_INDEX_HACK
                    untoasted_values,
#else
                    values,
#endif
                    isnull,
                    (char *) tp + hoff,
                    data_size,
                    &tupmask,
                    (hasnull ? (bits8 *) tp + sizeof(IndexTupleData) : NULL));

#ifdef TOAST_INDEX_HACK
    for (i = 0; i < numberOfAttributes; i++)
    {
        if (untoasted_free[i])
            pfree(DatumGetPointer(untoasted_values[i]));
    }
#endif

    /*
     * We do this because heap_fill_tuple wants to initialize a "tupmask"
     * which is used for HeapTuples, but we want an indextuple infomask.  The
     * only relevant info is the "has variable attributes" field.  We have
     * already set the hasnull bit above.
     */
    if (tupmask & HEAP_HASVARWIDTH)
        infomask |= INDEX_VAR_MASK;

    /* Also assert we got rid of external attributes */
#ifdef TOAST_INDEX_HACK
    Assert((tupmask & HEAP_HASEXTERNAL) == 0);
#endif

    /*
     * Here we make sure that the size will fit in the field reserved for it
     * in t_info.
     */
    if ((size & INDEX_SIZE_MASK) != size)
        ereport(ERROR,
                (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                 errmsg("index row requires %zu bytes, maximum size is %zu",
                        size, (Size) INDEX_SIZE_MASK)));

    infomask |= size;

    /*
     * initialize metadata
     */
    tuple->t_info = infomask;
    return tuple;
}
/* ----------------
 *        nocache_index_getattr
 *
 *        This gets called from the index_getattr() macro, in cases where
 *        we can't use a cached offset and the value is not null.
 *
 *        This caches attribute offsets in the attribute descriptor.
 *
 *        An alternative way to speed things up would be to cache offsets
 *        with the tuple, but that seems more difficult unless you take
 *        the storage hit of actually putting those offsets into the
 *        tuple you send to disk.  Yuck.
 *
 *        This scheme will be slightly slower than that, but should
 *        perform well for queries which hit large #'s of tuples.  After
 *        you cache the offsets once, examining all the other tuples using
 *        the same attribute descriptor will go much quicker. -cim 5/4/91
 * ----------------
 */
Datum
nocache_index_getattr(IndexTuple tup,
                      int attnum,
                      TupleDesc tupleDesc)
{
    char       *tp;             /* ptr to data part of tuple */
    bits8      *bp = NULL;      /* ptr to null bitmap in tuple */
    bool        slow = false;   /* do we have to walk attrs? */
    int         data_off;       /* tuple data offset */
    int         off;            /* current offset within data */

    /* ----------------
     *   Three cases:
     *
     *   1: No nulls and no variable-width attributes.
     *   2: Has a null or a var-width AFTER att.
     *   3: Has nulls or var-widths BEFORE att.
     * ----------------
     */

    data_off = IndexInfoFindDataOffset(tup->t_info);

    attnum--;

    if (IndexTupleHasNulls(tup))
    {
        /*
         * there's a null somewhere in the tuple
         *
         * check to see if desired att is null
         */

        /* XXX "knows" t_bits are just after fixed tuple header! */
        bp = (bits8 *) ((char *) tup + sizeof(IndexTupleData));

        /*
         * Now check to see if any preceding bits are null...
         */
        {
            int         byte = attnum >> 3;
            int         finalbit = attnum & 0x07;
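            /*
             * Worked example (illustrative): for zero-based attnum 10, byte
             * = 10 >> 3 = 1 and finalbit = 10 & 0x07 = 2, so this
             * attribute's null bit is bit 2 of t_bits[1], and the bits below
             * it in that byte belong to attributes 8 and 9.
             */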
            /* check for nulls "before" final bit of last byte */
            if ((~bp[byte]) & ((1 << finalbit) - 1))
                slow = true;
            else
            {
                /* check for nulls in any "earlier" bytes */
                int         i;

                for (i = 0; i < byte; i++)
                {
                    if (bp[i] != 0xFF)
                    {
                        slow = true;
                        break;
                    }
                }
            }
        }
    }

    tp = (char *) tup + data_off;

    if (!slow)
    {
        Form_pg_attribute att;

        /*
         * If we get here, there are no nulls up to and including the target
         * attribute.  If we have a cached offset, we can use it.
         */
        att = TupleDescAttr(tupleDesc, attnum);
        if (att->attcacheoff >= 0)
            return fetchatt(att, tp + att->attcacheoff);

        /*
         * Otherwise, check for non-fixed-length attrs up to and including
         * target.  If there aren't any, it's safe to cheaply initialize the
         * cached offsets for these attrs.
         */
        if (IndexTupleHasVarwidths(tup))
        {
            int         j;

            for (j = 0; j <= attnum; j++)
            {
                if (TupleDescAttr(tupleDesc, j)->attlen <= 0)
                {
                    slow = true;
                    break;
                }
            }
        }
    }

    if (!slow)
    {
        int         natts = tupleDesc->natts;
        int         j = 1;

        /*
         * If we get here, we have a tuple with no nulls or var-widths up to
         * and including the target attribute, so we can use the cached
         * offset ... only we don't have it yet, or we'd not have got here.
         * Since it's cheap to compute offsets for fixed-width columns, we
         * take the opportunity to initialize the cached offsets for *all*
         * the leading fixed-width columns, in hope of avoiding future
         * visits to this routine.
         */
        TupleDescAttr(tupleDesc, 0)->attcacheoff = 0;

        /* we might have set some offsets in the slow path previously */
        while (j < natts && TupleDescAttr(tupleDesc, j)->attcacheoff > 0)
            j++;

        off = TupleDescAttr(tupleDesc, j - 1)->attcacheoff +
            TupleDescAttr(tupleDesc, j - 1)->attlen;

        for (; j < natts; j++)
        {
            Form_pg_attribute att = TupleDescAttr(tupleDesc, j);

            if (att->attlen <= 0)
                break;

            off = att_align_nominal(off, att->attalign);

            att->attcacheoff = off;

            off += att->attlen;
        }

        Assert(j > attnum);

        off = TupleDescAttr(tupleDesc, attnum)->attcacheoff;
    }
    else
    {
        bool        usecache = true;
        int         i;

        /*
         * Now we know that we have to walk the tuple CAREFULLY.  But we
         * still might be able to cache some offsets for next time.
         *
         * Note - This loop is a little tricky.  For each non-null attribute,
         * we have to first account for alignment padding before the attr,
         * then advance over the attr based on its length.  Nulls have no
         * storage and no alignment padding either.  We can use/set
         * attcacheoff until we reach either a null or a var-width attribute.
         */
        off = 0;
        for (i = 0;; i++)       /* loop exit is at "break" */
        {
            Form_pg_attribute att = TupleDescAttr(tupleDesc, i);

            if (IndexTupleHasNulls(tup) && att_isnull(i, bp))
            {
                usecache = false;
                continue;       /* this cannot be the target att */
            }

            /* If we know the next offset, we can skip the rest */
            if (usecache && att->attcacheoff >= 0)
                off = att->attcacheoff;
            else if (att->attlen == -1)
            {
                /*
                 * We can only cache the offset for a varlena attribute if
                 * the offset is already suitably aligned, so that there
                 * would be no pad bytes in any case: then the offset will be
                 * valid for either an aligned or unaligned value.
                 */
                if (usecache &&
                    off == att_align_nominal(off, att->attalign))
                    att->attcacheoff = off;
                else
                {
                    off = att_align_pointer(off, att->attalign, -1,
                                            tp + off);
                    usecache = false;
                }
            }
            else
            {
                /* not varlena, so safe to use att_align_nominal */
                off = att_align_nominal(off, att->attalign);

                if (usecache)
                    att->attcacheoff = off;
            }

            if (i == attnum)
                break;

            off = att_addlength_pointer(off, att->attlen, tp + off);

            if (usecache && att->attlen <= 0)
                usecache = false;
        }
    }

    return fetchatt(TupleDescAttr(tupleDesc, attnum), tp + off);
}
/*
 * Convert an index tuple into Datum/isnull arrays.
 *
 * The caller must allocate sufficient storage for the output arrays.
 * (INDEX_MAX_KEYS entries should be enough.)
 *
 * This is nearly the same as heap_deform_tuple(), but for IndexTuples.
 * One difference is that the tuple should never have any missing columns.
 */
void
index_deform_tuple(IndexTuple tup, TupleDesc tupleDescriptor,
                   Datum *values, bool *isnull)
{
    int         hasnulls = IndexTupleHasNulls(tup);
    int         natts = tupleDescriptor->natts; /* number of atts to extract */
    int         attnum;
    char       *tp;             /* ptr to tuple data */
    int         off;            /* offset in tuple data */
    bits8      *bp;             /* ptr to null bitmap in tuple */
    bool        slow = false;   /* can we use/set attcacheoff? */

    /* Assert to protect callers who allocate fixed-size arrays */
    Assert(natts <= INDEX_MAX_KEYS);

    /* XXX "knows" t_bits are just after fixed tuple header! */
    bp = (bits8 *) ((char *) tup + sizeof(IndexTupleData));

    tp = (char *) tup + IndexInfoFindDataOffset(tup->t_info);
    off = 0;

    for (attnum = 0; attnum < natts; attnum++)
    {
        Form_pg_attribute thisatt = TupleDescAttr(tupleDescriptor, attnum);

        if (hasnulls && att_isnull(attnum, bp))
        {
            values[attnum] = (Datum) 0;
            isnull[attnum] = true;
            slow = true;        /* can't use attcacheoff anymore */
            continue;
        }

        isnull[attnum] = false;

        if (!slow && thisatt->attcacheoff >= 0)
            off = thisatt->attcacheoff;
        else if (thisatt->attlen == -1)
        {
            /*
             * We can only cache the offset for a varlena attribute if the
             * offset is already suitably aligned, so that there would be no
             * pad bytes in any case: then the offset will be valid for
             * either an aligned or unaligned value.
             */
            if (!slow &&
                off == att_align_nominal(off, thisatt->attalign))
                thisatt->attcacheoff = off;
            else
            {
                off = att_align_pointer(off, thisatt->attalign, -1,
                                        tp + off);
                slow = true;
            }
        }
        else
        {
            /* not varlena, so safe to use att_align_nominal */
            off = att_align_nominal(off, thisatt->attalign);

            if (!slow)
                thisatt->attcacheoff = off;
        }

        values[attnum] = fetchatt(thisatt, tp + off);

        off = att_addlength_pointer(off, thisatt->attlen, tp + off);

        if (thisatt->attlen <= 0)
            slow = true;        /* can't use attcacheoff anymore */
    }
}
/*
 * Create a palloc'd copy of an index tuple.
 */
IndexTuple
CopyIndexTuple(IndexTuple source)
{
    IndexTuple  result;
    Size        size;

    size = IndexTupleSize(source);
    result = (IndexTuple) palloc(size);
    memcpy(result, source, size);
    return result;
}
/*
 * Create a palloc'd copy of an index tuple, leaving only the first
 * leavenatts attributes remaining.
 *
 * Truncation is guaranteed to result in an index tuple that is no
 * larger than the original.  It is safe to use the IndexTuple with
 * the original tuple descriptor, but caller must avoid actually
 * accessing truncated attributes from returned tuple!  In practice
 * this means that index_getattr() must be called with special care,
 * and that the truncated tuple should only ever be accessed by code
 * under caller's direct control.
 *
 * It's safe to call this function with a buffer lock held, since it
 * never performs external table access.  If it ever became possible
 * for index tuples to contain EXTERNAL TOAST values, then this would
 * have to be revisited.
 */
IndexTuple
index_truncate_tuple(TupleDesc sourceDescriptor, IndexTuple source,
                     int leavenatts)
{
    TupleDesc   truncdesc;
    Datum       values[INDEX_MAX_KEYS];
    bool        isnull[INDEX_MAX_KEYS];
    IndexTuple  truncated;

    Assert(leavenatts <= sourceDescriptor->natts);

    /* Easy case: no truncation actually required */
    if (leavenatts == sourceDescriptor->natts)
        return CopyIndexTuple(source);

    /* Create temporary descriptor to scribble on */
    truncdesc = palloc(TupleDescSize(sourceDescriptor));
    TupleDescCopy(truncdesc, sourceDescriptor);
    truncdesc->natts = leavenatts;

    /* Deform, form copy of tuple with fewer attributes */
    index_deform_tuple(source, truncdesc, values, isnull);
    truncated = index_form_tuple(truncdesc, values, isnull);
    truncated->t_tid = source->t_tid;
    Assert(IndexTupleSize(truncated) <= IndexTupleSize(source));

    /*
     * Cannot leak memory here, TupleDescCopy() doesn't allocate any inner
     * structure, so, plain pfree() should clean all allocated memory
     */
    pfree(truncdesc);

    return truncated;
}
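
For context, a hypothetical backend-side usage sketch of the form/deform
pair defined above (not part of the original file; it assumes a
translation unit that includes access/itup.h, and a valid single-column
TupleDesc supplied by the caller, e.g. from RelationGetDescr()):

    /*
     * Hypothetical sketch: round-trip one key through index_form_tuple()
     * and index_deform_tuple().  "itupdesc" is assumed to be a valid
     * one-column TupleDesc provided by the caller.
     */
    static void
    demo_index_tuple_roundtrip(TupleDesc itupdesc, Datum key)
    {
        Datum       values[INDEX_MAX_KEYS];
        bool        isnull[INDEX_MAX_KEYS];
        IndexTuple  itup;

        values[0] = key;
        isnull[0] = false;

        /* palloc's the tuple; the caller is responsible for freeing it */
        itup = index_form_tuple(itupdesc, values, isnull);

        /* extract the attributes back into the arrays */
        index_deform_tuple(itup, itupdesc, values, isnull);
        Assert(!isnull[0]);

        pfree(itup);
    }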