mirror of
https://github.com/postgres/postgres.git
synced 2025-10-25 13:17:41 +03:00
Compress GIN posting lists, for smaller index size.
GIN posting lists are now encoded using varbyte-encoding, which allows them to fit in a much smaller space than the straight ItemPointer array format used before. The new encoding is used for both the lists stored in-line in entry tree items, and in posting tree leaf pages. To maintain backwards-compatibility and keep pg_upgrade working, the code can still read old-style pages and tuples. Posting tree leaf pages in the new format are flagged with the GIN_COMPRESSED flag, to distinguish old and new format pages. Likewise, entry tree tuples in the new format have a GIN_ITUP_COMPRESSED flag set in a bit that was previously unused. This patch bumps GIN_CURRENT_VERSION from 1 to 2. New indexes created with version 9.4 will therefore have version number 2 in the metapage, while old pg_upgraded indexes will have version 1. The code treats them the same, but it might come in handy in the future, if we want to drop support for the uncompressed format. Alexander Korotkov and me. Reviewed by Tomas Vondra and Amit Langote.
This commit is contained in:
@@ -32,11 +32,8 @@
|
||||
typedef struct GinPageOpaqueData
|
||||
{
|
||||
BlockNumber rightlink; /* next page if any */
|
||||
OffsetNumber maxoff; /* number entries on GIN_DATA page: number of
|
||||
* heap ItemPointers on GIN_DATA|GIN_LEAF page
|
||||
* or number of PostingItems on GIN_DATA &
|
||||
* ~GIN_LEAF page. On GIN_LIST page, number of
|
||||
* heap tuples. */
|
||||
OffsetNumber maxoff; /* number of PostingItems on GIN_DATA & ~GIN_LEAF page.
|
||||
* On GIN_LIST page, number of heap tuples. */
|
||||
uint16 flags; /* see bit definitions below */
|
||||
} GinPageOpaqueData;
|
||||
|
||||
@@ -49,6 +46,7 @@ typedef GinPageOpaqueData *GinPageOpaque;
|
||||
#define GIN_LIST (1 << 4)
|
||||
#define GIN_LIST_FULLROW (1 << 5) /* makes sense only on GIN_LIST page */
|
||||
#define GIN_INCOMPLETE_SPLIT (1 << 6) /* page was split, but parent not updated */
|
||||
#define GIN_COMPRESSED (1 << 7)
|
||||
|
||||
/* Page numbers of fixed-location pages */
|
||||
#define GIN_METAPAGE_BLKNO (0)
|
||||
@@ -88,7 +86,12 @@ typedef struct GinMetaPageData
|
||||
* GIN version number (ideally this should have been at the front, but too
|
||||
* late now. Don't move it!)
|
||||
*
|
||||
* Currently 1 (for indexes initialized in 9.1 or later)
|
||||
* Currently 2 (for indexes initialized in 9.4 or later)
|
||||
*
|
||||
* Version 1 (indexes initialized in version 9.1, 9.2 or 9.3), is
|
||||
* compatible, but may contain uncompressed posting tree (leaf) pages and
|
||||
* posting lists. They will be converted to compressed format when
|
||||
* modified.
|
||||
*
|
||||
* Version 0 (indexes initialized in 9.0 or before) is compatible but may
|
||||
* be missing null entries, including both null keys and placeholders.
|
||||
@@ -97,7 +100,7 @@ typedef struct GinMetaPageData
|
||||
int32 ginVersion;
|
||||
} GinMetaPageData;
|
||||
|
||||
#define GIN_CURRENT_VERSION 1
|
||||
#define GIN_CURRENT_VERSION 2
|
||||
|
||||
#define GinPageGetMeta(p) \
|
||||
((GinMetaPageData *) PageGetContents(p))
|
||||
@@ -116,6 +119,8 @@ typedef struct GinMetaPageData
|
||||
#define GinPageSetList(page) ( GinPageGetOpaque(page)->flags |= GIN_LIST )
|
||||
#define GinPageHasFullRow(page) ( GinPageGetOpaque(page)->flags & GIN_LIST_FULLROW )
|
||||
#define GinPageSetFullRow(page) ( GinPageGetOpaque(page)->flags |= GIN_LIST_FULLROW )
|
||||
#define GinPageIsCompressed(page) ( GinPageGetOpaque(page)->flags & GIN_COMPRESSED )
|
||||
#define GinPageSetCompressed(page) ( GinPageGetOpaque(page)->flags |= GIN_COMPRESSED )
|
||||
|
||||
#define GinPageIsDeleted(page) ( GinPageGetOpaque(page)->flags & GIN_DELETED)
|
||||
#define GinPageSetDeleted(page) ( GinPageGetOpaque(page)->flags |= GIN_DELETED)
|
||||
@@ -213,13 +218,16 @@ typedef signed char GinNullCategory;
|
||||
#define GinSetPostingTree(itup, blkno) ( GinSetNPosting((itup),GIN_TREE_POSTING), ItemPointerSetBlockNumber(&(itup)->t_tid, blkno) )
|
||||
#define GinGetPostingTree(itup) GinItemPointerGetBlockNumber(&(itup)->t_tid)
|
||||
|
||||
#define GinGetPostingOffset(itup) GinItemPointerGetBlockNumber(&(itup)->t_tid)
|
||||
#define GinSetPostingOffset(itup,n) ItemPointerSetBlockNumber(&(itup)->t_tid,n)
|
||||
#define GinGetPosting(itup) ((ItemPointer) ((char*)(itup) + GinGetPostingOffset(itup)))
|
||||
/*
 * Flag bit marking an entry tree tuple whose posting list is stored in the
 * compressed format.  It is OR'ed into / masked out of the block-number part
 * of t_tid by the GinSetPostingOffset / GinGetPostingOffset macros.
 *
 * The constant must be unsigned: shifting a signed 1 into bit 31 overflows
 * int, which is undefined behaviour and would yield a negative mask.
 */
#define GIN_ITUP_COMPRESSED (1U << 31)
|
||||
#define GinGetPostingOffset(itup) (GinItemPointerGetBlockNumber(&(itup)->t_tid) & (~GIN_ITUP_COMPRESSED))
|
||||
#define GinSetPostingOffset(itup,n) ItemPointerSetBlockNumber(&(itup)->t_tid,(n)|GIN_ITUP_COMPRESSED)
|
||||
#define GinGetPosting(itup) ((Pointer) ((char*)(itup) + GinGetPostingOffset(itup)))
|
||||
#define GinItupIsCompressed(itup) (GinItemPointerGetBlockNumber(&(itup)->t_tid) & GIN_ITUP_COMPRESSED)
|
||||
|
||||
#define GinMaxItemSize \
|
||||
MAXALIGN_DOWN(((BLCKSZ - SizeOfPageHeaderData - \
|
||||
MAXALIGN(sizeof(GinPageOpaqueData))) / 3 - sizeof(ItemIdData)))
|
||||
Min(INDEX_SIZE_MASK, \
|
||||
MAXALIGN_DOWN(((BLCKSZ - SizeOfPageHeaderData - \
|
||||
MAXALIGN(sizeof(GinPageOpaqueData))) / 6 - sizeof(ItemIdData))))
|
||||
|
||||
/*
|
||||
* Access macros for non-leaf entry tuples
|
||||
@@ -230,30 +238,59 @@ typedef signed char GinNullCategory;
|
||||
|
||||
/*
|
||||
* Data (posting tree) pages
|
||||
*
|
||||
* Posting tree pages don't store regular tuples. Non-leaf pages contain
|
||||
* PostingItems, which are pairs of ItemPointers and child block numbers.
|
||||
* Leaf pages contain GinPostingLists and an uncompressed array of item
|
||||
* pointers.
|
||||
*
|
||||
* In a leaf page, the compressed posting lists are stored after the regular
|
||||
* page header, one after each other. Although we don't store regular tuples,
|
||||
* pd_lower is used to indicate the end of the posting lists. After that, free
|
||||
* space follows. This layout is compatible with the "standard" heap and
|
||||
* index page layout described in bufpage.h, so that we can e.g. set buffer_std
|
||||
* when writing WAL records.
|
||||
*
|
||||
* In the special space is the GinPageOpaque struct.
|
||||
*/
|
||||
/*
 * Pointer to the first compressed posting list on a posting tree leaf page.
 * The lists start after the MAXALIGN'd ItemPointerData kept at the beginning
 * of the page contents (the one returned by GinDataPageGetRightBound).
 *
 * The whole expansion is parenthesized so that the cast applies to the
 * pointer even when the macro result is followed by -> or [].
 */
#define GinDataLeafPageGetPostingList(page) \
	((GinPostingList *) (PageGetContents(page) + MAXALIGN(sizeof(ItemPointerData))))
|
||||
/*
 * Total size in bytes of the compressed posting lists on a leaf page:
 * pd_lower (which marks the end of the posting lists) minus the page header
 * and the MAXALIGN'd ItemPointerData that precedes the lists.
 *
 * The page argument is parenthesized so that an expression argument is cast
 * as a whole.
 */
#define GinDataLeafPageGetPostingListSize(page) \
	(((PageHeader) (page))->pd_lower - MAXALIGN(SizeOfPageHeaderData) - MAXALIGN(sizeof(ItemPointerData)))
|
||||
/*
 * Record the total size of the compressed posting lists on a leaf page by
 * setting pd_lower to just past them.  Asserts that the size fits within
 * GinDataLeafMaxContentSize.
 *
 * Both arguments are parenthesized so that expression arguments are compared
 * and cast as a whole.  'size' is evaluated twice when assertions are
 * enabled, so it must not have side effects.
 *
 * NOTE(review): this expands to a bare brace block, not do { } while (0), so
 * it cannot be used as the unbraced body of an if that has an else.
 */
#define GinDataLeafPageSetPostingListSize(page, size) \
	{ \
		Assert((size) <= GinDataLeafMaxContentSize); \
		((PageHeader) (page))->pd_lower = (size) + MAXALIGN(SizeOfPageHeaderData) + MAXALIGN(sizeof(ItemPointerData)); \
	}
|
||||
|
||||
#define GinDataLeafPageIsEmpty(page) \
|
||||
(GinPageIsCompressed(page) ? (GinDataLeafPageGetPostingListSize(page) == 0) : (GinPageGetOpaque(page)->maxoff < FirstOffsetNumber))
|
||||
|
||||
#define GinDataLeafPageGetFreeSpace(page) PageGetExactFreeSpace(page)
|
||||
|
||||
#define GinDataPageGetRightBound(page) ((ItemPointer) PageGetContents(page))
|
||||
/*
|
||||
* Pointer to the data portion of a posting tree page. For internal pages,
|
||||
* that's the beginning of the array of PostingItems. For compressed leaf
|
||||
* pages, the first compressed posting list. For uncompressed (pre-9.4) leaf
|
||||
* pages, it's the beginning of the ItemPointer array.
|
||||
*/
|
||||
#define GinDataPageGetData(page) \
|
||||
(PageGetContents(page) + MAXALIGN(sizeof(ItemPointerData)))
|
||||
/* non-leaf pages contain PostingItems */
|
||||
#define GinDataPageGetPostingItem(page, i) \
|
||||
((PostingItem *) (GinDataPageGetData(page) + ((i)-1) * sizeof(PostingItem)))
|
||||
/* leaf pages contain ItemPointers */
|
||||
#define GinDataPageGetItemPointer(page, i) \
|
||||
((ItemPointer) (GinDataPageGetData(page) + ((i)-1) * sizeof(ItemPointerData)))
|
||||
#define GinSizeOfDataPageItem(page) \
|
||||
(GinPageIsLeaf(page) ? sizeof(ItemPointerData) : sizeof(PostingItem))
|
||||
|
||||
#define GinDataPageGetFreeSpace(page) \
|
||||
#define GinNonLeafDataPageGetFreeSpace(page) \
|
||||
(BLCKSZ - MAXALIGN(SizeOfPageHeaderData) \
|
||||
- MAXALIGN(sizeof(ItemPointerData)) \
|
||||
- GinPageGetOpaque(page)->maxoff * GinSizeOfDataPageItem(page) \
|
||||
- GinPageGetOpaque(page)->maxoff * sizeof(PostingItem) \
|
||||
- MAXALIGN(sizeof(GinPageOpaqueData)))
|
||||
|
||||
#define GinMaxLeafDataItems \
|
||||
((BLCKSZ - MAXALIGN(SizeOfPageHeaderData) - \
|
||||
MAXALIGN(sizeof(ItemPointerData)) - \
|
||||
MAXALIGN(sizeof(GinPageOpaqueData))) \
|
||||
/ sizeof(ItemPointerData))
|
||||
#define GinDataLeafMaxContentSize \
|
||||
(BLCKSZ - MAXALIGN(SizeOfPageHeaderData) \
|
||||
- MAXALIGN(sizeof(ItemPointerData)) \
|
||||
- MAXALIGN(sizeof(GinPageOpaqueData)))
|
||||
|
||||
/*
|
||||
* List pages
|
||||
@@ -318,6 +355,23 @@ typedef struct GinState
|
||||
Oid supportCollation[INDEX_MAX_KEYS];
|
||||
} GinState;
|
||||
|
||||
|
||||
/*
|
||||
* A compressed posting list.
|
||||
*
|
||||
* Note: This requires 2-byte alignment.
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
ItemPointerData first; /* first item in this posting list (unpacked) */
|
||||
uint16 nbytes; /* number of bytes that follow */
|
||||
unsigned char bytes[1]; /* varbyte encoded items (variable length) */
|
||||
} GinPostingList;
|
||||
|
||||
#define SizeOfGinPostingList(plist) (offsetof(GinPostingList, bytes) + SHORTALIGN((plist)->nbytes) )
|
||||
#define GinNextPostingListSegment(cur) ((GinPostingList *) (((char *) (cur)) + SizeOfGinPostingList((cur))))
|
||||
|
||||
|
||||
/* XLog stuff */
|
||||
|
||||
#define XLOG_GIN_CREATE_INDEX 0x00
|
||||
@@ -328,18 +382,21 @@ typedef struct ginxlogCreatePostingTree
|
||||
{
|
||||
RelFileNode node;
|
||||
BlockNumber blkno;
|
||||
uint32 nitem;
|
||||
/* follows list of heap's ItemPointer */
|
||||
uint32 size;
|
||||
/* A compressed posting list follows */
|
||||
} ginxlogCreatePostingTree;
|
||||
|
||||
#define XLOG_GIN_INSERT 0x20
|
||||
|
||||
typedef struct ginxlogInsert
|
||||
/*
|
||||
* The format of the insertion record varies depending on the page type.
|
||||
* ginxlogInsert is the common part between all variants.
|
||||
*/
|
||||
typedef struct
|
||||
{
|
||||
RelFileNode node;
|
||||
BlockNumber blkno;
|
||||
uint16 flags; /* GIN_SPLIT_ISLEAF and/or GIN_SPLIT_ISDATA */
|
||||
OffsetNumber offset;
|
||||
|
||||
/*
|
||||
* FOLLOWS:
|
||||
@@ -358,17 +415,25 @@ typedef struct ginxlogInsert
|
||||
|
||||
typedef struct
|
||||
{
|
||||
OffsetNumber offset;
|
||||
bool isDelete;
|
||||
IndexTupleData tuple; /* variable length */
|
||||
} ginxlogInsertEntry;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
OffsetNumber nitem;
|
||||
ItemPointerData items[1]; /* variable length */
|
||||
} ginxlogInsertDataLeaf;
|
||||
uint16 length;
|
||||
uint16 unmodifiedsize;
|
||||
|
||||
/* In an insert to an internal data page, the payload is a PostingItem */
|
||||
/* compressed segments, variable length */
|
||||
char newdata[1];
|
||||
} ginxlogRecompressDataLeaf;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
OffsetNumber offset;
|
||||
PostingItem newitem;
|
||||
} ginxlogInsertDataInternal;
|
||||
|
||||
|
||||
#define XLOG_GIN_SPLIT 0x30
|
||||
@@ -401,25 +466,58 @@ typedef struct
|
||||
/* FOLLOWS: IndexTuples */
|
||||
} ginxlogSplitEntry;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
uint16 lsize;
|
||||
uint16 rsize;
|
||||
ItemPointerData lrightbound; /* new right bound of left page */
|
||||
ItemPointerData rrightbound; /* new right bound of right page */
|
||||
|
||||
/* FOLLOWS: new compressed posting lists of left and right page */
|
||||
char newdata[1];
|
||||
} ginxlogSplitDataLeaf;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
OffsetNumber separator;
|
||||
OffsetNumber nitem;
|
||||
ItemPointerData rightbound;
|
||||
|
||||
/* FOLLOWS: array of ItemPointers (for leaf) or PostingItems (non-leaf) */
|
||||
} ginxlogSplitData;
|
||||
/* FOLLOWS: array of PostingItems */
|
||||
} ginxlogSplitDataInternal;
|
||||
|
||||
/*
|
||||
* Vacuum simply WAL-logs the whole page, when anything is modified. This
|
||||
* is functionally identical to heap_newpage records, but is kept separate for
|
||||
* debugging purposes. (When inspecting the WAL stream, it's easier to see
|
||||
* what's going on when GIN vacuum records are marked as such, not as heap
|
||||
* records.) This is currently only used for entry tree leaf pages.
|
||||
*/
|
||||
#define XLOG_GIN_VACUUM_PAGE 0x40
|
||||
|
||||
typedef struct ginxlogVacuumPage
|
||||
{
|
||||
RelFileNode node;
|
||||
BlockNumber blkno;
|
||||
OffsetNumber nitem;
|
||||
/* follows content of page */
|
||||
uint16 hole_offset; /* number of bytes before "hole" */
|
||||
uint16 hole_length; /* number of bytes in "hole" */
|
||||
/* entire page contents (minus the hole) follow at end of record */
|
||||
} ginxlogVacuumPage;
|
||||
|
||||
/*
|
||||
* Vacuuming posting tree leaf page is WAL-logged like recompression caused
|
||||
* by insertion.
|
||||
*/
|
||||
#define XLOG_GIN_VACUUM_DATA_LEAF_PAGE 0x90
|
||||
|
||||
typedef struct ginxlogVacuumDataLeafPage
|
||||
{
|
||||
RelFileNode node;
|
||||
BlockNumber blkno;
|
||||
|
||||
ginxlogRecompressDataLeaf data;
|
||||
} ginxlogVacuumDataLeafPage;
|
||||
|
||||
#define XLOG_GIN_DELETE_PAGE 0x50
|
||||
|
||||
typedef struct ginxlogDeletePage
|
||||
@@ -506,6 +604,7 @@ typedef struct GinBtreeStack
|
||||
BlockNumber blkno;
|
||||
Buffer buffer;
|
||||
OffsetNumber off;
|
||||
ItemPointerData iptr;
|
||||
/* predictNumber contains predicted number of pages on current level */
|
||||
uint32 predictNumber;
|
||||
struct GinBtreeStack *parent;
|
||||
@@ -513,6 +612,14 @@ typedef struct GinBtreeStack
|
||||
|
||||
typedef struct GinBtreeData *GinBtree;
|
||||
|
||||
/* Return codes for GinBtreeData.placeToPage method */
|
||||
typedef enum
|
||||
{
|
||||
UNMODIFIED,
|
||||
INSERTED,
|
||||
SPLIT
|
||||
} GinPlaceToPageRC;
|
||||
|
||||
typedef struct GinBtreeData
|
||||
{
|
||||
/* search methods */
|
||||
@@ -523,8 +630,7 @@ typedef struct GinBtreeData
|
||||
|
||||
/* insert methods */
|
||||
OffsetNumber (*findChildPtr) (GinBtree, Page, BlockNumber, OffsetNumber);
|
||||
bool (*placeToPage) (GinBtree, Buffer, OffsetNumber, void *, BlockNumber, XLogRecData **);
|
||||
Page (*splitPage) (GinBtree, Buffer, Buffer, OffsetNumber, void *, BlockNumber, XLogRecData **);
|
||||
GinPlaceToPageRC (*placeToPage) (GinBtree, Buffer, GinBtreeStack *, void *, BlockNumber, XLogRecData **, Page *, Page *);
|
||||
void *(*prepareDownlink) (GinBtree, Buffer);
|
||||
void (*fillRoot) (GinBtree, Page, BlockNumber, Page, BlockNumber, Page);
|
||||
|
||||
@@ -577,14 +683,17 @@ extern void ginInsertValue(GinBtree btree, GinBtreeStack *stack,
|
||||
/* ginentrypage.c */
|
||||
extern IndexTuple GinFormTuple(GinState *ginstate,
|
||||
OffsetNumber attnum, Datum key, GinNullCategory category,
|
||||
ItemPointerData *ipd, uint32 nipd, bool errorTooBig);
|
||||
extern void GinShortenTuple(IndexTuple itup, uint32 nipd);
|
||||
Pointer data, Size dataSize, int nipd, bool errorTooBig);
|
||||
extern void ginPrepareEntryScan(GinBtree btree, OffsetNumber attnum,
|
||||
Datum key, GinNullCategory category,
|
||||
GinState *ginstate);
|
||||
extern void ginEntryFillRoot(GinBtree btree, Page root, BlockNumber lblkno, Page lpage, BlockNumber rblkno, Page rpage);
|
||||
extern ItemPointer ginReadTuple(GinState *ginstate, OffsetNumber attnum,
|
||||
IndexTuple itup, int *nitems);
|
||||
|
||||
/* gindatapage.c */
|
||||
extern ItemPointer GinDataLeafPageGetItems(Page page, int *nitems);
|
||||
extern int GinDataLeafPageGetItemsToTbm(Page page, TIDBitmap *tbm);
|
||||
extern BlockNumber createPostingTree(Relation index,
|
||||
ItemPointerData *items, uint32 nitems,
|
||||
GinStatsData *buildStats);
|
||||
@@ -598,6 +707,15 @@ extern GinBtreeStack *ginScanBeginPostingTree(Relation index, BlockNumber rootBl
|
||||
extern void ginDataFillRoot(GinBtree btree, Page root, BlockNumber lblkno, Page lpage, BlockNumber rblkno, Page rpage);
|
||||
extern void ginPrepareDataScan(GinBtree btree, Relation index, BlockNumber rootBlkno);
|
||||
|
||||
/*
|
||||
* This is declared in ginvacuum.c, but is passed between ginVacuumItemPointers
|
||||
* and ginVacuumPostingTreeLeaf as an opaque struct, so we need a forward
|
||||
* declaration for it.
|
||||
*/
|
||||
typedef struct GinVacuumState GinVacuumState;
|
||||
|
||||
extern void ginVacuumPostingTreeLeaf(Relation rel, Buffer buf, GinVacuumState *gvs);
|
||||
|
||||
/* ginscan.c */
|
||||
|
||||
/*
|
||||
@@ -679,7 +797,7 @@ typedef struct GinScanEntryData
|
||||
|
||||
/* used for Posting list and one page in Posting tree */
|
||||
ItemPointerData *list;
|
||||
uint32 nlist;
|
||||
int nlist;
|
||||
OffsetNumber offset;
|
||||
|
||||
bool isFinished;
|
||||
@@ -717,6 +835,8 @@ extern Datum gingetbitmap(PG_FUNCTION_ARGS);
|
||||
/* ginvacuum.c */
|
||||
extern Datum ginbulkdelete(PG_FUNCTION_ARGS);
|
||||
extern Datum ginvacuumcleanup(PG_FUNCTION_ARGS);
|
||||
extern ItemPointer ginVacuumItemPointers(GinVacuumState *gvs,
|
||||
ItemPointerData *items, int nitem, int *nremaining);
|
||||
|
||||
/* ginbulk.c */
|
||||
typedef struct GinEntryAccumulator
|
||||
@@ -770,11 +890,17 @@ extern void ginInsertCleanup(GinState *ginstate,
|
||||
bool vac_delay, IndexBulkDeleteResult *stats);
|
||||
|
||||
/* ginpostinglist.c */
|
||||
extern uint32 ginMergeItemPointers(ItemPointerData *dst,
|
||||
|
||||
extern GinPostingList *ginCompressPostingList(const ItemPointer ptrs, int nptrs,
|
||||
int maxsize, int *nwritten);
|
||||
extern int ginPostingListDecodeAllSegmentsToTbm(GinPostingList *ptr, int totalsize, TIDBitmap *tbm);
|
||||
|
||||
extern ItemPointer ginPostingListDecodeAllSegments(GinPostingList *ptr, int len, int *ndecoded);
|
||||
extern ItemPointer ginPostingListDecode(GinPostingList *ptr, int *ndecoded);
|
||||
extern int ginMergeItemPointers(ItemPointerData *dst,
|
||||
ItemPointerData *a, uint32 na,
|
||||
ItemPointerData *b, uint32 nb);
|
||||
|
||||
|
||||
/*
|
||||
* Merging the results of several gin scans compares item pointers a lot,
|
||||
* so we want this to be inlined. But if the compiler doesn't support that,
|
||||
|
||||
Reference in New Issue
Block a user