1
0
mirror of https://github.com/postgres/postgres.git synced 2025-10-25 13:17:41 +03:00

BRIN: Block Range Indexes

BRIN is a new index access method intended to accelerate scans of very
large tables, without the maintenance overhead of btrees or other
traditional indexes.  BRIN indexes work by maintaining "summary" data about
block ranges.  Bitmap index scans work by reading each summary tuple and
comparing it with the query quals; all pages in the range are returned
in a lossy TID bitmap if the quals are consistent with the values in the
summary tuple, otherwise not.  Normal index scans are not supported
because these indexes do not store TIDs.

As new tuples are added into the index, the summary information is
updated (if the block range in which the tuple is added is already
summarized) or not; in the latter case, a subsequent pass of VACUUM or
the brin_summarize_new_values() function will create the summary
information.

For data types with natural 1-D sort orders, the summary info consists
of the maximum and the minimum values of each indexed column within each
page range.  This type of operator class we call "Minmax", and we
supply a bunch of them for most data types with B-tree opclasses.
Since the BRIN code is generalized, other approaches are possible for
things such as arrays, geometric types, ranges, etc; even for things
such as enum types we could do something different than minmax with
better results.  In this commit I only include minmax.

Catalog version bumped due to new builtin catalog entries.

There's more that could be done here, but this is a good step forwards.

Loosely based on ideas from Simon Riggs; code mostly by Álvaro Herrera,
with contribution by Heikki Linnakangas.

Patch reviewed by: Amit Kapila, Heikki Linnakangas, Robert Haas.
Testing help from Jeff Janes, Erik Rijkers, Emanuel Calvo.

PS:
  The research leading to these results has received funding from the
  European Union's Seventh Framework Programme (FP7/2007-2013) under
  grant agreement n° 318633.
This commit is contained in:
Alvaro Herrera
2014-11-07 16:38:14 -03:00
parent 1961b1c131
commit 7516f52594
57 changed files with 6807 additions and 24 deletions

52
src/include/access/brin.h Normal file
View File

@@ -0,0 +1,52 @@
/*
* AM-callable functions for BRIN indexes
*
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* src/include/access/brin.h
*/
#ifndef BRIN_H
#define BRIN_H
#include "fmgr.h"
#include "nodes/execnodes.h"
#include "utils/relcache.h"
/*
 * prototypes for functions in brin.c (external entry points for BRIN)
 *
 * Every entry point is declared as "Datum f(PG_FUNCTION_ARGS)", i.e. with
 * the function-manager signature from fmgr.h.  The real arguments are
 * unpacked inside brin.c and cannot be seen from this header.
 *
 * NOTE(review): the grouping comments below are inferred from the function
 * names only -- confirm against brin.c.
 */
/* index build and insertion */
extern Datum brinbuild(PG_FUNCTION_ARGS);
extern Datum brinbuildempty(PG_FUNCTION_ARGS);
extern Datum brininsert(PG_FUNCTION_ARGS);
/* scan support */
extern Datum brinbeginscan(PG_FUNCTION_ARGS);
extern Datum bringettuple(PG_FUNCTION_ARGS);
extern Datum bringetbitmap(PG_FUNCTION_ARGS);
extern Datum brinrescan(PG_FUNCTION_ARGS);
extern Datum brinendscan(PG_FUNCTION_ARGS);
extern Datum brinmarkpos(PG_FUNCTION_ARGS);
extern Datum brinrestrpos(PG_FUNCTION_ARGS);
/* vacuum support */
extern Datum brinbulkdelete(PG_FUNCTION_ARGS);
extern Datum brinvacuumcleanup(PG_FUNCTION_ARGS);
/* planner support and reloptions parsing */
extern Datum brincanreturn(PG_FUNCTION_ARGS);
extern Datum brincostestimate(PG_FUNCTION_ARGS);
extern Datum brinoptions(PG_FUNCTION_ARGS);
/*
 * Storage type for BRIN's reloptions
 *
 * BrinGetPagesPerRange() reads this struct through a relation's rd_options
 * pointer, so the field layout must match what is stored there.
 */
typedef struct BrinOptions
{
int32 vl_len_; /* varlena header (do not touch directly!) */
/*
 * Pages per block range for this index.  When a relation has no options
 * set, BrinGetPagesPerRange() yields BRIN_DEFAULT_PAGES_PER_RANGE instead.
 */
BlockNumber pagesPerRange;
} BrinOptions;
/* Pages per range used when a relation carries no reloptions of its own. */
#define BRIN_DEFAULT_PAGES_PER_RANGE	128

/*
 * Yield the pages-per-range setting of a relation: the default when its
 * rd_options pointer is unset, otherwise the pagesPerRange value found in
 * the BrinOptions that rd_options points to.
 *
 * The argument is evaluated more than once, so it must not have side
 * effects.
 */
#define BrinGetPagesPerRange(relation) \
	(!(relation)->rd_options ? \
	 BRIN_DEFAULT_PAGES_PER_RANGE : \
	 ((BrinOptions *) (relation)->rd_options)->pagesPerRange)
#endif /* BRIN_H */

View File

@@ -0,0 +1,88 @@
/*
* brin_internal.h
* internal declarations for BRIN indexes
*
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* src/include/access/brin_internal.h
*/
#ifndef BRIN_INTERNAL_H
#define BRIN_INTERNAL_H
#include "fmgr.h"
#include "storage/buf.h"
#include "storage/bufpage.h"
#include "storage/off.h"
#include "utils/relcache.h"
/*
* A BrinDesc is a struct designed to enable decoding a BRIN tuple from the
* on-disk format to an in-memory tuple and vice-versa.
*/
/*
 * struct returned by "OpcInfo" amproc
 *
 * Variable-length: the fixed fields are followed by an array of type OIDs.
 * Use SizeofBrinOpcInfo() to compute the allocation size for a given number
 * of stored columns.
 */
typedef struct BrinOpcInfo
{
/* Number of columns stored in an index column of this opclass */
uint16 oi_nstored;
/* Opaque pointer for the opclass' private use */
void *oi_opaque;
/*
 * Type IDs of the stored columns
 * (presumably oi_nstored entries long -- confirm against the opclasses)
 */
Oid oi_typids[FLEXIBLE_ARRAY_MEMBER];
} BrinOpcInfo;
/*
 * The size of a BrinOpcInfo for the given number of columns: the fixed part
 * up to oi_typids, plus one Oid per column.
 *
 * The argument is parenthesized so that an expression such as "n + 1" is
 * multiplied by sizeof(Oid) as a whole rather than only its first operand.
 */
#define SizeofBrinOpcInfo(ncols) \
	(offsetof(BrinOpcInfo, oi_typids) + sizeof(Oid) * (ncols))
/*
 * Descriptor used to convert BRIN tuples between their on-disk and in-memory
 * forms.  Built by brin_build_desc() and released by brin_free_desc().
 *
 * Variable-length: bd_info has one pointer per index column.
 */
typedef struct BrinDesc
{
/* Containing memory context */
MemoryContext bd_context;
/* the index relation itself */
Relation bd_index;
/* tuple descriptor of the index relation */
TupleDesc bd_tupdesc;
/* cached copy for on-disk tuples; generated at first use */
TupleDesc bd_disktdesc;
/* total number of Datum entries that are stored on-disk for all columns */
int bd_totalstored;
/* per-column info; bd_tupdesc->natts entries long */
BrinOpcInfo *bd_info[FLEXIBLE_ARRAY_MEMBER];
} BrinDesc;
/*
 * Globally-known function support numbers for BRIN indexes.  Individual
 * opclasses define their own function support numbers, which must not collide
 * with the definitions here.
 *
 * BRIN_PROCNUM_OPCINFO is the support function that returns a BrinOpcInfo.
 * NOTE(review): the roles of the other three are implied by their names
 * only (add a value to a summary, test a summary against a qual, merge two
 * summaries) -- confirm against brin.c.
 */
#define BRIN_PROCNUM_OPCINFO 1
#define BRIN_PROCNUM_ADDVALUE 2
#define BRIN_PROCNUM_CONSISTENT 3
#define BRIN_PROCNUM_UNION 4
/* procedure numbers up to 10 are reserved for BRIN future expansion */
/*
 * Debug logging for BRIN.
 *
 * While BRIN_DEBUG is defined and we are compiling with GCC, BRIN_elog()
 * forwards its arguments to elog().  In every other configuration it must
 * accept the same (level, fmt, ...) argument list and expand to a valid
 * no-op expression, so that call sites compile unchanged and can be used
 * wherever a statement expression is allowed.
 */
#define BRIN_DEBUG

/* we allow debug if using GCC; otherwise don't bother */
#if defined(BRIN_DEBUG) && defined(__GNUC__)
#define BRIN_elog(level, ...)		elog(level, __VA_ARGS__)
#else
#define BRIN_elog(...)				((void) 0)
#endif
/*
 * brin.c
 *
 * brin_build_desc() returns a BrinDesc for the given relation and
 * brin_free_desc() disposes of one.
 * NOTE(review): rel is presumably the index relation (it would become
 * bd_index) -- confirm in brin.c.
 */
extern BrinDesc *brin_build_desc(Relation rel);
extern void brin_free_desc(BrinDesc *bdesc);
#endif /* BRIN_INTERNAL_H */

View File

@@ -0,0 +1,70 @@
/*
* brin_page.h
* Prototypes and definitions for BRIN page layouts
*
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* src/include/access/brin_page.h
*
* NOTES
*
* These structs should really be private to specific BRIN files, but it's
* useful to have them here so that they can be used by pageinspect and similar
* tools.
*/
#ifndef BRIN_PAGE_H
#define BRIN_PAGE_H
#include "storage/block.h"
#include "storage/itemptr.h"
/*
 * Special space on all BRIN pages stores a "type" identifier; the
 * identifiers defined for it are listed here.
 */
#define BRIN_PAGETYPE_META			0xF091
#define BRIN_PAGETYPE_REVMAP		0xF092
#define BRIN_PAGETYPE_REGULAR		0xF093

/* Fetch the type identifier out of a page's BrinSpecialSpace. */
#define BRIN_PAGE_TYPE(page) \
	(((BrinSpecialSpace *) PageGetSpecialPointer(page))->type)

/* Page-kind predicates built on top of BRIN_PAGE_TYPE(). */
#define BRIN_IS_REVMAP_PAGE(page) \
	(BRIN_PAGETYPE_REVMAP == BRIN_PAGE_TYPE(page))
#define BRIN_IS_REGULAR_PAGE(page) \
	(BRIN_PAGETYPE_REGULAR == BRIN_PAGE_TYPE(page))
/* flags for BrinSpecialSpace */
#define BRIN_EVACUATE_PAGE (1 << 0)
/*
 * Contents of the special space of every BRIN page.  BRIN_PAGE_TYPE() reads
 * the "type" field through PageGetSpecialPointer().
 */
typedef struct BrinSpecialSpace
{
uint16 flags; /* bitmask of the flags above (BRIN_EVACUATE_PAGE) */
uint16 type; /* one of the BRIN_PAGETYPE_* identifiers */
} BrinSpecialSpace;
/*
 * Metapage definitions
 *
 * NOTE(review): the field meanings below are inferred from the field and
 * constant names; confirm against brin_metapage_init() in brin_pageops.c.
 */
typedef struct BrinMetaPageData
{
uint32 brinMagic; /* presumably BRIN_META_MAGIC */
uint32 brinVersion; /* presumably BRIN_CURRENT_VERSION */
BlockNumber pagesPerRange; /* pages per block range for this index */
BlockNumber lastRevmapPage; /* presumably block number of last revmap page */
} BrinMetaPageData;
#define BRIN_CURRENT_VERSION 1
#define BRIN_META_MAGIC 0xA8109CFA
/* block number of the metapage within the index */
#define BRIN_METAPAGE_BLKNO 0
/* Definitions for revmap pages */
typedef struct RevmapContents
{
/*
 * Declared with a single element, but the array is meant to hold
 * REVMAP_PAGE_MAXITEMS entries.  It cannot be declared with
 * FLEXIBLE_ARRAY_MEMBER, because C does not permit a flexible array member
 * in a struct that has no other named member.
 */
ItemPointerData rm_tids[1]; /* really REVMAP_PAGE_MAXITEMS */
} RevmapContents;
/*
 * Bytes available for the rm_tids array: the block size minus the aligned
 * page header, minus whatever precedes rm_tids in RevmapContents, minus the
 * aligned BrinSpecialSpace.
 */
#define REVMAP_CONTENT_SIZE \
(BLCKSZ - MAXALIGN(SizeOfPageHeaderData) - \
offsetof(RevmapContents, rm_tids) - \
MAXALIGN(sizeof(BrinSpecialSpace)))
/* max num of items in the array */
#define REVMAP_PAGE_MAXITEMS \
(REVMAP_CONTENT_SIZE / sizeof(ItemPointerData))
#endif /* BRIN_PAGE_H */

View File

@@ -0,0 +1,36 @@
/*
* brin_pageops.h
* Prototypes for operating on BRIN pages.
*
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* src/include/access/brin_pageops.h
*/
#ifndef BRIN_PAGEOPS_H
#define BRIN_PAGEOPS_H
#include "access/brin_revmap.h"
/*
 * Page-level operations on BRIN index pages; implementations are not visible
 * from this header.
 *
 * NOTE(review): the per-function notes below are read off the signatures and
 * names only -- confirm the exact contracts (locking, return values) in
 * brin_pageops.c.
 */

/*
 * Replace the tuple at (oldbuf, oldoff) -- expected to equal origtup/origsz --
 * with newtup/newsz for the range containing heapBlk.  "samepage" presumably
 * tells whether the caller expects the new tuple to fit on the same page.
 */
extern bool brin_doupdate(Relation idxrel, BlockNumber pagesPerRange,
BrinRevmap *revmap, BlockNumber heapBlk,
Buffer oldbuf, OffsetNumber oldoff,
const BrinTuple *origtup, Size origsz,
const BrinTuple *newtup, Size newsz,
bool samepage);
/* presumably: can a tuple of origsz in "buffer" be replaced in place by one of newsz? */
extern bool brin_can_do_samepage_update(Buffer buffer, Size origsz,
Size newsz);
/* insert tup (itemsz bytes) for the range containing heapBlk; returns an offset number */
extern OffsetNumber brin_doinsert(Relation idxrel, BlockNumber pagesPerRange,
BrinRevmap *revmap, Buffer *buffer, BlockNumber heapBlk,
BrinTuple *tup, Size itemsz);
/* page initialization; "type" is presumably one of the BRIN_PAGETYPE_* values */
extern void brin_page_init(Page page, uint16 type);
extern void brin_metapage_init(Page page, BlockNumber pagesPerRange,
uint16 version);
/* page evacuation (see the BRIN_EVACUATE_PAGE flag in brin_page.h) */
extern bool brin_start_evacuating_page(Relation idxRel, Buffer buf);
extern void brin_evacuate_page(Relation idxRel, BlockNumber pagesPerRange,
BrinRevmap *revmap, Buffer buf);
#endif /* BRIN_PAGEOPS_H */

View File

@@ -0,0 +1,39 @@
/*
* brin_revmap.h
* Prototypes for BRIN reverse range maps
*
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* src/include/access/brin_revmap.h
*/
#ifndef BRIN_REVMAP_H
#define BRIN_REVMAP_H
#include "access/brin_tuple.h"
#include "storage/block.h"
#include "storage/buf.h"
#include "storage/itemptr.h"
#include "storage/off.h"
#include "utils/relcache.h"
/*
 * struct definition lives in brin_revmap.c; callers only ever handle
 * pointers to it.
 */
typedef struct BrinRevmap BrinRevmap;
/*
 * NOTE(review): the notes below are read off the signatures and names only;
 * confirm the contracts in brin_revmap.c.
 */
/* create a revmap access object for idxrel; *pagesPerRange is presumably an output */
extern BrinRevmap *brinRevmapInitialize(Relation idxrel,
BlockNumber *pagesPerRange);
/* dispose of an object obtained from brinRevmapInitialize() */
extern void brinRevmapTerminate(BrinRevmap *revmap);
/* presumably ensures the revmap is large enough to cover heapBlk */
extern void brinRevmapExtend(BrinRevmap *revmap,
BlockNumber heapBlk);
/* returns the buffer of the revmap page that covers heapBlk */
extern Buffer brinLockRevmapPageForUpdate(BrinRevmap *revmap,
BlockNumber heapBlk);
/* store tid as the revmap entry for heapBlk on the revmap page in rmbuf */
extern void brinSetHeapBlockItemptr(Buffer rmbuf, BlockNumber pagesPerRange,
BlockNumber heapBlk, ItemPointerData tid);
/*
 * Look up the index tuple for the range containing heapBlk; buf, off and
 * size are presumably outputs describing where it was found.  The meaning
 * of "mode" is not visible here -- TODO confirm.
 */
extern BrinTuple *brinGetTupleForHeapBlock(BrinRevmap *revmap,
BlockNumber heapBlk, Buffer *buf, OffsetNumber *off,
Size *size, int mode);
#endif /* BRIN_REVMAP_H */

View File

@@ -0,0 +1,96 @@
/*
* brin_tuple.h
* Declarations for dealing with BRIN-specific tuples.
*
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* src/include/access/brin_tuple.h
*/
#ifndef BRIN_TUPLE_H
#define BRIN_TUPLE_H
#include "access/brin_internal.h"
#include "access/tupdesc.h"
/*
 * A BRIN index stores one index tuple per page range.  Each index tuple
 * has one BrinValues struct for each indexed column; in turn, each BrinValues
 * has (besides the null flags) an array of Datum whose size is determined by
 * the opclass.
 */
typedef struct BrinValues
{
AttrNumber bv_attno; /* index attribute number */
bool bv_hasnulls; /* are there any nulls in the page range? */
bool bv_allnulls; /* are all values nulls in the page range? */
Datum *bv_values; /* current accumulated values */
} BrinValues;
/*
 * This struct is used to represent an in-memory index tuple.  The values can
 * only be meaningfully decoded with an appropriate BrinDesc.
 *
 * Variable-length: bt_columns has one BrinValues entry per indexed column.
 */
typedef struct BrinMemTuple
{
bool bt_placeholder; /* this is a placeholder tuple */
BlockNumber bt_blkno; /* heap blkno that the tuple is for */
MemoryContext bt_context; /* memcxt holding the bt_columns values */
BrinValues bt_columns[FLEXIBLE_ARRAY_MEMBER];
} BrinMemTuple;
/*
 * An on-disk BRIN tuple.  This is possibly followed by a nulls bitmask with
 * room for two null bits per indexed column; an opclass-defined number of
 * Datum values for each column follows.
 */
typedef struct BrinTuple
{
/* heap block number that the tuple is for */
BlockNumber bt_blkno;
/* ---------------
 * bt_info is laid out in the following fashion:
 *
 * 7th (high) bit: has nulls (BRIN_NULLS_MASK)
 * 6th bit: is placeholder tuple (BRIN_PLACEHOLDER_MASK)
 * 5th bit: unused
 * 4-0 bit: offset of data (BRIN_OFFSET_MASK)
 * ---------------
 */
uint8 bt_info;
} BrinTuple;
/* size of the fixed header, excluding any padding after bt_info */
#define SizeOfBrinTuple (offsetof(BrinTuple, bt_info) + sizeof(uint8))
/*
 * bt_info manipulation macros
 *
 * The masks pick apart the bt_info byte of a BrinTuple; the accessors cast
 * their argument to BrinTuple * before reading it.
 */
#define BRIN_OFFSET_MASK		0x1F
/* bit 0x20 is not used at present */
#define BRIN_PLACEHOLDER_MASK	0x40
#define BRIN_NULLS_MASK			0x80

/* offset of the tuple's data, taken from the low five bits of bt_info */
#define BrinTupleDataOffset(tup) \
	((Size) (BRIN_OFFSET_MASK & ((BrinTuple *) (tup))->bt_info))
/* is the "has nulls" bit set? */
#define BrinTupleHasNulls(tup) \
	((BRIN_NULLS_MASK & ((BrinTuple *) (tup))->bt_info) != 0)
/* is the "placeholder tuple" bit set? */
#define BrinTupleIsPlaceholder(tup) \
	((BRIN_PLACEHOLDER_MASK & ((BrinTuple *) (tup))->bt_info) != 0)
/*
 * Conversion between in-memory (BrinMemTuple) and on-disk (BrinTuple) forms,
 * plus tuple utilities.
 *
 * NOTE(review): the notes below are read off the signatures and names only;
 * confirm ownership and allocation rules in brin_tuple.c.
 */
/* build an on-disk tuple for blkno from an in-memory one; *size is presumably set to its length */
extern BrinTuple *brin_form_tuple(BrinDesc *brdesc, BlockNumber blkno,
BrinMemTuple *tuple, Size *size);
/* build an on-disk placeholder tuple for blkno */
extern BrinTuple *brin_form_placeholder_tuple(BrinDesc *brdesc,
BlockNumber blkno, Size *size);
extern void brin_free_tuple(BrinTuple *tuple);
/* returns a copy of a tuple that is len bytes long */
extern BrinTuple *brin_copy_tuple(BrinTuple *tuple, Size len);
/* compare two on-disk tuples of the given lengths */
extern bool brin_tuples_equal(const BrinTuple *a, Size alen,
const BrinTuple *b, Size blen);
/* in-memory tuple creation and (re)initialization */
extern BrinMemTuple *brin_new_memtuple(BrinDesc *brdesc);
extern void brin_memtuple_initialize(BrinMemTuple *dtuple,
BrinDesc *brdesc);
/* decode an on-disk tuple into its in-memory form */
extern BrinMemTuple *brin_deform_tuple(BrinDesc *brdesc,
BrinTuple *tuple);
#endif /* BRIN_TUPLE_H */

View File

@@ -0,0 +1,109 @@
/*-------------------------------------------------------------------------
*
* brin_xlog.h
* POSTGRES BRIN access XLOG definitions.
*
*
* Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/include/access/brin_xlog.h
*
*-------------------------------------------------------------------------
*/
#ifndef BRIN_XLOG_H
#define BRIN_XLOG_H
#include "access/xlogrecord.h"
#include "lib/stringinfo.h"
#include "storage/bufpage.h"
#include "storage/itemptr.h"
#include "storage/relfilenode.h"
#include "utils/relcache.h"
/*
 * WAL record definitions for BRIN's WAL operations
 *
 * XLOG allows us to store some information in the high 4 bits of the log
 * record's xl_info field.
 *
 * The operation codes occupy the bits selected by XLOG_BRIN_OPMASK (0x70);
 * XLOG_BRIN_INIT_PAGE (0x80) is a separate flag bit outside that mask.
 */
#define XLOG_BRIN_CREATE_INDEX 0x00
#define XLOG_BRIN_INSERT 0x10
#define XLOG_BRIN_UPDATE 0x20
#define XLOG_BRIN_SAMEPAGE_UPDATE 0x30
#define XLOG_BRIN_REVMAP_EXTEND 0x40
#define XLOG_BRIN_REVMAP_VACUUM 0x50
#define XLOG_BRIN_OPMASK 0x70
/*
 * When we insert the first item on a new page, we restore the entire page in
 * redo.
 */
#define XLOG_BRIN_INIT_PAGE 0x80
/* This is what we need to know about a BRIN index create */
typedef struct xl_brin_createidx
{
BlockNumber pagesPerRange; /* pages per block range of the new index */
RelFileNode node;
uint16 version; /* presumably the value stored as brinVersion in the metapage */
} xl_brin_createidx;
/* record size up to and including "version", excluding trailing padding */
#define SizeOfBrinCreateIdx (offsetof(xl_brin_createidx, version) + sizeof(uint16))
/*
 * This is what we need to know about a BRIN tuple insert
 */
typedef struct xl_brin_insert
{
RelFileNode node;
BlockNumber heapBlk; /* heap block the inserted tuple is for */
/* extra information needed to update the revmap */
BlockNumber revmapBlk;
BlockNumber pagesPerRange;
uint16 tuplen; /* presumably length of the tuple data that follows */
ItemPointerData tid;
/* tuple data follows at end of struct */
} xl_brin_insert;
/* record size up to and including "tid", excluding trailing padding */
#define SizeOfBrinInsert (offsetof(xl_brin_insert, tid) + sizeof(ItemPointerData))
/*
 * A cross-page update is the same as an insert, but also stores the old tid.
 */
typedef struct xl_brin_update
{
ItemPointerData oldtid;
xl_brin_insert new; /* the insert half of the update */
} xl_brin_update;
/* offset of the embedded insert record plus its own unpadded size */
#define SizeOfBrinUpdate (offsetof(xl_brin_update, new) + SizeOfBrinInsert)
/* This is what we need to know about a BRIN tuple samepage update */
typedef struct xl_brin_samepage_update
{
RelFileNode node;
ItemPointerData tid;
/* tuple data follows at end of struct */
} xl_brin_samepage_update;
/* record size up to and including "tid", excluding trailing padding */
#define SizeOfBrinSamepageUpdate (offsetof(xl_brin_samepage_update, tid) + sizeof(ItemPointerData))
/* This is what we need to know about a revmap extension */
typedef struct xl_brin_revmap_extend
{
RelFileNode node;
BlockNumber targetBlk; /* presumably the block that becomes a revmap page */
} xl_brin_revmap_extend;
/* record size up to and including "targetBlk", excluding trailing padding */
#define SizeOfBrinRevmapExtend (offsetof(xl_brin_revmap_extend, targetBlk) + \
sizeof(BlockNumber))
/*
 * Resource-manager callbacks for BRIN WAL records; these are the functions
 * named in the RM_BRIN_ID entry of the resource manager list.
 */
/* NOTE(review): presumably appends a description of the record to buf */
extern void brin_desc(StringInfo buf, XLogRecord *record);
/* NOTE(review): presumably replays the record found at lsn */
extern void brin_redo(XLogRecPtr lsn, XLogRecord *record);
/* NOTE(review): presumably maps an xl_info value to a record-type name */
extern const char *brin_identify(uint8 info);
#endif /* BRIN_XLOG_H */

View File

@@ -113,6 +113,8 @@ extern HeapScanDesc heap_beginscan_strat(Relation relation, Snapshot snapshot,
bool allow_strat, bool allow_sync);
extern HeapScanDesc heap_beginscan_bm(Relation relation, Snapshot snapshot,
int nkeys, ScanKey key);
extern void heap_setscanlimits(HeapScanDesc scan, BlockNumber startBlk,
BlockNumber endBlk);
extern void heap_rescan(HeapScanDesc scan, ScanKey key);
extern void heap_endscan(HeapScanDesc scan);
extern HeapTuple heap_getnext(HeapScanDesc scan, ScanDirection direction);

View File

@@ -45,8 +45,9 @@ typedef enum relopt_kind
RELOPT_KIND_TABLESPACE = (1 << 7),
RELOPT_KIND_SPGIST = (1 << 8),
RELOPT_KIND_VIEW = (1 << 9),
RELOPT_KIND_BRIN = (1 << 10),
/* if you add a new kind, make sure you update "last_default" too */
RELOPT_KIND_LAST_DEFAULT = RELOPT_KIND_VIEW,
RELOPT_KIND_LAST_DEFAULT = RELOPT_KIND_BRIN,
/* some compilers treat enums as signed ints, so we can't use 1 << 31 */
RELOPT_KIND_MAX = (1 << 30)
} relopt_kind;

View File

@@ -35,8 +35,10 @@ typedef struct HeapScanDescData
bool rs_temp_snap; /* unregister snapshot at scan end? */
/* state set up at initscan time */
BlockNumber rs_nblocks; /* number of blocks to scan */
BlockNumber rs_nblocks; /* total number of blocks in rel */
BlockNumber rs_startblock; /* block # to start at */
BlockNumber rs_initblock; /* block # to consider initial of rel */
BlockNumber rs_numblocks; /* number of blocks to scan */
BufferAccessStrategy rs_strategy; /* access strategy for reads */
bool rs_syncscan; /* report location to syncscan logic? */

View File

@@ -42,3 +42,4 @@ PG_RMGR(RM_GIN_ID, "Gin", gin_redo, gin_desc, gin_identify, gin_xlog_startup, gi
PG_RMGR(RM_GIST_ID, "Gist", gist_redo, gist_desc, gist_identify, gist_xlog_startup, gist_xlog_cleanup)
PG_RMGR(RM_SEQ_ID, "Sequence", seq_redo, seq_desc, seq_identify, NULL, NULL)
PG_RMGR(RM_SPGIST_ID, "SPGist", spg_redo, spg_desc, spg_identify, spg_xlog_startup, spg_xlog_cleanup)
PG_RMGR(RM_BRIN_ID, "BRIN", brin_redo, brin_desc, brin_identify, NULL, NULL)