mirror of
https://github.com/postgres/postgres.git
synced 2025-08-27 07:42:10 +03:00
Move syncscan.c to src/backend/access/common.
Since the tableam.c code needs to make use of the syncscan.c routines itself, and since other block-oriented AMs might also want to use it one day, it didn't make sense for it to live under src/backend/access/heap. Reviewed-by: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/CA%2BhUKGLCnG%3DNEAByg6bk%2BCT9JZD97Y%3DAxKhh27Su9FeGWOKvDg%40mail.gmail.com
This commit is contained in:
@@ -20,7 +20,6 @@ OBJS = \
|
||||
hio.o \
|
||||
pruneheap.o \
|
||||
rewriteheap.o \
|
||||
syncscan.o \
|
||||
vacuumlazy.o \
|
||||
visibilitymap.o
|
||||
|
||||
|
@@ -41,6 +41,7 @@
|
||||
#include "access/parallel.h"
|
||||
#include "access/relscan.h"
|
||||
#include "access/subtrans.h"
|
||||
#include "access/syncscan.h"
|
||||
#include "access/sysattr.h"
|
||||
#include "access/tableam.h"
|
||||
#include "access/transam.h"
|
||||
|
@@ -24,6 +24,7 @@
|
||||
#include "access/heaptoast.h"
|
||||
#include "access/multixact.h"
|
||||
#include "access/rewriteheap.h"
|
||||
#include "access/syncscan.h"
|
||||
#include "access/tableam.h"
|
||||
#include "access/tsmapi.h"
|
||||
#include "access/xact.h"
|
||||
|
@@ -1,322 +0,0 @@
|
||||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* syncscan.c
|
||||
* heap scan synchronization support
|
||||
*
|
||||
* When multiple backends run a sequential scan on the same table, we try
|
||||
* to keep them synchronized to reduce the overall I/O needed. The goal is
|
||||
* to read each page into shared buffer cache only once, and let all backends
|
||||
* that take part in the shared scan process the page before it falls out of
|
||||
* the cache.
|
||||
*
|
||||
* Since the "leader" in a pack of backends doing a seqscan will have to wait
|
||||
* for I/O, while the "followers" don't, there is a strong self-synchronizing
|
||||
* effect once we can get the backends examining approximately the same part
|
||||
* of the table at the same time. Hence all that is really needed is to get
|
||||
* a new backend beginning a seqscan to begin it close to where other backends
|
||||
* are reading. We can scan the table circularly, from block X up to the
|
||||
* end and then from block 0 to X-1, to ensure we visit all rows while still
|
||||
* participating in the common scan.
|
||||
*
|
||||
* To accomplish that, we keep track of the scan position of each table, and
|
||||
* start new scans close to where the previous scan(s) are. We don't try to
|
||||
* do any extra synchronization to keep the scans together afterwards; some
|
||||
* scans might progress much more slowly than others, for example if the
|
||||
* results need to be transferred to the client over a slow network, and we
|
||||
* don't want such queries to slow down others.
|
||||
*
|
||||
* There can realistically only be a few large sequential scans on different
|
||||
* tables in progress at any time. Therefore we just keep the scan positions
|
||||
* in a small LRU list which we scan every time we need to look up or update a
|
||||
* scan position. The whole mechanism is only applied for tables exceeding
|
||||
* a threshold size (but that is not the concern of this module).
|
||||
*
|
||||
* INTERFACE ROUTINES
|
||||
* ss_get_location - return current scan location of a relation
|
||||
* ss_report_location - update current scan location
|
||||
*
|
||||
*
|
||||
* Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* src/backend/access/heap/syncscan.c
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#include "postgres.h"
|
||||
|
||||
#include "access/heapam.h"
|
||||
#include "miscadmin.h"
|
||||
#include "storage/lwlock.h"
|
||||
#include "storage/shmem.h"
|
||||
#include "utils/rel.h"
|
||||
|
||||
|
||||
/* GUC variables */
|
||||
#ifdef TRACE_SYNCSCAN
|
||||
bool trace_syncscan = false;
|
||||
#endif
|
||||
|
||||
|
||||
/*
|
||||
* Size of the LRU list.
|
||||
*
|
||||
* Note: the code assumes that SYNC_SCAN_NELEM > 1.
|
||||
*
|
||||
* XXX: What's a good value? It should be large enough to hold the
|
||||
* maximum number of large tables scanned simultaneously. But a larger value
|
||||
* means more traversing of the LRU list when starting a new scan.
|
||||
*/
|
||||
#define SYNC_SCAN_NELEM 20
|
||||
|
||||
/*
|
||||
* Interval between reports of the location of the current scan, in pages.
|
||||
*
|
||||
* Note: This should be smaller than the ring size (see buffer/freelist.c)
|
||||
* we use for bulk reads. Otherwise a scan joining other scans might start
|
||||
* from a page that's no longer in the buffer cache. This is a bit fuzzy;
|
||||
* there's no guarantee that the new scan will read the page before it leaves
|
||||
* the buffer cache anyway, and on the other hand the page is most likely
|
||||
* still in the OS cache.
|
||||
*/
|
||||
#define SYNC_SCAN_REPORT_INTERVAL (128 * 1024 / BLCKSZ)
|
||||
|
||||
|
||||
/*
|
||||
* The scan locations structure is essentially a doubly-linked LRU with head
|
||||
* and tail pointer, but designed to hold a fixed maximum number of elements in
|
||||
* fixed-size shared memory.
|
||||
*/
|
||||
typedef struct ss_scan_location_t
|
||||
{
|
||||
RelFileNode relfilenode; /* identity of a relation */
|
||||
BlockNumber location; /* last-reported location in the relation */
|
||||
} ss_scan_location_t;
|
||||
|
||||
typedef struct ss_lru_item_t
|
||||
{
|
||||
struct ss_lru_item_t *prev;
|
||||
struct ss_lru_item_t *next;
|
||||
ss_scan_location_t location;
|
||||
} ss_lru_item_t;
|
||||
|
||||
typedef struct ss_scan_locations_t
|
||||
{
|
||||
ss_lru_item_t *head;
|
||||
ss_lru_item_t *tail;
|
||||
ss_lru_item_t items[FLEXIBLE_ARRAY_MEMBER]; /* SYNC_SCAN_NELEM items */
|
||||
} ss_scan_locations_t;
|
||||
|
||||
#define SizeOfScanLocations(N) \
|
||||
(offsetof(ss_scan_locations_t, items) + (N) * sizeof(ss_lru_item_t))
|
||||
|
||||
/* Pointer to struct in shared memory */
|
||||
static ss_scan_locations_t *scan_locations;
|
||||
|
||||
/* prototypes for internal functions */
|
||||
static BlockNumber ss_search(RelFileNode relfilenode,
|
||||
BlockNumber location, bool set);
|
||||
|
||||
|
||||
/*
|
||||
* SyncScanShmemSize --- report amount of shared memory space needed
|
||||
*/
|
||||
Size
|
||||
SyncScanShmemSize(void)
|
||||
{
|
||||
return SizeOfScanLocations(SYNC_SCAN_NELEM);
|
||||
}
|
||||
|
||||
/*
|
||||
* SyncScanShmemInit --- initialize this module's shared memory
|
||||
*/
|
||||
void
|
||||
SyncScanShmemInit(void)
|
||||
{
|
||||
int i;
|
||||
bool found;
|
||||
|
||||
scan_locations = (ss_scan_locations_t *)
|
||||
ShmemInitStruct("Sync Scan Locations List",
|
||||
SizeOfScanLocations(SYNC_SCAN_NELEM),
|
||||
&found);
|
||||
|
||||
if (!IsUnderPostmaster)
|
||||
{
|
||||
/* Initialize shared memory area */
|
||||
Assert(!found);
|
||||
|
||||
scan_locations->head = &scan_locations->items[0];
|
||||
scan_locations->tail = &scan_locations->items[SYNC_SCAN_NELEM - 1];
|
||||
|
||||
for (i = 0; i < SYNC_SCAN_NELEM; i++)
|
||||
{
|
||||
ss_lru_item_t *item = &scan_locations->items[i];
|
||||
|
||||
/*
|
||||
* Initialize all slots with invalid values. As scans are started,
|
||||
* these invalid entries will fall off the LRU list and get
|
||||
* replaced with real entries.
|
||||
*/
|
||||
item->location.relfilenode.spcNode = InvalidOid;
|
||||
item->location.relfilenode.dbNode = InvalidOid;
|
||||
item->location.relfilenode.relNode = InvalidOid;
|
||||
item->location.location = InvalidBlockNumber;
|
||||
|
||||
item->prev = (i > 0) ?
|
||||
(&scan_locations->items[i - 1]) : NULL;
|
||||
item->next = (i < SYNC_SCAN_NELEM - 1) ?
|
||||
(&scan_locations->items[i + 1]) : NULL;
|
||||
}
|
||||
}
|
||||
else
|
||||
Assert(found);
|
||||
}
|
||||
|
||||
/*
|
||||
* ss_search --- search the scan_locations structure for an entry with the
|
||||
* given relfilenode.
|
||||
*
|
||||
* If "set" is true, the location is updated to the given location. If no
|
||||
* entry for the given relfilenode is found, it will be created at the head
|
||||
* of the list with the given location, even if "set" is false.
|
||||
*
|
||||
* In any case, the location after possible update is returned.
|
||||
*
|
||||
* Caller is responsible for having acquired suitable lock on the shared
|
||||
* data structure.
|
||||
*/
|
||||
static BlockNumber
|
||||
ss_search(RelFileNode relfilenode, BlockNumber location, bool set)
|
||||
{
|
||||
ss_lru_item_t *item;
|
||||
|
||||
item = scan_locations->head;
|
||||
for (;;)
|
||||
{
|
||||
bool match;
|
||||
|
||||
match = RelFileNodeEquals(item->location.relfilenode, relfilenode);
|
||||
|
||||
if (match || item->next == NULL)
|
||||
{
|
||||
/*
|
||||
* If we reached the end of list and no match was found, take over
|
||||
* the last entry
|
||||
*/
|
||||
if (!match)
|
||||
{
|
||||
item->location.relfilenode = relfilenode;
|
||||
item->location.location = location;
|
||||
}
|
||||
else if (set)
|
||||
item->location.location = location;
|
||||
|
||||
/* Move the entry to the front of the LRU list */
|
||||
if (item != scan_locations->head)
|
||||
{
|
||||
/* unlink */
|
||||
if (item == scan_locations->tail)
|
||||
scan_locations->tail = item->prev;
|
||||
item->prev->next = item->next;
|
||||
if (item->next)
|
||||
item->next->prev = item->prev;
|
||||
|
||||
/* link */
|
||||
item->prev = NULL;
|
||||
item->next = scan_locations->head;
|
||||
scan_locations->head->prev = item;
|
||||
scan_locations->head = item;
|
||||
}
|
||||
|
||||
return item->location.location;
|
||||
}
|
||||
|
||||
item = item->next;
|
||||
}
|
||||
|
||||
/* not reached */
|
||||
}
|
||||
|
||||
/*
|
||||
* ss_get_location --- get the optimal starting location for scan
|
||||
*
|
||||
* Returns the last-reported location of a sequential scan on the
|
||||
* relation, or 0 if no valid location is found.
|
||||
*
|
||||
* We expect the caller has just done RelationGetNumberOfBlocks(), and
|
||||
* so that number is passed in rather than computing it again. The result
|
||||
* is guaranteed less than relnblocks (assuming that's > 0).
|
||||
*/
|
||||
BlockNumber
|
||||
ss_get_location(Relation rel, BlockNumber relnblocks)
|
||||
{
|
||||
BlockNumber startloc;
|
||||
|
||||
LWLockAcquire(SyncScanLock, LW_EXCLUSIVE);
|
||||
startloc = ss_search(rel->rd_node, 0, false);
|
||||
LWLockRelease(SyncScanLock);
|
||||
|
||||
/*
|
||||
* If the location is not a valid block number for this scan, start at 0.
|
||||
*
|
||||
* This can happen if for instance a VACUUM truncated the table since the
|
||||
* location was saved.
|
||||
*/
|
||||
if (startloc >= relnblocks)
|
||||
startloc = 0;
|
||||
|
||||
#ifdef TRACE_SYNCSCAN
|
||||
if (trace_syncscan)
|
||||
elog(LOG,
|
||||
"SYNC_SCAN: start \"%s\" (size %u) at %u",
|
||||
RelationGetRelationName(rel), relnblocks, startloc);
|
||||
#endif
|
||||
|
||||
return startloc;
|
||||
}
|
||||
|
||||
/*
|
||||
* ss_report_location --- update the current scan location
|
||||
*
|
||||
* Writes an entry into the shared Sync Scan state of the form
|
||||
* (relfilenode, blocknumber), overwriting any existing entry for the
|
||||
* same relfilenode.
|
||||
*/
|
||||
void
|
||||
ss_report_location(Relation rel, BlockNumber location)
|
||||
{
|
||||
#ifdef TRACE_SYNCSCAN
|
||||
if (trace_syncscan)
|
||||
{
|
||||
if ((location % 1024) == 0)
|
||||
elog(LOG,
|
||||
"SYNC_SCAN: scanning \"%s\" at %u",
|
||||
RelationGetRelationName(rel), location);
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* To reduce lock contention, only report scan progress every N pages. For
|
||||
* the same reason, don't block if the lock isn't immediately available.
|
||||
* Missing a few updates isn't critical, it just means that a new scan
|
||||
* that wants to join the pack will start a little bit behind the head of
|
||||
* the scan. Hopefully the pages are still in OS cache and the scan
|
||||
* catches up quickly.
|
||||
*/
|
||||
if ((location % SYNC_SCAN_REPORT_INTERVAL) == 0)
|
||||
{
|
||||
if (LWLockConditionalAcquire(SyncScanLock, LW_EXCLUSIVE))
|
||||
{
|
||||
(void) ss_search(rel->rd_node, location, true);
|
||||
LWLockRelease(SyncScanLock);
|
||||
}
|
||||
#ifdef TRACE_SYNCSCAN
|
||||
else if (trace_syncscan)
|
||||
elog(LOG,
|
||||
"SYNC_SCAN: missed update for \"%s\" at %u",
|
||||
RelationGetRelationName(rel), location);
|
||||
#endif
|
||||
}
|
||||
}
|
Reference in New Issue
Block a user