When testing buffer pool logic, it is useful to be able to evict arbitrary blocks. This function can be used in SQL queries over the pg_buffercache view to set up a wide range of buffer pool states. Of course, buffer mappings might change concurrently, so you might evict a block other than the one you had in mind, and another session might bring it back in at any time. That's OK for the intended purpose of setting up developer testing scenarios, and more complicated interlocking schemes to give stronger guarantees about that would likely be less flexible for actual testing work anyway. Superuser-only.

Author: Palak Chaturvedi <chaturvedipalak1911@gmail.com>
Author: Thomas Munro <thomas.munro@gmail.com> (docs, small tweaks)
Reviewed-by: Nitin Jadhav <nitinjadhavpostgres@gmail.com>
Reviewed-by: Andres Freund <andres@anarazel.de>
Reviewed-by: Cary Huang <cary.huang@highgo.ca>
Reviewed-by: Cédric Villemain <cedric.villemain+pgsql@abcsql.com>
Reviewed-by: Jim Nasby <jim.nasby@gmail.com>
Reviewed-by: Maxim Orlov <orlovmg@gmail.com>
Reviewed-by: Thomas Munro <thomas.munro@gmail.com>
Reviewed-by: Melanie Plageman <melanieplageman@gmail.com>
Discussion: https://postgr.es/m/CALfch19pW48ZwWzUoRSpsaV9hqt0UPyaBPC4bOZ4W+c7FF566A@mail.gmail.com
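A minimal sketch of the usage pattern the commit message describes, assuming a scratch table named t (a placeholder); eviction is best-effort, since another session can re-read a block or change the buffer mapping at any time:

	-- Request eviction of every cached block of the test relation.
	-- Each call returns true if the buffer was evicted, false otherwise.
	SELECT bufferid, pg_buffercache_evict(bufferid)
	FROM pg_buffercache
	WHERE relfilenode = pg_relation_filenode('t'::regclass);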
pg_buffercache_pages.c · 370 lines · 10 KiB · C
/*-------------------------------------------------------------------------
 *
 * pg_buffercache_pages.c
 *	  display some contents of the buffer cache
 *
 *	  contrib/pg_buffercache/pg_buffercache_pages.c
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/htup_details.h"
#include "catalog/pg_type.h"
#include "funcapi.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"


#define NUM_BUFFERCACHE_PAGES_MIN_ELEM	8
#define NUM_BUFFERCACHE_PAGES_ELEM	9
#define NUM_BUFFERCACHE_SUMMARY_ELEM 5
#define NUM_BUFFERCACHE_USAGE_COUNTS_ELEM 4

PG_MODULE_MAGIC;

/*
 * Record structure holding the to be exposed cache data.
 */
typedef struct
{
	uint32		bufferid;
	RelFileNumber relfilenumber;
	Oid			reltablespace;
	Oid			reldatabase;
	ForkNumber	forknum;
	BlockNumber blocknum;
	bool		isvalid;
	bool		isdirty;
	uint16		usagecount;

	/*
	 * An int32 is sufficiently large, as MAX_BACKENDS prevents a buffer from
	 * being pinned by too many backends and each backend will only pin once
	 * because of bufmgr.c's PrivateRefCount infrastructure.
	 */
	int32		pinning_backends;
} BufferCachePagesRec;


/*
 * Function context for data persisting over repeated calls.
 */
typedef struct
{
	TupleDesc	tupdesc;
	BufferCachePagesRec *record;
} BufferCachePagesContext;


/*
 * Function returning data from the shared buffer cache - buffer number,
 * relation node/tablespace/database/blocknum and dirty indicator.
 */
PG_FUNCTION_INFO_V1(pg_buffercache_pages);
PG_FUNCTION_INFO_V1(pg_buffercache_summary);
PG_FUNCTION_INFO_V1(pg_buffercache_usage_counts);
PG_FUNCTION_INFO_V1(pg_buffercache_evict);

Datum
pg_buffercache_pages(PG_FUNCTION_ARGS)
{
	FuncCallContext *funcctx;
	Datum		result;
	MemoryContext oldcontext;
	BufferCachePagesContext *fctx;	/* User function context. */
	TupleDesc	tupledesc;
	TupleDesc	expected_tupledesc;
	HeapTuple	tuple;

	if (SRF_IS_FIRSTCALL())
	{
		int			i;

		funcctx = SRF_FIRSTCALL_INIT();

		/* Switch context when allocating stuff to be used in later calls */
		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);

		/* Create a user function context for cross-call persistence */
		fctx = (BufferCachePagesContext *) palloc(sizeof(BufferCachePagesContext));

		/*
		 * To smoothly support upgrades from version 1.0 of this extension
		 * transparently handle the (non-)existence of the pinning_backends
		 * column. We unfortunately have to get the result type for that... -
		 * we can't use the result type determined by the function definition
		 * without potentially crashing when somebody uses the old (or even
		 * wrong) function definition though.
		 */
		if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE)
			elog(ERROR, "return type must be a row type");

		if (expected_tupledesc->natts < NUM_BUFFERCACHE_PAGES_MIN_ELEM ||
			expected_tupledesc->natts > NUM_BUFFERCACHE_PAGES_ELEM)
			elog(ERROR, "incorrect number of output arguments");

		/* Construct a tuple descriptor for the result rows. */
		tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);
		TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid",
						   INT4OID, -1, 0);
		TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode",
						   OIDOID, -1, 0);
		TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace",
						   OIDOID, -1, 0);
		TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase",
						   OIDOID, -1, 0);
		TupleDescInitEntry(tupledesc, (AttrNumber) 5, "relforknumber",
						   INT2OID, -1, 0);
		TupleDescInitEntry(tupledesc, (AttrNumber) 6, "relblocknumber",
						   INT8OID, -1, 0);
		TupleDescInitEntry(tupledesc, (AttrNumber) 7, "isdirty",
						   BOOLOID, -1, 0);
		TupleDescInitEntry(tupledesc, (AttrNumber) 8, "usage_count",
						   INT2OID, -1, 0);

		if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM)
			TupleDescInitEntry(tupledesc, (AttrNumber) 9, "pinning_backends",
							   INT4OID, -1, 0);

		fctx->tupdesc = BlessTupleDesc(tupledesc);

		/* Allocate NBuffers worth of BufferCachePagesRec records. */
		fctx->record = (BufferCachePagesRec *)
			MemoryContextAllocHuge(CurrentMemoryContext,
								   sizeof(BufferCachePagesRec) * NBuffers);

		/* Set max calls and remember the user function context. */
		funcctx->max_calls = NBuffers;
		funcctx->user_fctx = fctx;

		/* Return to original context when allocating transient memory */
		MemoryContextSwitchTo(oldcontext);

		/*
		 * Scan through all the buffers, saving the relevant fields in the
		 * fctx->record structure.
		 *
		 * We don't hold the partition locks, so we don't get a consistent
		 * snapshot across all buffers, but we do grab the buffer header
		 * locks, so the information of each buffer is self-consistent.
		 */
		for (i = 0; i < NBuffers; i++)
		{
			BufferDesc *bufHdr;
			uint32		buf_state;

			bufHdr = GetBufferDescriptor(i);
			/* Lock each buffer header before inspecting. */
			buf_state = LockBufHdr(bufHdr);

			fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr);
			fctx->record[i].relfilenumber = BufTagGetRelNumber(&bufHdr->tag);
			fctx->record[i].reltablespace = bufHdr->tag.spcOid;
			fctx->record[i].reldatabase = bufHdr->tag.dbOid;
			fctx->record[i].forknum = BufTagGetForkNum(&bufHdr->tag);
			fctx->record[i].blocknum = bufHdr->tag.blockNum;
			fctx->record[i].usagecount = BUF_STATE_GET_USAGECOUNT(buf_state);
			fctx->record[i].pinning_backends = BUF_STATE_GET_REFCOUNT(buf_state);

			if (buf_state & BM_DIRTY)
				fctx->record[i].isdirty = true;
			else
				fctx->record[i].isdirty = false;

			/* Note if the buffer is valid, and has storage created */
			if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID))
				fctx->record[i].isvalid = true;
			else
				fctx->record[i].isvalid = false;

			UnlockBufHdr(bufHdr, buf_state);
		}
	}

	funcctx = SRF_PERCALL_SETUP();

	/* Get the saved state */
	fctx = funcctx->user_fctx;

	if (funcctx->call_cntr < funcctx->max_calls)
	{
		uint32		i = funcctx->call_cntr;
		Datum		values[NUM_BUFFERCACHE_PAGES_ELEM];
		bool		nulls[NUM_BUFFERCACHE_PAGES_ELEM];

		values[0] = Int32GetDatum(fctx->record[i].bufferid);
		nulls[0] = false;

		/*
		 * Set all fields except the bufferid to null if the buffer is unused
		 * or not valid.
		 */
		if (fctx->record[i].blocknum == InvalidBlockNumber ||
			fctx->record[i].isvalid == false)
		{
			nulls[1] = true;
			nulls[2] = true;
			nulls[3] = true;
			nulls[4] = true;
			nulls[5] = true;
			nulls[6] = true;
			nulls[7] = true;
			/* unused for v1.0 callers, but the array is always long enough */
			nulls[8] = true;
		}
		else
		{
			values[1] = ObjectIdGetDatum(fctx->record[i].relfilenumber);
			nulls[1] = false;
			values[2] = ObjectIdGetDatum(fctx->record[i].reltablespace);
			nulls[2] = false;
			values[3] = ObjectIdGetDatum(fctx->record[i].reldatabase);
			nulls[3] = false;
			values[4] = ObjectIdGetDatum(fctx->record[i].forknum);
			nulls[4] = false;
			values[5] = Int64GetDatum((int64) fctx->record[i].blocknum);
			nulls[5] = false;
			values[6] = BoolGetDatum(fctx->record[i].isdirty);
			nulls[6] = false;
			values[7] = Int16GetDatum(fctx->record[i].usagecount);
			nulls[7] = false;
			/* unused for v1.0 callers, but the array is always long enough */
			values[8] = Int32GetDatum(fctx->record[i].pinning_backends);
			nulls[8] = false;
		}

		/* Build and return the tuple. */
		tuple = heap_form_tuple(fctx->tupdesc, values, nulls);
		result = HeapTupleGetDatum(tuple);

		SRF_RETURN_NEXT(funcctx, result);
	}
	else
		SRF_RETURN_DONE(funcctx);
}

Datum
pg_buffercache_summary(PG_FUNCTION_ARGS)
{
	Datum		result;
	TupleDesc	tupledesc;
	HeapTuple	tuple;
	Datum		values[NUM_BUFFERCACHE_SUMMARY_ELEM];
	bool		nulls[NUM_BUFFERCACHE_SUMMARY_ELEM];

	int32		buffers_used = 0;
	int32		buffers_unused = 0;
	int32		buffers_dirty = 0;
	int32		buffers_pinned = 0;
	int64		usagecount_total = 0;

	if (get_call_result_type(fcinfo, NULL, &tupledesc) != TYPEFUNC_COMPOSITE)
		elog(ERROR, "return type must be a row type");

	for (int i = 0; i < NBuffers; i++)
	{
		BufferDesc *bufHdr;
		uint32		buf_state;

		/*
		 * This function summarizes the state of all headers. Locking the
		 * buffer headers wouldn't provide an improved result as the state of
		 * the buffer can still change after we release the lock and it'd
		 * noticeably increase the cost of the function.
		 */
		bufHdr = GetBufferDescriptor(i);
		buf_state = pg_atomic_read_u32(&bufHdr->state);

		if (buf_state & BM_VALID)
		{
			buffers_used++;
			usagecount_total += BUF_STATE_GET_USAGECOUNT(buf_state);

			if (buf_state & BM_DIRTY)
				buffers_dirty++;
		}
		else
			buffers_unused++;

		if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
			buffers_pinned++;
	}

	memset(nulls, 0, sizeof(nulls));
	values[0] = Int32GetDatum(buffers_used);
	values[1] = Int32GetDatum(buffers_unused);
	values[2] = Int32GetDatum(buffers_dirty);
	values[3] = Int32GetDatum(buffers_pinned);

	if (buffers_used != 0)
		values[4] = Float8GetDatum((double) usagecount_total / buffers_used);
	else
		nulls[4] = true;

	/* Build and return the tuple. */
	tuple = heap_form_tuple(tupledesc, values, nulls);
	result = HeapTupleGetDatum(tuple);

	PG_RETURN_DATUM(result);
}

Datum
pg_buffercache_usage_counts(PG_FUNCTION_ARGS)
{
	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
	int			usage_counts[BM_MAX_USAGE_COUNT + 1] = {0};
	int			dirty[BM_MAX_USAGE_COUNT + 1] = {0};
	int			pinned[BM_MAX_USAGE_COUNT + 1] = {0};
	Datum		values[NUM_BUFFERCACHE_USAGE_COUNTS_ELEM];
	bool		nulls[NUM_BUFFERCACHE_USAGE_COUNTS_ELEM] = {0};

	InitMaterializedSRF(fcinfo, 0);

	for (int i = 0; i < NBuffers; i++)
	{
		BufferDesc *bufHdr = GetBufferDescriptor(i);
		uint32		buf_state = pg_atomic_read_u32(&bufHdr->state);
		int			usage_count;

		usage_count = BUF_STATE_GET_USAGECOUNT(buf_state);
		usage_counts[usage_count]++;

		if (buf_state & BM_DIRTY)
			dirty[usage_count]++;

		if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
			pinned[usage_count]++;
	}

	for (int i = 0; i < BM_MAX_USAGE_COUNT + 1; i++)
	{
		values[0] = Int32GetDatum(i);
		values[1] = Int32GetDatum(usage_counts[i]);
		values[2] = Int32GetDatum(dirty[i]);
		values[3] = Int32GetDatum(pinned[i]);

		tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
	}

	return (Datum) 0;
}

/*
 * Try to evict a shared buffer.
 */
Datum
pg_buffercache_evict(PG_FUNCTION_ARGS)
{
	Buffer		buf = PG_GETARG_INT32(0);

	if (!superuser())
		ereport(ERROR,
				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
				 errmsg("must be superuser to use pg_buffercache_evict function")));

	if (buf < 1 || buf > NBuffers)
		elog(ERROR, "bad buffer ID: %d", buf);

	PG_RETURN_BOOL(EvictUnpinnedBuffer(buf));
}
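The two summary SRFs defined above can be exercised directly from SQL; the numbers they report depend entirely on the state of the buffer cache at the moment of the call:

	-- One row of cluster-wide totals, plus the average usage count.
	SELECT * FROM pg_buffercache_summary();

	-- One row per usage count (0..BM_MAX_USAGE_COUNT).
	SELECT * FROM pg_buffercache_usage_counts();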