postgres/src/backend/executor/instrument.c

/*-------------------------------------------------------------------------
 *
 * instrument.c
 *	 functions for instrumentation of plan execution
 *
 *
 * Copyright (c) 2001-2006, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/executor/instrument.c,v 1.17 2006/06/07 18:49:03 tgl Exp $
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <math.h>
#include <unistd.h>

#include "executor/instrument.h"


/*
 * As of PostgreSQL 8.2, we try to reduce the overhead of EXPLAIN ANALYZE
 * by not calling INSTR_TIME_SET_CURRENT() for every single node execution.
 * (Because that requires a kernel call on most systems, it's expensive.)
 *
 * This macro determines the sampling interval: that is, after how many more
 * iterations will we take the next time sample, given that niters iterations
 * have occurred already.  In general, if the function is f(x), then for N
 * iterations we will take on the order of integral(1/f(x), x=0..N)
 * samples.  Some examples follow, with the number of samples that would be
 * collected over 1,000,000 iterations:
 *
 *		f(x) = x         =>   log2(N)                        20
 *		f(x) = x^(1/2)   =>   2 * N^(1/2)                  2000
 *		f(x) = x^(1/3)   =>   1.5 * N^(2/3)               15000
 *
 * I've chosen the last one as it seems to provide a good compromise between
 * low overhead but still getting a meaningful number of samples. However,
 * not all machines have the cbrt() function so on those we substitute
 * sqrt(). The difference is not very significant in the tests I made.
 *
 * The actual sampling interval is randomized with the SampleFunc() value
 * as the mean; this hopefully will reduce any measurement bias due to
 * variation in the node execution time.
 */
#ifdef HAVE_CBRT
#define SampleFunc(niters) cbrt(niters)
#else
#define SampleFunc(niters) sqrt(niters)
#endif

#define SampleInterval(niters) \
	(SampleFunc(niters) * (double) random() / (double) (MAX_RANDOM_VALUE/2))

/*
 * We sample at every node iteration until we've reached this threshold,
 * so that nodes not called a large number of times are completely accurate.
 * (Perhaps this should be a GUC variable?)
 */
#define SAMPLE_THRESHOLD 50

/*
 * Determine the sampling overhead.  This only needs to be done once per
 * backend (actually, probably once per postmaster would do ...)
 */
static double SampleOverhead;
static bool SampleOverheadCalculated = false;

static void
CalculateSampleOverhead(void)
{
	int i;

	/*
	 * We could get a wrong result due to being interrupted by task switch.
	 * To minimize the risk we do it a few times and take the lowest.
	 */
	SampleOverhead = 1.0e6;

	for (i = 0; i < 5; i++)
	{
		Instrumentation timer;
		instr_time	tmptime;
		int j;
		double overhead;

		memset(&timer, 0, sizeof(timer));
		InstrStartNode(&timer);
#define TEST_COUNT 100
		for (j = 0; j < TEST_COUNT; j++)
		{
			INSTR_TIME_SET_CURRENT(tmptime);
		}
		InstrStopNode(&timer, 1);
		overhead = INSTR_TIME_GET_DOUBLE(timer.counter) / TEST_COUNT;
		if (overhead < SampleOverhead)
			SampleOverhead = overhead;
	}

	SampleOverheadCalculated = true;
}


/* Allocate new instrumentation structure(s) */
Instrumentation *
InstrAlloc(int n)
{
	Instrumentation *instr;

	/* Calculate sampling overhead, if not done yet in this backend */
	if (!SampleOverheadCalculated)
		CalculateSampleOverhead();

	instr = palloc0(n * sizeof(Instrumentation));

	/* we don't need to do any initialization except zero 'em */

	return instr;
}

/* Entry to a plan node */
void
InstrStartNode(Instrumentation *instr)
{
	if (INSTR_TIME_IS_ZERO(instr->starttime))
	{
		/*
		 * Always sample if not yet up to threshold, else check whether
		 * next threshold has been reached
		 */
		if (instr->itercount < SAMPLE_THRESHOLD)
			instr->sampling = true;
		else if (instr->itercount >= instr->nextsample)
		{
			instr->sampling = true;
			instr->nextsample =
				instr->itercount + SampleInterval(instr->itercount);
		}
		if (instr->sampling)
			INSTR_TIME_SET_CURRENT(instr->starttime);
	}
	else
		elog(DEBUG2, "InstrStartNode called twice in a row");
}

/* Exit from a plan node */
void
InstrStopNode(Instrumentation *instr, double nTuples)
{
	/* count the returned tuples and iterations */
	instr->tuplecount += nTuples;
	instr->itercount += 1;

	/* measure runtime if appropriate */
	if (instr->sampling)
	{
		instr_time	endtime;

		/*
		 * To be sure that SampleOverhead accurately reflects the extra
		 * overhead, we must do INSTR_TIME_SET_CURRENT() as the *first*
		 * action that is different between the sampling and non-sampling
		 * code paths.
		 */
		INSTR_TIME_SET_CURRENT(endtime);

		if (INSTR_TIME_IS_ZERO(instr->starttime))
		{
			elog(DEBUG2, "InstrStopNode called without start");
			return;
		}

#ifndef WIN32
		instr->counter.tv_sec += endtime.tv_sec - instr->starttime.tv_sec;
		instr->counter.tv_usec += endtime.tv_usec - instr->starttime.tv_usec;

		/* Normalize after each add to avoid overflow/underflow of tv_usec */
		while (instr->counter.tv_usec < 0)
		{
			instr->counter.tv_usec += 1000000;
			instr->counter.tv_sec--;
		}
		while (instr->counter.tv_usec >= 1000000)
		{
			instr->counter.tv_usec -= 1000000;
			instr->counter.tv_sec++;
		}
#else							/* WIN32 */
		instr->counter.QuadPart += (endtime.QuadPart - instr->starttime.QuadPart);
#endif

		INSTR_TIME_SET_ZERO(instr->starttime);

		instr->samplecount += 1;
		instr->sampling = false;
	}

	/* Is this the first tuple of this cycle? */
	if (!instr->running)
	{
		instr->running = true;
		instr->firsttuple = INSTR_TIME_GET_DOUBLE(instr->counter);
	}
}

/* Finish a run cycle for a plan node */
void
InstrEndLoop(Instrumentation *instr)
{
	double		totaltime;

	/* Skip if nothing has happened, or already shut down */
	if (!instr->running)
		return;

	if (!INSTR_TIME_IS_ZERO(instr->starttime))
		elog(DEBUG2, "InstrEndLoop called on running node");

	/* Compute time spent in node */
	totaltime = INSTR_TIME_GET_DOUBLE(instr->counter);

	/*
	 * If we didn't measure runtime on every iteration, then we must increase
	 * the measured total to account for the other iterations.  Naively
	 * multiplying totaltime by itercount/samplecount would be wrong because
	 * it effectively assumes the sampling overhead applies to all iterations,
	 * even the ones we didn't measure.  (Note that what we are trying to
	 * estimate here is the actual time spent in the node, including the
	 * actual measurement overhead; not the time exclusive of measurement
	 * overhead.)  We exclude the first iteration from the correction basis,
	 * because it often takes longer than others.
	 */
	if (instr->itercount > instr->samplecount)
	{
		double per_iter;

		per_iter = (totaltime - instr->firsttuple) / (instr->samplecount - 1)
			- SampleOverhead;
		if (per_iter > 0)				/* sanity check */
			totaltime += per_iter * (instr->itercount - instr->samplecount);
	}

	/* Accumulate per-cycle statistics into totals */
	instr->startup += instr->firsttuple;
	instr->total += totaltime;
	instr->ntuples += instr->tuplecount;
	instr->nloops += 1;

	/* Reset for next cycle (if any) */
	instr->running = false;
	instr->sampling = false;
	INSTR_TIME_SET_ZERO(instr->starttime);
	INSTR_TIME_SET_ZERO(instr->counter);
	instr->firsttuple = 0;
	instr->tuplecount = 0;
	instr->itercount = 0;
	instr->samplecount = 0;
	instr->nextsample = 0;
}