aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/backend/executor/instrument.c178
-rw-r--r--src/include/executor/instrument.h11
2 files changed, 113 insertions, 76 deletions
diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c
index 41b061bd526..d094ffbbcf6 100644
--- a/src/backend/executor/instrument.c
+++ b/src/backend/executor/instrument.c
@@ -7,90 +7,117 @@
* Copyright (c) 2001-2006, PostgreSQL Global Development Group
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/executor/instrument.c,v 1.15 2006/05/30 14:01:58 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/executor/instrument.c,v 1.16 2006/05/30 19:24:25 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
-#include <unistd.h>
#include <math.h>
+#include <unistd.h>
#include "executor/instrument.h"
-/* This is the function that is used to determine the sampling intervals. In
- * general, if the function is f(x), then for N tuples we will take on the
- * order of integral(1/f(x), x=0..N) samples. Some examples follow, with the
- * number of samples that would be collected over 1,000,000 tuples.
- f(x) = x => log2(N) 20
- f(x) = x^(1/2) => 2 * N^(1/2) 2000
- f(x) = x^(1/3) => 1.5 * N^(2/3) 15000
-
+/*
+ * As of PostgreSQL 8.2, we try to reduce the overhead of EXPLAIN ANALYZE
+ * by not calling INSTR_TIME_SET_CURRENT() for every single node execution.
+ * (Because that requires a kernel call on most systems, it's expensive.)
+ *
+ * This macro determines the sampling interval: that is, after how many more
+ * iterations will we take the next time sample, given that niters iterations
+ * have occurred already. In general, if the function is f(x), then for N
+ * iterations we will take on the order of integral(1/f(x), x=0..N)
+ * samples. Some examples follow, with the number of samples that would be
+ * collected over 1,000,000 iterations:
+ *
+ * f(x) = x => log2(N) 20
+ * f(x) = x^(1/2) => 2 * N^(1/2) 2000
+ * f(x) = x^(1/3) => 1.5 * N^(2/3) 15000
+ *
* I've chosen the last one as it seems to provide a good compromise between
* low overhead but still getting a meaningful number of samples. However,
* not all machines have the cbrt() function so on those we substitute
* sqrt(). The difference is not very significant in the tests I made.
-*/
+ *
+ * The actual sampling interval is randomized with the SampleFunc() value
+ * as the mean; this hopefully will reduce any measurement bias due to
+ * cyclic variation in the node execution time.
+ */
#ifdef HAVE_CBRT
-#define SampleFunc cbrt
+#define SampleFunc(niters) cbrt(niters)
#else
-#define SampleFunc sqrt
+#define SampleFunc(niters) sqrt(niters)
#endif
+#define SampleInterval(niters) \
+ (SampleFunc(niters) * (double) random() / (double) (MAX_RANDOM_VALUE/2))
+
+/*
+ * We sample at every node iteration until we've reached this threshold,
+ * so that nodes not called a large number of times are completely accurate.
+ * (Perhaps this should be a GUC variable? But beware of messing up
+ * CalculateSampleOverhead if value is too small.)
+ */
#define SAMPLE_THRESHOLD 50
+/*
+ * Determine the sampling overhead. This only needs to be done once per
+ * backend (actually, probably once per postmaster would do ...)
+ */
static double SampleOverhead;
-static bool SampleOverheadCalculated;
+static bool SampleOverheadCalculated = false;
static void
-CalculateSampleOverhead()
+CalculateSampleOverhead(void)
{
Instrumentation instr;
int i;
-
- /* We want to determine the sampling overhead, to correct
- * calculations later. This only needs to be done once per backend.
- * Is this the place? A wrong value here (due to a mistimed
- * task-switch) will cause bad calculations later.
- *
+
+ /*
+ * We could get a wrong result due to being interrupted by task switch.
* To minimize the risk we do it a few times and take the lowest.
*/
-
SampleOverhead = 1.0e6;
-
- for( i = 0; i<5; i++ )
+
+ for (i = 0; i < 5; i++)
{
int j;
double overhead;
-
- memset( &instr, 0, sizeof(instr) );
-
- /* Loop SAMPLE_THRESHOLD times or 100 microseconds, whichever is faster */
- for( j=0; j<SAMPLE_THRESHOLD && INSTR_TIME_GET_DOUBLE(instr.counter) < 100e-6; i++ )
+
+ memset(&instr, 0, sizeof(instr));
+ /*
+ * We know that samples will actually be taken up to SAMPLE_THRESHOLD,
+ * so that's as far as we can test.
+ */
+ for (j=0; j < SAMPLE_THRESHOLD; j++)
{
- InstrStartNode( &instr );
- InstrStopNode( &instr, 1 );
+ InstrStartNode(&instr);
+ InstrStopNode(&instr, 1);
}
overhead = INSTR_TIME_GET_DOUBLE(instr.counter) / instr.samplecount;
- if( overhead < SampleOverhead )
+ if (overhead < SampleOverhead)
SampleOverhead = overhead;
}
-
+
SampleOverheadCalculated = true;
}
+
/* Allocate new instrumentation structure(s) */
Instrumentation *
InstrAlloc(int n)
{
- Instrumentation *instr = palloc0(n * sizeof(Instrumentation));
+ Instrumentation *instr;
- /* we don't need to do any initialization except zero 'em */
-
- /* Calculate overhead, if not done yet */
- if( !SampleOverheadCalculated )
+ /* Calculate sampling overhead, if not done yet in this backend */
+ if (!SampleOverheadCalculated)
CalculateSampleOverhead();
+
+ instr = palloc0(n * sizeof(Instrumentation));
+
+ /* we don't need to do any initialization except zero 'em */
+
return instr;
}
@@ -100,18 +127,17 @@ InstrStartNode(Instrumentation *instr)
{
if (INSTR_TIME_IS_ZERO(instr->starttime))
{
- /* We always sample the first SAMPLE_THRESHOLD tuples, so small nodes are always accurate */
- if (instr->tuplecount < SAMPLE_THRESHOLD)
+ /*
+ * Always sample if not yet up to threshold, else check whether
+ * next threshold has been reached
+ */
+ if (instr->itercount < SAMPLE_THRESHOLD)
instr->sampling = true;
- else
+ else if (instr->itercount >= instr->nextsample)
{
- /* Otherwise we go to sampling, see the comments on SampleFunc at the top of the file */
- if( instr->tuplecount > instr->nextsample )
- {
- instr->sampling = true;
- /* The doubling is so the random will average 1 over time */
- instr->nextsample += 2.0 * SampleFunc(instr->tuplecount) * (double)rand() / (double)RAND_MAX;
- }
+ instr->sampling = true;
+ instr->nextsample =
+ instr->itercount + SampleInterval(instr->itercount);
}
if (instr->sampling)
INSTR_TIME_SET_CURRENT(instr->starttime);
@@ -124,13 +150,15 @@ InstrStartNode(Instrumentation *instr)
void
InstrStopNode(Instrumentation *instr, double nTuples)
{
- instr_time endtime;
-
- /* count the returned tuples */
+ /* count the returned tuples and iterations */
instr->tuplecount += nTuples;
+ instr->itercount += 1;
+ /* measure runtime if appropriate */
if (instr->sampling)
{
+ instr_time endtime;
+
if (INSTR_TIME_IS_ZERO(instr->starttime))
{
elog(DEBUG2, "InstrStopNode called without start");
@@ -159,7 +187,8 @@ InstrStopNode(Instrumentation *instr, double nTuples)
#endif
INSTR_TIME_SET_ZERO(instr->starttime);
- instr->samplecount += nTuples;
+
+ instr->samplecount += 1;
instr->sampling = false;
}
@@ -184,35 +213,44 @@ InstrEndLoop(Instrumentation *instr)
if (!INSTR_TIME_IS_ZERO(instr->starttime))
elog(DEBUG2, "InstrEndLoop called on running node");
- /* Accumulate per-cycle statistics into totals */
+ /* Compute time spent in node */
totaltime = INSTR_TIME_GET_DOUBLE(instr->counter);
- instr->startup += instr->firsttuple;
-
- /* Here we take into account sampling effects. Doing it naively ends
- * up assuming the sampling overhead applies to all tuples, even the
- * ones we didn't measure. We've calculated an overhead, so we
- * subtract that for all samples we didn't measure. The first tuple
- * is also special cased, because it usually takes longer. */
-
- if( instr->samplecount < instr->tuplecount )
+ /*
+ * If we didn't measure runtime on every iteration, then we must increase
+ * the measured total to account for the other iterations. Naively
+ * multiplying totaltime by itercount/samplecount would be wrong because
+ * it effectively assumes the sampling overhead applies to all iterations,
+ * even the ones we didn't measure. (Note that what we are trying to
+ * estimate here is the actual time spent in the node, including the
+ * actual measurement overhead; not the time exclusive of measurement
+ * overhead.) We exclude the first iteration from the correction basis,
+ * because it often takes longer than others.
+ */
+ if (instr->itercount > instr->samplecount)
{
- double pertuple = (totaltime - instr->firsttuple) / (instr->samplecount - 1);
- instr->total += instr->firsttuple + (pertuple * (instr->samplecount - 1))
- + ((pertuple - SampleOverhead) * (instr->tuplecount - instr->samplecount));
+ double per_iter;
+
+ per_iter = (totaltime - instr->firsttuple) / (instr->samplecount - 1)
+ - SampleOverhead;
+ if (per_iter > 0) /* sanity check */
+ totaltime += per_iter * (instr->itercount - instr->samplecount);
}
- else
- instr->total += totaltime;
-
+
+ /* Accumulate per-cycle statistics into totals */
+ instr->startup += instr->firsttuple;
+ instr->total += totaltime;
instr->ntuples += instr->tuplecount;
- instr->nsamples += instr->samplecount;
instr->nloops += 1;
/* Reset for next cycle (if any) */
instr->running = false;
+ instr->sampling = false;
INSTR_TIME_SET_ZERO(instr->starttime);
INSTR_TIME_SET_ZERO(instr->counter);
instr->firsttuple = 0;
- instr->samplecount = 0;
instr->tuplecount = 0;
+ instr->itercount = 0;
+ instr->samplecount = 0;
+ instr->nextsample = 0;
}
diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h
index fbf578d473f..fd57f865788 100644
--- a/src/include/executor/instrument.h
+++ b/src/include/executor/instrument.h
@@ -6,7 +6,7 @@
*
* Copyright (c) 2001-2006, PostgreSQL Global Development Group
*
- * $PostgreSQL: pgsql/src/include/executor/instrument.h,v 1.14 2006/05/30 14:01:58 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/executor/instrument.h,v 1.15 2006/05/30 19:24:25 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -57,20 +57,19 @@ typedef struct Instrumentation
{
/* Info about current plan cycle: */
bool running; /* TRUE if we've completed first tuple */
+ bool sampling; /* Are we sampling in current iteration? */
instr_time starttime; /* Start time of current iteration of node */
instr_time counter; /* Accumulated runtime for this node */
double firsttuple; /* Time for first tuple of this cycle */
double tuplecount; /* Tuples emitted so far this cycle */
- double samplecount; /* Samples collected this cycle */
+ double itercount; /* Plan node iterations this cycle */
+ double samplecount; /* Iterations in which we sampled runtime */
+ double nextsample; /* Next itercount to sample at */
/* Accumulated statistics across all completed cycles: */
double startup; /* Total startup time (in seconds) */
double total; /* Total total time (in seconds) */
double ntuples; /* Total tuples produced */
double nloops; /* # of run cycles for this node */
- double nsamples; /* # of samples taken */
- /* Tracking for sampling */
- bool sampling; /* Are we sampling this iteration */
- double nextsample; /* The next tuplecount we're going to sample */
} Instrumentation;
extern Instrumentation *InstrAlloc(int n);