1 files changed, 130 insertions, 39 deletions
diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c
index 5f2b84ae98e..41b061bd526 100644
--- a/src/backend/executor/instrument.c
+++ b/src/backend/executor/instrument.c
@@ -7,16 +7,78 @@
  * Copyright (c) 2001-2006, PostgreSQL Global Development Group
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/executor/instrument.c,v 1.14 2006/03/05 15:58:26 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/executor/instrument.c,v 1.15 2006/05/30 14:01:58 momjian Exp $
  *
  *-------------------------------------------------------------------------
  */
 #include "postgres.h"
 
 #include <unistd.h>
+#include <math.h>
 
 #include "executor/instrument.h"
 
+/* This is the function that is used to determine the sampling intervals. In
+ * general, if the function is f(x), then for N tuples we will take on the
+ * order of integral(1/f(x), x=0..N) samples. Some examples follow, with the
+ * number of samples that would be collected over 1,000,000 tuples.
+
+  f(x) = x         =>   log2(N)                        20
+  f(x) = x^(1/2)   =>   2 * N^(1/2)                  2000
+  f(x) = x^(1/3)   =>   1.5 * N^(2/3)               15000
+  
+ * I've chosen the last one as it seems to provide a good compromise between
+ * low overhead but still getting a meaningful number of samples. However,
+ * not all machines have the cbrt() function so on those we substitute
+ * sqrt(). The difference is not very significant in the tests I made.
+*/ 
+#ifdef HAVE_CBRT
+#define SampleFunc cbrt
+#else
+#define SampleFunc sqrt
+#endif
+
+#define SAMPLE_THRESHOLD 50
+
+static double SampleOverhead;
+static bool SampleOverheadCalculated;
+
+static void
+CalculateSampleOverhead()
+{
+	Instrumentation instr;
+	int i;
+	
+	/* We want to determine the sampling overhead, to correct
+	 * calculations later. This only needs to be done once per backend.
+	 * Is this the place? A wrong value here (due to a mistimed
+	 * task-switch) will cause bad calculations later.
+	 *
+	 * To minimize the risk we do it a few times and take the lowest.
+	 */
+	
+	SampleOverhead = 1.0e6;
+	
+	for( i = 0; i<5; i++ )
+	{
+		int j;
+		double overhead;
+		
+		memset( &instr, 0, sizeof(instr) );
+	
+		/* Loop SAMPLE_THRESHOLD times or 100 microseconds, whichever is faster */
+		for( j=0; j<SAMPLE_THRESHOLD && INSTR_TIME_GET_DOUBLE(instr.counter) < 100e-6; i++ )
+		{
+			InstrStartNode( &instr );
+			InstrStopNode( &instr, 1 );
+		}
+		overhead = INSTR_TIME_GET_DOUBLE(instr.counter) / instr.samplecount;
+		if( overhead < SampleOverhead )
+			SampleOverhead = overhead;
+	}
+		
+	SampleOverheadCalculated = true;
+}
 
 /* Allocate new instrumentation structure(s) */
 Instrumentation *
@@ -25,7 +87,10 @@ InstrAlloc(int n)
 	Instrumentation *instr = palloc0(n * sizeof(Instrumentation));
 
 	/* we don't need to do any initialization except zero 'em */
-
+	
+	/* Calculate overhead, if not done yet */
+	if( !SampleOverheadCalculated )
+		CalculateSampleOverhead();
 	return instr;
 }
 
@@ -34,49 +99,69 @@ void
 InstrStartNode(Instrumentation *instr)
 {
 	if (INSTR_TIME_IS_ZERO(instr->starttime))
-		INSTR_TIME_SET_CURRENT(instr->starttime);
+	{
+		/* We always sample the first SAMPLE_THRESHOLD tuples, so small nodes are always accurate */
+		if (instr->tuplecount < SAMPLE_THRESHOLD)
+			instr->sampling = true;
+		else
+		{
+			/* Otherwise we go to sampling, see the comments on SampleFunc at the top of the file */
+			if( instr->tuplecount > instr->nextsample )
+			{
+				instr->sampling = true;
+				/* The doubling is so the random will average 1 over time */
+				instr->nextsample += 2.0 * SampleFunc(instr->tuplecount) * (double)rand() / (double)RAND_MAX;
+			}
+		}
+		if (instr->sampling)
+			INSTR_TIME_SET_CURRENT(instr->starttime);
+	}
 	else
 		elog(DEBUG2, "InstrStartNode called twice in a row");
 }
 
 /* Exit from a plan node */
 void
-InstrStopNode(Instrumentation *instr, bool returnedTuple)
+InstrStopNode(Instrumentation *instr, double nTuples)
 {
 	instr_time	endtime;
 
 	/* count the returned tuples */
-	if (returnedTuple)
-		instr->tuplecount += 1;
+	instr->tuplecount += nTuples;
 
-	if (INSTR_TIME_IS_ZERO(instr->starttime))
+	if (instr->sampling)
 	{
-		elog(DEBUG2, "InstrStopNode called without start");
-		return;
-	}
+		if (INSTR_TIME_IS_ZERO(instr->starttime))
+		{
+			elog(DEBUG2, "InstrStopNode called without start");
+			return;
+		}
 
-	INSTR_TIME_SET_CURRENT(endtime);
+		INSTR_TIME_SET_CURRENT(endtime);
 
 #ifndef WIN32
-	instr->counter.tv_sec += endtime.tv_sec - instr->starttime.tv_sec;
-	instr->counter.tv_usec += endtime.tv_usec - instr->starttime.tv_usec;
-
-	/* Normalize after each add to avoid overflow/underflow of tv_usec */
-	while (instr->counter.tv_usec < 0)
-	{
-		instr->counter.tv_usec += 1000000;
-		instr->counter.tv_sec--;
-	}
-	while (instr->counter.tv_usec >= 1000000)
-	{
-		instr->counter.tv_usec -= 1000000;
-		instr->counter.tv_sec++;
-	}
+		instr->counter.tv_sec += endtime.tv_sec - instr->starttime.tv_sec;
+		instr->counter.tv_usec += endtime.tv_usec - instr->starttime.tv_usec;
+
+		/* Normalize after each add to avoid overflow/underflow of tv_usec */
+		while (instr->counter.tv_usec < 0)
+		{
+			instr->counter.tv_usec += 1000000;
+			instr->counter.tv_sec--;
+		}
+		while (instr->counter.tv_usec >= 1000000)
+		{
+			instr->counter.tv_usec -= 1000000;
+			instr->counter.tv_sec++;
+		}
 #else							/* WIN32 */
-	instr->counter.QuadPart += (endtime.QuadPart - instr->starttime.QuadPart);
+		instr->counter.QuadPart += (endtime.QuadPart - instr->starttime.QuadPart);
 #endif
 
-	INSTR_TIME_SET_ZERO(instr->starttime);
+		INSTR_TIME_SET_ZERO(instr->starttime);
+		instr->samplecount += nTuples;
+		instr->sampling = false;
+	}
 
 	/* Is this the first tuple of this cycle? */
 	if (!instr->running)
@@ -86,17 +171,6 @@ InstrStopNode(Instrumentation *instr, bool returnedTuple)
 	}
 }
 
-/* As above, but count multiple tuples returned at once */
-void
-InstrStopNodeMulti(Instrumentation *instr, double nTuples)
-{
-	/* count the returned tuples */
-	instr->tuplecount += nTuples;
-
-	/* delegate the rest */
-	InstrStopNode(instr, false);
-}
-
 /* Finish a run cycle for a plan node */
 void
 InstrEndLoop(Instrumentation *instr)
@@ -114,8 +188,24 @@ InstrEndLoop(Instrumentation *instr)
 	totaltime = INSTR_TIME_GET_DOUBLE(instr->counter);
 
 	instr->startup += instr->firsttuple;
-	instr->total += totaltime;
+
+	/* Here we take into account sampling effects. Doing it naively ends
+	 * up assuming the sampling overhead applies to all tuples, even the
+	 * ones we didn't measure. We've calculated an overhead, so we
+	 * subtract that for all samples we didn't measure. The first tuple
+	 * is also special cased, because it usually takes longer. */
+
+	if( instr->samplecount < instr->tuplecount )
+	{
+		double pertuple = (totaltime - instr->firsttuple) / (instr->samplecount - 1);
+		instr->total += instr->firsttuple + (pertuple * (instr->samplecount - 1))
+		                                  + ((pertuple - SampleOverhead) * (instr->tuplecount - instr->samplecount));
+	}
+	else
+		instr->total += totaltime;
+		
 	instr->ntuples += instr->tuplecount;
+	instr->nsamples += instr->samplecount;
 	instr->nloops += 1;
 
 	/* Reset for next cycle (if any) */
@@ -123,5 +213,6 @@ InstrEndLoop(Instrumentation *instr)
 	INSTR_TIME_SET_ZERO(instr->starttime);
 	INSTR_TIME_SET_ZERO(instr->counter);
 	instr->firsttuple = 0;
+	instr->samplecount = 0;
 	instr->tuplecount = 0;
 }