diff options
author | Robert Haas <rhaas@postgresql.org> | 2014-07-30 13:22:08 -0400 |
---|---|---|
committer | Robert Haas <rhaas@postgresql.org> | 2014-07-30 13:22:08 -0400 |
commit | ed802e7dc36059efbc6669b4bfeebad43f0898c1 (patch) | |
tree | d419b849c8d01a347856e07c13fc6ec88cfce8da | |
parent | e280c630a87e1b8325770c6073097d109d79a00f (diff) | |
download | postgresql-ed802e7dc36059efbc6669b4bfeebad43f0898c1.tar.gz postgresql-ed802e7dc36059efbc6669b4bfeebad43f0898c1.zip |
pgbench: Allow \setrandom to generate Gaussian/exponential distributions.
Mitsumasa KONDO and Fabien COELHO, with further wordsmithing by me.
-rw-r--r-- | contrib/pgbench/pgbench.c | 183 | ||||
-rw-r--r-- | doc/src/sgml/pgbench.sgml | 61 |
2 files changed, 231 insertions, 13 deletions
diff --git a/contrib/pgbench/pgbench.c b/contrib/pgbench/pgbench.c index 4aa8a5031a0..ad55c3cc030 100644 --- a/contrib/pgbench/pgbench.c +++ b/contrib/pgbench/pgbench.c @@ -98,6 +98,8 @@ static int pthread_join(pthread_t th, void **thread_return); #define LOG_STEP_SECONDS 5 /* seconds between log messages */ #define DEFAULT_NXACTS 10 /* default nxacts */ +#define MIN_GAUSSIAN_THRESHOLD 2.0 /* minimum threshold for gauss */ + int nxacts = 0; /* number of transactions per client */ int duration = 0; /* duration in seconds */ @@ -471,6 +473,76 @@ getrand(TState *thread, int64 min, int64 max) return min + (int64) ((max - min + 1) * pg_erand48(thread->random_state)); } +/* + * random number generator: exponential distribution from min to max inclusive. + * the threshold is so that the density of probability for the last cut-off max + * value is exp(-threshold). + */ +static int64 +getExponentialRand(TState *thread, int64 min, int64 max, double threshold) +{ + double cut, uniform, rand; + Assert(threshold > 0.0); + cut = exp(-threshold); + /* erand in [0, 1), uniform in (0, 1] */ + uniform = 1.0 - pg_erand48(thread->random_state); + /* + * inner expresion in (cut, 1] (if threshold > 0), + * rand in [0, 1) + */ + Assert((1.0 - cut) != 0.0); + rand = - log(cut + (1.0 - cut) * uniform) / threshold; + /* return int64 random number within between min and max */ + return min + (int64)((max - min + 1) * rand); +} + +/* random number generator: gaussian distribution from min to max inclusive */ +static int64 +getGaussianRand(TState *thread, int64 min, int64 max, double threshold) +{ + double stdev; + double rand; + + /* + * Get user specified random number from this loop, with + * -threshold < stdev <= threshold + * + * This loop is executed until the number is in the expected range. + * + * As the minimum threshold is 2.0, the probability of looping is low: + * sqrt(-2 ln(r)) <= 2 => r >= e^{-2} ~ 0.135, then when taking the average + * sinus multiplier as 2/pi, we have a 8.6% looping probability in the + * worst case. For a 5.0 threshold value, the looping probability + * is about e^{-5} * 2 / pi ~ 0.43%. + */ + do + { + /* + * pg_erand48 generates [0,1), but for the basic version of the + * Box-Muller transform the two uniformly distributed random numbers + * are expected in (0, 1] (see http://en.wikipedia.org/wiki/Box_muller) + */ + double rand1 = 1.0 - pg_erand48(thread->random_state); + double rand2 = 1.0 - pg_erand48(thread->random_state); + + /* Box-Muller basic form transform */ + double var_sqrt = sqrt(-2.0 * log(rand1)); + stdev = var_sqrt * sin(2.0 * M_PI * rand2); + + /* + * we may try with cos, but there may be a bias induced if the previous + * value fails the test. To be on the safe side, let us try over. + */ + } + while (stdev < -threshold || stdev >= threshold); + + /* stdev is in [-threshold, threshold), normalization to [0,1) */ + rand = (stdev + threshold) / (threshold * 2.0); + + /* return int64 random number within between min and max */ + return min + (int64)((max - min + 1) * rand); +} + /* call PQexec() and exit() on failure */ static void executeStatement(PGconn *con, const char *sql) @@ -1319,6 +1391,7 @@ top: char *var; int64 min, max; + double threshold = 0; char res[64]; if (*argv[2] == ':') @@ -1364,11 +1437,11 @@ top: } /* - * getrand() needs to be able to subtract max from min and add one - * to the result without overflowing. Since we know max > min, we - * can detect overflow just by checking for a negative result. But - * we must check both that the subtraction doesn't overflow, and - * that adding one to the result doesn't overflow either. + * Generate random number functions need to be able to subtract + * max from min and add one to the result without overflowing. + * Since we know max > min, we can detect overflow just by checking + * for a negative result. But we must check both that the subtraction + * doesn't overflow, and that adding one to the result doesn't overflow either. */ if (max - min < 0 || (max - min) + 1 < 0) { @@ -1377,10 +1450,64 @@ top: return true; } + if (argc == 4 || /* uniform without or with "uniform" keyword */ + (argc == 5 && pg_strcasecmp(argv[4], "uniform") == 0)) + { +#ifdef DEBUG + printf("min: " INT64_FORMAT " max: " INT64_FORMAT " random: " INT64_FORMAT "\n", min, max, getrand(thread, min, max)); +#endif + snprintf(res, sizeof(res), INT64_FORMAT, getrand(thread, min, max)); + } + else if (argc == 6 && + ((pg_strcasecmp(argv[4], "gaussian") == 0) || + (pg_strcasecmp(argv[4], "exponential") == 0))) + { + if (*argv[5] == ':') + { + if ((var = getVariable(st, argv[5] + 1)) == NULL) + { + fprintf(stderr, "%s: invalid threshold number %s\n", argv[0], argv[5]); + st->ecnt++; + return true; + } + threshold = strtod(var, NULL); + } + else + threshold = strtod(argv[5], NULL); + + if (pg_strcasecmp(argv[4], "gaussian") == 0) + { + if (threshold < MIN_GAUSSIAN_THRESHOLD) + { + fprintf(stderr, "%s: gaussian threshold must be at least %f\n,", argv[5], MIN_GAUSSIAN_THRESHOLD); + st->ecnt++; + return true; + } +#ifdef DEBUG + printf("min: " INT64_FORMAT " max: " INT64_FORMAT " random: " INT64_FORMAT "\n", min, max, getGaussianRand(thread, min, max, threshold)); +#endif + snprintf(res, sizeof(res), INT64_FORMAT, getGaussianRand(thread, min, max, threshold)); + } + else if (pg_strcasecmp(argv[4], "exponential") == 0) + { + if (threshold <= 0.0) + { + fprintf(stderr, "%s: exponential threshold must be strictly positive\n,", argv[5]); + st->ecnt++; + return true; + } #ifdef DEBUG - printf("min: " INT64_FORMAT " max: " INT64_FORMAT " random: " INT64_FORMAT "\n", min, max, getrand(thread, min, max)); + printf("min: " INT64_FORMAT " max: " INT64_FORMAT " random: " INT64_FORMAT "\n", min, max, getExponentialRand(thread, min, max, threshold)); #endif - snprintf(res, sizeof(res), INT64_FORMAT, getrand(thread, min, max)); + snprintf(res, sizeof(res), INT64_FORMAT, getExponentialRand(thread, min, max, threshold)); + } + } + else /* this means an error somewhere in the parsing phase... */ + { + fprintf(stderr, "%s: unexpected arguments\n", argv[0]); + st->ecnt++; + return true; + } if (!putVariable(st, argv[0], argv[1], res)) { @@ -1914,15 +2041,51 @@ process_commands(char *buf) if (pg_strcasecmp(my_commands->argv[0], "setrandom") == 0) { + /* parsing: + * \setrandom variable min max [uniform] + * \setrandom variable min max (gaussian|exponential) threshold + */ + if (my_commands->argc < 4) { fprintf(stderr, "%s: missing argument\n", my_commands->argv[0]); exit(1); } + /* argc >= 4 */ - for (j = 4; j < my_commands->argc; j++) - fprintf(stderr, "%s: extra argument \"%s\" ignored\n", - my_commands->argv[0], my_commands->argv[j]); + if (my_commands->argc == 4 || /* uniform without/with "uniform" keyword */ + (my_commands->argc == 5 && + pg_strcasecmp(my_commands->argv[4], "uniform") == 0)) + { + /* nothing to do */ + } + else if (/* argc >= 5 */ + (pg_strcasecmp(my_commands->argv[4], "gaussian") == 0) || + (pg_strcasecmp(my_commands->argv[4], "exponential") == 0)) + { + if (my_commands->argc < 6) + { + fprintf(stderr, "%s(%s): missing threshold argument\n", my_commands->argv[0], my_commands->argv[4]); + exit(1); + } + else if (my_commands->argc > 6) + { + fprintf(stderr, "%s(%s): too many arguments (extra:", + my_commands->argv[0], my_commands->argv[4]); + for (j = 6; j < my_commands->argc; j++) + fprintf(stderr, " %s", my_commands->argv[j]); + fprintf(stderr, ")\n"); + exit(1); + } + } + else /* cannot parse, unexpected arguments */ + { + fprintf(stderr, "%s: unexpected arguments (bad:", my_commands->argv[0]); + for (j = 4; j < my_commands->argc; j++) + fprintf(stderr, " %s", my_commands->argv[j]); + fprintf(stderr, ")\n"); + exit(1); + } } else if (pg_strcasecmp(my_commands->argv[0], "set") == 0) { diff --git a/doc/src/sgml/pgbench.sgml b/doc/src/sgml/pgbench.sgml index f264c245ec0..b7d88f30005 100644 --- a/doc/src/sgml/pgbench.sgml +++ b/doc/src/sgml/pgbench.sgml @@ -748,8 +748,8 @@ pgbench <optional> <replaceable>options</> </optional> <replaceable>dbname</> <varlistentry> <term> - <literal>\setrandom <replaceable>varname</> <replaceable>min</> <replaceable>max</></literal> - </term> + <literal>\setrandom <replaceable>varname</> <replaceable>min</> <replaceable>max</> [ uniform | [ { gaussian | exponential } <replaceable>threshold</> ] ]</literal> + </term> <listitem> <para> @@ -761,9 +761,64 @@ pgbench <optional> <replaceable>options</> </optional> <replaceable>dbname</> </para> <para> + By default, or when <literal>uniform</> is specified, all values in the + range are drawn with equal probability. Specifiying <literal>gaussian</> + or <literal>exponential</> options modifies this behavior; each + requires a mandatory threshold which determines the precise shape of the + distribution. + </para> + + <para> + For a Gaussian distribution, the interval is mapped onto a standard + normal distribution (the classical bell-shaped Gaussian curve) truncated + at <literal>-threshold</> on the left and <literal>+threshold</> + on the right. + To be precise, if <literal>PHI(x)</> is the cumulative distribution + function of the standard normal distribution, with mean <literal>mu</> + defined as <literal>(max + min) / 2.0</>, then value <replaceable>i</> + between <replaceable>min</> and <replaceable>max</> inclusive is drawn + with probability: + <literal> + (PHI(2.0 * threshold * (i - min - mu + 0.5) / (max - min + 1)) - + PHI(2.0 * threshold * (i - min - mu - 0.5) / (max - min + 1))) / + (2.0 * PHI(threshold) - 1.0) + </> + Intuitively, the larger the <replaceable>threshold</>, the more + frequently values close to the middle of the interval are drawn, and the + less frequently values close to the <replaceable>min</> and + <replaceable>max</> bounds. + About 67% of values are drawn from the middle <literal>1.0 / threshold</> + and 95% in the middle <literal>2.0 / threshold</>; for instance, if + <replaceable>threshold</> is 4.0, 67% of values are drawn from the middle + quarter and 95% from the middle half of the interval. + The minimum <replaceable>threshold</> is 2.0 for performance of + the Box-Muller transform. + </para> + + <para> + For an exponential distribution, the <replaceable>threshold</> + parameter controls the distribution by truncating a quickly-decreasing + exponential distribution at <replaceable>threshold</>, and then + projecting onto integers between the bounds. + To be precise, value <replaceable>i</> between <replaceable>min</> and + <replaceable>max</> inclusive is drawn with probability: + <literal>(exp(-threshold*(i-min)/(max+1-min)) - + exp(-threshold*(i+1-min)/(max+1-min))) / (1.0 - exp(-threshold))</>. + Intuitively, the larger the <replaceable>threshold</>, the more + frequently values close to <replaceable>min</> are accessed, and the + less frequently values close to <replaceable>max</> are accessed. + The closer to 0 the threshold, the flatter (more uniform) the access + distribution. + A crude approximation of the distribution is that the most frequent 1% + values in the range, close to <replaceable>min</>, are drawn + <replaceable>threshold</>% of the time. + The <replaceable>threshold</> value must be strictly positive. + </para> + + <para> Example: <programlisting> -\setrandom aid 1 :naccounts +\setrandom aid 1 :naccounts gaussian 5.0 </programlisting></para> </listitem> </varlistentry> |