diff options
author | Tom Lane <tgl@sss.pgh.pa.us> | 2023-04-07 11:47:07 -0400 |
---|---|---|
committer | Tom Lane <tgl@sss.pgh.pa.us> | 2023-04-07 11:47:07 -0400 |
commit | 888f2ea0a81ff171087bdd1c5c1eeda3b78d73d4 (patch) | |
tree | b596640ca993a5fd85568e173b0a8e48a7d69071 /src/backend/utils/adt/array_userfuncs.c | |
parent | cd82e5c79d145dddd7a30ed35e4d3b83945b56f3 (diff) | |
download | postgresql-888f2ea0a81ff171087bdd1c5c1eeda3b78d73d4.tar.gz postgresql-888f2ea0a81ff171087bdd1c5c1eeda3b78d73d4.zip |
Add array_sample() and array_shuffle() functions.
These are useful in Monte Carlo applications.
Martin Kalcher, reviewed/adjusted by Daniel Gustafsson and myself
Discussion: https://postgr.es/m/9d160a44-7675-51e8-60cf-6d64b76db831@aboutsource.net
Diffstat (limited to 'src/backend/utils/adt/array_userfuncs.c')
-rw-r--r-- | src/backend/utils/adt/array_userfuncs.c | 166 |
1 files changed, 166 insertions, 0 deletions
diff --git a/src/backend/utils/adt/array_userfuncs.c b/src/backend/utils/adt/array_userfuncs.c index 80750191d8d..33e2b98307d 100644 --- a/src/backend/utils/adt/array_userfuncs.c +++ b/src/backend/utils/adt/array_userfuncs.c @@ -15,6 +15,7 @@ #include "catalog/pg_type.h" #include "libpq/pqformat.h" #include "common/int.h" +#include "common/pg_prng.h" #include "port/pg_bitutils.h" #include "utils/array.h" #include "utils/datum.h" @@ -1525,3 +1526,168 @@ array_positions(PG_FUNCTION_ARGS) PG_RETURN_DATUM(makeArrayResult(astate, CurrentMemoryContext)); } + +/* + * array_shuffle_n + * Return a copy of array with n randomly chosen items. + * + * The number of items must not exceed the size of the first dimension of the + * array. We preserve the first dimension's lower bound if keep_lb, + * else it's set to 1. Lower-order dimensions are preserved in any case. + * + * NOTE: it would be cleaner to look up the elmlen/elmbval/elmalign info + * from the system catalogs, given only the elmtyp. However, the caller is + * in a better position to cache this info across multiple calls. + */ +static ArrayType * +array_shuffle_n(ArrayType *array, int n, bool keep_lb, + Oid elmtyp, TypeCacheEntry *typentry) +{ + ArrayType *result; + int ndim, + *dims, + *lbs, + nelm, + nitem, + rdims[MAXDIM], + rlbs[MAXDIM]; + int16 elmlen; + bool elmbyval; + char elmalign; + Datum *elms, + *ielms; + bool *nuls, + *inuls; + + ndim = ARR_NDIM(array); + dims = ARR_DIMS(array); + lbs = ARR_LBOUND(array); + + elmlen = typentry->typlen; + elmbyval = typentry->typbyval; + elmalign = typentry->typalign; + + /* If the target array is empty, exit fast */ + if (ndim < 1 || dims[0] < 1 || n < 1) + return construct_empty_array(elmtyp); + + deconstruct_array(array, elmtyp, elmlen, elmbyval, elmalign, + &elms, &nuls, &nelm); + + nitem = dims[0]; /* total number of items */ + nelm /= nitem; /* number of elements per item */ + + Assert(n <= nitem); /* else it's caller error */ + + /* + * Shuffle array using Fisher-Yates algorithm. Scan the array and swap + * current item (nelm datums starting at ielms) with a randomly chosen + * later item (nelm datums starting at jelms) in each iteration. We can + * stop once we've done n iterations; then first n items are the result. + */ + ielms = elms; + inuls = nuls; + for (int i = 0; i < n; i++) + { + int j = (int) pg_prng_uint64_range(&pg_global_prng_state, i, nitem - 1) * nelm; + Datum *jelms = elms + j; + bool *jnuls = nuls + j; + + /* Swap i'th and j'th items; advance ielms/inuls to next item */ + for (int k = 0; k < nelm; k++) + { + Datum elm = *ielms; + bool nul = *inuls; + + *ielms++ = *jelms; + *inuls++ = *jnuls; + *jelms++ = elm; + *jnuls++ = nul; + } + } + + /* Set up dimensions of the result */ + memcpy(rdims, dims, ndim * sizeof(int)); + memcpy(rlbs, lbs, ndim * sizeof(int)); + rdims[0] = n; + if (!keep_lb) + rlbs[0] = 1; + + result = construct_md_array(elms, nuls, ndim, rdims, rlbs, + elmtyp, elmlen, elmbyval, elmalign); + + pfree(elms); + pfree(nuls); + + return result; +} + +/* + * array_shuffle + * + * Returns an array with the same dimensions as the input array, with its + * first-dimension elements in random order. + */ +Datum +array_shuffle(PG_FUNCTION_ARGS) +{ + ArrayType *array = PG_GETARG_ARRAYTYPE_P(0); + ArrayType *result; + Oid elmtyp; + TypeCacheEntry *typentry; + + /* + * There is no point in shuffling empty arrays or arrays with less than + * two items. + */ + if (ARR_NDIM(array) < 1 || ARR_DIMS(array)[0] < 2) + PG_RETURN_ARRAYTYPE_P(array); + + elmtyp = ARR_ELEMTYPE(array); + typentry = (TypeCacheEntry *) fcinfo->flinfo->fn_extra; + if (typentry == NULL || typentry->type_id != elmtyp) + { + typentry = lookup_type_cache(elmtyp, 0); + fcinfo->flinfo->fn_extra = (void *) typentry; + } + + result = array_shuffle_n(array, ARR_DIMS(array)[0], true, elmtyp, typentry); + + PG_RETURN_ARRAYTYPE_P(result); +} + +/* + * array_sample + * + * Returns an array of n randomly chosen first-dimension elements + * from the input array. + */ +Datum +array_sample(PG_FUNCTION_ARGS) +{ + ArrayType *array = PG_GETARG_ARRAYTYPE_P(0); + int n = PG_GETARG_INT32(1); + ArrayType *result; + Oid elmtyp; + TypeCacheEntry *typentry; + int nitem; + + nitem = (ARR_NDIM(array) < 1) ? 0 : ARR_DIMS(array)[0]; + + if (n < 0 || n > nitem) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("sample size must be between 0 and %d", nitem))); + + elmtyp = ARR_ELEMTYPE(array); + typentry = (TypeCacheEntry *) fcinfo->flinfo->fn_extra; + if (typentry == NULL || typentry->type_id != elmtyp) + { + typentry = lookup_type_cache(elmtyp, 0); + fcinfo->flinfo->fn_extra = (void *) typentry; + } + + result = array_shuffle_n(array, n, false, elmtyp, typentry); + + PG_RETURN_ARRAYTYPE_P(result); +} |