aboutsummaryrefslogtreecommitdiff
path: root/src/backend/utils/adt/array_userfuncs.c
diff options
context:
space:
mode:
authorTom Lane <tgl@sss.pgh.pa.us>2023-04-07 11:47:07 -0400
committerTom Lane <tgl@sss.pgh.pa.us>2023-04-07 11:47:07 -0400
commit888f2ea0a81ff171087bdd1c5c1eeda3b78d73d4 (patch)
treeb596640ca993a5fd85568e173b0a8e48a7d69071 /src/backend/utils/adt/array_userfuncs.c
parentcd82e5c79d145dddd7a30ed35e4d3b83945b56f3 (diff)
downloadpostgresql-888f2ea0a81ff171087bdd1c5c1eeda3b78d73d4.tar.gz
postgresql-888f2ea0a81ff171087bdd1c5c1eeda3b78d73d4.zip
Add array_sample() and array_shuffle() functions.
These are useful in Monte Carlo applications. Martin Kalcher, reviewed/adjusted by Daniel Gustafsson and myself Discussion: https://postgr.es/m/9d160a44-7675-51e8-60cf-6d64b76db831@aboutsource.net
Diffstat (limited to 'src/backend/utils/adt/array_userfuncs.c')
-rw-r--r--src/backend/utils/adt/array_userfuncs.c166
1 files changed, 166 insertions, 0 deletions
diff --git a/src/backend/utils/adt/array_userfuncs.c b/src/backend/utils/adt/array_userfuncs.c
index 80750191d8d..33e2b98307d 100644
--- a/src/backend/utils/adt/array_userfuncs.c
+++ b/src/backend/utils/adt/array_userfuncs.c
@@ -15,6 +15,7 @@
#include "catalog/pg_type.h"
#include "libpq/pqformat.h"
#include "common/int.h"
+#include "common/pg_prng.h"
#include "port/pg_bitutils.h"
#include "utils/array.h"
#include "utils/datum.h"
@@ -1525,3 +1526,168 @@ array_positions(PG_FUNCTION_ARGS)
PG_RETURN_DATUM(makeArrayResult(astate, CurrentMemoryContext));
}
+
+/*
+ * array_shuffle_n
+ * Return a copy of array with n randomly chosen items.
+ *
+ * The number of items must not exceed the size of the first dimension of the
+ * array. We preserve the first dimension's lower bound if keep_lb,
+ * else it's set to 1. Lower-order dimensions are preserved in any case.
+ *
+ * NOTE: it would be cleaner to look up the elmlen/elmbval/elmalign info
+ * from the system catalogs, given only the elmtyp. However, the caller is
+ * in a better position to cache this info across multiple calls.
+ */
+static ArrayType *
+array_shuffle_n(ArrayType *array, int n, bool keep_lb,
+ Oid elmtyp, TypeCacheEntry *typentry)
+{
+ ArrayType *result;
+ int ndim,
+ *dims,
+ *lbs,
+ nelm,
+ nitem,
+ rdims[MAXDIM],
+ rlbs[MAXDIM];
+ int16 elmlen;
+ bool elmbyval;
+ char elmalign;
+ Datum *elms,
+ *ielms;
+ bool *nuls,
+ *inuls;
+
+ ndim = ARR_NDIM(array);
+ dims = ARR_DIMS(array);
+ lbs = ARR_LBOUND(array);
+
+ elmlen = typentry->typlen;
+ elmbyval = typentry->typbyval;
+ elmalign = typentry->typalign;
+
+ /* If the target array is empty, exit fast */
+ if (ndim < 1 || dims[0] < 1 || n < 1)
+ return construct_empty_array(elmtyp);
+
+ deconstruct_array(array, elmtyp, elmlen, elmbyval, elmalign,
+ &elms, &nuls, &nelm);
+
+ nitem = dims[0]; /* total number of items */
+ nelm /= nitem; /* number of elements per item */
+
+ Assert(n <= nitem); /* else it's caller error */
+
+ /*
+ * Shuffle array using Fisher-Yates algorithm. Scan the array and swap
+ * current item (nelm datums starting at ielms) with a randomly chosen
+ * later item (nelm datums starting at jelms) in each iteration. We can
+ * stop once we've done n iterations; then first n items are the result.
+ */
+ ielms = elms;
+ inuls = nuls;
+ for (int i = 0; i < n; i++)
+ {
+ int j = (int) pg_prng_uint64_range(&pg_global_prng_state, i, nitem - 1) * nelm;
+ Datum *jelms = elms + j;
+ bool *jnuls = nuls + j;
+
+ /* Swap i'th and j'th items; advance ielms/inuls to next item */
+ for (int k = 0; k < nelm; k++)
+ {
+ Datum elm = *ielms;
+ bool nul = *inuls;
+
+ *ielms++ = *jelms;
+ *inuls++ = *jnuls;
+ *jelms++ = elm;
+ *jnuls++ = nul;
+ }
+ }
+
+ /* Set up dimensions of the result */
+ memcpy(rdims, dims, ndim * sizeof(int));
+ memcpy(rlbs, lbs, ndim * sizeof(int));
+ rdims[0] = n;
+ if (!keep_lb)
+ rlbs[0] = 1;
+
+ result = construct_md_array(elms, nuls, ndim, rdims, rlbs,
+ elmtyp, elmlen, elmbyval, elmalign);
+
+ pfree(elms);
+ pfree(nuls);
+
+ return result;
+}
+
+/*
+ * array_shuffle
+ *
+ * Returns an array with the same dimensions as the input array, with its
+ * first-dimension elements in random order.
+ */
+Datum
+array_shuffle(PG_FUNCTION_ARGS)
+{
+ ArrayType *array = PG_GETARG_ARRAYTYPE_P(0);
+ ArrayType *result;
+ Oid elmtyp;
+ TypeCacheEntry *typentry;
+
+ /*
+ * There is no point in shuffling empty arrays or arrays with less than
+ * two items.
+ */
+ if (ARR_NDIM(array) < 1 || ARR_DIMS(array)[0] < 2)
+ PG_RETURN_ARRAYTYPE_P(array);
+
+ elmtyp = ARR_ELEMTYPE(array);
+ typentry = (TypeCacheEntry *) fcinfo->flinfo->fn_extra;
+ if (typentry == NULL || typentry->type_id != elmtyp)
+ {
+ typentry = lookup_type_cache(elmtyp, 0);
+ fcinfo->flinfo->fn_extra = (void *) typentry;
+ }
+
+ result = array_shuffle_n(array, ARR_DIMS(array)[0], true, elmtyp, typentry);
+
+ PG_RETURN_ARRAYTYPE_P(result);
+}
+
+/*
+ * array_sample
+ *
+ * Returns an array of n randomly chosen first-dimension elements
+ * from the input array.
+ */
+Datum
+array_sample(PG_FUNCTION_ARGS)
+{
+ ArrayType *array = PG_GETARG_ARRAYTYPE_P(0);
+ int n = PG_GETARG_INT32(1);
+ ArrayType *result;
+ Oid elmtyp;
+ TypeCacheEntry *typentry;
+ int nitem;
+
+ nitem = (ARR_NDIM(array) < 1) ? 0 : ARR_DIMS(array)[0];
+
+ if (n < 0 || n > nitem)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("sample size must be between 0 and %d", nitem)));
+
+ elmtyp = ARR_ELEMTYPE(array);
+ typentry = (TypeCacheEntry *) fcinfo->flinfo->fn_extra;
+ if (typentry == NULL || typentry->type_id != elmtyp)
+ {
+ typentry = lookup_type_cache(elmtyp, 0);
+ fcinfo->flinfo->fn_extra = (void *) typentry;
+ }
+
+ result = array_shuffle_n(array, n, false, elmtyp, typentry);
+
+ PG_RETURN_ARRAYTYPE_P(result);
+}