mirror of
https://github.com/postgres/postgres.git
synced 2025-06-30 21:42:05 +03:00
Add array_sample() and array_shuffle() functions.
These are useful in Monte Carlo applications. Martin Kalcher, reviewed/adjusted by Daniel Gustafsson and myself Discussion: https://postgr.es/m/9d160a44-7675-51e8-60cf-6d64b76db831@aboutsource.net
This commit is contained in:
@ -15,6 +15,7 @@
|
||||
#include "catalog/pg_type.h"
|
||||
#include "libpq/pqformat.h"
|
||||
#include "common/int.h"
|
||||
#include "common/pg_prng.h"
|
||||
#include "port/pg_bitutils.h"
|
||||
#include "utils/array.h"
|
||||
#include "utils/datum.h"
|
||||
@ -1525,3 +1526,168 @@ array_positions(PG_FUNCTION_ARGS)
|
||||
|
||||
PG_RETURN_DATUM(makeArrayResult(astate, CurrentMemoryContext));
|
||||
}
|
||||
|
||||
/*
|
||||
* array_shuffle_n
|
||||
* Return a copy of array with n randomly chosen items.
|
||||
*
|
||||
* The number of items must not exceed the size of the first dimension of the
|
||||
* array. We preserve the first dimension's lower bound if keep_lb,
|
||||
* else it's set to 1. Lower-order dimensions are preserved in any case.
|
||||
*
|
||||
* NOTE: it would be cleaner to look up the elmlen/elmbval/elmalign info
|
||||
* from the system catalogs, given only the elmtyp. However, the caller is
|
||||
* in a better position to cache this info across multiple calls.
|
||||
*/
|
||||
static ArrayType *
|
||||
array_shuffle_n(ArrayType *array, int n, bool keep_lb,
|
||||
Oid elmtyp, TypeCacheEntry *typentry)
|
||||
{
|
||||
ArrayType *result;
|
||||
int ndim,
|
||||
*dims,
|
||||
*lbs,
|
||||
nelm,
|
||||
nitem,
|
||||
rdims[MAXDIM],
|
||||
rlbs[MAXDIM];
|
||||
int16 elmlen;
|
||||
bool elmbyval;
|
||||
char elmalign;
|
||||
Datum *elms,
|
||||
*ielms;
|
||||
bool *nuls,
|
||||
*inuls;
|
||||
|
||||
ndim = ARR_NDIM(array);
|
||||
dims = ARR_DIMS(array);
|
||||
lbs = ARR_LBOUND(array);
|
||||
|
||||
elmlen = typentry->typlen;
|
||||
elmbyval = typentry->typbyval;
|
||||
elmalign = typentry->typalign;
|
||||
|
||||
/* If the target array is empty, exit fast */
|
||||
if (ndim < 1 || dims[0] < 1 || n < 1)
|
||||
return construct_empty_array(elmtyp);
|
||||
|
||||
deconstruct_array(array, elmtyp, elmlen, elmbyval, elmalign,
|
||||
&elms, &nuls, &nelm);
|
||||
|
||||
nitem = dims[0]; /* total number of items */
|
||||
nelm /= nitem; /* number of elements per item */
|
||||
|
||||
Assert(n <= nitem); /* else it's caller error */
|
||||
|
||||
/*
|
||||
* Shuffle array using Fisher-Yates algorithm. Scan the array and swap
|
||||
* current item (nelm datums starting at ielms) with a randomly chosen
|
||||
* later item (nelm datums starting at jelms) in each iteration. We can
|
||||
* stop once we've done n iterations; then first n items are the result.
|
||||
*/
|
||||
ielms = elms;
|
||||
inuls = nuls;
|
||||
for (int i = 0; i < n; i++)
|
||||
{
|
||||
int j = (int) pg_prng_uint64_range(&pg_global_prng_state, i, nitem - 1) * nelm;
|
||||
Datum *jelms = elms + j;
|
||||
bool *jnuls = nuls + j;
|
||||
|
||||
/* Swap i'th and j'th items; advance ielms/inuls to next item */
|
||||
for (int k = 0; k < nelm; k++)
|
||||
{
|
||||
Datum elm = *ielms;
|
||||
bool nul = *inuls;
|
||||
|
||||
*ielms++ = *jelms;
|
||||
*inuls++ = *jnuls;
|
||||
*jelms++ = elm;
|
||||
*jnuls++ = nul;
|
||||
}
|
||||
}
|
||||
|
||||
/* Set up dimensions of the result */
|
||||
memcpy(rdims, dims, ndim * sizeof(int));
|
||||
memcpy(rlbs, lbs, ndim * sizeof(int));
|
||||
rdims[0] = n;
|
||||
if (!keep_lb)
|
||||
rlbs[0] = 1;
|
||||
|
||||
result = construct_md_array(elms, nuls, ndim, rdims, rlbs,
|
||||
elmtyp, elmlen, elmbyval, elmalign);
|
||||
|
||||
pfree(elms);
|
||||
pfree(nuls);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/*
|
||||
* array_shuffle
|
||||
*
|
||||
* Returns an array with the same dimensions as the input array, with its
|
||||
* first-dimension elements in random order.
|
||||
*/
|
||||
Datum
|
||||
array_shuffle(PG_FUNCTION_ARGS)
|
||||
{
|
||||
ArrayType *array = PG_GETARG_ARRAYTYPE_P(0);
|
||||
ArrayType *result;
|
||||
Oid elmtyp;
|
||||
TypeCacheEntry *typentry;
|
||||
|
||||
/*
|
||||
* There is no point in shuffling empty arrays or arrays with less than
|
||||
* two items.
|
||||
*/
|
||||
if (ARR_NDIM(array) < 1 || ARR_DIMS(array)[0] < 2)
|
||||
PG_RETURN_ARRAYTYPE_P(array);
|
||||
|
||||
elmtyp = ARR_ELEMTYPE(array);
|
||||
typentry = (TypeCacheEntry *) fcinfo->flinfo->fn_extra;
|
||||
if (typentry == NULL || typentry->type_id != elmtyp)
|
||||
{
|
||||
typentry = lookup_type_cache(elmtyp, 0);
|
||||
fcinfo->flinfo->fn_extra = (void *) typentry;
|
||||
}
|
||||
|
||||
result = array_shuffle_n(array, ARR_DIMS(array)[0], true, elmtyp, typentry);
|
||||
|
||||
PG_RETURN_ARRAYTYPE_P(result);
|
||||
}
|
||||
|
||||
/*
|
||||
* array_sample
|
||||
*
|
||||
* Returns an array of n randomly chosen first-dimension elements
|
||||
* from the input array.
|
||||
*/
|
||||
Datum
|
||||
array_sample(PG_FUNCTION_ARGS)
|
||||
{
|
||||
ArrayType *array = PG_GETARG_ARRAYTYPE_P(0);
|
||||
int n = PG_GETARG_INT32(1);
|
||||
ArrayType *result;
|
||||
Oid elmtyp;
|
||||
TypeCacheEntry *typentry;
|
||||
int nitem;
|
||||
|
||||
nitem = (ARR_NDIM(array) < 1) ? 0 : ARR_DIMS(array)[0];
|
||||
|
||||
if (n < 0 || n > nitem)
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
||||
errmsg("sample size must be between 0 and %d", nitem)));
|
||||
|
||||
elmtyp = ARR_ELEMTYPE(array);
|
||||
typentry = (TypeCacheEntry *) fcinfo->flinfo->fn_extra;
|
||||
if (typentry == NULL || typentry->type_id != elmtyp)
|
||||
{
|
||||
typentry = lookup_type_cache(elmtyp, 0);
|
||||
fcinfo->flinfo->fn_extra = (void *) typentry;
|
||||
}
|
||||
|
||||
result = array_shuffle_n(array, n, false, elmtyp, typentry);
|
||||
|
||||
PG_RETURN_ARRAYTYPE_P(result);
|
||||
}
|
||||
|
Reference in New Issue
Block a user