From 8634b18abdb30629b4b869f0c81bb7a350832baa Mon Sep 17 00:00:00 2001 From: Warren Weckesser Date: Fri, 18 Oct 2019 04:12:24 -0400 Subject: [PATCH 1/2] ENH: random: Add the multivariate hypergeometric distribution The new method multivariate_hypergeometric(self, object colors, object nsample, size=None, method='marginals') of the class numpy.random.Generator implements the multivariate hypergeometric distribution; see https://en.wikipedia.org/wiki/Hypergeometric_distribution, specifically the section "Multivariate hypergeometric distribution". Two algorithms are implemented. The user selects which algorithm to use with the `method` parameter. The default, `method='marginals'`, is based on repeated calls of the univariate hypergeometric distribution function. The other algorithm, selected with `method='count'`, is a brute-force method that allocates an internal array of length ``sum(colors)``. It should only be used when that value is small, but it can be much faster than the "marginals" algorithm in that case. The C implementations of the two methods are in the files random_mvhg_count.c and random_mvhg_marginals.c in numpy/random/src/distributions. --- .../upcoming_changes/13794.new_function.rst | 5 + numpy/random/_generator.pyx | 249 +++++++++++++++++- numpy/random/include/distributions.h | 14 + numpy/random/setup.py | 3 +- .../src/distributions/random_mvhg_count.c | 131 +++++++++ .../src/distributions/random_mvhg_marginals.c | 138 ++++++++++ numpy/random/tests/test_generator_mt19937.py | 136 +++++++++- 7 files changed, 673 insertions(+), 3 deletions(-) create mode 100644 doc/release/upcoming_changes/13794.new_function.rst create mode 100644 numpy/random/src/distributions/random_mvhg_count.c create mode 100644 numpy/random/src/distributions/random_mvhg_marginals.c diff --git a/doc/release/upcoming_changes/13794.new_function.rst b/doc/release/upcoming_changes/13794.new_function.rst new file mode 100644 index 000000000000..cf8b38bb06cb --- /dev/null +++ b/doc/release/upcoming_changes/13794.new_function.rst @@ -0,0 +1,5 @@ +Multivariate hypergeometric distribution added to `numpy.random` +---------------------------------------------------------------- +The method `multivariate_hypergeometric` has been added to the class +`numpy.random.Generator`. This method generates random variates from +the multivariate hypergeometric probability distribution. diff --git a/numpy/random/_generator.pyx b/numpy/random/_generator.pyx index 22b17ab031ad..6d9fe20f9ce8 100644 --- a/numpy/random/_generator.pyx +++ b/numpy/random/_generator.pyx @@ -13,7 +13,7 @@ from numpy.core.multiarray import normalize_axis_index from libc cimport string from libc.stdint cimport (uint8_t, uint16_t, uint32_t, uint64_t, - int32_t, int64_t) + int32_t, int64_t, INT64_MAX, SIZE_MAX) from ._bounded_integers cimport (_rand_bool, _rand_int32, _rand_int64, _rand_int16, _rand_int8, _rand_uint64, _rand_uint32, _rand_uint16, _rand_uint8, _gen_mask) @@ -126,9 +126,38 @@ cdef extern from "include/distributions.h": void random_multinomial(bitgen_t *bitgen_state, int64_t n, int64_t *mnix, double *pix, np.npy_intp d, binomial_t *binomial) nogil + int random_mvhg_count(bitgen_t *bitgen_state, + int64_t total, + size_t num_colors, int64_t *colors, + int64_t nsample, + size_t num_variates, int64_t *variates) nogil + void random_mvhg_marginals(bitgen_t *bitgen_state, + int64_t total, + size_t num_colors, int64_t *colors, + int64_t nsample, + size_t num_variates, int64_t *variates) nogil + np.import_array() +cdef int64_t _safe_sum_nonneg_int64(size_t num_colors, int64_t *colors): + """ + Sum the values in the array `colors`. + + Return -1 if an overflow occurs. + The values in *colors are assumed to be nonnegative. + """ + cdef size_t i + cdef int64_t sum + + sum = 0 + for i in range(num_colors): + if colors[i] > INT64_MAX - sum: + return -1 + sum += colors[i] + return sum + + cdef bint _check_bit_generator(object bitgen): """Check if an object satisfies the BitGenerator interface. """ @@ -3241,6 +3270,8 @@ cdef class Generator: See Also -------- + multivariate_hypergeometric : Draw samples from the multivariate + hypergeometric distribution. scipy.stats.hypergeom : probability density function, distribution or cumulative density function, etc. @@ -3739,6 +3770,222 @@ cdef class Generator: return multin + def multivariate_hypergeometric(self, object colors, object nsample, + size=None, method='marginals'): + """ + multivariate_hypergeometric(colors, nsample, size=None, + method='marginals') + + Generate variates from a multivariate hypergeometric distribution. + + The multivariate hypergeometric distribution is a generalization + of the hypergeometric distribution. + + Choose ``nsample`` items at random without replacement from a + collection with ``N`` distinct types. ``N`` is the length of + ``colors``, and the values in ``colors`` are the number of occurrences + of that type in the collection. The total number of items in the + collection is ``sum(colors)``. Each random variate generated by this + function is a vector of length ``N`` holding the counts of the + different types that occurred in the ``nsample`` items. + + The name ``colors`` comes from a common description of the + distribution: it is the probability distribution of the number of + marbles of each color selected without replacement from an urn + containing marbles of different colors; ``colors[i]`` is the number + of marbles in the urn with color ``i``. + + Parameters + ---------- + colors : sequence of integers + The number of each type of item in the collection from which + a sample is drawn. The values in ``colors`` must be nonnegative. + To avoid loss of precision in the algorithm, ``sum(colors)`` + must be less than ``10**9`` when `method` is "marginals". + nsample : int + The number of items selected. ``nsample`` must not be greater + than ``sum(colors)``. + size : int or tuple of ints, optional + The number of variates to generate, either an integer or a tuple + holding the shape of the array of variates. If the given size is, + e.g., ``(k, m)``, then ``k * m`` variates are drawn, where one + variate is a vector of length ``len(colors)``, and the return value + has shape ``(k, m, len(colors))``. If `size` is an integer, the + output has shape ``(size, len(colors))``. Default is None, in + which case a single variate is returned as an array with shape + ``(len(colors),)``. + method : string, optional + Specify the algorithm that is used to generate the variates. + Must be 'count' or 'marginals' (the default). See the Notes + for a description of the methods. + + Returns + ------- + variates : ndarray + Array of variates drawn from the multivariate hypergeometric + distribution. + + See Also + -------- + hypergeometric : Draw samples from the (univariate) hypergeometric + distribution. + + Notes + ----- + The two methods do not return the same sequence of variates. + + The "count" algorithm is roughly equivalent to the following numpy + code:: + + choices = np.repeat(np.arange(len(colors)), colors) + selection = np.random.choice(choices, nsample, replace=False) + variate = np.bincount(selection, minlength=len(colors)) + + The "count" algorithm uses a temporary array of integers with length + ``sum(colors)``. + + The "marginals" algorithm generates a variate by using repeated + calls to the univariate hypergeometric sampler. It is roughly + equivalent to:: + + variate = np.zeros(len(colors), dtype=np.int64) + # `remaining` is the cumulative sum of `colors` from the last + # element to the first; e.g. if `colors` is [3, 1, 5], then + # `remaining` is [9, 6, 5]. + remaining = np.cumsum(colors[::-1])[::-1] + for i in range(len(colors)-1): + if nsample < 1: + break + variate[i] = hypergeometric(colors[i], remaining[i+1], + nsample) + nsample -= variate[i] + variate[-1] = nsample + + The default method is "marginals". For some cases (e.g. when + `colors` contains relatively small integers), the "count" method + can be significantly faster than the "marginals" method. If + performance of the algorithm is important, test the two methods + with typical inputs to decide which works best. + + .. versionadded:: 1.18.0 + + Examples + -------- + >>> colors = [16, 8, 4] + >>> seed = 4861946401452 + >>> gen = np.random.Generator(np.random.PCG64(seed)) + >>> gen.multivariate_hypergeometric(colors, 6) + array([5, 0, 1]) + >>> gen.multivariate_hypergeometric(colors, 6, size=3) + array([[5, 0, 1], + [2, 2, 2], + [3, 3, 0]]) + >>> gen.multivariate_hypergeometric(colors, 6, size=(2, 2)) + array([[[3, 2, 1], + [3, 2, 1]], + [[4, 1, 1], + [3, 2, 1]]]) + """ + cdef int64_t nsamp + cdef size_t num_colors + cdef int64_t total + cdef int64_t *colors_ptr + cdef int64_t max_index + cdef size_t num_variates + cdef int64_t *variates_ptr + cdef int result + + if method not in ['count', 'marginals']: + raise ValueError('method must be "count" or "marginals".') + + try: + operator.index(nsample) + except TypeError: + raise ValueError('nsample must be an integer') + + if nsample < 0: + raise ValueError("nsample must be nonnegative.") + if nsample > INT64_MAX: + raise ValueError("nsample must not exceed %d" % INT64_MAX) + nsamp = nsample + + # Validation of colors, a 1-d sequence of nonnegative integers. + invalid_colors = False + try: + colors = np.asarray(colors) + if colors.ndim != 1: + invalid_colors = True + elif colors.size > 0 and not np.issubdtype(colors.dtype, + np.integer): + invalid_colors = True + elif np.any((colors < 0) | (colors > INT64_MAX)): + invalid_colors = True + except ValueError: + invalid_colors = True + if invalid_colors: + raise ValueError('colors must be a one-dimensional sequence ' + 'of nonnegative integers not exceeding %d.' % + INT64_MAX) + + colors = np.ascontiguousarray(colors, dtype=np.int64) + num_colors = colors.size + + colors_ptr = np.PyArray_DATA(colors) + + total = _safe_sum_nonneg_int64(num_colors, colors_ptr) + if total == -1: + raise ValueError("sum(colors) must not exceed the maximum value " + "of a 64 bit signed integer (%d)" % INT64_MAX) + + if method == 'marginals' and total >= 1000000000: + raise ValueError('When method is "marginals", sum(colors) must ' + 'be less than 1000000000.') + + # The C code that implements the 'count' method will malloc an + # array of size total*sizeof(size_t). Here we ensure that that + # product does not overflow. + if SIZE_MAX > INT64_MAX: + max_index = INT64_MAX // sizeof(size_t) + else: + max_index = SIZE_MAX // sizeof(size_t) + if method == 'count' and total > max_index: + raise ValueError("When method is 'count', sum(colors) must not " + "exceed %d" % max_index) + if nsamp > total: + raise ValueError("nsample > sum(colors)") + + # Figure out the shape of the return array. + if size is None: + shape = (num_colors,) + elif np.isscalar(size): + shape = (size, num_colors) + else: + shape = tuple(size) + (num_colors,) + variates = np.zeros(shape, dtype=np.int64) + + if num_colors == 0: + return variates + + # One variate is a vector of length num_colors. + num_variates = variates.size // num_colors + variates_ptr = np.PyArray_DATA(variates) + + if method == 'count': + with self.lock, nogil: + result = random_mvhg_count(&self._bitgen, total, + num_colors, colors_ptr, nsamp, + num_variates, variates_ptr) + if result == -1: + raise MemoryError("Insufficent memory for multivariate_" + "hypergeometric with method='count' and " + "sum(colors)=%d" % total) + else: + with self.lock, nogil: + random_mvhg_marginals(&self._bitgen, total, + num_colors, colors_ptr, nsamp, + num_variates, variates_ptr) + return variates + def dirichlet(self, object alpha, size=None): """ dirichlet(alpha, size=None) diff --git a/numpy/random/include/distributions.h b/numpy/random/include/distributions.h index fb69c7d2cef4..c02ea605eda2 100644 --- a/numpy/random/include/distributions.h +++ b/numpy/random/include/distributions.h @@ -170,6 +170,20 @@ DECLDIR void random_bounded_bool_fill(bitgen_t *bitgen_state, npy_bool off, DECLDIR void random_multinomial(bitgen_t *bitgen_state, RAND_INT_TYPE n, RAND_INT_TYPE *mnix, double *pix, npy_intp d, binomial_t *binomial); +/* multivariate hypergeometric, "count" method */ +DECLDIR int random_mvhg_count(bitgen_t *bitgen_state, + int64_t total, + size_t num_colors, int64_t *colors, + int64_t nsample, + size_t num_variates, int64_t *variates); + +/* multivariate hypergeometric, "marginals" method */ +DECLDIR void random_mvhg_marginals(bitgen_t *bitgen_state, + int64_t total, + size_t num_colors, int64_t *colors, + int64_t nsample, + size_t num_variates, int64_t *variates); + /* Common to legacy-distributions.c and distributions.c but not exported */ RAND_INT_TYPE random_binomial_btpe(bitgen_t *bitgen_state, diff --git a/numpy/random/setup.py b/numpy/random/setup.py index ddb16339bfa9..ca01250f4373 100644 --- a/numpy/random/setup.py +++ b/numpy/random/setup.py @@ -100,6 +100,8 @@ def generate_libraries(ext, build_dir): other_srcs = [ 'src/distributions/logfactorial.c', 'src/distributions/distributions.c', + 'src/distributions/random_mvhg_count.c', + 'src/distributions/random_mvhg_marginals.c', 'src/distributions/random_hypergeometric.c', ] for gen in ['_generator', '_bounded_integers']: @@ -114,7 +116,6 @@ def generate_libraries(ext, build_dir): define_macros=defs, ) config.add_extension('mtrand', - # mtrand does not depend on random_hypergeometric.c. sources=['mtrand.c', 'src/legacy/legacy-distributions.c', 'src/distributions/logfactorial.c', diff --git a/numpy/random/src/distributions/random_mvhg_count.c b/numpy/random/src/distributions/random_mvhg_count.c new file mode 100644 index 000000000000..9c0cc045dc28 --- /dev/null +++ b/numpy/random/src/distributions/random_mvhg_count.c @@ -0,0 +1,131 @@ +#include +#include +#include + +#include "include/distributions.h" + +/* + * random_mvhg_count + * + * Draw variates from the multivariate hypergeometric distribution-- + * the "count" algorithm. + * + * Parameters + * ---------- + * bitgen_t *bitgen_state + * Pointer to a `bitgen_t` instance. + * int64_t total + * The sum of the values in the array `colors`. (This is redundant + * information, but we know the caller has already computed it, so + * we might as well use it.) + * size_t num_colors + * The length of the `colors` array. + * int64_t *colors + * The array of colors (i.e. the number of each type in the collection + * from which the random variate is drawn). + * int64_t nsample + * The number of objects drawn without replacement for each variate. + * `nsample` must not exceed sum(colors). This condition is not checked; + * it is assumed that the caller has already validated the value. + * size_t num_variates + * The number of variates to be produced and put in the array + * pointed to by `variates`. One variate is a vector of length + * `num_colors`, so the array pointed to by `variates` must have length + * `num_variates * num_colors`. + * int64_t *variates + * The array that will hold the result. It must have length + * `num_variates * num_colors`. + * The array is not initialized in the function; it is expected that the + * array has been initialized with zeros when the function is called. + * + * Notes + * ----- + * The "count" algorithm for drawing one variate is roughly equivalent to the + * following numpy code: + * + * choices = np.repeat(np.arange(len(colors)), colors) + * selection = np.random.choice(choices, nsample, replace=False) + * variate = np.bincount(selection, minlength=len(colors)) + * + * This function uses a temporary array with length sum(colors). + * + * Assumptions on the arguments (not checked in the function): + * * colors[k] >= 0 for k in range(num_colors) + * * total = sum(colors) + * * 0 <= nsample <= total + * * the product total * sizeof(size_t) does not exceed SIZE_MAX + * * the product num_variates * num_colors does not overflow + */ + +int random_mvhg_count(bitgen_t *bitgen_state, + int64_t total, + size_t num_colors, int64_t *colors, + int64_t nsample, + size_t num_variates, int64_t *variates) +{ + size_t *choices; + bool more_than_half; + + if ((total == 0) || (nsample == 0) || (num_variates == 0)) { + // Nothing to do. + return 0; + } + + choices = malloc(total * (sizeof *choices)); + if (choices == NULL) { + return -1; + } + + /* + * If colors contains, for example, [3 2 5], then choices + * will contain [0 0 0 1 1 2 2 2 2 2]. + */ + for (size_t i = 0, k = 0; i < num_colors; ++i) { + for (int64_t j = 0; j < colors[i]; ++j) { + choices[k] = i; + ++k; + } + } + + more_than_half = nsample > (total / 2); + if (more_than_half) { + nsample = total - nsample; + } + + for (size_t i = 0; i < num_variates * num_colors; i += num_colors) { + /* + * Fisher-Yates shuffle, but only loop through the first + * `nsample` entries of `choices`. After the loop, + * choices[:nsample] contains a random sample from the + * the full array. + */ + for (size_t j = 0; j < (size_t) nsample; ++j) { + size_t tmp, k; + // Note: nsample is not greater than total, so there is no danger + // of integer underflow in `(size_t) total - j - 1`. + k = j + (size_t) random_interval(bitgen_state, + (size_t) total - j - 1); + tmp = choices[k]; + choices[k] = choices[j]; + choices[j] = tmp; + } + /* + * Count the number of occurrences of each value in choices[:nsample]. + * The result, stored in sample[i:i+num_colors], is the sample from + * the multivariate hypergeometric distribution. + */ + for (size_t j = 0; j < (size_t) nsample; ++j) { + variates[i + choices[j]] += 1; + } + + if (more_than_half) { + for (size_t k = 0; k < num_colors; ++k) { + variates[i + k] = colors[k] - variates[i + k]; + } + } + } + + free(choices); + + return 0; +} diff --git a/numpy/random/src/distributions/random_mvhg_marginals.c b/numpy/random/src/distributions/random_mvhg_marginals.c new file mode 100644 index 000000000000..301a4acad0a5 --- /dev/null +++ b/numpy/random/src/distributions/random_mvhg_marginals.c @@ -0,0 +1,138 @@ +#include +#include +#include +#include + +#include "include/distributions.h" +#include "logfactorial.h" + + +/* + * random_mvhg_marginals + * + * Draw samples from the multivariate hypergeometric distribution-- + * the "marginals" algorithm. + * + * This version generates the sample by iteratively calling + * hypergeometric() (the univariate hypergeometric distribution). + * + * Parameters + * ---------- + * bitgen_t *bitgen_state + * Pointer to a `bitgen_t` instance. + * int64_t total + * The sum of the values in the array `colors`. (This is redundant + * information, but we know the caller has already computed it, so + * we might as well use it.) + * size_t num_colors + * The length of the `colors` array. The functions assumes + * num_colors > 0. + * int64_t *colors + * The array of colors (i.e. the number of each type in the collection + * from which the random variate is drawn). + * int64_t nsample + * The number of objects drawn without replacement for each variate. + * `nsample` must not exceed sum(colors). This condition is not checked; + * it is assumed that the caller has already validated the value. + * size_t num_variates + * The number of variates to be produced and put in the array + * pointed to by `variates`. One variate is a vector of length + * `num_colors`, so the array pointed to by `variates` must have length + * `num_variates * num_colors`. + * int64_t *variates + * The array that will hold the result. It must have length + * `num_variates * num_colors`. + * The array is not initialized in the function; it is expected that the + * array has been initialized with zeros when the function is called. + * + * Notes + * ----- + * Here's an example that demonstrates the idea of this algorithm. + * + * Suppose the urn contains red, green, blue and yellow marbles. + * Let nred be the number of red marbles, and define the quantities for + * the other colors similarly. The total number of marbles is + * + * total = nred + ngreen + nblue + nyellow. + * + * To generate a sample using rk_hypergeometric: + * + * red_sample = hypergeometric(ngood=nred, nbad=total - nred, + * nsample=nsample) + * + * This gives us the number of red marbles in the sample. The number of + * marbles in the sample that are *not* red is nsample - red_sample. + * To figure out the distribution of those marbles, we again use + * rk_hypergeometric: + * + * green_sample = hypergeometric(ngood=ngreen, + * nbad=total - nred - ngreen, + * nsample=nsample - red_sample) + * + * Similarly, + * + * blue_sample = hypergeometric( + * ngood=nblue, + * nbad=total - nred - ngreen - nblue, + * nsample=nsample - red_sample - green_sample) + * + * Finally, + * + * yellow_sample = total - (red_sample + green_sample + blue_sample). + * + * The above sequence of steps is implemented as a loop for an arbitrary + * number of colors in the innermost loop in the code below. `remaining` + * is the value passed to `nbad`; it is `total - colors[0]` in the first + * call to random_hypergeometric(), and then decreases by `colors[j]` in + * each iteration. `num_to_sample` is the `nsample` argument. It + * starts at this function's `nsample` input, and is decreased by the + * result of the call to random_hypergeometric() in each iteration. + * + * Assumptions on the arguments (not checked in the function): + * * colors[k] >= 0 for k in range(num_colors) + * * total = sum(colors) + * * 0 <= nsample <= total + * * the product num_variates * num_colors does not overflow + */ + +void random_mvhg_marginals(bitgen_t *bitgen_state, + int64_t total, + size_t num_colors, int64_t *colors, + int64_t nsample, + size_t num_variates, int64_t *variates) +{ + bool more_than_half; + + if ((total == 0) || (nsample == 0) || (num_variates == 0)) { + // Nothing to do. + return; + } + + more_than_half = nsample > (total / 2); + if (more_than_half) { + nsample = total - nsample; + } + + for (size_t i = 0; i < num_variates * num_colors; i += num_colors) { + int64_t num_to_sample = nsample; + int64_t remaining = total; + for (size_t j = 0; (num_to_sample > 0) && (j + 1 < num_colors); ++j) { + int64_t r; + remaining -= colors[j]; + r = random_hypergeometric(bitgen_state, + colors[j], remaining, num_to_sample); + variates[i + j] = r; + num_to_sample -= r; + } + + if (num_to_sample > 0) { + variates[i + num_colors - 1] = num_to_sample; + } + + if (more_than_half) { + for (size_t k = 0; k < num_colors; ++k) { + variates[i + k] = colors[k] - variates[i + k]; + } + } + } +} diff --git a/numpy/random/tests/test_generator_mt19937.py b/numpy/random/tests/test_generator_mt19937.py index 391c33c1aeb1..526275dda03b 100644 --- a/numpy/random/tests/test_generator_mt19937.py +++ b/numpy/random/tests/test_generator_mt19937.py @@ -4,7 +4,7 @@ import numpy as np from numpy.testing import ( - assert_, assert_raises, assert_equal, + assert_, assert_raises, assert_equal, assert_allclose, assert_warns, assert_no_warnings, assert_array_equal, assert_array_almost_equal, suppress_warnings) @@ -115,6 +115,140 @@ def test_p_non_contiguous(self): assert_array_equal(non_contig, contig) +class TestMultivariateHypergeometric(object): + + def setup(self): + self.seed = 8675309 + + def test_argument_validation(self): + # Error cases... + + # `colors` must be a 1-d sequence + assert_raises(ValueError, random.multivariate_hypergeometric, + 10, 4) + + # Negative nsample + assert_raises(ValueError, random.multivariate_hypergeometric, + [2, 3, 4], -1) + + # Negative color + assert_raises(ValueError, random.multivariate_hypergeometric, + [-1, 2, 3], 2) + + # nsample exceeds sum(colors) + assert_raises(ValueError, random.multivariate_hypergeometric, + [2, 3, 4], 10) + + # nsample exceeds sum(colors) (edge case of empty colors) + assert_raises(ValueError, random.multivariate_hypergeometric, + [], 1) + + # Validation errors associated with very large values in colors. + assert_raises(ValueError, random.multivariate_hypergeometric, + [999999999, 101], 5, 1, 'marginals') + + int64_info = np.iinfo(np.int64) + max_int64 = int64_info.max + max_int64_index = max_int64 // int64_info.dtype.itemsize + assert_raises(ValueError, random.multivariate_hypergeometric, + [max_int64_index - 100, 101], 5, 1, 'count') + + @pytest.mark.parametrize('method', ['count', 'marginals']) + def test_edge_cases(self, method): + # Set the seed, but in fact, all the results in this test are + # deterministic, so we don't really need this. + random = Generator(MT19937(self.seed)) + + x = random.multivariate_hypergeometric([0, 0, 0], 0, method=method) + assert_array_equal(x, [0, 0, 0]) + + x = random.multivariate_hypergeometric([], 0, method=method) + assert_array_equal(x, []) + + x = random.multivariate_hypergeometric([], 0, size=1, method=method) + assert_array_equal(x, np.empty((1, 0), dtype=np.int64)) + + x = random.multivariate_hypergeometric([1, 2, 3], 0, method=method) + assert_array_equal(x, [0, 0, 0]) + + x = random.multivariate_hypergeometric([9, 0, 0], 3, method=method) + assert_array_equal(x, [3, 0, 0]) + + colors = [1, 1, 0, 1, 1] + x = random.multivariate_hypergeometric(colors, sum(colors), + method=method) + assert_array_equal(x, colors) + + x = random.multivariate_hypergeometric([3, 4, 5], 12, size=3, + method=method) + assert_array_equal(x, [[3, 4, 5]]*3) + + # Cases for nsample: + # nsample < 10 + # 10 <= nsample < colors.sum()/2 + # colors.sum()/2 < nsample < colors.sum() - 10 + # colors.sum() - 10 < nsample < colors.sum() + @pytest.mark.parametrize('nsample', [8, 25, 45, 55]) + @pytest.mark.parametrize('method', ['count', 'marginals']) + @pytest.mark.parametrize('size', [5, (2, 3), 150000]) + def test_typical_cases(self, nsample, method, size): + random = Generator(MT19937(self.seed)) + + colors = np.array([10, 5, 20, 25]) + sample = random.multivariate_hypergeometric(colors, nsample, size, + method=method) + if isinstance(size, int): + expected_shape = (size,) + colors.shape + else: + expected_shape = size + colors.shape + assert_equal(sample.shape, expected_shape) + assert_((sample >= 0).all()) + assert_((sample <= colors).all()) + assert_array_equal(sample.sum(axis=-1), + np.full(size, fill_value=nsample, dtype=int)) + if isinstance(size, int) and size >= 100000: + # This sample is large enough to compare its mean to + # the expected values. + assert_allclose(sample.mean(axis=0), + nsample * colors / colors.sum(), + rtol=1e-3, atol=0.005) + + def test_repeatability1(self): + random = Generator(MT19937(self.seed)) + sample = random.multivariate_hypergeometric([3, 4, 5], 5, size=5, + method='count') + expected = np.array([[2, 1, 2], + [2, 1, 2], + [1, 1, 3], + [2, 0, 3], + [2, 1, 2]]) + assert_array_equal(sample, expected) + + def test_repeatability2(self): + random = Generator(MT19937(self.seed)) + sample = random.multivariate_hypergeometric([20, 30, 50], 50, + size=5, + method='marginals') + expected = np.array([[ 9, 17, 24], + [ 7, 13, 30], + [ 9, 15, 26], + [ 9, 17, 24], + [12, 14, 24]]) + assert_array_equal(sample, expected) + + def test_repeatability3(self): + random = Generator(MT19937(self.seed)) + sample = random.multivariate_hypergeometric([20, 30, 50], 12, + size=5, + method='marginals') + expected = np.array([[2, 3, 7], + [5, 3, 4], + [2, 5, 5], + [5, 3, 4], + [1, 5, 6]]) + assert_array_equal(sample, expected) + + class TestSetState(object): def setup(self): self.seed = 1234567890 From 0455447b5019ab40cf9cfa4b2225c1260824bd6b Mon Sep 17 00:00:00 2001 From: Warren Weckesser Date: Fri, 18 Oct 2019 14:11:38 -0400 Subject: [PATCH 2/2] DOC: random: Add entry for multivariate_hypergeometric to the Generator docs. --- doc/source/reference/random/generator.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/reference/random/generator.rst b/doc/source/reference/random/generator.rst index 068143270481..a2cbb493a7f1 100644 --- a/doc/source/reference/random/generator.rst +++ b/doc/source/reference/random/generator.rst @@ -62,6 +62,7 @@ Distributions ~numpy.random.Generator.lognormal ~numpy.random.Generator.logseries ~numpy.random.Generator.multinomial + ~numpy.random.Generator.multivariate_hypergeometric ~numpy.random.Generator.multivariate_normal ~numpy.random.Generator.negative_binomial ~numpy.random.Generator.noncentral_chisquare