ENH: Add Floyd's algorithm and call from RandomState.choice, see #2764

Implement Floyd's algorithm for random sampling without replacement.

Reference: Bentley, J., Floyd, B., "Programming Pearls: A Sample of Brilliance", Communications of the ACM, Vol. 30, No. 9, 1987.

The crux is the implementation of the hash set. After some benchmarks I settled on the same basic design as used by "khash", see github.com/attractivechaos/klib/blob/master/khash.h

This means:
* Open-addressing scheme, with quadratic probing.
* The size of the underlying array is a power of 2.
* The maximum load factor is chosen to be 0.77.

Other implementation details:
* Negative values are used as flags for empty slots.
* A simple randomizing hash function is used to handle the insertion of consecutive integers.
numpy · ghost · Dec 9, 2015 · Dec 12, 2015 · Dec 12, 2015 · Dec 9, 2015
commit aedf7d58a15de0a5df84cb2cfbcad3bfdddaeec5
diff --git a/numpy/random/mtrand/mtrand.pyx b/numpy/random/mtrand/mtrand.pyx
@@ -997,6 +997,46 @@ cdef class RandomState:
         return bytestring
 
 
+    cdef int _floyd_add(self, long key, long *set, npy_intp size) nogil:
+        # Insert `key` into the open-addressing hash set stored in `set`.
+        #
+        # `set` has `size` slots; `size` must be a power of two so that
+        # `size - 1` can be used as a bit mask.  A negative slot value marks
+        # an empty slot, hence `key` must be non-negative.
+        #
+        # Returns 1 if `key` was inserted, 0 if it was already present.
+        cdef unsigned long nslots, mask, step, i
+        nslots = <unsigned long>size
+        mask = nslots - 1
+        # Scramble the key with a linear congruential step so that runs of
+        # consecutive integers do not land in consecutive slots.  The
+        # arithmetic is done in unsigned long: the product overflows for
+        # large keys, which is undefined behaviour for signed long but wraps
+        # modulo 2**N for unsigned types.  After masking, the slot index is
+        # the same one two's-complement signed wrap-around would give.
+        i = <unsigned long>key * 1103515245 + 12345
+        for step from 0 <= step < nslots:
+            # Adding 0, 1, 2, ... accumulates triangular-number offsets
+            # (quadratic probing); on a power-of-two table this sequence
+            # visits every slot once within `size` steps.
+            i = (i + step) & mask
+            if set[i] < 0:
+                set[i] = key
+                return 1
+            elif set[i] == key:
+                return 0
+        # Only reachable when the table is completely full and does not
+        # contain `key`.  Callers are expected to size the table so that this
+        # cannot happen; 0 is what the implicit fall-through returned before.
+        return 0
+
+    cdef object _floyd_sample(self, long n, long k):
+        # Draw `k` distinct integers from range(n) with Floyd's algorithm.
+        #
+        # Returns a new 1-d array of `k` values with dtype np.int_.  The
+        # values come out in hash-table order, not in random order, so a
+        # caller that needs a random ordering must shuffle the result.
+        #
+        # Raises ValueError if `k` is negative or larger than `n`.
+        cdef npy_intp size, i
+        cdef unsigned long t, j
+        cdef ndarray set_array "arrayObject"
+        cdef long *set_data
+
+        if k < 0 or k > n:
+            # A negative k makes log() below return NaN, and k > n makes the
+            # unsigned loop start `n-k` wrap around so nothing is sampled.
+            raise ValueError("sample size k must satisfy 0 <= k <= n")
+        if k == 0:
+            # log(0) is -inf and converting it to an integer is undefined.
+            return np.empty(0, np.int_)
+
+        # Smallest power of two above 1.3*k, which keeps the load factor
+        # k/size below 1/1.3 (about 0.77).
+        size = 2 ** <npy_intp>(log(k*1.3)/log(2.0) + 1.)
+        # NOTE(review): the buffer is accessed through a `long *`, which
+        # assumes np.int_ is the C long type -- confirm for all platforms.
+        set_array = <ndarray>np.empty(size, np.int_)
+        set_data = <long *>PyArray_DATA(set_array)
+
+        with self.lock, nogil:
+            # Mark every slot as empty.
+            for i from 0 <= i < size:
+                set_data[i] = -1
+
+            # Floyd's algorithm: for each j in [n-k, n) pick t and insert it;
+            # if t was already chosen, insert j instead, which cannot be in
+            # the set yet.
+            # NOTE(review): relies on rk_interval(j, state) returning a
+            # uniform integer in [0, j] inclusive -- confirm.
+            for j from n-k <= j < n:
+                t = rk_interval(j, self.internal_state)
+                if not self._floyd_add(<long>t, set_data, size):
+                    self._floyd_add(<long>j, set_data, size)
+
+            # Compact the occupied slots to the front of the buffer.  The
+            # write index never passes the read index, so no unread slot is
+            # overwritten.
+            i = 0
+            for j from 0 <= j < size:
+                if set_data[j] >= 0:
+                    set_data[i] = set_data[j]
+                    i += 1
+
+        # Copy the first k entries so that the returned array does not keep
+        # the larger hash table alive as its base.
+        return set_array[:k].copy()
+
     def choice(self, a, size=None, replace=True, p=None):
         """
         choice(a, size=None, replace=True, p=None)
@@ -1154,7 +1194,15 @@ cdef class RandomState:
                     n_uniq += new.size
                 idx = found
             else:
-                idx = self.permutation(pop_size)[:size]
+                # if self.version <= 0 or ...
+                if pop_size < 2*size:
+                    # For small popsize do a single O(size) pass instead of
+                    # the two passes required for Floyd's algorithm.
+                    idx = self.permutation(pop_size)[:size]
+                else:
+                    idx = self._floyd_sample(pop_size, size)
+                    self.shuffle(idx)
+
                 if shape is not None:
                     idx.shape = shape