From 10b7dca2526e381e6140b4830be01e1709d4e83e Mon Sep 17 00:00:00 2001
From: Natalia Gimelshein
Date: Mon, 23 Dec 2024 20:33:03 -0800
Subject: [PATCH 1/4] fix randint distribution for large max

---
 aten/src/ATen/core/DistributionsHelper.h      |  6 +-----
 .../ATen/native/cuda/DistributionTemplates.h  |  6 +-----
 test/inductor/test_torchinductor.py           | 20 +++++++++++++++++++
 test/test_tensor_creation_ops.py              | 18 +++++++++++++++++
 4 files changed, 40 insertions(+), 10 deletions(-)

diff --git a/aten/src/ATen/core/DistributionsHelper.h b/aten/src/ATen/core/DistributionsHelper.h
index e823565133fc..e265c455b3e6 100644
--- a/aten/src/ATen/core/DistributionsHelper.h
+++ b/aten/src/ATen/core/DistributionsHelper.h
@@ -41,11 +41,7 @@ struct uniform_int_from_to_distribution {
 
   template <typename RNG>
   C10_HOST_DEVICE inline T operator()(RNG generator) {
-    if ((
-      std::is_same_v<T, int64_t> ||
-      std::is_same_v<T, double> ||
-      std::is_same_v<T, float> ||
-      std::is_same_v<T, at::BFloat16>) && range_ >= 1ULL << 32)
+    if (range_ >= 1ULL << 25) // allow approx 1% skew in uniform int generation using %
     {
       return transformation::uniform_int_from_to<T>(generator->random64(), range_, base_);
     } else {
diff --git a/aten/src/ATen/native/cuda/DistributionTemplates.h b/aten/src/ATen/native/cuda/DistributionTemplates.h
index b685a67ae5d9..58c9c28f6f4b 100644
--- a/aten/src/ATen/native/cuda/DistributionTemplates.h
+++ b/aten/src/ATen/native/cuda/DistributionTemplates.h
@@ -280,11 +280,7 @@ namespace cuda {
 template<typename RNG>
 void random_from_to_kernel(TensorIteratorBase& iter, uint64_t range, int64_t base, RNG gen) {
   AT_DISPATCH_V2(iter.dtype(), "random_from_to_kernel_cuda", AT_WRAP([&] {
-    if ((
-      std::is_same_v<scalar_t, int64_t> ||
-      std::is_same_v<scalar_t, double> ||
-      std::is_same_v<scalar_t, float> ||
-      std::is_same_v<scalar_t, at::BFloat16>) && range >= 1ULL << 32)
+    if (range >= 1ULL << 25) // allow approx 1% skew in uniform int generation using %
     {
       // define lambda to mod with range and add base
       auto random_func = [range, base] __device__ (uint64_t rand) {
diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index c5e2eeb46ea5..2afb7e9c608a 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -8437,6 +8437,26 @@ def fn(x):
         self.assertGreater(c0.max(), 2**40)
         self.assertLess(c0.max(), 2**50)
 
+    def test_randint_distribution(self):
+        @torch.compile(fullgraph=True)
+        def fn(n_max, size):
+            return torch.randint(n_max, (size,), device=self.device)
+
+        def bin(index, max_size):
+            return index // (max_size // n_bins)
+
+        size = 1_000_000
+        n_max = int(0.75 * 2**32)
+        n_bins = 8
+
+        res = fn(n_max, size)
+        bins = bin(res, n_max).float().cpu()
+        hist, _ = bins.histogram(8, range=(0, n_bins))
+        expected_bin = res.shape[0] / 8
+        expected_error = math.sqrt(expected_bin) / expected_bin * 3
+        error = (hist - expected_bin).abs().max() / expected_bin
+        self.assertTrue(error < expected_error)
+
     @config.patch(fallback_random=True)
     def test_like_rands(self):
         def fn(x):
diff --git a/test/test_tensor_creation_ops.py b/test/test_tensor_creation_ops.py
index 315089cf2cb6..7bdd731d1cfc 100644
--- a/test/test_tensor_creation_ops.py
+++ b/test/test_tensor_creation_ops.py
@@ -3507,6 +3507,24 @@ def seed(generator):
         self.assertTrue((res1 < 6).all().item())
         self.assertTrue((res1 >= 0).all().item())
 
+
+    def test_randint_distribution(self, device):
+        size = 1_000_000
+        n_max = int(0.75 * 2 ** 32)
+        n_bins = 8
+
+        def bin(index, max_size):
+            return index // (max_size // n_bins)
+
+        res = torch.randint(n_max, (size,), device=device)
+        # histogram implemented for float only
+        bins = bin(res, n_max).float().cpu()
+        hist, _ = bins.histogram(8, range=(0, n_bins))
+        expected_bin = res.shape[0] / 8
+        expected_error = math.sqrt(expected_bin) / expected_bin * 3
+        error = (hist - expected_bin).abs().max() / expected_bin
+        self.assertTrue(error < expected_error)
+
 
     @dtypes(torch.half, torch.float, torch.bfloat16, torch.double, torch.complex32, torch.complex64, torch.complex128)
     def test_randn(self, device, dtype):
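What patch 1 actually fixes: for the dtypes listed in the removed condition, the 64-bit path was only taken at range >= 2^32, so a range just below that cutoff was still reduced from a single 32-bit draw with `%`. Reducing a k-bit uniform value modulo `range` makes the residues below `2^k % range` appear `floor(2^k / range) + 1` times while the rest appear `floor(2^k / range)` times, so the worst-case relative skew between values is about `range / 2^k`. At the `n_max = 0.75 * 2**32` used by the new tests, the lowest third of the requested range was twice as likely as the rest. A minimal sketch of that arithmetic (plain Python, not part of the patch; the helper name is mine):

```python
# Sketch, not part of the patch: worst-case relative skew of `v % range_`
# when v is uniform on [0, 2**bits). Residues below 2**bits % range_ are hit
# floor(2**bits / range_) + 1 times, the rest floor(2**bits / range_) times,
# so the most likely values are over-represented by a factor of 1 + 1/floor.
def worst_case_skew(range_, bits=32):
    floor = (1 << bits) // range_  # draws mapping to the least likely residues
    return 1 / floor if (1 << bits) % range_ else 0.0

print(f"{worst_case_skew((1 << 25) + 1):.2%}")           # ~0.79%: the "approx 1%" in the comment
print(f"{worst_case_skew(int(0.75 * 2**32)):.0%}")       # 100%: low values twice as likely
print(f"{worst_case_skew((1 << 25) + 1, bits=64):.1e}")  # ~1.8e-12 once random64() is used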
From b4c530f6934431a2e7434e5560d9bcdc7afa9765 Mon Sep 17 00:00:00 2001
From: Natalia Gimelshein
Date: Tue, 24 Dec 2024 11:28:10 -0800
Subject: [PATCH 2/4] skip inductor dynamic tests

---
 test/inductor/test_torchinductor_codegen_dynamic_shapes.py | 1 +
 test/inductor/test_torchinductor_dynamic_shapes.py         | 1 +
 2 files changed, 2 insertions(+)

diff --git a/test/inductor/test_torchinductor_codegen_dynamic_shapes.py b/test/inductor/test_torchinductor_codegen_dynamic_shapes.py
index f04452fa8326..76c42ba7fc9f 100644
--- a/test/inductor/test_torchinductor_codegen_dynamic_shapes.py
+++ b/test/inductor/test_torchinductor_codegen_dynamic_shapes.py
@@ -231,6 +231,7 @@ def run(*ex, **kwargs):
     "test_pointwise_laguerre_polynomial_l_dynamic_shapes": TestFailure(("cuda", "xpu")),
     "test_pointwise_legendre_polynomial_p_dynamic_shapes": TestFailure(("cuda", "xpu")),
     "test_polar_dynamic_shapes": TestFailure(("cpu", "cuda", "xpu"), is_skip=True),
+    "test_randint_distribution": TestFailure(("cuda",)),
     "test_randn_generator_dynamic_shapes": TestFailure(("cpu",)),
     "test_randn_like_empty_dynamic_shapes": TestFailure(("cpu", "cuda", "xpu")),
     "test_single_elem_dynamic_shapes": TestFailure(("cpu",)),
diff --git a/test/inductor/test_torchinductor_dynamic_shapes.py b/test/inductor/test_torchinductor_dynamic_shapes.py
index 68d0e07c143a..cb6cb23f8307 100644
--- a/test/inductor/test_torchinductor_dynamic_shapes.py
+++ b/test/inductor/test_torchinductor_dynamic_shapes.py
@@ -59,6 +59,7 @@
         ("cpu", "cuda", "xpu")
     ),
     "test_conv_inference_heuristics_dynamic_shapes": TestFailure(("cuda", "xpu")),
+    "test_randint_distribution": TestFailure(("cuda",)),
 }
 
 if TEST_WITH_ROCM:

From 28c48ec431cef37b0c7f37bfb27cbdd0136e5663 Mon Sep 17 00:00:00 2001
From: Natalia Gimelshein
Date: Thu, 26 Dec 2024 10:42:09 -0800
Subject: [PATCH 3/4] skip dynamic shape tests, adjust cpu rng test

---
 aten/src/ATen/test/rng_test.h                              | 4 +++-
 test/inductor/test_torchinductor_codegen_dynamic_shapes.py | 2 +-
 test/inductor/test_torchinductor_dynamic_shapes.py         | 2 +-
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/aten/src/ATen/test/rng_test.h b/aten/src/ATen/test/rng_test.h
index 250d54f20b2d..c4a8953fad31 100644
--- a/aten/src/ATen/test/rng_test.h
+++ b/aten/src/ATen/test/rng_test.h
@@ -137,7 +137,9 @@ void test_random_from_to(const at::Device& device) {
         range = static_cast<uint64_t>(max_to) - static_cast<uint64_t>(from) + 1;
         from_case_covered = true;
       }
-      if (range < (1ULL << 32)) {
+      // this is leaking details of implementation into test
+      // we are starting to use random64() at 2^25 to minimize skew due to %
+      if (range < (1ULL << 25)) {
         exp = static_cast<T>(static_cast<int64_t>((static_cast<uint32_t>(val) % range + from)));
       } else {
         exp = static_cast<T>(static_cast<int64_t>((val % range + from)));
diff --git a/test/inductor/test_torchinductor_codegen_dynamic_shapes.py b/test/inductor/test_torchinductor_codegen_dynamic_shapes.py
index 76c42ba7fc9f..0173fe9c4de0 100644
--- a/test/inductor/test_torchinductor_codegen_dynamic_shapes.py
+++ b/test/inductor/test_torchinductor_codegen_dynamic_shapes.py
@@ -231,7 +231,7 @@ def run(*ex, **kwargs):
     "test_pointwise_laguerre_polynomial_l_dynamic_shapes": TestFailure(("cuda", "xpu")),
     "test_pointwise_legendre_polynomial_p_dynamic_shapes": TestFailure(("cuda", "xpu")),
     "test_polar_dynamic_shapes": TestFailure(("cpu", "cuda", "xpu"), is_skip=True),
-    "test_randint_distribution": TestFailure(("cuda",)),
+    "test_randint_distribution_dynamic_shapes": TestFailure(("cuda",)),
     "test_randn_generator_dynamic_shapes": TestFailure(("cpu",)),
     "test_randn_like_empty_dynamic_shapes": TestFailure(("cpu", "cuda", "xpu")),
     "test_single_elem_dynamic_shapes": TestFailure(("cpu",)),
diff --git a/test/inductor/test_torchinductor_dynamic_shapes.py b/test/inductor/test_torchinductor_dynamic_shapes.py
index cb6cb23f8307..ba6b9ab47119 100644
--- a/test/inductor/test_torchinductor_dynamic_shapes.py
+++ b/test/inductor/test_torchinductor_dynamic_shapes.py
@@ -59,7 +59,7 @@
         ("cpu", "cuda", "xpu")
     ),
     "test_conv_inference_heuristics_dynamic_shapes": TestFailure(("cuda", "xpu")),
-    "test_randint_distribution": TestFailure(("cuda",)),
+    "test_randint_distribution_dynamic_shapes": TestFailure(("cuda",)),
 }
 
 if TEST_WITH_ROCM:
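Patches 2 and 3 are test plumbing: the new distribution test is registered as an expected failure on CUDA under the dynamic-shapes suites (using the `_dynamic_shapes` suffix the harness generates), and the C++ rng test mirrors the 2^25 cutoff. Worth spelling out is how the tolerance in the new Python tests works: each of the 8 bins receives an approximately binomial count with mean `size / n_bins`, and the tests allow a three-sigma relative deviation, treating sigma as `sqrt(mean)` (a Poisson-style approximation). A rough check of those numbers, reusing the constants from the tests:

```python
import math

size, n_bins = 1_000_000, 8        # constants mirrored from the new tests
expected_bin = size / n_bins       # 125,000 expected hits per bin
sigma = math.sqrt(expected_bin)    # approx. binomial std dev for p = 1/8
print(f"3-sigma tolerance: {3 * sigma / expected_bin:.2%}")  # ~0.85%

# Pre-fix 32-bit path with n_max = 0.75 * 2**32: the lowest third of the
# range is sampled at twice the density, so a bin inside it holds
# 2 / (4/3) = 1.5x the uniform share -- a ~50% error, far beyond tolerance.
print(f"skewed-bin error: {2 / (4 / 3) - 1:.0%}")
```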
"xpu")), "test_pointwise_legendre_polynomial_p_dynamic_shapes": TestFailure(("cuda", "xpu")), "test_polar_dynamic_shapes": TestFailure(("cpu", "cuda", "xpu"), is_skip=True), - "test_randint_distribution": TestFailure(("cuda",)), + "test_randint_distribution_dynamic_shapes": TestFailure(("cuda",)), "test_randn_generator_dynamic_shapes": TestFailure(("cpu",)), "test_randn_like_empty_dynamic_shapes": TestFailure(("cpu", "cuda", "xpu")), "test_single_elem_dynamic_shapes": TestFailure(("cpu",)), diff --git a/test/inductor/test_torchinductor_dynamic_shapes.py b/test/inductor/test_torchinductor_dynamic_shapes.py index cb6cb23f8307..ba6b9ab47119 100644 --- a/test/inductor/test_torchinductor_dynamic_shapes.py +++ b/test/inductor/test_torchinductor_dynamic_shapes.py @@ -59,7 +59,7 @@ ("cpu", "cuda", "xpu") ), "test_conv_inference_heuristics_dynamic_shapes": TestFailure(("cuda", "xpu")), - "test_randint_distribution": TestFailure(("cuda",)), + "test_randint_distribution_dynamic_shapes": TestFailure(("cuda",)), } if TEST_WITH_ROCM: From 64642dbdc123adf13c6531f1c9e236e1a557b0f2 Mon Sep 17 00:00:00 2001 From: Natalia Gimelshein Date: Tue, 7 Jan 2025 18:19:30 -0800 Subject: [PATCH 4/4] use higher threshold --- aten/src/ATen/core/DistributionsHelper.h | 2 +- aten/src/ATen/native/cuda/DistributionTemplates.h | 2 +- aten/src/ATen/test/rng_test.h | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/aten/src/ATen/core/DistributionsHelper.h b/aten/src/ATen/core/DistributionsHelper.h index 2bf98cf95f8b..acca669503d4 100644 --- a/aten/src/ATen/core/DistributionsHelper.h +++ b/aten/src/ATen/core/DistributionsHelper.h @@ -40,7 +40,7 @@ struct uniform_int_from_to_distribution { template C10_HOST_DEVICE inline T operator()(RNG generator) { - if (range_ >= 1ULL << 25) // allow approx 1% skew in uniform int generation using % + if (range_ >= 1ULL << 28) // allow approx 5% skew in uniform int generation using % { return transformation::uniform_int_from_to(generator->random64(), range_, base_); } else { diff --git a/aten/src/ATen/native/cuda/DistributionTemplates.h b/aten/src/ATen/native/cuda/DistributionTemplates.h index 58c9c28f6f4b..cbb1f076c57d 100644 --- a/aten/src/ATen/native/cuda/DistributionTemplates.h +++ b/aten/src/ATen/native/cuda/DistributionTemplates.h @@ -280,7 +280,7 @@ namespace cuda { template void random_from_to_kernel(TensorIteratorBase& iter, uint64_t range, int64_t base, RNG gen) { AT_DISPATCH_V2(iter.dtype(), "random_from_to_kernel_cuda", AT_WRAP([&] { - if (range >= 1ULL << 25) // allow approx 1% skew in uniform int generation using % + if (range >= 1ULL << 28) // allow approx 5% skew in uniform int generation using % { // define lambda to mod with range and add base auto random_func = [range, base] __device__ (uint64_t rand) { diff --git a/aten/src/ATen/test/rng_test.h b/aten/src/ATen/test/rng_test.h index c4a8953fad31..a785163e8f8e 100644 --- a/aten/src/ATen/test/rng_test.h +++ b/aten/src/ATen/test/rng_test.h @@ -138,8 +138,8 @@ void test_random_from_to(const at::Device& device) { from_case_covered = true; } // this is leaking details of implementation into test - // we are starting to use random64() at 2^25 to minimize skew due to % - if (range < (1ULL << 25)) { + // we are starting to use random64() at 2^28 to minimize skew due to % + if (range < (1ULL << 28)) { exp = static_cast(static_cast((static_cast(val) % range + from))); } else { exp = static_cast(static_cast((val % range + from)));