Tests Generalization for multiple accelerator devices (#139749) · pytorch/pytorch@95b41d2 · GitHub

Commit 95b41d2

rahulsingh-intel authored and pytorchmergebot committed
Tests Generalization for multiple accelerator devices (#139749)
Motivation: Generalize unit tests so that they can be executed on CUDA and non-CUDA devices. Changes: General changes in the common_dtensor module for device type generalization so that tests can also be executed on non-CUDA devices. Pull Request resolved: #139749. Approved by: https://github.com/kwen2501
1 parent 1800f5f commit 95b41d2
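At a glance, the pattern applied across the four files below: availability flags from torch.testing._internal.common_utils (TEST_CUDA, TEST_HPU) select the device string and process-group backend once, and the tests reference those instead of hard-coding "cuda"/"nccl". A condensed sketch of that selection logic, distilled from the diffs that follow:

# Device/backend selection pattern used throughout this PR (condensed sketch).
from torch.testing._internal.common_utils import TEST_CUDA, TEST_HPU

if TEST_CUDA:
    DEVICE_TYPE, PG_BACKEND = "cuda", "nccl"
elif TEST_HPU:
    DEVICE_TYPE, PG_BACKEND = "hpu", "hccl"
else:
    DEVICE_TYPE, PG_BACKEND = "cpu", "gloo"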

File tree: 4 files changed, +45 -25 lines changed


test/distributed/tensor/test_dtensor_compile.py

Lines changed: 9 additions & 3 deletions
@@ -35,11 +35,14 @@
     RowwiseParallel,
 )
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
+from torch.testing._internal.common_fsdp import get_devtype
 from torch.testing._internal.common_utils import (
     instantiate_parametrized_tests,
     parametrize,
     run_tests,
     skipIfTorchDynamo,
+    TEST_CUDA,
+    TEST_HPU,
 )
 from torch.testing._internal.distributed._tensor.common_dtensor import (
     DTensorTestBase,
@@ -52,6 +55,9 @@
 from torch.utils.checkpoint import checkpoint
 
 
+dev_type = torch.device(get_devtype())
+
+
 class SimpleModel(nn.Module):
     def __init__(self, device):
         super().__init__()
@@ -102,7 +108,7 @@ def tearDown(self):
 
     @property
     def device_type(self) -> str:
-        return "cuda" if torch.cuda.is_available() else "cpu"
+        return "cuda" if TEST_CUDA else "hpu" if TEST_HPU else "cpu"
 
     @property
     def world_size(self) -> int:
@@ -907,7 +913,7 @@ def test_2d_fsdp_tp_compile(self):
         tp_model = parallelize_module(model, twod_mesh["tp"], parallelize_plan)
         eager_2d = FSDP(
             tp_model,
-            device_id=self.rank,
+            device_id=dev_type.type,
             use_orig_params=True,
             device_mesh=twod_mesh["dp"],
         )
@@ -919,7 +925,7 @@ def test_2d_fsdp_tp_compile(self):
         )
         fsdp_2d = FSDP(
             tp_model2,
-            device_id=self.rank,
+            device_id=dev_type.type,
             use_orig_params=True,
             device_mesh=twod_mesh["dp"],
         )
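For reference, this file now resolves the device once at module scope via get_devtype() and hands its type string to FSDP in place of the CUDA rank ordinal. A condensed, hedged restatement of the change (tp_model and dp_mesh are placeholders, not names from the test):

import torch
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.testing._internal.common_fsdp import get_devtype

dev_type = torch.device(get_devtype())  # e.g. torch.device("cuda") or torch.device("hpu")

# dev_type.type is the device-type string ("cuda", "hpu", ...), so the wrapper is
# no longer pinned to a CUDA device index via self.rank.
fsdp_model = FSDP(
    tp_model,                 # placeholder: tensor-parallel model built by the test
    device_id=dev_type.type,
    use_orig_params=True,
    device_mesh=dp_mesh,      # placeholder: the "dp" slice of the 2D device mesh
)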

test/distributed/tensor/test_random_ops.py

Lines changed: 11 additions & 8 deletions
@@ -19,7 +19,7 @@
 )
 from torch.distributed.tensor.debug import CommDebugMode
 from torch.distributed.tensor.parallel import ColwiseParallel, parallelize_module
-from torch.testing._internal.common_utils import run_tests
+from torch.testing._internal.common_utils import run_tests, TEST_HPU
 from torch.testing._internal.distributed._tensor.common_dtensor import (
     DTensorTestBase,
     skip_if_lt_x_gpu,
@@ -28,6 +28,9 @@
 )
 
 
+TYPE_DEVICE = "hpu" if TEST_HPU else "cuda"
+
+
 class DistTensorRandomInitTest(DTensorTestBase):
     def _run_init_op(self, init_op, *args, **kwargs):
         device_mesh = self.build_device_mesh()
@@ -47,7 +50,7 @@ def _run_init_op(self, init_op, *args, **kwargs):
            self.assertEqual(local_tensor_clone, dtensor.to_local())
        else:
            # create DTensor from Tensor
-            _tensor = torch.empty(*input_size, device="cuda")
+            _tensor = torch.empty(*input_size, device=TYPE_DEVICE)
            dtensor = distribute_tensor(_tensor, device_mesh, [Shard(1)])
 
            # DTensor random init
@@ -242,15 +245,15 @@ class DistTensorRandomOpTest(DTensorTestBase):
     @with_comms
     @skip_unless_torch_gpu
     def test_rng_tracker_init(self):
-        torch.cuda.manual_seed(self.rank)
-        object_list = [torch.cuda.initial_seed()]
+        torch.manual_seed(self.rank)
+        object_list = [torch.initial_seed()]
         broadcast_object_list(object_list)
         seed_from_rank_0 = int(object_list[0])
 
         device_mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
         # seed synchronization happens after the first `distribute_tensor` call
         distribute_tensor(
-            torch.empty([self.world_size], device="cuda"), device_mesh, [Shard(0)]
+            torch.empty([self.world_size], device=TYPE_DEVICE), device_mesh, [Shard(0)]
         )
         self.assertEqual(seed_from_rank_0, random._rng_tracker.get_seed("parallel-rng"))
 
@@ -340,13 +343,13 @@ def test_deterministic_dropout_1d(self):
         # execution the default random seed will be different (a random value).
         # The DTensor random ops will use the same random seed even though the
         # torch random generator keeps different seeds on ranks.
-        torch.cuda.manual_seed(self.rank)
+        torch.manual_seed(self.rank)
         # TODO: add test before/after enabling distribute region
         device_mesh = DeviceMesh(self.device_type, torch.arange(self.world_size))
         size = [4, 4]
 
         dtensor = distribute_tensor(
-            torch.empty(*size, device="cuda"), device_mesh, [Shard(1)]
+            torch.empty(*size, device=TYPE_DEVICE), device_mesh, [Shard(1)]
         )
 
         # a random op call shifts the offset
@@ -400,7 +403,7 @@ def test_deterministic_rand_1d(self):
             local_tensor[other_slice, :],
         )
 
-        torch.cuda.manual_seed(self.rank)
+        torch.manual_seed(self.rank)
         dtensor = fn(size, device_mesh=device_mesh, placements=[Replicate()])
         local_tensor = funcol.all_gather_tensor(
             dtensor.to_local(), gather_dim=0, group=(device_mesh, 0)
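The seeding change leans on torch.manual_seed seeding the default generators for all device backends, not only the CPU one, so the torch.cuda.* variants are no longer needed. A minimal stand-alone sketch of the resulting pattern, assuming one of the accelerators is present and with TYPE_DEVICE mirroring the constant added above:

import torch
from torch.testing._internal.common_utils import TEST_HPU

TYPE_DEVICE = "hpu" if TEST_HPU else "cuda"  # as defined in the diff above

torch.manual_seed(0)                       # seeds CPU and accelerator generators
seed = torch.initial_seed()                # initial seed of the default generator
t = torch.empty(4, 4, device=TYPE_DEVICE)  # no hard-coded "cuda" string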

test/distributed/tensor/test_redistribute.py

Lines changed: 2 additions & 2 deletions
@@ -9,7 +9,7 @@
 from torch.distributed.device_mesh import init_device_mesh
 from torch.distributed.tensor._collective_utils import shard_dim_alltoall
 from torch.distributed.tensor.debug import CommDebugMode
-from torch.testing._internal.common_utils import run_tests
+from torch.testing._internal.common_utils import run_tests, TEST_CUDA, TEST_HPU
 from torch.testing._internal.distributed._tensor.common_dtensor import (
     DTensorTestBase,
     with_comms,
@@ -366,7 +366,7 @@ def test_redistribute_shard_dim_change(self):
            local_out_dt = out_dt.to_local()
            local_expected_dt = expected_dt.to_local()
            self.assertEqual(out_dt.to_local(), expected_dt.to_local())
-            if self.device_type == "cuda":
+            if TEST_HPU or TEST_CUDA:
                self.assertEqual(
                    comm_mode.get_comm_counts()[
                        torch.ops._dtensor.shard_dim_alltoall
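The guard in isolation: the shard_dim_alltoall count recorded by CommDebugMode is only asserted when an accelerator backend is in use. A minimal sketch; comm_mode is assumed to be the CommDebugMode context from the surrounding test, and expected_count is a placeholder for the value elided in the diff above:

import torch
from torch.testing._internal.common_utils import TEST_CUDA, TEST_HPU

if TEST_HPU or TEST_CUDA:
    # CommDebugMode tallies collectives per op; the count is checked on accelerators only.
    alltoall_calls = comm_mode.get_comm_counts()[torch.ops._dtensor.shard_dim_alltoall]
    assert alltoall_calls == expected_count  # expected_count: placeholder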

torch/testing/_internal/distributed/_tensor/common_dtensor.py

Lines changed: 23 additions & 12 deletions
@@ -32,6 +32,10 @@
     RowwiseParallel,
     SequenceParallel,
 )
+from torch.testing._internal.common_utils import (
+    TEST_HPU,
+    TEST_CUDA,
+)
 from torch.testing._internal.common_distributed import (
     MultiProcessTestCase,
     MultiThreadedTestCase,
@@ -41,17 +45,26 @@
 )
 
 from torch.utils._pytree import tree_flatten, tree_unflatten, TreeSpec
-
-DEVICE_TYPE = (
-    "cuda" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else "cpu"
-)
+from torch._utils import _get_device_module
+
+if TEST_CUDA:
+    DEVICE_TYPE = "cuda"
+    PG_BACKEND = "nccl"
+    DEVICE_COUNT = _get_device_module("cuda").device_count()
+elif TEST_HPU:
+    DEVICE_TYPE = "hpu"
+    PG_BACKEND = "hccl"
+    DEVICE_COUNT = _get_device_module("hpu").device_count()
+else:
+    DEVICE_TYPE = "cpu"
+    PG_BACKEND = "gloo"
 
 NUM_DEVICES = 4
 
 # We use this as a proxy for "multiple GPUs exist"
-if torch.cuda.is_available() and torch.cuda.device_count() > 1:
+if TEST_CUDA and DEVICE_COUNT > 1:
     # when we actually have multiple GPUs, relax the requirement to smaller counts.
-    NUM_DEVICES = min(NUM_DEVICES, torch.cuda.device_count())
+    NUM_DEVICES = min(NUM_DEVICES, DEVICE_COUNT)
 
 T = TypeVar("T")
 
@@ -311,7 +324,7 @@ def world_size(self) -> int:
 
     @property
     def backend(self) -> str:
-        backend = "nccl" if self.device_type == "cuda" else "gloo"
+        backend = "nccl" if TEST_CUDA else "hccl" if TEST_HPU else "gloo"
         return backend
 
     def build_device_mesh(self) -> DeviceMesh:
@@ -321,7 +334,7 @@ def init_pg(self, eager_init) -> None:
         if "nccl" in self.backend and torch.cuda.device_count() < self.world_size:
             sys.exit(TEST_SKIPS[f"multi-gpu-{self.world_size}"].exit_code)
 
-        if self.backend not in ["nccl", "gloo", "mpi", "cpu:gloo,cuda:nccl"]:
+        if self.backend not in ["nccl", "gloo", "mpi", "cpu:gloo,cuda:nccl", "hccl"]:
             raise RuntimeError(f"Backend {self.backend} not supported!")
 
         device_id = None
@@ -330,7 +343,6 @@ def init_pg(self, eager_init) -> None:
            torch.cuda.set_device(self.rank)
        # we only need to set device_id for nccl backend with eager init
        device_id = torch.device(f"{self.device_type}:{self.rank}") if eager_init else None
-
        # For nccl backend, bind the device to the process if device_id is not None
        # so the nccl communicator is immediately formed and we can use `ncclCommSplit`
        # for form subgroup to avoid unnecesssary overhead.
@@ -342,11 +354,10 @@ def init_pg(self, eager_init) -> None:
            device_id=device_id,
        )
 
-
    def destroy_pg(self) -> None:
        # Wait for all ranks to reach here before starting shutdown.
        # FIXME dist.barrier deadlocks with multiple threads and NCCL: https://github.com/pytorch/pytorch/issues/95895
-        # dist.all_reduce(torch.zeros((1,), device="cuda" if torch.cuda.is_available() else "cpu"))
+        # dist.all_reduce(torch.zeros((1,), device="cuda" if TEST_CUDA else "cpu"))
        # FIXME can't use the above all_reduce as it causes hangs on bionic and focal. It hangs:
        # test_dtensor.py -- DTensorMeshTest.test_dtensor_device_mesh_device_conversion
        dist.barrier()
@@ -383,7 +394,7 @@ def wrapper(
        self, *args: tuple[object], **kwargs: Dict[str, Any]  # type: ignore[misc]
    ) -> None:
        # if enough GPU we can use GPU, otherwise we fallback to CPU
-        if not torch.cuda.is_available() or torch.cuda.device_count() < self.world_size:
+        if not TEST_CUDA or torch.cuda.device_count() < self.world_size:
            self.device_type = "cpu"
        else:
            self.device_type = DEVICE_TYPE
0 commit comments