@@ -1100,7 +1100,6 @@ def test_reduce_sum(self):
 
     @unittest.skipIf(BACKEND != "nccl", "Only Nccl supports CUDA reduce")
     @skip_if_no_gpu
-    @skip_if_rocm
     def test_reduce_sum_cuda(self):
         group, group_id, rank = self._init_global_test()
         rank_to_GPU = self._init_multigpu_helper()
@@ -1256,7 +1255,6 @@ def test_reduce_sum_twice(self):
 
     @unittest.skipIf(BACKEND != "nccl", "Only Nccl supports CUDA reduce")
     @skip_if_no_gpu
-    @skip_if_rocm
     def test_reduce_sum_cuda_twice(self):
         group, group_id, rank = self._init_global_test()
         rank_to_GPU = self._init_multigpu_helper()
@@ -1635,7 +1633,6 @@ def test_sparse_all_reduce_sum(self):
 
     @unittest.skipIf(BACKEND != "gloo", "Only Gloo backend support sparse all reduce")
     @skip_if_no_gpu
-    @skip_if_rocm
     def test_sparse_all_reduce_sum_cuda(self):
         self._test_sparse_all_reduce_sum(lambda t: t.clone().cuda())
 
@@ -2227,7 +2224,6 @@ def test_all_to_all_single_equal_split(self):
         BACKEND != "nccl", "Only Nccl supports CUDA all_to_all_single"
     )
     @skip_if_no_gpu
-    @skip_if_rocm
     def test_all_to_all_single_equal_split_cuda(self):
         group, group_id, rank = self._init_global_test()
         rank_to_GPU = self._init_multigpu_helper()
@@ -2250,7 +2246,6 @@ def test_all_to_all_single_unequal_split(self):
         BACKEND != "nccl", "Only Nccl supports CUDA all_to_all_single"
     )
     @skip_if_no_gpu
-    @skip_if_rocm
     def test_all_to_all_single_unequal_split_cuda(self):
         group, group_id, rank = self._init_global_test()
         rank_to_GPU = self._init_multigpu_helper()
@@ -2286,7 +2281,6 @@ def test_all_to_all_single_equal_split_group(self):
         BACKEND != "nccl", "Only Nccl supports CUDA all_to_all_single"
     )
     @skip_if_no_gpu
-    @skip_if_rocm
     @skip_if_small_worldsize
     def test_all_to_all_single_equal_split_group_cuda(self):
         group, group_id, rank = self._init_group_test()
@@ -2311,7 +2305,6 @@ def test_all_to_all_single_unequal_split_group(self):
         BACKEND != "nccl", "Only Nccl supports CUDA all_to_all_single"
     )
     @skip_if_no_gpu
-    @skip_if_rocm
     @skip_if_small_worldsize
     def test_all_to_all_single_unequal_split_group_cuda(self):
         group, group_id, rank = self._init_global_test()
@@ -2356,7 +2349,6 @@ def test_all_to_all_single_equal_split_full_group(self):
         BACKEND != "nccl", "Only Nccl supports CUDA all_to_all_single"
     )
     @skip_if_no_gpu
-    @skip_if_rocm
     def test_all_to_all_single_equal_split_full_group_cuda(self):
         group, group_id, rank = self._init_full_group_test()
         rank_to_GPU = self._init_multigpu_helper()
@@ -2379,7 +2371,6 @@ def test_all_to_all_single_unequal_split_full_group(self):
         BACKEND != "nccl", "Only Nccl supports CUDA all_to_all_single"
     )
     @skip_if_no_gpu
-    @skip_if_rocm
     def test_all_to_all_single_unequal_split_full_group_cuda(self):
         group, group_id, rank = self._init_full_group_test()
         rank_to_GPU = self._init_multigpu_helper()
@@ -2443,7 +2434,6 @@ def test_barrier_cuda(self):
     @skip_if_small_worldsize
     @skip_if_no_gpu
     @unittest.skipIf(BACKEND == "mpi", "MPI doesn't supports GPU barrier")
-    @skip_if_rocm
     def test_barrier_group_cuda(self):
         group, group_id, rank = self._init_group_test()
         rank_to_GPU = self._init_multigpu_helper()
@@ -2583,7 +2573,6 @@ def _test_reduce_multigpu_helper(
 
     @unittest.skipIf(BACKEND != "nccl", "Only Nccl backend supports reduce multigpu")
     @skip_if_no_gpu
-    @skip_if_rocm
     def test_reduce_multigpu(self):
         group, group_id, rank = self._init_global_test()
         rank_to_GPU = self._init_multigpu_helper()
@@ -2820,7 +2809,6 @@ def test_DistributedDataParallel_requires_grad(self):
         "Only NCCL and GLOO backend support DistributedDataParallel",
     )
     @skip_if_lt_x_gpu(int(os.environ["WORLD_SIZE"]))
-    @skip_if_rocm
     def test_DistributedDataParallel_non_default_stream(self):
         stream = torch.cuda.Stream(self.rank)
         rank = self.rank
@@ -2896,7 +2884,6 @@ def test_DistributedDataParallel_powerSGD_ddp_comm_hook(self):
     @unittest.skipIf(BACKEND != 'nccl' and BACKEND != 'gloo',
                      "Only Nccl & Gloo backend support DistributedDataParallel")
     @skip_if_no_gpu
-    @skip_if_rocm
     def test_DistributedDataParallel(self):
         group, group_id, rank = self._init_global_test()
         rank_to_GPU = self._init_multigpu_helper()
@@ -2913,7 +2900,6 @@ def test_DistributedDataParallel(self):
     @unittest.skipIf(BACKEND != 'nccl' and BACKEND != 'gloo',
                      "Only Nccl & Gloo backend support DistributedDataParallel")
     @skip_if_no_gpu
-    @skip_if_rocm
     def test_DistributedDataParallel_with_grad_is_view(self):
         group, group_id, rank = self._init_global_test()
         rank_to_GPU = self._init_multigpu_helper()
@@ -3070,7 +3056,6 @@ def test_DistributedDataParallel_SyncBatchNorm_2D_Input(self):
                      "Only Nccl & Gloo backend support DistributedDataParallel")
     @skip_if_no_gpu
     @require_world_size(2)
-    @skip_if_rocm
     def test_DistributedDataParallel_SyncBatchNorm_Single_Input_Per_Process(self):
         group, group_id, rank = self._init_global_test()
         rank_to_GPU = self._init_multigpu_helper()
@@ -3197,7 +3182,6 @@ def _run_reduction_test(
     @require_backend({"nccl"})
     @require_backends_available({"nccl"})
     @skip_if_lt_x_gpu(2)
-    @skip_if_rocm
     def test_nccl_backend_bool_allreduce(self):
         torch.cuda.set_device(self.rank)
         # Run all_reduce with PRODUCT
@@ -3228,7 +3212,6 @@ def test_nccl_backend_bool_allreduce(self):
     @require_backend({"nccl"})
     @require_backends_available({"nccl"})
     @skip_if_lt_x_gpu(2)
-    @skip_if_rocm
     def test_nccl_backend_bool_allgather(self):
         torch.cuda.set_device(self.rank)
         inp = {0: [True, True], 1: [False, True]}
@@ -3252,7 +3235,6 @@ def test_nccl_backend_bool_allgather(self):
     @require_backend({"nccl"})
     @require_backends_available({"nccl"})
     @skip_if_lt_x_gpu(int(os.environ["WORLD_SIZE"]))
-    @skip_if_rocm
     def test_nccl_backend_bool_reduce(self):
         torch.cuda.set_device(self.rank)
         inp = {0: [True, True], 1: [False, False]}
@@ -3285,7 +3267,6 @@ def test_nccl_backend_bool_reduce(self):
     @require_backend({"nccl"})
     @require_backends_available({"nccl"})
     @skip_if_lt_x_gpu(2)
-    @skip_if_rocm
     def test_nccl_backend_bool_broadcast(self):
         tensor_size = 10
         bcast_tensor = torch.tensor(
@@ -3481,7 +3462,6 @@ def validate_net_equivalence(self, net):
     @require_backend({"gloo", "nccl"})
     @require_backends_available({"gloo", "nccl"})
     @skip_if_lt_x_gpu(2)
-    @skip_if_rocm
     def test_ddp_sync_params_and_buffers(self):
         # Test that after calling _sync_params_and_buffers, models across ranks
         # are the same and are equal to the model on the input rank.
@@ -3523,7 +3503,6 @@ def test_ddp_sync_params_and_buffers(self):
     @require_backend({"gloo", "nccl"})
     @require_backends_available({"gloo", "nccl"})
     @skip_if_lt_x_gpu(2)
-    @skip_if_rocm
     def test_ddp_grad_div_uneven_inputs(self):
         # Test gradient division during training with join() API. If
         # divide_by_initial_world_size=False, we scale by the effective world
@@ -3577,7 +3556,6 @@ def test_ddp_grad_div_uneven_inputs(self):
     @require_backend({"gloo", "nccl"})
     @require_backends_available({"gloo", "nccl"})
     @skip_if_lt_x_gpu(2)
-    @skip_if_rocm
     def test_ddp_join_model_equivalence(self):
         # Verifies equivalence with model training locally and with DDP under
         # the join context manager.
@@ -3681,7 +3659,6 @@ def _run_uneven_inputs_test(
     @require_backend({"gloo", "nccl"})
     @require_backends_available({"gloo", "nccl"})
     @skip_if_lt_x_gpu(2)
-    @skip_if_rocm
     def test_ddp_uneven_inputs(self):
         class DDPUnevenTestInput(NamedTuple):
             name: str
@@ -3839,7 +3816,6 @@ def forward(self, x, rank):
     @require_backend({"gloo", "nccl"})
     @require_backends_available({"gloo", "nccl"})
     @skip_if_lt_x_gpu(2)
-    @skip_if_rocm
     def test_ddp_uneven_input_join_disable(self):
         # tests that if net.join() with enable=False is specified, DDP works as
         # expected with even inputs.
@@ -3874,7 +3850,6 @@ def test_ddp_uneven_input_join_disable(self):
     @require_backend({"gloo", "nccl"})
     @require_backends_available({"gloo", "nccl"})
     @skip_if_lt_x_gpu(2)
-    @skip_if_rocm
     def test_ddp_uneven_input_exception(self):
         # Tests that exceptions during training are correctly propagated by the
         # context manager.
@@ -3902,7 +3877,6 @@ def forward(self, _):
     @require_backend({"gloo", "nccl"})
     @require_backends_available({"gloo", "nccl"})
     @skip_if_lt_x_gpu(4)
-    @skip_if_rocm
     def test_ddp_uneven_inputs_replicated_error(self):
         # Tests that the context manager errors out in SPMD mode.
         group = dist.new_group([0, 1])
@@ -3952,7 +3926,6 @@ def test_broadcast_object_list(self):
     @require_backend({"gloo", "nccl"})
     @require_backends_available({"gloo", "nccl"})
     @skip_if_lt_x_gpu(2)
-    @skip_if_rocm
     def test_ddp_ignore_params_arg(self):
         class TestModel(nn.Module):
             def __init__(self, rank):
@@ -4040,7 +4013,6 @@ def forward(self, x):
     @require_backend({"gloo", "nccl"})
     @require_backends_available({"gloo", "nccl"})
     @skip_if_lt_x_gpu(2)
-    @skip_if_rocm
     def test_ddp_unused_params_rebuild_buckets_exception(self):
         class ToyModel(nn.Module):
             def __init__(self):
@@ -4071,7 +4043,6 @@ def forward(self, x):
     @require_backend({"gloo", "nccl"})
     @require_backends_available({"gloo", "nccl"})
     @skip_if_lt_x_gpu(2)
-    @skip_if_rocm
     def test_ddp_shared_grad_acc_unused_params(self):
         # When find_unused_parameters=True, ensure we mark unused parameters
         # even if they share gradient accumulators.
@@ -4104,7 +4075,6 @@ def forward(self, x):
     @require_backend({"gloo", "nccl"})
     @require_backends_available({"gloo", "nccl"})
     @skip_if_lt_x_gpu(2)
-    @skip_if_rocm
     def test_ddp_device(self):
         m = nn.Linear(10, 10).to(self.rank)
         expected_len = 2
@@ -4210,7 +4180,6 @@ def train_iter(inp, input_type):
     @require_backend({"gloo", "nccl"})
     @require_backends_available({"gloo", "nccl"})
     @skip_if_lt_x_gpu(2)
-    @skip_if_rocm
     def test_ddp_namedtuple(self):
         batch = 5
         dim = 10
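Note: every hunk above makes the same one-line change, deleting the `@skip_if_rocm` decorator so these distributed tests are run on ROCm builds again. For readers unfamiliar with the helper, here is a minimal sketch of what such a decorator can look like, assuming the skip condition is "this is a HIP build of PyTorch". This is a hypothetical standalone version, not the actual helper from `torch.testing._internal.common_distributed`, which (if memory serves) also has to signal skips from spawned worker processes via special exit codes.

import unittest
from functools import wraps

import torch

# A ROCm (HIP) build of PyTorch reports a version string here; CUDA builds report None.
TEST_WITH_ROCM = torch.version.hip is not None


def skip_if_rocm(func):
    """Hypothetical sketch: skip the decorated test on ROCm builds."""
    @wraps(func)
    def wrapper(*args, **kwargs):
        if TEST_WITH_ROCM:
            # Raising SkipTest marks the test as skipped rather than failed.
            raise unittest.SkipTest("test doesn't currently work on ROCm")
        return func(*args, **kwargs)
    return wrapper

Removing the decorator, as each hunk in this patch does, simply stops consulting that condition, so the tests are collected and executed on ROCm CI like any other.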