@@ -26,7 +26,7 @@ class TestTrainerDistributedLoss(TestCasePlus):
     @require_torch_multi_accelerator
     def test_trainer(self):
         device_count = backend_device_count(torch_device)
-        min_bs = 1
+        min_bs = 2
         output_dir = self.get_auto_remove_tmp_dir()
         for gpu_num, enable, bs, name in (
             (1, True, min_bs * device_count, "base"),
@@ -50,9 +50,10 @@ def test_trainer(self):
         broken_diff = [abs(base_loss[i] - broken_loss[i]) for i in range(len(base_loss))]
         fixed_diff = [abs(base_loss[i] - fixed_loss[i]) for i in range(len(base_loss))]
         sum_base = sum(base_loss)
-        sum_broken = sum(broken_diff)
+        sum_broken = sum(broken_loss)
         relative_broken = abs(sum_base - sum_broken) / max(sum_base, sum_broken)

+        # the gap may be smaller for other models, but it's still OK.
         self.assertGreater(max(broken_diff), 0.5)
         self.assertLess(max(fixed_diff), 0.005)
         self.assertLess(relative_broken, 0.1)
@@ -63,7 +64,7 @@ def run_distributed_training(training_args):
     model_name = "nickypro/tinyllama-15M"
     dataset_name = "wikitext"
     dataset_config = "wikitext-2-raw-v1"
-    dataset = datasets.load_dataset(dataset_name, dataset_config, split="train[:17]")
+    dataset = datasets.load_dataset(dataset_name, dataset_config, split="train[:100]")
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     tokenizer.pad_token = tokenizer.eos_token
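For context, the assertions in the second hunk tolerate large per-step divergence while requiring the two runs' total losses to stay close, which is why summing broken_loss (rather than the per-step differences) matters. A minimal sketch of that check with invented loss values; the real test collects the losses from the distributed trainer runs:

# Toy illustration of the corrected check; these loss values are made up.
base_loss = [2.6, 2.0, 1.6, 1.4]    # per-step losses, reference run
broken_loss = [2.0, 2.6, 1.9, 1.2]  # per-step losses, bug enabled

# Per-step gaps can be large (the test asserts max > 0.5)...
broken_diff = [abs(b - k) for b, k in zip(base_loss, broken_loss)]

# ...but the fix compares the summed losses of the two runs (not the
# summed per-step differences), so relative_broken measures total drift.
sum_base = sum(base_loss)
sum_broken = sum(broken_loss)
relative_broken = abs(sum_base - sum_broken) / max(sum_base, sum_broken)

print(max(broken_diff))   # ~0.6   -> assertGreater(..., 0.5) passes
print(relative_broken)    # ~0.013 -> assertLess(..., 0.1) passes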