[CUDA] Fix missing `__syncthreads` in MultiMarginLoss backward (#158994) · pytorch/pytorch@8573a2b · GitHub

Commit 8573a2b

eqy authored and malfet committed
[CUDA] Fix missing __syncthreads in MultiMarginLoss backward (#158994)
Turns out the issue in #158921 is detectable with a simple unit test, and adding the missing sync fixes it.

Pull Request resolved: #158994
Approved by: https://github.com/malfet, https://github.com/Skylion007

Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
1 parent 13398da commit 8573a2b

File tree

2 files changed, +20 −0 lines changed

aten/src/ATen/native/cuda/MultiMarginLoss.cu

Lines changed: 1 addition & 0 deletions
@@ -121,6 +121,7 @@ __global__ void MultiMarginLoss_backward_kernel(
       gradInput_k[target_k] = static_cast<scalar_t>(gradInput_target_k);
     }
 
+    __syncthreads();
     for (int i=i_start; i<i_end; i+= i_step) {
       gradInput_k[i] *= *gradOutput_k;
     }
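
The added barrier closes an ordering hazard within one thread block: a single thread writes gradInput_k[target_k], and then every thread in the block runs the strided loop that multiplies gradInput_k[i] by *gradOutput_k, so the thread whose slice contains target_k can read the element before that write has landed. Below is a minimal, self-contained CUDA sketch of the same two-phase pattern (this is not the PyTorch kernel; the kernel, buffer, and parameter names are made up for illustration), with __syncthreads() in the position this commit adds it.

// race_sketch.cu -- illustrative only; names do not come from PyTorch.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void scale_after_single_writer(float* out, int n, int target,
                                          float target_value, float scale) {
  // Phase 1: one thread produces out[target] (in the real kernel this is the
  // reduced gradient for the target class, gradInput_k[target_k]).
  if (threadIdx.x == 0) {
    out[target] = target_value;
  }

  // The one-line fix from this commit: make the write visible to every thread
  // in the block before the read-modify-write loop below touches out[target].
  __syncthreads();

  // Phase 2: each thread scales a strided range of the same buffer, mirroring
  //   gradInput_k[i] *= *gradOutput_k;
  // Without the barrier, the thread whose range contains `target` may scale
  // the stale value instead of the one written in phase 1.
  for (int i = threadIdx.x; i < n; i += blockDim.x) {
    out[i] *= scale;
  }
}

int main() {
  const int n = 128, target = 77;
  float* d_out = nullptr;
  cudaMalloc(&d_out, n * sizeof(float));
  cudaMemset(d_out, 0, n * sizeof(float));

  scale_after_single_writer<<<1, 128>>>(d_out, n, target, 2.0f, 0.5f);

  float h_out[n];
  cudaMemcpy(h_out, d_out, n * sizeof(float), cudaMemcpyDeviceToHost);
  printf("out[%d] = %f (expected 1.0 with the barrier)\n", target, h_out[target]);

  cudaFree(d_out);
  return 0;
}

Compiled with nvcc and run as-is, out[77] comes back as 1.0; without the barrier the result can depend on scheduling, which is the kind of intermittent gradient mismatch the new unit test below checks for.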

test/test_nn.py

Lines changed: 19 additions & 0 deletions
@@ -9291,6 +9291,25 @@ def test_MarginLoss_empty(self, device, dtype):
         y = torch.ones(10, 0, device=device).type(torch.long)
         mod(x, y)
 
+    @onlyCUDA
+    @dtypes(torch.float, torch.double)
+    def test_MarginLoss_race(self, device, dtype):
+        loss = torch.nn.MultiMarginLoss().to(device)
+        batch = 1
+        classes = 128
+        x = torch.randn(batch, classes, requires_grad=True, device=device, dtype=dtype)
+        y = torch.randint(low=0, high=classes, size=(batch,), device=device, dtype=torch.long)
+        x_cpu = x.detach().clone().cpu()
+        y_cpu = y.detach().clone().cpu()
+        out = loss(x, y)
+        out.backward()
+        x_cpu = x.detach().clone().cpu()
+        x_cpu.requires_grad = True
+        y_cpu = y.detach().clone().cpu()
+        out_cpu = loss.cpu()(x_cpu, y_cpu)
+        out_cpu.backward()
+        self.assertEqual(x_cpu.grad, x.grad.cpu())
+
     @onlyCUDA
     def test_MarginLoss_warnings(self, device):
         model = torch.nn.Linear(128, 22, device=device)