MPS incompatibility: Calls into the C++ engine to run the backward pass #143123

Open
sebassaras02 opened this issue Dec 12, 2024 · 5 comments
Labels
module: mps (Related to Apple Metal Performance Shaders framework)
module: regression (It used to work, and now it doesn't)
triaged (This issue has been looked at by a team member, and triaged and prioritized into an appropriate module)

Comments

@sebassaras02
sebassaras02 commented Dec 12, 2024

🐛 Describe the bug

I've been building a CLIP model.

This is my architecture:

from transformers import DistilBertModel
import torch
import torch.nn as nn

class TextEncoderHead(nn.Module):
    def __init__(self, model):
        super(TextEncoderHead, self).__init__()
        self.model = model
        self.seq1 = nn.Sequential(
            nn.Linear(768, 512),
            nn.LayerNorm(512)
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        outputs = outputs.last_hidden_state.mean(dim=1)
        outputs = self.seq1(outputs)
        return outputs.contiguous()
    
class ImageEncoderHead(nn.Module):
    def __init__(self, model):
        super(ImageEncoderHead, self).__init__()
        self.model = model
        self.seq1 = nn.Sequential(
            nn.Linear(768, 512),
            nn.LayerNorm(512)
        )
    
    def forward(self, pixel_values):
        outputs = self.model(pixel_values=pixel_values)
        outputs = outputs.last_hidden_state.mean(dim=1)
        outputs = self.seq1(outputs)
        return outputs.contiguous()
    
class CLIPChemistryModel(nn.Module):
    def __init__(self, text_encoder, image_encoder):
        super(CLIPChemistryModel, self).__init__()
        self.text_encoder = text_encoder
        self.image_encoder = image_encoder

    def forward(self, image, input_ids, attention_mask):
        # calculate the embeddings
        ie = self.image_encoder(image)
        te = self.text_encoder(input_ids, attention_mask)
        return ie, te
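
For reference, the encoder heads wrap pretrained backbones from transformers. This is roughly how everything is wired together (the ViT checkpoint here is only illustrative; any image backbone with a 768-dimensional last_hidden_state fits the heads above):

from transformers import DistilBertModel, ViTModel

text_backbone = DistilBertModel.from_pretrained("distilbert-base-uncased")
image_backbone = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k")  # illustrative choice

text_encoder = TextEncoderHead(text_backbone)
image_encoder = ImageEncoderHead(image_backbone)
model = CLIPChemistryModel(text_encoder, image_encoder)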

I have this trainer and loss function:

import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm

def trainer_fn(model, dataloader_train, dataloader_val, epochs, loss_fn, optimizer, device):

    total_loss_train = []
    total_loss_val = []

    model.to(device)

    for epoch in tqdm(range(epochs), desc="Training..."):
        # MODEL TRAINING 
        model.train()
        running_loss = 0
        counter = 0 
        for batch in dataloader_train:
            image, input_ids, attention_mask = batch
            image = image.to(device)
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            # Check whether the input tensors are contiguous
            print(f"image is contiguous: {image.is_contiguous()}")
            print(f"input_ids is contiguous: {input_ids.is_contiguous()}")
            print(f"attention_mask is contiguous: {attention_mask.is_contiguous()}")
            # Forward pass
            image_embeddings, text_embeddings = model(image, input_ids, attention_mask)
            # Check whether the embedding tensors are contiguous
            print(f"image_embeddings is contiguous: {image_embeddings.is_contiguous()}")
            print(f"text_embeddings is contiguous: {text_embeddings.is_contiguous()}")
            
            # Calculate the loss
            loss = loss_fn(image_embeddings, text_embeddings)
            print(loss)
            # Backward pass
            loss.backward()
            # Optimize the weights
            optimizer.step()
            # Zero the gradients
            optimizer.zero_grad()
            # Accumulate the running loss
            running_loss += loss.item()
            counter += 1
            print(counter)
        total_loss_train.append(running_loss/counter)

        # MODEL EVALUATION
        model.eval()
        running_vloss = 0
        vcounter = 0
        with torch.no_grad():
            for batch in dataloader_val:
                image, input_ids, attention_mask = batch
                image = image.to(device)
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)
            
                # forward pass
                image_embeddings, text_embeddings = model(image, input_ids, attention_mask)

                print(f"image_embeddings is contiguous: {image_embeddings.is_contiguous()}")
                print(f"text_embeddings is contiguous: {text_embeddings.is_contiguous()}")
                

                # calculate the loss
                loss = loss_fn(image_embeddings=image_embeddings, text_embeddings=text_embeddings)
                running_vloss += loss.item()
                vcounter += 1
        total_loss_val.append(running_vloss/vcounter)

        # PRINT THE LOSS
        print(f"Epoch {epoch+1} - Train Loss: {total_loss_train[-1]} - Validation Loss: {total_loss_val[-1]}")


def contrastive_loss(image_embeddings, text_embeddings, temperature=1.0):
    """
    Compute contrastive loss between image and text embeddings.
    """
    temperature = torch.tensor(temperature, device=image_embeddings.device).float()
    image_embeddings = image_embeddings.contiguous().float()
    text_embeddings = text_embeddings.contiguous().float()
    batch_size = image_embeddings.shape[0]
    image_embeddings = F.normalize(image_embeddings, p=2, dim=1)
    text_embeddings = F.normalize(text_embeddings, p=2, dim=1)
    logits = torch.einsum('nc,mc->nm', [image_embeddings, text_embeddings])
    logits = logits * torch.exp(temperature)
    labels = torch.arange(batch_size, device=image_embeddings.device)
    loss_i2t = F.cross_entropy(logits, labels)
    loss_t2i = F.cross_entropy(logits.t(), labels)
    loss = (loss_i2t + loss_t2i) / 2
    return loss
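
For what it's worth, the loss is a standard symmetric contrastive objective: logits[i, j] is the similarity between image i and text j, the targets are the diagonal (label i for row i), and the two cross-entropy terms average the image-to-text and text-to-image directions. A quick self-contained sanity check on CPU (random embeddings, placeholder shapes) runs without issue:

# Sanity check of contrastive_loss with random embeddings (placeholder shapes)
imgs = torch.randn(4, 512)
txts = torch.randn(4, 512)
print(contrastive_loss(imgs, txts))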

When I run this on MPS, it causes the following error:

Traceback (most recent call last):
  File "/Users/sebastianalejandrosarastizambonino/Documents/projects/CLIP_Pytorch/src/trainer.py", line 74, in <module>
    main()
  File "/Users/sebastianalejandrosarastizambonino/Documents/projects/CLIP_Pytorch/src/trainer.py", line 61, in main
    trainer_fn(
  File "/Users/sebastianalejandrosarastizambonino/Documents/projects/CLIP_Pytorch/src/utils.py", line 37, in trainer_fn
    loss.backward()
  File "/opt/anaconda3/envs/clip/lib/python3.11/site-packages/torch/_tensor.py", line 581, in backward
    torch.autograd.backward(
  File "/opt/anaconda3/envs/clip/lib/python3.11/site-packages/torch/autograd/__init__.py", line 347, in backward
    _engine_run_backward(
  File "/opt/anaconda3/envs/clip/lib/python3.11/site-packages/torch/autograd/graph.py", line 825, in _engine_run_backward
    return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead.

I changed to CPU and it works fine. Does anyone know why this happens?
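
In case it helps to isolate it: the path that fails is the mean over the sequence dimension followed by the Linear/LayerNorm head. A stripped-down sketch of that path on MPS looks like this (placeholder shapes; I have not verified whether this alone triggers the same view/stride error):

import torch
import torch.nn as nn

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

head = nn.Sequential(nn.Linear(768, 512), nn.LayerNorm(512)).to(device)
hidden = torch.randn(8, 16, 768, device=device, requires_grad=True)  # (batch, seq_len, hidden)

pooled = hidden.mean(dim=1).contiguous()  # same pooling as the encoder heads
out = head(pooled)
out.sum().backward()
print("backward finished on", device)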

Versions

torch 2.5.1
macOS 15.1

cc @kulinseth @albanD @malfet @DenisVieriu97 @jhavukainen

@NripeshN

Hi @sebassaras02,

I faced a similar issue and have already created an issue (#142344). I think the team is looking into it.

@NripeshN

For now I force-installed PyTorch 2.4 instead of 2.5, and this workaround fixed the issue for me (it allowed me to use MPS).
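
One way to gate the device choice on the installed version looks roughly like this (the 2.5 cutoff just reflects this thread, not an official boundary):

import torch

# Prefer MPS only on torch versions before 2.5, where the backward pass still worked
major, minor = (int(x) for x in torch.__version__.split(".")[:2])
use_mps = torch.backends.mps.is_available() and (major, minor) < (2, 5)
device = torch.device("mps" if use_mps else "cpu")
print(f"torch {torch.__version__} -> using {device}")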

@malfet malfet added the module: mps Related to Apple Metal Performance Shaders framework label Dec 12, 2024
@colesbury colesbury added the triaged This issue has been looked at a team member, and triaged and prioritized into an appropriate module label Dec 12, 2024
@albanD albanD added the module: regression It used to work, and now it doesn't label Dec 12, 2024
@sebassaras02
Author

Thanks @NripeshN, downgrading the PyTorch version is working.

@malfet
Contributor
malfet commented Dec 17, 2024

Checking if this is a duplicate of #142344...

@cglewis
cglewis commented May 16, 2025

I'm still experiencing this issue on 2.6.0 and 2.7.0. Reverting back to 2.4.0 works, but I'd like to be able to use the newer versions of torch and not be stuck on 2.4.0.
