Description
🐛 Describe the bug
I've been building a CLIP model. This is my architecture:
```python
from transformers import DistilBertModel
import torch
import torch.nn as nn


class TextEncoderHead(nn.Module):
    def __init__(self, model):
        super(TextEncoderHead, self).__init__()
        self.model = model
        self.seq1 = nn.Sequential(
            nn.Linear(768, 512),
            nn.LayerNorm(512)
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        outputs = outputs.last_hidden_state.mean(dim=1)
        outputs = self.seq1(outputs)
        return outputs.contiguous()


class ImageEncoderHead(nn.Module):
    def __init__(self, model):
        super(ImageEncoderHead, self).__init__()
        self.model = model
        self.seq1 = nn.Sequential(
            nn.Linear(768, 512),
            nn.LayerNorm(512)
        )

    def forward(self, pixel_values):
        outputs = self.model(pixel_values=pixel_values)
        outputs = outputs.last_hidden_state.mean(dim=1)
        outputs = self.seq1(outputs)
        return outputs.contiguous()


class CLIPChemistryModel(nn.Module):
    def __init__(self, text_encoder, image_encoder):
        super(CLIPChemistryModel, self).__init__()
        self.text_encoder = text_encoder
        self.image_encoder = image_encoder

    def forward(self, image, input_ids, attention_mask):
        # calculate the embeddings
        ie = self.image_encoder(image)
        te = self.text_encoder(input_ids, attention_mask)
        return ie, te
```
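For context, the heads wrap pretrained Hugging Face backbones roughly like this (the checkpoint names here are illustrative; any backbone whose `last_hidden_state` has hidden size 768 fits the heads above):

```python
from transformers import DistilBertModel, ViTModel

# Illustrative instantiation: the checkpoint names are placeholders,
# any backbones with a 768-dim last_hidden_state fit the heads above.
text_backbone = DistilBertModel.from_pretrained("distilbert-base-uncased")
image_backbone = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k")

text_encoder = TextEncoderHead(text_backbone)
image_encoder = ImageEncoderHead(image_backbone)
model = CLIPChemistryModel(text_encoder, image_encoder)
```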
This is my trainer and loss function:
```python
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm


def trainer_fn(model, dataloader_train, dataloader_val, epochs, loss_fn, optimizer, device):
    total_loss_train = []
    total_loss_val = []
    model.to(device)

    for epoch in tqdm(range(epochs), desc="Training..."):
        # MODEL TRAINING
        model.train()
        running_loss = 0
        counter = 0
        for batch in dataloader_train:
            image, input_ids, attention_mask = batch
            image = image.to(device)
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            # Check whether the input tensors are contiguous
            print(f"image is contiguous: {image.is_contiguous()}")
            print(f"input_ids is contiguous: {input_ids.is_contiguous()}")
            print(f"attention_mask is contiguous: {attention_mask.is_contiguous()}")
            # Forward pass
            image_embeddings, text_embeddings = model(image, input_ids, attention_mask)
            # Check whether the embedding tensors are contiguous
            print(f"image_embeddings is contiguous: {image_embeddings.is_contiguous()}")
            print(f"text_embeddings is contiguous: {text_embeddings.is_contiguous()}")
            # Calculate the loss
            loss = loss_fn(image_embeddings, text_embeddings)
            print(loss)
            # Backward pass
            loss.backward()
            # Optimize the weights
            optimizer.step()
            # Zero the gradients
            optimizer.zero_grad()
            running_loss += loss.item()
            counter += 1
            print(counter)
        total_loss_train.append(running_loss / counter)

        # MODEL EVALUATION
        model.eval()
        running_vloss = 0
        vcounter = 0
        with torch.no_grad():
            for batch in dataloader_val:
                image, input_ids, attention_mask = batch
                image = image.to(device)
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)
                # forward pass
                image_embeddings, text_embeddings = model(image, input_ids, attention_mask)
                print(f"image_embeddings is contiguous: {image_embeddings.is_contiguous()}")
                print(f"text_embeddings is contiguous: {text_embeddings.is_contiguous()}")
                # calculate the loss
                loss = loss_fn(image_embeddings=image_embeddings, text_embeddings=text_embeddings)
                running_vloss += loss.item()
                vcounter += 1
        total_loss_val.append(running_vloss / vcounter)
        # PRINT THE LOSS
        print(f"Epoch {epoch+1} - Train Loss: {total_loss_train[-1]} - Validation Loss: {total_loss_val[-1]}")


def contrastive_loss(image_embeddings, text_embeddings, temperature=1.0):
    """
    Compute contrastive loss between image and text embeddings.
    """
    temperature = torch.tensor(temperature, device=image_embeddings.device).float()
    image_embeddings = image_embeddings.contiguous().float()
    text_embeddings = text_embeddings.contiguous().float()
    batch_size = image_embeddings.shape[0]

    image_embeddings = F.normalize(image_embeddings, p=2, dim=1)
    text_embeddings = F.normalize(text_embeddings, p=2, dim=1)

    logits = torch.einsum('nc,mc->nm', [image_embeddings, text_embeddings])
    logits = logits * torch.exp(temperature)

    labels = torch.arange(batch_size, device=image_embeddings.device)
    loss_i2t = F.cross_entropy(logits, labels)
    loss_t2i = F.cross_entropy(logits.t(), labels)
    loss = (loss_i2t + loss_t2i) / 2
    return loss
```
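For completeness, this is roughly how I invoke the trainer (the dummy data shapes, optimizer, and hyperparameters below are illustrative, not my real pipeline):

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

# Illustrative invocation of trainer_fn: dummy data, optimizer, and
# hyperparameters stand in for the real pipeline.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

dummy = TensorDataset(
    torch.randn(8, 3, 224, 224),           # pixel_values
    torch.randint(0, 30522, (8, 16)),      # input_ids
    torch.ones(8, 16, dtype=torch.long),   # attention_mask
)
dataloader_train = DataLoader(dummy, batch_size=4)
dataloader_val = DataLoader(dummy, batch_size=4)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

trainer_fn(
    model=model,
    dataloader_train=dataloader_train,
    dataloader_val=dataloader_val,
    epochs=1,
    loss_fn=contrastive_loss,
    optimizer=optimizer,
    device=device,
)
```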
When I run this on MPS, it fails with the following error:
```
Traceback (most recent call last):
  File "/Users/sebastianalejandrosarastizambonino/Documents/projects/CLIP_Pytorch/src/trainer.py", line 74, in <module>
    main()
  File "/Users/sebastianalejandrosarastizambonino/Documents/projects/CLIP_Pytorch/src/trainer.py", line 61, in main
    trainer_fn(
  File "/Users/sebastianalejandrosarastizambonino/Documents/projects/CLIP_Pytorch/src/utils.py", line 37, in trainer_fn
    loss.backward()
  File "/opt/anaconda3/envs/clip/lib/python3.11/site-packages/torch/_tensor.py", line 581, in backward
    torch.autograd.backward(
  File "/opt/anaconda3/envs/clip/lib/python3.11/site-packages/torch/autograd/__init__.py", line 347, in backward
    _engine_run_backward(
  File "/opt/anaconda3/envs/clip/lib/python3.11/site-packages/torch/autograd/graph.py", line 825, in _engine_run_backward
    return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead.
```
If I switch to CPU, it works fine. Does anyone know why this happens?
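In case it helps with isolating the issue, here is a self-contained sketch of the same path without the Hugging Face backbones (mean-pool -> Linear/LayerNorm -> normalize -> einsum -> symmetric cross-entropy -> backward). The shapes are illustrative, and I have not verified whether this minimal version also fails on MPS:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

# Mirrors the failing path without the HF models: mean-pool -> Linear/LayerNorm
# -> normalize -> einsum -> symmetric cross-entropy -> backward.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

head = nn.Sequential(nn.Linear(768, 512), nn.LayerNorm(512)).to(device)

hidden = torch.randn(4, 16, 768, device=device, requires_grad=True)  # stand-in for last_hidden_state
pooled = hidden.mean(dim=1)
emb_img = F.normalize(head(pooled), p=2, dim=1)
emb_txt = F.normalize(torch.randn(4, 512, device=device), p=2, dim=1)

logits = torch.einsum('nc,mc->nm', [emb_img, emb_txt])
labels = torch.arange(4, device=device)
loss = (F.cross_entropy(logits, labels) + F.cross_entropy(logits.t(), labels)) / 2
loss.backward()
print("backward finished on", device)
```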
Versions
torch 2.5.1
macOS 15.1