assignment-9
November 12, 2024
[2]: import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from collections import Counter
from itertools import chain
# Sample parallel sentences for demonstration (replace with actual data)
source_sentences = ["hello", "how are you", "good morning"]
target_sentences = ["hola", "cómo estás", "buenos días"]
# Vocabulary building function
def build_vocab(sentences):
    counter = Counter(chain.from_iterable(s.split() for s in sentences))
    vocab = {word: idx + 3 for idx, (word, _) in enumerate(counter.most_common())}
    vocab["<pad>"] = 0
    vocab["<sos>"] = 1
    vocab["<eos>"] = 2
    return vocab
# Build vocabulary for source and target languages
source_vocab = build_vocab(source_sentences)
target_vocab = build_vocab(target_sentences)
# Tokenize function
def tokenize(sentence, vocab):
    tokens = ["<sos>"] + sentence.split() + ["<eos>"]
    return [vocab[token] if token in vocab else vocab["<pad>"] for token in tokens]
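# For the sample data above, tokenize("hello", source_vocab) would give
# something like [1, 3, 2] (<sos>, "hello", <eos>); exact ids depend on the
# Counter ordering in build_vocab. Unknown words fall back to <pad>.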
# Prepare data for training
train_data = [(torch.tensor(tokenize(src, source_vocab)), torch.tensor(tokenize(tgt, target_vocab)))
              for src, tgt in zip(source_sentences, target_sentences)]
# Define Dataset and DataLoader
class TranslationDataset(Dataset):
    def __init__(self, data):
        self.data = data
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        return self.data[idx]
dataset = TranslationDataset(train_data)
# Identity collate_fn: each batch stays a list of (src, tgt) pairs; padding is applied in train()
dataloader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=lambda x: x)
# Encoder model
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hidden_dim, n_layers):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, n_layers, batch_first=True)
    def forward(self, src):
        embedded = self.embedding(src)
        outputs, (hidden, cell) = self.lstm(embedded)
        return outputs, hidden, cell
# Attention model
class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super(Attention, self).__init__()
        self.attn = nn.Linear(hidden_dim * 2, hidden_dim)
        self.v = nn.Linear(hidden_dim, 1, bias=False)
    def forward(self, hidden, encoder_outputs):
        src_len = encoder_outputs.shape[1]
        # Use the top-layer hidden state, repeated across all source positions
        hidden = hidden[-1].unsqueeze(1).repeat(1, src_len, 1)
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        attention = self.v(energy).squeeze(2)
        return torch.softmax(attention, dim=1)
# Decoder model with attention
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hidden_dim, n_layers, attention):
        super(Decoder, self).__init__()
        self.output_dim = output_dim
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.lstm = nn.LSTM(emb_dim + hidden_dim, hidden_dim, n_layers, batch_first=True)
        self.fc_out = nn.Linear(hidden_dim * 2, output_dim)
        self.attention = attention
    def forward(self, tgt, hidden, cell, encoder_outputs):
        tgt = tgt.unsqueeze(1)
        embedded = self.embedding(tgt)
        attn_weights = self.attention(hidden, encoder_outputs)
        context = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs)
        lstm_input = torch.cat((embedded, context), dim=2)
        output, (hidden, cell) = self.lstm(lstm_input, (hidden, cell))
        prediction = self.fc_out(torch.cat((output, context), dim=2).squeeze(1))
        return prediction, hidden, cell
# Seq2Seq model combining encoder and decoder
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
    def forward(self, src, tgt):
        encoder_outputs, hidden, cell = self.encoder(src)
        outputs = torch.zeros(tgt.shape[0], tgt.shape[1], self.decoder.output_dim).to(self.device)
        input = tgt[:, 0]
        for t in range(1, tgt.shape[1]):
            output, hidden, cell = self.decoder(input, hidden, cell, encoder_outputs)
            outputs[:, t] = output
            input = output.argmax(1)
        return outputs
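# Note: the decoder is always fed its own greedy prediction (input = output.argmax(1)),
# i.e. no teacher forcing is applied here; the ground-truth token tgt[:, t] could
# optionally be fed instead with some probability.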
# Hyperparameters and model initialization
INPUT_DIM = len(source_vocab)
OUTPUT_DIM = len(target_vocab)
EMB_DIM = 256
HIDDEN_DIM = 512
N_LAYERS = 2
encoder = Encoder(INPUT_DIM, EMB_DIM, HIDDEN_DIM, N_LAYERS)
attention = Attention(HIDDEN_DIM)
decoder = Decoder(OUTPUT_DIM, EMB_DIM, HIDDEN_DIM, N_LAYERS, attention)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Seq2Seq(encoder, decoder, device).to(device)
# Training setup
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss(ignore_index=target_vocab["<pad>"])
# Training loop
def train(model, dataloader, optimizer, criterion):
    model.train()
    epoch_loss = 0
    for batch in dataloader:
        src, tgt = zip(*batch)
        src = torch.nn.utils.rnn.pad_sequence(src, padding_value=source_vocab["<pad>"], batch_first=True)
        tgt = torch.nn.utils.rnn.pad_sequence(tgt, padding_value=target_vocab["<pad>"], batch_first=True)
        src, tgt = src.to(device), tgt.to(device)
        optimizer.zero_grad()
        output = model(src, tgt)
        output_dim = output.shape[-1]
        output = output[:, 1:].reshape(-1, output_dim)
        tgt = tgt[:, 1:].reshape(-1)
        loss = criterion(output, tgt)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(dataloader)
# Training epochs
for epoch in range(10):
    loss = train(model, dataloader, optimizer, criterion)
    print(f'Epoch {epoch+1}, Loss: {loss:.4f}')
Epoch 1, Loss: 2.0400
Epoch 2, Loss: 1.8393
Epoch 3, Loss: 1.5252
Epoch 4, Loss: 1.2470
Epoch 5, Loss: 0.8623
Epoch 6, Loss: 0.5609
Epoch 7, Loss: 0.4295
Epoch 8, Loss: 0.1870
Epoch 9, Loss: 0.2638
Epoch 10, Loss: 0.1014
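To sanity-check the trained model on a single source sentence, a greedy decoding loop along the lines of the sketch below could be added as a further cell. The translate helper and inv_target_vocab mapping are illustrative names that do not appear in the assignment code above; the sketch only reuses the encoder, decoder, tokenize, and vocab objects already defined.

# Greedy-decoding sketch (assumed helper, not part of the original cell)
inv_target_vocab = {idx: word for word, idx in target_vocab.items()}

def translate(model, sentence, max_len=10):
    model.eval()
    with torch.no_grad():
        # Encode the source sentence as a batch of size 1
        src = torch.tensor(tokenize(sentence, source_vocab)).unsqueeze(0).to(device)
        encoder_outputs, hidden, cell = model.encoder(src)
        # Start from <sos> and feed back the argmax token at each step
        token = torch.tensor([target_vocab["<sos>"]]).to(device)
        words = []
        for _ in range(max_len):
            output, hidden, cell = model.decoder(token, hidden, cell, encoder_outputs)
            token = output.argmax(1)
            if token.item() == target_vocab["<eos>"]:
                break
            words.append(inv_target_vocab[token.item()])
    return " ".join(words)

print(translate(model, "hello"))  # with the toy data this should, ideally, recover "hola"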