This is the call I made:

enc_net = VanillaRNN(embed_dim, hidden_dim)
dec_net = VanillaRNN(embed_dim, hidden_dim)
rnn_net = RNNTranslator(eng_vocab.vocab_size, fra_vocab.vocab_size, embed_dim, hidden_dim,
                        enc_net, dec_net)
trained_rnn_net = train_model(rnn_net, train_iter, val_iter, lr, epochs, DEVICE,
                              run_name="VanillaRNN-VanillaRNN-baseline")

But when I run it I get this error:
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[27], line 1
----> 1 trained_rnn_net = train_model(rnn_net, train_iter, val_iter, lr, epochs, DEVICE,
                                      run_name="VanillaRNN-VanillaRNN-baseline")

Cell In[23], line 52, in train_model(model, train_iter, val_iter, lr, epochs, device, run_name)
     49 optimizer.zero_grad()  # Reset gradients
     51 # Forward pass: pass source_seq, source_lengths, target_seq[:, :-1], and target_lengths
---> 52 outputs, _ = model(source_seq, source_lengths, target_seq[:, :-1], target_lengths)  # Pass all 4 arguments
     54 # Calculate the loss, ignoring BOS token and PAD token
     55 loss = criterion(outputs.view(-1, outputs.size(-1)), target_seq[:, 1:].view(-1))  # Exclude BOS token

File ~/miniconda3/envs/ddpm/lib/python3.10/site-packages/torch/nn/modules/module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
   1516     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1517 else:
-> 1518     return self._call_impl(*args, **kwargs)

File ~/miniconda3/envs/ddpm/lib/python3.10/site-packages/torch/nn/modules/module.py:1527, in Module._call_impl(self, *args, **kwargs)
   1522 # If we don't have any hooks, we want to skip the rest of the logic in
   1523 # this function, and just call forward.
   1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1525         or _global_backward_pre_hooks or _global_backward_hooks
   1526         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527     return forward_call(*args, **kwargs)
   1529 try:
   1530     result = None

Cell In[20], line 46, in RNNTranslator.forward(self, source_seq, source_lengths, target_seq, target_lengths)
     42 initial_hidden, initial_cell = self.encoder(source_seq, source_lengths)  # Encoder returns hidden and cell states
     44 # Step 2: Decode the target sequence using the decoder
     45 # During training, we use teacher forcing (target_seq is available)
---> 46 logits, _ = self.decoder(initial_hidden, initial_cell, target_seq)
     52 return logits

File ~/miniconda3/envs/ddpm/lib/python3.10/site-packages/torch/nn/modules/module.py:1518 and :1527
    [same Module._wrapped_call_impl / Module._call_impl frames as above]

Cell In[19], line 41, in RNNDecoder.forward(self, initial_hidden, initial_cell, target_seq)
     39 for t in range(seq_length):
     40     input_t = self.embedding(target_seq[:, t])  # Shape: (batch_size, embed_dim)
---> 41     hidden_state, _ = self.rnn(input_t.unsqueeze(1), hidden_state)  # Forward through RNN
     42     output_t = self.custom_linear(hidden_state)  # Compute the output
     43     outputs.append(output_t)

File ~/miniconda3/envs/ddpm/lib/python3.10/site-packages/torch/nn/modules/module.py:1518 and :1527
    [same Module._wrapped_call_impl / Module._call_impl frames as above]

Cell In[13], line 74, in VanillaRNN.forward(self, x, h_0, valid_len, **kwargs)
     67 h_t = self.rnn_cell(x_t, h_t)
---> 74 h_seq[:, t, :] = h_t
     76 # If valid_len is provided, apply masking
     77 if valid_len is not None:
     78     # Create a mask for the current time step

RuntimeError: expand(torch.cuda.FloatTensor{[64, 64, 512]}, size=[64, 512]): the number of sizes provided (2) must be greater or equal to the number of dimensions in the tensor (3)
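The message says it tried to expand a [64, 64, 512] tensor to [64, 512], so h_t has apparently become 3-D even though my batch size is 64 and hidden_dim is 512. A minimal snippet (my own reduction, no model involved) reproduces the same error:

import torch

h_seq = torch.zeros(64, 10, 512)   # (batch_size, seq_length, hidden_size)
h_t = torch.zeros(64, 64, 512)     # 3-D by accident, instead of (64, 512)
h_seq[:, 0, :] = h_t               # RuntimeError: expand(torch.FloatTensor{[64, 64, 512]}, size=[64, 512]) ...

So somewhere a 3-D tensor is being fed back in where VanillaRNN expects a 2-D hidden state.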
My VanillaRNN code:
class VanillaRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        """
        Constructor for the VanillaRNN class.
        Args:
            input_size (int): The size (number of features) of the input data at each time step.
            hidden_size (int): The size (number of features) of the hidden state.
        Description:
            Initializes the VanillaRNN model, which processes sequential input data across multiple time steps.
            This class uses the previously defined VanillaRNNCell to process one time step at a time.
            The RNN will maintain a hidden state of size `hidden_size` throughout the sequence.
        """
        super(VanillaRNN, self).__init__()
        ####################################################################################
        # TODO: Implement the VanillaRNN constructor
        ####################################################################################
        # Your implementation code
        # Initialize the VanillaRNNCell with input and hidden sizes
        self.rnn_cell = VanillaRNNCell(input_size, hidden_size)
        self.hidden_size = hidden_size
        ####################################################################################
        # END OF YOUR CODE
        ####################################################################################
    def forward(self, x, h_0=None, valid_len=None, **kwargs):
        """
        Forward pass for the entire sequence.
        Args:
            x (Tensor): Input sequence of shape (batch_size, seq_length, input_size).
            h_0 (Tensor, optional): Initial hidden state of shape (batch_size, hidden_size).
            valid_len (Tensor): Tensor of shape (batch_size,) containing the lengths of sequences before padding.
        Returns:
            h_seq (Tensor): Hidden states at each time step, of shape (batch_size, seq_length, hidden_size).
            h_t (Tensor): The last hidden state for the last time step, of shape (batch_size, hidden_size).
        """
        batch_size, seq_length, _ = x.size()
        ####################################################################################
        # TODO: Implement the VanillaRNN forward pass. If h_0 is None, then the initial
        # hidden state should be initialized to a 0 tensor. Also, implement the masking
        # based on the given valid_len if it is not None.
        ####################################################################################
        # Your implementation code
        # If h_0 is None, initialize it as a zero tensor of shape (batch_size, hidden_size)
        if h_0 is None:
            h_0 = torch.zeros(batch_size, self.hidden_size).to(x.device)
        # Initialize a tensor to hold all hidden states across the sequence
        h_seq = torch.zeros(batch_size, seq_length, self.hidden_size).to(x.device)
        # Set the initial hidden state
        h_t = h_0
        # Iterate over each time step in the sequence
        for t in range(seq_length):
            # Get the input at time step t
            x_t = x[:, t, :]
            # Perform the forward pass using VanillaRNNCell for time step t
            h_t = self.rnn_cell(x_t, h_t)
            h_seq[:, t, :] = h_t
            # If valid_len is provided, apply masking
            if valid_len is not None:
                # Create a mask for the current time step
                mask = (valid_len > t).float().unsqueeze(1)
                # Update h_t based on the mask
                h_t = h_t * mask + h_0 * (1 - mask)  # Reset to h_0 for invalid lengths
        ####################################################################################
        # END OF YOUR CODE
        ####################################################################################
        return h_seq, h_t
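For reference, this is the contract I implemented (per the docstring), with the same batch/hidden sizes as the failing run; a quick shape check, assuming VanillaRNNCell maps (batch, input_size) x (batch, hidden_size) -> (batch, hidden_size):

rnn = VanillaRNN(embed_dim, 512)
x = torch.randn(64, 10, embed_dim)   # (batch_size, seq_length, input_size)
h_seq, h_t = rnn(x)
print(h_seq.shape)                   # expected: torch.Size([64, 10, 512])
print(h_t.shape)                     # expected: torch.Size([64, 512])

Note the return order: (all hidden states, last hidden state).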
My RNNTranslator code:
class RNNTranslator(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, embedding_dim, hidden_size, enc_net, dec_net):
        super(RNNTranslator, self).__init__()
        ####################################################################################
        # TODO: Implement the RNNTranslator constructor
        ####################################################################################
        # Your implementation code
        # Encoder: takes the source vocab size and embedding dim, uses the custom RNN net
        self.encoder = RNNEncoder(src_vocab_size, embedding_dim, hidden_size, enc_net)
        # Decoder: takes the target vocab size and uses the custom RNN net
        self.decoder = RNNDecoder(tgt_vocab_size, embedding_dim, hidden_size, dec_net)
        ####################################################################################
        # END OF YOUR CODE
        ####################################################################################
    def forward(self, source_seq, source_lengths, target_seq, target_lengths):
        """
        Args:
            source_seq (Tensor): Source sequences (batch_size, source_sequence_length).
            source_lengths (Tensor): Lengths of source sequences before padding (batch_size,).
            target_seq (Tensor): Target sequences (batch_size, target_sequence_length).
            target_lengths (Tensor): Lengths of target sequences before padding (batch_size,).
        Returns:
            logits (Tensor): Logits of shape (batch_size, tgt_seq_len, tgt_vocab_size), representing
                predicted token scores.
        """
        batch_size = source_seq.size(0)
        ####################################################################################
        # TODO: Implement the RNNTranslator forward pass by running the forward pass of the
        # Encoder followed by the Decoder.
        ####################################################################################
        # Your implementation code
        # Step 1: Encode the source sequence using the encoder
        initial_hidden, initial_cell = self.encoder(source_seq, source_lengths)  # Encoder returns hidden and cell states
        # Step 2: Decode the target sequence using the decoder
        # During training, we use teacher forcing (target_seq is available)
        logits, _ = self.decoder(initial_hidden, initial_cell, target_seq)
        ####################################################################################
        # END OF YOUR CODE
        ####################################################################################
        return logits
    def predict(self, source_seq, source_lengths):
        """
        Generates translations for the given source sequences.
        Args:
            source_seq (Tensor): Source sequences (batch_size, source_sequence_length).
            source_lengths (Tensor): Lengths of source sequences before padding (batch_size,).
        Returns:
            predicted_tokens (Tensor): Generated token indices (batch_size, max_length).
        """
        batch_size = source_seq.size(0)
        ####################################################################################
        # TODO: Implement the RNNTranslator prediction function. Should be similar to the
        # forward pass but not the same. Note that you should return predicted tokens,
        # not logits.
        ####################################################################################
        # Your implementation code
        # Step 1: Encode the source sequence using the encoder
        initial_hidden, initial_cell = self.encoder(source_seq, source_lengths)
        # Step 2: Generate the target sequence using the decoder (greedy decoding for inference)
        # (RNNDecoder.forward does not take a max_target_length argument; its inference
        # branch reads max_target_length from the enclosing scope)
        predicted_tokens, _ = self.decoder(initial_hidden, initial_cell, target_seq=None)
        ####################################################################################
        # END OF YOUR CODE
        ####################################################################################
        return predicted_tokens
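(Side note: I know I still need to convert the decoder outputs into token indices here, since the decoder returns raw scores. I haven't reached inference yet, though; the crash happens during training.)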
My encoder code:
class RNNEncoder(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, rnn_net):
        super(RNNEncoder, self).__init__()
        """
        Args:
            vocab_size (int): Number of unique words in the source vocabulary.
            embed_dim (int): Dimension of the word embeddings.
            hidden_dim (int): Dimension of the hidden state in the RNN.
            rnn_net: Any of the RNN variants already implemented.
        """
        ####################################################################################
        # TODO: Implement the RNNEncoder constructor. You can use `nn.Embedding` for the
        # embedding layer.
        ####################################################################################
        # Your implementation code
        # Embedding layer: maps word indices to dense vectors
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # RNN layer: can be a vanilla RNN, LSTM, or GRU variant
        self.rnn_net = rnn_net
        self.hidden_dim = hidden_dim
        ####################################################################################
        # END OF YOUR CODE
        ####################################################################################

    def forward(self, input_seq, valid_len):
        """
        Args:
            input_seq (Tensor): Tensor of shape (batch_size, sequence_length) containing word indices.
            valid_len (Tensor): Tensor of shape (batch_size,) containing the lengths of sequences before padding.
        Returns:
            final_hidden (Tensor): Tensor of shape (batch_size, hidden_dim) representing the final hidden states.
            final_cell (Tensor or None): Either
                1. a tensor of shape (batch_size, hidden_dim) representing the final cell states if rnn_net is an LSTM, or
                2. None if rnn_net is not an LSTM.
        """
        ####################################################################################
        # TODO: Implement the RNNEncoder forward pass. You must embed the input sequence
        # before putting it into the RNN layer. Remember, the output of the RNN layer can
        # differ depending on the type of RNN. Your code must be able to handle all of the
        # implemented RNN variants.
        ####################################################################################
        # Your implementation code
        # Get the batch size and sequence length
        batch_size, seq_length = input_seq.size()
        # Embed the input sequence: (batch_size, seq_length, embed_dim)
        embedded_seq = self.embedding(input_seq)
        # Forward pass through the RNN
        if isinstance(self.rnn_net, nn.LSTM):
            # LSTM returns both hidden and cell states
            rnn_output, (final_hidden, final_cell) = self.rnn_net(embedded_seq, valid_len=valid_len)
        else:
            # GRU / vanilla RNN only return the hidden state
            rnn_output, final_hidden = self.rnn_net(embedded_seq, valid_len=valid_len)
            final_cell = None  # No cell state for non-LSTM RNNs
        ####################################################################################
        # END OF YOUR CODE
        ####################################################################################
        return final_hidden, final_cell
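Given VanillaRNN's (h_seq, h_t) return order, the else branch above should hand the decoder a 2-D final_hidden. This is what I expect (same assumed sizes as the failing run, rnn_net being the RNNTranslator instance from the top):

final_hidden, final_cell = rnn_net.encoder(source_seq, source_lengths)
print(final_hidden.shape)   # expected: torch.Size([64, 512]) -- 2-D, per the docstring
print(final_cell)           # expected: None for the VanillaRNN branch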
My decoder code:
class RNNDecoder(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, rnn_net):
        super(RNNDecoder, self).__init__()
        """
        Args:
            vocab_size (int): Number of unique words in the target vocabulary.
            embed_dim (int): Dimension of the word embeddings.
            hidden_dim (int): Dimension of the hidden state in the RNN.
        """
        self.embedding = nn.Embedding(vocab_size, embed_dim)                # Embedding layer
        self.rnn = rnn_net
        self.output_W = nn.Parameter(torch.randn(hidden_dim, vocab_size))  # Custom linear layer weights
        self.output_b = nn.Parameter(torch.zeros(vocab_size))              # Custom linear layer bias

    def custom_linear(self, hidden_state):
        # Ensure the hidden state is 2D for matrix multiplication
        return hidden_state @ self.output_W + self.output_b

    def forward(self, initial_hidden, initial_cell=None, target_seq=None):
        """
        Args:
            initial_hidden (Tensor): Tensor of shape (batch_size, hidden_dim).
            initial_cell (Tensor or None): Either a tensor of shape (batch_size, hidden_dim) for LSTM or None for RNN.
            target_seq (Tensor): Tensor of shape (batch_size, target_sequence_length) containing word indices.
        Returns:
            outputs (Tensor): Tensor of shape (batch_size, target_sequence_length, vocab_size) with raw scores.
            hidden_state (Tensor): Tensor containing the final hidden state.
        """
        batch_size = initial_hidden.size(0)
        if target_seq is not None:
            # Teacher forcing mode
            seq_length = target_seq.size(1)
            outputs = []
            hidden_state = initial_hidden
            for t in range(seq_length):
                input_t = self.embedding(target_seq[:, t])                      # Shape: (batch_size, embed_dim)
                hidden_state, _ = self.rnn(input_t.unsqueeze(1), hidden_state)  # Forward through RNN
                output_t = self.custom_linear(hidden_state)                     # Compute the output
                outputs.append(output_t)
            outputs = torch.stack(outputs, dim=1)  # Shape: (batch_size, seq_length, vocab_size)
            return outputs, hidden_state
        else:
            # Inference mode
            outputs = []
            input_t = torch.full((batch_size,), BOS_TOKEN, dtype=torch.long,
                                 device=initial_hidden.device)  # Start with the BOS token
            hidden_state = initial_hidden
            for _ in range(max_target_length):
                input_t_embedded = self.embedding(input_t)                               # Shape: (batch_size, embed_dim)
                hidden_state, _ = self.rnn(input_t_embedded.unsqueeze(1), hidden_state)  # Forward through RNN
                output_t = self.custom_linear(hidden_state)                              # Compute the output
                outputs.append(output_t)
                # Greedy decoding: use the predicted token as the next input
                input_t = output_t.argmax(dim=1)  # Get the predicted token (batch_size,)
            outputs = torch.stack(outputs, dim=1)  # Shape: (batch_size, max_target_length, vocab_size)
            return outputs, hidden_state
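Re-reading the failing line, I wonder whether the problem is the unpacking order: the decoder treats self.rnn like nn.RNN (output first, hidden second), but my VanillaRNN returns (h_seq, h_t). If so, hidden_state ends up being the 3-D h_seq of shape (batch, 1, hidden), which presumably broadcasts against a 2-D term inside VanillaRNNCell on the next step and produces the (64, 64, 512) tensor from the error. Is something like this (untested sketch) the right kind of fix, and where does it belong, given that only the TODO sections are mine to edit?

# untested sketch -- swap the unpacking so the 2-D last state is carried forward:
_, hidden_state = self.rnn(input_t.unsqueeze(1), hidden_state)  # h_t is the SECOND return value
output_t = self.custom_linear(hidden_state)                     # then (batch_size, vocab_size)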
My train_model code:
import torch
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm

def train_model(model, train_iter, val_iter, lr, epochs, device, run_name="experiment-1"):
    """
    Trains a Translator model (RNN or Transformer-based).
    Args:
        model (nn.Module): The Translator model to train.
        train_iter (DataLoader): Training data iterator.
        val_iter (DataLoader): Validation data iterator.
        lr (float): Learning rate.
        epochs (int): Number of epochs to train.
        device (device): Device used to train.
        run_name (str): Run name to log on TensorBoard.
    """
    model.to(device)
    #####################################################################
    # TODO:
    # Set up the optimizer and loss function
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    PAD_TOKEN = 0  # Assuming 0 is the PAD token; adjust if necessary
    BOS_TOKEN = 1  # Assuming 1 is the BOS token; adjust if necessary
    criterion = nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)  # Ignore padding tokens
    # Set up the TensorBoard writer
    writer = SummaryWriter(log_dir=f'runs/{run_name}')
    for epoch in range(1, epochs + 1):
        # Training phase
        model.train()
        total_train_loss = 0
        total_train_batches = 0
        # Iterate over training batches
        for _, train_data in tqdm(enumerate(train_iter), desc=f"Epoch {epoch} Training"):
            # Unpack the training data
            source_seq, source_lengths, target_seq, target_lengths = train_data
            # Move tensors to the appropriate device only if necessary
            source_seq = source_seq.to(device) if not source_seq.is_cuda else source_seq
            source_lengths = source_lengths.to(device) if not source_lengths.is_cuda else source_lengths
            target_seq = target_seq.to(device) if not target_seq.is_cuda else target_seq
            target_lengths = target_lengths.to(device) if not target_lengths.is_cuda else target_lengths
            optimizer.zero_grad()  # Reset gradients
            # Forward pass: pass source_seq, source_lengths, target_seq[:, :-1], and target_lengths
            outputs, _ = model(source_seq, source_lengths, target_seq[:, :-1], target_lengths)  # Pass all 4 arguments
            # Calculate the loss, ignoring the BOS and PAD tokens
            loss = criterion(outputs.view(-1, outputs.size(-1)), target_seq[:, 1:].view(-1))  # Exclude BOS token
            # Backpropagation
            loss.backward()
            optimizer.step()
            total_train_loss += loss.item()
            total_train_batches += 1
        average_train_loss = total_train_loss / total_train_batches
        # Validation phase
        model.eval()
        total_val_loss = 0
        total_val_batches = 0
        with torch.no_grad():
            for _, val_data in tqdm(enumerate(val_iter), desc=f"Epoch {epoch} Validation"):
                source_seq, source_lengths, target_seq, target_lengths = val_data
                # Move tensors to the appropriate device only if necessary
                source_seq = source_seq.to(device) if not source_seq.is_cuda else source_seq
                source_lengths = source_lengths.to(device) if not source_lengths.is_cuda else source_lengths
                target_seq = target_seq.to(device) if not target_seq.is_cuda else target_seq
                target_lengths = target_lengths.to(device) if not target_lengths.is_cuda else target_lengths
                # Forward pass (excluding the last token from the decoder input)
                outputs, _ = model(source_seq, source_lengths, target_seq[:, :-1], target_lengths)
                # Calculate the validation loss, ignoring the BOS and PAD tokens
                loss = criterion(outputs.view(-1, outputs.size(-1)), target_seq[:, 1:].view(-1))
                total_val_loss += loss.item()
                total_val_batches += 1
        average_val_loss = total_val_loss / total_val_batches
        # End of TODO #######################################
        # Log to TensorBoard
        writer.add_scalar('Loss/Train', average_train_loss, epoch)
        writer.add_scalar('Loss/Validation', average_val_loss, epoch)
        # Print epoch summary
        print(f"""Epoch {epoch}/{epochs}
        Train Loss: {average_train_loss:.4f}
        Validation Loss: {average_val_loss:.4f}\n""")
    # Close the TensorBoard writer
    writer.close()
    return model
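One more thing I noticed while cleaning this up: train_model unpacks two values from the model call, but RNNTranslator.forward returns only logits, so even once the shape error is fixed I suspect this would fail next:

# RNNTranslator.forward ends with:
#     return logits            # a single tensor
# but train_model calls:
#     outputs, _ = model(...)  # expects two values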
Can you correct the code that causes the problem, changing only the TODO sections?