assignment-8
November 12, 2024
[2]: import torch
import torch.nn as nn
import torch.optim as optim
import math

# Define the Transformer model
class TransformerModel(nn.Module):
    def __init__(self, vocab_size, d_model=512, nhead=8, num_encoder_layers=6,
                 num_decoder_layers=6, dim_feedforward=2048, max_seq_length=100):
        super(TransformerModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)
        self.transformer = nn.Transformer(d_model, nhead, num_encoder_layers,
                                          num_decoder_layers, dim_feedforward)
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, src, tgt, src_mask=None, tgt_mask=None,
                src_padding_mask=None, tgt_padding_mask=None,
                memory_key_padding_mask=None):
        # Scale token embeddings by sqrt(d_model) before adding positions
        src = self.embedding(src) * math.sqrt(self.embedding.embedding_dim)
        tgt = self.embedding(tgt) * math.sqrt(self.embedding.embedding_dim)
        src = self.positional_encoding(src)
        tgt = self.positional_encoding(tgt)
        # Pass the masks by keyword so the padding masks are not
        # silently bound to nn.Transformer's memory_mask parameter
        transformer_out = self.transformer(
            src, tgt, src_mask=src_mask, tgt_mask=tgt_mask,
            src_key_padding_mask=src_padding_mask,
            tgt_key_padding_mask=tgt_padding_mask,
            memory_key_padding_mask=memory_key_padding_mask
        )
        output = self.fc_out(transformer_out)
        return output
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_length=100, dropout=0.1):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        # Precompute the sinusoidal position table once and register it as a buffer
        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)  # even dimensions: sine
        pe[:, 1::2] = torch.cos(position * div_term)  # odd dimensions: cosine
        pe = pe.unsqueeze(0).transpose(0, 1)          # (max_seq_length, 1, d_model)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x: (seq_len, batch, d_model); add the position table and apply dropout
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)
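
# For reference, the table built above implements the standard sinusoidal
# encoding from "Attention Is All You Need":
#   PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
#   PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))
# where div_term above is the precomputed 10000^(-2i / d_model) factor.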
# Helper function to create masks
def generate_square_subsequent_mask(sz):
    # Causal mask: each position may attend only to itself and earlier positions
    mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask
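
# Sketch of the result for sz=3 (not in the original cell): 0.0 marks an
# allowed attention connection and -inf blocks it.
# generate_square_subsequent_mask(3) ->
# tensor([[0., -inf, -inf],
#         [0.,   0., -inf],
#         [0.,   0.,   0.]])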
# Hyperparameters
vocab_size = 10000  # example vocab size
d_model = 512
max_seq_length = 100

# Instantiate the model, loss function, and optimizer
model = TransformerModel(vocab_size=vocab_size, d_model=d_model,
                         max_seq_length=max_seq_length)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Dummy data for demonstration
src = torch.randint(0, vocab_size, (max_seq_length, 32))  # (sequence length, batch size)
tgt = torch.randint(0, vocab_size, (max_seq_length, 32))  # (sequence length, batch size)
c:\Users\fool0\AppData\Local\Programs\Python\Python312\Lib\site-
packages\torch\nn\modules\transformer.py: UserWarning: enable_nested_tensor
is True, but self.use_nested_tensor is False because
encoder_layer.self_attn.batch_first was not True (use batch_first for better
inference performance)
  warnings.warn(
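
The cell stops after building the dummy batch, so the following is a minimal
sketch (not part of the original cell) of one training step that ties the
pieces together: it shifts tgt by one position for teacher forcing, builds the
causal mask with generate_square_subsequent_mask, and runs a single
forward/backward pass. The warning above only notes that nn.Transformer is
using the default batch_first=False, which matches the (sequence length,
batch size) layout of the tensors here.

# One illustrative training step (sketch; all names come from the cell above)
tgt_input = tgt[:-1, :]   # decoder input: drop the last token
tgt_output = tgt[1:, :]   # prediction target: drop the first token

tgt_mask = generate_square_subsequent_mask(tgt_input.size(0))

model.train()
optimizer.zero_grad()
logits = model(src, tgt_input, tgt_mask=tgt_mask)  # (tgt_len, batch, vocab_size)
loss = criterion(logits.reshape(-1, vocab_size), tgt_output.reshape(-1))
loss.backward()
optimizer.step()
print(f"dummy loss: {loss.item():.4f}")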