PyTorch Transformer Model Guide

assignment-8

November 12, 2024

[2]: import torch
import torch.nn as nn
import torch.optim as optim
import math

# Define the Transformer model
class TransformerModel(nn.Module):
    def __init__(self, vocab_size, d_model=512, nhead=8, num_encoder_layers=6,
                 num_decoder_layers=6, dim_feedforward=2048, max_seq_length=100):
        super(TransformerModel, self).__init__()
        # Token embedding, sinusoidal positional encoding, and the stock
        # nn.Transformer encoder-decoder stack, followed by a vocabulary projection
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)
        self.transformer = nn.Transformer(d_model, nhead, num_encoder_layers,
                                          num_decoder_layers, dim_feedforward)
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, src, tgt, src_mask=None, tgt_mask=None,
                src_padding_mask=None, tgt_padding_mask=None,
                memory_key_padding_mask=None):
        # Scale the embeddings by sqrt(d_model) before adding positional encodings
        src = self.embedding(src) * math.sqrt(self.embedding.embedding_dim)
        tgt = self.embedding(tgt) * math.sqrt(self.embedding.embedding_dim)
        src = self.positional_encoding(src)
        tgt = self.positional_encoding(tgt)
        # Pass the masks by keyword so the padding masks do not get bound to
        # nn.Transformer's memory_mask parameter by position
        transformer_out = self.transformer(
            src, tgt,
            src_mask=src_mask, tgt_mask=tgt_mask,
            src_key_padding_mask=src_padding_mask,
            tgt_key_padding_mask=tgt_padding_mask,
            memory_key_padding_mask=memory_key_padding_mask
        )
        # Project the decoder output back onto the vocabulary
        output = self.fc_out(transformer_out)
        return output


class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_length=100, dropout=0.1):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                             (-math.log(10000.0) / d_model))
        # Even dimensions get sine, odd dimensions get cosine
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)  # shape: (max_seq_length, 1, d_model)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x: (seq_len, batch, d_model); add the encoding for the first seq_len positions
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)
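# PositionalEncoding implements the standard sinusoidal scheme:
#   PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
#   PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))
# so each dimension oscillates at a different wavelength and every position up to
# max_seq_length receives a unique, deterministic encoding.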

# Helper function to create a causal (square subsequent) mask
def generate_square_subsequent_mask(sz):
    # Allowed positions (lower triangle) become 0.0; future positions become -inf
    mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask
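# For example, generate_square_subsequent_mask(4) produces
#   [[0., -inf, -inf, -inf],
#    [0.,   0., -inf, -inf],
#    [0.,   0.,   0., -inf],
#    [0.,   0.,   0.,   0.]]
# so target position i may only attend to positions <= i.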

# Hyperparameters
vocab_size = 10000  # example vocab size
d_model = 512
max_seq_length = 100

# Instantiate the model, loss function, and optimizer
model = TransformerModel(vocab_size=vocab_size, d_model=d_model,
                         max_seq_length=max_seq_length)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Dummy data for demonstration
src = torch.randint(0, vocab_size, (max_seq_length, 32))  # (sequence length, batch size)
tgt = torch.randint(0, vocab_size, (max_seq_length, 32))  # (sequence length, batch size)

c:\Users\fool0\AppData\Local\Programs\Python\Python312\Lib\site-
packages\torch\nn\modules\transformer.py: UserWarning: enable_nested_tensor
is True, but self.use_nested_tensor is False because
encoder_layer.self_attn.batch_first was not True(use batch_first for better
inference performance)
  warnings.warn(
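The UserWarning above simply notes that nn.Transformer was built with the default
batch_first=False, which is consistent with the (sequence length, batch size) tensors used
here. The cell stops after constructing the dummy tensors; a minimal sketch of one training
step with them might look like the following, assuming teacher forcing with the causal mask
defined above (in a real setup the decoder input would be the target shifted by one token):

# Single training step on the dummy batch (illustrative sketch, not from the assignment)
tgt_mask = generate_square_subsequent_mask(tgt.size(0))  # (seq_len, seq_len) causal mask

model.train()
optimizer.zero_grad()
output = model(src, tgt, tgt_mask=tgt_mask)              # (seq_len, batch, vocab_size)
loss = criterion(output.reshape(-1, vocab_size), tgt.reshape(-1))
loss.backward()
optimizer.step()
print(loss.item())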
