# Custom GPT-2 Model Implementation

The final linear projection produces logits over the vocabulary, which can be used for text generation or for computing the cross-entropy loss when targets are provided.

Additionally, the class includes a `from_pretrained` method that lets users load pretrained weights from Hugging Face models. This involves mapping and aligning parameters from the Hugging Face model to the custom GPT-2 model, ensuring compatibility and functionality. Overall, the `GPT` class encapsulates the architecture and operations needed to train and deploy a powerful language model.

```python
import torch
import torch.nn as nn
from torch.nn import functional as F


class GPT(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),  # token embeddings
            wpe = nn.Embedding(config.block_size, config.n_embd),  # positional embeddings
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = nn.LayerNorm(config.n_embd),  # final layer norm
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        # weight sharing: the token embedding and the output projection use one tensor
        self.transformer.wte.weight = self.lm_head.weight
        # init params
        self.apply(self._init_weights)
```
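As a quick sanity check of the weight-sharing scheme (a minimal sketch; it assumes the `Block` and `GPTConfig` classes defined earlier in the document, with `GPTConfig` providing default hyperparameters):

```python
# hypothetical smoke test for the weight-sharing scheme
model = GPT(GPTConfig())  # assumes GPTConfig supplies default hyperparameters
assert model.transformer.wte.weight is model.lm_head.weight
# parameters() deduplicates shared tensors, so the tied weight is counted once
print(sum(p.numel() for p in model.parameters()))
```

Tying these two matrices saves roughly `vocab_size * n_embd` parameters (about 38M for the 124M configuration), since a single tensor serves both to read tokens in and to project hidden states back out to the vocabulary.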

```python
    # initialize the weights, taken from the original gpt2 model
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            std = 0.02
            # residual projections get a scaled-down init (two residual adds per layer)
            if hasattr(module, 'GPT_SCALE_UNIT'):
                std *= (2 * self.config.n_layer) ** -0.5
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
```
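The `(2 * n_layer) ** -0.5` factor controls variance growth along the residual stream: each of the `n_layer` blocks adds two residual contributions (attention and MLP), so without scaling the activations' standard deviation grows like the square root of the number of additions. A small illustration (purely a numeric sketch, not part of the model):

```python
# illustrative only: variance accumulation in an unscaled residual stream
x = torch.zeros(768)
n_layer = 12
for _ in range(2 * n_layer):   # two residual additions per layer
    x = x + torch.randn(768)   # each addition contributes roughly unit variance
print(x.std())                 # ~ sqrt(24) ≈ 4.9; the (2*n_layer)**-0.5 init cancels this growth
```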

```python
    def forward(self, idx, targets=None):
        B, T = idx.size()
        assert T <= self.config.block_size, "Cannot forward, model block size is exhausted"
        pos = torch.arange(0, T, dtype=torch.long, device=idx.device)  # (T)
        pos_emb = self.transformer.wpe(pos)  # (T, n_embd)
        tok_emb = self.transformer.wte(idx)  # (B, T, n_embd)
        x = tok_emb + pos_emb
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        loss = None
        if targets is not None:
            # flatten to (B*T, vocab_size) vs (B*T,) for cross entropy
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss
```
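A minimal smoke test of the forward pass (again assuming `GPTConfig` defaults, here with `vocab_size=50257`):

```python
# hypothetical shape check for the forward pass
model = GPT(GPTConfig())
idx = torch.randint(0, 50257, (2, 8))   # batch of B=2 sequences, T=8 tokens each
logits, loss = model(idx, targets=idx)  # self-targets, just to exercise the loss path
print(logits.shape)                     # torch.Size([2, 8, 50257])
print(loss)                             # scalar tensor, ~ ln(50257) ≈ 10.8 at init
```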

```python
    @classmethod
    def from_pretrained(cls, model_type):
        """Load pretrained model weights from Huggingface"""
        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
        from transformers import GPT2LMHeadModel
        print(f"Loading {model_type} weights...")

        config_args = {
            'gpt2':        dict(n_layer=12, n_head=12, n_embd=768),   # 124M params
            'gpt2-medium': dict(n_layer=24, n_head=16, n_embd=1024),  # 350M params
            'gpt2-large':  dict(n_layer=36, n_head=20, n_embd=1280),  # 774M params
            'gpt2-xl':     dict(n_layer=48, n_head=25, n_embd=1600),  # 1558M params
        }[model_type]
        config_args['vocab_size'] = 50257  # GPT2 vocab size
        config_args['block_size'] = 1024   # GPT2 block size

        config = GPTConfig(**config_args)
        model = GPT(config)
        sd = model.state_dict()
        sd_keys = sd.keys()
        # drop the causal-mask buffer, it is not a learnable parameter
        sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')]

        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
        sd_hf = model_hf.state_dict()

        sd_keys_hf = sd_hf.keys()
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')]
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')]
        # the HF checkpoint uses Conv1D modules, so these weights must be transposed
        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight',
                      'mlp.c_fc.weight', 'mlp.c_proj.weight']
        assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
        for k in sd_keys_hf:
            if any(k.endswith(w) for w in transposed):
                assert sd_hf[k].shape[::-1] == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k].t())
            else:
                assert sd_hf[k].shape == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k])

        return model
```
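With that in place, loading the pretrained 124M model and sampling from it takes only a few lines. The sketch below assumes `tiktoken` for GPT-2's BPE tokenizer (an assumption; any GPT-2-compatible tokenizer would do) and uses simple greedy decoding:

```python
import tiktoken  # assumption: tokenizer library, not shown elsewhere in this document

enc = tiktoken.get_encoding("gpt2")
model = GPT.from_pretrained("gpt2")
model.eval()

idx = torch.tensor([enc.encode("Hello, I am")], dtype=torch.long)  # (1, T)
with torch.no_grad():
    for _ in range(10):
        logits, _ = model(idx)
        next_id = logits[:, -1, :].argmax(dim=-1, keepdim=True)  # greedy: most likely next token
        idx = torch.cat([idx, next_id], dim=1)
print(enc.decode(idx[0].tolist()))
```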
