projection produces logits over the vocabulary, which can be used for text generation or for computing the cross-entropy loss when targets are provided.
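To make that interface concrete, here is a minimal usage sketch. It is not part of the original listing and assumes the full GPT class below plus a GPTConfig dataclass (from the earlier section) whose defaults match the 124M GPT-2 configuration:

```python
import torch

B, T = 4, 32                               # hypothetical batch size and sequence length
idx = torch.randint(0, 50257, (B, T))      # stand-in token ids, shape (B, T)
targets = torch.randint(0, 50257, (B, T))  # stand-in next-token targets, shape (B, T)

model = GPT(GPTConfig())                   # randomly initialized, GPT-2 sized model (assumed defaults)
logits, loss = model(idx, targets)
print(logits.shape)  # torch.Size([4, 32, 50257]): one score per vocabulary entry at every position
print(loss.item())   # roughly ln(50257) ≈ 10.8 at initialization, a useful sanity check
```

Calling the model without targets returns the logits and a loss of None, which is the mode used for text generation.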
Additionally, the class includes a from_pretrained method that lets users load pretrained weights from the Hugging Face GPT-2 checkpoints. This involves mapping parameters from the Hugging Face model onto the custom GPT-2 model key by key, and aligning the weights that Hugging Face stores in a transposed layout so every tensor lands with the shape the custom modules expect. Overall, the GPT class encapsulates the architecture and operations needed to train and run a GPT-2 style language model.
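One wrinkle in that alignment, made concrete below: the Hugging Face GPT-2 checkpoint stores the attention and MLP projections in Conv1D modules, whose weight matrices are the transpose of what nn.Linear expects, so four weight matrices per block have to be transposed while copying. A small sketch of the mismatch (shapes shown are for the 124M 'gpt2' model; running it downloads the checkpoint):

```python
import torch.nn as nn
from transformers import GPT2LMHeadModel

hf = GPT2LMHeadModel.from_pretrained('gpt2')
# Hugging Face's Conv1D stores weights as (in_features, out_features) ...
print(hf.state_dict()['transformer.h.0.attn.c_attn.weight'].shape)  # torch.Size([768, 2304])
# ... while nn.Linear stores them as (out_features, in_features)
print(nn.Linear(768, 3 * 768, bias=True).weight.shape)              # torch.Size([2304, 768])
# from_pretrained below therefore copies these four weights per block with .t()
```

The full class, with the forward pass and the loader, follows.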
# relies on torch, torch.nn as nn, torch.nn.functional as F,
# and the Block and GPTConfig classes defined in the earlier sections
class GPT(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),   # token embeddings
            wpe = nn.Embedding(config.block_size, config.n_embd),   # position embeddings
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = nn.LayerNorm(config.n_embd),                     # final layer norm
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # weight sharing scheme: the token embedding and the output head use the same matrix
        self.transformer.wte.weight = self.lm_head.weight

        # initialize the weights, following the original GPT-2 initialization
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            std = 0.02
            if hasattr(module, 'GPT_SCALE_UNIT'):
                # residual-path projections get their init scaled down by 1/sqrt(2 * n_layer)
                std *= (2 * self.config.n_layer) ** -0.5
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        B, T = idx.size()
        assert T <= self.config.block_size, "Cannot forward, model block size is exhausted"
        pos = torch.arange(0, T, dtype=torch.long, device=idx.device)  # (T)
        pos_emb = self.transformer.wpe(pos)  # position embeddings, (T, n_embd)
        tok_emb = self.transformer.wte(idx)  # token embeddings, (B, T, n_embd)
        x = tok_emb + pos_emb
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)  # (B, T, vocab_size)
        loss = None
        if targets is not None:
            # flatten (B, T, vocab_size) logits and (B, T) targets for the cross-entropy loss
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss
    @classmethod
    def from_pretrained(cls, model_type):
        """Load pretrained model weights from Hugging Face"""
        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
        from transformers import GPT2LMHeadModel
        print(f"Loading {model_type} weights...")

        config_args = {
            'gpt2':        dict(n_layer=12, n_head=12, n_embd=768),   # 124M params
            'gpt2-medium': dict(n_layer=24, n_head=16, n_embd=1024),  # 350M params
            'gpt2-large':  dict(n_layer=36, n_head=20, n_embd=1280),  # 774M params
            'gpt2-xl':     dict(n_layer=48, n_head=25, n_embd=1600),  # 1558M params
        }[model_type]
        config_args['vocab_size'] = 50257  # GPT-2 vocab size
        config_args['block_size'] = 1024   # GPT-2 block size
        config = GPTConfig(**config_args)
        model = GPT(config)
        sd = model.state_dict()
        sd_keys = sd.keys()
        sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')]  # drop the causal mask buffer

        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
        sd_hf = model_hf.state_dict()
        sd_keys_hf = sd_hf.keys()
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')]  # just a buffer
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')]         # just the causal mask

        # these weights are stored transposed (Conv1D layout) in the Hugging Face checkpoint
        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
        assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
        for k in sd_keys_hf:
            if any(k.endswith(w) for w in transposed):
                # transpose the Conv1D weights into nn.Linear layout while copying
                assert sd_hf[k].shape[::-1] == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k].t())
            else:
                # vanilla copy for everything else
                assert sd_hf[k].shape == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k])