-
Notifications
You must be signed in to change notification settings - Fork 4.7k
Description
Describe the problem
I'm seeing this error when I load a checkpoint to resume training using zero optimization stage 0:
[rank0]: Traceback (most recent call last):
[rank0]: File "/home/s5b/ventilean.s5b/projects/10k_human_pbmc/TRAINING/src/deepspeed/deepspeed_training_for_testing.py", line 604, in <module>
[rank0]: train_chromatin_predictor_model(args)
[rank0]: File "/lus/lfs1aip2/scratch/s5b/ventilean.s5b/miniforge3/envs/pytorch-deepspeed/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
[rank0]: return f(*args, **kwargs)
[rank0]: File "/home/s5b/ventilean.s5b/projects/10k_human_pbmc/TRAINING/src/deepspeed/deepspeed_training_for_testing.py", line 488, in train_chromatin_predictor_model
[rank0]: trainer = ChromatinPredictorTrainer(
[rank0]: File "/home/s5b/ventilean.s5b/projects/10k_human_pbmc/TRAINING/src/deepspeed/deepspeed_training_for_testing.py", line 168, in __init__
[rank0]: self.checkpointer.load_snapshot_zero_stage0()
[rank0]: File "/home/s5b/ventilean.s5b/projects/10k_human_pbmc/TRAINING/src/deepspeed/ds_checkpoints_and_logs.py", line 169, in load_snapshot_zero_stage0
[rank0]: load_path, client_state = self.engine.load_checkpoint(
[rank0]: File "/lus/lfs1aip2/scratch/s5b/ventilean.s5b/miniforge3/envs/pytorch-deepspeed/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 3115, in load_checkpoint
[rank0]: self.optimizer._restore_from_bit16_weights()
[rank0]: AttributeError: 'FP16_Optimizer' object has no attribute '_restore_from_bit16_weights'
ds_config.json
{
"train_micro_batch_size_per_gpu": 4,
"gradient_accumulation_steps": 2,
"gradient_clipping": 0.8,
"train_batch_size": 64,
"bf16": { "enabled": true },
"fp16": {
"enabled": false,
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"optimizer": {
"type": "Adam",
"params": {
"lr": 1e-3,
"betas": [0.8, 0.999],
"eps": 1e-8,
"weight_decay": 1e-6
}
},
"scheduler": {
"type": "WarmupCosineLR",
"params": {
"total_num_steps": 590,
"warmup_min_ratio": 1e-8,
"warmup_num_steps": 29
}
},
"zero_optimization": {
"stage": 0,
"offload_param": {
"device": "none",
"pin_memory": true
},
"offload_optimizer": {
"device": "none",
"pin_memory": true
},
"contiguous_gradients": false,
"overlap_comm": false
},
"activation_checkpointing": {
"partition_activations": false,
"contiguous_memory_optimization": false
},
"wall_clock_breakdown": false,
"comms_logger": {
"enabled": true,
"verbose": false,
"prof_all": true,
"debug": false
}
}
Relevant code
load_path, client_state = self.engine.load_checkpoint(
load_dir=snapshot_folder,
tag='last_snapshot',
load_module_strict=False,
load_optimizer_states=True,
load_lr_scheduler_states=True,
load_module_only=False
)
# Update training state
self.start_epoch = client_state['epoch'] + 1
self.best_avg_val_loss = client_state['best_avg_val_loss']
self.best_epoch = client_state['best_epoch']