
AttributeError: 'FP16_Optimizer' object has no attribute '_restore_from_bit16_weights' #7571

@leandro-ventimiglia

Description

Describe the problem
I'm seeing this error when I load a checkpoint to resume training with ZeRO optimization stage 0 and bf16 enabled:

[rank0]: Traceback (most recent call last):
[rank0]: File "/home/s5b/ventilean.s5b/projects/10k_human_pbmc/TRAINING/src/deepspeed/deepspeed_training_for_testing.py", line 604, in
[rank0]: train_chromatin_predictor_model(args)
[rank0]: File "/lus/lfs1aip2/scratch/s5b/ventilean.s5b/miniforge3/envs/pytorch-deepspeed/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/init.py", line 355, in wrapper
[rank0]: return f(*args, **kwargs)
[rank0]: File "/home/s5b/ventilean.s5b/projects/10k_human_pbmc/TRAINING/src/deepspeed/deepspeed_training_for_testing.py", line 488, in train_chromatin_predictor_model
[rank0]: trainer = ChromatinPredictorTrainer(
[rank0]: File "/home/s5b/ventilean.s5b/projects/10k_human_pbmc/TRAINING/src/deepspeed/deepspeed_training_for_testing.py", line 168, in init
[rank0]: self.checkpointer.load_snapshot_zero_stage0()
[rank0]: File "/home/s5b/ventilean.s5b/projects/10k_human_pbmc/TRAINING/src/deepspeed/ds_checkpoints_and_logs.py", line 169, in load_snapshot_zero_stage0
[rank0]: load_path, client_state = self.engine.load_checkpoint(
[rank0]: File "/lus/lfs1aip2/scratch/s5b/ventilean.s5b/miniforge3/envs/pytorch-deepspeed/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 3115, in load_checkpoint
[rank0]: self.optimizer._restore_from_bit16_weights()
[rank0]: AttributeError: 'FP16_Optimizer' object has no attribute '_restore_from_bit16_weights'
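
To help triage, here is a minimal sketch of a script that should exercise the same load path (the toy Linear model, the "ckpt" directory, and the trimmed-down config are placeholders, not my actual training code):

import torch
import deepspeed

# Placeholder model; the failure is in the checkpoint load path, so any
# nn.Module should do.
model = torch.nn.Linear(16, 16)

# Trimmed-down config keeping the relevant pieces: bf16 on, ZeRO stage 0.
ds_config = {
    "train_micro_batch_size_per_gpu": 4,
    "gradient_accumulation_steps": 2,
    "bf16": {"enabled": True},
    "zero_optimization": {"stage": 0},
    "optimizer": {"type": "Adam", "params": {"lr": 1e-3}},
}

engine, _, _, _ = deepspeed.initialize(model=model,
                                       model_parameters=model.parameters(),
                                       config=ds_config)

engine.save_checkpoint("ckpt", tag="last_snapshot")

# This is where my run dies: load_checkpoint() ends up calling
# self.optimizer._restore_from_bit16_weights() (engine.py line 3115 in the
# traceback), which the FP16_Optimizer wrapper does not define.
engine.load_checkpoint("ckpt", tag="last_snapshot")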

ds_config.json
{
  "train_micro_batch_size_per_gpu": 4,
  "gradient_accumulation_steps": 2,
  "gradient_clipping": 0.8,
  "train_batch_size": 64,

  "bf16": { "enabled": true },

  "fp16": {
    "enabled": false,
    "loss_scale": 0,
    "loss_scale_window": 1000,
    "initial_scale_power": 16,
    "hysteresis": 2,
    "min_loss_scale": 1
  },

  "optimizer": {
    "type": "Adam",
    "params": {
      "lr": 1e-3,
      "betas": [0.8, 0.999],
      "eps": 1e-8,
      "weight_decay": 1e-6
    }
  },

  "scheduler": {
    "type": "WarmupCosineLR",
    "params": {
      "total_num_steps": 590,
      "warmup_min_ratio": 1e-8,
      "warmup_num_steps": 29
    }
  },

  "zero_optimization": {
    "stage": 0,
    "offload_param": { "device": "none", "pin_memory": true },
    "offload_optimizer": { "device": "none", "pin_memory": true },
    "contiguous_gradients": false,
    "overlap_comm": false
  },

  "activation_checkpointing": {
    "partition_activations": false,
    "contiguous_memory_optimization": false
  },

  "wall_clock_breakdown": false,

  "comms_logger": {
    "enabled": true,
    "verbose": false,
    "prof_all": true,
    "debug": false
  }
}
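
Note that fp16 is explicitly disabled and bf16 is enabled, yet with stage 0 the engine still wraps the optimizer in FP16_Optimizer, as the exception shows. A quick way to see which wrapper the engine built (engine being the object returned by deepspeed.initialize):

# Inspect the optimizer wrapper right after deepspeed.initialize
print(type(engine.optimizer).__name__)
# -> 'FP16_Optimizer', consistent with the AttributeError above
print(hasattr(engine.optimizer, "_restore_from_bit16_weights"))
# -> False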

Relevant code

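# This call (line 169 of ds_checkpoints_and_logs.py in the traceback) raises the error: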
load_path, client_state = self.engine.load_checkpoint(
    load_dir=snapshot_folder,
    tag='last_snapshot',
    load_module_strict=False,
    load_optimizer_states=True,
    load_lr_scheduler_states=True,
    load_module_only=False
)

# Update training state
self.start_epoch = client_state['epoch'] + 1
self.best_avg_val_loss = client_state['best_avg_val_loss']
self.best_epoch = client_state['best_epoch']
