Init DeepSpeed, after updating the DeepSpeed configuration with any relevant Trainer's args. If `resume_from_checkpoint` was passed then an attempt to resume from a previously saved checkpoint will be made. Args: trainer: Trainer object num_training_steps: per single gpu
(trainer, num_training_steps, inference=False)
| 581 | |
| 582 | |
| 583 | def deepspeed_init(trainer, num_training_steps, inference=False): |
| 584 | """ |
| 585 | Init DeepSpeed, after updating the DeepSpeed configuration with any relevant Trainer's args. |
| 586 | |
| 587 | If `resume_from_checkpoint` was passed then an attempt to resume from a previously saved checkpoint will be made. |
| 588 | |
| 589 | Args: |
| 590 | trainer: Trainer object |
| 591 | num_training_steps: per single gpu |
| 592 | resume_from_checkpoint: path to a checkpoint to resume from after the normal DeepSpeedEngine load |
| 593 | inference: launch in inference mode (no optimizer and no lr scheduler) |
| 594 | auto_find_batch_size: whether to ignore the `train_micro_batch_size_per_gpu` argument as it's being |
| 595 | set automatically by the auto batch size finder |
| 596 | |
| 597 | Returns: optimizer, lr_scheduler |
| 598 | |
| 599 | We may use `deepspeed_init` more than once during the life of Trainer; when we do, it's a temporary hack based on |
| 600 | https://github.com/deepspeedai/DeepSpeed/issues/1394#issuecomment-937405374 until DeepSpeed fixes a bug where it |
| 601 | can't resume from a checkpoint after it did some stepping: https://github.com/deepspeedai/DeepSpeed/issues/1612 |
| 602 | |
| 603 | """ |
| 604 | from deepspeed.utils import logger as ds_logger |
| 605 | |
| 606 | model = trainer.model |
| 607 | args = trainer.args |
| 608 | |
| 609 | hf_deepspeed_config = trainer.accelerator.state.deepspeed_plugin.hf_ds_config |
| 610 | |
| 611 | # resume config update - some bits like `model` and `num_training_steps` only become available during train |
| 612 | hf_deepspeed_config.trainer_config_finalize(args, model, num_training_steps) |
| 613 | |
| 614 | # set the Deepspeed log level consistent with the Trainer |
| 615 | ds_logger.setLevel(args.get_process_log_level()) |
| 616 | |
| 617 | if inference: |
| 618 | # only ZeRO Stage 3 (Z3) makes sense for inference |
| 619 | if not hf_deepspeed_config.is_zero3(): |
| 620 | raise ValueError("ZeRO inference only makes sense with ZeRO Stage 3 - please adjust your config") |
| 621 | |
| 622 | # in case the training config is re-used for inference |
| 623 | hf_deepspeed_config.del_config_sub_tree("optimizer") |
| 624 | hf_deepspeed_config.del_config_sub_tree("lr_scheduler") |
| 625 | optimizer, lr_scheduler = None, None |
| 626 | model_parameters = None |
| 627 | else: |
| 628 | trainer.optimizer = None # important for when deepspeed_init is used as re-init |
| 629 | deepspeed_tp_size = hf_deepspeed_config.config.get("tensor_parallel", {}).get("autotp_size", 1) |
| 630 | if deepspeed_tp_size > 1: |
| 631 | import deepspeed |
| 632 | |
| 633 | model = deepspeed.tp_model_init( |
| 634 | model=model, |
| 635 | tp_size=deepspeed_tp_size, |
| 636 | dtype=hf_deepspeed_config.dtype(), |
| 637 | config=hf_deepspeed_config.config, |
| 638 | ) |
| 639 | model_parameters = list(filter(lambda p: p.requires_grad, model.parameters())) |
| 640 | optimizer, lr_scheduler = deepspeed_optim_sched( |
no test coverage detected