(trial: optuna.Trial, checkpoint_dir=None)
| 239 | if trainer.args.process_index == 0: |
| 240 | |
def _objective(trial: optuna.Trial, checkpoint_dir=None):
    """Run one training pass for ``trial`` and return the trainer's objective.

    Args:
        trial: The Optuna trial handed to ``trainer.hp_space`` (multi-process
            case only) and to ``trainer.train``.
        checkpoint_dir: Optional directory that is scanned for an entry whose
            name starts with ``PREFIX_CHECKPOINT_DIR``; a match is forwarded
            to ``trainer.train`` as ``resume_from_checkpoint``.

    Returns:
        The value of ``trainer.objective`` after training and, when training
        left it unset, after one call to ``trainer.evaluate``.

    Raises:
        RuntimeError: If ``trainer.args.world_size > 1`` while
            ``trainer.args.parallel_mode`` is not ``ParallelMode.DISTRIBUTED``.
    """
    # Locate a checkpoint to resume from. When several entries match, the one
    # listed last is used.
    # NOTE(review): os.listdir returns entries in arbitrary order, so with
    # more than one matching entry the pick is OS-dependent — confirm that
    # at most one match is expected here.
    resume_path = None
    if checkpoint_dir:
        candidates = [
            os.path.join(checkpoint_dir, entry)
            for entry in os.listdir(checkpoint_dir)
            if entry.startswith(PREFIX_CHECKPOINT_DIR)
        ]
        if candidates:
            resume_path = candidates[-1]

    # Reset so the check after training can tell whether an objective was set.
    trainer.objective = None

    if trainer.args.world_size > 1:
        if trainer.args.parallel_mode != ParallelMode.DISTRIBUTED:
            raise RuntimeError("only support DDP optuna HPO for ParallelMode.DISTRIBUTED currently.")
        # hp_space runs on the live trial before trial.params is read below
        # (presumably this is what populates trial.params — TODO confirm).
        trainer.hp_space(trial)
        # Broadcast a FixedTrial built from this trial's params and number,
        # using rank 0 as the source.
        broadcast_payload = [optuna.trial.FixedTrial(trial.params, trial.number)]
        torch.distributed.broadcast_object_list(broadcast_payload, src=0)

    # The training call is the same whether or not the distributed setup ran.
    trainer.train(resume_from_checkpoint=resume_path, trial=trial)

    # No objective was recorded during training: evaluate once and derive it.
    if getattr(trainer, "objective", None) is None:
        eval_metrics = trainer.evaluate()
        trainer.objective = trainer.compute_objective(eval_metrics)

    # Free GPU memory
    released = release_memory(trainer.model_wrapped, trainer.model)
    trainer.model_wrapped, trainer.model = released
    trainer.accelerator.clear()

    return trainer.objective
| 268 | |
| 269 | timeout = kwargs.pop("timeout", None) |
| 270 | n_jobs = kwargs.pop("n_jobs", 1) |
nothing calls this directly
no test coverage detected