Finalize training: metrics, best-model loading, cleanup. Returns TrainOutput.
(self, trial, num_train_samples, start_time)
| 1825 | ) |
| 1826 | |
def _finalize_training(self, trial, num_train_samples, start_time):
    """Close out a training run and package its result.

    In order: folds the not-yet-accumulated loss into the running total, builds
    and logs the final "train" metrics, optionally reloads the best checkpoint,
    removes non-best checkpoints when ``save_total_limit == 1``, fires the
    ``on_train_end`` callbacks, waits for the pending push, detaches the NEFTune
    hook if one is active, and clears ``is_in_train``.

    Args:
        trial: Forwarded to ``self._get_output_dir`` to resolve the run directory.
        num_train_samples: Passed to ``speed_metrics`` as ``num_samples``.
        start_time: Passed to ``speed_metrics`` as the start of the run.

    Returns:
        ``TrainOutput(global_step, train_loss, metrics)``.
    """
    logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n")

    # Fold in whatever loss is still pending in `_tr_loss`.
    self._total_loss_scalar += self._tr_loss.item()
    # Clamp the divisor so a run with zero completed steps avoids ZeroDivisionError.
    step_divisor = max(self.state.global_step, 0.001)
    train_loss = self._total_loss_scalar / step_divisor

    metrics = speed_metrics("train", start_time, num_samples=num_train_samples, num_steps=self.state.max_steps)
    # `store_flos` runs before `total_flos` is read, same as the original ordering.
    self.store_flos()
    metrics["total_flos"] = self.state.total_flos
    metrics["train_loss"] = train_loss

    self._memory_tracker.stop_and_update_metrics(metrics)
    self.log(metrics)

    reload_best = self.args.load_best_model_at_end and self.state.best_model_checkpoint is not None
    if reload_best:
        self._load_best_model()

    run_dir = self._get_output_dir(trial)
    checkpoints_sorted = sort_checkpoints(output_dir=run_dir, best_model_checkpoint=self.state.best_model_checkpoint)

    # With save_total_limit=1, keep only the best checkpoint: drop every other one,
    # provided this process is allowed to save and a best checkpoint is recorded.
    keep_only_best = (
        self.args.should_save
        and self.state.best_model_checkpoint is not None
        and self.args.save_total_limit == 1
    )
    if keep_only_best:
        for checkpoint in checkpoints_sorted:
            if os.path.samefile(checkpoint, self.state.best_model_checkpoint):
                continue
            logger.info(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit")
            shutil.rmtree(checkpoint, ignore_errors=True)

    self.control = self.callback_handler.on_train_end(self.args, self.state, self.control)

    # Block until the checkpoint upload in progress has finished.
    self._finish_current_push()

    # Restore the embedding layer's original forward pass by removing the
    # NEFTune forward post hook, if NEFTune was enabled.
    if self.neftune_noise_alpha is not None:
        deactivate_neftune(self.model, self.neftune_hook_handle, self.accelerator)
    self.is_in_train = False

    return TrainOutput(self.state.global_step, train_loss, metrics)
| 1875 | |
| 1876 | def training_step( |
| 1877 | self, |
no test coverage detected