Run the actual training loop: forward, backward, optimizer step, logging, and checkpointing.
(
self,
batch_size: int | None = None,
args: TrainingArguments | None = None,
resume_from_checkpoint: str | None = None,
trial: "optuna.Trial | dict[str, Any] | None" = None,
ignore_keys_for_eval: list[str] | None = None,
)
| 1438 | ) |
| 1439 | |
| 1440 | def _inner_training_loop( |
| 1441 | self, |
| 1442 | batch_size: int | None = None, |
| 1443 | args: TrainingArguments | None = None, |
| 1444 | resume_from_checkpoint: str | None = None, |
| 1445 | trial: "optuna.Trial | dict[str, Any] | None" = None, |
| 1446 | ignore_keys_for_eval: list[str] | None = None, |
| 1447 | ) -> TrainOutput: |
| 1448 | """Run the actual training loop: forward, backward, optimizer step, logging, and checkpointing.""" |
| 1449 | # reset everything |
| 1450 | self.accelerator.free_memory() |
| 1451 | if args.auto_find_batch_size: |
| 1452 | self._update_auto_batch_size(batch_size) |
| 1453 | # Data loader and number of training steps |
| 1454 | train_dataloader = self.get_train_dataloader() |
| 1455 | if self.is_fsdp_xla_v2_enabled: |
| 1456 | train_dataloader = tpu_spmd_dataloader(train_dataloader) |
| 1457 | |
| 1458 | # Setting up training control variables: |
| 1459 | ( |
| 1460 | num_train_epochs, |
| 1461 | num_update_steps_per_epoch, |
| 1462 | num_examples, |
| 1463 | num_train_samples, |
| 1464 | total_train_batch_size, |
| 1465 | steps_in_epoch, |
| 1466 | max_steps, |
| 1467 | ) = self.set_initial_training_values(args, train_dataloader) |
| 1468 | |
| 1469 | epochs_trained, steps_trained_in_current_epoch = self._init_training_state( |
| 1470 | max_steps, num_update_steps_per_epoch, num_train_epochs, resume_from_checkpoint, trial |
| 1471 | ) |
| 1472 | model, train_dataloader = self._prepare_for_training(max_steps, train_dataloader, resume_from_checkpoint) |
| 1473 | |
| 1474 | # Train! |
| 1475 | logger.info("***** Running training *****") |
| 1476 | logger.info(f" Num examples = {num_examples:,}") |
| 1477 | logger.info(f" Num Epochs = {num_train_epochs:,}") |
| 1478 | logger.info(f" Num update steps per epoch = {num_update_steps_per_epoch:,}") |
| 1479 | logger.info(f" Instantaneous batch size per device = {self.args.per_device_train_batch_size:,}") |
| 1480 | if self.args.per_device_train_batch_size != self._train_batch_size: |
| 1481 | logger.info(f" Training with DataParallel so batch size has been adjusted to: {self._train_batch_size:,}") |
| 1482 | logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_train_batch_size:,}") |
| 1483 | logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") |
| 1484 | logger.info(f" Total optimization steps = {max_steps:,}") |
| 1485 | logger.info(f" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}") |
| 1486 | |
| 1487 | if resume_from_checkpoint is not None: |
| 1488 | logger.info( |
| 1489 | f" Resuming training from checkpoint with epoch {epochs_trained} and global step {self.state.global_step}" |
| 1490 | ) |
| 1491 | if not self.args.ignore_data_skip: |
| 1492 | logger.info( |
| 1493 | f" Fast-forwarding the dataloader past {epochs_trained} epochs and" |
| 1494 | f" {steps_trained_in_current_epoch} batches to resume from the exact training state." |
| 1495 | ) |
| 1496 | |
| 1497 | start_time = time.time() |
Nothing calls `_inner_training_loop` directly.
No test coverage detected for `_inner_training_loop`.