MCPcopy
hub / github.com/huggingface/transformers / _inner_training_loop

Method _inner_training_loop

src/transformers/trainer.py:1440–1531  ·  view source on GitHub ↗

Run the actual training loop: forward, backward, optimizer step, logging, and checkpointing.

(
        self,
        batch_size: int | None = None,
        args: TrainingArguments | None = None,
        resume_from_checkpoint: str | None = None,
        trial: "optuna.Trial | dict[str, Any] | None" = None,
        ignore_keys_for_eval: list[str] | None = None,
    )

Source from the content-addressed store, hash-verified

1438 )
1439
1440 def _inner_training_loop(
1441 self,
1442 batch_size: int | None = None,
1443 args: TrainingArguments | None = None,
1444 resume_from_checkpoint: str | None = None,
1445 trial: "optuna.Trial | dict[str, Any] | None" = None,
1446 ignore_keys_for_eval: list[str] | None = None,
1447 ) -> TrainOutput:
1448 """Run the actual training loop: forward, backward, optimizer step, logging, and checkpointing."""
1449 # reset everything
1450 self.accelerator.free_memory()
1451 if args.auto_find_batch_size:
1452 self._update_auto_batch_size(batch_size)
1453 # Data loader and number of training steps
1454 train_dataloader = self.get_train_dataloader()
1455 if self.is_fsdp_xla_v2_enabled:
1456 train_dataloader = tpu_spmd_dataloader(train_dataloader)
1457
1458 # Setting up training control variables:
1459 (
1460 num_train_epochs,
1461 num_update_steps_per_epoch,
1462 num_examples,
1463 num_train_samples,
1464 total_train_batch_size,
1465 steps_in_epoch,
1466 max_steps,
1467 ) = self.set_initial_training_values(args, train_dataloader)
1468
1469 epochs_trained, steps_trained_in_current_epoch = self._init_training_state(
1470 max_steps, num_update_steps_per_epoch, num_train_epochs, resume_from_checkpoint, trial
1471 )
1472 model, train_dataloader = self._prepare_for_training(max_steps, train_dataloader, resume_from_checkpoint)
1473
1474 # Train!
1475 logger.info("***** Running training *****")
1476 logger.info(f" Num examples = {num_examples:,}")
1477 logger.info(f" Num Epochs = {num_train_epochs:,}")
1478 logger.info(f" Num update steps per epoch = {num_update_steps_per_epoch:,}")
1479 logger.info(f" Instantaneous batch size per device = {self.args.per_device_train_batch_size:,}")
1480 if self.args.per_device_train_batch_size != self._train_batch_size:
1481 logger.info(f" Training with DataParallel so batch size has been adjusted to: {self._train_batch_size:,}")
1482 logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_train_batch_size:,}")
1483 logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
1484 logger.info(f" Total optimization steps = {max_steps:,}")
1485 logger.info(f" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}")
1486
1487 if resume_from_checkpoint is not None:
1488 logger.info(
1489 f" Resuming training from checkpoint with epoch {epochs_trained} and global step {self.state.global_step}"
1490 )
1491 if not self.args.ignore_data_skip:
1492 logger.info(
1493 f" Fast-forwarding the dataloader past {epochs_trained} epochs and"
1494 f" {steps_trained_in_current_epoch} batches to resume from the exact training state."
1495 )
1496
1497 start_time = time.time()

Callers

nothing calls this directly

Calls 15

get_train_dataloader — Method · 0.95
_init_training_state — Method · 0.95
_prepare_for_training — Method · 0.95
_evaluate — Method · 0.95
_run_epoch — Method · 0.95
_finalize_training — Method · 0.95
tpu_spmd_dataloader — Function · 0.85
get_model_param_count — Function · 0.85
zero_grad — Method · 0.80
info — Method · 0.45

Tested by

no test coverage detected