Run one full pass over the dataloader.
(
self,
model,
epoch,
train_dataloader,
steps_in_epoch,
num_update_steps_per_epoch,
trial,
ignore_keys_for_eval,
start_time,
resume_from_checkpoint,
epochs_trained,
steps_trained_in_current_epoch,
)
| 1660 | return model, train_dataloader |
| 1661 | |
| 1662 | def _run_epoch( |
| 1663 | self, |
| 1664 | model, |
| 1665 | epoch, |
| 1666 | train_dataloader, |
| 1667 | steps_in_epoch, |
| 1668 | num_update_steps_per_epoch, |
| 1669 | trial, |
| 1670 | ignore_keys_for_eval, |
| 1671 | start_time, |
| 1672 | resume_from_checkpoint, |
| 1673 | epochs_trained, |
| 1674 | steps_trained_in_current_epoch, |
| 1675 | ): |
| 1676 | """Run one full pass over the dataloader.""" |
| 1677 | |
| 1678 | step = -1 |
| 1679 | grad_norm = None |
| 1680 | learning_rate = None |
| 1681 | rng_to_sync = False |
| 1682 | |
| 1683 | # Handle resumption from checkpoint: skip already-trained batches in the resumed epoch |
| 1684 | num_update_steps_trained = 0 |
| 1685 | if epoch == epochs_trained and resume_from_checkpoint is not None: |
| 1686 | if steps_trained_in_current_epoch > 0 and not self.args.ignore_data_skip: |
| 1687 | train_dataloader = skip_first_batches(train_dataloader, steps_trained_in_current_epoch) |
| 1688 | step = steps_trained_in_current_epoch - 1 |
| 1689 | num_update_steps_trained = steps_trained_in_current_epoch // self.args.gradient_accumulation_steps |
| 1690 | rng_to_sync = True |
| 1691 | elif steps_trained_in_current_epoch == 0: |
| 1692 | self._load_rng_state(resume_from_checkpoint) |
| 1693 | |
| 1694 | if hasattr(train_dataloader, "set_epoch"): |
| 1695 | train_dataloader.set_epoch(epoch) |
| 1696 | epoch_iterator = iter(train_dataloader) |
| 1697 | |
| 1698 | # Split the epoch iterator into chunks of `gradient_accumulation_steps` batches; `remainder` is the size of the last chunk |
| 1699 | remainder = steps_in_epoch % self.args.gradient_accumulation_steps |
| 1700 | if remainder == 0: |
| 1701 | remainder = self.args.gradient_accumulation_steps |
| 1702 | |
| 1703 | # Outer loop: one iteration per optimizer step. Each iteration prefetches |
| 1704 | # `gradient_accumulation_steps` batches (fewer for the last step if the epoch |
| 1705 | # doesn't divide evenly). |
| 1706 | for update_step in range(num_update_steps_trained, num_update_steps_per_epoch): |
| 1707 | num_batches = ( |
| 1708 | self.args.gradient_accumulation_steps if update_step != (num_update_steps_per_epoch - 1) else remainder |
| 1709 | ) |
| 1710 | batch_samples, num_items_in_batch = self.get_batch_samples(epoch_iterator, num_batches, self.args.device) |
| 1711 | |
| 1712 | # This is used to correctly scale the loss when the last accumulation step has fewer batches. |
| 1713 | # Not used if `num_items_in_batch` is not None. |
| 1714 | self.current_gradient_accumulation_steps = len(batch_samples) |
| 1715 | |
| 1716 | # If batches were skipped on resume, reload the RNG state only once, after the first `get_batch_samples` call (needed to keep the shuffle order consistent) |
| 1717 | if rng_to_sync: |
| 1718 | self._load_rng_state(resume_from_checkpoint) |
| 1719 | rng_to_sync = False |
no test coverage detected