Returns the current learning rate from the scheduler. Handles DeepSpeed's dynamic loss scaling warmup period where `get_last_lr` may fail.
(self)
| 1299 | return decay_parameters |
| 1300 | |
| 1301 | def _get_learning_rate(self) -> float: |
| 1302 | """ |
| 1303 | Returns the current learning rate from the scheduler. |
| 1304 | |
| 1305 | Handles DeepSpeed's dynamic loss scaling warmup period where `get_last_lr` may fail. |
| 1306 | """ |
| 1307 | if self.is_deepspeed_enabled: |
| 1308 | # with deepspeed's fp16 and dynamic loss scale enabled the optimizer/scheduler steps may |
| 1309 | # not run for the first few dozen steps while loss scale is too large, and thus during |
| 1310 | # that time `get_last_lr` will fail if called during that warm up stage, so work around it: |
| 1311 | try: |
| 1312 | last_lr = self.lr_scheduler.get_last_lr()[0] |
| 1313 | except AssertionError as e: |
| 1314 | if "need to call step" in str(e): |
| 1315 | logger.warning("tried to get lr value before scheduler/optimizer started stepping, returning lr=0") |
| 1316 | last_lr = 0 |
| 1317 | else: |
| 1318 | raise |
| 1319 | else: |
| 1320 | if isinstance(self.lr_scheduler, (torch.optim.lr_scheduler.ReduceLROnPlateau, GreedyLR)): |
| 1321 | last_lr = self.optimizer.param_groups[0]["lr"] |
| 1322 | else: |
| 1323 | last_lr = self.lr_scheduler.get_last_lr()[0] |
| 1324 | |
| 1325 | if torch.is_tensor(last_lr): |
| 1326 | last_lr = last_lr.item() |
| 1327 | return last_lr |
| 1328 | |
| 1329 | # ---- Training ---- |
| 1330 |
no test coverage detected