Fix checkpoint resume logic and global epoch numbering, harden recovery; update checkpointing tests; all tests passing

Sergey Penkovsky
2025-08-01 10:40:08 +03:00
parent f7364070f0
commit 08f0356b4d
6 changed files with 113 additions and 53 deletions


@@ -42,6 +42,7 @@ def main():
     parser.add_argument('--lr', type=float, default=0.0001,
                         help='Learning rate')
+    parser.add_argument('--keep-last-n', type=int, default=3, help='How many recent checkpoints to keep')
     args = parser.parse_args()
     # Check for and create the save directory
@@ -88,7 +89,8 @@ def main():
         learning_rate=args.lr,
         checkpoint_dir=output_dir,
         resume_training=True,
-        start_epoch=start_epoch
+        start_epoch=start_epoch,
+        keep_last_n=args.keep_last_n
     )
     torch.save(model.state_dict(), args.output)
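
The diff only shows `keep_last_n` being threaded from the CLI into the trainer; the rotation itself is not visible in this hunk. Below is a minimal sketch of how such pruning is commonly implemented. The `save_checkpoint` helper, the filename pattern, and the saved dict keys are assumptions for illustration, not taken from this repository; only `keep_last_n` and the checkpoint directory come from the diff.

```python
import os
import re
import torch

def save_checkpoint(model, optimizer, epoch, checkpoint_dir, keep_last_n=3):
    """Save a checkpoint for `epoch` and prune older files, keeping only
    the keep_last_n most recent ones (assumes keep_last_n >= 1)."""
    path = os.path.join(checkpoint_dir, f'checkpoint_epoch_{epoch:04d}.pt')
    torch.save({
        'epoch': epoch,  # global epoch number, so resume can keep counting from here
        'model_state': model.state_dict(),
        'optimizer_state': optimizer.state_dict(),
    }, path)

    # List existing checkpoints, sorted oldest-first by epoch number.
    pattern = re.compile(r'checkpoint_epoch_(\d+)\.pt$')
    checkpoints = sorted(
        (f for f in os.listdir(checkpoint_dir) if pattern.search(f)),
        key=lambda f: int(pattern.search(f).group(1)),
    )
    # Drop everything except the newest keep_last_n files.
    for old in checkpoints[:-keep_last_n]:
        os.remove(os.path.join(checkpoint_dir, old))
```

On resume, scanning the same directory for the highest surviving epoch and setting `start_epoch` to that value plus one is one way to keep global epoch numbering continuous across restarts, which matches the behavior the commit message describes fixing.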