Mirror of https://github.com/pese-git/llm-arch-research.git
update and expand scientific docstrings for optimizer, scheduler, trainer
- Expanded module-level and function/class docstrings in optimizer.py, scheduler.py, and trainer.py
- Described mathematical foundations and theoretical motivations, and provided detailed usage examples for students
- All docstrings in Russian, in a clear scientific style

test(training): add comprehensive tests for optimizer, scheduler, and trainer modules

- Added new test files for get_optimizer, get_linear_schedule_with_warmup, and Trainer
- Tests cover parameter handling, edge cases, and expected learning dynamics (lr schedules and loss behavior)
- Trainer now logs average epoch losses to self.loss_history for testability and analysis

refactor(training/trainer): log epoch loss to loss_history for downstream analysis and tests

BREAKING CHANGE: Trainer.loss_history is a new attribute consolidating the average loss per epoch, enabling robust learning-dynamics assertions in tests
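For readers skimming the BREAKING CHANGE above, the intended behaviour is one averaged loss value appended per training epoch. The function below is a stand-alone illustration of that bookkeeping, not the actual Trainer.train() implementation; the name train_with_loss_history and the AdamW/cross-entropy choices are assumptions made for this sketch.

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

def train_with_loss_history(model, dataset, lr=1e-3, batch_size=4, num_epochs=1):
    """Illustrative stand-in for Trainer.train(): collect one averaged loss per epoch."""
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    loss_history = []  # mirrors the new Trainer.loss_history attribute
    for _ in range(num_epochs):
        epoch_losses = []
        for batch in loader:
            logits = model(batch["input_ids"])  # (batch, seq_len, vocab_size)
            loss = criterion(logits.reshape(-1, logits.size(-1)),
                             batch["labels"].reshape(-1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            epoch_losses.append(loss.item())
        # consolidate the average loss for this epoch, as described in the BREAKING CHANGE
        loss_history.append(sum(epoch_losses) / len(epoch_losses))
    return loss_history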
llm/tests/training/test_optimizer.py (new file, 35 lines)
@@ -0,0 +1,35 @@
import pytest
import torch.nn as nn

from llm.training.optimizer import get_optimizer


class DummyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(10, 1)


def test_get_optimizer_adamw():
    model = DummyModel()
    optimizer = get_optimizer(model, lr=1e-3, weight_decay=0.02, optimizer_type="adamw")
    assert optimizer.__class__.__name__ == 'AdamW'
    assert optimizer.defaults['lr'] == 1e-3
    assert optimizer.defaults['weight_decay'] == 0.02


def test_get_optimizer_adam():
    model = DummyModel()
    optimizer = get_optimizer(model, lr=1e-4, weight_decay=0.01, optimizer_type="adam")
    assert optimizer.__class__.__name__ == 'Adam'
    assert optimizer.defaults['lr'] == 1e-4
    assert optimizer.defaults['weight_decay'] == 0.01


def test_get_optimizer_sgd():
    model = DummyModel()
    optimizer = get_optimizer(model, lr=0.1, optimizer_type="sgd")
    assert optimizer.__class__.__name__ == 'SGD'
    assert optimizer.defaults['lr'] == 0.1
    # SGD: weight_decay defaults to 0 for this call
    assert optimizer.defaults['momentum'] == 0.9


def test_get_optimizer_invalid():
    model = DummyModel()
    with pytest.raises(ValueError):
        get_optimizer(model, optimizer_type="nonexistent")
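The tests above pin down the contract that get_optimizer is expected to satisfy: optimizer_type selects AdamW, Adam, or SGD, the SGD path uses momentum=0.9, and an unknown type raises ValueError. A minimal sketch consistent with those assertions is given below; it is not the project's implementation (llm/training/optimizer.py may group parameters or handle defaults differently), and the name get_optimizer_sketch is invented here.

import torch

def get_optimizer_sketch(model, lr=1e-3, weight_decay=0.0, optimizer_type="adamw"):
    # Illustrative only: mirrors the behaviour the tests above assert.
    optimizer_type = optimizer_type.lower()
    if optimizer_type == "adamw":
        return torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    if optimizer_type == "adam":
        return torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    if optimizer_type == "sgd":
        return torch.optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay, momentum=0.9)
    raise ValueError(f"Unknown optimizer_type: {optimizer_type}")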
llm/tests/training/test_scheduler.py (new file, 62 lines)
@@ -0,0 +1,62 @@
import torch
import torch.nn as nn

from llm.training.scheduler import get_linear_schedule_with_warmup
from llm.training.optimizer import get_optimizer


class DummyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(2, 2)


def test_scheduler_warmup_and_decay():
    model = DummyModel()
    base_lr = 0.1
    warmup_steps = 5
    total_steps = 20
    optimizer = get_optimizer(model, lr=base_lr, optimizer_type="sgd")
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)

    lrs = [optimizer.param_groups[0]['lr']]  # lr before the first .step()
    for _ in range(total_steps):
        optimizer.step()
        scheduler.step()
        lrs.append(optimizer.param_groups[0]['lr'])

    # Check warmup: lr should grow linearly over the first warmup_steps (starting at step 1)
    for i in range(warmup_steps + 1):
        expected = base_lr * min(i, warmup_steps) / max(1, warmup_steps)
        assert abs(lrs[i] - expected) < 1e-6, f"Warmup step {i}: lr={lrs[i]}, expected={expected}"
    # Check decay: after warmup the lr decays linearly
    for i in range(warmup_steps + 1, total_steps + 1):
        expected = base_lr * max(0.0, (total_steps - i) / max(1, total_steps - warmup_steps))
        assert abs(lrs[i] - expected) < 1e-6, f"Decay step {i}: lr={lrs[i]}, expected={expected}"
    assert lrs[-1] == 0.0


def test_scheduler_no_warmup():
    model = DummyModel()
    base_lr = 0.1
    warmup_steps = 0
    total_steps = 10
    optimizer = get_optimizer(model, lr=base_lr, optimizer_type="adam")
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)
    lrs = [optimizer.param_groups[0]['lr']]
    for _ in range(total_steps):
        optimizer.step()
        scheduler.step()
        lrs.append(optimizer.param_groups[0]['lr'])

    for i in range(total_steps + 1):
        expected = base_lr * max(0.0, (total_steps - i) / max(1, total_steps - warmup_steps))
        assert abs(lrs[i] - expected) < 1e-6, f"Step {i}: lr={lrs[i]}, expected={expected}"
    assert lrs[-1] == 0.0


def test_scheduler_full_decay_to_zero():
    model = DummyModel()
    optimizer = get_optimizer(model, lr=1.0, optimizer_type="adamw")
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=2, num_training_steps=2)
    scheduler.step()
    scheduler.step()
    for param_group in optimizer.param_groups:
        assert param_group['lr'] == 0.0
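In these scheduler tests the expected learning rate follows the standard linear warmup/decay rule: lr(step) = base_lr * step / warmup_steps while step < warmup_steps, then base_lr * (total_steps - step) / (total_steps - warmup_steps) down to zero. A sketch of a scheduler that satisfies those assertions, built on torch.optim.lr_scheduler.LambdaLR (the same approach HuggingFace's get_linear_schedule_with_warmup uses), is shown below; it is an assumption about, not a copy of, llm/training/scheduler.py.

from torch.optim.lr_scheduler import LambdaLR

def linear_schedule_with_warmup_sketch(optimizer, num_warmup_steps, num_training_steps):
    # Illustrative multiplier matching the formulas asserted in the tests above.
    def lr_lambda(current_step):
        if current_step < num_warmup_steps:
            return current_step / max(1, num_warmup_steps)
        return max(0.0, (num_training_steps - current_step)
                   / max(1, num_training_steps - num_warmup_steps))
    return LambdaLR(optimizer, lr_lambda)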
llm/tests/training/test_trainer.py (new file, 62 lines)
@@ -0,0 +1,62 @@
import torch
import torch.nn as nn
from torch.utils.data import Dataset

from llm.training.trainer import Trainer


# Small synthetic dataset for an autoregressive LM task
class ToyLMDataset(Dataset):
    def __init__(self, num_samples=16, seq_len=8, vocab_size=16):
        self.data = torch.randint(1, vocab_size, (num_samples, seq_len))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # labels == input_ids (identity task)
        return {"input_ids": self.data[idx], "labels": self.data[idx]}


# Simple dummy model: a single linear layer over the vocabulary
class TinyModel(nn.Module):
    def __init__(self, vocab_size=16, seq_len=8):
        super().__init__()
        self.linear = nn.Linear(seq_len, vocab_size)

    def forward(self, x):
        # logits: (batch, seq_len, vocab_size)
        # For simplicity, the same logits are repeated along the sequence dimension
        return self.linear(x.float()).unsqueeze(1).expand(-1, x.shape[1], -1)


def test_train_runs_without_errors():
    train_data = ToyLMDataset(num_samples=16, seq_len=8, vocab_size=16)
    model = TinyModel(vocab_size=16, seq_len=8)
    trainer = Trainer(model, train_data, lr=1e-3, batch_size=4, num_epochs=1, warmup_steps=2)
    trainer.train()


def test_trainer_evaluate_runs():
    train_data = ToyLMDataset(num_samples=8)
    val_data = ToyLMDataset(num_samples=8)
    model = TinyModel()
    trainer = Trainer(model, train_data, val_data, lr=1e-3, batch_size=4, num_epochs=1, warmup_steps=2)
    trainer.train()
    trainer.evaluate()


def test_trainer_tuple_output():
    # A model that returns a tuple (logits, extra)
    class TupleModel(nn.Module):
        def __init__(self, vocab_size=16, seq_len=8):
            super().__init__()
            self.linear = nn.Linear(seq_len, vocab_size)

        def forward(self, x):
            logits = self.linear(x.float()).unsqueeze(1).expand(-1, x.shape[1], -1)
            extra = torch.zeros(1)
            return logits, extra

    train_data = ToyLMDataset(num_samples=8)
    model = TupleModel()
    trainer = Trainer(model, train_data, lr=1e-3, batch_size=2, num_epochs=1, warmup_steps=1)
    trainer.train()


def test_trainer_loss_decreases():
    train_data = ToyLMDataset(num_samples=32, seq_len=8, vocab_size=8)
    model = TinyModel(vocab_size=8, seq_len=8)
    trainer = Trainer(model, train_data, lr=0.05, batch_size=8, num_epochs=2, warmup_steps=1)
    trainer.train()
    avg_losses = trainer.loss_history
    assert avg_losses[-1] <= avg_losses[0] or abs(avg_losses[-1] - avg_losses[0]) < 1e-3