2025-10-04 22:40:21 +03:00
|
|
|
|
"""
|
|
|
|
|
|
Shared utilities for working with data in the experiments.
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
|
|
import os
|
|
|
|
|
|
from typing import List, Tuple
|
|
|
|
|
|
from .configs import TRAIN_TEXTS, PATHS
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def load_training_data(split_ratio: float = 0.8) -> Tuple[List[str], List[str]]:
    """
    Split the configured training texts into train and validation parts.

    The split is positional: the leading part of ``TRAIN_TEXTS`` becomes the
    training set and everything after it becomes the validation set. No
    shuffling is performed.

    Args:
        split_ratio: Fraction of the texts assigned to the training part.

    Returns:
        Tuple: (train_texts, val_texts)
    """
    # int() truncates, so the boundary index is rounded towards zero.
    boundary = int(len(TRAIN_TEXTS) * split_ratio)
    return TRAIN_TEXTS[:boundary], TRAIN_TEXTS[boundary:]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def ensure_directories():
    """Create the checkpoint and log directories unless they already exist."""
    # The paths are relative, so they resolve against the current working
    # directory of the process.
    for path in (
        "checkpoints",
        "checkpoints/gpt-bpe",
        "checkpoints/hf-bpe-tokenizer",
        "checkpoints/hf-trained",
        "checkpoints/hf-trained-proxy",
        "logs",
    ):
        # exist_ok=True makes repeated calls harmless.
        os.makedirs(path, exist_ok=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_model_paths(experiment_type: str = "llm_only") -> dict:
    """
    Build the path mapping for a given experiment type.

    A shallow copy of ``PATHS`` is returned with two extra keys, ``"model"``
    and ``"tokenizer"``, pointing at the entries that match the experiment.

    Args:
        experiment_type: Experiment type ('llm_only' or 'hf_integration').
            Any value other than 'hf_integration' is handled as 'llm_only'.

    Returns:
        dict: Dictionary with the paths.
    """
    if experiment_type == "hf_integration":
        model_key, tokenizer_key = "hf_model", "hf_tokenizer"
    else:  # llm_only
        model_key, tokenizer_key = "gpt_bpe_model", "bpe_tokenizer"

    # Work on a copy so the shared PATHS mapping is never mutated.
    paths = PATHS.copy()
    paths.update(model=paths[model_key], tokenizer=paths[tokenizer_key])
    return paths
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def print_experiment_info(experiment_name: str, config: dict):
    """
    Print a banner describing the experiment that is about to run.

    The output is a 70-character rule, the experiment name, another rule, a
    configuration heading, one line per configuration entry and a trailing
    blank line.

    Args:
        experiment_name: Name of the experiment.
        config: Experiment configuration; each key/value pair is printed.
    """
    rule = "=" * 70
    print(rule, f"🚀 Эксперимент: {experiment_name}", rule, "📊 Конфигурация:", sep="\n")
    for option, setting in config.items():
        print(f" {option}: {setting}")
    print()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def save_experiment_results(results: dict, filepath: str):
    """
    Save experiment results to a JSON file.

    The results are serialized to a string before the target file is opened,
    so a value that cannot be encoded as JSON raises without truncating or
    partially overwriting a file that already exists at ``filepath``.

    Args:
        results: Dictionary with the results; must be JSON-serializable.
        filepath: Destination path. The file is written as UTF-8 with
            non-ASCII characters kept as-is and a 2-space indent.

    Raises:
        TypeError: If ``results`` contains a value ``json`` cannot serialize.
        ValueError: If ``results`` contains a circular reference.
        OSError: If the file cannot be opened or written.
    """
    import json

    # Serialize first: opening with "w" truncates the file immediately, so a
    # failure inside the encoder must happen before the file is touched.
    payload = json.dumps(results, ensure_ascii=False, indent=2)

    with open(filepath, "w", encoding="utf-8") as f:
        f.write(payload)

    print(f"✅ Результаты эксперимента сохранены: {filepath}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def load_experiment_results(filepath: str) -> dict:
    """
    Load experiment results from a JSON file.

    Args:
        filepath: Path to the results file (read as UTF-8).

    Returns:
        dict: The parsed results, or an empty dict when ``filepath`` does
        not exist.
    """
    import json

    if os.path.exists(filepath):
        with open(filepath, "r", encoding="utf-8") as source:
            return json.load(source)

    # A missing file is reported as "no results" rather than as an error.
    return {}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class ExperimentLogger:
    """
    Console logger for experiments that also accumulates named metrics.

    Only ``log_metric`` stores values; ``log_step`` and ``log_epoch`` print
    to stdout without recording anything in ``metrics``.
    """

    def __init__(self, experiment_name: str):
        """
        Args:
            experiment_name: Name stored alongside the metrics in saved logs.
        """
        self.experiment_name = experiment_name
        # Maps metric name -> list of logged values, in call order.
        self.metrics = {}

    def log_metric(self, name: str, value: float):
        """
        Record a metric value and print it with four decimal places.

        Args:
            name: Metric name; a new series is started on first use.
            value: Value appended to the series for ``name``.
        """
        self.metrics.setdefault(name, []).append(value)
        print(f"📈 {name}: {value:.4f}")

    def log_step(self, step: int, loss: float, **kwargs):
        """
        Print a single training step on one line.

        Args:
            step: Step number.
            loss: Loss value for the step.
            **kwargs: Extra values appended as ``, key=value``; each must
                support the ``.4f`` format.
        """
        # Build the whole line first so a formatting error cannot leave a
        # half-printed line without a newline on stdout.
        extras = "".join(f", {key}={value:.4f}" for key, value in kwargs.items())
        print(f"📊 Step {step}: loss={loss:.4f}{extras}")

    def log_epoch(self, epoch: int, train_loss: float, val_loss: float = None):
        """
        Print the end-of-epoch summary on one line.

        Args:
            epoch: Epoch number.
            train_loss: Training loss for the epoch.
            val_loss: Validation loss; omitted from the output when ``None``.
        """
        line = f"🎯 Epoch {epoch}: train_loss={train_loss:.4f}"
        if val_loss is not None:
            line += f", val_loss={val_loss:.4f}"
        print(line)

    def save_logs(self, filepath: str):
        """
        Save the experiment name and the recorded metrics as JSON.

        The payload is serialized before the file is opened, so a metric
        value that is not JSON-serializable raises without truncating a log
        file that already exists at ``filepath``.

        Args:
            filepath: Destination path. The file is written as UTF-8.

        Raises:
            TypeError: If a recorded value cannot be serialized by ``json``.
            OSError: If the file cannot be opened or written.
        """
        import json

        logs = {"experiment_name": self.experiment_name, "metrics": self.metrics}

        # Opening with "w" truncates immediately, so encode up front.
        payload = json.dumps(logs, ensure_ascii=False, indent=2)

        with open(filepath, "w", encoding="utf-8") as f:
            f.write(payload)

        print(f"✅ Логи эксперимента сохранены: {filepath}")
|