#!/usr/bin/env python3
"""
Experiment: simple_hf_training.py
Description: Simplified GPT model training using hf-proxy.
Uses a manual training loop instead of the more complex HuggingFace Trainer.
"""

import torch
import os
import sys
import json

# Make the shared modules importable when the script is run directly
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from llm.models.gpt import GPT
from llm.tokenizers import BPETokenizer
from hf_proxy import HFAdapter, HFTokenizerAdapter
from shared.configs import (
    TRAIN_TEXTS,
    BASE_GPT_CONFIG,
    BPE_CONFIG,
    TRAINING_CONFIG,
    TEST_PROMPTS,
)


def create_dataset(hf_tokenizer, texts, max_length=128):
    """
    Build a simple dataset for training.

    Args:
        hf_tokenizer: Adapted tokenizer
        texts: List of texts
        max_length: Maximum sequence length

    Returns:
        list: List of dicts with "input_ids" and "labels" tensors
    """
    dataset = []
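    # Each item will be a dict: {"input_ids": 1-D tensor, "labels": 1-D tensor}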
    for text in texts:
        # Tokenize the text
        inputs = hf_tokenizer(
            text,
            max_length=max_length,
            truncation=True,
            padding=False,
            return_tensors="pt",
        )
        input_ids = inputs["input_ids"][0]

        # Labels for language modeling: a copy of the input ids
        # (an HF-style causal LM shifts them internally when computing the loss)
        labels = input_ids.clone()

        dataset.append({"input_ids": input_ids, "labels": labels})

    return dataset


def manual_training_loop(hf_model, hf_tokenizer, train_texts, val_texts, config):
    """
    Manual training loop that avoids the HuggingFace Trainer.

    Args:
        hf_model: Adapted model
        hf_tokenizer: Adapted tokenizer
        train_texts: Training texts
        val_texts: Validation texts
        config: Training configuration

    Returns:
        dict: Training results
    """
    print("🎯 Starting manual training...")

    # Build the datasets
    train_dataset = create_dataset(hf_tokenizer, train_texts)
    val_dataset = create_dataset(hf_tokenizer, val_texts)

    print(f"📊 Data: {len(train_dataset)} train, {len(val_dataset)} validation")

    # Optimizer; the model computes the loss itself (outputs.loss),
    # so no separate criterion is needed
    optimizer = torch.optim.AdamW(hf_model.parameters(), lr=config["learning_rate"])

    # Training
    hf_model.train()
    train_losses = []
    val_losses = []
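    # Note: this simplified loop feeds one example at a time (effective batch
    # size 1); config["batch_size"] is recorded in the saved results but is
    # not used for batching here.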

    for epoch in range(config["num_epochs"]):
        print(f"\n📅 Epoch {epoch + 1}/{config['num_epochs']}")

        # Training
        epoch_train_loss = 0
        for i, batch in enumerate(train_dataset):
            optimizer.zero_grad()

            input_ids = batch["input_ids"].unsqueeze(0)  # [1, seq_len]
            labels = batch["labels"].unsqueeze(0)  # [1, seq_len]

            # Forward pass
            outputs = hf_model(input_ids=input_ids, labels=labels)
            loss = outputs.loss

            # Backward pass
            loss.backward()
            optimizer.step()

            epoch_train_loss += loss.item()

            if i % 5 == 0:
                print(f"  Batch {i}/{len(train_dataset)}: loss = {loss.item():.4f}")

        avg_train_loss = epoch_train_loss / len(train_dataset)
        train_losses.append(avg_train_loss)
        print(f"  📊 Average train loss: {avg_train_loss:.4f}")

        # Validation
        hf_model.eval()
        epoch_val_loss = 0
        with torch.no_grad():
            for batch in val_dataset:
                input_ids = batch["input_ids"].unsqueeze(0)
                labels = batch["labels"].unsqueeze(0)

                outputs = hf_model(input_ids=input_ids, labels=labels)
                epoch_val_loss += outputs.loss.item()

        avg_val_loss = epoch_val_loss / len(val_dataset)
        val_losses.append(avg_val_loss)
        print(f"  📊 Average val loss: {avg_val_loss:.4f}")

        hf_model.train()

    return {
        "train_losses": train_losses,
        "val_losses": val_losses,
        "final_train_loss": train_losses[-1],
        "final_val_loss": val_losses[-1],
    }


def test_generation_after_training(hf_model, hf_tokenizer, test_prompts):
    """
    Test generation with the model (called before and after training).

    Args:
        hf_model: The model
        hf_tokenizer: Tokenizer
        test_prompts: Test prompts
    """
    print("\n🧪 Testing generation...")
    hf_model.eval()
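    # eval() disables dropout (if the model uses any); the generation below is
    # still stochastic because of do_sample=True.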

    for prompt in test_prompts[:3]:
        print(f"\n🔤 Prompt: '{prompt}'")

        try:
            inputs = hf_tokenizer(prompt, return_tensors="pt")

            with torch.no_grad():
                generated = hf_model.generate(
                    input_ids=inputs["input_ids"],
                    max_new_tokens=20,
                    do_sample=True,
                    temperature=0.8,
                )

            generated_text = hf_tokenizer.decode(generated[0], skip_special_tokens=True)
            print(f"🎯 Result: '{generated_text}'")

        except Exception as e:
            print(f"❌ Generation error: {e}")


def main():
    """Main experiment entry point."""
    print("=" * 60)
    print("🚀 SIMPLIFIED GPT TRAINING WITH HF-PROXY")
    print("=" * 60)

    try:
        # === Data preparation ===
        print("🔧 Preparing data...")
        train_texts = TRAIN_TEXTS[:10]  # Use less data for quick testing
        val_texts = TRAIN_TEXTS[10:12]

        print(f"📊 Data: {len(train_texts)} train, {len(val_texts)} validation")

        # === Tokenizer preparation ===
        print("🔧 Preparing the tokenizer...")
        llm_tokenizer = BPETokenizer()
        llm_tokenizer.train(
            texts=train_texts,
            vocab_size=BPE_CONFIG["vocab_size"],
            special_tokens=BPE_CONFIG["special_tokens"],
        )

        hf_tokenizer = HFTokenizerAdapter(llm_tokenizer)
        print(f"✅ Tokenizer ready (vocab_size={hf_tokenizer.vocab_size})")
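        # HFTokenizerAdapter exposes the custom BPE tokenizer through the
        # HF-style __call__/decode interface that create_dataset() and
        # test_generation_after_training() rely on.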

        # === Model preparation ===
        print("🔧 Preparing the model...")
        model_config = BASE_GPT_CONFIG.copy()
        model_config["vocab_size"] = hf_tokenizer.vocab_size

        llm_model = GPT(model_config)
        hf_model = HFAdapter.from_llm_model(llm_model)
        print("✅ Model ready")
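        # HFAdapter gives the custom GPT an HF-style interface: forward() with
        # labels returning an output with .loss, plus generate() and
        # save_pretrained(), all of which are used below.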

        # === Testing before training ===
        print("\n🧪 Testing before training...")
        test_generation_after_training(hf_model, hf_tokenizer, TEST_PROMPTS)

        # === Training ===
        print("\n🎯 Training the model...")
        training_config = {
            "learning_rate": TRAINING_CONFIG["learning_rate"],
            "num_epochs": 2,  # Fewer epochs for quick testing
            "batch_size": TRAINING_CONFIG["batch_size"],
        }

        results = manual_training_loop(
            hf_model, hf_tokenizer, train_texts, val_texts, training_config
        )

        print("\n📊 Training results:")
        print(f"  Final train loss: {results['final_train_loss']:.4f}")
        print(f"  Final val loss: {results['final_val_loss']:.4f}")

        # === Testing after training ===
        print("\n🧪 Testing after training...")
        test_generation_after_training(hf_model, hf_tokenizer, TEST_PROMPTS)

        # === Saving the model ===
        print("\n💾 Saving the model...")

        # Create the output directories
        os.makedirs("checkpoints/hf_simple_trained", exist_ok=True)
        os.makedirs("checkpoints/hf_simple_tokenizer", exist_ok=True)

        # Save the tokenizer
        hf_tokenizer.save_pretrained("checkpoints/hf_simple_tokenizer")
        print("✅ Tokenizer saved")

        # Save the model
        HFAdapter.save_pretrained(
            hf_model, "checkpoints/hf_simple_trained", tokenizer=hf_tokenizer
        )
        print("✅ Model saved")

        # Save the results
        results_path = "checkpoints/simple_training_results.json"
        with open(results_path, "w", encoding="utf-8") as f:
            json.dump(
                {
                    "training_config": training_config,
                    "model_config": model_config,
                    "results": results,
                },
                f,
                indent=2,
                ensure_ascii=False,
            )
        print(f"✅ Results saved to {results_path}")

        print("\n🎉 Simplified training finished successfully!")
        print("\n💡 To use the trained model:")
        print("  uv run python experiments/hf_integration/generate_with_hf_tools.py")

    except Exception as e:
        print(f"❌ Experiment error: {e}")
        import traceback

        traceback.print_exc()


if __name__ == "__main__":
    main()