{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Демонстрация simple_llm\n",
"## Полное руководство по установке и использованию"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. Установка и настройка\n",
"\n",
"### Клонирование репозитория:\n",
"```bash\n",
"git clone https://github.com/ваш_username/simple-llm.git\n",
"cd simple-llm\n",
"```\n",
"\n",
"### Установка зависимостей:\n",
"```bash\n",
"pip install -e .\n",
"pip install torch tqdm\n",
"```\n",
"\n",
"### Проверка структуры данных:\n",
"```bash\n",
"mkdir -p data/corpus/sample data/model data/tokenizer\n",
"```"
]
},
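{
"cell_type": "markdown",
"metadata": {},
"source": [
"If `data/corpus/sample/` is still empty, the next cell creates the directories and writes a tiny placeholder `.txt` file so the rest of the notebook can run end to end. The file name and its contents are illustrative placeholders, not part of simple_llm; replace them with your own corpus for real training."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"# Create the expected layout (same as the mkdir -p command above)\n",
"for d in ['data/corpus/sample', 'data/model', 'data/tokenizer']:\n",
"    os.makedirs(d, exist_ok=True)\n",
"\n",
"# Write a small placeholder corpus only if the directory is empty\n",
"sample_file = 'data/corpus/sample/sample.txt'\n",
"if not os.listdir('data/corpus/sample'):\n",
"    with open(sample_file, 'w', encoding='utf-8') as f:\n",
"        f.write('This is a tiny placeholder corpus for the simple_llm demo. ' * 200)\n",
"    print(f'Created placeholder corpus: {sample_file}')"
]
},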
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Инициализация и проверка окружения"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"import os\n",
"import torch\n",
"\n",
"# Проверка версии PyTorch\n",
"print(f\"PyTorch version: {torch.__version__}\")\n",
"\n",
"# Добавление пути к библиотеке\n",
"project_path = os.path.abspath('../simple-llm')\n",
"sys.path.append(project_path)\n",
"print(f\"Путь к проекту: {project_path}\")\n",
"\n",
"# Проверка модулей\n",
"try:\n",
" from simple_llm.tokenizer.bpe import BPETokenizer\n",
" from simple_llm.data.get_data import load_text_corpus\n",
" from simple_llm.transformer.gpt import GPT\n",
" print(\"✓ Все модули успешно импортированы\")\n",
"except ImportError as e:\n",
" print(f\"✗ Ошибка: {e}\")\n",
" print(\"Решение: выполните 'pip install -e .' из корня проекта\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. Работа с токенизатором"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Инициализация токенизатора\n",
"tokenizer = BPETokenizer()\n",
"\n",
"# Загрузка и обработка текста\n",
"corpus_path = 'data/corpus/sample/'\n",
"if os.path.exists(corpus_path):\n",
" text = load_text_corpus(corpus_path)\n",
" print(f\"Загружено текста: {len(text.split())} слов\")\n",
" \n",
" # Обучение токенизатора\n",
" tokenizer.train(text, vocab_size=1000)\n",
" print(f\"Токенизатор обучен, размер словаря: {tokenizer.vocab_size}\")\n",
" \n",
" # Тест токенизации\n",
" test_phrase = \"Пример работы токенизатора\"\n",
" tokens = tokenizer.encode(test_phrase)\n",
" print(f\"Текст: {test_phrase}\")\n",
" print(f\"Токены: {tokens}\")\n",
" print(f\"Обратное преобразование: {tokenizer.decode(tokens)}\")\n",
"else:\n",
" print(f\"Директория {corpus_path} не содержит данных для обучения\")\n",
" print(\"Добавьте текстовые файлы в формате .txt в эту директорию\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4. Подготовка данных для обучения"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"if 'text' in locals():\n",
" # Токенизация всего корпуса\n",
" all_tokens = tokenizer.encode(text)\n",
" \n",
" # Создание обучающих последовательностей\n",
" seq_length = 64\n",
" examples = []\n",
" for i in range(0, len(all_tokens) - seq_length - 1, seq_length):\n",
" input_seq = all_tokens[i:i+seq_length]\n",
" target_seq = all_tokens[i+1:i+seq_length+1]\n",
" examples.append((input_seq, target_seq))\n",
" \n",
" print(f\"Создано обучающих примеров: {len(examples)}\")\n",
" print(f\"Размер последовательности: {seq_length} токенов\")\n",
" print(f\"Пример входных данных: {examples[0][0][:10]}...\")\n",
" print(f\"Пример целевых данных: {examples[0][1][:10]}...\")\n",
"else:\n",
" print(\"Невозможно подготовить данные: текст не загружен\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5. Обучение модели GPT"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"if 'examples' in locals() and len(examples) > 0:\n",
" # Конфигурация модели\n",
" config = {\n",
" 'vocab_size': tokenizer.vocab_size,\n",
" 'embed_dim': 128,\n",
" 'num_heads': 4,\n",
" 'num_layers': 3,\n",
" 'max_len': seq_length\n",
" }\n",
" \n",
" # Инициализация модели\n",
" model = GPT(config)\n",
" optimizer = optim.Adam(model.parameters(), lr=0.001)\n",
" criterion = torch.nn.CrossEntropyLoss()\n",
" \n",
" # Процесс обучения\n",
" num_epochs = 5\n",
" batch_size = 8\n",
" \n",
" for epoch in range(num_epochs):\n",
" total_loss = 0\n",
" model.train()\n",
" \n",
" for i in range(0, len(examples), batch_size):\n",
" batch = examples[i:i+batch_size]\n",
" inputs = torch.tensor([ex[0] for ex in batch])\n",
" targets = torch.tensor([ex[1] for ex in batch])\n",
" \n",
" optimizer.zero_grad()\n",
" outputs = model(inputs)\n",
" loss = criterion(outputs.view(-1, config['vocab_size']), targets.view(-1))\n",
" loss.backward()\n",
" optimizer.step()\n",
" \n",
" total_loss += loss.item()\n",
" \n",
" avg_loss = total_loss / (len(examples) / batch_size)\n",
" print(f\"Эпоха {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}\")\n",
" \n",
" print(\"Обучение завершено!\")\n",
"else:\n",
" print(\"Невозможно начать обучение: нет подготовленных данных\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 6. Генерация текста"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def generate_text(model, tokenizer, prompt, max_len=50, temperature=0.7):\n",
" model.eval()\n",
" tokens = tokenizer.encode(prompt)\n",
" \n",
" for _ in range(max_len):\n",
" input_ids = torch.tensor([tokens[-config['max_len']:]])\n",
" with torch.no_grad():\n",
" logits = model(input_ids)[0, -1, :] / temperature\n",
" probs = torch.softmax(logits, dim=-1)\n",
" next_token = torch.multinomial(probs, num_samples=1).item()\n",
" tokens.append(next_token)\n",
" \n",
" return tokenizer.decode(tokens)\n",
"\n",
"if 'model' in locals():\n",
" prompts = [\n",
" \"Сегодня прекрасный день,\",\n",
" \"Искусственный интеллект\",\n",
" \"В далеком будущем\"\n",
" ]\n",
" \n",
" for prompt in prompts:\n",
" generated = generate_text(model, tokenizer, prompt)\n",
" print(f\"Промпт: '{prompt}'\")\n",
" print(f\"Результат: {generated}\\n\")\n",
"else:\n",
" print(\"Модель не обучена, генерация невозможна\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 7. Сохранение и загрузка моделей"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def save_model(model, tokenizer, model_name):\n",
" model_path = f\"data/model/{model_name}.pth\"\n",
" tokenizer_path = f\"data/tokenizer/{model_name}_tokenizer.json\"\n",
" \n",
" torch.save(model.state_dict(), model_path)\n",
" tokenizer.save(tokenizer_path)\n",
" print(f\"Модель сохранена в {model_path}\")\n",
" print(f\"Токенизатор сохранен в {tokenizer_path}\")\n",
"\n",
"def load_model(model_name, config):\n",
" model_path = f\"data/model/{model_name}.pth\"\n",
" tokenizer_path = f\"data/tokenizer/{model_name}_tokenizer.json\"\n",
" \n",
" model = GPT(config)\n",
" model.load_state_dict(torch.load(model_path))\n",
" \n",
" tokenizer = BPETokenizer()\n",
" tokenizer.load(tokenizer_path)\n",
" \n",
" print(f\"Модель загружена из {model_path}\")\n",
" return model, tokenizer\n",
"\n",
"# Пример использования:\n",
"# save_model(model, tokenizer, \"my_first_model\")\n",
"# loaded_model, loaded_tokenizer = load_model(\"my_first_model\", config)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 8. Советы по улучшению\n",
"\n",
"1. Для лучших результатов:\n",
" - Увеличьте размер корпуса\n",
" - Добавьте больше эпох обучения\n",
" - Настройте параметры модели\n",
"\n",
"2. Экспериментируйте с:\n",
" - Температурой генерации (0.1-1.0)\n",
" - Разными промптами\n",
" - Архитектурой модели\n",
"\n",
"3. Дополнительные возможности:\n",
" - Визуализация attention-карт\n",
" - Реализация beam search\n",
" - Fine-tuning на специфичных данных"
]
}
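,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Beam search sketch\n",
"\n",
"A minimal beam-search decoder, referenced in the tips above. This is only a sketch written against the same GPT interface that `generate_text` uses in section 6: it assumes the trained `model`, the global `config`, and the `tokenizer` from earlier cells, and it is not part of the simple_llm API itself."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def beam_search(model, tokenizer, prompt, beam_width=3, max_new_tokens=20):\n",
"    model.eval()\n",
"    # Each beam is a (token_list, cumulative_log_probability) pair\n",
"    beams = [(tokenizer.encode(prompt), 0.0)]\n",
"    \n",
"    for _ in range(max_new_tokens):\n",
"        candidates = []\n",
"        for tokens, score in beams:\n",
"            input_ids = torch.tensor([tokens[-config['max_len']:]])\n",
"            with torch.no_grad():\n",
"                log_probs = torch.log_softmax(model(input_ids)[0, -1, :], dim=-1)\n",
"            # Expand each beam with its beam_width most likely next tokens\n",
"            top_lp, top_ids = log_probs.topk(beam_width)\n",
"            for lp, idx in zip(top_lp.tolist(), top_ids.tolist()):\n",
"                candidates.append((tokens + [idx], score + lp))\n",
"        # Keep only the beam_width highest-scoring continuations\n",
"        beams = sorted(candidates, key=lambda c: c[1], reverse=True)[:beam_width]\n",
"    \n",
"    return tokenizer.decode(beams[0][0])\n",
"\n",
"# Example (requires the trained model from section 5):\n",
"# print(beam_search(model, tokenizer, \"Artificial intelligence\"))"
]
}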
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}