{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Демонстрация simple_llm\n", "## Полное руководство по установке и использованию" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1. Установка и настройка\n", "\n", "### Клонирование репозитория:\n", "```bash\n", "git clone https://github.com/ваш_username/simple-llm.git\n", "cd simple-llm\n", "```\n", "\n", "### Установка зависимостей:\n", "```bash\n", "pip install -e .\n", "pip install torch tqdm\n", "```\n", "\n", "### Проверка структуры данных:\n", "```bash\n", "mkdir -p data/corpus/sample data/model data/tokenizer\n", "```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2. Инициализация и проверка окружения" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import sys\n", "import os\n", "import torch\n", "\n", "# Проверка версии PyTorch\n", "print(f\"PyTorch version: {torch.__version__}\")\n", "\n", "# Добавление пути к библиотеке\n", "project_path = os.path.abspath('../simple-llm')\n", "sys.path.append(project_path)\n", "print(f\"Путь к проекту: {project_path}\")\n", "\n", "# Проверка модулей\n", "try:\n", " from simple_llm.tokenizer.bpe import BPETokenizer\n", " from simple_llm.data.get_data import load_text_corpus\n", " from simple_llm.transformer.gpt import GPT\n", " print(\"✓ Все модули успешно импортированы\")\n", "except ImportError as e:\n", " print(f\"✗ Ошибка: {e}\")\n", " print(\"Решение: выполните 'pip install -e .' из корня проекта\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3. Работа с токенизатором" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Инициализация токенизатора\n", "tokenizer = BPETokenizer()\n", "\n", "# Загрузка и обработка текста\n", "corpus_path = 'data/corpus/sample/'\n", "if os.path.exists(corpus_path):\n", " text = load_text_corpus(corpus_path)\n", " print(f\"Загружено текста: {len(text.split())} слов\")\n", " \n", " # Обучение токенизатора\n", " tokenizer.train(text, vocab_size=1000)\n", " print(f\"Токенизатор обучен, размер словаря: {tokenizer.vocab_size}\")\n", " \n", " # Тест токенизации\n", " test_phrase = \"Пример работы токенизатора\"\n", " tokens = tokenizer.encode(test_phrase)\n", " print(f\"Текст: {test_phrase}\")\n", " print(f\"Токены: {tokens}\")\n", " print(f\"Обратное преобразование: {tokenizer.decode(tokens)}\")\n", "else:\n", " print(f\"Директория {corpus_path} не содержит данных для обучения\")\n", " print(\"Добавьте текстовые файлы в формате .txt в эту директорию\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 4. 
{ "cell_type": "markdown", "metadata": {}, "source": [ "## 4. Preparing the Training Data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "if 'text' in locals():\n", "    # Tokenize the entire corpus\n", "    all_tokens = tokenizer.encode(text)\n", "\n", "    # Build (input, target) training sequences shifted by one token\n", "    seq_length = 64\n", "    examples = []\n", "    for i in range(0, len(all_tokens) - seq_length - 1, seq_length):\n", "        input_seq = all_tokens[i:i+seq_length]\n", "        target_seq = all_tokens[i+1:i+seq_length+1]\n", "        examples.append((input_seq, target_seq))\n", "\n", "    print(f\"Training examples created: {len(examples)}\")\n", "    print(f\"Sequence length: {seq_length} tokens\")\n", "    print(f\"Sample input: {examples[0][0][:10]}...\")\n", "    print(f\"Sample target: {examples[0][1][:10]}...\")\n", "else:\n", "    print(\"Cannot prepare data: no text loaded\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 5. Training the GPT Model" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import torch.optim as optim\n", "\n", "if 'examples' in locals() and len(examples) > 0:\n", "    # Model configuration\n", "    config = {\n", "        'vocab_size': tokenizer.vocab_size,\n", "        'embed_dim': 128,\n", "        'num_heads': 4,\n", "        'num_layers': 3,\n", "        'max_len': seq_length\n", "    }\n", "\n", "    # Initialize the model, optimizer, and loss\n", "    model = GPT(config)\n", "    optimizer = optim.Adam(model.parameters(), lr=0.001)\n", "    criterion = torch.nn.CrossEntropyLoss()\n", "\n", "    # Training loop\n", "    num_epochs = 5\n", "    batch_size = 8\n", "    num_batches = (len(examples) + batch_size - 1) // batch_size\n", "\n", "    for epoch in range(num_epochs):\n", "        total_loss = 0\n", "        model.train()\n", "\n", "        for i in range(0, len(examples), batch_size):\n", "            batch = examples[i:i+batch_size]\n", "            inputs = torch.tensor([ex[0] for ex in batch])\n", "            targets = torch.tensor([ex[1] for ex in batch])\n", "\n", "            optimizer.zero_grad()\n", "            outputs = model(inputs)\n", "            loss = criterion(outputs.view(-1, config['vocab_size']), targets.view(-1))\n", "            loss.backward()\n", "            optimizer.step()\n", "\n", "            total_loss += loss.item()\n", "\n", "        avg_loss = total_loss / num_batches\n", "        print(f\"Epoch {epoch+1}/{num_epochs}, loss: {avg_loss:.4f}\")\n", "\n", "    print(\"Training complete!\")\n", "else:\n", "    print(\"Cannot start training: no prepared data\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 6. Generating Text" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def generate_text(model, tokenizer, prompt, max_len=50, temperature=0.7, context_len=64):\n", "    # context_len replaces the implicit dependency on the global `config`;\n", "    # it should match the model's max_len\n", "    model.eval()\n", "    tokens = tokenizer.encode(prompt)\n", "\n", "    for _ in range(max_len):\n", "        input_ids = torch.tensor([tokens[-context_len:]])\n", "        with torch.no_grad():\n", "            logits = model(input_ids)[0, -1, :] / temperature\n", "        probs = torch.softmax(logits, dim=-1)\n", "        next_token = torch.multinomial(probs, num_samples=1).item()\n", "        tokens.append(next_token)\n", "\n", "    return tokenizer.decode(tokens)\n", "\n", "if 'model' in locals():\n", "    prompts = [\n", "        \"Today is a beautiful day,\",\n", "        \"Artificial intelligence\",\n", "        \"In the distant future\"\n", "    ]\n", "\n", "    for prompt in prompts:\n", "        generated = generate_text(model, tokenizer, prompt, context_len=config['max_len'])\n", "        print(f\"Prompt: '{prompt}'\")\n", "        print(f\"Output: {generated}\\n\")\n", "else:\n", "    print(\"Model not trained; generation is not possible\")" ] },
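{ "cell_type": "markdown", "metadata": {}, "source": [ "Temperature scales the logits before the softmax: values below 1.0 sharpen the distribution toward the most likely token, while values near 1.0 stay closer to the model's raw probabilities. As a rough illustration of the 0.1-1.0 range suggested in the tips below, this optional sketch reuses the `generate_text` helper defined above and assumes a trained `model` is still in scope." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Optional: compare sampling temperatures (assumes `model`, `tokenizer`,\n", "# and `config` from the earlier cells). Lower temperature gives more\n", "# repetitive but safer text; higher temperature gives more varied output.\n", "if 'model' in locals():\n", "    prompt = \"Artificial intelligence\"\n", "    for temp in (0.2, 0.7, 1.0):\n", "        sample = generate_text(model, tokenizer, prompt, max_len=30,\n", "                               temperature=temp, context_len=config['max_len'])\n", "        print(f\"temperature={temp}: {sample}\\n\")\n", "else:\n", "    print(\"Train the model in section 5 before running this comparison\")" ] },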
{ "cell_type": "markdown", "metadata": {}, "source": [ "## 7. Saving and Loading Models" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def save_model(model, tokenizer, model_name):\n", "    model_path = f\"data/model/{model_name}.pth\"\n", "    tokenizer_path = f\"data/tokenizer/{model_name}_tokenizer.json\"\n", "\n", "    # Make sure the target directories exist\n", "    os.makedirs(os.path.dirname(model_path), exist_ok=True)\n", "    os.makedirs(os.path.dirname(tokenizer_path), exist_ok=True)\n", "\n", "    torch.save(model.state_dict(), model_path)\n", "    tokenizer.save(tokenizer_path)\n", "    print(f\"Model saved to {model_path}\")\n", "    print(f\"Tokenizer saved to {tokenizer_path}\")\n", "\n", "def load_model(model_name, config):\n", "    model_path = f\"data/model/{model_name}.pth\"\n", "    tokenizer_path = f\"data/tokenizer/{model_name}_tokenizer.json\"\n", "\n", "    model = GPT(config)\n", "    # map_location lets the checkpoint load on machines without a GPU\n", "    model.load_state_dict(torch.load(model_path, map_location='cpu'))\n", "\n", "    tokenizer = BPETokenizer()\n", "    tokenizer.load(tokenizer_path)\n", "\n", "    print(f\"Model loaded from {model_path}\")\n", "    return model, tokenizer\n", "\n", "# Example usage:\n", "# save_model(model, tokenizer, \"my_first_model\")\n", "# loaded_model, loaded_tokenizer = load_model(\"my_first_model\", config)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 8. Tips for Improvement\n", "\n", "1. For better results:\n", "   - Use a larger training corpus\n", "   - Train for more epochs\n", "   - Tune the model hyperparameters\n", "\n", "2. Experiment with:\n", "   - Generation temperature (0.1-1.0)\n", "   - Different prompts\n", "   - The model architecture\n", "\n", "3. Possible extensions:\n", "   - Visualizing attention maps\n", "   - Implementing beam search\n", "   - Fine-tuning on domain-specific data" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.6" } }, "nbformat": 4, "nbformat_minor": 4 }