Mirror of https://github.com/pese-git/llm-arch-research.git, synced 2026-01-23 21:10:54 +00:00
Refactoring: consistent code formatting across the whole project (whitespace, quotes, blank lines), with no changes to logic.
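The recurring edits in this diff match what an auto-formatter such as Black produces; the commit message does not name a tool, so that is an assumption. A minimal before/after sketch of the two transformations seen below:

# Illustration only (assumed Black-style formatting), not taken from the commit.

# Before: multi-line dict literal without a trailing comma
config = {
    "vocab_size": 1000,
    "dropout": 0.1
}

# After: trailing comma added; the resulting dict is identical
config = {
    "vocab_size": 1000,
    "dropout": 0.1,
}

# Call arguments spread over several lines are joined into one when the
# joined form fits the line-length limit, e.g.
# tokenizer.train(
#     texts=texts,
#     vocab_size=50,
# )
# becomes
# tokenizer.train(texts=texts, vocab_size=50)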
@@ -11,25 +11,25 @@ import os
def test_gpt_model_creation():
    """Test that GPT model can be created and forward pass works."""
    from llm.models.gpt import GPT

    config = {
        "vocab_size": 1000,
        "embed_dim": 128,
        "num_heads": 4,
        "num_layers": 2,
        "max_position_embeddings": 256,
-        "dropout": 0.1
+        "dropout": 0.1,
    }

    model = GPT(config)

    # Test forward pass
    batch_size, seq_len = 2, 16
    input_ids = torch.randint(0, config["vocab_size"], (batch_size, seq_len))

    with torch.no_grad():
        logits = model(input_ids)

    assert logits.shape == (batch_size, seq_len, config["vocab_size"])
    print("✅ GPT model creation and forward pass test passed")
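The only change in this hunk is the trailing comma after "dropout": 0.1. A trailing comma inside a Python literal is purely syntactic, which is easy to confirm and backs the commit's "no changes to logic" claim:

# A trailing comma does not change the value of a dict or list literal.
assert {"dropout": 0.1} == {"dropout": 0.1,}
assert [1, 2] == [1, 2,]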
@@ -37,27 +37,21 @@ def test_gpt_model_creation():
def test_bpe_tokenizer_basic():
    """Test basic BPE tokenizer functionality."""
    from llm.tokenizers import BPETokenizer

    tokenizer = BPETokenizer()

    # Train on simple texts
-    texts = [
-        "hello world",
-        "test tokenization",
-        "simple example"
-    ]
+    texts = ["hello world", "test tokenization", "simple example"]

    tokenizer.train(
-        texts=texts,
-        vocab_size=50,
-        special_tokens=["<pad>", "<unk>", "<bos>", "<eos>"]
+        texts=texts, vocab_size=50, special_tokens=["<pad>", "<unk>", "<bos>", "<eos>"]
    )

    # Test encoding/decoding
    text = "hello world"
    tokens = tokenizer.encode(text)
    decoded = tokenizer.decode(tokens)

    assert isinstance(tokens, list)
    assert isinstance(decoded, str)
    assert len(tokens) > 0
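For orientation, BPE training as exercised here boils down to repeatedly merging the most frequent adjacent symbol pair. A rough sketch of one such selection step (the real BPETokenizer in llm.tokenizers may differ in details):

# Sketch of a single BPE merge selection; illustration, not the project's code.
from collections import Counter

texts = ["hello world", "test tokenization", "simple example"]
symbols = [list(t) for t in texts]  # start from individual characters

pairs = Counter()  # frequency of adjacent symbol pairs across the corpus
for word in symbols:
    for a, b in zip(word, word[1:]):
        pairs[(a, b)] += 1

best_pair = pairs.most_common(1)[0][0]  # becomes the next merge rule
print("first merge candidate:", best_pair)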
@@ -67,18 +61,18 @@ def test_bpe_tokenizer_basic():
def test_token_embeddings():
    """Test token embeddings."""
    from llm.core.token_embeddings import TokenEmbeddings

    vocab_size = 1000
    embed_dim = 128

    embeddings = TokenEmbeddings(vocab_size, embed_dim)

    # Test forward pass
    batch_size, seq_len = 2, 16
    input_ids = torch.randint(0, vocab_size, (batch_size, seq_len))

    output = embeddings(input_ids)

    assert output.shape == (batch_size, seq_len, embed_dim)
    print("✅ Token embeddings test passed")
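TokenEmbeddings is exercised only through its lookup shape. The same contract can be sketched with the built-in torch.nn.Embedding (an analogy for readers, not the module under test):

import torch

# Shape contract the test asserts, shown with torch.nn.Embedding:
# integer ids (batch, seq) map to vectors (batch, seq, embed_dim).
emb = torch.nn.Embedding(num_embeddings=1000, embedding_dim=128)
ids = torch.randint(0, 1000, (2, 16))
assert emb(ids).shape == (2, 16, 128)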
@@ -86,20 +80,20 @@ def test_token_embeddings():
def test_multi_head_attention():
    """Test multi-head attention."""
    from llm.core.multi_head_attention import MultiHeadAttention

    num_heads = 4
    emb_size = 128
    head_size = emb_size // num_heads
    max_seq_len = 256

    attention = MultiHeadAttention(num_heads, emb_size, head_size, max_seq_len)

    # Test forward pass
    batch_size, seq_len = 2, 16
    inputs = torch.randn(batch_size, seq_len, emb_size)

    output, _ = attention(inputs)

    assert output.shape == inputs.shape
    print("✅ Multi-head attention test passed")
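The test checks only that attention preserves the input shape. PyTorch's stock implementation satisfies the same invariant and can serve as a reference point (its constructor signature differs from the project's MultiHeadAttention):

import torch

# Reference shape check with torch.nn.MultiheadAttention, batch_first=True.
mha = torch.nn.MultiheadAttention(embed_dim=128, num_heads=4, batch_first=True)
x = torch.randn(2, 16, 128)
out, _ = mha(x, x, x)  # self-attention: query, key and value are the same tensor
assert out.shape == x.shape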
@@ -107,17 +101,17 @@ def test_multi_head_attention():
def test_feed_forward():
    """Test feed forward network."""
    from llm.core.feed_forward import FeedForward

    embed_dim = 128

    ff = FeedForward(embed_dim)

    # Test forward pass
    batch_size, seq_len = 2, 16
    inputs = torch.randn(batch_size, seq_len, embed_dim)

    output = ff(inputs)

    assert output.shape == inputs.shape
    print("✅ Feed forward test passed")
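FeedForward(embed_dim) is likewise checked only for shape preservation. A conventional transformer FFN with that property looks like the sketch below; the 4x hidden width and GELU are assumptions, not read from llm.core.feed_forward:

import torch

# Minimal position-wise FFN sketch: expand, apply a nonlinearity, project back.
class FeedForwardSketch(torch.nn.Module):
    def __init__(self, embed_dim: int):
        super().__init__()
        self.net = torch.nn.Sequential(
            torch.nn.Linear(embed_dim, 4 * embed_dim),
            torch.nn.GELU(),
            torch.nn.Linear(4 * embed_dim, embed_dim),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.net(x)

assert FeedForwardSketch(128)(torch.randn(2, 16, 128)).shape == (2, 16, 128)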
@@ -125,29 +119,25 @@ def test_feed_forward():
def test_gpt_generation():
    """Test GPT text generation."""
    from llm.models.gpt import GPT

    config = {
        "vocab_size": 1000,
        "embed_dim": 128,
        "num_heads": 4,
        "num_layers": 2,
        "max_position_embeddings": 256,
-        "dropout": 0.1
+        "dropout": 0.1,
    }

    model = GPT(config)
    model.eval()

    # Test greedy generation
    input_ids = torch.randint(0, config["vocab_size"], (1, 5))

    with torch.no_grad():
-        generated = model.generate(
-            x=input_ids,
-            max_new_tokens=3,
-            do_sample=False
-        )
+        generated = model.generate(x=input_ids, max_new_tokens=3, do_sample=False)

    assert generated.shape == (1, 8)  # 5 initial + 3 new tokens
    print("✅ GPT generation test passed")
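The (1, 8) assertion relies on generate appending exactly max_new_tokens ids to the 5-token prompt. A minimal greedy-decoding loop with that property, as a sketch of the general pattern rather than GPT.generate itself:

import torch

# Greedy decoding sketch: each step appends the argmax token, so the output
# is always (batch, prompt_len + max_new_tokens) -- here (1, 5 + 3) = (1, 8).
def greedy_generate(model, x, max_new_tokens):
    for _ in range(max_new_tokens):
        logits = model(x)                          # (batch, seq, vocab)
        next_id = logits[:, -1, :].argmax(dim=-1)  # most likely next token
        x = torch.cat([x, next_id.unsqueeze(-1)], dim=-1)
    return x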
@@ -155,50 +145,48 @@ def test_gpt_generation():
def test_bpe_tokenizer_save_load():
    """Test BPE tokenizer save/load functionality."""
    from llm.tokenizers import BPETokenizer

    tokenizer = BPETokenizer()

    # Train on simple texts
    texts = ["hello world", "test save load"]
    tokenizer.train(
-        texts=texts,
-        vocab_size=30,
-        special_tokens=["<pad>", "<unk>", "<bos>", "<eos>"]
+        texts=texts, vocab_size=30, special_tokens=["<pad>", "<unk>", "<bos>", "<eos>"]
    )

    with tempfile.TemporaryDirectory() as temp_dir:
        save_path = os.path.join(temp_dir, "test_tokenizer.json")

        # Save tokenizer
        tokenizer.save(save_path)
        assert os.path.exists(save_path)

        # Load tokenizer
        loaded_tokenizer = BPETokenizer.load(save_path)

        # Test that vocab size is the same
        assert tokenizer.get_vocab_size() == loaded_tokenizer.get_vocab_size()

        # Test that vocabularies are the same
        assert tokenizer.get_vocab() == loaded_tokenizer.get_vocab()

        # Test that both can encode/decode (even if tokens differ due to BPE state)
        text = "hello world"
        original_tokens = tokenizer.encode(text)
        loaded_tokens = loaded_tokenizer.encode(text)

        # Both should produce valid token lists
        assert isinstance(original_tokens, list)
        assert isinstance(loaded_tokens, list)
        assert len(original_tokens) > 0
        assert len(loaded_tokens) > 0

        # Both should be able to decode
        original_decoded = tokenizer.decode(original_tokens)
        loaded_decoded = loaded_tokenizer.decode(loaded_tokens)
        assert isinstance(original_decoded, str)
        assert isinstance(loaded_decoded, str)

    print("✅ BPE tokenizer save/load test passed")
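The save/load test follows the standard serialization round-trip pattern: write state to a temporary file, reload it, and compare observable state. The skeleton, reduced to stdlib pieces (the real tokenizer serializes its vocab and merge rules, not this toy dict):

import json
import os
import tempfile

# Round-trip skeleton mirroring the test's structure.
state = {"vocab": {"hello": 0, "world": 1}}
with tempfile.TemporaryDirectory() as temp_dir:
    path = os.path.join(temp_dir, "state.json")
    with open(path, "w") as f:
        json.dump(state, f)
    with open(path) as f:
        loaded = json.load(f)
    assert loaded == state  # observable state survives the round trip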
@@ -206,18 +194,16 @@ def test_gpt_with_tokenizer():
    """Test GPT model with tokenizer integration."""
    from llm.models.gpt import GPT
    from llm.tokenizers import BPETokenizer

    # Create and train tokenizer
    tokenizer = BPETokenizer()
    texts = ["hello world", "test integration"]
    tokenizer.train(
-        texts=texts,
-        vocab_size=50,
-        special_tokens=["<pad>", "<unk>", "<bos>", "<eos>"]
+        texts=texts, vocab_size=50, special_tokens=["<pad>", "<unk>", "<bos>", "<eos>"]
    )

    vocab_size = tokenizer.get_vocab_size()

    # Create GPT model with tokenizer's vocab size
    config = {
        "vocab_size": vocab_size,
@@ -225,19 +211,19 @@ def test_gpt_with_tokenizer():
        "num_heads": 4,
        "num_layers": 2,
        "max_position_embeddings": 256,
-        "dropout": 0.1
+        "dropout": 0.1,
    }

    model = GPT(config)

    # Test with tokenized input
    text = "hello world"
    tokens = tokenizer.encode(text, add_special_tokens=False)
    input_ids = torch.tensor([tokens])

    with torch.no_grad():
        logits = model(input_ids)

    assert logits.shape == (1, len(tokens), vocab_size)
    print("✅ GPT with tokenizer integration test passed")
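The integration test stops at the logits shape (1, len(tokens), vocab_size). One step further, turning those logits into a next-token prediction, is an argmax over the vocabulary axis (a hypothetical follow-up, not part of the test):

import torch

# Pick the most likely next token id from the final position's logits.
vocab_size = 50
logits = torch.randn(1, 4, vocab_size)  # (batch, seq, vocab) as in the test
next_token_id = logits[0, -1].argmax().item()
assert 0 <= next_token_id < vocab_size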
@@ -245,7 +231,7 @@ def test_gpt_with_tokenizer():
def run_all_tests():
    """Run all basic tests."""
    print("🧪 Running basic tests for llm library...")

    test_gpt_model_creation()
    test_bpe_tokenizer_basic()
    test_token_embeddings()
@@ -254,7 +240,7 @@ def run_all_tests():
    test_gpt_generation()
    test_bpe_tokenizer_save_load()
    test_gpt_with_tokenizer()

    print("🎉 All basic tests passed!")