# llm-arch-research/llm/tests/test_basic.py
"""
Basic tests for llm library components.
"""
import pytest
import torch
import tempfile
import os


def test_gpt_model_creation():
    """Test that GPT model can be created and forward pass works."""
    from llm.models.gpt import GPT

    config = {
        "vocab_size": 1000,
        "embed_dim": 128,
        "num_heads": 4,
        "num_layers": 2,
        "max_position_embeddings": 256,
        "dropout": 0.1,
    }
    model = GPT(config)

    # Test forward pass
    batch_size, seq_len = 2, 16
    input_ids = torch.randint(0, config["vocab_size"], (batch_size, seq_len))
    with torch.no_grad():
        logits, _ = model(input_ids)

    assert logits.shape == (batch_size, seq_len, config["vocab_size"])
    print("✅ GPT model creation and forward pass test passed")


def test_bpe_tokenizer_basic():
    """Test basic BPE tokenizer functionality."""
    from llm.tokenizers import BPETokenizer

    tokenizer = BPETokenizer()

    # Train on simple texts
    texts = ["hello world", "test tokenization", "simple example"]
    tokenizer.train(
        texts=texts, vocab_size=50, special_tokens=["<pad>", "<unk>", "<bos>", "<eos>"]
    )

    # Test encoding/decoding
    text = "hello world"
    tokens = tokenizer.encode(text)
    decoded = tokenizer.decode(tokens)

    assert isinstance(tokens, list)
    assert isinstance(decoded, str)
    assert len(tokens) > 0
    print("✅ BPE tokenizer basic test passed")


def test_token_embeddings():
    """Test token embeddings."""
    from llm.core.token_embeddings import TokenEmbeddings

    vocab_size = 1000
    embed_dim = 128
    embeddings = TokenEmbeddings(vocab_size, embed_dim)

    # Test forward pass
    batch_size, seq_len = 2, 16
    input_ids = torch.randint(0, vocab_size, (batch_size, seq_len))
    output = embeddings(input_ids)

    assert output.shape == (batch_size, seq_len, embed_dim)
    print("✅ Token embeddings test passed")


def test_multi_head_attention():
    """Test multi-head attention."""
    from llm.core.multi_head_attention import MultiHeadAttention

    num_heads = 4
    emb_size = 128
    head_size = emb_size // num_heads
    max_seq_len = 256
    attention = MultiHeadAttention(num_heads, emb_size, head_size, max_seq_len)

    # Test forward pass
    batch_size, seq_len = 2, 16
    inputs = torch.randn(batch_size, seq_len, emb_size)
    output, _ = attention(inputs)

    assert output.shape == inputs.shape
    print("✅ Multi-head attention test passed")


def test_feed_forward():
    """Test feed forward network."""
    from llm.core.feed_forward import FeedForward

    embed_dim = 128
    ff = FeedForward(embed_dim)

    # Test forward pass
    batch_size, seq_len = 2, 16
    inputs = torch.randn(batch_size, seq_len, embed_dim)
    output = ff(inputs)

    assert output.shape == inputs.shape
    print("✅ Feed forward test passed")


def test_gpt_generation():
    """Test GPT text generation."""
    from llm.models.gpt import GPT

    config = {
        "vocab_size": 1000,
        "embed_dim": 128,
        "num_heads": 4,
        "num_layers": 2,
        "max_position_embeddings": 256,
        "dropout": 0.1,
    }
    model = GPT(config)
    model.eval()

    # Test greedy generation
    input_ids = torch.randint(0, config["vocab_size"], (1, 5))
    with torch.no_grad():
        generated = model.generate(x=input_ids, max_new_tokens=3, do_sample=False)

    assert generated.shape == (1, 8)  # 5 initial + 3 new tokens
    print("✅ GPT generation test passed")


def test_bpe_tokenizer_save_load():
    """Test BPE tokenizer save/load functionality."""
    from llm.tokenizers import BPETokenizer

    tokenizer = BPETokenizer()

    # Train on simple texts
    texts = ["hello world", "test save load"]
    tokenizer.train(
        texts=texts, vocab_size=30, special_tokens=["<pad>", "<unk>", "<bos>", "<eos>"]
    )

    with tempfile.TemporaryDirectory() as temp_dir:
        save_path = os.path.join(temp_dir, "test_tokenizer.json")

        # Save tokenizer
        tokenizer.save(save_path)
        assert os.path.exists(save_path)

        # Load tokenizer
        loaded_tokenizer = BPETokenizer.load(save_path)

        # Test that vocab size is the same
        assert tokenizer.get_vocab_size() == loaded_tokenizer.get_vocab_size()

        # Test that vocabularies are the same
        assert tokenizer.get_vocab() == loaded_tokenizer.get_vocab()

        # Test that both can encode/decode (even if tokens differ due to BPE state)
        text = "hello world"
        original_tokens = tokenizer.encode(text)
        loaded_tokens = loaded_tokenizer.encode(text)

        # Both should produce valid token lists
        assert isinstance(original_tokens, list)
        assert isinstance(loaded_tokens, list)
        assert len(original_tokens) > 0
        assert len(loaded_tokens) > 0

        # Both should be able to decode
        original_decoded = tokenizer.decode(original_tokens)
        loaded_decoded = loaded_tokenizer.decode(loaded_tokens)
        assert isinstance(original_decoded, str)
        assert isinstance(loaded_decoded, str)

    print("✅ BPE tokenizer save/load test passed")


def test_gpt_with_tokenizer():
    """Test GPT model with tokenizer integration."""
    from llm.models.gpt import GPT
    from llm.tokenizers import BPETokenizer

    # Create and train tokenizer
    tokenizer = BPETokenizer()
    texts = ["hello world", "test integration"]
    tokenizer.train(
        texts=texts, vocab_size=50, special_tokens=["<pad>", "<unk>", "<bos>", "<eos>"]
    )
    vocab_size = tokenizer.get_vocab_size()

    # Create GPT model with tokenizer's vocab size
    config = {
        "vocab_size": vocab_size,
        "embed_dim": 128,
        "num_heads": 4,
        "num_layers": 2,
        "max_position_embeddings": 256,
        "dropout": 0.1,
    }
    model = GPT(config)

    # Test with tokenized input
    text = "hello world"
    tokens = tokenizer.encode(text, add_special_tokens=False)
    input_ids = torch.tensor([tokens])
    with torch.no_grad():
        logits, _ = model(input_ids)

    assert logits.shape == (1, len(tokens), vocab_size)
    print("✅ GPT with tokenizer integration test passed")


def run_all_tests():
    """Run all basic tests."""
    print("🧪 Running basic tests for llm library...")
    test_gpt_model_creation()
    test_bpe_tokenizer_basic()
    test_token_embeddings()
    test_multi_head_attention()
    test_feed_forward()
    test_gpt_generation()
    test_bpe_tokenizer_save_load()
    test_gpt_with_tokenizer()
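    # Hedged extra checks (sketches defined above), run after the original suite
    test_multi_head_attention_variable_lengths()
    test_gpt_generation_sampling()
    test_gpt_generate_and_decode()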
print("🎉 All basic tests passed!")


if __name__ == "__main__":
    run_all_tests()