# llm-arch-research/llm/tests/test_basic.py
"""
Basic tests for llm library components.
"""
import pytest
import torch
import tempfile
import os


def test_gpt_model_creation():
    """Test that GPT model can be created and forward pass works."""
    from llm.models.gpt import GPT

    config = {
        "vocab_size": 1000,
        "embed_dim": 128,
        "num_heads": 4,
        "num_layers": 2,
        "max_position_embeddings": 256,
        "dropout": 0.1,
    }
    model = GPT(config)

    # Test forward pass
    batch_size, seq_len = 2, 16
    input_ids = torch.randint(0, config["vocab_size"], (batch_size, seq_len))
    with torch.no_grad():
        logits, _ = model(input_ids)

    assert logits.shape == (batch_size, seq_len, config["vocab_size"])
    print("✅ GPT model creation and forward pass test passed")


def test_bpe_tokenizer_basic():
    """Test basic BPE tokenizer functionality."""
    from llm.tokenizers import BPETokenizer

    tokenizer = BPETokenizer()

    # Train on simple texts
    texts = ["hello world", "test tokenization", "simple example"]
    tokenizer.train(
        texts=texts, vocab_size=50, special_tokens=["<pad>", "<unk>", "<bos>", "<eos>"]
    )

    # Test encoding/decoding
    text = "hello world"
    tokens = tokenizer.encode(text)
    decoded = tokenizer.decode(tokens)

    assert isinstance(tokens, list)
    assert isinstance(decoded, str)
    assert len(tokens) > 0
    print("✅ BPE tokenizer basic test passed")


def test_token_embeddings():
    """Test token embeddings."""
    from llm.core.token_embeddings import TokenEmbeddings

    vocab_size = 1000
    embed_dim = 128
    embeddings = TokenEmbeddings(vocab_size, embed_dim)

    # Test forward pass
    batch_size, seq_len = 2, 16
    input_ids = torch.randint(0, vocab_size, (batch_size, seq_len))
    output = embeddings(input_ids)

    assert output.shape == (batch_size, seq_len, embed_dim)
    print("✅ Token embeddings test passed")


def test_multi_head_attention():
    """Test multi-head attention."""
    from llm.core.multi_head_attention import MultiHeadAttention

    num_heads = 4
    emb_size = 128
    head_size = emb_size // num_heads
    max_seq_len = 256
    attention = MultiHeadAttention(num_heads, emb_size, head_size, max_seq_len)

    # Test forward pass
    batch_size, seq_len = 2, 16
    inputs = torch.randn(batch_size, seq_len, emb_size)
    output, _ = attention(inputs)

    assert output.shape == inputs.shape
    print("✅ Multi-head attention test passed")


def test_feed_forward():
    """Test feed forward network."""
    from llm.core.feed_forward import FeedForward

    embed_dim = 128
    ff = FeedForward(embed_dim)

    # Test forward pass
    batch_size, seq_len = 2, 16
    inputs = torch.randn(batch_size, seq_len, embed_dim)
    output = ff(inputs)

    assert output.shape == inputs.shape
    print("✅ Feed forward test passed")


def test_gpt_generation():
    """Test GPT text generation."""
    from llm.models.gpt import GPT

    config = {
        "vocab_size": 1000,
        "embed_dim": 128,
        "num_heads": 4,
        "num_layers": 2,
        "max_position_embeddings": 256,
        "dropout": 0.1,
    }
    model = GPT(config)
    model.eval()

    # Test greedy generation
    input_ids = torch.randint(0, config["vocab_size"], (1, 5))
    with torch.no_grad():
        generated = model.generate(x=input_ids, max_new_tokens=3, do_sample=False)

    assert generated.shape == (1, 8)  # 5 initial + 3 new tokens
    print("✅ GPT generation test passed")


def test_bpe_tokenizer_save_load():
    """Test BPE tokenizer save/load functionality."""
    from llm.tokenizers import BPETokenizer

    tokenizer = BPETokenizer()

    # Train on simple texts
    texts = ["hello world", "test save load"]
    tokenizer.train(
        texts=texts, vocab_size=30, special_tokens=["<pad>", "<unk>", "<bos>", "<eos>"]
    )

    with tempfile.TemporaryDirectory() as temp_dir:
        save_path = os.path.join(temp_dir, "test_tokenizer.json")

        # Save tokenizer
        tokenizer.save(save_path)
        assert os.path.exists(save_path)

        # Load tokenizer
        loaded_tokenizer = BPETokenizer.load(save_path)

        # Test that vocab size is the same
        assert tokenizer.get_vocab_size() == loaded_tokenizer.get_vocab_size()

        # Test that vocabularies are the same
        assert tokenizer.get_vocab() == loaded_tokenizer.get_vocab()

        # Test that both can encode/decode (even if tokens differ due to BPE state)
        text = "hello world"
        original_tokens = tokenizer.encode(text)
        loaded_tokens = loaded_tokenizer.encode(text)

        # Both should produce valid token lists
        assert isinstance(original_tokens, list)
        assert isinstance(loaded_tokens, list)
        assert len(original_tokens) > 0
        assert len(loaded_tokens) > 0

        # Both should be able to decode
        original_decoded = tokenizer.decode(original_tokens)
        loaded_decoded = loaded_tokenizer.decode(loaded_tokens)
        assert isinstance(original_decoded, str)
        assert isinstance(loaded_decoded, str)

    print("✅ BPE tokenizer save/load test passed")


def test_gpt_with_tokenizer():
    """Test GPT model with tokenizer integration."""
    from llm.models.gpt import GPT
    from llm.tokenizers import BPETokenizer

    # Create and train tokenizer
    tokenizer = BPETokenizer()
    texts = ["hello world", "test integration"]
    tokenizer.train(
        texts=texts, vocab_size=50, special_tokens=["<pad>", "<unk>", "<bos>", "<eos>"]
    )
    vocab_size = tokenizer.get_vocab_size()

    # Create GPT model with tokenizer's vocab size
    config = {
        "vocab_size": vocab_size,
        "embed_dim": 128,
        "num_heads": 4,
        "num_layers": 2,
        "max_position_embeddings": 256,
        "dropout": 0.1,
    }
    model = GPT(config)

    # Test with tokenized input
    text = "hello world"
    tokens = tokenizer.encode(text, add_special_tokens=False)
    input_ids = torch.tensor([tokens])
    with torch.no_grad():
        logits, _ = model(input_ids)

    assert logits.shape == (1, len(tokens), vocab_size)
    print("✅ GPT with tokenizer integration test passed")


def run_all_tests():
    """Run all basic tests."""
    print("🧪 Running basic tests for llm library...")
    test_gpt_model_creation()
    test_bpe_tokenizer_basic()
    test_token_embeddings()
    test_multi_head_attention()
    test_feed_forward()
    test_gpt_generation()
    test_bpe_tokenizer_save_load()
    test_gpt_with_tokenizer()
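    # Hedged extra checks (sketches defined above), run after the original suite
    test_multi_head_attention_variable_lengths()
    test_gpt_generation_sampling()
    test_gpt_generate_and_decode()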
print("🎉 All basic tests passed!")


if __name__ == "__main__":
    run_all_tests()