mirror of
https://github.com/pese-git/llm-arch-research.git
synced 2026-01-23 21:10:54 +00:00
test: add comprehensive test suite for LLM components
- Add pytest configuration and fixtures - Add tests for core modules: decoder, feed_forward, multi_head_attention - Add tests for positional and token embeddings - Add tests for GPT model - Add tests for tokenizers (base and BPE) - Add basic integration tests
This commit is contained in:
58
llm/tests/tokenizers/test_base_tokenizer.py
Normal file
58
llm/tests/tokenizers/test_base_tokenizer.py
Normal file
@@ -0,0 +1,58 @@
|
||||
"""
|
||||
Tests for base tokenizer.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from llm.tokenizers import BaseTokenizer
|
||||
|
||||
|
||||
class ConcreteTokenizer(BaseTokenizer):
    """Minimal concrete subclass used as a test double for BaseTokenizer.

    Every abstract method is overridden with a fixed-value stub so the
    shared BaseTokenizer machinery can be exercised without a real
    training/encoding implementation.
    """

    def train(self, texts: list, vocab_size: int = 1000, **kwargs):
        """No-op stub: training behavior is irrelevant to these tests."""

    def encode(self, text: str, **kwargs) -> list:
        """Stub: ignores *text* and always yields the same token ids."""
        return [1, 2, 3]

    def decode(self, tokens: list, **kwargs) -> str:
        """Stub: ignores *tokens* and always yields the same string."""
        return "decoded text"
||||
class TestBaseTokenizer:
    """Behavioral checks for the BaseTokenizer contract via a concrete stub."""

    def test_initialization(self):
        """A freshly built concrete subclass starts with an empty vocabulary."""
        tok = ConcreteTokenizer()
        assert tok is not None
        assert tok.vocab == {}
        assert tok.vocab_size == 0

    def test_encode_implemented(self):
        """The overridden encode() is callable and returns the stub token ids."""
        tok = ConcreteTokenizer()
        assert tok.encode("test text") == [1, 2, 3]

    def test_decode_implemented(self):
        """The overridden decode() is callable and returns the stub string."""
        tok = ConcreteTokenizer()
        assert tok.decode([1, 2, 3]) == "decoded text"

    def test_get_vocab_size(self):
        """get_vocab_size() reflects the vocab_size attribute."""
        tok = ConcreteTokenizer()
        tok.vocab = {"a": 0, "b": 1, "c": 2}
        tok.vocab_size = 3
        assert tok.get_vocab_size() == 3

    def test_get_vocab(self):
        """get_vocab() returns the mapping stored on the instance."""
        tok = ConcreteTokenizer()
        expected = {"a": 0, "b": 1, "c": 2}
        tok.vocab = expected
        assert tok.get_vocab() == expected
Reference in New Issue
Block a user