mirror of
https://github.com/pese-git/llm-arch-research.git
synced 2026-01-23 21:10:54 +00:00
test: add comprehensive test suite for LLM components
- Add pytest configuration and fixtures - Add tests for core modules: decoder, feed_forward, multi_head_attention - Add tests for positional and token embeddings - Add tests for GPT model - Add tests for tokenizers (base and BPE) - Add basic integration tests
This commit is contained in:
58
llm/tests/tokenizers/test_base_tokenizer.py
Normal file
58
llm/tests/tokenizers/test_base_tokenizer.py
Normal file
@@ -0,0 +1,58 @@
|
||||
"""
|
||||
Tests for base tokenizer.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from llm.tokenizers import BaseTokenizer
|
||||
|
||||
|
||||
class ConcreteTokenizer(BaseTokenizer):
    """Minimal concrete subclass used as a test double for BaseTokenizer.

    Every abstract method is overridden with a fixed-value stub so the
    shared BaseTokenizer machinery can be exercised without a real
    training/encoding implementation.
    """

    def train(self, texts: list, vocab_size: int = 1000, **kwargs):
        """No-op stub: training behavior is irrelevant to these tests."""

    def encode(self, text: str, **kwargs) -> list:
        """Stub: ignores *text* and always yields the same token ids."""
        return [1, 2, 3]

    def decode(self, tokens: list, **kwargs) -> str:
        """Stub: ignores *tokens* and always yields the same string."""
        return "decoded text"
||||
class TestBaseTokenizer:
    """Behavioral checks for the BaseTokenizer contract via a concrete stub."""

    def test_initialization(self):
        """A freshly built concrete subclass starts with an empty vocabulary."""
        tok = ConcreteTokenizer()
        assert tok is not None
        assert tok.vocab == {}
        assert tok.vocab_size == 0

    def test_encode_implemented(self):
        """The overridden encode() is callable and returns the stub token ids."""
        tok = ConcreteTokenizer()
        assert tok.encode("test text") == [1, 2, 3]

    def test_decode_implemented(self):
        """The overridden decode() is callable and returns the stub string."""
        tok = ConcreteTokenizer()
        assert tok.decode([1, 2, 3]) == "decoded text"

    def test_get_vocab_size(self):
        """get_vocab_size() reflects the vocab_size attribute."""
        tok = ConcreteTokenizer()
        tok.vocab = {"a": 0, "b": 1, "c": 2}
        tok.vocab_size = 3
        assert tok.get_vocab_size() == 3

    def test_get_vocab(self):
        """get_vocab() returns the mapping stored on the instance."""
        tok = ConcreteTokenizer()
        expected = {"a": 0, "b": 1, "c": 2}
        tok.vocab = expected
        assert tok.get_vocab() == expected
Reference in New Issue
Block a user