Рефакторинг: единообразное оформление кода по всему проекту (пробелы, кавычки, пустые строки) без изменения логики.

2026-01-23 21:10:54 +00:00 · 2025-10-06 22:57:19 +03:00
parent 332cad6159
commit 712278e33c
49 changed files with 2324 additions and 2004 deletions
--- a/llm/tests/conftest.py
+++ b/llm/tests/conftest.py
@@ -58,7 +58,7 @@ def gpt_config(vocab_size, embed_dim, num_heads, num_layers):
        "num_heads": num_heads,
        "num_layers": num_layers,
        "max_position_embeddings": 1024,
-        "dropout": 0.1
+        "dropout": 0.1,
    }


@@ -68,12 +68,14 @@ def random_inputs(batch_size, seq_len, vocab_size):
    input_ids = torch.randint(0, vocab_size, (batch_size, seq_len))
    return input_ids

+
@pytest.fixture
 def random_float_inputs(batch_size, seq_len, embed_dim):
    """Generate random floating point input tensors for testing feed forward."""
    inputs = torch.randn(batch_size, seq_len, embed_dim)
    return inputs

+
@pytest.fixture
 def random_embeddings(batch_size, seq_len, embed_dim):
    """Generate random embedding tensors for testing attention modules."""
--- a/llm/tests/core/test_decoder.py
+++ b/llm/tests/core/test_decoder.py
@@ -9,180 +9,233 @@ from llm.core.decoder import Decoder

 class TestDecoder:
    """Test cases for Decoder."""
-    
+
    def test_initialization(self, embed_dim, num_heads):
        """Test that Decoder can be initialized."""
        head_size = embed_dim // num_heads
        max_seq_len = 1024
-        decoder = Decoder(num_heads=num_heads, emb_size=embed_dim, head_size=head_size, max_seq_len=max_seq_len)
+        decoder = Decoder(
+            num_heads=num_heads,
+            emb_size=embed_dim,
+            head_size=head_size,
+            max_seq_len=max_seq_len,
+        )
        assert decoder is not None
-        
+
        # Check internal components
-        assert hasattr(decoder, '_heads')
-        assert hasattr(decoder, '_ff')
-        assert hasattr(decoder, '_norm1')
-        assert hasattr(decoder, '_norm2')
-    
+        assert hasattr(decoder, "_heads")
+        assert hasattr(decoder, "_ff")
+        assert hasattr(decoder, "_norm1")
+        assert hasattr(decoder, "_norm2")
+
    def test_forward_pass(self, embed_dim, num_heads, random_embeddings):
        """Test forward pass of Decoder."""
        head_size = embed_dim // num_heads
        max_seq_len = 1024
-        decoder = Decoder(num_heads=num_heads, emb_size=embed_dim, head_size=head_size, max_seq_len=max_seq_len)
-        
+        decoder = Decoder(
+            num_heads=num_heads,
+            emb_size=embed_dim,
+            head_size=head_size,
+            max_seq_len=max_seq_len,
+        )
+
        # Forward pass
        output = decoder(random_embeddings)
-        
+
        # Check output shape
        assert output.shape == random_embeddings.shape
        assert isinstance(output, torch.Tensor)
-    
+
    def test_forward_with_causal_mask(self, embed_dim, num_heads, random_embeddings):
        """Test forward pass with causal mask."""
        head_size = embed_dim // num_heads
        max_seq_len = 1024
-        decoder = Decoder(num_heads=num_heads, emb_size=embed_dim, head_size=head_size, max_seq_len=max_seq_len)
-        
+        decoder = Decoder(
+            num_heads=num_heads,
+            emb_size=embed_dim,
+            head_size=head_size,
+            max_seq_len=max_seq_len,
+        )
+
        batch_size, seq_len = random_embeddings.shape[:2]
        # Create causal mask
        mask = torch.tril(torch.ones(seq_len, seq_len))
-        
+
        # Forward pass with causal mask
        output = decoder(random_embeddings, mask=mask)
-        
+
        # Check output shape
        assert output.shape == random_embeddings.shape
-    
+
    def test_residual_connections(self, embed_dim, num_heads, random_embeddings):
        """Test that residual connections are properly applied."""
        head_size = embed_dim // num_heads
        max_seq_len = 1024
-        decoder = Decoder(num_heads=num_heads, emb_size=embed_dim, head_size=head_size, max_seq_len=max_seq_len)
-        
+        decoder = Decoder(
+            num_heads=num_heads,
+            emb_size=embed_dim,
+            head_size=head_size,
+            max_seq_len=max_seq_len,
+        )
+
        output = decoder(random_embeddings)
-        
+
        # With residual connections and layer norm, the output shouldn't be
        # too different from input (in terms of scale/distribution)
        input_norm = random_embeddings.norm(dim=-1).mean()
        output_norm = output.norm(dim=-1).mean()
-        
+
        # Norms should be of similar magnitude (not exact due to transformations)
        assert 0.1 < (output_norm / input_norm) < 10.0
-    
+
    def test_layer_norm(self, embed_dim, num_heads, random_embeddings):
        """Test that layer normalization is applied."""
        head_size = embed_dim // num_heads
        max_seq_len = 1024
-        decoder = Decoder(num_heads=num_heads, emb_size=embed_dim, head_size=head_size, max_seq_len=max_seq_len)
-        
+        decoder = Decoder(
+            num_heads=num_heads,
+            emb_size=embed_dim,
+            head_size=head_size,
+            max_seq_len=max_seq_len,
+        )
+
        output = decoder(random_embeddings)
-        
+
        # Check that output has reasonable statistics (due to layer norm)
        # Mean should be close to 0, std close to 1 for each sequence position
        output_mean = output.mean(dim=-1)
        output_std = output.std(dim=-1)
-        
+
        # These are approximate checks since the data goes through multiple transformations
        assert torch.allclose(output_mean, torch.zeros_like(output_mean), atol=1.0)
        assert torch.allclose(output_std, torch.ones_like(output_std), atol=2.0)
-    
+
    def test_gradient_flow(self, embed_dim, num_heads, random_embeddings):
        """Test that gradients flow through Decoder."""
        head_size = embed_dim // num_heads
        max_seq_len = 1024
-        decoder = Decoder(num_heads=num_heads, emb_size=embed_dim, head_size=head_size, max_seq_len=max_seq_len)
-        
+        decoder = Decoder(
+            num_heads=num_heads,
+            emb_size=embed_dim,
+            head_size=head_size,
+            max_seq_len=max_seq_len,
+        )
+
        # Forward pass
        output = decoder(random_embeddings)
-        
+
        # Create a dummy loss and backward pass
        loss = output.sum()
        loss.backward()
-        
+
        # Check that gradients are computed for learnable parameters
        # in attention and feed forward components
        assert decoder._heads._layer.weight.grad is not None
        assert decoder._ff._layer1.weight.grad is not None
        assert decoder._norm1.weight.grad is not None
        assert decoder._norm2.weight.grad is not None
-    
+
    def test_device_consistency(self, embed_dim, num_heads, random_embeddings, device):
        """Test that Decoder works on correct device."""
        head_size = embed_dim // num_heads
        max_seq_len = 1024
-        decoder = Decoder(num_heads=num_heads, emb_size=embed_dim, head_size=head_size, max_seq_len=max_seq_len).to(device)
+        decoder = Decoder(
+            num_heads=num_heads,
+            emb_size=embed_dim,
+            head_size=head_size,
+            max_seq_len=max_seq_len,
+        ).to(device)
        inputs = random_embeddings.to(device)
-        
+
        # Forward pass
        output = decoder(inputs)
-        
+
        # Check device consistency
        assert output.device == device
        assert decoder._heads._layer.weight.device == device
-    
+
    def test_different_configurations(self):
        """Test Decoder with different configurations."""
        test_cases = [
-            (64, 2),   # embed_dim=64, num_heads=2
+            (64, 2),  # embed_dim=64, num_heads=2
            (128, 4),  # embed_dim=128, num_heads=4
            (256, 8),  # embed_dim=256, num_heads=8
        ]
-        
+
        for embed_dim, num_heads in test_cases:
            head_size = embed_dim // num_heads
            max_seq_len = 1024
-            decoder = Decoder(num_heads=num_heads, emb_size=embed_dim, head_size=head_size, max_seq_len=max_seq_len)
+            decoder = Decoder(
+                num_heads=num_heads,
+                emb_size=embed_dim,
+                head_size=head_size,
+                max_seq_len=max_seq_len,
+            )
            batch_size, seq_len = 2, 16
            inputs = torch.randn(batch_size, seq_len, embed_dim)
-            
+
            output = decoder(inputs)
-            
+
            assert output.shape == inputs.shape
-    
+
    @pytest.mark.parametrize("batch_size,seq_len", [(1, 8), (2, 16), (4, 32)])
    def test_different_input_shapes(self, embed_dim, num_heads, batch_size, seq_len):
        """Test Decoder with different input shapes."""
        head_size = embed_dim // num_heads
        max_seq_len = 1024
-        decoder = Decoder(num_heads=num_heads, emb_size=embed_dim, head_size=head_size, max_seq_len=max_seq_len)
-        
+        decoder = Decoder(
+            num_heads=num_heads,
+            emb_size=embed_dim,
+            head_size=head_size,
+            max_seq_len=max_seq_len,
+        )
+
        inputs = torch.randn(batch_size, seq_len, embed_dim)
        output = decoder(inputs)
-        
+
        assert output.shape == (batch_size, seq_len, embed_dim)
-    
+
    def test_training_vs_evaluation(self, embed_dim, num_heads, random_embeddings):
        """Test that Decoder behaves differently in train vs eval mode."""
        head_size = embed_dim // num_heads
        max_seq_len = 1024
-        decoder = Decoder(num_heads=num_heads, emb_size=embed_dim, head_size=head_size, max_seq_len=max_seq_len, dropout=0.5)
-        
+        decoder = Decoder(
+            num_heads=num_heads,
+            emb_size=embed_dim,
+            head_size=head_size,
+            max_seq_len=max_seq_len,
+            dropout=0.5,
+        )
+
        # Training mode
        decoder.train()
        output_train = decoder(random_embeddings)
-        
+
        # Evaluation mode
        decoder.eval()
        output_eval = decoder(random_embeddings)
-        
+
        # Outputs should be different due to dropout
        assert not torch.allclose(output_train, output_eval)
-    
+
    def test_parameter_initialization(self, embed_dim, num_heads):
        """Test that parameters are properly initialized."""
        head_size = embed_dim // num_heads
        max_seq_len = 1024
-        decoder = Decoder(num_heads=num_heads, emb_size=embed_dim, head_size=head_size, max_seq_len=max_seq_len)
-        
+        decoder = Decoder(
+            num_heads=num_heads,
+            emb_size=embed_dim,
+            head_size=head_size,
+            max_seq_len=max_seq_len,
+        )
+
        # Check that various components have non-zero parameters
        assert not torch.allclose(
-            decoder._heads._layer.weight, 
-            torch.zeros_like(decoder._heads._layer.weight)
+            decoder._heads._layer.weight, torch.zeros_like(decoder._heads._layer.weight)
        )
        assert not torch.allclose(
-            decoder._ff._layer1.weight,
-            torch.zeros_like(decoder._ff._layer1.weight)
+            decoder._ff._layer1.weight, torch.zeros_like(decoder._ff._layer1.weight)
        )
        assert not torch.allclose(
-            decoder._norm1.weight,
-            torch.zeros_like(decoder._norm1.weight)
+            decoder._norm1.weight, torch.zeros_like(decoder._norm1.weight)
        )
--- a/llm/tests/core/test_feed_forward.py
+++ b/llm/tests/core/test_feed_forward.py
@@ -10,168 +10,178 @@ from llm.core.feed_forward import FeedForward

 class TestFeedForward:
    """Test cases for FeedForward."""
-    
+
    def test_initialization(self, embed_dim):
        """Test that FeedForward can be initialized."""
        ff = FeedForward(embed_dim)
        assert ff is not None
-        
+
        # Check internal layers
-        assert hasattr(ff, '_layer1')
-        assert hasattr(ff, '_layer2')
-        assert hasattr(ff, '_activation')
-        assert hasattr(ff, '_dropout')
-        
+        assert hasattr(ff, "_layer1")
+        assert hasattr(ff, "_layer2")
+        assert hasattr(ff, "_activation")
+        assert hasattr(ff, "_dropout")
+
        # Check layer dimensions
        expected_hidden_dim = embed_dim * 4  # Default expansion factor
        assert ff._layer1.weight.shape == (expected_hidden_dim, embed_dim)
        assert ff._layer2.weight.shape == (embed_dim, expected_hidden_dim)
-    
+
    def test_forward_pass(self, embed_dim, random_float_inputs):
        """Test forward pass of FeedForward."""
        ff = FeedForward(embed_dim)
-        
+
        # Forward pass
        output = ff(random_float_inputs)
-        
+
        # Check output shape
        assert output.shape == random_float_inputs.shape
        assert isinstance(output, torch.Tensor)
-    
+
    def test_custom_hidden_dim(self, embed_dim):
        """Test FeedForward with custom hidden dimension."""
        # FeedForward doesn't support custom hidden_dim in current implementation
        # This test is not applicable
        ff = FeedForward(embed_dim)
-        
+
        # Check layer dimensions (fixed 4x expansion)
        expected_hidden_dim = embed_dim * 4
        assert ff._layer1.weight.shape == (expected_hidden_dim, embed_dim)
        assert ff._layer2.weight.shape == (embed_dim, expected_hidden_dim)
-    
+
    def test_dropout(self, embed_dim, random_float_inputs):
        """Test that dropout is applied during training."""
        ff = FeedForward(embed_dim, dropout=0.5)
        ff.train()  # Set to training mode
-        
+
        output = ff(random_float_inputs)
-        
+
        # In training mode with dropout, some values should be zeroed
        # This is probabilistic, so we can't assert exact zeros,
        # but we can check the structure is preserved
        assert output.shape == random_float_inputs.shape
-    
+
    def test_no_dropout_in_eval(self, embed_dim, random_float_inputs):
        """Test that dropout is not applied during evaluation."""
        ff = FeedForward(embed_dim, dropout=0.5)
        ff.eval()  # Set to evaluation mode
-        
+
        # Run forward pass multiple times - outputs should be identical
        output1 = ff(random_float_inputs)
        output2 = ff(random_float_inputs)
-        
+
        assert torch.allclose(output1, output2)
-    
+
    def test_activation_function(self, embed_dim, random_float_inputs):
        """Test that activation function is applied."""
        ff = FeedForward(embed_dim)
-        
+
        # Manually compute expected output without dropout for deterministic comparison
        hidden = ff._layer1(random_float_inputs)
        activated = ff._activation(hidden)
        expected_output = ff._layer2(activated)
-        
+
        # Compare with forward pass in eval mode (no dropout)
        ff.eval()
        actual_output = ff(random_float_inputs)
-        
+
        assert torch.allclose(actual_output, expected_output, rtol=1e-4)
-    
+
    def test_gradient_flow(self, embed_dim, random_float_inputs):
        """Test that gradients flow through FeedForward."""
        ff = FeedForward(embed_dim)
-        
+
        # Forward pass
        output = ff(random_float_inputs)
-        
+
        # Create a dummy loss and backward pass
        loss = output.sum()
        loss.backward()
-        
+
        # Check that gradients are computed for learnable parameters
        assert ff._layer1.weight.grad is not None
        assert ff._layer2.weight.grad is not None
-        assert not torch.allclose(ff._layer1.weight.grad, 
-                                torch.zeros_like(ff._layer1.weight.grad))
-        assert not torch.allclose(ff._layer2.weight.grad, 
-                                torch.zeros_like(ff._layer2.weight.grad))
-    
+        assert not torch.allclose(
+            ff._layer1.weight.grad, torch.zeros_like(ff._layer1.weight.grad)
+        )
+        assert not torch.allclose(
+            ff._layer2.weight.grad, torch.zeros_like(ff._layer2.weight.grad)
+        )
+
    def test_device_consistency(self, embed_dim, random_float_inputs, device):
        """Test that FeedForward works on correct device."""
        ff = FeedForward(embed_dim).to(device)
        inputs = random_float_inputs.to(device)
-        
+
        # Forward pass
        output = ff(inputs)
-        
+
        # Check device consistency
        assert output.device == device
        assert ff._layer1.weight.device == device
        assert ff._layer2.weight.device == device
-    
+
    def test_different_embed_dims(self):
        """Test FeedForward with different embedding dimensions."""
        test_cases = [64, 128, 256, 512]
-        
+
        for embed_dim in test_cases:
            ff = FeedForward(embed_dim)
            batch_size, seq_len = 2, 16
            inputs = torch.randn(batch_size, seq_len, embed_dim)
-            
+
            output = ff(inputs)
-            
+
            assert output.shape == inputs.shape
-    
+
    @pytest.mark.parametrize("batch_size,seq_len", [(1, 8), (2, 16), (4, 32)])
    def test_different_input_shapes(self, embed_dim, batch_size, seq_len):
        """Test FeedForward with different input shapes."""
        ff = FeedForward(embed_dim)
-        
+
        inputs = torch.randn(batch_size, seq_len, embed_dim)
        output = ff(inputs)
-        
+
        assert output.shape == (batch_size, seq_len, embed_dim)
-    
+
    def test_non_linearity(self, embed_dim, random_float_inputs):
        """Test that FeedForward introduces non-linearity."""
        ff = FeedForward(embed_dim)
-        
+
        # Create a simple linear transformation for comparison
        linear_layer = nn.Linear(embed_dim, embed_dim)
-        
+
        # Copy weights to make comparison fair
        with torch.no_grad():
            linear_layer.weight.copy_(ff._layer2.weight @ ff._layer1.weight)
            if linear_layer.bias is not None:
                linear_layer.bias.zero_()
-        
+
        linear_output = linear_layer(random_float_inputs)
        ff_output = ff(random_float_inputs)
-        
+
        # FeedForward output should be different from pure linear transformation
        # due to activation function
        assert not torch.allclose(ff_output, linear_output, rtol=1e-4)
-    
+
    def test_parameter_initialization(self, embed_dim):
        """Test that parameters are properly initialized."""
        ff = FeedForward(embed_dim)
-        
+
        # Check that weights are not all zeros
-        assert not torch.allclose(ff._layer1.weight, torch.zeros_like(ff._layer1.weight))
-        assert not torch.allclose(ff._layer2.weight, torch.zeros_like(ff._layer2.weight))
-        
+        assert not torch.allclose(
+            ff._layer1.weight, torch.zeros_like(ff._layer1.weight)
+        )
+        assert not torch.allclose(
+            ff._layer2.weight, torch.zeros_like(ff._layer2.weight)
+        )
+
        # Check that biases are not all zeros (they should be initialized with some values)
        if ff._layer1.bias is not None:
-            assert not torch.allclose(ff._layer1.bias, torch.zeros_like(ff._layer1.bias))
+            assert not torch.allclose(
+                ff._layer1.bias, torch.zeros_like(ff._layer1.bias)
+            )
        if ff._layer2.bias is not None:
-            assert not torch.allclose(ff._layer2.bias, torch.zeros_like(ff._layer2.bias))
+            assert not torch.allclose(
+                ff._layer2.bias, torch.zeros_like(ff._layer2.bias)
+            )
--- a/llm/tests/core/test_multi_head_attention.py
+++ b/llm/tests/core/test_multi_head_attention.py
@@ -9,157 +9,181 @@ from llm.core.multi_head_attention import MultiHeadAttention

 class TestMultiHeadAttention:
    """Test cases for MultiHeadAttention."""
-    
+
    def test_initialization(self, embed_dim, num_heads):
        """Test that MultiHeadAttention can be initialized."""
        head_size = embed_dim // num_heads
-        attention = MultiHeadAttention(num_heads, embed_dim, head_size, max_seq_len=1024)
+        attention = MultiHeadAttention(
+            num_heads, embed_dim, head_size, max_seq_len=1024
+        )
        assert attention is not None
-        
+
        # Check internal attributes
        assert len(attention._heads) == num_heads
        assert attention._layer.in_features == embed_dim
        assert attention._layer.out_features == embed_dim
-    
+
    def test_forward_pass(self, embed_dim, num_heads, random_embeddings):
        """Test forward pass of MultiHeadAttention."""
        head_size = embed_dim // num_heads
-        attention = MultiHeadAttention(num_heads, embed_dim, head_size, max_seq_len=1024)
-        
+        attention = MultiHeadAttention(
+            num_heads, embed_dim, head_size, max_seq_len=1024
+        )
+
        # Forward pass
        output, _ = attention(random_embeddings)
-        
+
        # Check output shape
        assert output.shape == random_embeddings.shape
        assert isinstance(output, torch.Tensor)
-    
+
    def test_forward_with_mask(self, embed_dim, num_heads, random_embeddings):
        """Test forward pass with attention mask."""
        head_size = embed_dim // num_heads
-        attention = MultiHeadAttention(num_heads, embed_dim, head_size, max_seq_len=1024)
-        
+        attention = MultiHeadAttention(
+            num_heads, embed_dim, head_size, max_seq_len=1024
+        )
+
        # Create a simple mask
        seq_len = random_embeddings.shape[1]
        mask = torch.tril(torch.ones(seq_len, seq_len))  # Causal mask
-        
+
        # Forward pass with mask
        output, _ = attention(random_embeddings, mask=mask)
-        
+
        # Check output shape
        assert output.shape == random_embeddings.shape
-    
+
    def test_causal_mask(self, embed_dim, num_heads, random_embeddings):
        """Test that causal mask prevents attending to future positions."""
        head_size = embed_dim // num_heads
-        attention = MultiHeadAttention(num_heads, embed_dim, head_size, max_seq_len=1024)
-        
+        attention = MultiHeadAttention(
+            num_heads, embed_dim, head_size, max_seq_len=1024
+        )
+
        # Create causal mask
        seq_len = random_embeddings.shape[1]
        causal_mask = torch.tril(torch.ones(seq_len, seq_len))
-        
+
        # Forward pass with causal mask
        output, _ = attention(random_embeddings, mask=causal_mask)
-        
+
        # Check output shape
        assert output.shape == random_embeddings.shape
-    
-    def test_attention_weights_normalization(self, embed_dim, num_heads, random_embeddings):
+
+    def test_attention_weights_normalization(
+        self, embed_dim, num_heads, random_embeddings
+    ):
        """Test that attention weights are properly normalized."""
        head_size = embed_dim // num_heads
-        attention = MultiHeadAttention(num_heads, embed_dim, head_size, max_seq_len=1024)
-        
+        attention = MultiHeadAttention(
+            num_heads, embed_dim, head_size, max_seq_len=1024
+        )
+
        # Forward pass
        output, _ = attention(random_embeddings)
-        
+
        # Check output shape
        assert output.shape == random_embeddings.shape
-    
+
    def test_gradient_flow(self, embed_dim, num_heads, random_embeddings):
        """Test that gradients flow through MultiHeadAttention."""
        head_size = embed_dim // num_heads
-        attention = MultiHeadAttention(num_heads, embed_dim, head_size, max_seq_len=1024)
-        
+        attention = MultiHeadAttention(
+            num_heads, embed_dim, head_size, max_seq_len=1024
+        )
+
        # Forward pass
        output, _ = attention(random_embeddings)
-        
+
        # Create a dummy loss and backward pass
        loss = output.sum()
        loss.backward()
-        
+
        # Check that gradients are computed for learnable parameters
        assert attention._layer.weight.grad is not None
        if len(attention._heads) > 0:
            assert attention._heads[0]._q.weight.grad is not None
-    
+
    def test_device_consistency(self, embed_dim, num_heads, random_embeddings, device):
        """Test that MultiHeadAttention works on correct device."""
        head_size = embed_dim // num_heads
-        attention = MultiHeadAttention(num_heads, embed_dim, head_size, max_seq_len=1024).to(device)
+        attention = MultiHeadAttention(
+            num_heads, embed_dim, head_size, max_seq_len=1024
+        ).to(device)
        inputs = random_embeddings.to(device)
-        
+
        # Forward pass
        output, _ = attention(inputs)
-        
+
        # Check device consistency
        assert output.device == device
        assert attention._layer.weight.device == device
-    
+
    def test_different_embed_dim_and_heads(self):
        """Test MultiHeadAttention with different embed_dim and num_heads combinations."""
        test_cases = [
-            (64, 2),   # embed_dim=64, num_heads=2
+            (64, 2),  # embed_dim=64, num_heads=2
            (128, 4),  # embed_dim=128, num_heads=4
            (256, 8),  # embed_dim=256, num_heads=8
-            (512, 16), # embed_dim=512, num_heads=16
+            (512, 16),  # embed_dim=512, num_heads=16
        ]
-        
+
        for embed_dim, num_heads in test_cases:
            head_size = embed_dim // num_heads
-            attention = MultiHeadAttention(num_heads, embed_dim, head_size, max_seq_len=1024)
+            attention = MultiHeadAttention(
+                num_heads, embed_dim, head_size, max_seq_len=1024
+            )
            batch_size, seq_len = 2, 16
            inputs = torch.randn(batch_size, seq_len, embed_dim)
-            
+
            output, _ = attention(inputs)
-            
+
            assert output.shape == inputs.shape
-    
+
    def test_attention_output_range(self, embed_dim, num_heads, random_embeddings):
        """Test that attention output is in reasonable range."""
        head_size = embed_dim // num_heads
-        attention = MultiHeadAttention(num_heads, embed_dim, head_size, max_seq_len=1024)
-        
+        attention = MultiHeadAttention(
+            num_heads, embed_dim, head_size, max_seq_len=1024
+        )
+
        output, _ = attention(random_embeddings)
-        
+
        # Output shouldn't have extreme values
        assert output.abs().max() < 100  # Reasonable upper bound
-    
+
    @pytest.mark.parametrize("batch_size,seq_len", [(1, 8), (2, 16), (4, 32)])
    def test_different_input_shapes(self, embed_dim, num_heads, batch_size, seq_len):
        """Test MultiHeadAttention with different input shapes."""
        head_size = embed_dim // num_heads
-        attention = MultiHeadAttention(num_heads, embed_dim, head_size, max_seq_len=1024)
-        
+        attention = MultiHeadAttention(
+            num_heads, embed_dim, head_size, max_seq_len=1024
+        )
+
        inputs = torch.randn(batch_size, seq_len, embed_dim)
        output, _ = attention(inputs)
-        
+
        assert output.shape == (batch_size, seq_len, embed_dim)
-    
+
    def test_parameter_sharing(self, embed_dim, num_heads):
        """Test that parameters are properly shared across the sequence."""
        head_size = embed_dim // num_heads
-        attention = MultiHeadAttention(num_heads, embed_dim, head_size, max_seq_len=1024, dropout=0.0)  # No dropout for deterministic test
-        
+        attention = MultiHeadAttention(
+            num_heads, embed_dim, head_size, max_seq_len=1024, dropout=0.0
+        )  # No dropout for deterministic test
+
        # Create two identical sequences
        seq_len = 10
        base_sequence = torch.randn(1, seq_len, embed_dim)
        identical_sequence = base_sequence.clone()
-        
+
        # Set to eval mode to disable dropout
        attention.eval()
-        
+
        with torch.no_grad():
            output1, _ = attention(base_sequence)
            output2, _ = attention(identical_sequence)
-        
+
        # With identical inputs and same parameters, outputs should be identical
        assert torch.allclose(output1, output2, rtol=1e-5)
--- a/llm/tests/core/test_positional_embeddings.py
+++ b/llm/tests/core/test_positional_embeddings.py
@@ -10,127 +10,134 @@ from llm.core.positional_embeddings import PositionalEmbeddings

 class TestPositionalEmbeddings:
    """Test cases for PositionalEmbeddings."""
-    
+
    def test_initialization(self, embed_dim):
        """Test that PositionalEmbeddings can be initialized."""
        max_seq_len = 1024
        embeddings = PositionalEmbeddings(max_seq_len, embed_dim)
        assert embeddings is not None
-        
+
        # Check that positional embeddings are created
-        assert hasattr(embeddings, 'embedding')
+        assert hasattr(embeddings, "embedding")
        assert embeddings.embedding.weight.shape == (max_seq_len, embed_dim)
-    
+
    def test_forward_pass(self, embed_dim):
        """Test forward pass of PositionalEmbeddings."""
        max_seq_len = 1024
        seq_len = 64
        embeddings = PositionalEmbeddings(max_seq_len, embed_dim)
-        
+
        # Forward pass - takes sequence length, not input tensor
        output = embeddings(seq_len)
-        
+
        # Check output shape
        expected_shape = (seq_len, embed_dim)
        assert output.shape == expected_shape
        assert isinstance(output, torch.Tensor)
-    
+
    def test_positional_encoding_values(self, embed_dim):
        """Test that positional encoding values are computed correctly."""
        max_seq_len = 10
        embeddings = PositionalEmbeddings(max_seq_len, embed_dim)
-        
+
        # Get embeddings for all positions
        pe = embeddings(max_seq_len)  # Shape: [max_seq_len, embed_dim]
-        
+
        # Check that different positions have different embeddings
        # (since these are learnable embeddings, not fixed sine/cosine)
        for pos in range(max_seq_len):
            for i in range(pos + 1, max_seq_len):
                assert not torch.allclose(pe[pos], pe[i], rtol=1e-4)
-    
+
    def test_different_sequence_lengths(self, embed_dim):
        """Test PositionalEmbeddings with different sequence lengths."""
        test_cases = [
-            (10, 5),   # seq_len < max_seq_len
+            (10, 5),  # seq_len < max_seq_len
            (10, 10),  # seq_len == max_seq_len
        ]
-        
+
        for max_seq_len, seq_len in test_cases:
            embeddings = PositionalEmbeddings(max_seq_len, embed_dim)
-            
+
            # Get embeddings for specific sequence length
            output = embeddings(seq_len)
-            
+
            # Output should have shape [seq_len, embed_dim]
            assert output.shape == (seq_len, embed_dim)
-    
+
    def test_gradient_flow(self, embed_dim):
        """Test that gradients flow through PositionalEmbeddings."""
        max_seq_len = 64
        seq_len = 32
        embeddings = PositionalEmbeddings(max_seq_len, embed_dim)
-        
+
        # Forward pass
        output = embeddings(seq_len)
-        
+
        # Create a dummy loss and backward pass
        loss = output.sum()
        loss.backward()
-        
+
        # Positional embeddings should have gradients (they're learnable)
        assert embeddings.embedding.weight.grad is not None
-        assert not torch.allclose(embeddings.embedding.weight.grad, 
-                                torch.zeros_like(embeddings.embedding.weight.grad))
-        
+        assert not torch.allclose(
+            embeddings.embedding.weight.grad,
+            torch.zeros_like(embeddings.embedding.weight.grad),
+        )
+
    def test_device_consistency(self, embed_dim, device):
        """Test that PositionalEmbeddings works on correct device."""
        max_seq_len = 64
        seq_len = 32
        embeddings = PositionalEmbeddings(max_seq_len, embed_dim).to(device)
-        
+
        # Forward pass
        output = embeddings(seq_len)
-        
+
        # Check device consistency
        assert output.device == device
        assert embeddings.embedding.weight.device == device
-    
+
    def test_reproducibility(self, embed_dim):
        """Test that positional embeddings are reproducible."""
        max_seq_len = 100
        embeddings1 = PositionalEmbeddings(max_seq_len, embed_dim)
        embeddings2 = PositionalEmbeddings(max_seq_len, embed_dim)
-        
+
        # Different instances should have different embeddings (random initialization)
-        assert not torch.allclose(embeddings1.embedding.weight, embeddings2.embedding.weight)
-        
+        assert not torch.allclose(
+            embeddings1.embedding.weight, embeddings2.embedding.weight
+        )
+
        # But same instance should produce same output for same input
        seq_len = 50
        output1 = embeddings1(seq_len)
        output2 = embeddings1(seq_len)  # Same instance, same input
        assert torch.allclose(output1, output2)
-    
+
    def test_positional_pattern(self, embed_dim):
        """Test that positional embeddings create a meaningful pattern."""
        max_seq_len = 50
        embeddings = PositionalEmbeddings(max_seq_len, embed_dim)
        pe = embeddings(max_seq_len)  # Get all positional embeddings
-        
+
        # Check that different positions have different embeddings
        # (with high probability due to random initialization)
        assert not torch.allclose(pe[0], pe[1], rtol=1e-4)
        assert not torch.allclose(pe[10], pe[20], rtol=1e-4)
-    
-    @pytest.mark.parametrize("max_seq_len,seq_len,embed_dim", [
-        (64, 10, 64),
-        (128, 50, 128), 
-        (256, 100, 256),
-    ])
+
+    @pytest.mark.parametrize(
+        "max_seq_len,seq_len,embed_dim",
+        [
+            (64, 10, 64),
+            (128, 50, 128),
+            (256, 100, 256),
+        ],
+    )
    def test_different_configurations(self, max_seq_len, seq_len, embed_dim):
        """Test PositionalEmbeddings with different configurations."""
        embeddings = PositionalEmbeddings(max_seq_len, embed_dim)
-        
+
        output = embeddings(seq_len)
-        
+
        assert output.shape == (seq_len, embed_dim)
--- a/llm/tests/core/test_token_embeddings.py
+++ b/llm/tests/core/test_token_embeddings.py
@@ -9,99 +9,103 @@ from llm.core.token_embeddings import TokenEmbeddings

 class TestTokenEmbeddings:
    """Test cases for TokenEmbeddings."""
-    
+
    def test_initialization(self, vocab_size, embed_dim):
        """Test that TokenEmbeddings can be initialized."""
        embeddings = TokenEmbeddings(vocab_size, embed_dim)
        assert embeddings is not None
-        
+
        # Check embedding layer
-        assert hasattr(embeddings, '_embedding')
+        assert hasattr(embeddings, "_embedding")
        assert embeddings._embedding.weight.shape == (vocab_size, embed_dim)
-    
+
    def test_forward_pass(self, vocab_size, embed_dim, random_inputs):
        """Test forward pass of TokenEmbeddings."""
        embeddings = TokenEmbeddings(vocab_size, embed_dim)
-        
+
        # Forward pass
        output = embeddings(random_inputs)
-        
+
        # Check output shape
-        assert output.shape == (random_inputs.shape[0], random_inputs.shape[1], embed_dim)
+        assert output.shape == (
+            random_inputs.shape[0],
+            random_inputs.shape[1],
+            embed_dim,
+        )
        assert isinstance(output, torch.Tensor)
-    
+
    def test_embedding_weights(self, vocab_size, embed_dim):
        """Test that embedding weights are properly initialized."""
        embeddings = TokenEmbeddings(vocab_size, embed_dim)
-        
+
        weights = embeddings._embedding.weight
        assert weights.requires_grad is True
-        
+
        # Check that weights are not all zeros
        assert not torch.allclose(weights, torch.zeros_like(weights))
-    
+
    def test_different_vocab_sizes(self):
        """Test TokenEmbeddings with different vocabulary sizes."""
-        test_cases = [
-            (100, 128),
-            (1000, 256),
-            (50000, 512)
-        ]
-        
+        test_cases = [(100, 128), (1000, 256), (50000, 512)]
+
        for vocab_size, embed_dim in test_cases:
            embeddings = TokenEmbeddings(vocab_size, embed_dim)
            assert embeddings._embedding.weight.shape == (vocab_size, embed_dim)
-    
+
    def test_gradient_flow(self, vocab_size, embed_dim, random_inputs):
        """Test that gradients flow through TokenEmbeddings."""
        embeddings = TokenEmbeddings(vocab_size, embed_dim)
-        
+
        # Forward pass
        output = embeddings(random_inputs)
-        
+
        # Create a dummy loss and backward pass
        loss = output.sum()
        loss.backward()
-        
+
        # Check that gradients are computed
        assert embeddings._embedding.weight.grad is not None
-        assert not torch.allclose(embeddings._embedding.weight.grad, 
-                                torch.zeros_like(embeddings._embedding.weight.grad))
-    
+        assert not torch.allclose(
+            embeddings._embedding.weight.grad,
+            torch.zeros_like(embeddings._embedding.weight.grad),
+        )
+
    def test_device_consistency(self, vocab_size, embed_dim, random_inputs, device):
        """Test that TokenEmbeddings works on correct device."""
        embeddings = TokenEmbeddings(vocab_size, embed_dim).to(device)
        inputs = random_inputs.to(device)
-        
+
        # Forward pass
        output = embeddings(inputs)
-        
+
        # Check device consistency
        assert output.device == device
        assert embeddings._embedding.weight.device == device
-    
+
    def test_embedding_lookup(self, vocab_size, embed_dim):
        """Test specific embedding lookups."""
        embeddings = TokenEmbeddings(vocab_size, embed_dim)
-        
+
        # Test lookup for specific tokens
-        test_tokens = torch.tensor([[0, 1, 2], [vocab_size - 1, vocab_size - 2, vocab_size - 3]])
-        
+        test_tokens = torch.tensor(
+            [[0, 1, 2], [vocab_size - 1, vocab_size - 2, vocab_size - 3]]
+        )
+
        output = embeddings(test_tokens)
-        
+
        # Check shape
        assert output.shape == (2, 3, embed_dim)
-        
+
        # Check that different tokens have different embeddings
        # (with high probability due to random initialization)
        assert not torch.allclose(output[0, 0], output[0, 1], rtol=1e-4)
-    
+
    @pytest.mark.parametrize("batch_size,seq_len", [(1, 1), (2, 10), (8, 64)])
    def test_different_input_shapes(self, vocab_size, embed_dim, batch_size, seq_len):
        """Test TokenEmbeddings with different input shapes."""
        embeddings = TokenEmbeddings(vocab_size, embed_dim)
-        
+
        inputs = torch.randint(0, vocab_size, (batch_size, seq_len))
        output = embeddings(inputs)
-        
+
        assert output.shape == (batch_size, seq_len, embed_dim)
--- a/llm/tests/models/test_gpt.py
+++ b/llm/tests/models/test_gpt.py
@@ -9,162 +9,156 @@ from llm.models.gpt import GPT

 class TestGPT:
    """Test cases for GPT model."""
-    
+
    def test_initialization(self, gpt_config):
        """Test that GPT can be initialized."""
        model = GPT(gpt_config)
        assert model is not None
-        
+
        # Check that model has required components
-        assert hasattr(model, '_token_embeddings')
-        assert hasattr(model, '_position_embeddings')
-        assert hasattr(model, '_decoders')
-        assert hasattr(model, '_linear')
-        assert hasattr(model, '_dropout')
-        
+        assert hasattr(model, "_token_embeddings")
+        assert hasattr(model, "_position_embeddings")
+        assert hasattr(model, "_decoders")
+        assert hasattr(model, "_linear")
+        assert hasattr(model, "_dropout")
+
        # Check number of decoder layers
-        assert len(model._decoders) == gpt_config['num_layers']
-    
+        assert len(model._decoders) == gpt_config["num_layers"]
+
    def test_forward_pass(self, gpt_config, random_inputs):
        """Test forward pass of GPT."""
        model = GPT(gpt_config)
-        
+
        # Forward pass
        logits = model(random_inputs)
-        
+
        # Check output shape
        batch_size, seq_len = random_inputs.shape
-        vocab_size = gpt_config['vocab_size']
+        vocab_size = gpt_config["vocab_size"]
        assert logits.shape == (batch_size, seq_len, vocab_size)
        assert isinstance(logits, torch.Tensor)
-    
-    def test_forward_with_attention_mask(self, gpt_config, random_inputs, attention_mask):
+
+    def test_forward_with_attention_mask(
+        self, gpt_config, random_inputs, attention_mask
+    ):
        """Test forward pass with attention mask."""
        model = GPT(gpt_config)
-        
+
        # Forward pass with mask
        logits = model(random_inputs, attention_mask=attention_mask)
-        
+
        # Check output shape
        batch_size, seq_len = random_inputs.shape
-        vocab_size = gpt_config['vocab_size']
+        vocab_size = gpt_config["vocab_size"]
        assert logits.shape == (batch_size, seq_len, vocab_size)
-    
+
    def test_generate_text(self, gpt_config):
        """Test text generation."""
        model = GPT(gpt_config)
        model.eval()  # Set to evaluation mode for generation
-        
+
        # Create initial input
        batch_size = 2
        initial_seq_len = 5
-        input_ids = torch.randint(0, gpt_config['vocab_size'], (batch_size, initial_seq_len))
-        
+        input_ids = torch.randint(
+            0, gpt_config["vocab_size"], (batch_size, initial_seq_len)
+        )
+
        # Generate text
        with torch.no_grad():
            generated = model.generate(
                x=input_ids,
                max_new_tokens=10,
-                do_sample=False  # Use greedy for deterministic testing
+                do_sample=False,  # Use greedy for deterministic testing
            )
-        
+
        # Check output shape
        expected_seq_len = initial_seq_len + 10
        assert generated.shape == (batch_size, expected_seq_len)
-        
+
        # Check that initial sequence is preserved
        assert torch.allclose(generated[:, :initial_seq_len], input_ids)
-    
+
    def test_generate_with_temperature(self, gpt_config):
        """Test text generation with temperature sampling."""
        model = GPT(gpt_config)
        model.eval()
-        
+
        # Create initial input
-        input_ids = torch.randint(0, gpt_config['vocab_size'], (1, 3))
-        
+        input_ids = torch.randint(0, gpt_config["vocab_size"], (1, 3))
+
        # Generate with temperature
        with torch.no_grad():
            generated = model.generate(
-                x=input_ids,
-                max_new_tokens=5,
-                do_sample=True,
-                temperature=0.8
+                x=input_ids, max_new_tokens=5, do_sample=True, temperature=0.8
            )
-        
+
        assert generated.shape == (1, 8)  # 3 initial + 5 new tokens
-    
+
    def test_generate_with_top_k(self, gpt_config):
        """Test text generation with top-k sampling."""
        model = GPT(gpt_config)
        model.eval()
-        
+
        # Create initial input
-        input_ids = torch.randint(0, gpt_config['vocab_size'], (1, 3))
-        
+        input_ids = torch.randint(0, gpt_config["vocab_size"], (1, 3))
+
        # Generate with top-k
        with torch.no_grad():
            generated = model.generate(
-                x=input_ids,
-                max_new_tokens=5,
-                do_sample=True,
-                top_k=10
+                x=input_ids, max_new_tokens=5, do_sample=True, top_k=10
            )
-        
+
        assert generated.shape == (1, 8)
-    
+
    def test_generate_with_top_p(self, gpt_config):
        """Test text generation with top-p (nucleus) sampling."""
        model = GPT(gpt_config)
        model.eval()
-        
+
        # Create initial input
-        input_ids = torch.randint(0, gpt_config['vocab_size'], (1, 3))
-        
+        input_ids = torch.randint(0, gpt_config["vocab_size"], (1, 3))
+
        # Generate with top-p
        with torch.no_grad():
            generated = model.generate(
-                x=input_ids,
-                max_new_tokens=5,
-                do_sample=True,
-                top_p=0.9
+                x=input_ids, max_new_tokens=5, do_sample=True, top_p=0.9
            )
-        
+
        assert generated.shape == (1, 8)
-    
+
    def test_gradient_flow(self, gpt_config, random_inputs):
        """Test that gradients flow through GPT."""
        model = GPT(gpt_config)
-        
+
        # Forward pass
        logits = model(random_inputs)
-        
+
        # Create a dummy loss and backward pass
-        targets = torch.randint(0, gpt_config['vocab_size'], random_inputs.shape)
+        targets = torch.randint(0, gpt_config["vocab_size"], random_inputs.shape)
        loss = torch.nn.functional.cross_entropy(
-            logits.view(-1, logits.size(-1)), 
-            targets.view(-1)
+            logits.view(-1, logits.size(-1)), targets.view(-1)
        )
        loss.backward()
-        
+
        # Check that gradients are computed for various components
        assert model._token_embeddings._embedding.weight.grad is not None
        assert model._linear.weight.grad is not None
        if len(model._decoders) > 0:
            assert model._decoders[0]._heads._heads[0]._q.weight.grad is not None
-    
+
    def test_device_consistency(self, gpt_config, random_inputs, device):
        """Test that GPT works on correct device."""
        model = GPT(gpt_config).to(device)
        inputs = random_inputs.to(device)
-        
+
        # Forward pass
        logits = model(inputs)
-        
+
        # Check device consistency
        assert logits.device == device
        assert model._token_embeddings._embedding.weight.device == device
-    
+
    def test_different_configurations(self):
        """Test GPT with different configurations."""
        test_configs = [
@@ -174,7 +168,7 @@ class TestGPT:
                "num_heads": 2,
                "num_layers": 2,
                "max_position_embeddings": 256,
-                "dropout": 0.1
+                "dropout": 0.1,
            },
            {
                "vocab_size": 5000,
@@ -182,7 +176,7 @@ class TestGPT:
                "num_heads": 4,
                "num_layers": 4,
                "max_position_embeddings": 512,
-                "dropout": 0.1
+                "dropout": 0.1,
            },
            {
                "vocab_size": 10000,
@@ -190,98 +184,94 @@ class TestGPT:
                "num_heads": 8,
                "num_layers": 6,
                "max_position_embeddings": 1024,
-                "dropout": 0.1
-            }
+                "dropout": 0.1,
+            },
        ]
-        
+
        for config in test_configs:
            model = GPT(config)
            batch_size, seq_len = 2, 16
-            inputs = torch.randint(0, config['vocab_size'], (batch_size, seq_len))
-            
+            inputs = torch.randint(0, config["vocab_size"], (batch_size, seq_len))
+
            logits = model(inputs)
-            
-            expected_shape = (batch_size, seq_len, config['vocab_size'])
+
+            expected_shape = (batch_size, seq_len, config["vocab_size"])
            assert logits.shape == expected_shape
-    
+
    @pytest.mark.parametrize("batch_size,seq_len", [(1, 8), (2, 16), (4, 32)])
    def test_different_input_shapes(self, gpt_config, batch_size, seq_len):
        """Test GPT with different input shapes."""
        model = GPT(gpt_config)
-        
-        inputs = torch.randint(0, gpt_config['vocab_size'], (batch_size, seq_len))
+
+        inputs = torch.randint(0, gpt_config["vocab_size"], (batch_size, seq_len))
        logits = model(inputs)
-        
-        expected_shape = (batch_size, seq_len, gpt_config['vocab_size'])
+
+        expected_shape = (batch_size, seq_len, gpt_config["vocab_size"])
        assert logits.shape == expected_shape
-    
+
    def test_training_vs_evaluation(self, gpt_config, random_inputs):
        """Test that GPT behaves differently in train vs eval mode."""
        model = GPT(gpt_config)
-        
+
        # Training mode
        model.train()
        output_train = model(random_inputs)
-        
+
        # Evaluation mode
        model.eval()
        output_eval = model(random_inputs)
-        
+
        # Outputs should be different due to dropout
        assert not torch.allclose(output_train, output_eval)
-    
+
    def test_parameter_count(self, gpt_config):
        """Test that GPT has reasonable number of parameters."""
        model = GPT(gpt_config)
-        
+
        total_params = sum(p.numel() for p in model.parameters())
-        
+
        # For a small GPT model, parameters should be in reasonable range
-        vocab_size = gpt_config['vocab_size']
-        embed_dim = gpt_config['embed_dim']
-        num_layers = gpt_config['num_layers']
-        num_heads = gpt_config['num_heads']
-        
+        vocab_size = gpt_config["vocab_size"]
+        embed_dim = gpt_config["embed_dim"]
+        num_layers = gpt_config["num_layers"]
+        num_heads = gpt_config["num_heads"]
+
        # Rough estimate: token_embeddings + output_layer + (attention + ff) * layers
        expected_min = vocab_size * embed_dim * 2  # embeddings and output
        expected_max = expected_min * 10  # Allow for decoder parameters
-        
+
        assert expected_min < total_params < expected_max
-    
+
    def test_causal_attention(self, gpt_config):
        """Test that GPT uses causal attention during generation."""
        model = GPT(gpt_config)
        model.eval()
-        
+
        # Create input with known pattern
        input_ids = torch.tensor([[1, 2, 3]]).long()
-        
+
        with torch.no_grad():
            # Get logits for next token prediction
            logits = model(input_ids)
-            
+
            # The model should only attend to previous tokens (causal)
            # We can't directly test attention masks in the public API,
            # but we can verify the generation works correctly
-            
-            generated = model.generate(
-                x=input_ids,
-                max_new_tokens=3,
-                do_sample=False
-            )
-            
+
+            generated = model.generate(x=input_ids, max_new_tokens=3, do_sample=False)
+
            # Generated sequence should be longer than input
            assert generated.shape[1] == input_ids.shape[1] + 3
-    
+
    def test_output_distribution(self, gpt_config, random_inputs):
        """Test that GPT output has proper distribution."""
        model = GPT(gpt_config)
-        
+
        logits = model(random_inputs)
-        
+
        # Logits should not have extreme values
        assert logits.abs().max() < 100
-        
+
        # Softmax should produce valid probabilities
        probs = torch.softmax(logits, dim=-1)
        assert torch.allclose(probs.sum(dim=-1), torch.ones_like(probs.sum(dim=-1)))
--- a/llm/tests/test_basic.py
+++ b/llm/tests/test_basic.py
@@ -11,25 +11,25 @@ import os
 def test_gpt_model_creation():
    """Test that GPT model can be created and forward pass works."""
    from llm.models.gpt import GPT
-    
+
    config = {
        "vocab_size": 1000,
        "embed_dim": 128,
        "num_heads": 4,
        "num_layers": 2,
        "max_position_embeddings": 256,
-        "dropout": 0.1
+        "dropout": 0.1,
    }
-    
+
    model = GPT(config)
-    
+
    # Test forward pass
    batch_size, seq_len = 2, 16
    input_ids = torch.randint(0, config["vocab_size"], (batch_size, seq_len))
-    
+
    with torch.no_grad():
        logits = model(input_ids)
-    
+
    assert logits.shape == (batch_size, seq_len, config["vocab_size"])
    print("✅ GPT model creation and forward pass test passed")

@@ -37,27 +37,21 @@ def test_gpt_model_creation():
 def test_bpe_tokenizer_basic():
    """Test basic BPE tokenizer functionality."""
    from llm.tokenizers import BPETokenizer
-    
+
    tokenizer = BPETokenizer()
-    
+
    # Train on simple texts
-    texts = [
-        "hello world",
-        "test tokenization",
-        "simple example"
-    ]
-    
+    texts = ["hello world", "test tokenization", "simple example"]
+
    tokenizer.train(
-        texts=texts,
-        vocab_size=50,
-        special_tokens=["<pad>", "<unk>", "<bos>", "<eos>"]
+        texts=texts, vocab_size=50, special_tokens=["<pad>", "<unk>", "<bos>", "<eos>"]
    )
-    
+
    # Test encoding/decoding
    text = "hello world"
    tokens = tokenizer.encode(text)
    decoded = tokenizer.decode(tokens)
-    
+
    assert isinstance(tokens, list)
    assert isinstance(decoded, str)
    assert len(tokens) > 0
@@ -67,18 +61,18 @@ def test_bpe_tokenizer_basic():
 def test_token_embeddings():
    """Test token embeddings."""
    from llm.core.token_embeddings import TokenEmbeddings
-    
+
    vocab_size = 1000
    embed_dim = 128
-    
+
    embeddings = TokenEmbeddings(vocab_size, embed_dim)
-    
+
    # Test forward pass
    batch_size, seq_len = 2, 16
    input_ids = torch.randint(0, vocab_size, (batch_size, seq_len))
-    
+
    output = embeddings(input_ids)
-    
+
    assert output.shape == (batch_size, seq_len, embed_dim)
    print("✅ Token embeddings test passed")

@@ -86,20 +80,20 @@ def test_token_embeddings():
 def test_multi_head_attention():
    """Test multi-head attention."""
    from llm.core.multi_head_attention import MultiHeadAttention
-    
+
    num_heads = 4
    emb_size = 128
    head_size = emb_size // num_heads
    max_seq_len = 256
-    
+
    attention = MultiHeadAttention(num_heads, emb_size, head_size, max_seq_len)
-    
+
    # Test forward pass
    batch_size, seq_len = 2, 16
    inputs = torch.randn(batch_size, seq_len, emb_size)
-    
+
    output, _ = attention(inputs)
-    
+
    assert output.shape == inputs.shape
    print("✅ Multi-head attention test passed")

@@ -107,17 +101,17 @@ def test_multi_head_attention():
 def test_feed_forward():
    """Test feed forward network."""
    from llm.core.feed_forward import FeedForward
-    
+
    embed_dim = 128
-    
+
    ff = FeedForward(embed_dim)
-    
+
    # Test forward pass
    batch_size, seq_len = 2, 16
    inputs = torch.randn(batch_size, seq_len, embed_dim)
-    
+
    output = ff(inputs)
-    
+
    assert output.shape == inputs.shape
    print("✅ Feed forward test passed")

@@ -125,29 +119,25 @@ def test_feed_forward():
 def test_gpt_generation():
    """Test GPT text generation."""
    from llm.models.gpt import GPT
-    
+
    config = {
        "vocab_size": 1000,
        "embed_dim": 128,
        "num_heads": 4,
        "num_layers": 2,
        "max_position_embeddings": 256,
-        "dropout": 0.1
+        "dropout": 0.1,
    }
-    
+
    model = GPT(config)
    model.eval()
-    
+
    # Test greedy generation
    input_ids = torch.randint(0, config["vocab_size"], (1, 5))
-    
+
    with torch.no_grad():
-        generated = model.generate(
-            x=input_ids,
-            max_new_tokens=3,
-            do_sample=False
-        )
-    
+        generated = model.generate(x=input_ids, max_new_tokens=3, do_sample=False)
+
    assert generated.shape == (1, 8)  # 5 initial + 3 new tokens
    print("✅ GPT generation test passed")

@@ -155,50 +145,48 @@ def test_gpt_generation():
 def test_bpe_tokenizer_save_load():
    """Test BPE tokenizer save/load functionality."""
    from llm.tokenizers import BPETokenizer
-    
+
    tokenizer = BPETokenizer()
-    
+
    # Train on simple texts
    texts = ["hello world", "test save load"]
    tokenizer.train(
-        texts=texts,
-        vocab_size=30,
-        special_tokens=["<pad>", "<unk>", "<bos>", "<eos>"]
+        texts=texts, vocab_size=30, special_tokens=["<pad>", "<unk>", "<bos>", "<eos>"]
    )
-    
+
    with tempfile.TemporaryDirectory() as temp_dir:
        save_path = os.path.join(temp_dir, "test_tokenizer.json")
-        
+
        # Save tokenizer
        tokenizer.save(save_path)
        assert os.path.exists(save_path)
-        
+
        # Load tokenizer
        loaded_tokenizer = BPETokenizer.load(save_path)
-        
+
        # Test that vocab size is the same
        assert tokenizer.get_vocab_size() == loaded_tokenizer.get_vocab_size()
-        
+
        # Test that vocabularies are the same
        assert tokenizer.get_vocab() == loaded_tokenizer.get_vocab()
-        
+
        # Test that both can encode/decode (even if tokens differ due to BPE state)
        text = "hello world"
        original_tokens = tokenizer.encode(text)
        loaded_tokens = loaded_tokenizer.encode(text)
-        
+
        # Both should produce valid token lists
        assert isinstance(original_tokens, list)
        assert isinstance(loaded_tokens, list)
        assert len(original_tokens) > 0
        assert len(loaded_tokens) > 0
-        
+
        # Both should be able to decode
        original_decoded = tokenizer.decode(original_tokens)
        loaded_decoded = loaded_tokenizer.decode(loaded_tokens)
        assert isinstance(original_decoded, str)
        assert isinstance(loaded_decoded, str)
-    
+
    print("✅ BPE tokenizer save/load test passed")


@@ -206,18 +194,16 @@ def test_gpt_with_tokenizer():
    """Test GPT model with tokenizer integration."""
    from llm.models.gpt import GPT
    from llm.tokenizers import BPETokenizer
-    
+
    # Create and train tokenizer
    tokenizer = BPETokenizer()
    texts = ["hello world", "test integration"]
    tokenizer.train(
-        texts=texts,
-        vocab_size=50,
-        special_tokens=["<pad>", "<unk>", "<bos>", "<eos>"]
+        texts=texts, vocab_size=50, special_tokens=["<pad>", "<unk>", "<bos>", "<eos>"]
    )
-    
+
    vocab_size = tokenizer.get_vocab_size()
-    
+
    # Create GPT model with tokenizer's vocab size
    config = {
        "vocab_size": vocab_size,
@@ -225,19 +211,19 @@ def test_gpt_with_tokenizer():
        "num_heads": 4,
        "num_layers": 2,
        "max_position_embeddings": 256,
-        "dropout": 0.1
+        "dropout": 0.1,
    }
-    
+
    model = GPT(config)
-    
+
    # Test with tokenized input
    text = "hello world"
    tokens = tokenizer.encode(text, add_special_tokens=False)
    input_ids = torch.tensor([tokens])
-    
+
    with torch.no_grad():
        logits = model(input_ids)
-    
+
    assert logits.shape == (1, len(tokens), vocab_size)
    print("✅ GPT with tokenizer integration test passed")

@@ -245,7 +231,7 @@ def test_gpt_with_tokenizer():
 def run_all_tests():
    """Run all basic tests."""
    print("🧪 Running basic tests for llm library...")
-    
+
    test_gpt_model_creation()
    test_bpe_tokenizer_basic()
    test_token_embeddings()
@@ -254,7 +240,7 @@ def run_all_tests():
    test_gpt_generation()
    test_bpe_tokenizer_save_load()
    test_gpt_with_tokenizer()
-    
+
    print("🎉 All basic tests passed!")


--- a/llm/tests/tokenizers/test_base_tokenizer.py
+++ b/llm/tests/tokenizers/test_base_tokenizer.py
@@ -8,15 +8,15 @@ from llm.tokenizers import BaseTokenizer

 class ConcreteTokenizer(BaseTokenizer):
    """Concrete implementation for testing BaseTokenizer."""
-    
+
    def train(self, texts: list, vocab_size: int = 1000, **kwargs):
        """Dummy implementation for testing."""
        pass
-    
+
    def encode(self, text: str, **kwargs) -> list:
        """Dummy implementation for testing."""
        return [1, 2, 3]
-    
+
    def decode(self, tokens: list, **kwargs) -> str:
        """Dummy implementation for testing."""
        return "decoded text"
@@ -24,33 +24,33 @@ class ConcreteTokenizer(BaseTokenizer):

 class TestBaseTokenizer:
    """Test cases for BaseTokenizer."""
-    
+
    def test_initialization(self):
        """Test that BaseTokenizer can be initialized through concrete class."""
        tokenizer = ConcreteTokenizer()
        assert tokenizer is not None
        assert tokenizer.vocab == {}
        assert tokenizer.vocab_size == 0
-    
+
    def test_encode_implemented(self):
        """Test that encode method works in concrete implementation."""
        tokenizer = ConcreteTokenizer()
        result = tokenizer.encode("test text")
        assert result == [1, 2, 3]
-    
+
    def test_decode_implemented(self):
        """Test that decode method works in concrete implementation."""
        tokenizer = ConcreteTokenizer()
        result = tokenizer.decode([1, 2, 3])
        assert result == "decoded text"
-    
+
    def test_get_vocab_size(self):
        """Test that get_vocab_size method works."""
        tokenizer = ConcreteTokenizer()
        tokenizer.vocab = {"a": 0, "b": 1, "c": 2}
        tokenizer.vocab_size = 3
        assert tokenizer.get_vocab_size() == 3
-    
+
    def test_get_vocab(self):
        """Test that get_vocab method works."""
        tokenizer = ConcreteTokenizer()
--- a/llm/tests/tokenizers/test_bpe_tokenizer.py
+++ b/llm/tests/tokenizers/test_bpe_tokenizer.py
@@ -10,18 +10,18 @@ from llm.tokenizers import BPETokenizer

 class TestBPETokenizer:
    """Test cases for BPETokenizer."""
-    
+
    @pytest.fixture
    def sample_texts(self):
        """Sample texts for training tokenizer."""
        return [
            "Искусственный интеллект",
-            "Нейронные сети", 
+            "Нейронные сети",
            "Машинное обучение",
            "Глубокое обучение",
-            "Трансформеры"
+            "Трансформеры",
        ]
-    
+
    @pytest.fixture
    def trained_tokenizer(self, sample_texts):
        """Create and train a BPE tokenizer."""
@@ -29,128 +29,130 @@ class TestBPETokenizer:
        tokenizer.train(
            texts=sample_texts,
            vocab_size=100,
-            special_tokens=["<pad>", "<unk>", "<bos>", "<eos>"]
+            special_tokens=["<pad>", "<unk>", "<bos>", "<eos>"],
        )
        return tokenizer
-    
+
    def test_initialization(self):
        """Test that BPETokenizer can be initialized."""
        tokenizer = BPETokenizer()
        assert tokenizer is not None
-    
+
    def test_train_tokenizer(self, sample_texts):
        """Test that tokenizer can be trained."""
        tokenizer = BPETokenizer()
        tokenizer.train(
            texts=sample_texts,
            vocab_size=50,
-            special_tokens=["<pad>", "<unk>", "<bos>", "<eos>"]
+            special_tokens=["<pad>", "<unk>", "<bos>", "<eos>"],
        )
-        
+
        assert tokenizer.get_vocab_size() > 0
        assert len(tokenizer.get_vocab()) == tokenizer.get_vocab_size()
-    
+
    def test_encode_decode(self, trained_tokenizer):
        """Test encoding and decoding text."""
        text = "Искусственный интеллект"
-        
+
        # Encode text
        tokens = trained_tokenizer.encode(text)
        assert isinstance(tokens, list)
        assert len(tokens) > 0
        assert all(isinstance(token, int) for token in tokens)
-        
+
        # Decode tokens
        decoded_text = trained_tokenizer.decode(tokens)
        assert isinstance(decoded_text, str)
        # Decoded text should be similar to original (may have special tokens)
        assert len(decoded_text) > 0
-    
+
    def test_encode_with_special_tokens(self, trained_tokenizer):
        """Test encoding with special tokens."""
        text = "Нейронные сети"
-        
+
        # Without special tokens
        tokens_no_special = trained_tokenizer.encode(text, add_special_tokens=False)
-        
+
        # With special tokens
        tokens_with_special = trained_tokenizer.encode(text, add_special_tokens=True)
-        
+
        # Should have more tokens when special tokens are added
        assert len(tokens_with_special) >= len(tokens_no_special)
-    
+
    def test_vocab_size(self, trained_tokenizer):
        """Test vocabulary size."""
        vocab_size = trained_tokenizer.get_vocab_size()
        assert isinstance(vocab_size, int)
        assert vocab_size > 0
-        
+
        vocab = trained_tokenizer.get_vocab()
        assert isinstance(vocab, dict)
        assert len(vocab) == vocab_size
-    
+
    def test_special_tokens(self, trained_tokenizer):
        """Test that special tokens are in vocabulary."""
        vocab = trained_tokenizer.get_vocab()
-        
+
        # Check that special tokens are in vocabulary
        special_tokens = ["<pad>", "<unk>", "<bos>", "<eos>"]
        for token in special_tokens:
            assert token in vocab
            assert isinstance(vocab[token], int)
-    
+
    def test_save_load(self, trained_tokenizer, sample_texts):
        """Test saving and loading tokenizer."""
        with tempfile.TemporaryDirectory() as temp_dir:
            save_path = os.path.join(temp_dir, "test_tokenizer.json")
-            
+
            # Save tokenizer
            trained_tokenizer.save(save_path)
            assert os.path.exists(save_path)
-            
+
            # Load tokenizer
            loaded_tokenizer = BPETokenizer.load(save_path)
            assert loaded_tokenizer is not None
-            
+
            # Check that loaded tokenizer works the same
            original_vocab = trained_tokenizer.get_vocab()
            loaded_vocab = loaded_tokenizer.get_vocab()
-            
+
            assert original_vocab == loaded_vocab
-            assert trained_tokenizer.get_vocab_size() == loaded_tokenizer.get_vocab_size()
-            
+            assert (
+                trained_tokenizer.get_vocab_size() == loaded_tokenizer.get_vocab_size()
+            )
+
            # Test encoding consistency
            text = sample_texts[0]
            original_tokens = trained_tokenizer.encode(text)
            loaded_tokens = loaded_tokenizer.encode(text)
-            
+
            assert original_tokens == loaded_tokens
-    
+
    def test_unknown_tokens(self, trained_tokenizer):
        """Test handling of unknown tokens."""
        # Use text that likely contains unknown subwords
        text = "xyzabc123"  # Random text that shouldn't be in training data
-        
+
        tokens = trained_tokenizer.encode(text)
        assert len(tokens) > 0
-        
+
        # Should be able to decode back (even if it's mostly unk tokens)
        decoded = trained_tokenizer.decode(tokens)
        assert isinstance(decoded, str)
-    
+
    def test_empty_text(self, trained_tokenizer):
        """Test encoding and decoding empty text."""
        tokens = trained_tokenizer.encode("")
        assert isinstance(tokens, list)
-        
+
        decoded = trained_tokenizer.decode([])
        assert decoded == ""
-    
+
    def test_tokenize_method(self, trained_tokenizer):
        """Test the tokenize method."""
        text = "Искусственный интеллект"
        tokens = trained_tokenizer.tokenize(text)
-        
+
        assert isinstance(tokens, list)
        assert len(tokens) > 0
        assert all(isinstance(token, str) for token in tokens)