mirror of
https://github.com/pese-git/llm-arch-research.git
synced 2026-01-24 13:32:08 +00:00
Refactoring: consistent code formatting (whitespace, quotes, blank lines) across the whole project, with no changes to logic.
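For context while reading the hunks below: the tests reference pytest fixtures (embed_dim, num_heads, vocab_size, device, random_embeddings, random_float_inputs, random_inputs) that come from a conftest.py not shown in this diff. The following is a minimal sketch of what such a conftest might look like; the fixture names are taken from the tests, but the concrete values and shapes are assumptions, not the repository's code.

# Hypothetical conftest.py sketch -- values and shapes are assumptions.
import pytest
import torch


@pytest.fixture
def embed_dim():
    return 64


@pytest.fixture
def num_heads():
    return 2


@pytest.fixture
def vocab_size():
    return 1000


@pytest.fixture
def device():
    return torch.device("cuda" if torch.cuda.is_available() else "cpu")


@pytest.fixture
def random_embeddings(embed_dim):
    # Float activations shaped [batch, seq_len, embed_dim]
    return torch.randn(2, 16, embed_dim)


@pytest.fixture
def random_float_inputs(embed_dim):
    return torch.randn(2, 16, embed_dim)


@pytest.fixture
def random_inputs(vocab_size):
    # Integer token ids shaped [batch, seq_len]
    return torch.randint(0, vocab_size, (2, 16))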
@@ -9,180 +9,233 @@ from llm.core.decoder import Decoder
 class TestDecoder:
     """Test cases for Decoder."""

     def test_initialization(self, embed_dim, num_heads):
         """Test that Decoder can be initialized."""
         head_size = embed_dim // num_heads
         max_seq_len = 1024
-        decoder = Decoder(num_heads=num_heads, emb_size=embed_dim, head_size=head_size, max_seq_len=max_seq_len)
+        decoder = Decoder(
+            num_heads=num_heads,
+            emb_size=embed_dim,
+            head_size=head_size,
+            max_seq_len=max_seq_len,
+        )
         assert decoder is not None

         # Check internal components
-        assert hasattr(decoder, '_heads')
-        assert hasattr(decoder, '_ff')
-        assert hasattr(decoder, '_norm1')
-        assert hasattr(decoder, '_norm2')
+        assert hasattr(decoder, "_heads")
+        assert hasattr(decoder, "_ff")
+        assert hasattr(decoder, "_norm1")
+        assert hasattr(decoder, "_norm2")

     def test_forward_pass(self, embed_dim, num_heads, random_embeddings):
         """Test forward pass of Decoder."""
         head_size = embed_dim // num_heads
         max_seq_len = 1024
-        decoder = Decoder(num_heads=num_heads, emb_size=embed_dim, head_size=head_size, max_seq_len=max_seq_len)
+        decoder = Decoder(
+            num_heads=num_heads,
+            emb_size=embed_dim,
+            head_size=head_size,
+            max_seq_len=max_seq_len,
+        )

         # Forward pass
         output = decoder(random_embeddings)

         # Check output shape
         assert output.shape == random_embeddings.shape
         assert isinstance(output, torch.Tensor)

     def test_forward_with_causal_mask(self, embed_dim, num_heads, random_embeddings):
         """Test forward pass with causal mask."""
         head_size = embed_dim // num_heads
         max_seq_len = 1024
-        decoder = Decoder(num_heads=num_heads, emb_size=embed_dim, head_size=head_size, max_seq_len=max_seq_len)
+        decoder = Decoder(
+            num_heads=num_heads,
+            emb_size=embed_dim,
+            head_size=head_size,
+            max_seq_len=max_seq_len,
+        )

         batch_size, seq_len = random_embeddings.shape[:2]
         # Create causal mask
         mask = torch.tril(torch.ones(seq_len, seq_len))

         # Forward pass with causal mask
         output = decoder(random_embeddings, mask=mask)

         # Check output shape
         assert output.shape == random_embeddings.shape

     def test_residual_connections(self, embed_dim, num_heads, random_embeddings):
         """Test that residual connections are properly applied."""
         head_size = embed_dim // num_heads
         max_seq_len = 1024
-        decoder = Decoder(num_heads=num_heads, emb_size=embed_dim, head_size=head_size, max_seq_len=max_seq_len)
+        decoder = Decoder(
+            num_heads=num_heads,
+            emb_size=embed_dim,
+            head_size=head_size,
+            max_seq_len=max_seq_len,
+        )

         output = decoder(random_embeddings)

         # With residual connections and layer norm, the output shouldn't be
         # too different from input (in terms of scale/distribution)
         input_norm = random_embeddings.norm(dim=-1).mean()
         output_norm = output.norm(dim=-1).mean()

         # Norms should be of similar magnitude (not exact due to transformations)
         assert 0.1 < (output_norm / input_norm) < 10.0

     def test_layer_norm(self, embed_dim, num_heads, random_embeddings):
         """Test that layer normalization is applied."""
         head_size = embed_dim // num_heads
         max_seq_len = 1024
-        decoder = Decoder(num_heads=num_heads, emb_size=embed_dim, head_size=head_size, max_seq_len=max_seq_len)
+        decoder = Decoder(
+            num_heads=num_heads,
+            emb_size=embed_dim,
+            head_size=head_size,
+            max_seq_len=max_seq_len,
+        )

         output = decoder(random_embeddings)

         # Check that output has reasonable statistics (due to layer norm)
         # Mean should be close to 0, std close to 1 for each sequence position
         output_mean = output.mean(dim=-1)
         output_std = output.std(dim=-1)

         # These are approximate checks since the data goes through multiple transformations
         assert torch.allclose(output_mean, torch.zeros_like(output_mean), atol=1.0)
         assert torch.allclose(output_std, torch.ones_like(output_std), atol=2.0)

     def test_gradient_flow(self, embed_dim, num_heads, random_embeddings):
         """Test that gradients flow through Decoder."""
         head_size = embed_dim // num_heads
         max_seq_len = 1024
-        decoder = Decoder(num_heads=num_heads, emb_size=embed_dim, head_size=head_size, max_seq_len=max_seq_len)
+        decoder = Decoder(
+            num_heads=num_heads,
+            emb_size=embed_dim,
+            head_size=head_size,
+            max_seq_len=max_seq_len,
+        )

         # Forward pass
         output = decoder(random_embeddings)

         # Create a dummy loss and backward pass
         loss = output.sum()
         loss.backward()

         # Check that gradients are computed for learnable parameters
         # in attention and feed forward components
         assert decoder._heads._layer.weight.grad is not None
         assert decoder._ff._layer1.weight.grad is not None
         assert decoder._norm1.weight.grad is not None
         assert decoder._norm2.weight.grad is not None

     def test_device_consistency(self, embed_dim, num_heads, random_embeddings, device):
         """Test that Decoder works on correct device."""
         head_size = embed_dim // num_heads
         max_seq_len = 1024
-        decoder = Decoder(num_heads=num_heads, emb_size=embed_dim, head_size=head_size, max_seq_len=max_seq_len).to(device)
+        decoder = Decoder(
+            num_heads=num_heads,
+            emb_size=embed_dim,
+            head_size=head_size,
+            max_seq_len=max_seq_len,
+        ).to(device)
         inputs = random_embeddings.to(device)

         # Forward pass
         output = decoder(inputs)

         # Check device consistency
         assert output.device == device
         assert decoder._heads._layer.weight.device == device

     def test_different_configurations(self):
         """Test Decoder with different configurations."""
         test_cases = [
-            (64, 2),  # embed_dim=64, num_heads=2
+            (64, 2),  # embed_dim=64, num_heads=2
             (128, 4),  # embed_dim=128, num_heads=4
             (256, 8),  # embed_dim=256, num_heads=8
         ]

         for embed_dim, num_heads in test_cases:
             head_size = embed_dim // num_heads
             max_seq_len = 1024
-            decoder = Decoder(num_heads=num_heads, emb_size=embed_dim, head_size=head_size, max_seq_len=max_seq_len)
+            decoder = Decoder(
+                num_heads=num_heads,
+                emb_size=embed_dim,
+                head_size=head_size,
+                max_seq_len=max_seq_len,
+            )
             batch_size, seq_len = 2, 16
             inputs = torch.randn(batch_size, seq_len, embed_dim)

             output = decoder(inputs)

             assert output.shape == inputs.shape

     @pytest.mark.parametrize("batch_size,seq_len", [(1, 8), (2, 16), (4, 32)])
     def test_different_input_shapes(self, embed_dim, num_heads, batch_size, seq_len):
         """Test Decoder with different input shapes."""
         head_size = embed_dim // num_heads
         max_seq_len = 1024
-        decoder = Decoder(num_heads=num_heads, emb_size=embed_dim, head_size=head_size, max_seq_len=max_seq_len)
+        decoder = Decoder(
+            num_heads=num_heads,
+            emb_size=embed_dim,
+            head_size=head_size,
+            max_seq_len=max_seq_len,
+        )

         inputs = torch.randn(batch_size, seq_len, embed_dim)
         output = decoder(inputs)

         assert output.shape == (batch_size, seq_len, embed_dim)

     def test_training_vs_evaluation(self, embed_dim, num_heads, random_embeddings):
         """Test that Decoder behaves differently in train vs eval mode."""
         head_size = embed_dim // num_heads
         max_seq_len = 1024
-        decoder = Decoder(num_heads=num_heads, emb_size=embed_dim, head_size=head_size, max_seq_len=max_seq_len, dropout=0.5)
+        decoder = Decoder(
+            num_heads=num_heads,
+            emb_size=embed_dim,
+            head_size=head_size,
+            max_seq_len=max_seq_len,
+            dropout=0.5,
+        )

         # Training mode
         decoder.train()
         output_train = decoder(random_embeddings)

         # Evaluation mode
         decoder.eval()
         output_eval = decoder(random_embeddings)

         # Outputs should be different due to dropout
         assert not torch.allclose(output_train, output_eval)

     def test_parameter_initialization(self, embed_dim, num_heads):
         """Test that parameters are properly initialized."""
         head_size = embed_dim // num_heads
         max_seq_len = 1024
-        decoder = Decoder(num_heads=num_heads, emb_size=embed_dim, head_size=head_size, max_seq_len=max_seq_len)
+        decoder = Decoder(
+            num_heads=num_heads,
+            emb_size=embed_dim,
+            head_size=head_size,
+            max_seq_len=max_seq_len,
+        )

         # Check that various components have non-zero parameters
         assert not torch.allclose(
-            decoder._heads._layer.weight,
-            torch.zeros_like(decoder._heads._layer.weight)
+            decoder._heads._layer.weight, torch.zeros_like(decoder._heads._layer.weight)
         )
         assert not torch.allclose(
-            decoder._ff._layer1.weight,
-            torch.zeros_like(decoder._ff._layer1.weight)
+            decoder._ff._layer1.weight, torch.zeros_like(decoder._ff._layer1.weight)
         )
         assert not torch.allclose(
-            decoder._norm1.weight,
-            torch.zeros_like(decoder._norm1.weight)
+            decoder._norm1.weight, torch.zeros_like(decoder._norm1.weight)
         )
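The assertions in this hunk pin down the public surface of Decoder without showing its implementation: constructor keywords num_heads/emb_size/head_size/max_seq_len/dropout, attributes _heads, _ff, _norm1, _norm2, residual connections plus layer norm, and a shape-preserving forward(x, mask=None). As a reading aid, here is a minimal sketch consistent with those assertions; it is an assumption-based reconstruction, not the repository's llm.core.decoder.Decoder, and details such as pre- vs post-norm or dropout placement may differ.

# Minimal sketch of the Decoder block interface these tests exercise (assumed).
import torch
from torch import nn

from llm.core.feed_forward import FeedForward
from llm.core.multi_head_attention import MultiHeadAttention


class Decoder(nn.Module):
    def __init__(self, num_heads, emb_size, head_size, max_seq_len, dropout=0.1):
        super().__init__()
        self._heads = MultiHeadAttention(
            num_heads, emb_size, head_size, max_seq_len=max_seq_len, dropout=dropout
        )
        self._ff = FeedForward(emb_size, dropout=dropout)
        self._norm1 = nn.LayerNorm(emb_size)
        self._norm2 = nn.LayerNorm(emb_size)

    def forward(self, x, mask=None):
        # Attention sub-layer with residual connection and layer norm
        attn_out, _ = self._heads(x, mask=mask)
        x = self._norm1(x + attn_out)
        # Feed-forward sub-layer with residual connection and layer norm
        x = self._norm2(x + self._ff(x))
        return x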
@@ -10,168 +10,178 @@ from llm.core.feed_forward import FeedForward
 class TestFeedForward:
     """Test cases for FeedForward."""

     def test_initialization(self, embed_dim):
         """Test that FeedForward can be initialized."""
         ff = FeedForward(embed_dim)
         assert ff is not None

         # Check internal layers
-        assert hasattr(ff, '_layer1')
-        assert hasattr(ff, '_layer2')
-        assert hasattr(ff, '_activation')
-        assert hasattr(ff, '_dropout')
+        assert hasattr(ff, "_layer1")
+        assert hasattr(ff, "_layer2")
+        assert hasattr(ff, "_activation")
+        assert hasattr(ff, "_dropout")

         # Check layer dimensions
         expected_hidden_dim = embed_dim * 4  # Default expansion factor
         assert ff._layer1.weight.shape == (expected_hidden_dim, embed_dim)
         assert ff._layer2.weight.shape == (embed_dim, expected_hidden_dim)

     def test_forward_pass(self, embed_dim, random_float_inputs):
         """Test forward pass of FeedForward."""
         ff = FeedForward(embed_dim)

         # Forward pass
         output = ff(random_float_inputs)

         # Check output shape
         assert output.shape == random_float_inputs.shape
         assert isinstance(output, torch.Tensor)

     def test_custom_hidden_dim(self, embed_dim):
         """Test FeedForward with custom hidden dimension."""
         # FeedForward doesn't support custom hidden_dim in current implementation
         # This test is not applicable
         ff = FeedForward(embed_dim)

         # Check layer dimensions (fixed 4x expansion)
         expected_hidden_dim = embed_dim * 4
         assert ff._layer1.weight.shape == (expected_hidden_dim, embed_dim)
         assert ff._layer2.weight.shape == (embed_dim, expected_hidden_dim)

     def test_dropout(self, embed_dim, random_float_inputs):
         """Test that dropout is applied during training."""
         ff = FeedForward(embed_dim, dropout=0.5)
         ff.train()  # Set to training mode

         output = ff(random_float_inputs)

         # In training mode with dropout, some values should be zeroed
         # This is probabilistic, so we can't assert exact zeros,
         # but we can check the structure is preserved
         assert output.shape == random_float_inputs.shape

     def test_no_dropout_in_eval(self, embed_dim, random_float_inputs):
         """Test that dropout is not applied during evaluation."""
         ff = FeedForward(embed_dim, dropout=0.5)
         ff.eval()  # Set to evaluation mode

         # Run forward pass multiple times - outputs should be identical
         output1 = ff(random_float_inputs)
         output2 = ff(random_float_inputs)

         assert torch.allclose(output1, output2)

     def test_activation_function(self, embed_dim, random_float_inputs):
         """Test that activation function is applied."""
         ff = FeedForward(embed_dim)

         # Manually compute expected output without dropout for deterministic comparison
         hidden = ff._layer1(random_float_inputs)
         activated = ff._activation(hidden)
         expected_output = ff._layer2(activated)

         # Compare with forward pass in eval mode (no dropout)
         ff.eval()
         actual_output = ff(random_float_inputs)

         assert torch.allclose(actual_output, expected_output, rtol=1e-4)

     def test_gradient_flow(self, embed_dim, random_float_inputs):
         """Test that gradients flow through FeedForward."""
         ff = FeedForward(embed_dim)

         # Forward pass
         output = ff(random_float_inputs)

         # Create a dummy loss and backward pass
         loss = output.sum()
         loss.backward()

         # Check that gradients are computed for learnable parameters
         assert ff._layer1.weight.grad is not None
         assert ff._layer2.weight.grad is not None
-        assert not torch.allclose(ff._layer1.weight.grad,
-                                  torch.zeros_like(ff._layer1.weight.grad))
-        assert not torch.allclose(ff._layer2.weight.grad,
-                                  torch.zeros_like(ff._layer2.weight.grad))
+        assert not torch.allclose(
+            ff._layer1.weight.grad, torch.zeros_like(ff._layer1.weight.grad)
+        )
+        assert not torch.allclose(
+            ff._layer2.weight.grad, torch.zeros_like(ff._layer2.weight.grad)
+        )

     def test_device_consistency(self, embed_dim, random_float_inputs, device):
         """Test that FeedForward works on correct device."""
         ff = FeedForward(embed_dim).to(device)
         inputs = random_float_inputs.to(device)

         # Forward pass
         output = ff(inputs)

         # Check device consistency
         assert output.device == device
         assert ff._layer1.weight.device == device
         assert ff._layer2.weight.device == device

     def test_different_embed_dims(self):
         """Test FeedForward with different embedding dimensions."""
         test_cases = [64, 128, 256, 512]

         for embed_dim in test_cases:
             ff = FeedForward(embed_dim)
             batch_size, seq_len = 2, 16
             inputs = torch.randn(batch_size, seq_len, embed_dim)

             output = ff(inputs)

             assert output.shape == inputs.shape

     @pytest.mark.parametrize("batch_size,seq_len", [(1, 8), (2, 16), (4, 32)])
     def test_different_input_shapes(self, embed_dim, batch_size, seq_len):
         """Test FeedForward with different input shapes."""
         ff = FeedForward(embed_dim)

         inputs = torch.randn(batch_size, seq_len, embed_dim)
         output = ff(inputs)

         assert output.shape == (batch_size, seq_len, embed_dim)

     def test_non_linearity(self, embed_dim, random_float_inputs):
         """Test that FeedForward introduces non-linearity."""
         ff = FeedForward(embed_dim)

         # Create a simple linear transformation for comparison
         linear_layer = nn.Linear(embed_dim, embed_dim)

         # Copy weights to make comparison fair
         with torch.no_grad():
             linear_layer.weight.copy_(ff._layer2.weight @ ff._layer1.weight)
             if linear_layer.bias is not None:
                 linear_layer.bias.zero_()

         linear_output = linear_layer(random_float_inputs)
         ff_output = ff(random_float_inputs)

         # FeedForward output should be different from pure linear transformation
         # due to activation function
         assert not torch.allclose(ff_output, linear_output, rtol=1e-4)

     def test_parameter_initialization(self, embed_dim):
         """Test that parameters are properly initialized."""
         ff = FeedForward(embed_dim)

         # Check that weights are not all zeros
-        assert not torch.allclose(ff._layer1.weight, torch.zeros_like(ff._layer1.weight))
-        assert not torch.allclose(ff._layer2.weight, torch.zeros_like(ff._layer2.weight))
+        assert not torch.allclose(
+            ff._layer1.weight, torch.zeros_like(ff._layer1.weight)
+        )
+        assert not torch.allclose(
+            ff._layer2.weight, torch.zeros_like(ff._layer2.weight)
+        )

         # Check that biases are not all zeros (they should be initialized with some values)
         if ff._layer1.bias is not None:
-            assert not torch.allclose(ff._layer1.bias, torch.zeros_like(ff._layer1.bias))
+            assert not torch.allclose(
+                ff._layer1.bias, torch.zeros_like(ff._layer1.bias)
+            )
         if ff._layer2.bias is not None:
-            assert not torch.allclose(ff._layer2.bias, torch.zeros_like(ff._layer2.bias))
+            assert not torch.allclose(
+                ff._layer2.bias, torch.zeros_like(ff._layer2.bias)
+            )
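For reference, a hypothetical sketch of FeedForward that matches the attributes and shapes the tests above check (_layer1, _layer2, _activation, _dropout, fixed 4x expansion, shape-preserving forward). The actual llm.core.feed_forward.FeedForward may use a different activation or dropout placement; only the eval-mode behaviour equalling _layer2(_activation(_layer1(x))) is implied by test_activation_function.

# Assumed FeedForward sketch, not the repository's implementation.
import torch
from torch import nn


class FeedForward(nn.Module):
    def __init__(self, emb_size, dropout=0.1):
        super().__init__()
        self._layer1 = nn.Linear(emb_size, 4 * emb_size)  # expansion to 4x
        self._layer2 = nn.Linear(4 * emb_size, emb_size)  # projection back
        self._activation = nn.GELU()  # activation choice is an assumption
        self._dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Dropout is the identity in eval mode, so the eval-mode output equals
        # _layer2(_activation(_layer1(x))), as test_activation_function expects.
        return self._dropout(self._layer2(self._activation(self._layer1(x))))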
@@ -9,157 +9,181 @@ from llm.core.multi_head_attention import MultiHeadAttention
 class TestMultiHeadAttention:
     """Test cases for MultiHeadAttention."""

     def test_initialization(self, embed_dim, num_heads):
         """Test that MultiHeadAttention can be initialized."""
         head_size = embed_dim // num_heads
-        attention = MultiHeadAttention(num_heads, embed_dim, head_size, max_seq_len=1024)
+        attention = MultiHeadAttention(
+            num_heads, embed_dim, head_size, max_seq_len=1024
+        )
         assert attention is not None

         # Check internal attributes
         assert len(attention._heads) == num_heads
         assert attention._layer.in_features == embed_dim
         assert attention._layer.out_features == embed_dim

     def test_forward_pass(self, embed_dim, num_heads, random_embeddings):
         """Test forward pass of MultiHeadAttention."""
         head_size = embed_dim // num_heads
-        attention = MultiHeadAttention(num_heads, embed_dim, head_size, max_seq_len=1024)
+        attention = MultiHeadAttention(
+            num_heads, embed_dim, head_size, max_seq_len=1024
+        )

         # Forward pass
         output, _ = attention(random_embeddings)

         # Check output shape
         assert output.shape == random_embeddings.shape
         assert isinstance(output, torch.Tensor)

     def test_forward_with_mask(self, embed_dim, num_heads, random_embeddings):
         """Test forward pass with attention mask."""
         head_size = embed_dim // num_heads
-        attention = MultiHeadAttention(num_heads, embed_dim, head_size, max_seq_len=1024)
+        attention = MultiHeadAttention(
+            num_heads, embed_dim, head_size, max_seq_len=1024
+        )

         # Create a simple mask
         seq_len = random_embeddings.shape[1]
         mask = torch.tril(torch.ones(seq_len, seq_len))  # Causal mask

         # Forward pass with mask
         output, _ = attention(random_embeddings, mask=mask)

         # Check output shape
         assert output.shape == random_embeddings.shape

     def test_causal_mask(self, embed_dim, num_heads, random_embeddings):
         """Test that causal mask prevents attending to future positions."""
         head_size = embed_dim // num_heads
-        attention = MultiHeadAttention(num_heads, embed_dim, head_size, max_seq_len=1024)
+        attention = MultiHeadAttention(
+            num_heads, embed_dim, head_size, max_seq_len=1024
+        )

         # Create causal mask
         seq_len = random_embeddings.shape[1]
         causal_mask = torch.tril(torch.ones(seq_len, seq_len))

         # Forward pass with causal mask
         output, _ = attention(random_embeddings, mask=causal_mask)

         # Check output shape
         assert output.shape == random_embeddings.shape

-    def test_attention_weights_normalization(self, embed_dim, num_heads, random_embeddings):
+    def test_attention_weights_normalization(
+        self, embed_dim, num_heads, random_embeddings
+    ):
         """Test that attention weights are properly normalized."""
         head_size = embed_dim // num_heads
-        attention = MultiHeadAttention(num_heads, embed_dim, head_size, max_seq_len=1024)
+        attention = MultiHeadAttention(
+            num_heads, embed_dim, head_size, max_seq_len=1024
+        )

         # Forward pass
         output, _ = attention(random_embeddings)

         # Check output shape
         assert output.shape == random_embeddings.shape

     def test_gradient_flow(self, embed_dim, num_heads, random_embeddings):
         """Test that gradients flow through MultiHeadAttention."""
         head_size = embed_dim // num_heads
-        attention = MultiHeadAttention(num_heads, embed_dim, head_size, max_seq_len=1024)
+        attention = MultiHeadAttention(
+            num_heads, embed_dim, head_size, max_seq_len=1024
+        )

         # Forward pass
         output, _ = attention(random_embeddings)

         # Create a dummy loss and backward pass
         loss = output.sum()
         loss.backward()

         # Check that gradients are computed for learnable parameters
         assert attention._layer.weight.grad is not None
         if len(attention._heads) > 0:
             assert attention._heads[0]._q.weight.grad is not None

     def test_device_consistency(self, embed_dim, num_heads, random_embeddings, device):
         """Test that MultiHeadAttention works on correct device."""
         head_size = embed_dim // num_heads
-        attention = MultiHeadAttention(num_heads, embed_dim, head_size, max_seq_len=1024).to(device)
+        attention = MultiHeadAttention(
+            num_heads, embed_dim, head_size, max_seq_len=1024
+        ).to(device)
         inputs = random_embeddings.to(device)

         # Forward pass
         output, _ = attention(inputs)

         # Check device consistency
         assert output.device == device
         assert attention._layer.weight.device == device

     def test_different_embed_dim_and_heads(self):
         """Test MultiHeadAttention with different embed_dim and num_heads combinations."""
         test_cases = [
-            (64, 2),  # embed_dim=64, num_heads=2
+            (64, 2),  # embed_dim=64, num_heads=2
             (128, 4),  # embed_dim=128, num_heads=4
             (256, 8),  # embed_dim=256, num_heads=8
-            (512, 16),  # embed_dim=512, num_heads=16
+            (512, 16),  # embed_dim=512, num_heads=16
         ]

         for embed_dim, num_heads in test_cases:
             head_size = embed_dim // num_heads
-            attention = MultiHeadAttention(num_heads, embed_dim, head_size, max_seq_len=1024)
+            attention = MultiHeadAttention(
+                num_heads, embed_dim, head_size, max_seq_len=1024
+            )
             batch_size, seq_len = 2, 16
             inputs = torch.randn(batch_size, seq_len, embed_dim)

             output, _ = attention(inputs)

             assert output.shape == inputs.shape

     def test_attention_output_range(self, embed_dim, num_heads, random_embeddings):
         """Test that attention output is in reasonable range."""
         head_size = embed_dim // num_heads
-        attention = MultiHeadAttention(num_heads, embed_dim, head_size, max_seq_len=1024)
+        attention = MultiHeadAttention(
+            num_heads, embed_dim, head_size, max_seq_len=1024
+        )

         output, _ = attention(random_embeddings)

         # Output shouldn't have extreme values
         assert output.abs().max() < 100  # Reasonable upper bound

     @pytest.mark.parametrize("batch_size,seq_len", [(1, 8), (2, 16), (4, 32)])
     def test_different_input_shapes(self, embed_dim, num_heads, batch_size, seq_len):
         """Test MultiHeadAttention with different input shapes."""
         head_size = embed_dim // num_heads
-        attention = MultiHeadAttention(num_heads, embed_dim, head_size, max_seq_len=1024)
+        attention = MultiHeadAttention(
+            num_heads, embed_dim, head_size, max_seq_len=1024
+        )

         inputs = torch.randn(batch_size, seq_len, embed_dim)
         output, _ = attention(inputs)

         assert output.shape == (batch_size, seq_len, embed_dim)

     def test_parameter_sharing(self, embed_dim, num_heads):
         """Test that parameters are properly shared across the sequence."""
         head_size = embed_dim // num_heads
-        attention = MultiHeadAttention(num_heads, embed_dim, head_size, max_seq_len=1024, dropout=0.0)  # No dropout for deterministic test
+        attention = MultiHeadAttention(
+            num_heads, embed_dim, head_size, max_seq_len=1024, dropout=0.0
+        )  # No dropout for deterministic test

         # Create two identical sequences
         seq_len = 10
         base_sequence = torch.randn(1, seq_len, embed_dim)
         identical_sequence = base_sequence.clone()

         # Set to eval mode to disable dropout
         attention.eval()

         with torch.no_grad():
             output1, _ = attention(base_sequence)
             output2, _ = attention(identical_sequence)

         # With identical inputs and same parameters, outputs should be identical
         assert torch.allclose(output1, output2, rtol=1e-5)
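The tests above imply a MultiHeadAttention with an indexable collection of per-head modules (each exposing a _q projection), an output projection _layer with in_features == out_features == embed_dim, and a forward(x, mask=None) that returns a (output, attention_weights) tuple. The sketch below is consistent with that surface but is only an assumption; the single-head math, dropout placement, and the use of max_seq_len (likely a pre-built causal-mask buffer in the real class) are illustrative.

# Assumption-based MultiHeadAttention sketch, not the repository's code.
import torch
from torch import nn
import torch.nn.functional as F


class _Head(nn.Module):
    def __init__(self, emb_size, head_size, dropout=0.1):
        super().__init__()
        self._q = nn.Linear(emb_size, head_size)
        self._k = nn.Linear(emb_size, head_size)
        self._v = nn.Linear(emb_size, head_size)
        self._dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        q, k, v = self._q(x), self._k(x), self._v(x)
        scores = q @ k.transpose(-2, -1) / (k.size(-1) ** 0.5)
        if mask is not None:
            # mask == 0 marks positions that must not be attended to
            scores = scores.masked_fill(mask == 0, float("-inf"))
        weights = self._dropout(F.softmax(scores, dim=-1))
        return weights @ v, weights


class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, emb_size, head_size, max_seq_len=1024, dropout=0.1):
        super().__init__()
        # max_seq_len is accepted for interface parity; the real class probably
        # uses it to register a causal-mask buffer.
        self._heads = nn.ModuleList(
            _Head(emb_size, head_size, dropout) for _ in range(num_heads)
        )
        self._layer = nn.Linear(num_heads * head_size, emb_size)  # output projection
        self._dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        per_head = [head(x, mask=mask) for head in self._heads]
        concat = torch.cat([out for out, _ in per_head], dim=-1)
        weights = torch.stack([w for _, w in per_head], dim=1)
        return self._dropout(self._layer(concat)), weights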
@@ -10,127 +10,134 @@ from llm.core.positional_embeddings import PositionalEmbeddings
 class TestPositionalEmbeddings:
     """Test cases for PositionalEmbeddings."""

     def test_initialization(self, embed_dim):
         """Test that PositionalEmbeddings can be initialized."""
         max_seq_len = 1024
         embeddings = PositionalEmbeddings(max_seq_len, embed_dim)
         assert embeddings is not None

         # Check that positional embeddings are created
-        assert hasattr(embeddings, 'embedding')
+        assert hasattr(embeddings, "embedding")
         assert embeddings.embedding.weight.shape == (max_seq_len, embed_dim)

     def test_forward_pass(self, embed_dim):
         """Test forward pass of PositionalEmbeddings."""
         max_seq_len = 1024
         seq_len = 64
         embeddings = PositionalEmbeddings(max_seq_len, embed_dim)

         # Forward pass - takes sequence length, not input tensor
         output = embeddings(seq_len)

         # Check output shape
         expected_shape = (seq_len, embed_dim)
         assert output.shape == expected_shape
         assert isinstance(output, torch.Tensor)

     def test_positional_encoding_values(self, embed_dim):
         """Test that positional encoding values are computed correctly."""
         max_seq_len = 10
         embeddings = PositionalEmbeddings(max_seq_len, embed_dim)

         # Get embeddings for all positions
         pe = embeddings(max_seq_len)  # Shape: [max_seq_len, embed_dim]

         # Check that different positions have different embeddings
         # (since these are learnable embeddings, not fixed sine/cosine)
         for pos in range(max_seq_len):
             for i in range(pos + 1, max_seq_len):
                 assert not torch.allclose(pe[pos], pe[i], rtol=1e-4)

     def test_different_sequence_lengths(self, embed_dim):
         """Test PositionalEmbeddings with different sequence lengths."""
         test_cases = [
-            (10, 5),  # seq_len < max_seq_len
+            (10, 5),  # seq_len < max_seq_len
             (10, 10),  # seq_len == max_seq_len
         ]

         for max_seq_len, seq_len in test_cases:
             embeddings = PositionalEmbeddings(max_seq_len, embed_dim)

             # Get embeddings for specific sequence length
             output = embeddings(seq_len)

             # Output should have shape [seq_len, embed_dim]
             assert output.shape == (seq_len, embed_dim)

     def test_gradient_flow(self, embed_dim):
         """Test that gradients flow through PositionalEmbeddings."""
         max_seq_len = 64
         seq_len = 32
         embeddings = PositionalEmbeddings(max_seq_len, embed_dim)

         # Forward pass
         output = embeddings(seq_len)

         # Create a dummy loss and backward pass
         loss = output.sum()
         loss.backward()

         # Positional embeddings should have gradients (they're learnable)
         assert embeddings.embedding.weight.grad is not None
-        assert not torch.allclose(embeddings.embedding.weight.grad,
-                                  torch.zeros_like(embeddings.embedding.weight.grad))
+        assert not torch.allclose(
+            embeddings.embedding.weight.grad,
+            torch.zeros_like(embeddings.embedding.weight.grad),
+        )

     def test_device_consistency(self, embed_dim, device):
         """Test that PositionalEmbeddings works on correct device."""
         max_seq_len = 64
         seq_len = 32
         embeddings = PositionalEmbeddings(max_seq_len, embed_dim).to(device)

         # Forward pass
         output = embeddings(seq_len)

         # Check device consistency
         assert output.device == device
         assert embeddings.embedding.weight.device == device

     def test_reproducibility(self, embed_dim):
         """Test that positional embeddings are reproducible."""
         max_seq_len = 100
         embeddings1 = PositionalEmbeddings(max_seq_len, embed_dim)
         embeddings2 = PositionalEmbeddings(max_seq_len, embed_dim)

         # Different instances should have different embeddings (random initialization)
-        assert not torch.allclose(embeddings1.embedding.weight, embeddings2.embedding.weight)
+        assert not torch.allclose(
+            embeddings1.embedding.weight, embeddings2.embedding.weight
+        )

         # But same instance should produce same output for same input
         seq_len = 50
         output1 = embeddings1(seq_len)
         output2 = embeddings1(seq_len)  # Same instance, same input
         assert torch.allclose(output1, output2)

     def test_positional_pattern(self, embed_dim):
         """Test that positional embeddings create a meaningful pattern."""
         max_seq_len = 50
         embeddings = PositionalEmbeddings(max_seq_len, embed_dim)
         pe = embeddings(max_seq_len)  # Get all positional embeddings

         # Check that different positions have different embeddings
         # (with high probability due to random initialization)
         assert not torch.allclose(pe[0], pe[1], rtol=1e-4)
         assert not torch.allclose(pe[10], pe[20], rtol=1e-4)

-    @pytest.mark.parametrize("max_seq_len,seq_len,embed_dim", [
-        (64, 10, 64),
-        (128, 50, 128),
-        (256, 100, 256),
-    ])
+    @pytest.mark.parametrize(
+        "max_seq_len,seq_len,embed_dim",
+        [
+            (64, 10, 64),
+            (128, 50, 128),
+            (256, 100, 256),
+        ],
+    )
     def test_different_configurations(self, max_seq_len, seq_len, embed_dim):
         """Test PositionalEmbeddings with different configurations."""
         embeddings = PositionalEmbeddings(max_seq_len, embed_dim)

         output = embeddings(seq_len)

         assert output.shape == (seq_len, embed_dim)
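These tests describe a learnable positional embedding: an nn.Embedding over positions exposed as the attribute `embedding`, called with a sequence length (not an input tensor) and returning a [seq_len, embed_dim] table. A minimal sketch consistent with that interface, given as a reading aid rather than the repository's implementation:

# Sketch of the learnable PositionalEmbeddings interface implied by the tests.
import torch
from torch import nn


class PositionalEmbeddings(nn.Module):
    def __init__(self, max_seq_len, emb_size):
        super().__init__()
        self.embedding = nn.Embedding(max_seq_len, emb_size)

    def forward(self, seq_len):
        # Look up embeddings for positions 0..seq_len-1 -> [seq_len, emb_size]
        positions = torch.arange(seq_len, device=self.embedding.weight.device)
        return self.embedding(positions)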
@@ -9,99 +9,103 @@ from llm.core.token_embeddings import TokenEmbeddings
 class TestTokenEmbeddings:
     """Test cases for TokenEmbeddings."""

     def test_initialization(self, vocab_size, embed_dim):
         """Test that TokenEmbeddings can be initialized."""
         embeddings = TokenEmbeddings(vocab_size, embed_dim)
         assert embeddings is not None

         # Check embedding layer
-        assert hasattr(embeddings, '_embedding')
+        assert hasattr(embeddings, "_embedding")
         assert embeddings._embedding.weight.shape == (vocab_size, embed_dim)

     def test_forward_pass(self, vocab_size, embed_dim, random_inputs):
         """Test forward pass of TokenEmbeddings."""
         embeddings = TokenEmbeddings(vocab_size, embed_dim)

         # Forward pass
         output = embeddings(random_inputs)

         # Check output shape
-        assert output.shape == (random_inputs.shape[0], random_inputs.shape[1], embed_dim)
+        assert output.shape == (
+            random_inputs.shape[0],
+            random_inputs.shape[1],
+            embed_dim,
+        )
         assert isinstance(output, torch.Tensor)

     def test_embedding_weights(self, vocab_size, embed_dim):
         """Test that embedding weights are properly initialized."""
         embeddings = TokenEmbeddings(vocab_size, embed_dim)

         weights = embeddings._embedding.weight
         assert weights.requires_grad is True

         # Check that weights are not all zeros
         assert not torch.allclose(weights, torch.zeros_like(weights))

     def test_different_vocab_sizes(self):
         """Test TokenEmbeddings with different vocabulary sizes."""
-        test_cases = [
-            (100, 128),
-            (1000, 256),
-            (50000, 512)
-        ]
+        test_cases = [(100, 128), (1000, 256), (50000, 512)]

         for vocab_size, embed_dim in test_cases:
             embeddings = TokenEmbeddings(vocab_size, embed_dim)
             assert embeddings._embedding.weight.shape == (vocab_size, embed_dim)

     def test_gradient_flow(self, vocab_size, embed_dim, random_inputs):
         """Test that gradients flow through TokenEmbeddings."""
         embeddings = TokenEmbeddings(vocab_size, embed_dim)

         # Forward pass
         output = embeddings(random_inputs)

         # Create a dummy loss and backward pass
         loss = output.sum()
         loss.backward()

         # Check that gradients are computed
         assert embeddings._embedding.weight.grad is not None
-        assert not torch.allclose(embeddings._embedding.weight.grad,
-                                  torch.zeros_like(embeddings._embedding.weight.grad))
+        assert not torch.allclose(
+            embeddings._embedding.weight.grad,
+            torch.zeros_like(embeddings._embedding.weight.grad),
+        )

     def test_device_consistency(self, vocab_size, embed_dim, random_inputs, device):
         """Test that TokenEmbeddings works on correct device."""
         embeddings = TokenEmbeddings(vocab_size, embed_dim).to(device)
         inputs = random_inputs.to(device)

         # Forward pass
         output = embeddings(inputs)

         # Check device consistency
         assert output.device == device
         assert embeddings._embedding.weight.device == device

     def test_embedding_lookup(self, vocab_size, embed_dim):
         """Test specific embedding lookups."""
         embeddings = TokenEmbeddings(vocab_size, embed_dim)

         # Test lookup for specific tokens
-        test_tokens = torch.tensor([[0, 1, 2], [vocab_size - 1, vocab_size - 2, vocab_size - 3]])
+        test_tokens = torch.tensor(
+            [[0, 1, 2], [vocab_size - 1, vocab_size - 2, vocab_size - 3]]
+        )

         output = embeddings(test_tokens)

         # Check shape
         assert output.shape == (2, 3, embed_dim)

         # Check that different tokens have different embeddings
         # (with high probability due to random initialization)
         assert not torch.allclose(output[0, 0], output[0, 1], rtol=1e-4)

     @pytest.mark.parametrize("batch_size,seq_len", [(1, 1), (2, 10), (8, 64)])
     def test_different_input_shapes(self, vocab_size, embed_dim, batch_size, seq_len):
         """Test TokenEmbeddings with different input shapes."""
         embeddings = TokenEmbeddings(vocab_size, embed_dim)

         inputs = torch.randint(0, vocab_size, (batch_size, seq_len))
         output = embeddings(inputs)

         assert output.shape == (batch_size, seq_len, embed_dim)
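Finally, the TokenEmbeddings assertions above constrain little more than an nn.Embedding wrapper stored as _embedding that maps integer id tensors [batch, seq_len] to float tensors [batch, seq_len, embed_dim]. A sketch consistent with that; whether the real class also scales the output (e.g. by sqrt(embed_dim)) is not visible in this diff.

# Assumed TokenEmbeddings sketch, not the repository's implementation.
import torch
from torch import nn


class TokenEmbeddings(nn.Module):
    def __init__(self, vocab_size, emb_size):
        super().__init__()
        self._embedding = nn.Embedding(vocab_size, emb_size)

    def forward(self, token_ids):
        # [batch, seq_len] int64 ids -> [batch, seq_len, emb_size] embeddings
        return self._embedding(token_ids)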