mirror of
https://github.com/pese-git/llm-arch-research.git
synced 2026-01-24 13:32:08 +00:00
Refactoring: consistent code formatting (whitespace, quotes, blank lines) across the whole project, with no changes to logic.
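For context while reading the hunks below: the tests reference pytest fixtures (embed_dim, num_heads, vocab_size, device, random_embeddings, random_float_inputs, random_inputs) that come from a conftest.py not shown in this diff. The following is a minimal sketch of what such a conftest might look like; the fixture names are taken from the tests, but the concrete values and shapes are assumptions, not the repository's code.

# Hypothetical conftest.py sketch -- values and shapes are assumptions.
import pytest
import torch


@pytest.fixture
def embed_dim():
    return 64


@pytest.fixture
def num_heads():
    return 2


@pytest.fixture
def vocab_size():
    return 1000


@pytest.fixture
def device():
    return torch.device("cuda" if torch.cuda.is_available() else "cpu")


@pytest.fixture
def random_embeddings(embed_dim):
    # Float activations shaped [batch, seq_len, embed_dim]
    return torch.randn(2, 16, embed_dim)


@pytest.fixture
def random_float_inputs(embed_dim):
    return torch.randn(2, 16, embed_dim)


@pytest.fixture
def random_inputs(vocab_size):
    # Integer token ids shaped [batch, seq_len]
    return torch.randint(0, vocab_size, (2, 16))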
@@ -9,180 +9,233 @@ from llm.core.decoder import Decoder
 class TestDecoder:
     """Test cases for Decoder."""

     def test_initialization(self, embed_dim, num_heads):
         """Test that Decoder can be initialized."""
         head_size = embed_dim // num_heads
         max_seq_len = 1024
-        decoder = Decoder(num_heads=num_heads, emb_size=embed_dim, head_size=head_size, max_seq_len=max_seq_len)
+        decoder = Decoder(
+            num_heads=num_heads,
+            emb_size=embed_dim,
+            head_size=head_size,
+            max_seq_len=max_seq_len,
+        )
         assert decoder is not None

         # Check internal components
-        assert hasattr(decoder, '_heads')
-        assert hasattr(decoder, '_ff')
-        assert hasattr(decoder, '_norm1')
-        assert hasattr(decoder, '_norm2')
+        assert hasattr(decoder, "_heads")
+        assert hasattr(decoder, "_ff")
+        assert hasattr(decoder, "_norm1")
+        assert hasattr(decoder, "_norm2")

     def test_forward_pass(self, embed_dim, num_heads, random_embeddings):
         """Test forward pass of Decoder."""
         head_size = embed_dim // num_heads
         max_seq_len = 1024
-        decoder = Decoder(num_heads=num_heads, emb_size=embed_dim, head_size=head_size, max_seq_len=max_seq_len)
+        decoder = Decoder(
+            num_heads=num_heads,
+            emb_size=embed_dim,
+            head_size=head_size,
+            max_seq_len=max_seq_len,
+        )

         # Forward pass
         output = decoder(random_embeddings)

         # Check output shape
         assert output.shape == random_embeddings.shape
         assert isinstance(output, torch.Tensor)

     def test_forward_with_causal_mask(self, embed_dim, num_heads, random_embeddings):
         """Test forward pass with causal mask."""
         head_size = embed_dim // num_heads
         max_seq_len = 1024
-        decoder = Decoder(num_heads=num_heads, emb_size=embed_dim, head_size=head_size, max_seq_len=max_seq_len)
+        decoder = Decoder(
+            num_heads=num_heads,
+            emb_size=embed_dim,
+            head_size=head_size,
+            max_seq_len=max_seq_len,
+        )

         batch_size, seq_len = random_embeddings.shape[:2]
         # Create causal mask
         mask = torch.tril(torch.ones(seq_len, seq_len))

         # Forward pass with causal mask
         output = decoder(random_embeddings, mask=mask)

         # Check output shape
         assert output.shape == random_embeddings.shape

     def test_residual_connections(self, embed_dim, num_heads, random_embeddings):
         """Test that residual connections are properly applied."""
         head_size = embed_dim // num_heads
         max_seq_len = 1024
-        decoder = Decoder(num_heads=num_heads, emb_size=embed_dim, head_size=head_size, max_seq_len=max_seq_len)
+        decoder = Decoder(
+            num_heads=num_heads,
+            emb_size=embed_dim,
+            head_size=head_size,
+            max_seq_len=max_seq_len,
+        )

         output = decoder(random_embeddings)

         # With residual connections and layer norm, the output shouldn't be
         # too different from input (in terms of scale/distribution)
         input_norm = random_embeddings.norm(dim=-1).mean()
         output_norm = output.norm(dim=-1).mean()

         # Norms should be of similar magnitude (not exact due to transformations)
         assert 0.1 < (output_norm / input_norm) < 10.0

     def test_layer_norm(self, embed_dim, num_heads, random_embeddings):
         """Test that layer normalization is applied."""
         head_size = embed_dim // num_heads
         max_seq_len = 1024
-        decoder = Decoder(num_heads=num_heads, emb_size=embed_dim, head_size=head_size, max_seq_len=max_seq_len)
+        decoder = Decoder(
+            num_heads=num_heads,
+            emb_size=embed_dim,
+            head_size=head_size,
+            max_seq_len=max_seq_len,
+        )

         output = decoder(random_embeddings)

         # Check that output has reasonable statistics (due to layer norm)
         # Mean should be close to 0, std close to 1 for each sequence position
         output_mean = output.mean(dim=-1)
         output_std = output.std(dim=-1)

         # These are approximate checks since the data goes through multiple transformations
         assert torch.allclose(output_mean, torch.zeros_like(output_mean), atol=1.0)
         assert torch.allclose(output_std, torch.ones_like(output_std), atol=2.0)

     def test_gradient_flow(self, embed_dim, num_heads, random_embeddings):
         """Test that gradients flow through Decoder."""
         head_size = embed_dim // num_heads
         max_seq_len = 1024
-        decoder = Decoder(num_heads=num_heads, emb_size=embed_dim, head_size=head_size, max_seq_len=max_seq_len)
+        decoder = Decoder(
+            num_heads=num_heads,
+            emb_size=embed_dim,
+            head_size=head_size,
+            max_seq_len=max_seq_len,
+        )

         # Forward pass
         output = decoder(random_embeddings)

         # Create a dummy loss and backward pass
         loss = output.sum()
         loss.backward()

         # Check that gradients are computed for learnable parameters
         # in attention and feed forward components
         assert decoder._heads._layer.weight.grad is not None
         assert decoder._ff._layer1.weight.grad is not None
         assert decoder._norm1.weight.grad is not None
         assert decoder._norm2.weight.grad is not None

     def test_device_consistency(self, embed_dim, num_heads, random_embeddings, device):
         """Test that Decoder works on correct device."""
         head_size = embed_dim // num_heads
         max_seq_len = 1024
-        decoder = Decoder(num_heads=num_heads, emb_size=embed_dim, head_size=head_size, max_seq_len=max_seq_len).to(device)
+        decoder = Decoder(
+            num_heads=num_heads,
+            emb_size=embed_dim,
+            head_size=head_size,
+            max_seq_len=max_seq_len,
+        ).to(device)
         inputs = random_embeddings.to(device)

         # Forward pass
         output = decoder(inputs)

         # Check device consistency
         assert output.device == device
         assert decoder._heads._layer.weight.device == device

     def test_different_configurations(self):
         """Test Decoder with different configurations."""
         test_cases = [
-            (64, 2),  # embed_dim=64, num_heads=2
+            (64, 2),  # embed_dim=64, num_heads=2
             (128, 4),  # embed_dim=128, num_heads=4
             (256, 8),  # embed_dim=256, num_heads=8
         ]

         for embed_dim, num_heads in test_cases:
             head_size = embed_dim // num_heads
             max_seq_len = 1024
-            decoder = Decoder(num_heads=num_heads, emb_size=embed_dim, head_size=head_size, max_seq_len=max_seq_len)
+            decoder = Decoder(
+                num_heads=num_heads,
+                emb_size=embed_dim,
+                head_size=head_size,
+                max_seq_len=max_seq_len,
+            )
             batch_size, seq_len = 2, 16
             inputs = torch.randn(batch_size, seq_len, embed_dim)

             output = decoder(inputs)

             assert output.shape == inputs.shape

     @pytest.mark.parametrize("batch_size,seq_len", [(1, 8), (2, 16), (4, 32)])
     def test_different_input_shapes(self, embed_dim, num_heads, batch_size, seq_len):
         """Test Decoder with different input shapes."""
         head_size = embed_dim // num_heads
         max_seq_len = 1024
-        decoder = Decoder(num_heads=num_heads, emb_size=embed_dim, head_size=head_size, max_seq_len=max_seq_len)
+        decoder = Decoder(
+            num_heads=num_heads,
+            emb_size=embed_dim,
+            head_size=head_size,
+            max_seq_len=max_seq_len,
+        )

         inputs = torch.randn(batch_size, seq_len, embed_dim)
         output = decoder(inputs)

         assert output.shape == (batch_size, seq_len, embed_dim)

     def test_training_vs_evaluation(self, embed_dim, num_heads, random_embeddings):
         """Test that Decoder behaves differently in train vs eval mode."""
         head_size = embed_dim // num_heads
         max_seq_len = 1024
-        decoder = Decoder(num_heads=num_heads, emb_size=embed_dim, head_size=head_size, max_seq_len=max_seq_len, dropout=0.5)
+        decoder = Decoder(
+            num_heads=num_heads,
+            emb_size=embed_dim,
+            head_size=head_size,
+            max_seq_len=max_seq_len,
+            dropout=0.5,
+        )

         # Training mode
         decoder.train()
         output_train = decoder(random_embeddings)

         # Evaluation mode
         decoder.eval()
         output_eval = decoder(random_embeddings)

         # Outputs should be different due to dropout
         assert not torch.allclose(output_train, output_eval)

     def test_parameter_initialization(self, embed_dim, num_heads):
         """Test that parameters are properly initialized."""
         head_size = embed_dim // num_heads
         max_seq_len = 1024
-        decoder = Decoder(num_heads=num_heads, emb_size=embed_dim, head_size=head_size, max_seq_len=max_seq_len)
+        decoder = Decoder(
+            num_heads=num_heads,
+            emb_size=embed_dim,
+            head_size=head_size,
+            max_seq_len=max_seq_len,
+        )

         # Check that various components have non-zero parameters
         assert not torch.allclose(
-            decoder._heads._layer.weight,
-            torch.zeros_like(decoder._heads._layer.weight)
+            decoder._heads._layer.weight, torch.zeros_like(decoder._heads._layer.weight)
         )
         assert not torch.allclose(
-            decoder._ff._layer1.weight,
-            torch.zeros_like(decoder._ff._layer1.weight)
+            decoder._ff._layer1.weight, torch.zeros_like(decoder._ff._layer1.weight)
         )
         assert not torch.allclose(
-            decoder._norm1.weight,
-            torch.zeros_like(decoder._norm1.weight)
+            decoder._norm1.weight, torch.zeros_like(decoder._norm1.weight)
         )
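The assertions in this hunk pin down the public surface of Decoder without showing its implementation: constructor keywords num_heads/emb_size/head_size/max_seq_len/dropout, attributes _heads, _ff, _norm1, _norm2, residual connections plus layer norm, and a shape-preserving forward(x, mask=None). As a reading aid, here is a minimal sketch consistent with those assertions; it is an assumption-based reconstruction, not the repository's llm.core.decoder.Decoder, and details such as pre- vs post-norm or dropout placement may differ.

# Minimal sketch of the Decoder block interface these tests exercise (assumed).
import torch
from torch import nn

from llm.core.feed_forward import FeedForward
from llm.core.multi_head_attention import MultiHeadAttention


class Decoder(nn.Module):
    def __init__(self, num_heads, emb_size, head_size, max_seq_len, dropout=0.1):
        super().__init__()
        self._heads = MultiHeadAttention(
            num_heads, emb_size, head_size, max_seq_len=max_seq_len, dropout=dropout
        )
        self._ff = FeedForward(emb_size, dropout=dropout)
        self._norm1 = nn.LayerNorm(emb_size)
        self._norm2 = nn.LayerNorm(emb_size)

    def forward(self, x, mask=None):
        # Attention sub-layer with residual connection and layer norm
        attn_out, _ = self._heads(x, mask=mask)
        x = self._norm1(x + attn_out)
        # Feed-forward sub-layer with residual connection and layer norm
        x = self._norm2(x + self._ff(x))
        return x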
@@ -10,168 +10,178 @@ from llm.core.feed_forward import FeedForward
 class TestFeedForward:
     """Test cases for FeedForward."""

     def test_initialization(self, embed_dim):
         """Test that FeedForward can be initialized."""
         ff = FeedForward(embed_dim)
         assert ff is not None

         # Check internal layers
-        assert hasattr(ff, '_layer1')
-        assert hasattr(ff, '_layer2')
-        assert hasattr(ff, '_activation')
-        assert hasattr(ff, '_dropout')
+        assert hasattr(ff, "_layer1")
+        assert hasattr(ff, "_layer2")
+        assert hasattr(ff, "_activation")
+        assert hasattr(ff, "_dropout")

         # Check layer dimensions
         expected_hidden_dim = embed_dim * 4  # Default expansion factor
         assert ff._layer1.weight.shape == (expected_hidden_dim, embed_dim)
         assert ff._layer2.weight.shape == (embed_dim, expected_hidden_dim)

     def test_forward_pass(self, embed_dim, random_float_inputs):
         """Test forward pass of FeedForward."""
         ff = FeedForward(embed_dim)

         # Forward pass
         output = ff(random_float_inputs)

         # Check output shape
         assert output.shape == random_float_inputs.shape
         assert isinstance(output, torch.Tensor)

     def test_custom_hidden_dim(self, embed_dim):
         """Test FeedForward with custom hidden dimension."""
         # FeedForward doesn't support custom hidden_dim in current implementation
         # This test is not applicable
         ff = FeedForward(embed_dim)

         # Check layer dimensions (fixed 4x expansion)
         expected_hidden_dim = embed_dim * 4
         assert ff._layer1.weight.shape == (expected_hidden_dim, embed_dim)
         assert ff._layer2.weight.shape == (embed_dim, expected_hidden_dim)

     def test_dropout(self, embed_dim, random_float_inputs):
         """Test that dropout is applied during training."""
         ff = FeedForward(embed_dim, dropout=0.5)
         ff.train()  # Set to training mode

         output = ff(random_float_inputs)

         # In training mode with dropout, some values should be zeroed
         # This is probabilistic, so we can't assert exact zeros,
         # but we can check the structure is preserved
         assert output.shape == random_float_inputs.shape

     def test_no_dropout_in_eval(self, embed_dim, random_float_inputs):
         """Test that dropout is not applied during evaluation."""
         ff = FeedForward(embed_dim, dropout=0.5)
         ff.eval()  # Set to evaluation mode

         # Run forward pass multiple times - outputs should be identical
         output1 = ff(random_float_inputs)
         output2 = ff(random_float_inputs)

         assert torch.allclose(output1, output2)

     def test_activation_function(self, embed_dim, random_float_inputs):
         """Test that activation function is applied."""
         ff = FeedForward(embed_dim)

         # Manually compute expected output without dropout for deterministic comparison
         hidden = ff._layer1(random_float_inputs)
         activated = ff._activation(hidden)
         expected_output = ff._layer2(activated)

         # Compare with forward pass in eval mode (no dropout)
         ff.eval()
         actual_output = ff(random_float_inputs)

         assert torch.allclose(actual_output, expected_output, rtol=1e-4)

     def test_gradient_flow(self, embed_dim, random_float_inputs):
         """Test that gradients flow through FeedForward."""
         ff = FeedForward(embed_dim)

         # Forward pass
         output = ff(random_float_inputs)

         # Create a dummy loss and backward pass
         loss = output.sum()
         loss.backward()

         # Check that gradients are computed for learnable parameters
         assert ff._layer1.weight.grad is not None
         assert ff._layer2.weight.grad is not None
-        assert not torch.allclose(ff._layer1.weight.grad,
-                                  torch.zeros_like(ff._layer1.weight.grad))
-        assert not torch.allclose(ff._layer2.weight.grad,
-                                  torch.zeros_like(ff._layer2.weight.grad))
+        assert not torch.allclose(
+            ff._layer1.weight.grad, torch.zeros_like(ff._layer1.weight.grad)
+        )
+        assert not torch.allclose(
+            ff._layer2.weight.grad, torch.zeros_like(ff._layer2.weight.grad)
+        )

     def test_device_consistency(self, embed_dim, random_float_inputs, device):
         """Test that FeedForward works on correct device."""
         ff = FeedForward(embed_dim).to(device)
         inputs = random_float_inputs.to(device)

         # Forward pass
         output = ff(inputs)

         # Check device consistency
         assert output.device == device
         assert ff._layer1.weight.device == device
         assert ff._layer2.weight.device == device

     def test_different_embed_dims(self):
         """Test FeedForward with different embedding dimensions."""
         test_cases = [64, 128, 256, 512]

         for embed_dim in test_cases:
             ff = FeedForward(embed_dim)
             batch_size, seq_len = 2, 16
             inputs = torch.randn(batch_size, seq_len, embed_dim)

             output = ff(inputs)

             assert output.shape == inputs.shape

     @pytest.mark.parametrize("batch_size,seq_len", [(1, 8), (2, 16), (4, 32)])
     def test_different_input_shapes(self, embed_dim, batch_size, seq_len):
         """Test FeedForward with different input shapes."""
         ff = FeedForward(embed_dim)

         inputs = torch.randn(batch_size, seq_len, embed_dim)
         output = ff(inputs)

         assert output.shape == (batch_size, seq_len, embed_dim)

     def test_non_linearity(self, embed_dim, random_float_inputs):
         """Test that FeedForward introduces non-linearity."""
         ff = FeedForward(embed_dim)

         # Create a simple linear transformation for comparison
         linear_layer = nn.Linear(embed_dim, embed_dim)

         # Copy weights to make comparison fair
         with torch.no_grad():
             linear_layer.weight.copy_(ff._layer2.weight @ ff._layer1.weight)
             if linear_layer.bias is not None:
                 linear_layer.bias.zero_()

         linear_output = linear_layer(random_float_inputs)
         ff_output = ff(random_float_inputs)

         # FeedForward output should be different from pure linear transformation
         # due to activation function
         assert not torch.allclose(ff_output, linear_output, rtol=1e-4)

     def test_parameter_initialization(self, embed_dim):
         """Test that parameters are properly initialized."""
         ff = FeedForward(embed_dim)

         # Check that weights are not all zeros
-        assert not torch.allclose(ff._layer1.weight, torch.zeros_like(ff._layer1.weight))
-        assert not torch.allclose(ff._layer2.weight, torch.zeros_like(ff._layer2.weight))
+        assert not torch.allclose(
+            ff._layer1.weight, torch.zeros_like(ff._layer1.weight)
+        )
+        assert not torch.allclose(
+            ff._layer2.weight, torch.zeros_like(ff._layer2.weight)
+        )

         # Check that biases are not all zeros (they should be initialized with some values)
         if ff._layer1.bias is not None:
-            assert not torch.allclose(ff._layer1.bias, torch.zeros_like(ff._layer1.bias))
+            assert not torch.allclose(
+                ff._layer1.bias, torch.zeros_like(ff._layer1.bias)
+            )
         if ff._layer2.bias is not None:
-            assert not torch.allclose(ff._layer2.bias, torch.zeros_like(ff._layer2.bias))
+            assert not torch.allclose(
+                ff._layer2.bias, torch.zeros_like(ff._layer2.bias)
+            )
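For reference, a hypothetical sketch of FeedForward that matches the attributes and shapes the tests above check (_layer1, _layer2, _activation, _dropout, fixed 4x expansion, shape-preserving forward). The actual llm.core.feed_forward.FeedForward may use a different activation or dropout placement; only the eval-mode behaviour equalling _layer2(_activation(_layer1(x))) is implied by test_activation_function.

# Assumed FeedForward sketch, not the repository's implementation.
import torch
from torch import nn


class FeedForward(nn.Module):
    def __init__(self, emb_size, dropout=0.1):
        super().__init__()
        self._layer1 = nn.Linear(emb_size, 4 * emb_size)  # expansion to 4x
        self._layer2 = nn.Linear(4 * emb_size, emb_size)  # projection back
        self._activation = nn.GELU()  # activation choice is an assumption
        self._dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Dropout is the identity in eval mode, so the eval-mode output equals
        # _layer2(_activation(_layer1(x))), as test_activation_function expects.
        return self._dropout(self._layer2(self._activation(self._layer1(x))))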
@@ -9,157 +9,181 @@ from llm.core.multi_head_attention import MultiHeadAttention
 class TestMultiHeadAttention:
     """Test cases for MultiHeadAttention."""

     def test_initialization(self, embed_dim, num_heads):
         """Test that MultiHeadAttention can be initialized."""
         head_size = embed_dim // num_heads
-        attention = MultiHeadAttention(num_heads, embed_dim, head_size, max_seq_len=1024)
+        attention = MultiHeadAttention(
+            num_heads, embed_dim, head_size, max_seq_len=1024
+        )
         assert attention is not None

         # Check internal attributes
         assert len(attention._heads) == num_heads
         assert attention._layer.in_features == embed_dim
         assert attention._layer.out_features == embed_dim

     def test_forward_pass(self, embed_dim, num_heads, random_embeddings):
         """Test forward pass of MultiHeadAttention."""
         head_size = embed_dim // num_heads
-        attention = MultiHeadAttention(num_heads, embed_dim, head_size, max_seq_len=1024)
+        attention = MultiHeadAttention(
+            num_heads, embed_dim, head_size, max_seq_len=1024
+        )

         # Forward pass
         output, _ = attention(random_embeddings)

         # Check output shape
         assert output.shape == random_embeddings.shape
         assert isinstance(output, torch.Tensor)

     def test_forward_with_mask(self, embed_dim, num_heads, random_embeddings):
         """Test forward pass with attention mask."""
         head_size = embed_dim // num_heads
-        attention = MultiHeadAttention(num_heads, embed_dim, head_size, max_seq_len=1024)
+        attention = MultiHeadAttention(
+            num_heads, embed_dim, head_size, max_seq_len=1024
+        )

         # Create a simple mask
         seq_len = random_embeddings.shape[1]
         mask = torch.tril(torch.ones(seq_len, seq_len))  # Causal mask

         # Forward pass with mask
         output, _ = attention(random_embeddings, mask=mask)

         # Check output shape
         assert output.shape == random_embeddings.shape

     def test_causal_mask(self, embed_dim, num_heads, random_embeddings):
         """Test that causal mask prevents attending to future positions."""
         head_size = embed_dim // num_heads
-        attention = MultiHeadAttention(num_heads, embed_dim, head_size, max_seq_len=1024)
+        attention = MultiHeadAttention(
+            num_heads, embed_dim, head_size, max_seq_len=1024
+        )

         # Create causal mask
         seq_len = random_embeddings.shape[1]
         causal_mask = torch.tril(torch.ones(seq_len, seq_len))

         # Forward pass with causal mask
         output, _ = attention(random_embeddings, mask=causal_mask)

         # Check output shape
         assert output.shape == random_embeddings.shape

-    def test_attention_weights_normalization(self, embed_dim, num_heads, random_embeddings):
+    def test_attention_weights_normalization(
+        self, embed_dim, num_heads, random_embeddings
+    ):
         """Test that attention weights are properly normalized."""
         head_size = embed_dim // num_heads
-        attention = MultiHeadAttention(num_heads, embed_dim, head_size, max_seq_len=1024)
+        attention = MultiHeadAttention(
+            num_heads, embed_dim, head_size, max_seq_len=1024
+        )

         # Forward pass
         output, _ = attention(random_embeddings)

         # Check output shape
         assert output.shape == random_embeddings.shape

     def test_gradient_flow(self, embed_dim, num_heads, random_embeddings):
         """Test that gradients flow through MultiHeadAttention."""
         head_size = embed_dim // num_heads
-        attention = MultiHeadAttention(num_heads, embed_dim, head_size, max_seq_len=1024)
+        attention = MultiHeadAttention(
+            num_heads, embed_dim, head_size, max_seq_len=1024
+        )

         # Forward pass
         output, _ = attention(random_embeddings)

         # Create a dummy loss and backward pass
         loss = output.sum()
         loss.backward()

         # Check that gradients are computed for learnable parameters
         assert attention._layer.weight.grad is not None
         if len(attention._heads) > 0:
             assert attention._heads[0]._q.weight.grad is not None

     def test_device_consistency(self, embed_dim, num_heads, random_embeddings, device):
         """Test that MultiHeadAttention works on correct device."""
         head_size = embed_dim // num_heads
-        attention = MultiHeadAttention(num_heads, embed_dim, head_size, max_seq_len=1024).to(device)
+        attention = MultiHeadAttention(
+            num_heads, embed_dim, head_size, max_seq_len=1024
+        ).to(device)
         inputs = random_embeddings.to(device)

         # Forward pass
         output, _ = attention(inputs)

         # Check device consistency
         assert output.device == device
         assert attention._layer.weight.device == device

     def test_different_embed_dim_and_heads(self):
         """Test MultiHeadAttention with different embed_dim and num_heads combinations."""
         test_cases = [
-            (64, 2),  # embed_dim=64, num_heads=2
+            (64, 2),  # embed_dim=64, num_heads=2
             (128, 4),  # embed_dim=128, num_heads=4
             (256, 8),  # embed_dim=256, num_heads=8
-            (512, 16),  # embed_dim=512, num_heads=16
+            (512, 16),  # embed_dim=512, num_heads=16
         ]

         for embed_dim, num_heads in test_cases:
             head_size = embed_dim // num_heads
-            attention = MultiHeadAttention(num_heads, embed_dim, head_size, max_seq_len=1024)
+            attention = MultiHeadAttention(
+                num_heads, embed_dim, head_size, max_seq_len=1024
+            )
             batch_size, seq_len = 2, 16
             inputs = torch.randn(batch_size, seq_len, embed_dim)

             output, _ = attention(inputs)

             assert output.shape == inputs.shape

     def test_attention_output_range(self, embed_dim, num_heads, random_embeddings):
         """Test that attention output is in reasonable range."""
         head_size = embed_dim // num_heads
-        attention = MultiHeadAttention(num_heads, embed_dim, head_size, max_seq_len=1024)
+        attention = MultiHeadAttention(
+            num_heads, embed_dim, head_size, max_seq_len=1024
+        )

         output, _ = attention(random_embeddings)

         # Output shouldn't have extreme values
         assert output.abs().max() < 100  # Reasonable upper bound

     @pytest.mark.parametrize("batch_size,seq_len", [(1, 8), (2, 16), (4, 32)])
     def test_different_input_shapes(self, embed_dim, num_heads, batch_size, seq_len):
         """Test MultiHeadAttention with different input shapes."""
         head_size = embed_dim // num_heads
-        attention = MultiHeadAttention(num_heads, embed_dim, head_size, max_seq_len=1024)
+        attention = MultiHeadAttention(
+            num_heads, embed_dim, head_size, max_seq_len=1024
+        )

         inputs = torch.randn(batch_size, seq_len, embed_dim)
         output, _ = attention(inputs)

         assert output.shape == (batch_size, seq_len, embed_dim)

     def test_parameter_sharing(self, embed_dim, num_heads):
         """Test that parameters are properly shared across the sequence."""
         head_size = embed_dim // num_heads
-        attention = MultiHeadAttention(num_heads, embed_dim, head_size, max_seq_len=1024, dropout=0.0)  # No dropout for deterministic test
+        attention = MultiHeadAttention(
+            num_heads, embed_dim, head_size, max_seq_len=1024, dropout=0.0
+        )  # No dropout for deterministic test

         # Create two identical sequences
         seq_len = 10
         base_sequence = torch.randn(1, seq_len, embed_dim)
         identical_sequence = base_sequence.clone()

         # Set to eval mode to disable dropout
         attention.eval()

         with torch.no_grad():
             output1, _ = attention(base_sequence)
             output2, _ = attention(identical_sequence)

         # With identical inputs and same parameters, outputs should be identical
         assert torch.allclose(output1, output2, rtol=1e-5)
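The tests above imply a MultiHeadAttention with an indexable collection of per-head modules (each exposing a _q projection), an output projection _layer with in_features == out_features == embed_dim, and a forward(x, mask=None) that returns a (output, attention_weights) tuple. The sketch below is consistent with that surface but is only an assumption; the single-head math, dropout placement, and the use of max_seq_len (likely a pre-built causal-mask buffer in the real class) are illustrative.

# Assumption-based MultiHeadAttention sketch, not the repository's code.
import torch
from torch import nn
import torch.nn.functional as F


class _Head(nn.Module):
    def __init__(self, emb_size, head_size, dropout=0.1):
        super().__init__()
        self._q = nn.Linear(emb_size, head_size)
        self._k = nn.Linear(emb_size, head_size)
        self._v = nn.Linear(emb_size, head_size)
        self._dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        q, k, v = self._q(x), self._k(x), self._v(x)
        scores = q @ k.transpose(-2, -1) / (k.size(-1) ** 0.5)
        if mask is not None:
            # mask == 0 marks positions that must not be attended to
            scores = scores.masked_fill(mask == 0, float("-inf"))
        weights = self._dropout(F.softmax(scores, dim=-1))
        return weights @ v, weights


class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, emb_size, head_size, max_seq_len=1024, dropout=0.1):
        super().__init__()
        # max_seq_len is accepted for interface parity; the real class probably
        # uses it to register a causal-mask buffer.
        self._heads = nn.ModuleList(
            _Head(emb_size, head_size, dropout) for _ in range(num_heads)
        )
        self._layer = nn.Linear(num_heads * head_size, emb_size)  # output projection
        self._dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        per_head = [head(x, mask=mask) for head in self._heads]
        concat = torch.cat([out for out, _ in per_head], dim=-1)
        weights = torch.stack([w for _, w in per_head], dim=1)
        return self._dropout(self._layer(concat)), weights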
@@ -10,127 +10,134 @@ from llm.core.positional_embeddings import PositionalEmbeddings
 class TestPositionalEmbeddings:
     """Test cases for PositionalEmbeddings."""

     def test_initialization(self, embed_dim):
         """Test that PositionalEmbeddings can be initialized."""
         max_seq_len = 1024
         embeddings = PositionalEmbeddings(max_seq_len, embed_dim)
         assert embeddings is not None

         # Check that positional embeddings are created
-        assert hasattr(embeddings, 'embedding')
+        assert hasattr(embeddings, "embedding")
         assert embeddings.embedding.weight.shape == (max_seq_len, embed_dim)

     def test_forward_pass(self, embed_dim):
         """Test forward pass of PositionalEmbeddings."""
         max_seq_len = 1024
         seq_len = 64
         embeddings = PositionalEmbeddings(max_seq_len, embed_dim)

         # Forward pass - takes sequence length, not input tensor
         output = embeddings(seq_len)

         # Check output shape
         expected_shape = (seq_len, embed_dim)
         assert output.shape == expected_shape
         assert isinstance(output, torch.Tensor)

     def test_positional_encoding_values(self, embed_dim):
         """Test that positional encoding values are computed correctly."""
         max_seq_len = 10
         embeddings = PositionalEmbeddings(max_seq_len, embed_dim)

         # Get embeddings for all positions
         pe = embeddings(max_seq_len)  # Shape: [max_seq_len, embed_dim]

         # Check that different positions have different embeddings
         # (since these are learnable embeddings, not fixed sine/cosine)
         for pos in range(max_seq_len):
             for i in range(pos + 1, max_seq_len):
                 assert not torch.allclose(pe[pos], pe[i], rtol=1e-4)

     def test_different_sequence_lengths(self, embed_dim):
         """Test PositionalEmbeddings with different sequence lengths."""
         test_cases = [
-            (10, 5),  # seq_len < max_seq_len
+            (10, 5),  # seq_len < max_seq_len
             (10, 10),  # seq_len == max_seq_len
         ]

         for max_seq_len, seq_len in test_cases:
             embeddings = PositionalEmbeddings(max_seq_len, embed_dim)

             # Get embeddings for specific sequence length
             output = embeddings(seq_len)

             # Output should have shape [seq_len, embed_dim]
             assert output.shape == (seq_len, embed_dim)

     def test_gradient_flow(self, embed_dim):
         """Test that gradients flow through PositionalEmbeddings."""
         max_seq_len = 64
         seq_len = 32
         embeddings = PositionalEmbeddings(max_seq_len, embed_dim)

         # Forward pass
         output = embeddings(seq_len)

         # Create a dummy loss and backward pass
         loss = output.sum()
         loss.backward()

         # Positional embeddings should have gradients (they're learnable)
         assert embeddings.embedding.weight.grad is not None
-        assert not torch.allclose(embeddings.embedding.weight.grad,
-                                  torch.zeros_like(embeddings.embedding.weight.grad))
+        assert not torch.allclose(
+            embeddings.embedding.weight.grad,
+            torch.zeros_like(embeddings.embedding.weight.grad),
+        )

     def test_device_consistency(self, embed_dim, device):
         """Test that PositionalEmbeddings works on correct device."""
         max_seq_len = 64
         seq_len = 32
         embeddings = PositionalEmbeddings(max_seq_len, embed_dim).to(device)

         # Forward pass
         output = embeddings(seq_len)

         # Check device consistency
         assert output.device == device
         assert embeddings.embedding.weight.device == device

     def test_reproducibility(self, embed_dim):
         """Test that positional embeddings are reproducible."""
         max_seq_len = 100
         embeddings1 = PositionalEmbeddings(max_seq_len, embed_dim)
         embeddings2 = PositionalEmbeddings(max_seq_len, embed_dim)

         # Different instances should have different embeddings (random initialization)
-        assert not torch.allclose(embeddings1.embedding.weight, embeddings2.embedding.weight)
+        assert not torch.allclose(
+            embeddings1.embedding.weight, embeddings2.embedding.weight
+        )

         # But same instance should produce same output for same input
         seq_len = 50
         output1 = embeddings1(seq_len)
         output2 = embeddings1(seq_len)  # Same instance, same input
         assert torch.allclose(output1, output2)

     def test_positional_pattern(self, embed_dim):
         """Test that positional embeddings create a meaningful pattern."""
         max_seq_len = 50
         embeddings = PositionalEmbeddings(max_seq_len, embed_dim)
         pe = embeddings(max_seq_len)  # Get all positional embeddings

         # Check that different positions have different embeddings
         # (with high probability due to random initialization)
         assert not torch.allclose(pe[0], pe[1], rtol=1e-4)
         assert not torch.allclose(pe[10], pe[20], rtol=1e-4)

-    @pytest.mark.parametrize("max_seq_len,seq_len,embed_dim", [
-        (64, 10, 64),
-        (128, 50, 128),
-        (256, 100, 256),
-    ])
+    @pytest.mark.parametrize(
+        "max_seq_len,seq_len,embed_dim",
+        [
+            (64, 10, 64),
+            (128, 50, 128),
+            (256, 100, 256),
+        ],
+    )
     def test_different_configurations(self, max_seq_len, seq_len, embed_dim):
         """Test PositionalEmbeddings with different configurations."""
         embeddings = PositionalEmbeddings(max_seq_len, embed_dim)

         output = embeddings(seq_len)

         assert output.shape == (seq_len, embed_dim)
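These tests describe a learnable positional embedding: an nn.Embedding over positions exposed as the attribute `embedding`, called with a sequence length (not an input tensor) and returning a [seq_len, embed_dim] table. A minimal sketch consistent with that interface, given as a reading aid rather than the repository's implementation:

# Sketch of the learnable PositionalEmbeddings interface implied by the tests.
import torch
from torch import nn


class PositionalEmbeddings(nn.Module):
    def __init__(self, max_seq_len, emb_size):
        super().__init__()
        self.embedding = nn.Embedding(max_seq_len, emb_size)

    def forward(self, seq_len):
        # Look up embeddings for positions 0..seq_len-1 -> [seq_len, emb_size]
        positions = torch.arange(seq_len, device=self.embedding.weight.device)
        return self.embedding(positions)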
@@ -9,99 +9,103 @@ from llm.core.token_embeddings import TokenEmbeddings
 class TestTokenEmbeddings:
     """Test cases for TokenEmbeddings."""

     def test_initialization(self, vocab_size, embed_dim):
         """Test that TokenEmbeddings can be initialized."""
         embeddings = TokenEmbeddings(vocab_size, embed_dim)
         assert embeddings is not None

         # Check embedding layer
-        assert hasattr(embeddings, '_embedding')
+        assert hasattr(embeddings, "_embedding")
         assert embeddings._embedding.weight.shape == (vocab_size, embed_dim)

     def test_forward_pass(self, vocab_size, embed_dim, random_inputs):
         """Test forward pass of TokenEmbeddings."""
         embeddings = TokenEmbeddings(vocab_size, embed_dim)

         # Forward pass
         output = embeddings(random_inputs)

         # Check output shape
-        assert output.shape == (random_inputs.shape[0], random_inputs.shape[1], embed_dim)
+        assert output.shape == (
+            random_inputs.shape[0],
+            random_inputs.shape[1],
+            embed_dim,
+        )
         assert isinstance(output, torch.Tensor)

     def test_embedding_weights(self, vocab_size, embed_dim):
         """Test that embedding weights are properly initialized."""
         embeddings = TokenEmbeddings(vocab_size, embed_dim)

         weights = embeddings._embedding.weight
         assert weights.requires_grad is True

         # Check that weights are not all zeros
         assert not torch.allclose(weights, torch.zeros_like(weights))

     def test_different_vocab_sizes(self):
         """Test TokenEmbeddings with different vocabulary sizes."""
-        test_cases = [
-            (100, 128),
-            (1000, 256),
-            (50000, 512)
-        ]
+        test_cases = [(100, 128), (1000, 256), (50000, 512)]

         for vocab_size, embed_dim in test_cases:
             embeddings = TokenEmbeddings(vocab_size, embed_dim)
             assert embeddings._embedding.weight.shape == (vocab_size, embed_dim)

     def test_gradient_flow(self, vocab_size, embed_dim, random_inputs):
         """Test that gradients flow through TokenEmbeddings."""
         embeddings = TokenEmbeddings(vocab_size, embed_dim)

         # Forward pass
         output = embeddings(random_inputs)

         # Create a dummy loss and backward pass
         loss = output.sum()
         loss.backward()

         # Check that gradients are computed
         assert embeddings._embedding.weight.grad is not None
-        assert not torch.allclose(embeddings._embedding.weight.grad,
-                                  torch.zeros_like(embeddings._embedding.weight.grad))
+        assert not torch.allclose(
+            embeddings._embedding.weight.grad,
+            torch.zeros_like(embeddings._embedding.weight.grad),
+        )

     def test_device_consistency(self, vocab_size, embed_dim, random_inputs, device):
         """Test that TokenEmbeddings works on correct device."""
         embeddings = TokenEmbeddings(vocab_size, embed_dim).to(device)
         inputs = random_inputs.to(device)

         # Forward pass
         output = embeddings(inputs)

         # Check device consistency
         assert output.device == device
         assert embeddings._embedding.weight.device == device

     def test_embedding_lookup(self, vocab_size, embed_dim):
         """Test specific embedding lookups."""
         embeddings = TokenEmbeddings(vocab_size, embed_dim)

         # Test lookup for specific tokens
-        test_tokens = torch.tensor([[0, 1, 2], [vocab_size - 1, vocab_size - 2, vocab_size - 3]])
+        test_tokens = torch.tensor(
+            [[0, 1, 2], [vocab_size - 1, vocab_size - 2, vocab_size - 3]]
+        )

         output = embeddings(test_tokens)

         # Check shape
         assert output.shape == (2, 3, embed_dim)

         # Check that different tokens have different embeddings
         # (with high probability due to random initialization)
         assert not torch.allclose(output[0, 0], output[0, 1], rtol=1e-4)

     @pytest.mark.parametrize("batch_size,seq_len", [(1, 1), (2, 10), (8, 64)])
     def test_different_input_shapes(self, vocab_size, embed_dim, batch_size, seq_len):
         """Test TokenEmbeddings with different input shapes."""
         embeddings = TokenEmbeddings(vocab_size, embed_dim)

         inputs = torch.randint(0, vocab_size, (batch_size, seq_len))
         output = embeddings(inputs)

         assert output.shape == (batch_size, seq_len, embed_dim)
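Finally, the TokenEmbeddings assertions above constrain little more than an nn.Embedding wrapper stored as _embedding that maps integer id tensors [batch, seq_len] to float tensors [batch, seq_len, embed_dim]. A sketch consistent with that; whether the real class also scales the output (e.g. by sqrt(embed_dim)) is not visible in this diff.

# Assumed TokenEmbeddings sketch, not the repository's implementation.
import torch
from torch import nn


class TokenEmbeddings(nn.Module):
    def __init__(self, vocab_size, emb_size):
        super().__init__()
        self._embedding = nn.Embedding(vocab_size, emb_size)

    def forward(self, token_ids):
        # [batch, seq_len] int64 ids -> [batch, seq_len, emb_size] embeddings
        return self._embedding(token_ids)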