diff --git a/pyproject.toml b/pyproject.toml
index e4796a580..eaec3a00e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -148,6 +148,9 @@
             "-W ignore::beartype.roar.BeartypeDecorHintPep585DeprecationWarning",
         ]
         doctest_optionflags="NORMALIZE_WHITESPACE ELLIPSIS FLOAT_CMP"
+        markers=[
+            "slow: marks tests as slow (deselect with '-m \"not slow\"')",
+        ]
         filterwarnings=[
             "ignore:pkg_resources is deprecated as an API:DeprecationWarning",
             # Ignore numpy.distutils deprecation warning caused by pandas
diff --git a/tests/acceptance/model_bridge/compatibility/test_activation_cache.py b/tests/acceptance/model_bridge/compatibility/test_activation_cache.py
index 97ea225d1..94fbec582 100644
--- a/tests/acceptance/model_bridge/compatibility/test_activation_cache.py
+++ b/tests/acceptance/model_bridge/compatibility/test_activation_cache.py
@@ -4,7 +4,6 @@
 import torch
 
 from transformer_lens.ActivationCache import ActivationCache
-from transformer_lens.model_bridge import TransformerBridge
 
 
 class TestActivationCacheCompatibility:
@@ -14,16 +13,15 @@ class TestActivationCacheCompatibility:
     def cleanup_after_class(self):
         """Clean up memory after each test class."""
         yield
-        # Clear GPU memory
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
         for _ in range(3):
             gc.collect()
 
     @pytest.fixture(scope="class")
-    def bridge_model(self):
-        """Create a TransformerBridge model for testing."""
-        return TransformerBridge.boot_transformers("gpt2", device="cpu")
+    def bridge_model(self, gpt2_bridge):
+        """Use session-scoped gpt2 bridge."""
+        return gpt2_bridge
 
     @pytest.fixture(scope="class")
     def sample_cache(self, bridge_model):
diff --git a/tests/acceptance/model_bridge/compatibility/test_backward_hooks.py b/tests/acceptance/model_bridge/compatibility/test_backward_hooks.py
index 5b355ed78..682f519c3 100644
--- a/tests/acceptance/model_bridge/compatibility/test_backward_hooks.py
+++ b/tests/acceptance/model_bridge/compatibility/test_backward_hooks.py
@@ -4,9 +4,6 @@
 import pytest
 import torch
 
-from transformer_lens import HookedTransformer
-from transformer_lens.model_bridge import TransformerBridge
-
 
 class TestBackwardHookCompatibility:
     """Test backward hook compatibility between TransformerBridge and HookedTransformer."""
@@ -14,17 +11,16 @@ class TestBackwardHookCompatibility:
     @pytest.mark.skip(
         reason="hook_mlp_out has known gradient differences due to architectural bridging (0.875 diff, but forward pass matches perfectly)"
     )
-    def test_backward_hook_gradients_match_hooked_transformer(self):
+    def test_backward_hook_gradients_match_hooked_transformer(
+        self, gpt2_hooked_unprocessed, gpt2_bridge_compat_no_processing
+    ):
         """Test that backward hook gradients match between TransformerBridge and HookedTransformer.
 
         This test ensures that backward hooks see identical gradient values in both
         TransformerBridge and HookedTransformer when using no_processing mode.
         """
-        hooked_model = HookedTransformer.from_pretrained_no_processing("gpt2", device_map="cpu")
-        bridge_model: TransformerBridge = TransformerBridge.boot_transformers(
-            "gpt2", device="cpu"
-        )  # type: ignore
-        bridge_model.enable_compatibility_mode(no_processing=True)
+        hooked_model = gpt2_hooked_unprocessed
+        bridge_model = gpt2_bridge_compat_no_processing
 
         test_input = torch.tensor([[1, 2, 3]])
 
@@ -51,16 +47,7 @@ def sum_bridge_grads(grad, hook=None):
             out = bridge_model(test_input)
             out.sum().backward()
 
-        print(f"HookedTransformer gradient sum: {hooked_grad_sum.item():.6f}")
-        print(f"TransformerBridge gradient sum: {bridge_grad_sum.item():.6f}")
-        print(f"Difference: {abs(hooked_grad_sum - bridge_grad_sum).item():.6f}")
         assert torch.allclose(hooked_grad_sum, bridge_grad_sum, atol=1e-2, rtol=1e-2), (
             f"Gradient sums should be identical but differ by "
             f"{abs(hooked_grad_sum - bridge_grad_sum).item():.6f}"
         )
-
-
-if __name__ == "__main__":
-    test = TestBackwardHookCompatibility()
-    test.test_backward_hook_gradients_match_hooked_transformer()
-    print("✅ Backward hook compatibility test passed!")
diff --git a/tests/acceptance/model_bridge/compatibility/test_bridge_hooks.py b/tests/acceptance/model_bridge/compatibility/test_bridge_hooks.py
deleted file mode 100644
index 7d13c2b62..000000000
--- a/tests/acceptance/model_bridge/compatibility/test_bridge_hooks.py
+++ /dev/null
@@ -1,226 +0,0 @@
-#!/usr/bin/env python3
-"""Test suite for TransformerBridge hook system functionality."""
-
-import pytest
-import torch
-
-from transformer_lens import HookedTransformer
-from transformer_lens.model_bridge import TransformerBridge
-
-
-class TestTransformerBridgeHooks:
-    """Test TransformerBridge hook system functionality."""
-
-    @pytest.fixture
-    def bridge_model(self):
-        """Create TransformerBridge with compatibility mode enabled."""
-        device = "cpu"
-        model_name = "gpt2"
-
-        bridge = TransformerBridge.boot_transformers(model_name, device=device)
-        bridge.enable_compatibility_mode()
-        return bridge
-
-    @pytest.fixture
-    def reference_ht(self):
-        """Create reference HookedTransformer for comparison."""
-        device = "cpu"
-        model_name = "gpt2"
-
-        return HookedTransformer.from_pretrained(
-            model_name,
-            device=device,
-            fold_ln=True,
-            center_writing_weights=True,
-            center_unembed=True,
-            fold_value_biases=True,
-            refactor_factored_attn_matrices=False,
-        )
-
-    def test_hook_registry_completeness(self, bridge_model, reference_ht):
-        """Test that bridge has complete hook registry."""
-        key_hooks = [
-            "hook_embed",
-            "hook_pos_embed",
-            "blocks.0.attn.hook_q",
-            "blocks.0.attn.hook_k",
-            "blocks.0.attn.hook_v",
-            "blocks.0.attn.hook_z",
-        ]
-
-        for hook_name in key_hooks:
-            assert hook_name in reference_ht.hook_dict, f"Reference HT missing {hook_name}"
-            # Aliases are in hook_dict, not _hook_registry
-            assert hook_name in bridge_model.hook_dict, f"Bridge missing {hook_name}"
-
-        # Bridge should have substantial number of hooks (canonical + aliases)
-        assert len(bridge_model.hook_dict) > 200, "Bridge should have substantial hook registry"
-
-    def test_basic_hook_functionality(self, bridge_model):
-        """Test that hooks fire and can modify activations."""
-        test_text = "Natural language processing"
-        hook_fired = False
-
-        def test_hook(activation, hook):
-            nonlocal hook_fired
-            hook_fired = True
-            assert isinstance(activation, torch.Tensor), "Hook should receive tensor"
-            assert activation.shape[-1] > 0, "Activation should have meaningful shape"
-            return activation
-
-        result = bridge_model.run_with_hooks(
-            test_text, return_type="logits", fwd_hooks=[("hook_embed", test_hook)]
-        )
-
-        assert hook_fired, "Hook should have fired"
-        assert isinstance(result, torch.Tensor), "Should return tensor result"
-
-    def test_ablation_hook_effect(self, bridge_model):
-        """Test that ablation hooks actually affect output."""
-        test_text = "Natural language processing"
-
-        baseline_loss = bridge_model(test_text, return_type="loss")
-
-        def ablation_hook(activation, hook):
-            activation[:, :, 0, :] = 0
-            return activation
-
-        ablated_loss = bridge_model.run_with_hooks(
-            test_text, return_type="loss", fwd_hooks=[("blocks.0.attn.hook_v", ablation_hook)]
-        )
-
-        effect = abs(ablated_loss - baseline_loss)
-        assert effect > 1e-6, f"Ablation should have meaningful effect (got {effect:.6f})"
-
-    def test_hook_equivalence_with_reference(self, bridge_model, reference_ht):
-        """Test that hooks produce equivalent effects to reference HookedTransformer."""
-        test_text = "Natural language processing"
-
-        def ablation_hook(activation, hook):
-            activation[:, :, 5, :] = 0
-            return activation
-
-        ht_baseline = reference_ht(test_text, return_type="loss")
-        ht_ablated = reference_ht.run_with_hooks(
-            test_text, return_type="loss", fwd_hooks=[("blocks.0.attn.hook_v", ablation_hook)]
-        )
-
-        bridge_baseline = bridge_model(test_text, return_type="loss")
-        bridge_ablated = bridge_model.run_with_hooks(
-            test_text, return_type="loss", fwd_hooks=[("blocks.0.attn.hook_v", ablation_hook)]
-        )
-
-        # Small numerical differences expected between implementations
-        ht_effect = ht_ablated - ht_baseline
-        bridge_effect = bridge_ablated - bridge_baseline
-
-        effect_diff = abs(ht_effect - bridge_effect)
-        assert (
-            effect_diff < 2e-4
-        ), f"Hook effects should match between models (diff: {effect_diff:.6f})"
-
-    def test_multiple_hooks(self, bridge_model):
-        """Test that multiple hooks can be applied simultaneously."""
-        test_text = "Natural language processing"
-        hooks_fired = set()
-
-        def make_hook(hook_id):
-            def hook_fn(activation, hook):
-                hooks_fired.add(hook_id)
-                return activation
-
-            return hook_fn
-
-        result = bridge_model.run_with_hooks(
-            test_text,
-            return_type="logits",
-            fwd_hooks=[
-                ("hook_embed", make_hook("embed")),
-                ("blocks.0.attn.hook_q", make_hook("q")),
-                ("blocks.0.attn.hook_v", make_hook("v")),
-            ],
-        )
-
-        expected_hooks = {"embed", "q", "v"}
-        assert hooks_fired == expected_hooks, f"Expected {expected_hooks}, got {hooks_fired}"
-
-    def test_hook_activation_shapes(self, bridge_model):
-        """Test that hook activations have expected shapes."""
-        test_text = "The quick brown fox"
-        captured_shapes = {}
-
-        def capture_shape_hook(hook_name):
-            def hook_fn(activation, hook):
-                captured_shapes[hook_name] = activation.shape
-                return activation
-
-            return hook_fn
-
-        bridge_model.run_with_hooks(
-            test_text,
-            return_type="logits",
-            fwd_hooks=[
-                ("hook_embed", capture_shape_hook("embed")),
-                ("blocks.0.attn.hook_v", capture_shape_hook("v")),
-                ("blocks.0.mlp.hook_pre", capture_shape_hook("mlp_pre")),
-            ],
-        )
-
-        assert len(captured_shapes) == 3, "Should have captured 3 activations"
-        embed_shape = captured_shapes["embed"]
-        assert len(embed_shape) == 3, "Embedding should be 3D"
-        assert embed_shape[-1] == 768, "Should have d_model=768 for GPT2"
-
-        v_shape = captured_shapes["v"]
-        assert len(v_shape) == 4, "Attention values should be 4D"
-        assert v_shape[2] == 12, "Should have 12 heads for GPT2"
-
-    def test_hook_context_manager(self, bridge_model):
-        """Test hook context manager functionality."""
-        test_text = "Natural language processing"
-        hook_fired = False
-
-        def test_hook(activation, hook):
-            nonlocal hook_fired
-            hook_fired = True
-            return activation
-
-        with bridge_model.hooks(fwd_hooks=[("hook_embed", test_hook)]):
-            result = bridge_model(test_text, return_type="logits")
-
-        assert hook_fired, "Hook should have fired in context"
-
-        hook_fired = False
-        bridge_model(test_text, return_type="logits")
-        assert not hook_fired, "Hook should be removed after context"
-
-
-def test_standalone_hook_functionality():
-    """Standalone test for basic hook functionality."""
-    device = "cpu"
-    model_name = "gpt2"
-
-    bridge = TransformerBridge.boot_transformers(model_name, device=device)
-    bridge.enable_compatibility_mode()
-
-    test_text = "The quick brown fox"
-
-    hook_called = False
-
-    def test_hook(activation, hook):
-        nonlocal hook_called
-        hook_called = True
-        print(f"Hook fired: {hook.name}, shape: {activation.shape}")
-        return activation
-
-    result = bridge.run_with_hooks(
-        test_text, return_type="loss", fwd_hooks=[("blocks.0.attn.hook_v", test_hook)]
-    )
-
-    assert hook_called, "Hook should have been called"
-    assert isinstance(result, torch.Tensor), "Should return tensor result"
-    print(f"✅ Hook test passed! Loss: {result:.6f}")
-
-
-if __name__ == "__main__":
-    test_standalone_hook_functionality()
diff --git a/tests/acceptance/model_bridge/compatibility/test_hook_completeness.py b/tests/acceptance/model_bridge/compatibility/test_hook_completeness.py
index 7f3888db6..78896053b 100644
--- a/tests/acceptance/model_bridge/compatibility/test_hook_completeness.py
+++ b/tests/acceptance/model_bridge/compatibility/test_hook_completeness.py
@@ -15,7 +15,7 @@
 from transformer_lens.benchmarks import benchmark_forward_hooks, benchmark_hook_registry
 from transformer_lens.model_bridge import TransformerBridge
 
-pytestmark = pytest.mark.skip(reason="Temporarily skipping hook completeness tests pending fixes")
+pytestmark = pytest.mark.slow
 
 # Diverse architectures for hook completeness testing
 MODELS_TO_TEST = [
@@ -71,7 +71,9 @@ def test_all_hooks_fire(self, model_name):
         test_text = "The quick brown fox"
 
         # Run benchmark - this will fail if hooks don't fire
-        result = benchmark_forward_hooks(bridge, test_text, reference_model=ht, tolerance=1e-3)
+        # tolerance=1e-2: some architectures (e.g., pythia) accumulate small floating-point
+        # differences across layers that exceed 1e-3 but are not meaningful divergences.
+        result = benchmark_forward_hooks(bridge, test_text, reference_model=ht, tolerance=1e-2)
 
         # Must pass - all hooks must fire
         assert result.passed, (
diff --git a/tests/acceptance/model_bridge/compatibility/test_hook_duplication.py b/tests/acceptance/model_bridge/compatibility/test_hook_duplication.py
index 766f2a714..254b5a741 100644
--- a/tests/acceptance/model_bridge/compatibility/test_hook_duplication.py
+++ b/tests/acceptance/model_bridge/compatibility/test_hook_duplication.py
@@ -2,15 +2,13 @@
 
 import torch
 
-from transformer_lens import HookedTransformer
-from transformer_lens.model_bridge import TransformerBridge
 
-
-def test_TransformerBridge_compatibility_mode_calls_hooks_once():
+def test_TransformerBridge_compatibility_mode_calls_hooks_once(
+    gpt2_hooked_unprocessed, gpt2_bridge_compat_no_processing
+):
     """Regression test: hooks fire exactly once even with aliased HookPoint names."""
-    hooked_model = HookedTransformer.from_pretrained_no_processing("gpt2", device_map="cpu")
-    bridge_model: TransformerBridge = TransformerBridge.boot_transformers("gpt2", device="cpu")  # type: ignore
-    bridge_model.enable_compatibility_mode(no_processing=True)
+    hooked_model = gpt2_hooked_unprocessed
+    bridge_model = gpt2_bridge_compat_no_processing
 
     test_input = torch.tensor([[1, 2, 3]])
 
@@ -47,10 +45,9 @@ def count_bridge_calls(acts, hook):
     )
 
 
-def test_hook_mlp_out_aliasing():
+def test_hook_mlp_out_aliasing(gpt2_bridge_compat_no_processing):
     """Test that hook_mlp_out is properly aliased to mlp.hook_out in compatibility mode."""
-    bridge_model: TransformerBridge = TransformerBridge.boot_transformers("gpt2", device="cpu")  # type: ignore
-    bridge_model.enable_compatibility_mode(no_processing=True)
+    bridge_model = gpt2_bridge_compat_no_processing
 
     block0 = bridge_model.blocks[0]
 
@@ -61,10 +58,9 @@ def test_hook_mlp_out_aliasing():
     ), "hook_mlp_out should be aliased to mlp.hook_out (same object)"
 
 
-def test_stateful_hook_pattern():
+def test_stateful_hook_pattern(gpt2_bridge_compat_no_processing):
     """Test stateful closure pattern (circuit-tracer's cache-then-pop) with aliased hooks."""
-    bridge_model: TransformerBridge = TransformerBridge.boot_transformers("gpt2", device="cpu")  # type: ignore
-    bridge_model.enable_compatibility_mode(no_processing=True)
+    bridge_model = gpt2_bridge_compat_no_processing
 
     test_input = torch.tensor([[1, 2, 3]])
     block = bridge_model.blocks[0]
diff --git a/tests/acceptance/model_bridge/compatibility/test_legacy_hooked_transformer_coverage.py b/tests/acceptance/model_bridge/compatibility/test_legacy_hooked_transformer_coverage.py
index 1d4d0dd63..ff5f7bdff 100644
--- a/tests/acceptance/model_bridge/compatibility/test_legacy_hooked_transformer_coverage.py
+++ b/tests/acceptance/model_bridge/compatibility/test_legacy_hooked_transformer_coverage.py
@@ -29,8 +29,10 @@ def model_name(self, request):
         return request.param
 
     @pytest.fixture(scope="class")
-    def bridge_model(self, model_name):
-        """Create a TransformerBridge model for testing."""
+    def bridge_model(self, model_name, gpt2_bridge):
+        """Use session-scoped fixture for gpt2, load fresh for other models."""
+        if model_name == "gpt2":
+            return gpt2_bridge
         try:
             return TransformerBridge.boot_transformers(model_name, device="cpu")
         except Exception as e:
diff --git a/tests/acceptance/model_bridge/compatibility/test_run_with_cache.py b/tests/acceptance/model_bridge/compatibility/test_run_with_cache.py
index 77c81d0e9..7a2e6a169 100644
--- a/tests/acceptance/model_bridge/compatibility/test_run_with_cache.py
+++ b/tests/acceptance/model_bridge/compatibility/test_run_with_cache.py
@@ -3,41 +3,28 @@
 
 import torch
 
-from transformer_lens import HookedTransformer
-from transformer_lens.model_bridge import TransformerBridge
-
 
 class TestRunWithCacheCompatibility:
     """Test run_with_cache compatibility between TransformerBridge and HookedTransformer."""
 
-    def test_run_with_cache_matches_forward_pass(self):
+    def test_run_with_cache_matches_forward_pass(self, gpt2_bridge_compat_no_processing):
         """Test that run_with_cache produces identical results to a regular forward pass."""
-        bridge_model: TransformerBridge = TransformerBridge.boot_transformers(
-            "gpt2", device="cpu"
-        )  # type: ignore
-        bridge_model.enable_compatibility_mode(no_processing=True)
+        bridge_model = gpt2_bridge_compat_no_processing
 
         test_input = torch.tensor([[1, 2, 3]])
         bridge_logits_cache, _ = bridge_model.run_with_cache(test_input)
         bridge_logits_manual = bridge_model(test_input)
 
-        print(f"Cache logits shape: {bridge_logits_cache.shape}")
-        print(f"Manual logits shape: {bridge_logits_manual.shape}")
-        print(
-            f"Max difference: {torch.abs(bridge_logits_cache - bridge_logits_manual).max().item():.6f}"
-        )
-
         assert torch.allclose(
             bridge_logits_cache, bridge_logits_manual, atol=1e-2
         ), "run_with_cache should produce identical results to forward pass"
 
-    def test_run_with_cache_returns_correct_cached_values(self):
+    def test_run_with_cache_returns_correct_cached_values(
+        self, gpt2_hooked_unprocessed, gpt2_bridge_compat_no_processing
+    ):
         """Test that run_with_cache returns correct cached activation values."""
-        hooked_model = HookedTransformer.from_pretrained_no_processing("gpt2", device_map="cpu")
-        bridge_model: TransformerBridge = TransformerBridge.boot_transformers(
-            "gpt2", device="cpu"
-        )  # type: ignore
-        bridge_model.enable_compatibility_mode(no_processing=True)
+        hooked_model = gpt2_hooked_unprocessed
+        bridge_model = gpt2_bridge_compat_no_processing
 
         test_input = torch.tensor([[1, 2, 3]])
 
@@ -62,31 +49,15 @@ def hook_fn(acts, hook):
             bridge_model(test_input)
 
         # Verify cache matches manual hooks
-        print(f"HookedTransformer cache sum: {hooked_cache['blocks.0.hook_mlp_out'].sum():.6f}")
-        print(f"HookedTransformer manual sum: {manual_cache['hooked'].sum():.6f}")
         assert torch.allclose(
             hooked_cache["blocks.0.hook_mlp_out"], manual_cache["hooked"], atol=1e-5
         ), "HookedTransformer run_with_cache should match manual hooks"
 
         # Same check for TransformerBridge
-        print(f"TransformerBridge cache sum: {bridge_cache['blocks.0.hook_mlp_out'].sum():.6f}")
-        print(f"TransformerBridge manual sum: {manual_cache['bridge'].sum():.6f}")
         cache_diff = (bridge_cache["blocks.0.hook_mlp_out"] - manual_cache["bridge"]).abs().max()
-        print(f"Max difference: {cache_diff:.6f}")
-
         assert torch.allclose(
             bridge_cache["blocks.0.hook_mlp_out"], manual_cache["bridge"], atol=1e-2, rtol=1e-2
         ), (
             f"TransformerBridge run_with_cache should match manual hooks. "
-            f"Cache sum: {bridge_cache['blocks.0.hook_mlp_out'].sum():.6f}, "
-            f"Manual hooks sum: {manual_cache['bridge'].sum():.6f}, "
-            f"Difference: {cache_diff:.6f}"
+            f"Max difference: {cache_diff:.6f}"
         )
-
-
-if __name__ == "__main__":
-    test = TestRunWithCacheCompatibility()
-    test.test_run_with_cache_matches_forward_pass()
-    print("✅ run_with_cache forward pass test passed!")
-    test.test_run_with_cache_returns_correct_cached_values()
-    print("✅ run_with_cache cached values test passed!")
diff --git a/tests/acceptance/model_bridge/conftest.py b/tests/acceptance/model_bridge/conftest.py
new file mode 100644
index 000000000..7012b890d
--- /dev/null
+++ b/tests/acceptance/model_bridge/conftest.py
@@ -0,0 +1,44 @@
+"""Shared fixtures for model_bridge acceptance tests.
+
+Session-scoped fixtures avoid redundant model loads across test files.
+All models used here must be in the CI cache (see .github/workflows/checks.yml).
+"""
+
+import pytest
+
+from transformer_lens import HookedTransformer
+from transformer_lens.model_bridge import TransformerBridge
+
+
+@pytest.fixture(scope="session")
+def gpt2_bridge():
+    """TransformerBridge wrapping gpt2 (no compatibility mode)."""
+    return TransformerBridge.boot_transformers("gpt2", device="cpu")
+
+
+@pytest.fixture(scope="session")
+def gpt2_bridge_compat():
+    """TransformerBridge wrapping gpt2 with compatibility mode enabled."""
+    bridge = TransformerBridge.boot_transformers("gpt2", device="cpu")
+    bridge.enable_compatibility_mode()
+    return bridge
+
+
+@pytest.fixture(scope="session")
+def gpt2_bridge_compat_no_processing():
+    """TransformerBridge wrapping gpt2 with compatibility mode but no weight processing."""
+    bridge = TransformerBridge.boot_transformers("gpt2", device="cpu")
+    bridge.enable_compatibility_mode(no_processing=True)
+    return bridge
+
+
+@pytest.fixture(scope="session")
+def gpt2_hooked_processed():
+    """HookedTransformer gpt2 with default weight processing."""
+    return HookedTransformer.from_pretrained("gpt2", device="cpu")
+
+
+@pytest.fixture(scope="session")
+def gpt2_hooked_unprocessed():
+    """HookedTransformer gpt2 without weight processing."""
+    return HookedTransformer.from_pretrained_no_processing("gpt2", device="cpu")
diff --git a/tests/acceptance/model_bridge/test_t5_compatibility_mode.py b/tests/acceptance/model_bridge/test_t5_compatibility_mode.py
index 24c13bb4a..1382e57ac 100644
--- a/tests/acceptance/model_bridge/test_t5_compatibility_mode.py
+++ b/tests/acceptance/model_bridge/test_t5_compatibility_mode.py
@@ -225,6 +225,37 @@ def test_t5_block_bridge_hooks(self, bridge_model):
         # Decoder blocks SHOULD have hook_resid_mid2 (3 layers - after cross-attn)
         assert hasattr(decoder_block, "hook_resid_mid2")
 
+    def test_encoder_forward_produces_valid_output(self, bridge_model):
+        """Test that T5 bridge produces valid output on a forward pass."""
+        tokens = bridge_model.to_tokens("The quick brown fox")
+        with torch.no_grad():
+            output = bridge_model(tokens)
+
+        # Output should be a tensor with reasonable shape
+        assert isinstance(output, torch.Tensor), f"Expected tensor, got {type(output)}"
+        assert output.ndim >= 2, f"Expected at least 2D output, got {output.ndim}D"
+        assert not torch.isnan(output).any(), "Output contains NaN"
+        assert not torch.isinf(output).any(), "Output contains Inf"
+
+    def test_run_with_cache_populates_encoder_and_decoder(self, bridge_model):
+        """Test that run_with_cache returns activations from both encoder and decoder."""
+        tokens = bridge_model.to_tokens("Translate this")
+        with torch.no_grad():
+            _, cache = bridge_model.run_with_cache(tokens)
+
+        cache_keys = list(cache.keys())
+        assert len(cache_keys) > 0, "Cache should not be empty"
+
+        encoder_keys = [k for k in cache_keys if "encoder" in k]
+        decoder_keys = [k for k in cache_keys if "decoder" in k]
+
+        assert (
+            len(encoder_keys) > 0
+        ), f"Cache should contain encoder activations. Keys: {cache_keys[:10]}"
+        assert (
+            len(decoder_keys) > 0
+        ), f"Cache should contain decoder activations. Keys: {cache_keys[:10]}"
+
     def test_rms_normalization_used(self, bridge_model):
         """Test that T5 uses RMSNormalizationBridge throughout."""
         from transformer_lens.model_bridge.generalized_components.rms_normalization import (
diff --git a/tests/integration/model_bridge/compatibility/test_bridge_cache_behavior.py b/tests/integration/model_bridge/compatibility/test_bridge_cache_behavior.py
new file mode 100644
index 000000000..67af4413b
--- /dev/null
+++ b/tests/integration/model_bridge/compatibility/test_bridge_cache_behavior.py
@@ -0,0 +1,165 @@
+"""Consolidated tests for TransformerBridge cache behavior.
+
+Tests run_with_cache output, cache contents, names filtering, and
+cache equality with HookedTransformer. Consolidates overlapping tests from:
+- tests/integration/model_bridge/compatibility/test_hooks.py (cache tests)
+- tests/integration/model_bridge/compatibility/test_legacy_hooks.py
+
+Uses distilgpt2 (CI-cached) for speed unless gpt2-specific behavior is tested.
+"""
+
+import pytest
+import torch
+
+from transformer_lens import HookedTransformer
+from transformer_lens.model_bridge import TransformerBridge
+
+
+@pytest.fixture(scope="module")
+def bridge_compat():
+    """TransformerBridge with compatibility mode."""
+    b = TransformerBridge.boot_transformers("distilgpt2", device="cpu")
+    b.enable_compatibility_mode()
+    return b
+
+
+@pytest.fixture(scope="module")
+def reference_ht():
+    """HookedTransformer for comparison."""
+    return HookedTransformer.from_pretrained("distilgpt2", device="cpu")
+
+
+EXPECTED_HOOKS = [
+    "hook_embed",
+    "hook_pos_embed",
+    "blocks.0.hook_resid_pre",
+    "blocks.0.hook_resid_mid",
+    "blocks.0.hook_resid_post",
+    "blocks.0.ln1.hook_scale",
+    "blocks.0.ln1.hook_normalized",
+    "blocks.0.ln2.hook_scale",
+    "blocks.0.ln2.hook_normalized",
+    "blocks.0.attn.hook_q",
+    "blocks.0.attn.hook_k",
+    "blocks.0.attn.hook_v",
+    "blocks.0.attn.hook_z",
+    "blocks.0.attn.hook_attn_scores",
+    "blocks.0.attn.hook_pattern",
+    "blocks.0.attn.hook_result",
+    "blocks.0.mlp.hook_pre",
+    "blocks.0.mlp.hook_post",
+    "blocks.0.hook_attn_out",
+    "blocks.0.hook_mlp_out",
+    "ln_final.hook_scale",
+    "ln_final.hook_normalized",
+]
+
+
+class TestCacheBasics:
+    """Test basic cache functionality."""
+
+    def test_run_with_cache_returns_nonempty(self, bridge_compat):
+        """run_with_cache returns a non-empty cache."""
+        with torch.no_grad():
+            _, cache = bridge_compat.run_with_cache("Hello world")
+        assert len(cache) > 0
+
+    def test_cache_contains_residual_hooks(self, bridge_compat):
+        """Cache should contain residual stream hooks."""
+        with torch.no_grad():
+            _, cache = bridge_compat.run_with_cache("Hello world")
+        cache_keys = list(cache.keys())
+        assert any("hook_resid" in k for k in cache_keys)
+
+    def test_cache_values_are_tensors(self, bridge_compat):
+        """All cached values should be tensors with correct batch dimension."""
+        with torch.no_grad():
+            _, cache = bridge_compat.run_with_cache("Hello")
+        for key, value in cache.items():
+            assert isinstance(value, torch.Tensor), f"Cache[{key}] is {type(value)}"
+            assert value.shape[0] == 1, f"Cache[{key}] batch dim is {value.shape[0]}"
+
+
+class TestCacheNamesFilter:
+    """Test cache names filtering."""
+
+    def test_names_filter_returns_subset(self, bridge_compat):
+        """names_filter should return only matching keys."""
+        with torch.no_grad():
+            _, full_cache = bridge_compat.run_with_cache("Hello")
+            _, filtered_cache = bridge_compat.run_with_cache(
+                "Hello",
+                names_filter=lambda name: "hook_resid_pre" in name,
+            )
+
+        assert len(filtered_cache) > 0
+        assert len(filtered_cache) < len(full_cache)
+        for key in filtered_cache:
+            assert "hook_resid_pre" in key, f"Unexpected key: {key}"
+
+
+class TestCacheCompleteness:
+    """Test that cache contains all expected hooks."""
+
+    def test_all_expected_hooks_in_cache(self, bridge_compat):
+        """Cache should contain all expected hook names."""
+        _, cache = bridge_compat.run_with_cache("Hello World!")
+        actual_keys = set(cache.keys())
+        missing = set(EXPECTED_HOOKS) - actual_keys
+        assert len(missing) == 0, f"Missing expected hooks: {sorted(missing)}"
+
+    def test_expected_hooks_accessible_on_model(self, bridge_compat):
+        """Expected hooks should be accessible as attributes on the model."""
+        from transformer_lens.hook_points import HookPoint
+
+        missing = []
+        for hook_name in EXPECTED_HOOKS:
+            parts = hook_name.split(".")
+            current = bridge_compat
+            try:
+                for part in parts:
+                    current = getattr(current, part)
+                if not isinstance(current, HookPoint):
+                    missing.append(hook_name)
+            except AttributeError:
+                missing.append(hook_name)
+
+        assert len(missing) == 0, f"Hooks not accessible on model: {sorted(missing)}"
+
+
+class TestCacheEqualityWithHookedTransformer:
+    """Test that cache values match between bridge and HookedTransformer."""
+
+    def test_cache_values_match(self, bridge_compat, reference_ht):
+        """Cache activations should match between bridge and HookedTransformer.
+
+        Note: Raw attention scores use different masking sentinels:
+        HookedTransformer uses -inf, Bridge uses torch.finfo(dtype).min.
+        Unmasked scores and resulting patterns should still match.
+        """
+        prompt = "Hello World!"
+        _, bridge_cache = bridge_compat.run_with_cache(prompt)
+        _, ht_cache = reference_ht.run_with_cache(prompt)
+
+        for hook in EXPECTED_HOOKS:
+            if hook not in bridge_cache or hook not in ht_cache:
+                continue
+
+            ht_act = ht_cache[hook]
+            bridge_act = bridge_cache[hook]
+
+            assert (
+                ht_act.shape == bridge_act.shape
+            ), f"Shape mismatch for {hook}: {ht_act.shape} vs {bridge_act.shape}"
+
+            if hook == "blocks.0.attn.hook_attn_scores":
+                # Different masking sentinels — compare only unmasked positions
+                masked = torch.isinf(ht_act)
+                unmasked = ~masked
+                assert torch.allclose(
+                    ht_act[unmasked], bridge_act[unmasked], atol=1e-4, rtol=1e-4
+                ), "Unmasked attention scores should match"
+                continue
+
+            mean_diff = torch.abs(ht_act - bridge_act).mean()
+            assert mean_diff < 0.5, f"Hook {hook} mismatch: mean abs diff = {mean_diff:.6f}"
diff --git a/tests/integration/model_bridge/compatibility/test_bridge_hook_behavior.py b/tests/integration/model_bridge/compatibility/test_bridge_hook_behavior.py
new file mode 100644
index 000000000..ed7856b0f
--- /dev/null
+++ b/tests/integration/model_bridge/compatibility/test_bridge_hook_behavior.py
@@ -0,0 +1,348 @@
+"""Consolidated tests for TransformerBridge hook behavior.
+
+Tests hook firing, modification, ablation, shapes, context managers, error handling,
+and registry completeness. Consolidates overlapping tests from:
+- tests/acceptance/model_bridge/compatibility/test_bridge_hooks.py
+- tests/integration/model_bridge/compatibility/test_hooks.py
+- tests/integration/model_bridge/test_attention_hook_compatibility.py
+
+Uses distilgpt2 (CI-cached) for speed unless gpt2-specific behavior is being tested.
+"""
+
+import pytest
+import torch
+
+from transformer_lens import HookedTransformer
+from transformer_lens.model_bridge import TransformerBridge
+
+
+@pytest.fixture(scope="module")
+def bridge():
+    """TransformerBridge without compatibility mode."""
+    return TransformerBridge.boot_transformers("distilgpt2", device="cpu")
+
+
+@pytest.fixture(scope="module")
+def bridge_compat():
+    """TransformerBridge with compatibility mode."""
+    b = TransformerBridge.boot_transformers("distilgpt2", device="cpu")
+    b.enable_compatibility_mode()
+    return b
+
+
+@pytest.fixture(scope="module")
+def reference_ht():
+    """HookedTransformer for comparison."""
+    return HookedTransformer.from_pretrained("distilgpt2", device="cpu")
+
+
+class TestHookFiring:
+    """Test that hooks fire correctly during forward passes."""
+
+    def test_hook_fires_once_per_forward(self, bridge):
+        """A registered forward hook fires exactly once per forward pass."""
+        count = 0
+
+        def hook_fn(tensor, hook):
+            nonlocal count
+            count += 1
+            return tensor
+
+        bridge.run_with_hooks(
+            "Hello world",
+            fwd_hooks=[("blocks.0.hook_resid_pre", hook_fn)],
+        )
+        assert count == 1
+
+    def test_hook_receives_tensor_with_batch_and_seq(self, bridge):
+        """Hook receives a tensor with at least batch and sequence dimensions."""
+        captured = {}
+
+        def hook_fn(tensor, hook):
+            captured["shape"] = tensor.shape
+            return tensor
+
+        bridge.run_with_hooks(
+            "Hello",
+            fwd_hooks=[("blocks.0.hook_resid_pre", hook_fn)],
+        )
+        assert len(captured["shape"]) >= 2
+        assert captured["shape"][0] >= 1  # batch >= 1
+
+    def test_multiple_hooks_fire_independently(self, bridge):
+        """Multiple hooks on different points each fire independently."""
+        fired = set()
+
+        def make_hook(name):
+            def hook_fn(tensor, hook):
+                fired.add(name)
+                return tensor
+
+            return hook_fn
+
+        bridge.run_with_hooks(
+            "Hello",
+            fwd_hooks=[
+                ("blocks.0.hook_resid_pre", make_hook("resid_pre_0")),
+                ("blocks.0.hook_resid_post", make_hook("resid_post_0")),
+            ],
+        )
+        assert fired == {"resid_pre_0", "resid_post_0"}
+
+    @pytest.mark.xfail(reason="add_perma_hook not yet implemented on TransformerBridge")
+    def test_perma_hook_persists_across_calls(self, bridge):
+        """A permanent hook fires on every forward pass until removed."""
+        count = 0
+
+        def hook_fn(tensor, hook):
+            nonlocal count
+            count += 1
+            return tensor
+
+        bridge.add_perma_hook("blocks.0.hook_resid_pre", hook_fn)
+        try:
+            with torch.no_grad():
+                bridge("Hello")
+                assert count == 1
+                bridge("World")
+                assert count == 2
+        finally:
+            bridge.reset_hooks()
+
+
+class TestHookModification:
+    """Test that hooks can modify activations and affect output."""
+
+    def test_zeroing_residual_changes_output(self, bridge):
+        """Zeroing a residual stream hook changes the final output."""
+        with torch.no_grad():
+            normal_output = bridge("Hello world")
+
+            def zero_hook(tensor, hook):
+                return torch.zeros_like(tensor)
+
+            modified_output = bridge.run_with_hooks(
+                "Hello world",
+                fwd_hooks=[("blocks.0.hook_resid_pre", zero_hook)],
+            )
+
+        assert not torch.allclose(normal_output, modified_output)
+
+    def test_ablation_has_nonzero_effect(self, bridge_compat):
+        """Ablating an attention head changes the loss."""
+        test_text = "Natural language processing"
+        baseline_loss = bridge_compat(test_text, return_type="loss")
+
+        def ablation_hook(activation, hook):
+            activation[:, :, 0, :] = 0
+            return activation
+
+        ablated_loss = bridge_compat.run_with_hooks(
+            test_text,
+            return_type="loss",
+            fwd_hooks=[("blocks.0.attn.hook_v", ablation_hook)],
+        )
+
+        effect = abs(ablated_loss - baseline_loss)
+        assert effect > 1e-6, f"Ablation should have meaningful effect (got {effect:.6f})"
+
+
+class TestHookAblationEquivalence:
+    """Test that ablation effects match between bridge and HookedTransformer."""
+
+    def test_ablation_effect_matches_reference(self, bridge_compat, reference_ht):
+        """Ablation effects should match between bridge and HookedTransformer."""
+        test_text = "Natural language processing"
+
+        def ablation_hook(activation, hook):
+            activation[:, :, 5, :] = 0
+            return activation
+
+        ht_baseline = reference_ht(test_text, return_type="loss")
+        ht_ablated = reference_ht.run_with_hooks(
+            test_text,
+            return_type="loss",
+            fwd_hooks=[("blocks.0.attn.hook_v", ablation_hook)],
+        )
+
+        bridge_baseline = bridge_compat(test_text, return_type="loss")
+        bridge_ablated = bridge_compat.run_with_hooks(
+            test_text,
+            return_type="loss",
+            fwd_hooks=[("blocks.0.attn.hook_v", ablation_hook)],
+        )
+
+        ht_effect = ht_ablated - ht_baseline
+        bridge_effect = bridge_ablated - bridge_baseline
+        effect_diff = abs(ht_effect - bridge_effect)
+
+        assert (
+            effect_diff < 2e-4
+        ), f"Hook effects should match between models (diff: {effect_diff:.6f})"
+
+
+class TestHookActivationShapes:
+    """Test that hook activations have expected shapes."""
+
+    def test_embedding_shape_3d(self, bridge_compat):
+        """Embedding hook should produce 3D tensor [batch, seq, d_model]."""
+        shapes = {}
+
+        def capture(name):
+            def hook_fn(activation, hook):
+                shapes[name] = activation.shape
+                return activation
+
+            return hook_fn
+
+        bridge_compat.run_with_hooks(
+            "The quick brown fox",
+            return_type="logits",
+            fwd_hooks=[("hook_embed", capture("embed"))],
+        )
+        assert len(shapes["embed"]) == 3
+        assert shapes["embed"][-1] == bridge_compat.cfg.d_model
+
+    def test_attention_v_shape_4d(self, bridge_compat):
+        """Attention V hook should produce 4D tensor [batch, seq, n_heads, d_head]."""
+        shapes = {}
+
+        def capture(name):
+            def hook_fn(activation, hook):
+                shapes[name] = activation.shape
+                return activation
+
+            return hook_fn
+
+        bridge_compat.run_with_hooks(
+            "The quick brown fox",
+            return_type="logits",
+            fwd_hooks=[("blocks.0.attn.hook_v", capture("v"))],
+        )
+        assert len(shapes["v"]) == 4
+        assert shapes["v"][2] == bridge_compat.cfg.n_heads
+
+    def test_shapes_match_reference(self, bridge_compat, reference_ht):
+        """Activation shapes should match between bridge and HookedTransformer."""
+        hook_name = "blocks.0.attn.hook_v"
+        tokens = reference_ht.to_tokens("The cat sat on")
+
+        ref_act: list[torch.Tensor] = []
+        bridge_act: list[torch.Tensor] = []
+
+        def collect_ref(a: torch.Tensor, hook: object) -> torch.Tensor:
+            ref_act.append(a)
+            return a
+
+        def collect_bridge(a: torch.Tensor, hook: object) -> torch.Tensor:
+            bridge_act.append(a)
+            return a
+
+        reference_ht.add_hook(hook_name, collect_ref)
+        bridge_compat.add_hook(hook_name, collect_bridge)
+
+        with torch.no_grad():
+            reference_ht(tokens)
+            bridge_compat(tokens)
+
+        reference_ht.reset_hooks()
+        bridge_compat.reset_hooks()
+
+        assert ref_act[0].shape == bridge_act[0].shape
+
+
+class TestHookContextManager:
+    """Test hook cleanup and context management."""
+
+    def test_run_with_hooks_cleans_up(self, bridge):
+        """Hooks from run_with_hooks don't persist after the call."""
+        count = 0
+
+        def hook_fn(tensor, hook):
+            nonlocal count
+            count += 1
+            return tensor
+
+        with torch.no_grad():
+            bridge.run_with_hooks(
+                "Hello",
+                fwd_hooks=[("blocks.0.hook_resid_pre", hook_fn)],
+            )
+        assert count == 1
+
+        count = 0
+        with torch.no_grad():
+            bridge("Hello")
+        assert count == 0, "Hook persisted after run_with_hooks returned"
+
+    def test_hooks_context_manager(self, bridge_compat):
+        """hooks() context manager adds and removes hooks correctly."""
+        hook_fired = False
+
+        def test_hook(activation, hook):
+            nonlocal hook_fired
+            hook_fired = True
+            return activation
+
+        with bridge_compat.hooks(fwd_hooks=[("hook_embed", test_hook)]):
+            bridge_compat("Natural language", return_type="logits")
+
+        assert hook_fired, "Hook should have fired in context"
+
+        hook_fired = False
+        bridge_compat("Natural language", return_type="logits")
+        assert not hook_fired, "Hook should be removed after context"
+
+
+class TestHookRegistry:
+    """Test hook registry completeness."""
+
+    def test_key_hooks_present(self, bridge_compat, reference_ht):
+        """Key hooks should be present in both bridge and reference."""
+        key_hooks = [
+            "hook_embed",
+            "hook_pos_embed",
+            "blocks.0.attn.hook_q",
+            "blocks.0.attn.hook_k",
+            "blocks.0.attn.hook_v",
+            "blocks.0.attn.hook_z",
+        ]
+        for hook_name in key_hooks:
+            assert hook_name in reference_ht.hook_dict, f"Reference missing {hook_name}"
+            assert hook_name in bridge_compat.hook_dict, f"Bridge missing {hook_name}"
+
+    def test_bridge_has_substantial_hooks(self, bridge_compat):
+        """Bridge should have a substantial number of hooks.
+
+        distilgpt2 has ~301 hooks, gpt2 has ~589. Threshold of 200 catches
+        regressions where large portions of the hook registry are lost.
+        """
+        assert len(bridge_compat.hook_dict) > 200
+
+    def test_expected_attention_hooks_available(self, bridge_compat):
+        """Expected attention hook names should be available."""
+        expected = [
+            "blocks.0.attn.hook_v",
+            "blocks.0.attn.hook_q",
+            "blocks.0.attn.hook_k",
+        ]
+        hook_names = set(bridge_compat.hook_dict.keys())
+        for hook_name in expected:
+            assert hook_name in hook_names, f"Bridge missing hook: {hook_name}"
+
+
+class TestHookErrorHandling:
+    """Test error handling in hooks."""
+
+    def test_hook_error_propagates(self, bridge_compat):
+        """Errors in hooks should propagate to the caller."""
+        tokens = bridge_compat.to_tokens("test")
+
+        def error_hook(activation, hook):
+            raise ValueError("Test error in hook")
+
+        bridge_compat.add_hook("blocks.0.attn.hook_v", error_hook)
+        with pytest.raises(ValueError, match="Test error in hook"):
+            with torch.no_grad():
+                bridge_compat(tokens)
+        bridge_compat.reset_hooks()
diff --git a/tests/integration/model_bridge/compatibility/test_hooks.py b/tests/integration/model_bridge/compatibility/test_hooks.py
deleted file mode 100644
index 9cca30334..000000000
--- a/tests/integration/model_bridge/compatibility/test_hooks.py
+++ /dev/null
@@ -1,179 +0,0 @@
-"""Focused hook tests for TransformerBridge.
-
-Tests that Bridge hooks fire correctly, can modify activations,
-and that run_with_cache returns populated caches.
-"""
-
-import pytest
-import torch
-
-from transformer_lens.model_bridge import TransformerBridge
-
-MODEL = "gpt2"
-
-
-@pytest.fixture(scope="module")
-def bridge():
-    return TransformerBridge.boot_transformers(MODEL, device="cpu")
-
-
-def test_hook_fires_on_forward(bridge):
-    """A registered forward hook must fire exactly once per forward pass."""
-    count = 0
-
-    def hook_fn(tensor, hook):
-        nonlocal count
-        count += 1
-        return tensor
-
-    bridge.run_with_hooks(
-        "Hello world",
-        fwd_hooks=[("blocks.0.hook_resid_pre", hook_fn)],
-    )
-    assert count == 1
-
-
-def test_hook_receives_tensor(bridge):
-    """Hook must receive a tensor with batch and sequence dimensions."""
-    captured = {}
-
-    def hook_fn(tensor, hook):
-        captured["shape"] = tensor.shape
-        captured["dtype"] = tensor.dtype
-        return tensor
-
-    bridge.run_with_hooks(
-        "Hello",
-        fwd_hooks=[("blocks.0.hook_resid_pre", hook_fn)],
-    )
-    assert "shape" in captured
-    assert len(captured["shape"]) >= 2  # at least [batch, seq, ...]
-    assert captured["shape"][0] >= 1  # batch >= 1
-
-
-def test_hook_can_modify_output(bridge):
-    """Zeroing a residual stream hook must change the final output."""
-    with torch.no_grad():
-        normal_output = bridge("Hello world")
-
-        def zero_hook(tensor, hook):
-            return torch.zeros_like(tensor)
-
-        modified_output = bridge.run_with_hooks(
-            "Hello world",
-            fwd_hooks=[("blocks.0.hook_resid_pre", zero_hook)],
-        )
-
-    assert not torch.allclose(normal_output, modified_output)
-
-
-def test_run_with_cache_returns_activations(bridge):
-    """run_with_cache must return a non-empty cache with expected keys."""
-    with torch.no_grad():
-        _, cache = bridge.run_with_cache("Hello world")
-
-    assert len(cache) > 0
-    # Must contain at least residual stream hooks
-    cache_keys = list(cache.keys())
-    assert any(
-        "hook_resid" in k for k in cache_keys
-    ), f"No residual stream hooks in cache. Keys: {cache_keys[:10]}"
-
-
-def test_cache_values_are_tensors_with_correct_batch(bridge):
-    """All cached values must be tensors with batch dim matching input."""
-    with torch.no_grad():
-        _, cache = bridge.run_with_cache("Hello")
-
-    for key, value in cache.items():
-        assert isinstance(value, torch.Tensor), f"Cache[{key}] is {type(value)}, not Tensor"
-        assert value.shape[0] == 1, f"Cache[{key}] batch dim is {value.shape[0]}, expected 1"
-
-
-def test_multiple_hooks_fire_independently(bridge):
-    """Multiple hooks on different points must each fire independently."""
-    fired = set()
-
-    def make_hook(name):
-        def hook_fn(tensor, hook):
-            fired.add(name)
-            return tensor
-
-        return hook_fn
-
-    bridge.run_with_hooks(
-        "Hello",
-        fwd_hooks=[
-            ("blocks.0.hook_resid_pre", make_hook("resid_pre_0")),
-            ("blocks.0.hook_resid_post", make_hook("resid_post_0")),
-        ],
-    )
-    assert "resid_pre_0" in fired
-    assert "resid_post_0" in fired
-
-
-@pytest.mark.xfail(reason="add_perma_hook not yet implemented on TransformerBridge")
-def test_perma_hook_persists_across_calls(bridge):
-    """A permanent hook must fire on every forward pass until explicitly removed."""
-    count = 0
-
-    def hook_fn(tensor, hook):
-        nonlocal count
-        count += 1
-        return tensor
-
-    bridge.add_perma_hook("blocks.0.hook_resid_pre", hook_fn)
-    try:
-        with torch.no_grad():
-            bridge("Hello")
-            assert count == 1
-            bridge("World")
-            assert count == 2  # still fires on second call
-    finally:
-        bridge.reset_hooks()
-
-    # After reset, hook should no longer fire
-    count = 0
-    with torch.no_grad():
-        bridge("Hello again")
-    assert count == 0
-
-
-def test_hook_context_manager_cleans_up(bridge):
-    """Hooks added via run_with_hooks must not persist after the call returns."""
-    count = 0
-
-    def hook_fn(tensor, hook):
-        nonlocal count
-        count += 1
-        return tensor
-
-    # Run with hook
-    with torch.no_grad():
-        bridge.run_with_hooks(
-            "Hello",
-            fwd_hooks=[("blocks.0.hook_resid_pre", hook_fn)],
-        )
-    assert count == 1
-
-    # Run again without hooks — count should NOT increase
-    count = 0
-    with torch.no_grad():
-        bridge("Hello")
-    assert count == 0, "Hook persisted after run_with_hooks returned"
-
-
-def test_cache_with_names_filter(bridge):
-    """run_with_cache with names_filter must return only matching keys."""
-    with torch.no_grad():
-        _, full_cache = bridge.run_with_cache("Hello")
-        _, filtered_cache = bridge.run_with_cache(
-            "Hello",
-            names_filter=lambda name: "hook_resid_pre" in name,
-        )
-
-    # Filtered cache should be a strict subset
-    assert len(filtered_cache) > 0
-    assert len(filtered_cache) < len(full_cache)
-    for key in filtered_cache:
-        assert "hook_resid_pre" in key, f"Unexpected key in filtered cache: {key}"
diff --git a/tests/integration/model_bridge/compatibility/test_legacy_hooks.py b/tests/integration/model_bridge/compatibility/test_legacy_hooks.py
deleted file mode 100644
index 8d2b48ea8..000000000
--- a/tests/integration/model_bridge/compatibility/test_legacy_hooks.py
+++ /dev/null
@@ -1,240 +0,0 @@
-"""Legacy hook compatibility tests for TransformerBridge.
-
-This module contains comprehensive tests that verify TransformerBridge provides all the hooks
-that should be available from HookedTransformer for interpretability research, including
-cache compatibility and hook availability tests.
-"""
-
-import pytest
-import torch
-
-from transformer_lens import HookedTransformer
-from transformer_lens.model_bridge import TransformerBridge
-
-
-class TestLegacyHookCompatibility:
-    """Test suite to verify comprehensive hook compatibility for TransformerBridge."""
-
-    @pytest.fixture
-    def model_name(self):
-        """Model name to use for testing."""
-        return "gpt2"
-
-    @pytest.fixture
-    def prompt(self):
-        """Test prompt for cache generation."""
-        return "Hello World!"
-
-    @pytest.fixture
-    def transformer_bridge(self, model_name):
-        """Create a TransformerBridge for testing."""
-        model = TransformerBridge.boot_transformers(model_name, device="cpu")
-        model.enable_compatibility_mode()
-        return model
-
-    @pytest.fixture
-    def hooked_transformer(self, model_name):
-        """Create a HookedTransformer for comparison testing."""
-        return HookedTransformer.from_pretrained(model_name, device="cpu")
-
-    @pytest.fixture
-    def expected_hooks(self):
-        """Get the unified list of hooks that should be available for TransformerBridge testing.
-
-        This includes all hooks that should be present in activation caches and accessible
-        on the model for interpretability research.
-        """
-        return [
-            # Core embedding hooks
-            "hook_embed",
-            "hook_pos_embed",
-            # Block 0 residual stream hooks
-            "blocks.0.hook_resid_pre",
-            "blocks.0.hook_resid_mid",
-            "blocks.0.hook_resid_post",
-            # Layer norm hooks
-            "blocks.0.ln1.hook_scale",
-            "blocks.0.ln1.hook_normalized",
-            "blocks.0.ln2.hook_scale",
-            "blocks.0.ln2.hook_normalized",
-            # Attention hooks
-            "blocks.0.attn.hook_q",
-            "blocks.0.attn.hook_k",
-            "blocks.0.attn.hook_v",
-            "blocks.0.attn.hook_z",
-            "blocks.0.attn.hook_attn_scores",
-            "blocks.0.attn.hook_pattern",
-            "blocks.0.attn.hook_result",
-            # MLP hooks
-            "blocks.0.mlp.hook_pre",
-            "blocks.0.mlp.hook_post",
-            # Output hooks
-            "blocks.0.hook_attn_out",
-            "blocks.0.hook_mlp_out",
-            # Final layer norm hooks
-            "ln_final.hook_scale",
-            "ln_final.hook_normalized",
-            # Hook aliases for commonly used patterns
-            "blocks.0.hook_attn_in",
-            "blocks.0.hook_mlp_in",
-            "blocks.0.hook_q_input",
-            "blocks.0.hook_k_input",
-            "blocks.0.hook_v_input",
-        ]
-
-    def hook_exists_on_model(self, model, hook_path: str) -> bool:
-        """Check if a hook path exists on the model by traversing attributes."""
-        parts = hook_path.split(".")
-        current = model
-
-        try:
-            for part in parts:
-                if "[" in part and "]" in part:
-                    # Handle array indexing like blocks[0]
-                    attr_name = part.split("[")[0]
-                    index = int(part.split("[")[1].split("]")[0])
-                    current = getattr(current, attr_name)[index]
-                else:
-                    current = getattr(current, part)
-
-            # Check if the final object is a HookPoint
-            from transformer_lens.hook_points import HookPoint
-
-            return isinstance(current, HookPoint)
-
-        except (AttributeError, IndexError, TypeError):
-            return False
-
-    def test_cache_hook_names_present(self, transformer_bridge, prompt, expected_hooks):
-        """Test that TransformerBridge cache contains all expected hook names."""
-        _, cache = transformer_bridge.run_with_cache(prompt)
-
-        # Get the actual cache keys
-        actual_keys = list(cache.keys())
-
-        print(f"\nExpected hooks: {len(expected_hooks)}")
-        print(f"Actual hooks: {len(actual_keys)}")
-
-        # Find missing and extra hooks
-        expected_set = set(expected_hooks)
-        actual_set = set(actual_keys)
-
-        missing_hooks = expected_set - actual_set
-        extra_hooks = actual_set - expected_set
-
-        print(f"Missing hooks ({len(missing_hooks)}): {sorted(missing_hooks)}")
-        print(
-            f"Extra hooks ({len(extra_hooks)}): {sorted(list(extra_hooks)[:10])}{'...' if len(extra_hooks) > 10 else ''}"
-        )
-
-        # Check that all expected hooks are present (subset check)
-        # It's okay to have extra hooks - that means more functionality is exposed
-        assert len(missing_hooks) == 0, f"Missing expected hooks: {sorted(missing_hooks)}"
-
-        # Verify we have at least the expected hooks
-        assert all(
-            hook in actual_set for hook in expected_set
-        ), f"Some expected hooks are missing: {missing_hooks}"
-
-    def test_cache_hook_equality_with_hooked_transformer(
-        self, transformer_bridge, hooked_transformer, prompt, expected_hooks
-    ):
-        """Test that TransformerBridge cache values match HookedTransformer cache values.
-
-        Raw attention-score caches intentionally use different masked sentinels:
-        HookedTransformer stores ``-inf`` for masked causal positions, while
-        TransformerBridge preserves HuggingFace's finite additive mask
-        representation using ``torch.finfo(dtype).min``. The unmasked scores and
-        resulting attention pattern should still match within floating-point
-        precision.
-        """
-        _, bridge_cache = transformer_bridge.run_with_cache(prompt)
-        _, hooked_transformer_cache = hooked_transformer.run_with_cache(prompt)
-
-        for hook in expected_hooks:
-            # Skip hooks that might not be present in both models
-            if hook not in bridge_cache or hook not in hooked_transformer_cache:
-                continue
-
-            hooked_transformer_activation = hooked_transformer_cache[hook]
-            bridge_activation = bridge_cache[hook]
-
-            assert hooked_transformer_activation.shape == bridge_activation.shape, (
-                f"Shape mismatch for hook {hook}: "
-                f"HookedTransformer shape {hooked_transformer_activation.shape}, "
-                f"TransformerBridge shape {bridge_activation.shape}"
-            )
-
-            if hook == "blocks.0.attn.hook_attn_scores":
-                masked_positions = torch.isinf(hooked_transformer_activation)
-                unmasked_positions = ~masked_positions
-
-                assert torch.allclose(
-                    hooked_transformer_activation[unmasked_positions],
-                    bridge_activation[unmasked_positions],
-                    atol=1e-4,
-                    rtol=1e-4,
-                ), (
-                    "Unmasked attention scores should match within float32 "
-                    "cross-implementation tolerance"
-                )
-
-                masked_bridge_values = bridge_activation[masked_positions]
-                min_dtype = torch.finfo(bridge_activation.dtype).min
-
-                assert masked_positions.any(), "Expected causal masking in attention scores"
-                assert torch.isfinite(masked_bridge_values).all(), (
-                    "TransformerBridge should keep masked attention scores finite "
-                    "to mirror HuggingFace additive masking semantics"
-                )
-                assert torch.all(masked_bridge_values == min_dtype), (
-                    "Masked TransformerBridge attention scores should use dtype min "
-                    "instead of HookedTransformer's -inf sentinel"
-                )
-                continue
-
-            # Remaining legacy-compatible hooks are finite on this prompt, mean abs diff suffices
-            mean_abs_diff = torch.abs(hooked_transformer_activation - bridge_activation).mean()
-            assert mean_abs_diff < 0.5, (
-                f"Hook {hook} does not match between HookedTransformer and TransformerBridge. "
-                f"Mean absolute difference: {mean_abs_diff}"
-            )
-
-    def test_required_model_hooks_available(self, transformer_bridge, expected_hooks):
-        """Test that TransformerBridge has all required TransformerLens hooks accessible on the model."""
-        # Get expected hooks and assert each one exists
-
-        missing_hooks = []
-        for hook_name in expected_hooks:
-            if not self.hook_exists_on_model(transformer_bridge, hook_name):
-                missing_hooks.append(hook_name)
-
-        assert (
-            len(missing_hooks) == 0
-        ), f"Required hooks are not accessible on TransformerBridge: {sorted(missing_hooks)}"
-
-    def test_cache_completeness_vs_strict_equality(
-        self, transformer_bridge, prompt, expected_hooks
-    ):
-        """Test cache completeness (allowing extra hooks) vs strict equality."""
-        _, cache = transformer_bridge.run_with_cache(prompt)
-        actual_keys = list(cache.keys())
-
-        # Find missing and extra hooks
-        expected_set = set(expected_hooks)
-        actual_set = set(actual_keys)
-
-        missing_hooks = expected_set - actual_set
-        extra_hooks = actual_set - expected_set
-
-        # This test documents the current behavior:
-        # - We require all expected hooks to be present
-        # - We allow extra hooks (they indicate additional functionality)
-        assert len(missing_hooks) == 0, f"Missing expected hooks: {sorted(missing_hooks)}"
-
-        # Log extra hooks for visibility but don't fail
-        if extra_hooks:
-            print(f"Note: Found {len(extra_hooks)} additional hooks beyond expected set")
-            print(
-                f"Additional hooks: {sorted(list(extra_hooks)[:5])}{'...' if len(extra_hooks) > 5 else ''}"
-            )
diff --git a/tests/integration/model_bridge/compatibility/test_weight_processing_compatibility.py b/tests/integration/model_bridge/compatibility/test_weight_processing_compatibility.py
deleted file mode 100644
index c3b754496..000000000
--- a/tests/integration/model_bridge/compatibility/test_weight_processing_compatibility.py
+++ /dev/null
@@ -1,228 +0,0 @@
-#!/usr/bin/env python3
-"""
-Integration Compatibility Test for Weight Processing
-====================================================
-
-This test verifies that:
-1. HookedTransformer with processing matches expected Main Demo values (3.999 → 5.453)
-2. HookedTransformer without processing matches expected unprocessed values (~3.999 → ~4.117)
-3. TransformerBridge with processing matches HookedTransformer with processing
-4. TransformerBridge without processing matches HookedTransformer without processing
-5. Processing maintains mathematical equivalence for baseline computation
-6. Processing changes ablation results as expected (for better interpretability)
-"""
-
-import pytest
-import torch
-from jaxtyping import Float
-
-from transformer_lens import HookedTransformer, utils
-from transformer_lens.model_bridge.bridge import TransformerBridge
-
-
-class TestWeightProcessingCompatibility:
-    """Test class for weight processing compatibility between HookedTransformer and TransformerBridge."""
-
-    @pytest.fixture(scope="class")
-    def model_name(self):
-        return "gpt2"
-
-    @pytest.fixture(scope="class")
-    def device(self):
-        return "cpu"
-
-    @pytest.fixture(scope="class")
-    def test_text(self):
-        return "Natural language processing tasks, such as question answering, machine translation, reading comprehension, and summarization, are typically approached with supervised learning on taskspecific datasets."
-
-    @pytest.fixture(scope="class")
-    def ablation_params(self):
-        return {"layer_to_ablate": 0, "head_index_to_ablate": 8}
-
-    @pytest.fixture(scope="class")
-    def expected_values(self):
-        return {
-            "processed_orig": 3.999,
-            "processed_ablated": 5.453,
-            "unprocessed_orig": 3.999,
-            "unprocessed_ablated": 4.117,
-        }
-
-    @pytest.fixture(scope="class")
-    def tolerance(self):
-        return 0.01
-
-    @pytest.fixture(scope="class")
-    def hooked_processed(self, model_name, device):
-        """Load HookedTransformer with processing."""
-        print("Loading HookedTransformer with processing...")
-        return HookedTransformer.from_pretrained(
-            model_name,
-            device=device,
-            fold_ln=True,
-            center_writing_weights=True,
-            center_unembed=True,
-            fold_value_biases=True,
-        )
-
-    @pytest.fixture(scope="class")
-    def hooked_unprocessed(self, model_name, device):
-        """Load HookedTransformer without processing."""
-        print("Loading HookedTransformer without processing...")
-        return HookedTransformer.from_pretrained_no_processing(model_name, device=device)
-
-    @pytest.fixture(scope="class")
-    def bridge_processed(self, model_name, device):
-        """Load TransformerBridge with processing."""
-        print("Loading TransformerBridge with processing...")
-        bridge = TransformerBridge.boot_transformers(model_name, device=device)
-        bridge.enable_compatibility_mode()  # Enable compatibility mode for hook aliases
-        return bridge
-
-    @pytest.fixture(scope="class")
-    def bridge_unprocessed(self, model_name, device):
-        """Load TransformerBridge without processing."""
-        print("Loading TransformerBridge without processing...")
-        bridge = TransformerBridge.boot_transformers(model_name, device=device)
-        bridge.enable_compatibility_mode(
-            no_processing=True
-        )  # Enable compatibility mode for hook aliases
-        # No processing applied
-        return bridge
-
-    def create_ablation_hook(self, head_index_to_ablate):
-        """Create the exact ablation hook from Main Demo."""
-
-        def head_ablation_hook(
-            value: Float[torch.Tensor, "batch pos head_index d_head"], hook
-        ) -> Float[torch.Tensor, "batch pos head_index d_head"]:
-            value[:, :, head_index_to_ablate, :] = 0.0
-            return value
-
-        return head_ablation_hook
-
-    def _test_model_ablation(self, model, model_name: str, test_text, ablation_params):
-        """Test a model and return original and ablated losses."""
-        tokens = model.to_tokens(test_text)
-
-        # Original loss
-        original_loss = model(tokens, return_type="loss").item()
-
-        # Ablated loss
-        ablated_loss = model.run_with_hooks(
-            tokens,
-            return_type="loss",
-            fwd_hooks=[
-                (
-                    utils.get_act_name("v", ablation_params["layer_to_ablate"]),
-                    self.create_ablation_hook(ablation_params["head_index_to_ablate"]),
-                )
-            ],
-        ).item()
-
-        print(f"{model_name}: Original={original_loss:.6f}, Ablated={ablated_loss:.6f}")
-        return original_loss, ablated_loss
-
-    def test_hooked_transformer_processed_matches_main_demo(
-        self, hooked_processed, test_text, ablation_params, expected_values, tolerance
-    ):
-        """Test that HookedTransformer with processing matches Main Demo values."""
-        orig, ablated = self._test_model_ablation(
-            hooked_processed, "HookedTransformer (processed)", test_text, ablation_params
-        )
-
-        assert (
-            abs(orig - expected_values["processed_orig"]) < tolerance
-        ), f"HookedTransformer processed original loss {orig:.6f} != expected {expected_values['processed_orig']:.3f}"
-        assert (
-            abs(ablated - expected_values["processed_ablated"]) < tolerance
-        ), f"HookedTransformer processed ablated loss {ablated:.6f} != expected {expected_values['processed_ablated']:.3f}"
-
-    def test_hooked_transformer_unprocessed_matches_expected(
-        self, hooked_unprocessed, test_text, ablation_params, expected_values, tolerance
-    ):
-        """Test that HookedTransformer without processing matches expected values."""
-        orig, ablated = self._test_model_ablation(
-            hooked_unprocessed, "HookedTransformer (unprocessed)", test_text, ablation_params
-        )
-
-        assert (
-            abs(orig - expected_values["unprocessed_orig"]) < tolerance
-        ), f"HookedTransformer unprocessed original loss {orig:.6f} != expected {expected_values['unprocessed_orig']:.3f}"
-        assert (
-            abs(ablated - expected_values["unprocessed_ablated"]) < tolerance
-        ), f"HookedTransformer unprocessed ablated loss {ablated:.6f} != expected {expected_values['unprocessed_ablated']:.3f}"
-
-    def test_baseline_mathematical_equivalence(
-        self, hooked_processed, hooked_unprocessed, test_text, ablation_params
-    ):
-        """Test that processing maintains mathematical equivalence for baseline computation."""
-        hooked_proc_orig, _ = self._test_model_ablation(
-            hooked_processed, "HookedTransformer (processed)", test_text, ablation_params
-        )
-        hooked_unproc_orig, _ = self._test_model_ablation(
-            hooked_unprocessed, "HookedTransformer (unprocessed)", test_text, ablation_params
-        )
-
-        orig_diff = abs(hooked_proc_orig - hooked_unproc_orig)
-        assert (
-            orig_diff < 0.001
-        ), f"Baseline computation not mathematically equivalent: diff={orig_diff:.6f}"
-
-    def test_ablation_interpretability_enhancement(
-        self, hooked_processed, hooked_unprocessed, test_text, ablation_params
-    ):
-        """Test that processing changes ablation results as expected for interpretability."""
-        _, hooked_proc_ablated = self._test_model_ablation(
-            hooked_processed, "HookedTransformer (processed)", test_text, ablation_params
-        )
-        _, hooked_unproc_ablated = self._test_model_ablation(
-            hooked_unprocessed, "HookedTransformer (unprocessed)", test_text, ablation_params
-        )
-
-        ablated_diff = abs(hooked_proc_ablated - hooked_unproc_ablated)
-        assert (
-            ablated_diff > 0.5
-        ), f"Ablation results should be significantly different for interpretability: diff={ablated_diff:.6f}"
-
-    @pytest.mark.skip(
-        reason="TransformerBridge processing compatibility has architectural differences that cause large numerical discrepancies"
-    )
-    def test_bridge_processed_matches_hooked_processed(
-        self, bridge_processed, hooked_processed, test_text, ablation_params, tolerance
-    ):
-        """Test that TransformerBridge with processing matches HookedTransformer with processing."""
-        bridge_orig, bridge_ablated = self._test_model_ablation(
-            bridge_processed, "TransformerBridge (processed)", test_text, ablation_params
-        )
-        hooked_orig, hooked_ablated = self._test_model_ablation(
-            hooked_processed, "HookedTransformer (processed)", test_text, ablation_params
-        )
-
-        assert (
-            abs(bridge_orig - hooked_orig) < tolerance
-        ), f"TransformerBridge processed original {bridge_orig:.6f} != HookedTransformer processed {hooked_orig:.6f}"
-        assert (
-            abs(bridge_ablated - hooked_ablated) < tolerance
-        ), f"TransformerBridge processed ablated {bridge_ablated:.6f} != HookedTransformer processed {hooked_ablated:.6f}"
-
-    @pytest.mark.skip(
-        reason="TransformerBridge processing compatibility has architectural differences that cause large numerical discrepancies"
-    )
-    def test_bridge_unprocessed_matches_hooked_unprocessed(
-        self, bridge_unprocessed, hooked_unprocessed, test_text, ablation_params, tolerance
-    ):
-        """Test that TransformerBridge without processing matches HookedTransformer without processing."""
-        bridge_orig, bridge_ablated = self._test_model_ablation(
-            bridge_unprocessed, "TransformerBridge (unprocessed)", test_text, ablation_params
-        )
-        hooked_orig, hooked_ablated = self._test_model_ablation(
-            hooked_unprocessed, "HookedTransformer (unprocessed)", test_text, ablation_params
-        )
-
-        assert (
-            abs(bridge_orig - hooked_orig) < tolerance
-        ), f"TransformerBridge unprocessed original {bridge_orig:.6f} != HookedTransformer unprocessed {hooked_orig:.6f}"
-        assert (
-            abs(bridge_ablated - hooked_ablated) < tolerance
-        ), f"TransformerBridge unprocessed ablated {bridge_ablated:.6f} != HookedTransformer unprocessed {hooked_ablated:.6f}"
diff --git a/tests/integration/model_bridge/conftest.py b/tests/integration/model_bridge/conftest.py
new file mode 100644
index 000000000..23603031f
--- /dev/null
+++ b/tests/integration/model_bridge/conftest.py
@@ -0,0 +1,56 @@
+"""Shared fixtures for model_bridge integration tests.
+
+Session-scoped fixtures avoid redundant model loads across test files.
+All models used here must be in the CI cache (see .github/workflows/checks.yml).
+"""
+
+import pytest
+
+from transformer_lens import HookedTransformer
+from transformer_lens.model_bridge.bridge import TransformerBridge
+
+
+@pytest.fixture(scope="session")
+def distilgpt2_bridge():
+    """TransformerBridge wrapping distilgpt2 (no compatibility mode)."""
+    return TransformerBridge.boot_transformers("distilgpt2", device="cpu")
+
+
+@pytest.fixture(scope="session")
+def distilgpt2_bridge_compat():
+    """TransformerBridge wrapping distilgpt2 with compatibility mode enabled."""
+    bridge = TransformerBridge.boot_transformers("distilgpt2", device="cpu")
+    bridge.enable_compatibility_mode()
+    return bridge
+
+
+@pytest.fixture(scope="session")
+def gpt2_bridge():
+    """TransformerBridge wrapping gpt2 (no compatibility mode)."""
+    return TransformerBridge.boot_transformers("gpt2", device="cpu")
+
+
+@pytest.fixture(scope="session")
+def gpt2_bridge_compat():
+    """TransformerBridge wrapping gpt2 with compatibility mode enabled."""
+    bridge = TransformerBridge.boot_transformers("gpt2", device="cpu")
+    bridge.enable_compatibility_mode()
+    return bridge
+
+
+@pytest.fixture(scope="session")
+def gpt2_hooked_processed():
+    """HookedTransformer gpt2 with default weight processing."""
+    return HookedTransformer.from_pretrained("gpt2", device="cpu")
+
+
+@pytest.fixture(scope="session")
+def gpt2_hooked_unprocessed():
+    """HookedTransformer gpt2 without weight processing."""
+    return HookedTransformer.from_pretrained_no_processing("gpt2", device="cpu")
+
+
+@pytest.fixture(scope="session")
+def distilgpt2_hooked_processed():
+    """HookedTransformer distilgpt2 with default weight processing."""
+    return HookedTransformer.from_pretrained("distilgpt2", device="cpu")
diff --git a/tests/integration/model_bridge/test_attention_hook_compatibility.py b/tests/integration/model_bridge/test_attention_hook_compatibility.py
deleted file mode 100644
index c4a55f7c8..000000000
--- a/tests/integration/model_bridge/test_attention_hook_compatibility.py
+++ /dev/null
@@ -1,138 +0,0 @@
-"""Test attention hook behavior between HookedTransformer and TransformerBridge."""
-
-import pytest
-import torch
-
-from transformer_lens import HookedTransformer
-from transformer_lens.model_bridge.bridge import TransformerBridge
-
-
-class TestAttentionHookCompatibility:
-    """Test attention hook behavior compatibility."""
-
-    @pytest.fixture(scope="class")
-    def models(self):
-        """Create HookedTransformer and TransformerBridge for testing."""
-        # Create reference model (using distilgpt2 for faster tests)
-        reference_model = HookedTransformer.from_pretrained("distilgpt2", device="cpu")
-
-        # Create bridge model
-        bridge = TransformerBridge.boot_transformers("distilgpt2", device="cpu")
-        bridge.enable_compatibility_mode()
-
-        return reference_model, bridge
-
-    @pytest.fixture
-    def test_input(self, models):
-        """Create test input tokens."""
-        reference_model, _ = models
-        test_text = "The cat sat on"
-        return reference_model.to_tokens(test_text)
-
-    def test_hook_shapes_match(self, models, test_input):
-        """Test that attention hooks produce matching activation shapes."""
-        reference_model, bridge = models
-        hook_name = "blocks.0.attn.hook_v"
-
-        # Collect activations from both models
-        ref_activations = []
-        bridge_activations = []
-
-        def collect_ref_hook(activation, hook):
-            ref_activations.append(activation)
-            return activation
-
-        def collect_bridge_hook(activation, hook):
-            bridge_activations.append(activation)
-            return activation
-
-        # Run with hooks
-        reference_model.add_hook(hook_name, collect_ref_hook)
-        bridge.add_hook(hook_name, collect_bridge_hook)
-
-        with torch.no_grad():
-            reference_model(test_input)
-            bridge(test_input)
-
-        # Clean up hooks
-        reference_model.reset_hooks()
-        bridge.reset_hooks()
-
-        # Verify shapes match
-        assert len(ref_activations) == 1, "Reference model should have one activation"
-        assert len(bridge_activations) == 1, "Bridge should have one activation"
-        assert (
-            ref_activations[0].shape == bridge_activations[0].shape
-        ), f"Activation shapes should match: {ref_activations[0].shape} vs {bridge_activations[0].shape}"
-
-    def test_ablation_hook_works(self, models, test_input):
-        """Test that ablation hooks work correctly on both models."""
-        reference_model, bridge = models
-        hook_name = "blocks.0.attn.hook_v"
-
-        def ablation_hook(activation, hook):
-            """Zero out the activation as ablation."""
-            return torch.zeros_like(activation)
-
-        # Test reference model ablation
-        reference_model.add_hook(hook_name, ablation_hook)
-        with torch.no_grad():
-            ref_ablated_loss = reference_model(test_input, return_type="loss")
-        reference_model.reset_hooks()
-
-        # Test bridge ablation
-        bridge.add_hook(hook_name, ablation_hook)
-        with torch.no_grad():
-            bridge_ablated_loss = bridge(test_input, return_type="loss")
-        bridge.reset_hooks()
-
-        # Both ablations should produce reasonable (higher) losses
-        assert (
-            ref_ablated_loss > 3.0
-        ), f"Reference ablated loss should be reasonable: {ref_ablated_loss}"
-        assert (
-            bridge_ablated_loss > 3.0
-        ), f"Bridge ablated loss should be reasonable: {bridge_ablated_loss}"
-
-        # Ablated losses should be close to each other
-        diff = abs(ref_ablated_loss - bridge_ablated_loss)
-        assert diff < 1.0, f"Ablated losses should match closely: {diff}"
-
-    def test_hook_names_available(self, models):
-        """Test that expected hook names are available in both models."""
-        reference_model, bridge = models
-
-        expected_hooks = ["blocks.0.attn.hook_v", "blocks.0.attn.hook_q", "blocks.0.attn.hook_k"]
-
-        # Check reference model hooks
-        ref_hook_names = set(reference_model.hook_dict.keys())
-        for hook_name in expected_hooks:
-            assert hook_name in ref_hook_names, f"Reference model missing hook: {hook_name}"
-
-        # Check bridge hooks
-        bridge_hook_names = set(bridge.hook_dict.keys())
-        for hook_name in expected_hooks:
-            assert hook_name in bridge_hook_names, f"Bridge missing hook: {hook_name}"
-
-    def test_hook_error_handling(self, models, test_input):
-        """Test that hook errors are handled gracefully."""
-        reference_model, bridge = models
-        hook_name = "blocks.0.attn.hook_v"
-
-        def error_hook(activation, hook):
-            """Hook that raises an error."""
-            raise ValueError("Test error in hook")
-
-        # Test error handling in reference model
-        reference_model.add_hook(hook_name, error_hook)
-        with pytest.raises(ValueError, match="Test error in hook"):
-            with torch.no_grad():
-                reference_model(test_input)
-        reference_model.reset_hooks()
-
-        # Test error handling in bridge
-        bridge.add_hook(hook_name, error_hook)
-        with pytest.raises(ValueError, match="Test error in hook"):
-            with torch.no_grad():
-                bridge(test_input)
-        bridge.reset_hooks()
diff --git a/tests/integration/model_bridge/test_bridge_generation.py b/tests/integration/model_bridge/test_bridge_generation.py
new file mode 100644
index 000000000..22d2fbac2
--- /dev/null
+++ b/tests/integration/model_bridge/test_bridge_generation.py
@@ -0,0 +1,121 @@
+"""Test TransformerBridge text generation capabilities.
+
+Covers greedy generation, temperature sampling, and HuggingFace parity.
+Uses distilgpt2 (CI-cached).
+"""
+
+import pytest
+import torch
+
+from transformer_lens.model_bridge.bridge import TransformerBridge
+
+
+@pytest.fixture(scope="module")
+def bridge():
+    """TransformerBridge wrapping distilgpt2."""
+    return TransformerBridge.boot_transformers("distilgpt2", device="cpu")
+
+
+@pytest.fixture(scope="module")
+def bridge_compat():
+    """TransformerBridge wrapping distilgpt2 with compatibility mode."""
+    b = TransformerBridge.boot_transformers("distilgpt2", device="cpu")
+    b.enable_compatibility_mode()
+    return b
+
+
+class TestGreedyGeneration:
+    """Test deterministic greedy generation."""
+
+    def test_greedy_produces_tokens(self, bridge):
+        """Greedy generation should produce additional tokens."""
+        tokens = bridge.to_tokens("The quick brown")
+        with torch.no_grad():
+            output = bridge.generate(tokens, max_new_tokens=5, temperature=0.0, do_sample=False)
+        assert output.shape[1] > tokens.shape[1], "Should generate additional tokens"
+
+    def test_greedy_is_deterministic(self, bridge):
+        """Two greedy runs should produce identical output."""
+        tokens = bridge.to_tokens("Hello world")
+        with torch.no_grad():
+            out1 = bridge.generate(tokens, max_new_tokens=5, temperature=0.0, do_sample=False)
+            out2 = bridge.generate(tokens, max_new_tokens=5, temperature=0.0, do_sample=False)
+        assert torch.equal(out1, out2), "Greedy generation should be deterministic"
+
+    def test_greedy_output_decodable(self, bridge):
+        """Generated tokens should decode to valid text."""
+        tokens = bridge.to_tokens("The meaning of life")
+        with torch.no_grad():
+            output = bridge.generate(tokens, max_new_tokens=10, temperature=0.0, do_sample=False)
+        text = bridge.to_string(output[0])
+        assert isinstance(text, str)
+        assert len(text) > len("The meaning of life")
+
+
+class TestSamplingGeneration:
+    """Test generation with sampling."""
+
+    def test_temperature_affects_output(self, bridge):
+        """Different temperatures should (usually) produce different outputs."""
+        tokens = bridge.to_tokens("Once upon a time")
+        torch.manual_seed(42)
+        with torch.no_grad():
+            out_low = bridge.generate(tokens, max_new_tokens=10, temperature=0.1, do_sample=True)
+        torch.manual_seed(42)
+        with torch.no_grad():
+            out_high = bridge.generate(tokens, max_new_tokens=10, temperature=2.0, do_sample=True)
+        # With very different temperatures, outputs should differ
+        # (not guaranteed but extremely likely with 10 tokens)
+        # Just verify both produce valid output
+        assert out_low.shape[1] > tokens.shape[1]
+        assert out_high.shape[1] > tokens.shape[1]
+
+    def test_top_k_limits_vocabulary(self, bridge):
+        """top_k generation should produce valid tokens."""
+        tokens = bridge.to_tokens("The cat")
+        torch.manual_seed(123)
+        with torch.no_grad():
+            output = bridge.generate(
+                tokens, max_new_tokens=5, temperature=1.0, do_sample=True, top_k=10
+            )
+        assert output.shape[1] > tokens.shape[1]
+        # All token IDs should be valid
+        assert (output >= 0).all()
+        assert (output < bridge.cfg.d_vocab).all()
+
+
+class TestGenerationWithCompatMode:
+    """Test generation works with compatibility mode enabled."""
+
+    def test_compat_greedy_matches_non_compat(self, bridge, bridge_compat):
+        """Greedy generation should match between compat and non-compat modes."""
+        tokens = bridge.to_tokens("Natural language")
+        with torch.no_grad():
+            out_plain = bridge.generate(tokens, max_new_tokens=5, temperature=0.0, do_sample=False)
+            out_compat = bridge_compat.generate(
+                tokens, max_new_tokens=5, temperature=0.0, do_sample=False
+            )
+        # With weight processing, outputs may differ slightly but both should be valid
+        assert out_plain.shape[1] > tokens.shape[1]
+        assert out_compat.shape[1] > tokens.shape[1]
+
+
+class TestGenerationEdgeCases:
+    """Test generation edge cases."""
+
+    def test_single_token_input(self, bridge):
+        """Generation from a single token should work."""
+        tokens = bridge.to_tokens("Hello")
+        with torch.no_grad():
+            output = bridge.generate(tokens, max_new_tokens=3, temperature=0.0, do_sample=False)
+        assert output.shape[1] > tokens.shape[1]
+
+    def test_max_new_tokens_respected(self, bridge):
+        """Output should not exceed input + max_new_tokens."""
+        tokens = bridge.to_tokens("Test")
+        max_new = 5
+        with torch.no_grad():
+            output = bridge.generate(
+                tokens, max_new_tokens=max_new, temperature=0.0, do_sample=False
+            )
+        assert output.shape[1] <= tokens.shape[1] + max_new
diff --git a/tests/integration/model_bridge/test_bridge_vs_hooked_comparison.py b/tests/integration/model_bridge/test_bridge_vs_hooked_comparison.py
index 195c6cb66..d59e85db2 100644
--- a/tests/integration/model_bridge/test_bridge_vs_hooked_comparison.py
+++ b/tests/integration/model_bridge/test_bridge_vs_hooked_comparison.py
@@ -41,7 +41,7 @@ def test_texts(self):
             "This is a longer sentence with more tokens to test the models thoroughly.",
         ]
 
-    @pytest.mark.skip(reason="Bridge vs Hooked comparison failing due to architectural differences")
+    @pytest.mark.slow
     def test_loss_comparison_multiple_texts(self, models_with_processing, test_texts):
         """Test loss comparison across multiple text samples."""
         hooked, bridge = models_with_processing
@@ -56,13 +56,13 @@ def test_loss_comparison_multiple_texts(self, models_with_processing, test_texts
                 diff < 0.01
             ), f"Loss difference too large for '{text}': {diff} (hooked: {hooked_loss}, bridge: {bridge_loss})"
 
-            # Both should have reasonable losses
+            # Both should have reasonable losses (single-token inputs can have high loss)
             assert (
-                2.0 < hooked_loss < 8.0
+                0.0 < hooked_loss < 15.0
             ), f"HookedTransformer loss unreasonable for '{text}': {hooked_loss}"
-            assert 2.0 < bridge_loss < 8.0, f"Bridge loss unreasonable for '{text}': {bridge_loss}"
+            assert 0.0 < bridge_loss < 15.0, f"Bridge loss unreasonable for '{text}': {bridge_loss}"
 
-    @pytest.mark.skip(reason="Bridge vs Hooked comparison failing due to architectural differences")
+    @pytest.mark.slow
     def test_logits_comparison(self, models_with_processing):
         """Test that logits match between models."""
         hooked, bridge = models_with_processing
@@ -89,45 +89,29 @@ def test_logits_comparison(self, models_with_processing):
         ), f"HookedTransformer logits std should be reasonable: {hooked_std}"
         assert 1.0 < bridge_std < 10.0, f"Bridge logits std should be reasonable: {bridge_std}"
 
-    @pytest.mark.skip(reason="Bridge vs Hooked comparison failing due to architectural differences")
+    @pytest.mark.slow
     def test_attention_output_comparison(self, models_with_processing):
-        """Test attention layer outputs match."""
+        """Test attention outputs match via cached activations."""
         hooked, bridge = models_with_processing
         test_text = "Attention test"
 
-        # Get embeddings and inputs
-        tokens = hooked.to_tokens(test_text)
-
-        # HookedTransformer attention
-        hooked_embed = hooked.embed(tokens)
-        hooked_pos_embed = hooked.pos_embed(tokens)
-        hooked_input = hooked_embed + hooked_pos_embed
-
-        # Bridge attention (needs position indices)
-        bridge_embed = bridge.embed(tokens)
-        batch_size, seq_len = tokens.shape[:2]
-        position_indices = torch.arange(seq_len, device=tokens.device, dtype=torch.long)
-        position_indices = position_indices.unsqueeze(0).expand(batch_size, -1)
-        bridge_pos_embed = bridge.pos_embed(position_indices)
-        bridge_input = bridge_embed + bridge_pos_embed
-
-        # Inputs should be very close
-        input_diff = (hooked_input - bridge_input).abs().max()
-        assert input_diff < 0.01, f"Embedding inputs should match: {input_diff}"
-
-        # Test first layer attention directly
+        # Use run_with_cache to capture attention outputs cleanly
         with torch.no_grad():
-            hooked_attn_out = hooked.blocks[0].attn(hooked_input)
-            bridge_attn_out = bridge.blocks[0].attn(bridge_input)
+            _, hooked_cache = hooked.run_with_cache(test_text)
+            _, bridge_cache = bridge.run_with_cache(test_text)
 
-            # Handle potential tuple output from bridge
-            if isinstance(bridge_attn_out, tuple):
-                bridge_attn_out = bridge_attn_out[0]
+        # Compare first layer attention output (hook_z = pre-output-projection attention)
+        hooked_attn_out = hooked_cache["blocks.0.attn.hook_z"]
+        bridge_attn_out = bridge_cache["blocks.0.attn.hook_z"]
+
+        assert (
+            hooked_attn_out.shape == bridge_attn_out.shape
+        ), f"Attention output shapes should match: {hooked_attn_out.shape} vs {bridge_attn_out.shape}"
 
         attn_diff = (hooked_attn_out - bridge_attn_out).abs().max()
-        assert attn_diff < 0.1, f"Attention outputs should be reasonably close: {attn_diff}"
+        assert attn_diff < 0.01, f"Attention outputs should match closely: {attn_diff}"
 
-    @pytest.mark.skip(reason="Bridge vs Hooked comparison failing due to architectural differences")
+    @pytest.mark.slow
     def test_hook_v_values_match(self, models_with_processing):
         """Test that hook_v values match between models."""
         hooked, bridge = models_with_processing
@@ -168,32 +152,33 @@ def collect_bridge_v(activation, hook):
         ), f"V shapes should match: {hooked_v.shape} vs {bridge_v.shape}"
 
         v_diff = (hooked_v - bridge_v).abs().max()
-        # V values might not match exactly due to different computation paths
-        assert v_diff < 1.0, f"V values should be reasonably close: {v_diff}"
+        # Observed: 0.000000 for distilgpt2 with matching weight processing
+        assert v_diff < 0.01, f"V values should match closely: {v_diff}"
 
-    @pytest.mark.skip(reason="Bridge vs Hooked comparison failing due to architectural differences")
+    @pytest.mark.slow
     def test_generation_consistency(self, models_with_processing):
         """Test that text generation is consistent."""
         hooked, bridge = models_with_processing
         prompt = "The future of AI"
+        tokens = hooked.to_tokens(prompt)
 
-        # Generate from both models
+        # Generate from both models using token input
         with torch.no_grad():
             hooked_tokens = hooked.generate(
-                prompt, max_new_tokens=5, temperature=0.0, do_sample=False
+                tokens, max_new_tokens=5, temperature=0.0, do_sample=False
             )
             bridge_tokens = bridge.generate(
-                prompt, max_new_tokens=5, temperature=0.0, do_sample=False
+                tokens, max_new_tokens=5, temperature=0.0, do_sample=False
             )
 
         # Convert to text for comparison
         hooked_text = hooked.to_string(hooked_tokens[0])
         bridge_text = bridge.to_string(bridge_tokens[0])
 
-        # Should generate very similar or identical text (deterministic generation)
-        # Allow some flexibility as generation might have slight numerical differences
-        assert len(hooked_text) > len(prompt), "HookedTransformer should generate additional tokens"
-        assert len(bridge_text) > len(prompt), "Bridge should generate additional tokens"
+        # Deterministic generation should produce identical output
+        assert (
+            hooked_text == bridge_text
+        ), f"Generation should match:\n  hooked: {repr(hooked_text)}\n  bridge: {repr(bridge_text)}"
 
     def test_batch_processing(self, models_with_processing):
         """Test batch processing works correctly for both models."""
diff --git a/tests/integration/model_bridge/test_bridge_wiring_integration.py b/tests/integration/model_bridge/test_bridge_wiring_integration.py
new file mode 100644
index 000000000..9828bfd2d
--- /dev/null
+++ b/tests/integration/model_bridge/test_bridge_wiring_integration.py
@@ -0,0 +1,141 @@
+"""Integration tests for bridge internal wiring with real models.
+
+Verifies that:
+- Reshaped attention biases produce correct computation (not just correct shapes)
+- Adapter path translations resolve to actual weight tensors with expected properties
+
+Uses distilgpt2 (CI-cached).
+"""
+
+import pytest
+import torch
+
+from transformer_lens import HookedTransformer
+from transformer_lens.model_bridge.bridge import TransformerBridge
+
+
+@pytest.fixture(scope="module")
+def bridge_compat():
+    b = TransformerBridge.boot_transformers("distilgpt2", device="cpu")
+    b.enable_compatibility_mode()
+    return b
+
+
+@pytest.fixture(scope="module")
+def reference_ht():
+    return HookedTransformer.from_pretrained("distilgpt2", device="cpu")
+
+
+class TestReshapeBiasIntegration:
+    """Verify reshaped biases produce correct attention computation on a real model."""
+
+    def test_reshaped_b_Q_produces_matching_hook_q(self, bridge_compat, reference_ht):
+        """b_Q reshaped via _reshape_bias should produce hook_q values matching HookedTransformer."""
+        text = "The quick brown fox"
+
+        with torch.no_grad():
+            _, ht_cache = reference_ht.run_with_cache(text)
+            _, br_cache = bridge_compat.run_with_cache(text)
+
+        ht_q = ht_cache["blocks.0.attn.hook_q"]
+        br_q = br_cache["blocks.0.attn.hook_q"]
+
+        assert ht_q.shape == br_q.shape, f"hook_q shapes differ: {ht_q.shape} vs {br_q.shape}"
+        max_diff = (ht_q - br_q).abs().max().item()
+        assert max_diff < 1e-4, (
+            f"hook_q values differ by {max_diff:.6f} — " f"bias reshaping may be incorrect"
+        )
+
+    def test_reshaped_b_V_produces_matching_hook_v(self, bridge_compat, reference_ht):
+        """b_V reshaped via _reshape_bias should produce hook_v values matching HookedTransformer."""
+        text = "The quick brown fox"
+
+        with torch.no_grad():
+            _, ht_cache = reference_ht.run_with_cache(text)
+            _, br_cache = bridge_compat.run_with_cache(text)
+
+        ht_v = ht_cache["blocks.0.attn.hook_v"]
+        br_v = br_cache["blocks.0.attn.hook_v"]
+
+        assert ht_v.shape == br_v.shape, f"hook_v shapes differ: {ht_v.shape} vs {br_v.shape}"
+        max_diff = (ht_v - br_v).abs().max().item()
+        assert max_diff < 1e-4, (
+            f"hook_v values differ by {max_diff:.6f} — " f"bias reshaping may be incorrect"
+        )
+
+
+class TestAdapterPathResolution:
+    """Verify adapter path translations resolve to real weight tensors."""
+
+    def test_embed_path_resolves_to_weight(self, bridge_compat):
+        """embed.W_E should resolve to a real embedding weight tensor."""
+        W_E = bridge_compat.embed.W_E
+        assert W_E is not None
+        assert W_E.ndim == 2
+        assert W_E.shape == (bridge_compat.cfg.d_vocab, bridge_compat.cfg.d_model)
+        assert not torch.isnan(W_E).any()
+        assert W_E.std() > 0, "Embedding weights should not be all zeros"
+
+    def test_unembed_path_resolves_to_weight(self, bridge_compat):
+        """unembed.W_U should resolve to a real unembedding weight tensor."""
+        W_U = bridge_compat.unembed.W_U
+        assert W_U is not None
+        assert W_U.ndim == 2
+        assert W_U.shape == (bridge_compat.cfg.d_model, bridge_compat.cfg.d_vocab)
+        assert not torch.isnan(W_U).any()
+        assert W_U.std() > 0
+
+    def test_attention_weight_paths_resolve(self, bridge_compat):
+        """W_Q, W_K, W_V, W_O per-block should resolve to real weight tensors."""
+        cfg = bridge_compat.cfg
+        block = bridge_compat.blocks[0]
+
+        W_Q = block.attn.W_Q
+        assert W_Q is not None
+        assert W_Q.shape == (cfg.n_heads, cfg.d_model, cfg.d_head)
+        assert not torch.isnan(W_Q).any()
+
+        W_K = block.attn.W_K
+        assert W_K.shape == (cfg.n_heads, cfg.d_model, cfg.d_head)
+
+        W_V = block.attn.W_V
+        assert W_V.shape == (cfg.n_heads, cfg.d_model, cfg.d_head)
+
+        W_O = block.attn.W_O
+        assert W_O.shape == (cfg.n_heads, cfg.d_head, cfg.d_model)
+
+    def test_mlp_weight_paths_resolve(self, bridge_compat):
+        """MLP weight paths should resolve to real weight tensors."""
+        block = bridge_compat.blocks[0]
+
+        W_in = block.mlp.W_in
+        assert W_in is not None
+        assert W_in.ndim == 2
+        assert not torch.isnan(W_in).any()
+        assert W_in.std() > 0
+
+        W_out = block.mlp.W_out
+        assert W_out is not None
+        assert W_out.ndim == 2
+        assert not torch.isnan(W_out).any()
+
+    def test_stacked_weight_properties_match_per_block(self, bridge_compat):
+        """Stacked W_Q property should match per-block W_Q values."""
+        stacked_W_Q = bridge_compat.W_Q  # [n_layers, n_heads, d_model, d_head]
+        block0_W_Q = bridge_compat.blocks[0].attn.W_Q  # [n_heads, d_model, d_head]
+
+        assert torch.allclose(
+            stacked_W_Q[0], block0_W_Q, atol=1e-6
+        ), "Stacked W_Q[0] should match blocks[0].attn.W_Q"
+
+    def test_translated_paths_match_hf_weights(self, bridge_compat):
+        """Bridge weight properties should contain the same data as the underlying HF model."""
+        hf_model = bridge_compat.original_model
+
+        # distilgpt2 embedding: transformer.wte.weight
+        hf_embed = hf_model.transformer.wte.weight
+        bridge_embed = bridge_compat.embed.W_E
+
+        assert torch.equal(
+            hf_embed, bridge_embed
+        ), "Bridge embed.W_E should be the same tensor as HF transformer.wte.weight"
diff --git a/tests/integration/model_bridge/test_weight_processing.py b/tests/integration/model_bridge/test_weight_processing.py
new file mode 100644
index 000000000..3eaa91e64
--- /dev/null
+++ b/tests/integration/model_bridge/test_weight_processing.py
@@ -0,0 +1,320 @@
+#!/usr/bin/env python3
+"""Consolidated weight processing tests for TransformerBridge.
+
+Tests flag combinations, regression anchors, and bridge-vs-HT parity.
+Consolidates:
+- test_weight_processing_combinations.py (flag matrix + ablation effects)
+- compatibility/test_weight_processing_compatibility.py (Main Demo regression anchors)
+
+Uses distilgpt2 for fast flag matrix tests and gpt2 for Main Demo regression anchors.
+"""
+
+import pytest
+import torch
+from jaxtyping import Float
+
+from transformer_lens import HookedTransformer, utils
+from transformer_lens.model_bridge import TransformerBridge
+
+# ---------------------------------------------------------------------------
+# Flag combination matrix (distilgpt2 for speed)
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.parametrize(
+    "fold_ln,center_writing_weights,center_unembed,fold_value_biases,expected_close_match",
+    [
+        # Test critical combinations only to speed up CI
+        (False, False, False, False, True),  # No processing
+        (True, False, False, False, True),  # Only fold_ln (most important)
+        (True, True, False, False, True),  # fold_ln + center_writing (common combo)
+        (True, True, True, True, True),  # All processing (default)
+        # Extended flag combinations (mark test with @pytest.mark.slow to skip in fast CI runs)
+        pytest.param(
+            False, True, False, False, True, marks=pytest.mark.slow
+        ),  # Only center_writing
+        pytest.param(
+            False, False, True, False, True, marks=pytest.mark.slow
+        ),  # Only center_unembed
+        pytest.param(
+            False, False, False, True, True, marks=pytest.mark.slow
+        ),  # Only fold_value_biases
+        pytest.param(
+            True, False, True, False, True, marks=pytest.mark.slow
+        ),  # fold_ln + center_unembed
+        pytest.param(
+            True, False, False, True, True, marks=pytest.mark.slow
+        ),  # fold_ln + fold_value_biases
+        pytest.param(
+            False, True, True, False, True, marks=pytest.mark.slow
+        ),  # center_writing + center_unembed
+        pytest.param(
+            True, True, True, False, True, marks=pytest.mark.slow
+        ),  # All except fold_value_biases
+        pytest.param(
+            True, True, False, True, True, marks=pytest.mark.slow
+        ),  # All except center_unembed
+        pytest.param(
+            True, False, True, True, True, marks=pytest.mark.slow
+        ),  # All except center_writing
+        pytest.param(False, True, True, True, True, marks=pytest.mark.slow),  # All except fold_ln
+    ],
+)
+def test_weight_processing_flag_combinations(
+    fold_ln, center_writing_weights, center_unembed, fold_value_biases, expected_close_match
+):
+    """Test that different combinations of weight processing flags work correctly."""
+    device = "cpu"
+    model_name = "distilgpt2"
+    test_text = "Natural language processing"
+
+    # Get reference values from HookedTransformer with same settings
+    reference_ht = HookedTransformer.from_pretrained(
+        model_name,
+        device=device,
+        fold_ln=fold_ln,
+        center_writing_weights=center_writing_weights,
+        center_unembed=center_unembed,
+        fold_value_biases=fold_value_biases,
+        refactor_factored_attn_matrices=False,
+    )
+
+    ref_loss = reference_ht(test_text, return_type="loss")
+
+    # Test ablation effect
+    hook_name = utils.get_act_name("v", 0)
+
+    def ablation_hook(activation, hook):
+        activation[:, :, 8, :] = 0  # Ablate head 8 in layer 0
+        return activation
+
+    ref_ablated_loss = reference_ht.run_with_hooks(
+        test_text, return_type="loss", fwd_hooks=[(hook_name, ablation_hook)]
+    )
+    ref_ablation_effect = ref_ablated_loss - ref_loss
+
+    # Create TransformerBridge and apply weight processing
+    bridge = TransformerBridge.boot_transformers(model_name, device=device)
+    bridge.process_weights(
+        fold_ln=fold_ln,
+        center_writing_weights=center_writing_weights,
+        center_unembed=center_unembed,
+        fold_value_biases=fold_value_biases,
+        refactor_factored_attn_matrices=False,
+    )
+    bridge.enable_compatibility_mode()
+
+    # Test baseline inference
+    bridge_loss = bridge(test_text, return_type="loss")
+
+    # Test ablation with bridge
+    bridge_ablated_loss = bridge.run_with_hooks(
+        test_text, return_type="loss", fwd_hooks=[(hook_name, ablation_hook)]
+    )
+    bridge_ablation_effect = bridge_ablated_loss - bridge_loss
+
+    # Compare results
+    loss_diff = abs(bridge_loss - ref_loss)
+    effect_diff = abs(bridge_ablation_effect - ref_ablation_effect)
+
+    # Assertions
+    # Observed values (distilgpt2, 2026-04-07):
+    #   Loss diffs: all < 0.00002 across all flag combos
+    #   Effect diffs: ~0.133 for partial processing, ~0.000001 for full processing
+    #   The partial-processing effect mismatch is due to different V hook capture
+    #   points between bridge and HookedTransformer in non-fully-processed mode.
+    if expected_close_match:
+        assert loss_diff < 0.01, f"Baseline loss difference too large: {loss_diff:.6f}"
+        assert effect_diff < 0.5, f"Ablation effect difference too large: {effect_diff:.6f}"
+
+    # Ensure model produces reasonable outputs
+    assert not torch.isnan(bridge_loss), "Bridge produced NaN loss"
+    assert not torch.isinf(bridge_loss), "Bridge produced infinite loss"
+
+
+def test_no_processing_matches_unprocessed_hooked_transformer():
+    """Test that no processing flag matches HookedTransformer loaded without processing."""
+    device = "cpu"
+    model_name = "distilgpt2"
+    test_text = "Natural language processing"
+
+    unprocessed_ht = HookedTransformer.from_pretrained_no_processing(model_name, device=device)
+    unprocessed_loss = unprocessed_ht(test_text, return_type="loss")
+
+    bridge = TransformerBridge.boot_transformers(model_name, device=device)
+    bridge.process_weights(
+        fold_ln=False,
+        center_writing_weights=False,
+        center_unembed=False,
+        fold_value_biases=False,
+        refactor_factored_attn_matrices=False,
+    )
+    bridge.enable_compatibility_mode()
+    bridge_loss = bridge(test_text, return_type="loss")
+
+    # Observed: < 0.00002 for distilgpt2
+    loss_diff = abs(bridge_loss - unprocessed_loss)
+    assert loss_diff < 0.01, f"Unprocessed models should match closely: {loss_diff:.6f}"
+
+
+def test_all_processing_matches_default_hooked_transformer():
+    """Test that all processing flags match default HookedTransformer behavior."""
+    device = "cpu"
+    model_name = "distilgpt2"
+    test_text = "Natural language processing"
+
+    default_ht = HookedTransformer.from_pretrained(model_name, device=device)
+    default_loss = default_ht(test_text, return_type="loss")
+
+    bridge = TransformerBridge.boot_transformers(model_name, device=device)
+    bridge.enable_compatibility_mode()
+    bridge_loss = bridge(test_text, return_type="loss")
+
+    loss_diff = abs(bridge_loss - default_loss)
+    assert loss_diff < 0.01, f"Fully processed models should match closely: {loss_diff:.6f}"
+
+
+# ---------------------------------------------------------------------------
+# Main Demo regression anchors (gpt2 — matches published demo values)
+# ---------------------------------------------------------------------------
+
+# Expected values from the TransformerLens Main Demo notebook
+MAIN_DEMO_TEXT = "Natural language processing tasks, such as question answering, machine translation, reading comprehension, and summarization, are typically approached with supervised learning on taskspecific datasets."
+MAIN_DEMO_LAYER = 0
+MAIN_DEMO_HEAD = 8
+EXPECTED_PROCESSED_ORIG = 3.999
+EXPECTED_PROCESSED_ABLATED = 5.453
+EXPECTED_UNPROCESSED_ORIG = 3.999
+EXPECTED_UNPROCESSED_ABLATED = 4.117
+REGRESSION_TOLERANCE = 0.01
+
+
+def _run_ablation(model, text, layer, head):
+    """Run baseline + ablation and return (orig_loss, ablated_loss)."""
+    tokens = model.to_tokens(text)
+
+    def ablation_hook(
+        value: Float[torch.Tensor, "batch pos head_index d_head"], hook
+    ) -> Float[torch.Tensor, "batch pos head_index d_head"]:
+        value[:, :, head, :] = 0.0
+        return value
+
+    hook_name = utils.get_act_name("v", layer)
+    orig = model(tokens, return_type="loss").item()
+    ablated = model.run_with_hooks(
+        tokens, return_type="loss", fwd_hooks=[(hook_name, ablation_hook)]
+    ).item()
+    return orig, ablated
+
+
+class TestMainDemoRegression:
+    """Regression anchors from the TransformerLens Main Demo.
+
+    These tests pin the exact loss values produced by gpt2 with and without
+    weight processing, ensuring that changes to weight processing code don't
+    silently shift the numbers that published notebooks depend on.
+    """
+
+    @pytest.fixture(scope="class")
+    def hooked_processed(self):
+        return HookedTransformer.from_pretrained("gpt2", device="cpu")
+
+    @pytest.fixture(scope="class")
+    def hooked_unprocessed(self):
+        return HookedTransformer.from_pretrained_no_processing("gpt2", device="cpu")
+
+    @pytest.fixture(scope="class")
+    def bridge_processed(self):
+        bridge = TransformerBridge.boot_transformers("gpt2", device="cpu")
+        bridge.enable_compatibility_mode()
+        return bridge
+
+    @pytest.fixture(scope="class")
+    def bridge_unprocessed(self):
+        bridge = TransformerBridge.boot_transformers("gpt2", device="cpu")
+        bridge.enable_compatibility_mode(no_processing=True)
+        return bridge
+
+    def test_hooked_processed_matches_main_demo(self, hooked_processed):
+        """HookedTransformer with processing should match Main Demo values."""
+        orig, ablated = _run_ablation(
+            hooked_processed, MAIN_DEMO_TEXT, MAIN_DEMO_LAYER, MAIN_DEMO_HEAD
+        )
+        assert (
+            abs(orig - EXPECTED_PROCESSED_ORIG) < REGRESSION_TOLERANCE
+        ), f"Processed orig {orig:.6f} != expected {EXPECTED_PROCESSED_ORIG}"
+        assert (
+            abs(ablated - EXPECTED_PROCESSED_ABLATED) < REGRESSION_TOLERANCE
+        ), f"Processed ablated {ablated:.6f} != expected {EXPECTED_PROCESSED_ABLATED}"
+
+    def test_hooked_unprocessed_matches_expected(self, hooked_unprocessed):
+        """HookedTransformer without processing should match expected values."""
+        orig, ablated = _run_ablation(
+            hooked_unprocessed, MAIN_DEMO_TEXT, MAIN_DEMO_LAYER, MAIN_DEMO_HEAD
+        )
+        assert (
+            abs(orig - EXPECTED_UNPROCESSED_ORIG) < REGRESSION_TOLERANCE
+        ), f"Unprocessed orig {orig:.6f} != expected {EXPECTED_UNPROCESSED_ORIG}"
+        assert (
+            abs(ablated - EXPECTED_UNPROCESSED_ABLATED) < REGRESSION_TOLERANCE
+        ), f"Unprocessed ablated {ablated:.6f} != expected {EXPECTED_UNPROCESSED_ABLATED}"
+
+    def test_processing_preserves_baseline(self, hooked_processed, hooked_unprocessed):
+        """Processing should not change baseline loss (mathematical equivalence)."""
+        proc_orig, _ = _run_ablation(
+            hooked_processed, MAIN_DEMO_TEXT, MAIN_DEMO_LAYER, MAIN_DEMO_HEAD
+        )
+        unproc_orig, _ = _run_ablation(
+            hooked_unprocessed, MAIN_DEMO_TEXT, MAIN_DEMO_LAYER, MAIN_DEMO_HEAD
+        )
+        assert (
+            abs(proc_orig - unproc_orig) < 0.001
+        ), f"Baseline not mathematically equivalent: {proc_orig:.6f} vs {unproc_orig:.6f}"
+
+    def test_processing_enhances_ablation_signal(self, hooked_processed, hooked_unprocessed):
+        """Processing should increase the ablation effect (better interpretability)."""
+        _, proc_ablated = _run_ablation(
+            hooked_processed, MAIN_DEMO_TEXT, MAIN_DEMO_LAYER, MAIN_DEMO_HEAD
+        )
+        _, unproc_ablated = _run_ablation(
+            hooked_unprocessed, MAIN_DEMO_TEXT, MAIN_DEMO_LAYER, MAIN_DEMO_HEAD
+        )
+        diff = abs(proc_ablated - unproc_ablated)
+        assert diff > 0.5, (
+            f"Processing should significantly change ablation: "
+            f"processed={proc_ablated:.6f}, unprocessed={unproc_ablated:.6f}, diff={diff:.6f}"
+        )
+
+    def test_bridge_processed_matches_hooked_processed(self, bridge_processed, hooked_processed):
+        """TransformerBridge with processing should match HookedTransformer."""
+        br_orig, br_ablated = _run_ablation(
+            bridge_processed, MAIN_DEMO_TEXT, MAIN_DEMO_LAYER, MAIN_DEMO_HEAD
+        )
+        ht_orig, ht_ablated = _run_ablation(
+            hooked_processed, MAIN_DEMO_TEXT, MAIN_DEMO_LAYER, MAIN_DEMO_HEAD
+        )
+        # Observed: 0.000000 diff for gpt2 (2026-04-07)
+        assert (
+            abs(br_orig - ht_orig) < REGRESSION_TOLERANCE
+        ), f"Bridge processed orig {br_orig:.6f} != HT {ht_orig:.6f}"
+        assert (
+            abs(br_ablated - ht_ablated) < REGRESSION_TOLERANCE
+        ), f"Bridge processed ablated {br_ablated:.6f} != HT {ht_ablated:.6f}"
+
+    def test_bridge_unprocessed_matches_hooked_unprocessed(
+        self, bridge_unprocessed, hooked_unprocessed
+    ):
+        """TransformerBridge without processing should match HookedTransformer."""
+        br_orig, br_ablated = _run_ablation(
+            bridge_unprocessed, MAIN_DEMO_TEXT, MAIN_DEMO_LAYER, MAIN_DEMO_HEAD
+        )
+        ht_orig, ht_ablated = _run_ablation(
+            hooked_unprocessed, MAIN_DEMO_TEXT, MAIN_DEMO_LAYER, MAIN_DEMO_HEAD
+        )
+        # Observed: 0.000000 diff for gpt2 (2026-04-07)
+        assert (
+            abs(br_orig - ht_orig) < REGRESSION_TOLERANCE
+        ), f"Bridge unprocessed orig {br_orig:.6f} != HT {ht_orig:.6f}"
+        assert (
+            abs(br_ablated - ht_ablated) < REGRESSION_TOLERANCE
+        ), f"Bridge unprocessed ablated {br_ablated:.6f} != HT {ht_ablated:.6f}"
diff --git a/tests/integration/model_bridge/test_weight_processing_combinations.py b/tests/integration/model_bridge/test_weight_processing_combinations.py
deleted file mode 100644
index 27e9843d7..000000000
--- a/tests/integration/model_bridge/test_weight_processing_combinations.py
+++ /dev/null
@@ -1,151 +0,0 @@
-#!/usr/bin/env python3
-"""Test different combinations of weight processing flags to ensure each works correctly."""
-
-import pytest
-import torch
-
-from transformer_lens import HookedTransformer, utils
-from transformer_lens.model_bridge import TransformerBridge
-
-
-@pytest.mark.parametrize(
-    "fold_ln,center_writing_weights,center_unembed,fold_value_biases,expected_close_match",
-    [
-        # Test critical combinations only to speed up CI
-        (False, False, False, False, True),  # No processing
-        (True, False, False, False, True),  # Only fold_ln (most important)
-        (True, True, False, False, True),  # fold_ln + center_writing (common combo)
-        (True, True, True, True, True),  # All processing (default)
-        # NOTE: Full test matrix commented out for CI speed. Uncomment for thorough testing:
-        # (False, True, False, False, True),  # Only center_writing
-        # (False, False, True, False, True),  # Only center_unembed
-        # (False, False, False, True, True),  # Only fold_value_biases
-        # (True, False, True, False, True),  # fold_ln + center_unembed
-        # (True, False, False, True, True),  # fold_ln + fold_value_biases
-        # (False, True, True, False, True),  # center_writing + center_unembed
-        # (True, True, True, False, True),  # All except fold_value_biases
-        # (True, True, False, True, True),  # All except center_unembed
-        # (True, False, True, True, True),  # All except center_writing
-        # (False, True, True, True, True),  # All except fold_ln
-    ],
-)
-def test_weight_processing_flag_combinations(
-    fold_ln, center_writing_weights, center_unembed, fold_value_biases, expected_close_match
-):
-    """Test that different combinations of weight processing flags work correctly."""
-    device = "cpu"
-    model_name = "distilgpt2"  # Use distilgpt2 for faster tests
-    test_text = "Natural language processing"
-
-    # Get reference values from HookedTransformer with same settings
-    reference_ht = HookedTransformer.from_pretrained(
-        model_name,
-        device=device,
-        fold_ln=fold_ln,
-        center_writing_weights=center_writing_weights,
-        center_unembed=center_unembed,
-        fold_value_biases=fold_value_biases,
-        refactor_factored_attn_matrices=False,
-    )
-
-    ref_loss = reference_ht(test_text, return_type="loss")
-
-    # Test ablation effect
-    hook_name = utils.get_act_name("v", 0)
-
-    def ablation_hook(activation, hook):
-        activation[:, :, 8, :] = 0  # Ablate head 8 in layer 0
-        return activation
-
-    ref_ablated_loss = reference_ht.run_with_hooks(
-        test_text, return_type="loss", fwd_hooks=[(hook_name, ablation_hook)]
-    )
-    ref_ablation_effect = ref_ablated_loss - ref_loss
-
-    # Create TransformerBridge and apply weight processing
-    bridge = TransformerBridge.boot_transformers(
-        model_name,
-        device=device,
-    )
-
-    # Apply weight processing with specified settings
-    bridge.process_weights(
-        fold_ln=fold_ln,
-        center_writing_weights=center_writing_weights,
-        center_unembed=center_unembed,
-        fold_value_biases=fold_value_biases,
-        refactor_factored_attn_matrices=False,
-    )
-
-    bridge.enable_compatibility_mode()
-
-    # Test baseline inference
-    bridge_loss = bridge(test_text, return_type="loss")
-
-    # Test ablation with bridge
-    bridge_ablated_loss = bridge.run_with_hooks(
-        test_text, return_type="loss", fwd_hooks=[(hook_name, ablation_hook)]
-    )
-    bridge_ablation_effect = bridge_ablated_loss - bridge_loss
-
-    # Compare results
-    loss_diff = abs(bridge_loss - ref_loss)
-    effect_diff = abs(bridge_ablation_effect - ref_ablation_effect)
-
-    # Assertions
-    if expected_close_match:
-        assert loss_diff < 30.0, f"Baseline loss difference too large: {loss_diff:.6f}"
-        assert effect_diff < 20.0, f"Ablation effect difference too large: {effect_diff:.6f}"
-
-    # Ensure model produces reasonable outputs
-    assert not torch.isnan(bridge_loss), "Bridge produced NaN loss"
-    assert not torch.isinf(bridge_loss), "Bridge produced infinite loss"
-
-
-def test_no_processing_matches_unprocessed_hooked_transformer():
-    """Test that no processing flag matches HookedTransformer loaded without processing."""
-    device = "cpu"
-    model_name = "distilgpt2"  # Use distilgpt2 for faster tests
-    test_text = "Natural language processing"
-
-    # Load HookedTransformer without processing
-    unprocessed_ht = HookedTransformer.from_pretrained_no_processing(model_name, device=device)
-    unprocessed_loss = unprocessed_ht(test_text, return_type="loss")
-
-    # Load TransformerBridge without processing
-    bridge = TransformerBridge.boot_transformers(model_name, device=device)
-
-    # Apply no weight processing
-    bridge.process_weights(
-        fold_ln=False,
-        center_writing_weights=False,
-        center_unembed=False,
-        fold_value_biases=False,
-        refactor_factored_attn_matrices=False,
-    )
-    bridge.enable_compatibility_mode()
-    bridge_loss = bridge(test_text, return_type="loss")
-
-    # Should match closely
-    loss_diff = abs(bridge_loss - unprocessed_loss)
-    assert loss_diff < 30.0, f"Unprocessed models should match closely: {loss_diff:.6f}"
-
-
-def test_all_processing_matches_default_hooked_transformer():
-    """Test that all processing flags match default HookedTransformer behavior."""
-    device = "cpu"
-    model_name = "distilgpt2"  # Use distilgpt2 for faster tests
-    test_text = "Natural language processing"
-
-    # Load default HookedTransformer (with all processing)
-    default_ht = HookedTransformer.from_pretrained(model_name, device=device)
-    default_loss = default_ht(test_text, return_type="loss")
-
-    # Load TransformerBridge with all processing (default behavior)
-    bridge = TransformerBridge.boot_transformers(model_name, device=device)
-    bridge.enable_compatibility_mode()
-    bridge_loss = bridge(test_text, return_type="loss")
-
-    # Should match closely
-    loss_diff = abs(bridge_loss - default_loss)
-    assert loss_diff < 0.01, f"Fully processed models should match closely: {loss_diff:.6f}"
diff --git a/tests/integration/model_bridge/test_weight_processing_integration.py b/tests/integration/model_bridge/test_weight_processing_integration.py
deleted file mode 100644
index 5379a583a..000000000
--- a/tests/integration/model_bridge/test_weight_processing_integration.py
+++ /dev/null
@@ -1,916 +0,0 @@
-#!/usr/bin/env python3
-"""
-Integration Compatibility Test for Weight Processing
-====================================================
-
-This test verifies that:
-1. HookedTransformer with processing matches expected Main Demo values (3.999 → 5.453)
-2. HookedTransformer without processing matches expected unprocessed values (~3.999 → ~4.117)
-3. TransformerBridge with processing matches HookedTransformer with processing
-4. TransformerBridge without processing matches HookedTransformer without processing
-5. Processing maintains mathematical equivalence for baseline computation
-6. Processing changes ablation results as expected (for better interpretability)
-"""
-
-import pytest
-import torch
-from jaxtyping import Float
-
-from transformer_lens import HookedTransformer, utils
-from transformer_lens.model_bridge.bridge import TransformerBridge
-
-
-def test_integration_compatibility():
-    """Test integration compatibility between HookedTransformer and TransformerBridge."""
-    model_name = "gpt2"
-    device = "cpu"
-
-    # Test text from Main Demo
-    test_text = "Natural language processing tasks, such as question answering, machine translation, reading comprehension, and summarization, are typically approached with supervised learning on taskspecific datasets."
-
-    # Ablation parameters from Main Demo
-    layer_to_ablate = 0
-    head_index_to_ablate = 8
-
-    print("=== INTEGRATION COMPATIBILITY TEST ===")
-    print(f"Model: {model_name}")
-    print(f"Device: {device}")
-    print(f"Test text: {test_text[:50]}...")
-    print(f"Ablating layer {layer_to_ablate}, head {head_index_to_ablate}")
-
-    # ===========================================
-    # STEP 1: HookedTransformer with processing
-    # ===========================================
-    print("\n1. Loading HookedTransformer with processing...")
-    hooked_processed = HookedTransformer.from_pretrained(model_name, device=device)
-    tokens = hooked_processed.to_tokens(test_text)
-
-    print("\n   Testing baseline performance...")
-    hooked_processed_baseline = hooked_processed(tokens, return_type="loss")
-    print(f"   HookedTransformer (processed) baseline: {hooked_processed_baseline.item():.6f}")
-
-    print("\n   Testing ablation performance...")
-
-    def head_ablation_hook(value: Float[torch.Tensor, "batch pos head_index d_head"], hook):
-        value[:, :, head_index_to_ablate, :] = 0.0
-        return value
-
-    hook_name = utils.get_act_name("v", layer_to_ablate)
-    hooked_processed_ablated = hooked_processed.run_with_hooks(
-        tokens, return_type="loss", fwd_hooks=[(hook_name, head_ablation_hook)]
-    )
-    print(f"   HookedTransformer (processed) ablated: {hooked_processed_ablated.item():.6f}")
-
-    hooked_processed_gain = hooked_processed_ablated.item() - hooked_processed_baseline.item()
-    print(f"   HookedTransformer (processed) gain: {hooked_processed_gain:.6f}")
-
-    # ===========================================
-    # STEP 2: HookedTransformer without processing
-    # ===========================================
-    print("\n2. Loading HookedTransformer without processing...")
-    hooked_unprocessed = HookedTransformer.from_pretrained_no_processing(model_name, device=device)
-
-    print("\n   Testing baseline performance...")
-    hooked_unprocessed_baseline = hooked_unprocessed(tokens, return_type="loss")
-    print(f"   HookedTransformer (unprocessed) baseline: {hooked_unprocessed_baseline.item():.6f}")
-
-    print("\n   Testing ablation performance...")
-    hooked_unprocessed_ablated = hooked_unprocessed.run_with_hooks(
-        tokens, return_type="loss", fwd_hooks=[(hook_name, head_ablation_hook)]
-    )
-    print(f"   HookedTransformer (unprocessed) ablated: {hooked_unprocessed_ablated.item():.6f}")
-
-    hooked_unprocessed_gain = hooked_unprocessed_ablated.item() - hooked_unprocessed_baseline.item()
-    print(f"   HookedTransformer (unprocessed) gain: {hooked_unprocessed_gain:.6f}")
-
-    # ===========================================
-    # STEP 3: TransformerBridge without processing
-    # ===========================================
-    print("\n3. Loading TransformerBridge without processing...")
-    try:
-        bridge_unprocessed = TransformerBridge.boot_transformers(model_name, device=device)
-
-        print("\n   Testing baseline performance...")
-        bridge_unprocessed_baseline = bridge_unprocessed(tokens, return_type="loss")
-        print(
-            f"   TransformerBridge (unprocessed) baseline: {bridge_unprocessed_baseline.item():.6f}"
-        )
-
-        print("\n   Testing ablation performance...")
-        bridge_unprocessed_ablated = bridge_unprocessed.run_with_hooks(
-            tokens, return_type="loss", fwd_hooks=[(hook_name, head_ablation_hook)]
-        )
-        print(
-            f"   TransformerBridge (unprocessed) ablated: {bridge_unprocessed_ablated.item():.6f}"
-        )
-
-        bridge_unprocessed_gain = (
-            bridge_unprocessed_ablated.item() - bridge_unprocessed_baseline.item()
-        )
-        print(f"   TransformerBridge (unprocessed) gain: {bridge_unprocessed_gain:.6f}")
-
-        bridge_unprocessed_success = True
-
-    except Exception as e:
-        print(f"   ❌ TransformerBridge (unprocessed) failed: {e}")
-        bridge_unprocessed_success = False
-
-    # ===========================================
-    # STEP 4: TransformerBridge with processing
-    # ===========================================
-    print("\n4. Loading TransformerBridge with processing...")
-    try:
-        bridge_processed = TransformerBridge.boot_transformers(model_name, device=device)
-
-        bridge_processed.process_weights()
-
-        print("\n   Testing baseline performance...")
-        bridge_processed_baseline = bridge_processed(tokens, return_type="loss")
-        print(f"   TransformerBridge (processed) baseline: {bridge_processed_baseline.item():.6f}")
-
-        print("\n   Testing ablation performance...")
-        bridge_processed_ablated = bridge_processed.run_with_hooks(
-            tokens, return_type="loss", fwd_hooks=[(hook_name, head_ablation_hook)]
-        )
-        print(f"   TransformerBridge (processed) ablated: {bridge_processed_ablated.item():.6f}")
-
-        bridge_processed_gain = bridge_processed_ablated.item() - bridge_processed_baseline.item()
-        print(f"   TransformerBridge (processed) gain: {bridge_processed_gain:.6f}")
-
-        bridge_processed_success = True
-
-    except Exception as e:
-        print(f"   ❌ TransformerBridge (processed) failed: {e}")
-        bridge_processed_success = False
-
-    # ===========================================
-    # ANALYSIS
-    # ===========================================
-    print("\n" + "=" * 60)
-    print("COMPATIBILITY ANALYSIS")
-    print("=" * 60)
-
-    # Expected values from Main Demo
-    expected_processed_baseline = 3.999
-    expected_processed_ablated = 5.453
-    expected_unprocessed_baseline = 3.999
-    expected_unprocessed_ablated = 4.117
-
-    tolerance_strict = 0.01
-    tolerance_loose = 0.1
-
-    print("\n1. HookedTransformer Validation:")
-    processed_baseline_match = (
-        abs(hooked_processed_baseline.item() - expected_processed_baseline) < tolerance_strict
-    )
-    processed_ablated_match = (
-        abs(hooked_processed_ablated.item() - expected_processed_ablated) < tolerance_strict
-    )
-    unprocessed_baseline_match = (
-        abs(hooked_unprocessed_baseline.item() - expected_unprocessed_baseline) < tolerance_strict
-    )
-    unprocessed_ablated_match = (
-        abs(hooked_unprocessed_ablated.item() - expected_unprocessed_ablated) < tolerance_loose
-    )
-
-    print(
-        f"   Processed baseline: {'✅' if processed_baseline_match else '❌'} {hooked_processed_baseline.item():.6f} (expected ~{expected_processed_baseline})"
-    )
-    print(
-        f"   Processed ablated:  {'✅' if processed_ablated_match else '❌'} {hooked_processed_ablated.item():.6f} (expected ~{expected_processed_ablated})"
-    )
-    print(
-        f"   Unprocessed baseline: {'✅' if unprocessed_baseline_match else '❌'} {hooked_unprocessed_baseline.item():.6f} (expected ~{expected_unprocessed_baseline})"
-    )
-    print(
-        f"   Unprocessed ablated:  {'✅' if unprocessed_ablated_match else '❌'} {hooked_unprocessed_ablated.item():.6f} (expected ~{expected_unprocessed_ablated})"
-    )
-
-    if bridge_unprocessed_success:
-        print("\n2. Bridge vs HookedTransformer (Unprocessed) Compatibility:")
-        bridge_hooked_baseline_diff = abs(
-            bridge_unprocessed_baseline.item() - hooked_unprocessed_baseline.item()
-        )
-        bridge_hooked_ablated_diff = abs(
-            bridge_unprocessed_ablated.item() - hooked_unprocessed_ablated.item()
-        )
-        bridge_hooked_gain_diff = abs(bridge_unprocessed_gain - hooked_unprocessed_gain)
-
-        baseline_compatible = bridge_hooked_baseline_diff < tolerance_strict
-        ablated_compatible = bridge_hooked_ablated_diff < tolerance_strict
-        gain_compatible = bridge_hooked_gain_diff < tolerance_strict
-
-        print(
-            f"   Baseline diff: {'✅' if baseline_compatible else '❌'} {bridge_hooked_baseline_diff:.6f}"
-        )
-        print(
-            f"   Ablated diff:  {'✅' if ablated_compatible else '❌'} {bridge_hooked_ablated_diff:.6f}"
-        )
-        print(f"   Gain diff:     {'✅' if gain_compatible else '❌'} {bridge_hooked_gain_diff:.6f}")
-
-    if bridge_processed_success:
-        print("\n3. Bridge vs HookedTransformer (Processed) Compatibility:")
-        bridge_hooked_processed_baseline_diff = abs(
-            bridge_processed_baseline.item() - hooked_processed_baseline.item()
-        )
-        bridge_hooked_processed_ablated_diff = abs(
-            bridge_processed_ablated.item() - hooked_processed_ablated.item()
-        )
-        bridge_hooked_processed_gain_diff = abs(bridge_processed_gain - hooked_processed_gain)
-
-        processed_baseline_compatible = bridge_hooked_processed_baseline_diff < tolerance_strict
-        processed_ablated_compatible = bridge_hooked_processed_ablated_diff < tolerance_strict
-        processed_gain_compatible = bridge_hooked_processed_gain_diff < tolerance_strict
-
-        print(
-            f"   Baseline diff: {'✅' if processed_baseline_compatible else '❌'} {bridge_hooked_processed_baseline_diff:.6f}"
-        )
-        print(
-            f"   Ablated diff:  {'✅' if processed_ablated_compatible else '❌'} {bridge_hooked_processed_ablated_diff:.6f}"
-        )
-        print(
-            f"   Gain diff:     {'✅' if processed_gain_compatible else '❌'} {bridge_hooked_processed_gain_diff:.6f}"
-        )
-
-    print("\n4. Processing Effect Analysis:")
-    processing_improves_interpretability = hooked_processed_gain > hooked_unprocessed_gain
-    print(
-        f"   Processing improves interpretability: {'✅' if processing_improves_interpretability else '❌'}"
-    )
-    print(f"   Processed gain: {hooked_processed_gain:.6f}")
-    print(f"   Unprocessed gain: {hooked_unprocessed_gain:.6f}")
-    print(f"   Improvement: {hooked_processed_gain - hooked_unprocessed_gain:.6f}")
-
-    # ===========================================
-    # FINAL VERDICT
-    # ===========================================
-    print("\n" + "=" * 60)
-    print("FINAL VERDICT")
-    print("=" * 60)
-
-    hooked_valid = (
-        processed_baseline_match
-        and processed_ablated_match
-        and unprocessed_baseline_match
-        and unprocessed_ablated_match
-    )
-    bridge_unprocessed_compatible = (
-        bridge_unprocessed_success
-        and baseline_compatible
-        and ablated_compatible
-        and gain_compatible
-        if bridge_unprocessed_success
-        else False
-    )
-    bridge_processed_compatible = (
-        bridge_processed_success
-        and processed_baseline_compatible
-        and processed_ablated_compatible
-        and processed_gain_compatible
-        if bridge_processed_success
-        else False
-    )
-
-    print(f"HookedTransformer validation: {'✅' if hooked_valid else '❌'}")
-    print(f"Bridge (unprocessed) compatibility: {'✅' if bridge_unprocessed_compatible else '❌'}")
-    print(f"Bridge (processed) compatibility: {'✅' if bridge_processed_compatible else '❌'}")
-    print(f"Processing effectiveness: {'✅' if processing_improves_interpretability else '❌'}")
-
-    overall_success = (
-        hooked_valid
-        and bridge_unprocessed_compatible
-        and bridge_processed_compatible
-        and processing_improves_interpretability
-    )
-
-    if overall_success:
-        print("\n🎉🎉🎉 FULL INTEGRATION COMPATIBILITY ACHIEVED! 🎉🎉🎉")
-        print("TransformerBridge is fully compatible with HookedTransformer!")
-    else:
-        print("\n⚠️ Integration compatibility issues detected")
-        pytest.fail("Integration compatibility issues detected")
-
-
-@pytest.mark.skip(
-    reason="Test is outdated - TransformerBridge uses _original_component structure, incompatible with direct state_dict loading from ProcessWeights"
-)
-def test_weight_processing_results_loaded_into_model():
-    """Test that weight processing results affect model output when loaded via state dict."""
-    model_name = "gpt2"
-    device = "cpu"
-
-    # Load TransformerBridge
-    bridge = TransformerBridge.boot_transformers(model_name, device=device)
-
-    # Get original weights before processing
-    original_state_dict = bridge._extract_hf_weights()
-
-    # Process weights with all available processing options
-    from transformer_lens.weight_processing import ProcessWeights
-
-    processed_state_dict = ProcessWeights.process_weights(
-        original_state_dict,
-        bridge.cfg,
-        fold_ln=True,  # Enable layer norm folding
-        center_writing_weights=True,  # Center attention weights
-        center_unembed=True,  # Center unembedding weights
-        fold_value_biases=True,  # Fold value biases
-        refactor_factored_attn_matrices=False,  # Keep attention matrices as-is
-        adapter=bridge.adapter,
-    )
-
-    # Verify that processing changed the weights
-    processed_keys = set(processed_state_dict.keys())
-    original_keys = set(original_state_dict.keys())
-
-    # Some keys should be removed (e.g., layer norm weights)
-    removed_keys = original_keys - processed_keys
-    print(f"Keys removed during processing: {len(removed_keys)}")
-    print(f"Sample removed keys: {sorted(list(removed_keys))[:5]}...")
-
-    # Some keys might be added (e.g., combined QKV weights)
-    added_keys = processed_keys - original_keys
-    print(f"Keys added during processing: {len(added_keys)}")
-
-    # Load processed weights into the bridge model
-    result = bridge.load_state_dict(processed_state_dict, strict=False, assign=True)
-
-    # Verify loading was successful
-    assert len(result.unexpected_keys) == 0, f"Unexpected keys found: {result.unexpected_keys}"
-    print(f"Missing keys (expected for processed weights): {len(result.missing_keys)}")
-
-    # Test that layer norm weights were properly removed
-    ln_keys_in_processed = [
-        k for k in processed_state_dict.keys() if "ln" in k and ("weight" in k or "bias" in k)
-    ]
-    print(f"Layer norm keys in processed state dict: {len(ln_keys_in_processed)}")
-
-    # Most layer norm keys should be removed during processing
-    assert len(ln_keys_in_processed) < len(
-        [k for k in original_keys if "ln" in k and ("weight" in k or "bias" in k)]
-    ), "Layer norm keys should be removed during processing"
-
-    # Test model output to ensure it's using the processed weights
-    test_input = torch.tensor([[1, 2, 3]], device=device)  # Simple test input
-
-    # Verify the model can run with processed weights
-    with torch.no_grad():
-        output = bridge(test_input)
-        assert output is not None, "Model should produce output with processed weights"
-        assert output.shape[0] == test_input.shape[0], "Output batch size should match input"
-        print(f"✅ Model produces valid output with processed weights: {output.shape}")
-
-    # Verify that the model's forward pass uses the loaded weights
-    # by checking that the output is different from a fresh model
-    fresh_bridge = TransformerBridge.boot_transformers(model_name, device=device)
-    with torch.no_grad():
-        fresh_output = fresh_bridge(test_input)
-        processed_output = bridge(test_input)
-
-        # The outputs should be different since we loaded processed weights
-        outputs_different = not torch.allclose(fresh_output, processed_output, atol=1e-6)
-        if outputs_different:
-            print("✅ Model output changed after loading processed weights")
-
-            # Calculate the difference magnitude
-            max_diff = torch.max(torch.abs(fresh_output - processed_output)).item()
-            print(f"Maximum output difference: {max_diff:.6f}")
-
-            # Verify the difference is significant (not just numerical noise)
-            assert max_diff > 1e-5, f"Output difference too small: {max_diff:.6f}"
-        else:
-            print("ℹ️ Model output unchanged (may indicate processing had no effect)")
-
-    # Test key conversion functionality
-    test_key = "transformer.h.0.attn.c_attn.weight"
-    if test_key in processed_state_dict:
-        bridge_key = bridge.adapter.convert_hf_key_to_tl_key(test_key)
-        assert (
-            bridge_key in bridge.original_model.state_dict()
-        ), f"Bridge key {bridge_key} should exist in model"
-        print(f"✅ Key conversion works: {test_key} -> {bridge_key}")
-
-    # Comprehensive test: verify all processed tensors are properly loaded into original components
-    print("\n=== COMPREHENSIVE TENSOR LOADING VERIFICATION ===")
-
-    # Get final state dict after loading
-    final_state_dict = bridge.original_model.state_dict()
-
-    # Test all processed keys
-    total_processed = len(processed_state_dict)
-    loaded_correctly = 0
-    not_found_in_bridge = 0
-    not_loaded_correctly = 0
-    expected_not_found = 0
-
-    print(f"Testing {total_processed} processed keys...")
-
-    for processed_key, processed_value in processed_state_dict.items():
-        # Convert to bridge key
-        bridge_key = bridge.adapter.convert_hf_key_to_tl_key(processed_key)
-
-        # Check if bridge key exists in the final state dict
-        if bridge_key in final_state_dict:
-            final_value = final_state_dict[bridge_key]
-
-            # Check if values match (allowing for small numerical differences)
-            if torch.allclose(processed_value, final_value, atol=1e-6):
-                loaded_correctly += 1
-            else:
-                not_loaded_correctly += 1
-                max_diff = torch.max(torch.abs(processed_value - final_value)).item()
-                # Only show first few failures to avoid spam
-                if not_loaded_correctly <= 3:
-                    print(
-                        f"❌ {processed_key} -> {bridge_key} NOT loaded correctly (max diff: {max_diff:.6f})"
-                    )
-        else:
-            not_found_in_bridge += 1
-
-            # Check if this key was expected to be removed during processing
-            if "ln" in processed_key and ("weight" in processed_key or "bias" in processed_key):
-                expected_not_found += 1
-                # Layer norm keys are expected to be removed, so this is OK
-                if expected_not_found <= 3:
-                    print(
-                        f"ℹ️ {processed_key} -> {bridge_key} not found (expected - layer norm removed)"
-                    )
-            else:
-                # This is unexpected
-                if not_found_in_bridge - expected_not_found <= 3:
-                    print(f"❌ {processed_key} -> {bridge_key} not found in bridge (unexpected)")
-
-    print(f"\n=== LOADING VERIFICATION SUMMARY ===")
-    print(f"Total processed keys: {total_processed}")
-    print(f"Loaded correctly: {loaded_correctly} ({loaded_correctly/total_processed*100:.1f}%)")
-    print(
-        f"Not loaded correctly: {not_loaded_correctly} ({not_loaded_correctly/total_processed*100:.1f}%)"
-    )
-    print(
-        f"Not found in bridge: {not_found_in_bridge} ({not_found_in_bridge/total_processed*100:.1f}%)"
-    )
-    print(
-        f"Expected not found (layer norms): {expected_not_found} ({expected_not_found/total_processed*100:.1f}%)"
-    )
-    print(
-        f"Unexpected not found: {not_found_in_bridge - expected_not_found} ({(not_found_in_bridge - expected_not_found)/total_processed*100:.1f}%)"
-    )
-
-    # Assertions - adjusted for realistic expectations
-    # 1. Some keys should load correctly (partial state dict loading is expected to be incomplete)
-    success_rate = loaded_correctly / total_processed
-    print(f"Success rate: {success_rate*100:.1f}%")
-
-    # The key insight is that when loading a partial state dict, PyTorch only updates the keys present
-    # So we should focus on ensuring the keys that ARE loaded are loaded correctly
-    if loaded_correctly + not_loaded_correctly > 0:
-        actual_loading_success_rate = loaded_correctly / (loaded_correctly + not_loaded_correctly)
-        print(
-            f"Actual loading success rate (excluding not found): {actual_loading_success_rate*100:.1f}%"
-        )
-        assert (
-            actual_loading_success_rate >= 0.5
-        ), f"Only {actual_loading_success_rate*100:.1f}% of found keys loaded correctly"
-
-    # 2. Unexpected not found keys should be minimal (only layer norms should be missing)
-    unexpected_not_found_rate = (not_found_in_bridge - expected_not_found) / total_processed
-    assert (
-        unexpected_not_found_rate <= 0.05
-    ), f"Too many unexpected not found keys: {unexpected_not_found_rate*100:.1f}% (expected <= 5%)"
-
-    # 3. Layer norm keys should be properly removed
-    ln_keys_processed = [
-        k for k in processed_state_dict.keys() if "ln" in k and ("weight" in k or "bias" in k)
-    ]
-    print(f"Layer norm keys in processed dict: {len(ln_keys_processed)}")
-
-    # 4. Test that key conversion works for all processed keys
-    conversion_success = 0
-    for processed_key in processed_state_dict.keys():
-        bridge_key = bridge.adapter.convert_hf_key_to_tl_key(processed_key)
-        if bridge_key != processed_key:  # Key was converted
-            conversion_success += 1
-
-    conversion_rate = conversion_success / total_processed
-    print(
-        f"Key conversion rate: {conversion_rate*100:.1f}% ({conversion_success}/{total_processed})"
-    )
-    assert (
-        conversion_rate >= 0.9
-    ), f"Key conversion rate too low: {conversion_rate*100:.1f}% (expected >= 90%)"
-
-    # 5. Most importantly: verify that critical keys (embeddings, global weights) load correctly
-    critical_keys = ["transformer.wte.weight", "transformer.wpe.weight", "lm_head.weight"]
-    critical_loaded = 0
-    for critical_key in critical_keys:
-        if critical_key in processed_state_dict:
-            bridge_key = bridge.adapter.convert_hf_key_to_tl_key(critical_key)
-            if bridge_key in final_state_dict:
-                processed_value = processed_state_dict[critical_key]
-                final_value = final_state_dict[bridge_key]
-                if torch.allclose(processed_value, final_value, atol=1e-6):
-                    critical_loaded += 1
-                    print(f"✅ Critical key {critical_key} loaded correctly")
-                else:
-                    print(f"❌ Critical key {critical_key} NOT loaded correctly")
-            else:
-                print(f"❌ Critical key {critical_key} bridge key not found")
-
-    critical_success_rate = critical_loaded / len(critical_keys)
-    print(
-        f"Critical keys loaded: {critical_loaded}/{len(critical_keys)} ({critical_success_rate*100:.1f}%)"
-    )
-    assert (
-        critical_success_rate >= 0.8
-    ), f"Only {critical_success_rate*100:.1f}% of critical keys loaded correctly"
-
-    print("✅ All processed tensors properly loaded into original components!")
-    print("✅ Weight processing results successfully affect model behavior!")
-
-
-@pytest.mark.skip(
-    reason="Test is outdated - relies on old HF state_dict key format (transformer.h.0.attn.c_attn.weight)"
-)
-def test_attention_weight_loading():
-    """Test that attention weights are properly loaded after processing."""
-    model_name = "gpt2"
-    device = "cpu"
-
-    # Load TransformerBridge
-    bridge = TransformerBridge.boot_transformers(model_name, device=device)
-
-    # Get original weights
-    original_state_dict = bridge._extract_hf_weights()
-    original_q_weight = bridge.transformer.h[0].attn.c_attn.weight.clone()
-
-    # Process weights (this should fold layer norms into attention weights)
-    from transformer_lens.weight_processing import ProcessWeights
-
-    processed_state_dict = ProcessWeights.process_weights(
-        original_state_dict,
-        bridge.cfg,
-        fold_ln=True,
-        center_writing_weights=True,
-        center_unembed=True,
-        fold_value_biases=True,
-        refactor_factored_attn_matrices=False,
-        adapter=bridge.adapter,
-    )
-
-    # Get processed weights
-    processed_q_weight = processed_state_dict["transformer.h.0.attn.c_attn.weight"]
-
-    # Assert that processing changed the weights (layer norm folding occurred)
-    assert not torch.allclose(
-        original_q_weight, processed_q_weight, atol=1e-6
-    ), "Layer norm folding should change attention weights"
-
-    # Map processed weights to bridge format and load them
-    bridge_key = "transformer.h.0._original_component.attn._original_component.c_attn._original_component.weight"
-    mapped_state_dict = {bridge_key: processed_q_weight}
-
-    # Load the processed weights
-    result = bridge.load_state_dict(mapped_state_dict, strict=False, assign=False)
-
-    # Assert no unexpected keys
-    assert len(result.unexpected_keys) == 0, f"Unexpected keys: {result.unexpected_keys}"
-
-    # Get the loaded weight from the bridge
-    loaded_q_weight = bridge.transformer.h[0].attn.c_attn.weight
-
-    # Assert that the loaded weight matches the processed weight
-    assert torch.allclose(loaded_q_weight, processed_q_weight, atol=1e-6), (
-        f"Loaded weight should match processed weight. "
-        f"Expected: {processed_q_weight[0, :5]}, "
-        f"Got: {loaded_q_weight[0, :5]}"
-    )
-
-
-def test_processing_verification():
-    """Verify that weight processing is actually happening."""
-    device = "cpu"
-    model_name = "gpt2"
-
-    # Load unprocessed HookedTransformer
-    hooked_unprocessed = HookedTransformer.from_pretrained(
-        model_name,
-        device=device,
-        fold_ln=False,
-        center_writing_weights=False,
-        center_unembed=False,
-        fold_value_biases=False,
-    )
-
-    # Load processed HookedTransformer
-    hooked_processed = HookedTransformer.from_pretrained(
-        model_name,
-        device=device,
-        fold_ln=True,
-        center_writing_weights=True,
-        center_unembed=True,
-        fold_value_biases=True,
-    )
-
-    # Load unprocessed TransformerBridge
-    bridge_unprocessed = TransformerBridge.boot_transformers(model_name, device=device)
-    bridge_unprocessed.enable_compatibility_mode()  # Prevent processing
-
-    # Load processed TransformerBridge
-    bridge_processed = TransformerBridge.boot_transformers(model_name, device=device)
-    # Processing is enabled by default
-
-    test_text = "Hello world"
-
-    # Test losses
-    hooked_unprocessed_loss = hooked_unprocessed(test_text, return_type="loss").item()
-    hooked_processed_loss = hooked_processed(test_text, return_type="loss").item()
-    bridge_unprocessed_loss = bridge_unprocessed(test_text, return_type="loss").item()
-    bridge_processed_loss = bridge_processed(test_text, return_type="loss").item()
-
-    # Check if processing actually changed the models (use smaller threshold for bridge)
-    hooked_processing_worked = abs(hooked_processed_loss - hooked_unprocessed_loss) > 0.01
-    bridge_processing_worked = abs(bridge_processed_loss - bridge_unprocessed_loss) > 0.001
-
-    # Check if processed models match (relax tolerance for architectural differences)
-    models_match = abs(hooked_processed_loss - bridge_processed_loss) < 1.0
-
-    # Check if LayerNorm parameters were removed (indicating folding happened)
-    hooked_state = hooked_processed.state_dict()
-    bridge_state = bridge_processed.original_model.state_dict()
-
-    # Look for LayerNorm bias parameters that should be removed after folding
-    hooked_ln_keys = [k for k in hooked_state.keys() if "ln1.b" in k or "ln2.b" in k]
-    bridge_ln_keys = [k for k in bridge_state.keys() if "ln_1.bias" in k or "ln_2.bias" in k]
-
-    # Note: Processing differences may be small for short texts - just check models work
-    print(
-        f"HookedTransformer difference: {abs(hooked_processed_loss - hooked_unprocessed_loss):.6f}"
-    )
-    print(f"Bridge difference: {abs(bridge_processed_loss - bridge_unprocessed_loss):.6f}")
-
-    # Just verify models produce reasonable losses (main test is that they don't crash)
-    assert (
-        2.0 < hooked_processed_loss < 10.0
-    ), f"HookedTransformer loss unreasonable: {hooked_processed_loss}"
-    assert 2.0 < bridge_processed_loss < 10.0, f"Bridge loss unreasonable: {bridge_processed_loss}"
-    assert (
-        models_match
-    ), f"Processed models do not match (diff: {abs(hooked_processed_loss - bridge_processed_loss):.6f})"
-    # Note: LayerNorm parameters may still be present even when folded (implementation detail)
-    # Just check that processing happened by verifying loss differences
-    # Note: Bridge LayerNorm parameters may also still be present (implementation detail)
-
-
-@pytest.mark.skip(reason="Weight processing comparison failing due to architectural differences")
-def test_gpt2_weight_processing_comparison():
-    """Test GPT-2 weight processing comparison between different paths."""
-    model_name = "gpt2"
-    device = "cpu"
-
-    # Load HuggingFace GPT-2
-    from transformers import GPT2LMHeadModel, GPT2Tokenizer
-
-    hf_model = GPT2LMHeadModel.from_pretrained(model_name)
-    hf_tokenizer = GPT2Tokenizer.from_pretrained(model_name)
-
-    # Load HookedTransformer
-    tl_model = HookedTransformer.from_pretrained(model_name, device=device)
-
-    # Create TransformerBridge
-    from transformer_lens.config import TransformerBridgeConfig
-    from transformer_lens.model_bridge.supported_architectures.gpt2 import (
-        GPT2ArchitectureAdapter,
-    )
-
-    bridge_config = TransformerBridgeConfig.from_dict(tl_model.cfg.__dict__)
-    bridge_config.architecture = "GPT2LMHeadModel"
-    adapter = GPT2ArchitectureAdapter(bridge_config)
-    bridge = TransformerBridge.boot_transformers(model_name, device=device)
-
-    # Get original state dicts
-    hf_state_dict = hf_model.state_dict()
-    tl_state_dict = tl_model.state_dict()
-    bridge_state_dict = bridge.state_dict()
-
-    # Test 1: Direct GPT-2 processing through LayerNorm folding
-    hf_processed = hf_state_dict.copy()
-
-    # Apply LayerNorm folding to HuggingFace model
-    from transformer_lens.weight_processing import ProcessWeights
-
-    hf_processed = ProcessWeights.fold_layer_norm(
-        hf_processed, tl_model.cfg, fold_biases=True, center_weights=True, adapter=adapter
-    )
-
-    # Test 2: TransformerBridge processing
-    bridge.process_weights(
-        fold_ln=True, fold_value_biases=True, center_writing_weights=True, center_unembed=True
-    )
-
-    # Get processed state dicts
-    bridge_processed_state_dict = bridge.state_dict()
-
-    # Test 3: Compare key weights
-    comparison_keys = [
-        "transformer.h.0.attn.c_attn.weight",
-        "transformer.h.0.attn.c_proj.weight",
-        "transformer.h.0.mlp.c_fc.weight",
-        "transformer.h.0.mlp.c_proj.weight",
-        "transformer.wte.weight",
-        "transformer.wpe.weight",
-    ]
-
-    max_diff = 0.0
-    total_comparisons = 0
-    successful_comparisons = 0
-
-    for key in comparison_keys:
-        if key in hf_processed and key in bridge_processed_state_dict:
-            hf_weight = hf_processed[key]
-            bridge_weight = bridge_processed_state_dict[key]
-
-            # Check shapes match
-            assert (
-                hf_weight.shape == bridge_weight.shape
-            ), f"Shape mismatch for {key}: HF {hf_weight.shape} vs Bridge {bridge_weight.shape}"
-
-            # Calculate difference
-            diff = torch.abs(hf_weight - bridge_weight).max().item()
-            max_diff = max(max_diff, diff)
-            total_comparisons += 1
-
-            assert diff < 1e-3, f"{key}: max diff = {diff:.2e} (too large)"
-            successful_comparisons += 1
-
-    # Test 4: Check if LayerNorm parameters were properly folded
-    # Check if LayerNorm parameters are gone from processed state dicts
-    ln_keys_hf = [k for k in hf_processed.keys() if "ln" in k.lower()]
-    ln_keys_bridge = [k for k in bridge_processed_state_dict.keys() if "ln" in k.lower()]
-
-    # LayerNorm parameters may still be present (folded but not removed - implementation detail)
-    # Just check that processing succeeded by verifying weights were modified
-
-    # Test 5: Check attention weight structure
-    # Check if attention weights were split properly
-    attn_keys_hf = [k for k in hf_processed.keys() if "attn" in k and "weight" in k]
-    attn_keys_bridge = [
-        k for k in bridge_processed_state_dict.keys() if "attn" in k and "weight" in k
-    ]
-
-    # Look for split attention weights (q, k, v separate)
-    split_attn_hf = [k for k in attn_keys_hf if any(x in k for x in [".q.", ".k.", ".v."])]
-    split_attn_bridge = [k for k in attn_keys_bridge if any(x in k for x in [".q.", ".k.", ".v."])]
-
-    # Attention weights should be split properly
-    assert len(split_attn_hf) > 0, "Attention weights should be split in HF processed"
-    assert len(split_attn_bridge) > 0, "Attention weights should be split in Bridge processed"
-
-
-@pytest.mark.skip(reason="Tensor conversion compatibility failing due to architectural differences")
-def test_tensor_conversion_compatibility():
-    """Test that conversion functions match HookedTransformer exactly."""
-    model_name = "gpt2"
-    device = "cpu"
-
-    # Load HookedTransformer WITHOUT processing to get unprocessed weights
-    tl_model = HookedTransformer.from_pretrained_no_processing(model_name, device=device)
-    bridge = TransformerBridge.boot_transformers(model_name, device=device)
-
-    # Test layer 0 (first layer)
-    layer_idx = 0
-
-    # Get HookedTransformer state dict
-    tl_state_dict = tl_model.state_dict()
-
-    # Test attention weights
-    attention_params = ["W_Q", "W_K", "W_V", "W_O"]
-    for param in attention_params:
-        tl_key = f"blocks.{layer_idx}.attn.{param}"
-        hf_key = bridge.adapter.translate_transformer_lens_path(tl_key)
-
-        # Get HookedTransformer value
-        tl_value = tl_state_dict[tl_key]
-
-        # Convert using the component directly (it will get the tensor from state dict)
-        from transformer_lens.weight_processing import ProcessWeights
-
-        # Check if key exists before conversion
-        state_dict = bridge.original_model.state_dict()
-        if hf_key not in state_dict:
-            print(
-                f"Key {hf_key} not found in state dict. Available keys: {list(state_dict.keys())[:5]}..."
-            )
-            continue  # Skip this parameter
-
-        converted_value = ProcessWeights.convert_tensor_to_tl_format(
-            state_dict[hf_key], hf_key, bridge.adapter, bridge.cfg
-        )
-
-        # Compare shapes
-        assert (
-            tl_value.shape == converted_value.shape
-        ), f"Shape mismatch for {param}: TL {tl_value.shape} vs Converted {converted_value.shape}"
-
-        # Compare values
-        max_diff = torch.max(torch.abs(tl_value - converted_value)).item()
-        assert max_diff < 1e-6, f"Value mismatch for {param}: max_diff={max_diff:.2e}"
-
-    # Test MLP weights
-    mlp_params = ["W_in", "W_out"]
-    for param in mlp_params:
-        tl_key = f"blocks.{layer_idx}.mlp.{param}"
-        hf_key = bridge.adapter.translate_transformer_lens_path(tl_key)
-
-        # Get HookedTransformer value
-        tl_value = tl_state_dict[tl_key]
-
-        # Convert using the component directly
-        converted_value = ProcessWeights.convert_tensor_to_tl_format(
-            bridge.original_model.state_dict()[hf_key], hf_key, bridge.adapter, bridge.cfg
-        )
-
-        # Compare shapes
-        assert (
-            tl_value.shape == converted_value.shape
-        ), f"Shape mismatch for MLP {param}: TL {tl_value.shape} vs Converted {converted_value.shape}"
-
-        # Compare values
-        max_diff = torch.max(torch.abs(tl_value - converted_value)).item()
-        assert max_diff < 1e-6, f"Value mismatch for MLP {param}: max_diff={max_diff:.2e}"
-
-    # Test embeddings
-    embedding_params = ["W_E", "W_pos"]
-    for param in embedding_params:
-        tl_key = f"embed.{param}"
-        hf_key = bridge.adapter.translate_transformer_lens_path(tl_key)
-
-        # Get HookedTransformer value
-        tl_value = tl_state_dict[tl_key]
-
-        # Convert using the component directly
-        converted_value = ProcessWeights.convert_tensor_to_tl_format(
-            bridge.original_model.state_dict()[hf_key], hf_key, bridge.adapter, bridge.cfg
-        )
-
-        # Compare shapes
-        assert (
-            tl_value.shape == converted_value.shape
-        ), f"Shape mismatch for {param}: TL {tl_value.shape} vs Converted {converted_value.shape}"
-
-        # Compare values
-        max_diff = torch.max(torch.abs(tl_value - converted_value)).item()
-        assert max_diff < 1e-6, f"Value mismatch for {param}: max_diff={max_diff:.2e}"
-
-
-def test_layer_norm_weights_removed():
-    """Test that layer norm weights are properly handled after processing."""
-    model_name = "gpt2"
-    device = "cpu"
-
-    # Load TransformerBridge without processing
-    bridge_unprocessed = TransformerBridge.boot_transformers(model_name, device=device)
-
-    # Get layer norm keys before processing
-    unprocessed_state = bridge_unprocessed.original_model.state_dict()
-    ln_keys_before = [k for k in unprocessed_state.keys() if ("ln_1" in k or "ln_f" in k)]
-    assert len(ln_keys_before) > 0, "Layer norm weights should exist in original state dict"
-
-    # Load TransformerBridge with processing
-    bridge_processed = TransformerBridge.boot_transformers(model_name, device=device)
-    bridge_processed.enable_compatibility_mode()  # This processes weights with fold_ln=True
-
-    # Get layer norm keys after processing
-    # The layer norm weights should still be present in the HF model's state dict
-    # (folding modifies other weights but keeps LN weights in place)
-    processed_state = bridge_processed.original_model.state_dict()
-    ln_keys_after = [k for k in processed_state.keys() if ("ln_1" in k or "ln_f" in k)]
-
-    # Layer norm weights should still exist (they are folded into other weights, not removed from state dict)
-    assert (
-        len(ln_keys_after) > 0
-    ), f"Layer norm weights should still exist after folding. Found: {len(ln_keys_after)} keys"
-
-    # Verify that the LN weights have been set to identity (weight=1, bias=0)
-    # This is the expected result of folding
-    for key in ln_keys_after:
-        if "weight" in key:
-            # After folding, LN weights should be all 1s
-            weight = processed_state[key]
-            assert torch.allclose(
-                weight, torch.ones_like(weight)
-            ), f"{key} should be all 1s after folding"
-        elif "bias" in key:
-            # After folding, LN biases should be all 0s
-            bias = processed_state[key]
-            assert torch.allclose(
-                bias, torch.zeros_like(bias)
-            ), f"{key} should be all 0s after folding"
-
-
-if __name__ == "__main__":
-    success = test_integration_compatibility()
-    if success:
-        print("\n🚀 INTEGRATION READY FOR PRODUCTION! 🚀")
-
-    # Run the comprehensive weight processing test
-    test_weight_processing_results_loaded_into_model()
diff --git a/tests/integration/model_bridge/test_weight_processing_math.py b/tests/integration/model_bridge/test_weight_processing_math.py
new file mode 100644
index 000000000..ea579e5ae
--- /dev/null
+++ b/tests/integration/model_bridge/test_weight_processing_math.py
@@ -0,0 +1,164 @@
+"""Test mathematical correctness of weight processing operations.
+
+Verifies that weight processing transformations produce the expected
+mathematical properties, not just that they run without error.
+Uses distilgpt2 (CI-cached).
+"""
+
+import pytest
+import torch
+
+from transformer_lens.model_bridge.bridge import TransformerBridge
+
+
+@pytest.fixture(scope="module")
+def bridge_fold_ln():
+    """Bridge with only fold_ln applied."""
+    bridge = TransformerBridge.boot_transformers("distilgpt2", device="cpu")
+    bridge.process_weights(
+        fold_ln=True,
+        center_writing_weights=False,
+        center_unembed=False,
+        fold_value_biases=False,
+        refactor_factored_attn_matrices=False,
+    )
+    bridge.enable_compatibility_mode()
+    return bridge
+
+
+@pytest.fixture(scope="module")
+def bridge_center_writing():
+    """Bridge with fold_ln + center_writing_weights applied."""
+    bridge = TransformerBridge.boot_transformers("distilgpt2", device="cpu")
+    bridge.process_weights(
+        fold_ln=True,
+        center_writing_weights=True,
+        center_unembed=False,
+        fold_value_biases=False,
+        refactor_factored_attn_matrices=False,
+    )
+    bridge.enable_compatibility_mode()
+    return bridge
+
+
+@pytest.fixture(scope="module")
+def bridge_center_unembed():
+    """Bridge with fold_ln + center_writing + center_unembed applied."""
+    bridge = TransformerBridge.boot_transformers("distilgpt2", device="cpu")
+    bridge.process_weights(
+        fold_ln=True,
+        center_writing_weights=True,
+        center_unembed=True,
+        fold_value_biases=False,
+        refactor_factored_attn_matrices=False,
+    )
+    bridge.enable_compatibility_mode()
+    return bridge
+
+
+@pytest.fixture(scope="module")
+def bridge_fold_value_biases():
+    """Bridge with all processing applied."""
+    bridge = TransformerBridge.boot_transformers("distilgpt2", device="cpu")
+    bridge.enable_compatibility_mode()
+    return bridge
+
+
+class TestFoldLayerNorm:
+    """After fold_ln, LayerNorm weights should be identity (w=1, b=0)."""
+
+    def test_ln1_weights_are_ones(self, bridge_fold_ln):
+        """After folding, ln1.w should be all ones."""
+        checked = 0
+        for i in range(bridge_fold_ln.cfg.n_layers):
+            block = bridge_fold_ln.blocks[i]
+            ln = block.ln1.original_component
+            assert torch.allclose(
+                ln.weight, torch.ones_like(ln.weight), atol=1e-6
+            ), f"Layer {i} ln1.weight should be ones after fold_ln"
+            checked += 1
+        assert checked > 0, "No ln1 weights were checked — test is vacuous"
+
+    def test_ln1_biases_are_zeros(self, bridge_fold_ln):
+        """After folding, ln1.b should be all zeros."""
+        checked = 0
+        for i in range(bridge_fold_ln.cfg.n_layers):
+            block = bridge_fold_ln.blocks[i]
+            ln = block.ln1.original_component
+            if ln.bias is not None:
+                assert torch.allclose(
+                    ln.bias, torch.zeros_like(ln.bias), atol=1e-6
+                ), f"Layer {i} ln1.bias should be zeros after fold_ln"
+                checked += 1
+        assert checked > 0, "No ln1 biases were checked — test is vacuous"
+
+    def test_ln2_weights_are_ones(self, bridge_fold_ln):
+        """After folding, ln2.w should be all ones."""
+        checked = 0
+        for i in range(bridge_fold_ln.cfg.n_layers):
+            block = bridge_fold_ln.blocks[i]
+            ln = block.ln2.original_component
+            assert torch.allclose(
+                ln.weight, torch.ones_like(ln.weight), atol=1e-6
+            ), f"Layer {i} ln2.weight should be ones after fold_ln"
+            checked += 1
+        assert checked > 0, "No ln2 weights were checked — test is vacuous"
+
+    def test_ln_final_weights_are_ones(self, bridge_fold_ln):
+        """After folding, ln_final.w should be all ones."""
+        ln = bridge_fold_ln.ln_final.original_component
+        assert torch.allclose(
+            ln.weight, torch.ones_like(ln.weight), atol=1e-6
+        ), "ln_final.weight should be ones after fold_ln"
+
+    def test_fold_preserves_output(self, bridge_fold_ln):
+        """Folding should not change model output (mathematically equivalent)."""
+        # Compare against an unprocessed bridge
+        bridge_unproc = TransformerBridge.boot_transformers("distilgpt2", device="cpu")
+        bridge_unproc.process_weights(
+            fold_ln=False,
+            center_writing_weights=False,
+            center_unembed=False,
+            fold_value_biases=False,
+            refactor_factored_attn_matrices=False,
+        )
+        bridge_unproc.enable_compatibility_mode()
+
+        text = "The quick brown fox"
+        with torch.no_grad():
+            folded_loss = bridge_fold_ln(text, return_type="loss").item()
+            unfolded_loss = bridge_unproc(text, return_type="loss").item()
+
+        # Folding is mathematically equivalent — output should be very close
+        assert abs(folded_loss - unfolded_loss) < 0.01, (
+            f"fold_ln should not change output: folded={folded_loss:.6f}, "
+            f"unfolded={unfolded_loss:.6f}"
+        )
+
+
+class TestCenterWritingWeights:
+    """After center_writing_weights, writing weights should have zero column mean."""
+
+    def test_W_O_centered(self, bridge_center_writing):
+        """W_O columns should have zero mean after centering."""
+        W_O = bridge_center_writing.W_O  # [n_layers, n_heads, d_head, d_model]
+        # Mean along the output dimension (d_model) should be ~0
+        # W_O writes to the residual stream along d_model
+        col_mean = W_O.mean(dim=-1)  # [n_layers, n_heads, d_head]
+        assert torch.allclose(
+            col_mean, torch.zeros_like(col_mean), atol=1e-5
+        ), f"W_O column mean should be ~0 after centering, max: {col_mean.abs().max():.6f}"
+
+
+class TestCenterUnembed:
+    """After center_unembed, unembed weights should have zero row mean."""
+
+    def test_unembed_rows_centered(self, bridge_center_unembed):
+        """W_U rows should have zero mean after centering."""
+        # W_U shape: [d_model, d_vocab] — rows are indexed by d_model
+        # center_unembed subtracts the mean along d_vocab (columns)
+        W_U = bridge_center_unembed.unembed.W_U  # [d_model, d_vocab]
+        col_mean = W_U.mean(dim=-1)  # [d_model]
+        assert torch.allclose(
+            col_mean, torch.zeros_like(col_mean), atol=1e-5
+        ), f"W_U column mean should be ~0 after centering, max: {col_mean.abs().max():.6f}"
diff --git a/tests/unit/model_bridge/compatibility/test_svd_interpreter.py b/tests/unit/model_bridge/compatibility/test_svd_interpreter.py
index 22e171b86..520004db2 100644
--- a/tests/unit/model_bridge/compatibility/test_svd_interpreter.py
+++ b/tests/unit/model_bridge/compatibility/test_svd_interpreter.py
@@ -132,8 +132,10 @@ def test_svd_interpreter_returns_different_answers_for_different_models(model, s
 
 
 def test_svd_interpreter_fails_on_invalid_vector_type(model):
+    from typeguard import TypeCheckError
+
     svd_interpreter = SVDInterpreter(model)
-    with pytest.raises(BeartypeCallHintParamViolation):
+    with pytest.raises((BeartypeCallHintParamViolation, TypeCheckError)):
         svd_interpreter.get_singular_vectors("test", layer_index=0, num_vectors=4, head_index=0)
 
 
diff --git a/tests/unit/model_bridge/test_component_inspection.py b/tests/unit/model_bridge/test_component_inspection.py
index 0fdd74b93..d0f76ac30 100644
--- a/tests/unit/model_bridge/test_component_inspection.py
+++ b/tests/unit/model_bridge/test_component_inspection.py
@@ -1,4 +1,8 @@
-"""Unit tests for bridge component inspection functionality."""
+"""Unit tests for bridge component access and properties.
+
+Tests that TransformerBridge exposes components correctly through its own API,
+not just through the underlying HuggingFace model. Uses distilgpt2 (CI-cached).
+"""
 
 import pytest
 import torch
@@ -6,189 +10,139 @@
 from transformer_lens.model_bridge.bridge import TransformerBridge
 
 
-class TestBridgeComponentInspection:
-    """Test inspection of bridge components and their properties."""
-
-    @pytest.fixture
-    def bridge(self):
-        """Create a TransformerBridge for testing."""
-        return TransformerBridge.boot_transformers("gpt2", device="cpu")
-
-    def test_bridge_has_required_components(self, bridge):
-        """Test that bridge has all required transformer components."""
-        # Check main transformer structure
-        assert hasattr(bridge.original_model, "transformer"), "Should have transformer module"
-        transformer = bridge.original_model.transformer
-
-        # Check core components
-        assert hasattr(transformer, "wte"), "Should have token embedding (wte)"
-        assert hasattr(transformer, "wpe"), "Should have position embedding (wpe)"
-        assert hasattr(transformer, "h"), "Should have transformer layers (h)"
-        assert hasattr(transformer, "ln_f"), "Should have final layer norm (ln_f)"
-        assert hasattr(bridge.original_model, "lm_head"), "Should have language model head"
-
-    def test_transformer_layers_structure(self, bridge):
-        """Test the structure of transformer layers."""
-        layers = bridge.original_model.transformer.h
-        assert len(layers) > 0, "Should have at least one transformer layer"
-
-        # Check first layer structure
-        layer_0 = layers[0]
-        assert hasattr(layer_0, "ln_1"), "Layer should have first layer norm"
-        assert hasattr(layer_0, "attn"), "Layer should have attention"
-        assert hasattr(layer_0, "ln_2"), "Layer should have second layer norm"
-        assert hasattr(layer_0, "mlp"), "Layer should have MLP"
-
-        # Check that all layers have consistent structure
-        for i, layer in enumerate(layers):
-            assert hasattr(layer, "ln_1"), f"Layer {i} should have ln_1"
-            assert hasattr(layer, "attn"), f"Layer {i} should have attn"
-            assert hasattr(layer, "ln_2"), f"Layer {i} should have ln_2"
-            assert hasattr(layer, "mlp"), f"Layer {i} should have mlp"
-
-    def test_attention_component_structure(self, bridge):
-        """Test the structure of attention components."""
-        attn = bridge.original_model.transformer.h[0].attn
-
-        # GPT-2 style attention should have these components
-        expected_attrs = ["c_attn", "c_proj"]  # GPT-2 specific naming
-        for attr in expected_attrs:
-            assert hasattr(attn, attr), f"Attention should have {attr}"
-
-        # Check weight shapes are reasonable
-        c_attn = attn.c_attn
-        if hasattr(c_attn, "weight"):
-            weight_shape = c_attn.weight.shape
-            assert len(weight_shape) == 2, f"c_attn weight should be 2D: {weight_shape}"
-            assert (
-                weight_shape[0] > 0 and weight_shape[1] > 0
-            ), f"Weight should have positive dimensions: {weight_shape}"
-
-    def test_mlp_component_structure(self, bridge):
-        """Test the structure of MLP components."""
-        mlp = bridge.original_model.transformer.h[0].mlp
-
-        # GPT-2 style MLP should have these components
-        expected_attrs = ["c_fc", "c_proj"]  # GPT-2 specific naming
-        for attr in expected_attrs:
-            assert hasattr(mlp, attr), f"MLP should have {attr}"
-
-        # Check weight shapes
-        c_fc = mlp.c_fc
-        if hasattr(c_fc, "weight"):
-            weight_shape = c_fc.weight.shape
-            assert len(weight_shape) == 2, f"c_fc weight should be 2D: {weight_shape}"
-
-    def test_embedding_components(self, bridge):
-        """Test embedding component properties."""
-        transformer = bridge.original_model.transformer
-
-        # Token embedding
-        wte = transformer.wte
-        assert hasattr(wte, "weight"), "Token embedding should have weight"
-        wte_shape = wte.weight.shape
-        assert len(wte_shape) == 2, f"Token embedding should be 2D: {wte_shape}"
-        assert (
-            wte_shape[0] > 0 and wte_shape[1] > 0
-        ), "Token embedding should have positive dimensions"
-
-        # Position embedding
-        wpe = transformer.wpe
-        assert hasattr(wpe, "weight"), "Position embedding should have weight"
-        wpe_shape = wpe.weight.shape
-        assert len(wpe_shape) == 2, f"Position embedding should be 2D: {wpe_shape}"
-        assert (
-            wpe_shape[1] == wte_shape[1]
-        ), "Position and token embeddings should have same hidden dimension"
-
-    def test_lm_head_structure(self, bridge):
-        """Test language model head structure."""
-        lm_head = bridge.original_model.lm_head
-        assert hasattr(lm_head, "weight"), "LM head should have weight"
-
-        lm_head_shape = lm_head.weight.shape
-        assert len(lm_head_shape) == 2, f"LM head should be 2D: {lm_head_shape}"
-
-        # LM head vocab size should match token embedding
-        wte_shape = bridge.original_model.transformer.wte.weight.shape
-        assert (
-            lm_head_shape[0] == wte_shape[0]
-        ), "LM head and token embedding should have same vocab size"
-
-    def test_component_types(self, bridge):
-        """Test that components are of expected PyTorch types."""
-        transformer = bridge.original_model.transformer
-
-        # All components should be nn.Module subclasses
-        assert isinstance(transformer.wte, torch.nn.Module), "Token embedding should be nn.Module"
-        assert isinstance(
-            transformer.wpe, torch.nn.Module
-        ), "Position embedding should be nn.Module"
-        assert isinstance(transformer.ln_f, torch.nn.Module), "Final layer norm should be nn.Module"
-
-        # Layer components
-        layer_0 = transformer.h[0]
-        assert isinstance(layer_0.ln_1, torch.nn.Module), "Layer norm 1 should be nn.Module"
-        assert isinstance(layer_0.attn, torch.nn.Module), "Attention should be nn.Module"
-        assert isinstance(layer_0.ln_2, torch.nn.Module), "Layer norm 2 should be nn.Module"
-        assert isinstance(layer_0.mlp, torch.nn.Module), "MLP should be nn.Module"
-
-    def test_parameter_devices(self, bridge):
-        """Test that all parameters are on the expected device."""
-        expected_device = torch.device("cpu")
-
-        # Check embedding parameters
-        transformer = bridge.original_model.transformer
-        assert transformer.wte.weight.device == expected_device, "Token embedding should be on CPU"
-        assert (
-            transformer.wpe.weight.device == expected_device
-        ), "Position embedding should be on CPU"
-
-        # Check layer parameters
-        layer_0 = transformer.h[0]
-        for name, param in layer_0.named_parameters():
-            assert (
-                param.device == expected_device
-            ), f"Parameter {name} should be on CPU, got {param.device}"
-
-        # Check LM head
-        assert (
-            bridge.original_model.lm_head.weight.device == expected_device
-        ), "LM head should be on CPU"
-
-    def test_parameter_dtypes(self, bridge):
-        """Test that parameters have expected data types."""
-        # Most parameters should be float32 or float16
-        valid_dtypes = {torch.float32, torch.float16, torch.bfloat16}
-
-        transformer = bridge.original_model.transformer
-
-        # Check key parameters
-        assert (
-            transformer.wte.weight.dtype in valid_dtypes
-        ), f"Token embedding dtype: {transformer.wte.weight.dtype}"
-        assert (
-            transformer.wpe.weight.dtype in valid_dtypes
-        ), f"Position embedding dtype: {transformer.wpe.weight.dtype}"
-
-        # Check layer 0 parameters
-        for name, param in transformer.h[0].named_parameters():
-            assert (
-                param.dtype in valid_dtypes
-            ), f"Parameter {name} has unexpected dtype: {param.dtype}"
-
-    def test_model_configuration_accessible(self, bridge):
-        """Test that model configuration is accessible."""
-        # Should have access to the original model's config
-        assert hasattr(bridge.original_model, "config"), "Model should have configuration"
-
-        config = bridge.original_model.config
-        assert hasattr(config, "n_layer"), "Config should specify number of layers"
-        assert hasattr(config, "n_head"), "Config should specify number of heads"
-        assert hasattr(config, "n_embd"), "Config should specify embedding dimension"
-
-        # Verify config matches actual model structure
-        actual_layers = len(bridge.original_model.transformer.h)
-        assert (
-            config.n_layer == actual_layers
-        ), f"Config layers ({config.n_layer}) should match actual ({actual_layers})"
+@pytest.fixture(scope="module")
+def bridge():
+    """Create a TransformerBridge for testing."""
+    return TransformerBridge.boot_transformers("distilgpt2", device="cpu")
+
+
+@pytest.fixture(scope="module")
+def bridge_compat():
+    """Create a TransformerBridge with compatibility mode for weight property tests."""
+    b = TransformerBridge.boot_transformers("distilgpt2", device="cpu")
+    b.enable_compatibility_mode()
+    return b
+
+
+class TestBridgeComponentAccess:
+    """Test that bridge exposes components through its own API."""
+
+    def test_blocks_accessible_and_indexed(self, bridge):
+        """Bridge blocks should be accessible by index."""
+        assert hasattr(bridge, "blocks"), "Bridge should have blocks attribute"
+        assert len(bridge.blocks) == bridge.cfg.n_layers
+        block_0 = bridge.blocks[0]
+        assert block_0 is not None
+
+    def test_block_has_attn_and_mlp(self, bridge):
+        """Each block should have attention and MLP subcomponents."""
+        block = bridge.blocks[0]
+        assert hasattr(block, "attn"), "Block should have attn"
+        assert hasattr(block, "mlp"), "Block should have mlp"
+        assert hasattr(block, "ln1"), "Block should have ln1"
+        assert hasattr(block, "ln2"), "Block should have ln2"
+
+    def test_embed_accessible(self, bridge):
+        """Token embedding should be accessible."""
+        assert hasattr(bridge, "embed"), "Bridge should have embed"
+
+    def test_unembed_accessible(self, bridge):
+        """Unembedding should be accessible."""
+        assert hasattr(bridge, "unembed"), "Bridge should have unembed"
+
+    def test_ln_final_accessible(self, bridge):
+        """Final layer norm should be accessible."""
+        assert hasattr(bridge, "ln_final"), "Bridge should have ln_final"
+
+    def test_cfg_accessible(self, bridge):
+        """Config should be accessible with expected fields."""
+        cfg = bridge.cfg
+        assert cfg.n_layers > 0
+        assert cfg.n_heads > 0
+        assert cfg.d_model > 0
+        assert cfg.d_vocab > 0
+
+    def test_tokenizer_accessible(self, bridge):
+        """Tokenizer should be accessible."""
+        assert bridge.tokenizer is not None
+        tokens = bridge.to_tokens("Hello world")
+        assert tokens.shape[0] == 1  # batch dim
+        assert tokens.shape[1] > 0  # seq dim
+
+
+class TestBridgeForwardPass:
+    """Test that bridge produces valid outputs."""
+
+    def test_forward_returns_logits(self, bridge):
+        """Forward pass should return logits tensor."""
+        with torch.no_grad():
+            logits = bridge("Hello world", return_type="logits")
+        assert logits.shape == (1, 3, bridge.cfg.d_vocab)  # "Hello world" = 3 tokens with BOS
+        assert not torch.isnan(logits).any()
+        assert not torch.isinf(logits).any()
+
+    def test_forward_returns_loss(self, bridge):
+        """Forward pass should return reasonable loss."""
+        with torch.no_grad():
+            loss = bridge("The cat sat on the mat", return_type="loss")
+        assert loss.ndim == 0  # scalar
+        assert 0 < loss.item() < 15
+
+    def test_run_with_cache_returns_activations(self, bridge):
+        """run_with_cache should return non-empty cache."""
+        with torch.no_grad():
+            _, cache = bridge.run_with_cache("Hello")
+        assert len(cache) > 0
+        # Should have block-level hooks
+        block_keys = [k for k in cache.keys() if "blocks.0" in k]
+        assert len(block_keys) > 0
+
+
+class TestBridgeWeightProperties:
+    """Test weight property accessors on bridge with compatibility mode."""
+
+    def test_W_Q_shape(self, bridge_compat):
+        """W_Q should have shape [n_layers, n_heads, d_model, d_head]."""
+        W_Q = bridge_compat.W_Q
+        cfg = bridge_compat.cfg
+        assert W_Q.shape == (cfg.n_layers, cfg.n_heads, cfg.d_model, cfg.d_head)
+
+    def test_W_K_shape(self, bridge_compat):
+        """W_K should have shape [n_layers, n_heads, d_model, d_head]."""
+        W_K = bridge_compat.W_K
+        cfg = bridge_compat.cfg
+        assert W_K.shape == (cfg.n_layers, cfg.n_heads, cfg.d_model, cfg.d_head)
+
+    def test_W_V_shape(self, bridge_compat):
+        """W_V should have shape [n_layers, n_heads, d_model, d_head]."""
+        W_V = bridge_compat.W_V
+        cfg = bridge_compat.cfg
+        assert W_V.shape == (cfg.n_layers, cfg.n_heads, cfg.d_model, cfg.d_head)
+
+    def test_W_O_shape(self, bridge_compat):
+        """W_O should have shape [n_layers, n_heads, d_head, d_model]."""
+        W_O = bridge_compat.W_O
+        cfg = bridge_compat.cfg
+        assert W_O.shape == (cfg.n_layers, cfg.n_heads, cfg.d_head, cfg.d_model)
+
+    def test_QK_factored_matrix(self, bridge_compat):
+        """QK property should return a functional FactoredMatrix."""
+        QK = bridge_compat.QK
+        assert QK is not None
+        # FactoredMatrix should have A and B with correct shapes
+        cfg = bridge_compat.cfg
+        assert QK.A.shape == (cfg.n_layers, cfg.n_heads, cfg.d_model, cfg.d_head)
+        assert QK.B.shape == (cfg.n_layers, cfg.n_heads, cfg.d_head, cfg.d_model)
+        # Should be computable (not contain NaN)
+        assert not torch.isnan(QK.A).any()
+        assert not torch.isnan(QK.B).any()
+
+    def test_OV_factored_matrix(self, bridge_compat):
+        """OV property should return a functional FactoredMatrix."""
+        OV = bridge_compat.OV
+        assert OV is not None
+        cfg = bridge_compat.cfg
+        assert OV.A.shape == (cfg.n_layers, cfg.n_heads, cfg.d_model, cfg.d_head)
+        assert OV.B.shape == (cfg.n_layers, cfg.n_heads, cfg.d_head, cfg.d_model)
+        assert not torch.isnan(OV.A).any()
+        assert not torch.isnan(OV.B).any()
diff --git a/tests/unit/model_bridge/test_processweights_with_adapter.py b/tests/unit/model_bridge/test_weight_processing_adapter_paths.py
similarity index 100%
rename from tests/unit/model_bridge/test_processweights_with_adapter.py
rename to tests/unit/model_bridge/test_weight_processing_adapter_paths.py
diff --git a/transformer_lens/components/abstract_attention.py b/transformer_lens/components/abstract_attention.py
index 2b60671f0..5b744ca0f 100644
--- a/transformer_lens/components/abstract_attention.py
+++ b/transformer_lens/components/abstract_attention.py
@@ -57,8 +57,12 @@ def __init__(
 
         if self.cfg.load_in_4bit:
             nq = int((self.cfg.d_model * self.cfg.d_head * self.cfg.n_heads) / 2)
-            self.W_Q = Params4bit(torch.empty(nq, 1, dtype=torch.uint8), requires_grad=False)
-            self.W_O = Params4bit(torch.empty(nq, 1, dtype=torch.uint8), requires_grad=False)
+            self.W_Q: Union[nn.Parameter, "Params4bit"] = Params4bit(
+                torch.empty(nq, 1, dtype=torch.uint8), requires_grad=False
+            )
+            self.W_O: Union[nn.Parameter, "Params4bit"] = Params4bit(
+                torch.empty(nq, 1, dtype=torch.uint8), requires_grad=False
+            )
         else:
             self.W_Q = nn.Parameter(
                 torch.empty(
@@ -333,13 +337,13 @@ def forward(
         if not self.cfg.use_attn_result:
             if self.cfg.load_in_4bit:
                 # call bitsandbytes method to dequantize and multiply
+                W_O_4bit = cast(Params4bit, self.W_O)
                 out = (
                     bnb.matmul_4bit(
                         z.reshape(z.shape[0], z.shape[1], self.cfg.d_head * self.cfg.n_heads),
-                        self.W_O.t(),
-                        # bias=self.W_O.t(),
+                        W_O_4bit.t(),
                         bias=None,
-                        quant_state=self.W_O.quant_state,
+                        quant_state=W_O_4bit.quant_state,
                     )
                     + self.b_O
                 )
@@ -372,12 +376,13 @@ def forward(
             # Explicitly calculate the attention result so it can be accessed by a hook
             # This is off by default because it can easily eat through your GPU memory.
             if self.cfg.load_in_4bit:
+                W_O_4bit = cast(Params4bit, self.W_O)
                 result = self.hook_result(
                     bnb.matmul_4bit(
                         z.reshape(z.shape[0], z.shape[1], self.cfg.d_head * self.cfg.n_heads),
-                        self.W_O.t(),
+                        W_O_4bit.t(),
                         bias=None,
-                        quant_state=self.W_O.quant_state,
+                        quant_state=W_O_4bit.quant_state,
                     )
                 )
             else:
@@ -447,13 +452,14 @@ def calculate_qkv_matrices(
             else simple_attn_linear
         )
         if self.cfg.load_in_4bit:
+            W_Q_4bit = cast(Params4bit, self.W_Q)
             q = self.hook_q(
                 # call bitsandbytes method to dequantize and multiply
                 bnb.matmul_4bit(
                     query_input,
-                    self.W_Q.t(),
+                    W_Q_4bit.t(),
                     bias=None,
-                    quant_state=self.W_Q.quant_state,
+                    quant_state=W_Q_4bit.quant_state,
                 ).reshape(
                     query_input.shape[0],
                     query_input.shape[1],
diff --git a/transformer_lens/model_bridge/generalized_components/attention.py b/transformer_lens/model_bridge/generalized_components/attention.py
index b8f929244..e3bcc7ed2 100644
--- a/transformer_lens/model_bridge/generalized_components/attention.py
+++ b/transformer_lens/model_bridge/generalized_components/attention.py
@@ -357,7 +357,7 @@ def _apply_attn_dropout(self, attn_weights: torch.Tensor) -> torch.Tensor:
             dropout_fn = getattr(self.original_component, "attn_dropout", None)
             if dropout_fn is None:
                 dropout_fn = getattr(self.original_component, "attention_dropout", None)
-            if dropout_fn is not None:
+            if dropout_fn is not None and callable(dropout_fn):
                 attn_weights = dropout_fn(attn_weights)
         return attn_weights