[autorevert] combine test outcomes across shards, prioritizing failures (#7428)

izaitsevfb · web-flow · commit 0021389ebfa1 · 2025-11-05T10:29:33.000-08:00
This PR changes how test outcomes are aggregated across shards. Ideally this aggregation should not be needed (if we had perfect shard-stripping logic), but this change acts as a "safety net" when: * the same test runs in multiple shards for some reason * autorevert's shard stripping is imprecise For the latter, we need a follow-up PR to improve the shard-stripping logic (but that is more involved). More context: - The extractor merges shards/partitions by a “base” job name that strips the parentheses entirely (so “dynamo_wrapped, default, crossref” all collapse to the same base). - It builds one per-test outcome per (commit, workflow, base, wf_run_id, attempt, test_id), but stores only the last-seen job’s outcome, overwriting any earlier job’s outcome for the same test group (`tests_by_group_attempt[key] = outcome` uses no job_id in the key). - In this run, the same test both failed (dynamo_wrapped shard) and succeeded (default/crossref shards). Because the base collapses all three into one, and the last-seen row may be from a successful partition, the failure gets overwritten by success. That’s exactly what the 2025‑10‑31 16:13:17 state shows: - E.g. for pull:nn/test_pooling.py::test_max_pool_nan_inf_cpu_float32 at commit 7d39401fa07e…, the state has status “success” with job_id 54151038879 or 54151220194 under run 18961825600. - Base-name normalization merges different test partitions (dynamo_wrapped/default/crossref) into one base, and only the last per-test outcome seen is kept for a given run/attempt. This can (non‑deterministically) hide failures when other partitions record a success for the same test.
### Testing ``` python -m pytorch_auto_revert --dry-run autorevert-checker Lint trunk pull inductor --hours 18 --hud-html ``` Before: [2025-10-31_16-13-17.html](https://github.com/user-attachments/files/23267036/2025-10-31_16-13-17.html) After: [2025-10-31T16-47-04.160494-00-00.html](https://github.com/user-attachments/files/23267038/2025-10-31T16-47-04.160494-00-00.html)
diff --git a/aws/lambda/pytorch-auto-revert/pytorch_auto_revert/signal_extraction.py b/aws/lambda/pytorch-auto-revert/pytorch_auto_revert/signal_extraction.py
@@ -321,7 +321,17 @@ def _build_test_signals(
                 started_at=job.started_at,
                 job_id=int(tr.job_id),
             )
+            # Combine outcomes across shards/partitions for the same group key.
+            # Outcome with failures takes precedence.
+            # This supports the rare case where a test appears in multiple shards;
+            # usually that indicates that our base_name normalization is imperfect.
+            existing = tests_by_group_attempt.get(key)
+            if existing is not None and existing.failure_runs > 0:
+                outcome = existing
+
             tests_by_group_attempt[key] = outcome
+
+            # Track keys that have at least one failure across any shard
             if outcome.failure_runs > 0:
                 failing_tests_by_job_base_name.add(
                     (job.workflow_name, job_base_name, tr.test_id)
diff --git a/aws/lambda/pytorch-auto-revert/pytorch_auto_revert/tests/test_signal_extraction.py b/aws/lambda/pytorch-auto-revert/pytorch_auto_revert/tests/test_signal_extraction.py
@@ -525,6 +525,155 @@ def test_inject_pending_workflow_event_when_missing_in_signal(self):
         self.assertEqual(c_new.events[0].status, SignalStatus.PENDING)
         self.assertEqual(c_new.events[0].wf_run_id, 200)
 
+    def test_test_track_combines_shards_failure_wins(self):
+        # Same commit/run/attempt/test_id present on two shards of the same base:
+        # - shard A (job 1001) reports failure
+        # - shard B (job 1002) reports success
+        # When combining, the attempt should yield exactly one FAILURE event;
+        # the SUCCESS observed on the other shard must not override it.
+        base_name = "linux-test (dynamo_wrapped, 1, 3)"  # numeric tokens may be ignored/merged by base logic
+
+        jobs = [
+            # Both jobs share the same commit, workflow run, attempt and base name
+            J(
+                sha="C",
+                run=5000,
+                job=1001,
+                attempt=1,
+                name=base_name,
+                started_at=ts(self.t0, 10),
+                conclusion="failure",
+                rule="pytest failure",  # marks base as test-track candidate
+            ),
+            J(
+                sha="C",
+                run=5000,
+                job=1002,
+                attempt=1,
+                name=base_name,
+                started_at=ts(self.t0, 12),
+                conclusion="success",
+                rule="",
+            ),
+        ]
+
+        tests = [
+            # Same test id across two shards: one success and one failure
+            T(
+                job=1001,
+                run=5000,
+                attempt=1,
+                file="h.py",
+                name="test_merge",
+                failure_runs=1,
+                success_runs=0,
+            ),
+            T(
+                job=1002,
+                run=5000,
+                attempt=1,
+                file="h.py",
+                name="test_merge",
+                failure_runs=0,
+                success_runs=1,
+            ),
+        ]
+
+        signals = self._extract(jobs, tests)
+        test_sig = self._find_test_signal(signals, "trunk", "h.py::test_merge")
+        self.assertIsNotNone(test_sig)
+        # Single commit present
+        self.assertEqual([c.head_sha for c in test_sig.commits], ["C"])
+        events = test_sig.commits[0].events
+        # Expect exactly one FAILURE event for the attempt (failure dominates)
+        self.assertEqual(len(events), 1)
+        self.assertEqual(events[0].status, SignalStatus.FAILURE)
+
+    def test_test_track_combines_shards_success_single_event(self):
+        # Two shards for the same test attempt both succeed; ensure we emit
+        # a single SUCCESS event for that attempt (no duplication).
+        base_name = "linux-test (dynamo_wrapped, 1, 3)"
+
+        jobs = [
+            # Mark base as test-track by using a test failure classification on one job
+            # (selection signal only). Actual test rows are successes.
+            J(
+                sha="C2",
+                run=6000,
+                job=1101,
+                attempt=1,
+                name=base_name,
+                started_at=ts(self.t0, 10),
+                conclusion="failure",
+                rule="pytest failure",
+            ),
+            J(
+                sha="C2",
+                run=6000,
+                job=1102,
+                attempt=1,
+                name=base_name,
+                started_at=ts(self.t0, 12),
+                conclusion="success",
+                rule="",
+            ),
+            # Older commit with the same test failing at least once so the test id is included
+            J(
+                sha="C1",
+                run=5990,
+                job=1103,
+                attempt=1,
+                name=base_name,
+                started_at=ts(self.t0, 1),
+                conclusion="failure",
+                rule="pytest failure",
+            ),
+        ]
+
+        tests = [
+            # Both shards report success for the same test id
+            T(
+                job=1101,
+                run=6000,
+                attempt=1,
+                file="h2.py",
+                name="test_merge_success",
+                failure_runs=0,
+                success_runs=1,
+            ),
+            T(
+                job=1102,
+                run=6000,
+                attempt=1,
+                file="h2.py",
+                name="test_merge_success",
+                failure_runs=0,
+                success_runs=1,
+            ),
+            # Older commit has a failure for the same test id to ensure inclusion
+            T(
+                job=1103,
+                run=5990,
+                attempt=1,
+                file="h2.py",
+                name="test_merge_success",
+                failure_runs=1,
+                success_runs=0,
+            ),
+        ]
+
+        signals = self._extract(jobs, tests)
+        test_sig = self._find_test_signal(signals, "trunk", "h2.py::test_merge_success")
+        self.assertIsNotNone(test_sig)
+        # Should contain both commits, newest first
+        self.assertEqual([c.head_sha for c in test_sig.commits], ["C2", "C1"])
+        # Find C2 and verify only a single SUCCESS event is emitted for the attempt
+        c2 = test_sig.commits[0]
+        events = c2.events
+        # Expect exactly one SUCCESS event for the attempt
+        self.assertEqual(len(events), 1)
+        self.assertEqual(events[0].status, SignalStatus.SUCCESS)
+
 
 if __name__ == "__main__":
     unittest.main()