Skip to content

Commit 0021389

Browse files
authored
[autorevert] combine test outcomes across shards, prioritizing failures (#7428)
This PR changes the way how test outcome is aggregated across shards. Ideally this aggregation should not happen (if we had a perfect shard stripping logic), but this change should act as a "safety net" when: * same test runs in multiple shards for some reason * autorevert sharding stripping is imprecise For the latter, we need a follow up PR to improve the sharding stripping logic (but that is more involved). More context: - The extractor merges shards/partitions by a “base” job name that strips the parentheses entirely (so “dynamo_wrapped, default, crossref” all collapse to the same base): - It builds one per-test outcome per (commit, workflow, base, wf_run_id, attempt, test_id), but stores only the last seen job’s outcome (overwriting any earlier job’s outcome for the same test group): tests_by_group_attempt[key] = outcome uses no job_id in the key) - In this run, the same test both failed (dynamo_wrapped shard) and succeeded (default/crossref shards). Because the base collapses all three into one, and the last-seen row may be from a successful partition, the failure gets overwritten by success. That’s exactly what the 2025‑10‑31 16:13:17 state shows: - E.g. for pull:nn/test_pooling.py::test_max_pool_nan_inf_cpu_float32 at commit 7d39401fa07e…, the state has status “success” with job_id 54151038879 or 54151220194 under run 18961825600. - Base-name normalization merges different test partitions (dynamo_wrapped/default/crossref) into one base, and only the last per-test outcome seen is kept for a given run/attempt. This can (non‑deterministically) hide failures when other partitions record a success for the same test. ### Testing ``` python -m pytorch_auto_revert --dry-run autorevert-checker Lint trunk pull inductor --hours 18 --hud-html ``` Before: [2025-10-31_16-13-17.html](https://github.com/user-attachments/files/23267036/2025-10-31_16-13-17.html) After: [2025-10-31T16-47-04.160494-00-00.html](https://github.com/user-attachments/files/23267038/2025-10-31T16-47-04.160494-00-00.html)
1 parent fde861b commit 0021389

File tree

2 files changed

+159
-0
lines changed

2 files changed

+159
-0
lines changed

aws/lambda/pytorch-auto-revert/pytorch_auto_revert/signal_extraction.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -321,7 +321,17 @@ def _build_test_signals(
321321
started_at=job.started_at,
322322
job_id=int(tr.job_id),
323323
)
324+
# Combine outcomes across shards/partitions for the same group key.
325+
# Outcome with failures takes precedence.
326+
# This is to support a rare case where a test appears in multiple shards
327+
# usually it indicates that our base_name normalization is not perfect.
328+
existing = tests_by_group_attempt.get(key)
329+
if existing is not None and existing.failure_runs > 0:
330+
outcome = existing
331+
324332
tests_by_group_attempt[key] = outcome
333+
334+
# Track keys that have at least one failure across any shard
325335
if outcome.failure_runs > 0:
326336
failing_tests_by_job_base_name.add(
327337
(job.workflow_name, job_base_name, tr.test_id)

aws/lambda/pytorch-auto-revert/pytorch_auto_revert/tests/test_signal_extraction.py

Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -525,6 +525,155 @@ def test_inject_pending_workflow_event_when_missing_in_signal(self):
525525
self.assertEqual(c_new.events[0].status, SignalStatus.PENDING)
526526
self.assertEqual(c_new.events[0].wf_run_id, 200)
527527

528+
def test_test_track_combines_shards_failure_wins(self):
529+
# Same commit/run/attempt/test_id present on two shards of the same base:
530+
# - shard A reports success
531+
# - shard B reports failure
532+
# When combining, the attempt should reflect a FAILURE event (and we still
533+
# retain SUCCESS if it was also observed).
534+
base_name = "linux-test (dynamo_wrapped, 1, 3)" # numeric tokens may be ignored/merged by base logic
535+
536+
jobs = [
537+
# Both jobs share the same commit, workflow run, attempt and base name
538+
J(
539+
sha="C",
540+
run=5000,
541+
job=1001,
542+
attempt=1,
543+
name=base_name,
544+
started_at=ts(self.t0, 10),
545+
conclusion="failure",
546+
rule="pytest failure", # marks base as test-track candidate
547+
),
548+
J(
549+
sha="C",
550+
run=5000,
551+
job=1002,
552+
attempt=1,
553+
name=base_name,
554+
started_at=ts(self.t0, 12),
555+
conclusion="success",
556+
rule="",
557+
),
558+
]
559+
560+
tests = [
561+
# Same test id across two shards: one success and one failure
562+
T(
563+
job=1001,
564+
run=5000,
565+
attempt=1,
566+
file="h.py",
567+
name="test_merge",
568+
failure_runs=1,
569+
success_runs=0,
570+
),
571+
T(
572+
job=1002,
573+
run=5000,
574+
attempt=1,
575+
file="h.py",
576+
name="test_merge",
577+
failure_runs=0,
578+
success_runs=1,
579+
),
580+
]
581+
582+
signals = self._extract(jobs, tests)
583+
test_sig = self._find_test_signal(signals, "trunk", "h.py::test_merge")
584+
self.assertIsNotNone(test_sig)
585+
# Single commit present
586+
self.assertEqual([c.head_sha for c in test_sig.commits], ["C"])
587+
events = test_sig.commits[0].events
588+
# Expect exactly one FAILURE event for the attempt (failure dominates)
589+
self.assertEqual(len(events), 1)
590+
self.assertEqual(events[0].status, SignalStatus.FAILURE)
591+
592+
def test_test_track_combines_shards_success_single_event(self):
593+
# Two shards for the same test attempt both succeed; ensure we emit
594+
# a single SUCCESS event for that attempt (no duplication).
595+
base_name = "linux-test (dynamo_wrapped, 1, 3)"
596+
597+
jobs = [
598+
# Mark base as test-track by using a test failure classification on one job
599+
# (selection signal only). Actual test rows are successes.
600+
J(
601+
sha="C2",
602+
run=6000,
603+
job=1101,
604+
attempt=1,
605+
name=base_name,
606+
started_at=ts(self.t0, 10),
607+
conclusion="failure",
608+
rule="pytest failure",
609+
),
610+
J(
611+
sha="C2",
612+
run=6000,
613+
job=1102,
614+
attempt=1,
615+
name=base_name,
616+
started_at=ts(self.t0, 12),
617+
conclusion="success",
618+
rule="",
619+
),
620+
# Older commit with the same test failing at least once so the test id is included
621+
J(
622+
sha="C1",
623+
run=5990,
624+
job=1103,
625+
attempt=1,
626+
name=base_name,
627+
started_at=ts(self.t0, 1),
628+
conclusion="failure",
629+
rule="pytest failure",
630+
),
631+
]
632+
633+
tests = [
634+
# Both shards report success for the same test id
635+
T(
636+
job=1101,
637+
run=6000,
638+
attempt=1,
639+
file="h2.py",
640+
name="test_merge_success",
641+
failure_runs=0,
642+
success_runs=1,
643+
),
644+
T(
645+
job=1102,
646+
run=6000,
647+
attempt=1,
648+
file="h2.py",
649+
name="test_merge_success",
650+
failure_runs=0,
651+
success_runs=1,
652+
),
653+
# Older commit has a failure for the same test id to ensure inclusion
654+
T(
655+
job=1103,
656+
run=5990,
657+
attempt=1,
658+
file="h2.py",
659+
name="test_merge_success",
660+
failure_runs=1,
661+
success_runs=0,
662+
),
663+
]
664+
665+
signals = self._extract(jobs, tests)
666+
test_sig = self._find_test_signal(signals, "trunk", "h2.py::test_merge_success")
667+
self.assertIsNotNone(test_sig)
668+
# Should contain both commits, newest first
669+
self.assertEqual([c.head_sha for c in test_sig.commits], ["C2", "C1"])
670+
# Find C2 and verify only a single SUCCESS event is emitted for the attempt
671+
c2 = test_sig.commits[0]
672+
events = c2.events
673+
# Expect exactly one SUCCESS event for the attempt
674+
self.assertEqual(len(events), 1)
675+
self.assertEqual(events[0].status, SignalStatus.SUCCESS)
676+
528677

529678
if __name__ == "__main__":
530679
unittest.main()

0 commit comments

Comments
 (0)