Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,10 @@ try to exploit benchmarking flaws to receive higher scores.
To benchmark a kernel, two ingredients are needed:
1. The qualified name of the kernel function. It is important that the testing script itself does not import the kernel function, as this implies executing untrusted code.
2. A function that generates test/benchmark inputs. This function takes keyword arguments of configuration parameters,
as well as the reserved argument `seed` to randomize the problem. It returns two tuples:
The first contains the inputs for the kernel and will
be used to call the kernel function, and the second contains the expected output and the required absolute and relative tolerance.
as well as the reserved argument `seed` to randomize the problem. It returns the kernel arguments.
Any writable / checked output must be wrapped in `pygpubench.out(...)` together
with its expected result and optional tolerances. For in/out args whose initial contents matter,
pass `uses_current_value=True`.

```python
import torch
Expand All @@ -29,7 +30,10 @@ def generate_test_case(*, seed, **kwargs):
x, y = generate_input(**kwargs, seed=seed)
expected = torch.empty_like(y)
reference_kernel((expected, x))
return (y, x), (expected, 1e-6, 1e-6)
return (
pygpubench.out(y, expected=(expected, 1e-6, 1e-6)),
x,
)


res = pygpubench.do_bench_isolated("submission.kernel", generate_test_case, {"size": 1024}, 100, 5, discard=True)
Expand Down
4 changes: 2 additions & 2 deletions csrc/binding.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ void do_bench(int result_fd, int input_fd, const std::string& kernel_qualname, c
signature.allocate(32, rng);
auto config = read_benchmark_parameters(input_fd, signature.data());
BenchmarkManager mgr(result_fd, std::move(signature), config.Seed, discard, nvtx, landlock, mseal);
auto [args, expected] = mgr.setup_benchmark(nb::cast<nb::callable>(test_generator), test_kwargs, config.Repeats);
mgr.do_bench_py(kernel_qualname, args, expected, reinterpret_cast<cudaStream_t>(stream));
auto [args, output_positions, input_output_positions, expected] = mgr.setup_benchmark(nb::cast<nb::callable>(test_generator), test_kwargs, config.Repeats);
mgr.do_bench_py(kernel_qualname, args, output_positions, input_output_positions, expected, reinterpret_cast<cudaStream_t>(stream));
}


Expand Down
214 changes: 153 additions & 61 deletions csrc/manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -146,14 +146,18 @@ BenchmarkManager::~BenchmarkManager() {
cudaFree(mDeviceErrorBase);
for (auto& event : mStartEvents) cudaEventDestroy(event);
for (auto& event : mEndEvents) cudaEventDestroy(event);
for (auto& exp: mExpectedOutputs) cudaFree(exp.Value);
for (auto& expected_per_test : mExpectedOutputs) {
for (auto& exp : expected_per_test) cudaFree(exp.Value);
}
}

std::pair<std::vector<nb::tuple>, std::vector<nb::tuple>> BenchmarkManager::setup_benchmark(const nb::callable& generate_test_case, const nb::dict& kwargs, int repeats) {
std::tuple<std::vector<nb::tuple>, std::vector<std::vector<std::size_t>>, std::vector<std::vector<std::size_t>>, std::vector<nb::tuple>> BenchmarkManager::setup_benchmark(const nb::callable& generate_test_case, const nb::dict& kwargs, int repeats) {
std::mt19937_64 rng(mSeed);
std::uniform_int_distribution<std::uint64_t> dist(0, std::numeric_limits<std::uint64_t>::max());
// generate one more input to handle warmup
std::vector<nb::tuple> kernel_args(repeats + 1);
std::vector<nb::tuple> call_args(repeats + 1);
std::vector<std::vector<std::size_t>> output_positions(repeats + 1);
std::vector<std::vector<std::size_t>> input_output_positions(repeats + 1);
std::vector<nb::tuple> expected(repeats + 1);
for (int i = 0; i < repeats + 1; i++) {
// create new copy of the kwargs dict
Expand All @@ -168,23 +172,74 @@ std::pair<std::vector<nb::tuple>, std::vector<nb::tuple>> BenchmarkManager::setu
call_kwargs["seed"] = dist(rng);

auto gen = nb::cast<nb::tuple>(generate_test_case(**call_kwargs));
kernel_args[i] = nb::cast<nb::tuple>(gen[0]);
expected[i] = nb::cast<nb::tuple>(gen[1]);
if (gen.size() != 4) {
throw std::runtime_error("generate_test_case must return a 4-tuple: (args, output_positions, input_output_positions, expected)");
}

call_args[i] = nb::cast<nb::tuple>(gen[0]);
nb::tuple output_positions_tuple = nb::cast<nb::tuple>(gen[1]);
nb::tuple input_output_positions_tuple = nb::cast<nb::tuple>(gen[2]);
expected[i] = nb::cast<nb::tuple>(gen[3]);

if (output_positions_tuple.size() == 0) {
throw std::runtime_error("output_positions tuple must not be empty");
}
if (expected[i].size() != output_positions_tuple.size()) {
throw std::runtime_error("expected tuple size must match output_positions tuple size");
}
std::vector<bool> seen_output(call_args[i].size(), false);
output_positions[i].reserve(output_positions_tuple.size());
for (int j = 0; j < output_positions_tuple.size(); j++) {
std::size_t pos = nb::cast<std::size_t>(output_positions_tuple[j]);
if (pos >= static_cast<std::size_t>(call_args[i].size())) {
throw std::runtime_error("output_positions contains an index outside the args tuple");
}
if (seen_output[pos]) {
throw std::runtime_error("output_positions contains duplicate indices");
}
seen_output[pos] = true;
output_positions[i].push_back(pos);
}
std::vector<bool> seen_input_output(call_args[i].size(), false);
input_output_positions[i].reserve(input_output_positions_tuple.size());
for (int j = 0; j < input_output_positions_tuple.size(); j++) {
std::size_t pos = nb::cast<std::size_t>(input_output_positions_tuple[j]);
if (pos >= static_cast<std::size_t>(call_args[i].size())) {
throw std::runtime_error("input_output_positions contains an index outside the args tuple");
}
if (!seen_output[pos]) {
throw std::runtime_error("input_output_positions must be a subset of output_positions");
}
if (seen_input_output[pos]) {
throw std::runtime_error("input_output_positions contains duplicate indices");
}
seen_input_output[pos] = true;
input_output_positions[i].push_back(pos);
}
}
return std::make_pair(std::move(kernel_args), std::move(expected));
return std::make_tuple(std::move(call_args), std::move(output_positions), std::move(input_output_positions), std::move(expected));
}

// Returns true iff `obj` is an nb_cuda_array, i.e. the only kind of Python
// object this manager treats as a device tensor (used to decide whether an
// argument gets a shadow copy / canary treatment).
bool can_convert_to_tensor(nb::handle obj) {
    return nb::isinstance<nb_cuda_array>(obj);
}

auto BenchmarkManager::make_shadow_args(const nb::tuple& args, cudaStream_t stream) -> std::vector<std::optional<ShadowArgument>> {
auto BenchmarkManager::make_shadow_args(const nb::tuple& args, const std::vector<std::size_t>& output_positions, const std::vector<std::size_t>& input_output_positions, cudaStream_t stream) -> std::vector<std::optional<ShadowArgument>> {
std::vector<std::optional<ShadowArgument>> shadow_args(args.size());
int nargs = args.size();
std::vector<bool> is_output(args.size(), false);
for (auto pos : output_positions) {
is_output.at(pos) = true;
}
for (auto pos : input_output_positions) {
is_output.at(pos) = false;
}
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_int_distribution<unsigned> canary_seed_dist(0, 0xffffffff);
for (int i = 1; i < nargs; i++) {
for (std::size_t i = 0; i < static_cast<std::size_t>(args.size()); i++) {
if (is_output[i]) {
continue;
}
if (can_convert_to_tensor(args[i])) {
nb_cuda_array arr = nb::cast<nb_cuda_array>(args[i]);
void* shadow;
Expand Down Expand Up @@ -225,6 +280,39 @@ void BenchmarkManager::validate_result(Expected& expected, const nb_cuda_array&
}
}

// Parse one user-supplied "expected output" spec from Python into an
// Expected record.
//
// Accepted forms:
//   - a bare nb_cuda_array                  -> exact (bitwise) match required
//   - a 1-tuple  (array,)                   -> exact match
//   - a 3-tuple  (array, rtol, atol)        -> approximate match with the
//                                              given relative/absolute tolerances
// Any other shape raises std::runtime_error.
BenchmarkManager::Expected BenchmarkManager::parse_expected_spec(const nb::handle& obj) {
    nb_cuda_array expected_array;
    auto mode = BenchmarkManager::Expected::ExactMatch;
    float rtol = 0.f;
    float atol = 0.f;

    if (nb::isinstance<nb_cuda_array>(obj)) {
        // Bare array: exact match, zero tolerances.
        expected_array = nb::cast<nb_cuda_array>(obj);
    } else {
        // Tuple form: first element is always the expected array.
        nb::tuple expected_tuple = nb::cast<nb::tuple>(obj);
        if (expected_tuple.size() == 0) {
            throw std::runtime_error("Expected spec tuple must not be empty");
        }
        if (expected_tuple.size() != 1 && expected_tuple.size() != 3) {
            throw std::runtime_error("Expected spec tuple must have size 1 or 3");
        }
        expected_array = nb::cast<nb_cuda_array>(expected_tuple[0]);
        if (expected_tuple.size() == 3) {
            // The 3-tuple carries tolerances and switches to approximate matching.
            rtol = nb::cast<float>(expected_tuple[1]);
            atol = nb::cast<float>(expected_tuple[2]);
            mode = BenchmarkManager::Expected::ApproxMatch;
        }
    }

    // copy expected values into memory not owned by torch, then wipe original
    // (so a cheating submission cannot read the expected result back out of
    // the tensor it was handed).
    void* copy_mem;
    CUDA_CHECK(cudaMalloc(&copy_mem, expected_array.nbytes()));
    CUDA_CHECK(cudaMemcpy(copy_mem, expected_array.data(), expected_array.nbytes(), cudaMemcpyDeviceToDevice));
    CUDA_CHECK(cudaMemset(expected_array.data(), 0, expected_array.nbytes()));

    // NOTE(review): the tuple order is (rtol, atol) but the Expected aggregate
    // is initialized as {..., atol, rtol} — the swap appears intentional to
    // match the struct's field order; confirm against the Expected declaration.
    return {mode, copy_mem, expected_array.nbytes(), expected_array.dtype(), atol, rtol};
}

void BenchmarkManager::clear_cache(cudaStream_t stream) {
::clear_cache(mDeviceDummyMemory, 2 * mL2CacheSize, mDiscardCache, stream);
}
Expand All @@ -250,35 +338,37 @@ BenchmarkManager::ShadowArgument& BenchmarkManager::ShadowArgument::operator=(Sh
}

void BenchmarkManager::do_bench_py(
const std::string& kernel_qualname,
const std::vector<nb::tuple>& args,
const std::vector<nb::tuple>& expected,
cudaStream_t stream)
{
const std::string& kernel_qualname,
const std::vector<nb::tuple>& args,
const std::vector<std::vector<std::size_t>>& output_positions,
const std::vector<std::vector<std::size_t>>& input_output_positions,
const std::vector<nb::tuple>& expected,
cudaStream_t stream
) {
if (args.size() < 5) {
throw std::runtime_error("Not enough test cases to run benchmark");
}
if (expected.size() != args.size()) {
throw std::runtime_error("Expected results and test case list do not have the same length");
if (output_positions.size() != args.size() || input_output_positions.size() != args.size() || expected.size() != args.size()) {
throw std::runtime_error("Expected results, output metadata, and test case lists do not have the same length");
}
int calls = args.size() - 1;

// extract relevant infos from args and expected
// by convention, the first arg is the output tensor.
// TODO handle multiple outputs
std::vector<nb_cuda_array> outputs(args.size());
// extract relevant infos from outputs and expected
std::vector<std::vector<nb_cuda_array>> outputs(args.size());
for (int i = 0; i < args.size(); i++) {
outputs.at(i) = nb::cast<nb_cuda_array>(args.at(i)[0]);
outputs.at(i).reserve(output_positions.at(i).size());
for (auto pos : output_positions.at(i)) {
outputs.at(i).push_back(nb::cast<nb_cuda_array>(args.at(i)[pos]));
}
}

// Generate "shadow" copies of input arguments
std::vector<ShadowArgumentList> shadow_arguments;
for (const auto & arg : args) {
shadow_arguments.emplace_back(make_shadow_args(arg, stream));
for (int i = 0; i < args.size(); i++) {
shadow_arguments.emplace_back(make_shadow_args(args.at(i), output_positions.at(i), input_output_positions.at(i), stream));
}

// prepare expected outputs
setup_expected_outputs(args, expected);
setup_expected_outputs(output_positions, expected);

// clean up as much python state as we can
trigger_gc();
Expand All @@ -300,9 +390,28 @@ void BenchmarkManager::do_bench_py(
// after this, we cannot trust python anymore
nb::callable kernel = kernel_from_qualname(kernel_qualname);

auto prepare_args = [&](const ShadowArgumentList& shadow_args) {
for (auto& shadow_arg : shadow_args) {
if (shadow_arg) {
CUDA_CHECK(cudaMemcpyAsync(shadow_arg->Original.data(), shadow_arg->Shadow, shadow_arg->Original.nbytes(), cudaMemcpyDeviceToDevice, stream));
}
}

clear_cache(stream);

// ok, now we revert the canaries. This _does_ bring in the corresponding cache lines,
// but they are very sparse (1/256), so that seems like an acceptable trade-off
for (auto& shadow_arg : shadow_args) {
if (shadow_arg) {
canaries(shadow_arg->Original.data(), shadow_arg->Original.nbytes(), shadow_arg->Seed, stream);
}
}
};

// ok, first run for compilations etc
nvtx_push("warmup");
CUDA_CHECK(cudaDeviceSynchronize());
prepare_args(shadow_arguments.at(0));
kernel(*args.at(0));
CUDA_CHECK(cudaDeviceSynchronize());
nvtx_pop();
Expand All @@ -316,7 +425,7 @@ void BenchmarkManager::do_bench_py(
// note: we are assuming here that calling the kernel multiple times for the same input is a safe operation
// this is only potentially problematic for in-place kernels;
CUDA_CHECK(cudaDeviceSynchronize());
clear_cache(stream);
prepare_args(shadow_arguments.at(0));
kernel(*args.at(0));
CUDA_CHECK(cudaDeviceSynchronize());
std::chrono::high_resolution_clock::time_point cpu_end = std::chrono::high_resolution_clock::now();
Expand Down Expand Up @@ -379,32 +488,20 @@ void BenchmarkManager::do_bench_py(
// unfortunately, we need to do this before clearing the cache, so there is a window of opportunity
// *but* we deliberately modify a small subset of the inputs, which only get corrected immediately before
// the user code call.
for (auto& shadow_arg : shadow_arguments.at(test_id)) {
if (shadow_arg) {
CUDA_CHECK(cudaMemcpyAsync(shadow_arg->Original.data(), shadow_arg->Shadow, shadow_arg->Original.nbytes(), cudaMemcpyDeviceToDevice, stream));
}
}

nvtx_push("cc");
clear_cache(stream);
prepare_args(shadow_arguments.at(test_id));
nvtx_pop();

// ok, now we revert the canaries. This _does_ bring in the corresponding cache lines,
// but they are very sparse (1/256), so that seems like an acceptable trade-off
for (auto& shadow_arg : shadow_arguments.at(test_id)) {
if (shadow_arg) {
canaries(shadow_arg->Original.data(), shadow_arg->Original.nbytes(), shadow_arg->Seed, stream);
}
}

CUDA_CHECK(cudaEventRecord(mStartEvents.at(i), stream));
nvtx_push("kernel");
(void)kernel(*args.at(test_id));
nvtx_pop();
CUDA_CHECK(cudaEventRecord(mEndEvents.at(i), stream));
// immediately after the kernel, launch the checking code; if there is some unsynced work done on another stream,
// this increases the chance of detection.
validate_result(mExpectedOutputs.at(test_id), outputs.at(test_id), check_seed_generator(rng), stream);
for (std::size_t j = 0; j < outputs.at(test_id).size(); j++) {
validate_result(mExpectedOutputs.at(test_id).at(j), outputs.at(test_id).at(j), check_seed_generator(rng), stream);
}
}
nvtx_pop();

Expand Down Expand Up @@ -456,25 +553,20 @@ float BenchmarkManager::measure_event_overhead(int repeats, cudaStream_t stream)
return median;
}

void BenchmarkManager::setup_expected_outputs(const std::vector<nb::tuple>& args, const std::vector<nb::tuple>& expected) {
mExpectedOutputs.resize(args.size());
for (int i = 0; i < args.size(); i++) {
void BenchmarkManager::setup_expected_outputs(const std::vector<std::vector<std::size_t>>& output_positions, const std::vector<nb::tuple>& expected) {
for (auto& expected_per_test : mExpectedOutputs) {
for (auto& exp : expected_per_test) cudaFree(exp.Value);
}
mExpectedOutputs.clear();
mExpectedOutputs.resize(output_positions.size());
for (int i = 0; i < output_positions.size(); i++) {
const nb::tuple& expected_tuple = expected.at(i);
nb_cuda_array expected_array = nb::cast<nb_cuda_array>(expected_tuple[0]);

// make a copy of the expected result and put it in memory not owned by torch; overwrite the original
// so it cannot be read by cheating solutions.
void* copy_mem;
CUDA_CHECK(cudaMalloc(&copy_mem, expected_array.nbytes()));
CUDA_CHECK(cudaMemcpy(copy_mem, expected_array.data(), expected_array.nbytes(), cudaMemcpyDeviceToDevice));
CUDA_CHECK(cudaMemset(expected_array.data(), 0, expected_array.nbytes()));

if (expected.at(i).size() == 1) {
mExpectedOutputs.at(i) = {Expected::ExactMatch, copy_mem, expected_array.nbytes(), expected_array.dtype(), 0.f, 0.f};
} else {
float rtol = nb::cast<float>(expected_tuple[1]);
float atol = nb::cast<float>(expected_tuple[2]);
mExpectedOutputs.at(i) = {Expected::ApproxMatch, copy_mem, expected_array.nbytes(), expected_array.dtype(), atol, rtol};
if (expected_tuple.size() != output_positions.at(i).size()) {
throw std::runtime_error("Expected tuple size must match output_positions tuple size");
}
mExpectedOutputs.at(i).reserve(expected_tuple.size());
for (int j = 0; j < expected_tuple.size(); j++) {
mExpectedOutputs.at(i).push_back(parse_expected_spec(expected_tuple[j]));
}
}
}
}
Loading
Loading