Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,10 @@ try to exploit benchmarking flaws to receive higher scores.
To benchmark a kernel, two ingredients are needed:
1. The qualified name of the kernel function. It is important that the testing script itself does not import the kernel function, as this implies executing untrusted code.
2. A function that generates test/benchmark inputs. This function takes keyword arguments of configuration parameters,
as well as the reserved argument `seed` to randomize the problem. It returns two tuples:
The first contains the inputs for the kernel and will
be used to call the kernel function, and the second contains the expected output and the required absolute and relative tolerance.
as well as the reserved argument `seed` to randomize the problem. It returns the kernel arguments.
Any writable / checked output must be wrapped in `pygpubench.out(...)` together
with its expected result and optional tolerances. For in/out args whose initial contents matter,
pass `uses_current_value=True`.

```python
import torch
Expand All @@ -29,7 +30,10 @@ def generate_test_case(*, seed, **kwargs):
x, y = generate_input(**kwargs, seed=seed)
expected = torch.empty_like(y)
reference_kernel((expected, x))
return (y, x), (expected, 1e-6, 1e-6)
return (
pygpubench.out(y, expected=(expected, 1e-6, 1e-6)),
x,
)


res = pygpubench.do_bench_isolated("submission.kernel", generate_test_case, {"size": 1024}, 100, 5, discard=True)
Expand Down
4 changes: 2 additions & 2 deletions csrc/binding.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ void do_bench(int result_fd, int input_fd, const std::string& kernel_qualname, c
signature.allocate(32, rng);
auto config = read_benchmark_parameters(input_fd, signature.data());
BenchmarkManager mgr(result_fd, std::move(signature), config.Seed, discard, nvtx, landlock, mseal);
auto [args, expected] = mgr.setup_benchmark(nb::cast<nb::callable>(test_generator), test_kwargs, config.Repeats);
mgr.do_bench_py(kernel_qualname, args, expected, reinterpret_cast<cudaStream_t>(stream));
auto [args, output_positions, input_output_positions, expected] = mgr.setup_benchmark(nb::cast<nb::callable>(test_generator), test_kwargs, config.Repeats);
mgr.do_bench_py(kernel_qualname, args, output_positions, input_output_positions, expected, reinterpret_cast<cudaStream_t>(stream));
}


Expand Down
214 changes: 153 additions & 61 deletions csrc/manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -146,14 +146,18 @@ BenchmarkManager::~BenchmarkManager() {
cudaFree(mDeviceErrorBase);
for (auto& event : mStartEvents) cudaEventDestroy(event);
for (auto& event : mEndEvents) cudaEventDestroy(event);
for (auto& exp: mExpectedOutputs) cudaFree(exp.Value);
for (auto& expected_per_test : mExpectedOutputs) {
for (auto& exp : expected_per_test) cudaFree(exp.Value);
}
}

std::pair<std::vector<nb::tuple>, std::vector<nb::tuple>> BenchmarkManager::setup_benchmark(const nb::callable& generate_test_case, const nb::dict& kwargs, int repeats) {
std::tuple<std::vector<nb::tuple>, std::vector<std::vector<std::size_t>>, std::vector<std::vector<std::size_t>>, std::vector<nb::tuple>> BenchmarkManager::setup_benchmark(const nb::callable& generate_test_case, const nb::dict& kwargs, int repeats) {
std::mt19937_64 rng(mSeed);
std::uniform_int_distribution<std::uint64_t> dist(0, std::numeric_limits<std::uint64_t>::max());
// generate one more input to handle warmup
std::vector<nb::tuple> kernel_args(repeats + 1);
std::vector<nb::tuple> call_args(repeats + 1);
std::vector<std::vector<std::size_t>> output_positions(repeats + 1);
std::vector<std::vector<std::size_t>> input_output_positions(repeats + 1);
std::vector<nb::tuple> expected(repeats + 1);
for (int i = 0; i < repeats + 1; i++) {
// create new copy of the kwargs dict
Expand All @@ -168,23 +172,74 @@ std::pair<std::vector<nb::tuple>, std::vector<nb::tuple>> BenchmarkManager::setu
call_kwargs["seed"] = dist(rng);

auto gen = nb::cast<nb::tuple>(generate_test_case(**call_kwargs));
kernel_args[i] = nb::cast<nb::tuple>(gen[0]);
expected[i] = nb::cast<nb::tuple>(gen[1]);
if (gen.size() != 4) {
throw std::runtime_error("generate_test_case must return a 4-tuple: (args, output_positions, input_output_positions, expected)");
}

call_args[i] = nb::cast<nb::tuple>(gen[0]);
nb::tuple output_positions_tuple = nb::cast<nb::tuple>(gen[1]);
nb::tuple input_output_positions_tuple = nb::cast<nb::tuple>(gen[2]);
expected[i] = nb::cast<nb::tuple>(gen[3]);

if (output_positions_tuple.size() == 0) {
throw std::runtime_error("output_positions tuple must not be empty");
}
if (expected[i].size() != output_positions_tuple.size()) {
throw std::runtime_error("expected tuple size must match output_positions tuple size");
}
std::vector<bool> seen_output(call_args[i].size(), false);
output_positions[i].reserve(output_positions_tuple.size());
for (int j = 0; j < output_positions_tuple.size(); j++) {
std::size_t pos = nb::cast<std::size_t>(output_positions_tuple[j]);
if (pos >= static_cast<std::size_t>(call_args[i].size())) {
throw std::runtime_error("output_positions contains an index outside the args tuple");
}
if (seen_output[pos]) {
throw std::runtime_error("output_positions contains duplicate indices");
}
seen_output[pos] = true;
output_positions[i].push_back(pos);
}
std::vector<bool> seen_input_output(call_args[i].size(), false);
input_output_positions[i].reserve(input_output_positions_tuple.size());
for (int j = 0; j < input_output_positions_tuple.size(); j++) {
std::size_t pos = nb::cast<std::size_t>(input_output_positions_tuple[j]);
if (pos >= static_cast<std::size_t>(call_args[i].size())) {
throw std::runtime_error("input_output_positions contains an index outside the args tuple");
}
if (!seen_output[pos]) {
throw std::runtime_error("input_output_positions must be a subset of output_positions");
}
if (seen_input_output[pos]) {
throw std::runtime_error("input_output_positions contains duplicate indices");
}
seen_input_output[pos] = true;
input_output_positions[i].push_back(pos);
}
}
return std::make_pair(std::move(kernel_args), std::move(expected));
return std::make_tuple(std::move(call_args), std::move(output_positions), std::move(input_output_positions), std::move(expected));
}

// Returns true iff `obj` is an nb_cuda_array, i.e. the only kind of Python
// object this manager treats as a device tensor (used to decide whether an
// argument gets a shadow copy / canary treatment).
bool can_convert_to_tensor(nb::handle obj) {
    return nb::isinstance<nb_cuda_array>(obj);
}

auto BenchmarkManager::make_shadow_args(const nb::tuple& args, cudaStream_t stream) -> std::vector<std::optional<ShadowArgument>> {
auto BenchmarkManager::make_shadow_args(const nb::tuple& args, const std::vector<std::size_t>& output_positions, const std::vector<std::size_t>& input_output_positions, cudaStream_t stream) -> std::vector<std::optional<ShadowArgument>> {
std::vector<std::optional<ShadowArgument>> shadow_args(args.size());
int nargs = args.size();
std::vector<bool> is_output(args.size(), false);
for (auto pos : output_positions) {
is_output.at(pos) = true;
}
for (auto pos : input_output_positions) {
is_output.at(pos) = false;
}
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_int_distribution<unsigned> canary_seed_dist(0, 0xffffffff);
for (int i = 1; i < nargs; i++) {
for (std::size_t i = 0; i < static_cast<std::size_t>(args.size()); i++) {
if (is_output[i]) {
continue;
}
if (can_convert_to_tensor(args[i])) {
nb_cuda_array arr = nb::cast<nb_cuda_array>(args[i]);
void* shadow;
Expand Down Expand Up @@ -225,6 +280,39 @@ void BenchmarkManager::validate_result(Expected& expected, const nb_cuda_array&
}
}

// Parse one user-supplied "expected output" spec from Python into an
// Expected record.
//
// Accepted forms:
//   - a bare nb_cuda_array                  -> exact (bitwise) match required
//   - a 1-tuple  (array,)                   -> exact match
//   - a 3-tuple  (array, rtol, atol)        -> approximate match with the
//                                              given relative/absolute tolerances
// Any other shape raises std::runtime_error.
BenchmarkManager::Expected BenchmarkManager::parse_expected_spec(const nb::handle& obj) {
    nb_cuda_array expected_array;
    auto mode = BenchmarkManager::Expected::ExactMatch;
    float rtol = 0.f;
    float atol = 0.f;

    if (nb::isinstance<nb_cuda_array>(obj)) {
        // Bare array: exact match, zero tolerances.
        expected_array = nb::cast<nb_cuda_array>(obj);
    } else {
        // Tuple form: first element is always the expected array.
        nb::tuple expected_tuple = nb::cast<nb::tuple>(obj);
        if (expected_tuple.size() == 0) {
            throw std::runtime_error("Expected spec tuple must not be empty");
        }
        if (expected_tuple.size() != 1 && expected_tuple.size() != 3) {
            throw std::runtime_error("Expected spec tuple must have size 1 or 3");
        }
        expected_array = nb::cast<nb_cuda_array>(expected_tuple[0]);
        if (expected_tuple.size() == 3) {
            // The 3-tuple carries tolerances and switches to approximate matching.
            rtol = nb::cast<float>(expected_tuple[1]);
            atol = nb::cast<float>(expected_tuple[2]);
            mode = BenchmarkManager::Expected::ApproxMatch;
        }
    }

    // copy expected values into memory not owned by torch, then wipe original
    // (so a cheating submission cannot read the expected result back out of
    // the tensor it was handed).
    void* copy_mem;
    CUDA_CHECK(cudaMalloc(&copy_mem, expected_array.nbytes()));
    CUDA_CHECK(cudaMemcpy(copy_mem, expected_array.data(), expected_array.nbytes(), cudaMemcpyDeviceToDevice));
    CUDA_CHECK(cudaMemset(expected_array.data(), 0, expected_array.nbytes()));

    // NOTE(review): the tuple order is (rtol, atol) but the Expected aggregate
    // is initialized as {..., atol, rtol} — the swap appears intentional to
    // match the struct's field order; confirm against the Expected declaration.
    return {mode, copy_mem, expected_array.nbytes(), expected_array.dtype(), atol, rtol};
}

void BenchmarkManager::clear_cache(cudaStream_t stream) {
::clear_cache(mDeviceDummyMemory, 2 * mL2CacheSize, mDiscardCache, stream);
}
Expand All @@ -250,35 +338,37 @@ BenchmarkManager::ShadowArgument& BenchmarkManager::ShadowArgument::operator=(Sh
}

void BenchmarkManager::do_bench_py(
const std::string& kernel_qualname,
const std::vector<nb::tuple>& args,
const std::vector<nb::tuple>& expected,
cudaStream_t stream)
{
const std::string& kernel_qualname,
const std::vector<nb::tuple>& args,
const std::vector<std::vector<std::size_t>>& output_positions,
const std::vector<std::vector<std::size_t>>& input_output_positions,
const std::vector<nb::tuple>& expected,
cudaStream_t stream
) {
if (args.size() < 5) {
throw std::runtime_error("Not enough test cases to run benchmark");
}
if (expected.size() != args.size()) {
throw std::runtime_error("Expected results and test case list do not have the same length");
if (output_positions.size() != args.size() || input_output_positions.size() != args.size() || expected.size() != args.size()) {
throw std::runtime_error("Expected results, output metadata, and test case lists do not have the same length");
}
int calls = args.size() - 1;

// extract relevant infos from args and expected
// by convention, the first arg is the output tensor.
// TODO handle multiple outputs
std::vector<nb_cuda_array> outputs(args.size());
// extract relevant infos from outputs and expected
std::vector<std::vector<nb_cuda_array>> outputs(args.size());
for (int i = 0; i < args.size(); i++) {
outputs.at(i) = nb::cast<nb_cuda_array>(args.at(i)[0]);
outputs.at(i).reserve(output_positions.at(i).size());
for (auto pos : output_positions.at(i)) {
outputs.at(i).push_back(nb::cast<nb_cuda_array>(args.at(i)[pos]));
}
}

// Generate "shadow" copies of input arguments
std::vector<ShadowArgumentList> shadow_arguments;
for (const auto & arg : args) {
shadow_arguments.emplace_back(make_shadow_args(arg, stream));
for (int i = 0; i < args.size(); i++) {
shadow_arguments.emplace_back(make_shadow_args(args.at(i), output_positions.at(i), input_output_positions.at(i), stream));
}

// prepare expected outputs
setup_expected_outputs(args, expected);
setup_expected_outputs(output_positions, expected);

// clean up as much python state as we can
trigger_gc();
Expand All @@ -300,9 +390,28 @@ void BenchmarkManager::do_bench_py(
// after this, we cannot trust python anymore
nb::callable kernel = kernel_from_qualname(kernel_qualname);

auto prepare_args = [&](const ShadowArgumentList& shadow_args) {
for (auto& shadow_arg : shadow_args) {
if (shadow_arg) {
CUDA_CHECK(cudaMemcpyAsync(shadow_arg->Original.data(), shadow_arg->Shadow, shadow_arg->Original.nbytes(), cudaMemcpyDeviceToDevice, stream));
}
}

clear_cache(stream);

// ok, now we revert the canaries. This _does_ bring in the corresponding cache lines,
// but they are very sparse (1/256), so that seems like an acceptable trade-off
for (auto& shadow_arg : shadow_args) {
if (shadow_arg) {
canaries(shadow_arg->Original.data(), shadow_arg->Original.nbytes(), shadow_arg->Seed, stream);
}
}
};

// ok, first run for compilations etc
nvtx_push("warmup");
CUDA_CHECK(cudaDeviceSynchronize());
prepare_args(shadow_arguments.at(0));
kernel(*args.at(0));
CUDA_CHECK(cudaDeviceSynchronize());
nvtx_pop();
Expand All @@ -316,7 +425,7 @@ void BenchmarkManager::do_bench_py(
// note: we are assuming here that calling the kernel multiple times for the same input is a safe operation
// this is only potentially problematic for in-place kernels;
CUDA_CHECK(cudaDeviceSynchronize());
clear_cache(stream);
prepare_args(shadow_arguments.at(0));
kernel(*args.at(0));
CUDA_CHECK(cudaDeviceSynchronize());
std::chrono::high_resolution_clock::time_point cpu_end = std::chrono::high_resolution_clock::now();
Expand Down Expand Up @@ -379,32 +488,20 @@ void BenchmarkManager::do_bench_py(
// unfortunately, we need to do this before clearing the cache, so there is a window of opportunity
// *but* we deliberately modify a small subset of the inputs, which only get corrected immediately before
// the user code call.
for (auto& shadow_arg : shadow_arguments.at(test_id)) {
if (shadow_arg) {
CUDA_CHECK(cudaMemcpyAsync(shadow_arg->Original.data(), shadow_arg->Shadow, shadow_arg->Original.nbytes(), cudaMemcpyDeviceToDevice, stream));
}
}

nvtx_push("cc");
clear_cache(stream);
prepare_args(shadow_arguments.at(test_id));
nvtx_pop();

// ok, now we revert the canaries. This _does_ bring in the corresponding cache lines,
// but they are very sparse (1/256), so that seems like an acceptable trade-off
for (auto& shadow_arg : shadow_arguments.at(test_id)) {
if (shadow_arg) {
canaries(shadow_arg->Original.data(), shadow_arg->Original.nbytes(), shadow_arg->Seed, stream);
}
}

CUDA_CHECK(cudaEventRecord(mStartEvents.at(i), stream));
nvtx_push("kernel");
(void)kernel(*args.at(test_id));
nvtx_pop();
CUDA_CHECK(cudaEventRecord(mEndEvents.at(i), stream));
// immediately after the kernel, launch the checking code; if there is some unsynced work done on another stream,
// this increases the chance of detection.
validate_result(mExpectedOutputs.at(test_id), outputs.at(test_id), check_seed_generator(rng), stream);
for (std::size_t j = 0; j < outputs.at(test_id).size(); j++) {
validate_result(mExpectedOutputs.at(test_id).at(j), outputs.at(test_id).at(j), check_seed_generator(rng), stream);
}
}
nvtx_pop();

Expand Down Expand Up @@ -456,25 +553,20 @@ float BenchmarkManager::measure_event_overhead(int repeats, cudaStream_t stream)
return median;
}

void BenchmarkManager::setup_expected_outputs(const std::vector<nb::tuple>& args, const std::vector<nb::tuple>& expected) {
mExpectedOutputs.resize(args.size());
for (int i = 0; i < args.size(); i++) {
void BenchmarkManager::setup_expected_outputs(const std::vector<std::vector<std::size_t>>& output_positions, const std::vector<nb::tuple>& expected) {
for (auto& expected_per_test : mExpectedOutputs) {
for (auto& exp : expected_per_test) cudaFree(exp.Value);
}
mExpectedOutputs.clear();
mExpectedOutputs.resize(output_positions.size());
for (int i = 0; i < output_positions.size(); i++) {
const nb::tuple& expected_tuple = expected.at(i);
nb_cuda_array expected_array = nb::cast<nb_cuda_array>(expected_tuple[0]);

// make a copy of the expected result and put it in memory not owned by torch; overwrite the original
// so it cannot be read by cheating solutions.
void* copy_mem;
CUDA_CHECK(cudaMalloc(&copy_mem, expected_array.nbytes()));
CUDA_CHECK(cudaMemcpy(copy_mem, expected_array.data(), expected_array.nbytes(), cudaMemcpyDeviceToDevice));
CUDA_CHECK(cudaMemset(expected_array.data(), 0, expected_array.nbytes()));

if (expected.at(i).size() == 1) {
mExpectedOutputs.at(i) = {Expected::ExactMatch, copy_mem, expected_array.nbytes(), expected_array.dtype(), 0.f, 0.f};
} else {
float rtol = nb::cast<float>(expected_tuple[1]);
float atol = nb::cast<float>(expected_tuple[2]);
mExpectedOutputs.at(i) = {Expected::ApproxMatch, copy_mem, expected_array.nbytes(), expected_array.dtype(), atol, rtol};
if (expected_tuple.size() != output_positions.at(i).size()) {
throw std::runtime_error("Expected tuple size must match output_positions tuple size");
}
mExpectedOutputs.at(i).reserve(expected_tuple.size());
for (int j = 0; j < expected_tuple.size(); j++) {
mExpectedOutputs.at(i).push_back(parse_expected_spec(expected_tuple[j]));
}
}
}
}
Loading
Loading