Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -501,6 +501,7 @@ SET(OPERATOR_SRCS
src/operators/dynamic-fully-connected-nc.c
src/operators/fully-connected-nc.c
src/operators/max-pooling-nhwc.c
src/operators/normalize-nc.c
src/operators/pack-lh.c
src/operators/reduce-nd.c
src/operators/resize-bilinear-nchw.c
Expand Down Expand Up @@ -535,6 +536,7 @@ SET(SUBGRAPH_SRCS
src/subgraph/fully-connected-sparse.c
src/subgraph/fully-connected.c
src/subgraph/max-pooling-2d.c
src/subgraph/normalize.c
src/subgraph/pack-lh.c
src/subgraph/reshape-helpers.c
src/subgraph/rope.c
Expand Down
30 changes: 18 additions & 12 deletions bench/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ xnnpack_cc_library(

xnnpack_cxx_library(
name = "gemm_benchmark",
testonly = True,
srcs = [
"gemm-benchmark.cc",
],
Expand All @@ -115,6 +116,7 @@ xnnpack_cxx_library(

xnnpack_cxx_library(
name = "packw_benchmark",
testonly = True,
hdrs = [
"packw-benchmark.h",
],
Expand All @@ -126,6 +128,7 @@ xnnpack_cxx_library(

xnnpack_cxx_library(
name = "bgemm",
testonly = True,
hdrs = [
"bgemm.h",
],
Expand All @@ -134,6 +137,19 @@ xnnpack_cxx_library(
],
)

xnnpack_cxx_library(
name = "packq_benchmark",
testonly = True,
srcs = [
"packq-benchmark.cc",
],
hdrs = ["packq-benchmark.h"],
deps = MICROKERNEL_BENCHMARK_DEPS + [
":bgemm",
"@com_google_benchmark//:benchmark",
],
)

######################### Benchmarks for micro-kernels #########################

[xnnpack_benchmark(
Expand Down Expand Up @@ -275,8 +291,10 @@ xnnpack_benchmark(
"f32_vcmul",
"rdminmax",
"rdsum",
"rdsum2",
"rminmax",
"rsum",
"rsum2",
"x8_lut",
]]

Expand Down Expand Up @@ -453,18 +471,6 @@ xnnpack_benchmark(
],
)

xnnpack_cxx_library(
name = "packq_benchmark",
srcs = [
"packq-benchmark.cc",
],
hdrs = ["packq-benchmark.h"],
deps = MICROKERNEL_BENCHMARK_DEPS + [
":bgemm",
"@com_google_benchmark//:benchmark",
],
)

xnnpack_benchmark(
name = "x8_packq_bench",
srcs = [
Expand Down
78 changes: 78 additions & 0 deletions bench/rdsum2.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
// Copyright 2025 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <cstddef>
#include <cstdint>
#include <random>

#include "bench/utils.h"
#include "src/xnnpack/buffer.h"
#include "src/xnnpack/common.h"
#include "src/xnnpack/hardware-config.h" // IWYU pragma: keep
#include "src/xnnpack/reduce.h" // IWYU pragma: keep
#include <benchmark/benchmark.h>

// Pointer-to-microkernel type for the benchmarked reduction kernels,
// parameterized on the input element type, the output element type and the
// kernel's `params` struct.
//
// As invoked by reduce() in this file, the arguments are, in order: channels,
// rows, 1, 1, the input pointer, channels * sizeof(Input), 0, 0, the `zero`
// buffer pointer, the output pointer, and a pointer to the params.
template <typename Input, typename Output, typename UKernelParams>
using UKernelFn = auto (*)(size_t, size_t, size_t, size_t, const Input*,
                           size_t, size_t, size_t, const Input*, Output*,
                           const UKernelParams*) -> void;

// Benchmark driver for a single reduction microkernel.
//
// The problem size comes from the benchmark arguments: range(0) is used as the
// channel count and range(1) as the row count. A channels * rows input buffer
// is filled with random bits, and `ukernel` is called once per benchmark
// iteration. The function returns without measuring anything when
// CheckArchFlags() rejects `arch_flags`.
//
// Reported counters: "cpufreq" (only when non-zero), plus "elements" and
// "bytes" as rate counters.
template <typename Input, typename Output, typename UKernelParams>
static void reduce(benchmark::State& state, uint64_t arch_flags,
                   UKernelFn<Input, Output, UKernelParams> ukernel) {
  if (!benchmark::utils::CheckArchFlags(state, arch_flags)) {
    return;
  }

  const size_t channels = state.range(0);
  const size_t rows = state.range(1);
  const size_t num_elements = channels * rows;

  std::random_device seed_source;
  std::mt19937 rng(seed_source());

  using InputBuffer = xnnpack::Buffer<Input, XNN_ALLOCATION_ALIGNMENT>;
  using OutputBuffer = xnnpack::Buffer<Output, XNN_ALLOCATION_ALIGNMENT>;

  InputBuffer input(num_elements, xnnpack::XnnExtraBytes);
  InputBuffer zero(channels, 0, xnnpack::XnnExtraBytes);
  xnnpack::fill_uniform_random_bits(input.data(), input.size(), rng);
  OutputBuffer output(channels);

  // The kernel receives zeroed params; no init function is invoked here.
  UKernelParams params;
  memset(&params, 0, sizeof(params));

  // Size in bytes of one run of `channels` input elements.
  const size_t bytes_per_row = channels * sizeof(Input);
  for (auto _ : state) {
    ukernel(channels, rows, 1, 1, input.data(), bytes_per_row, 0, 0,
            zero.data(), output.data(), &params);
  }

  if (const uint64_t cpu_frequency =
          benchmark::utils::GetCurrentCpuFrequency()) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  const uint64_t iterations = static_cast<uint64_t>(state.iterations());

  state.counters["elements"] = benchmark::Counter(
      iterations * num_elements, benchmark::Counter::kIsRate);

  state.counters["bytes"] =
      benchmark::Counter(iterations * (num_elements * sizeof(Input)),
                         benchmark::Counter::kIsRate);
}

// Registers one benchmark per row of the included microkernel tables. Only
// `arch_flags`, `ukernel` and `datatype_in` are consumed; the remaining table
// columns are accepted and ignored.
#define XNN_UKERNEL(arch_flags, ukernel, row_tile, batch_tile, vector_tile, \
                    datatype_in, datatype_out, params_type, init_params)    \
  BENCHMARK_CAPTURE(reduce, ukernel, arch_flags, ukernel)                   \
      ->Apply(benchmark::utils::ReduceDiscontiguousParameters<datatype_in>) \
      ->UseRealTime();
// This is the rdsum2 benchmark, so it must pull in the rdsum2 kernel tables;
// including the plain rdsum tables would only re-measure the rdsum kernels.
// NOTE(review): assumes the rdsum2 tables use the same XNN_UKERNEL column
// layout as the rdsum tables -- confirm against the .inc files.
// NOTE(review): the f16 table was disabled in the original and is left
// disabled here; enable once confirmed to build with this harness.
// #include "src/f16-f32acc-rdsum2/f16-f32acc-rdsum2.inc"
#include "src/f32-rdsum2/f32-rdsum2.inc"
#undef XNN_UKERNEL

#ifndef XNNPACK_BENCHMARK_NO_MAIN
XNN_BENCHMARK_MAIN();
#endif
76 changes: 76 additions & 0 deletions bench/rsum2.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
// Copyright 2025 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <cstddef>
#include <cstdint>
#include <random>

#include "bench/utils.h"
#include "src/xnnpack/buffer.h"
#include "src/xnnpack/common.h"
#include "src/xnnpack/hardware-config.h" // IWYU pragma: keep
#include "src/xnnpack/reduce.h" // IWYU pragma: keep
#include <benchmark/benchmark.h>

// Pointer-to-microkernel type for the benchmarked reduction kernels,
// parameterized on the input element type, the output element type and the
// kernel's `params` struct.
//
// As invoked by reduce() in this file, the arguments are, in order:
// channels * sizeof(Input), a pointer to the start of the current row of
// input, a pointer to a single Output value, and a pointer to the params.
template <typename Input, typename Output, typename UKernelParams>
using UKernelFn = auto (*)(size_t, const Input*, Output*,
                           const UKernelParams*) -> void;

// Benchmark driver for a single reduction microkernel.
//
// The problem size comes from the benchmark arguments: range(0) is used as the
// channel count and range(1) as the row count. A channels * rows input buffer
// is filled with random bits; on every benchmark iteration `ukernel` is called
// once per row, each call receiving `channels` elements (expressed in bytes)
// and a pointer to the same `output` value. The function returns without
// measuring anything when CheckArchFlags() rejects `arch_flags`.
//
// Reported counters: "cpufreq" (only when non-zero), plus "elements" and
// "bytes" as rate counters.
template <typename Input, typename Output, typename UKernelParams>
static void reduce(benchmark::State& state, uint64_t arch_flags,
                   UKernelFn<Input, Output, UKernelParams> ukernel) {
  if (!benchmark::utils::CheckArchFlags(state, arch_flags)) {
    return;
  }

  const size_t channels = state.range(0);
  const size_t rows = state.range(1);
  const size_t num_elements = channels * rows;

  std::random_device seed_source;
  std::mt19937 rng(seed_source());

  using InputBuffer = xnnpack::Buffer<Input, XNN_ALLOCATION_ALIGNMENT>;
  InputBuffer input(num_elements, xnnpack::XnnExtraBytes);
  xnnpack::fill_uniform_random_bits(input.data(), input.size(), rng);

  // The kernel receives zeroed params; no init function is invoked here.
  UKernelParams params;
  memset(&params, 0, sizeof(params));

  // Size in bytes of one row of `channels` input elements.
  const size_t row_bytes = channels * sizeof(Input);

  Output output = 0;
  for (auto _ : state) {
    // Walk the input one row at a time.
    const Input* row = input.data();
    for (size_t remaining = rows; remaining != 0; --remaining) {
      ukernel(row_bytes, row, &output, &params);
      row += channels;
    }
  }

  if (const uint64_t cpu_frequency =
          benchmark::utils::GetCurrentCpuFrequency()) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  const uint64_t iterations = static_cast<uint64_t>(state.iterations());

  state.counters["elements"] = benchmark::Counter(
      iterations * num_elements, benchmark::Counter::kIsRate);

  state.counters["bytes"] =
      benchmark::Counter(iterations * (num_elements * sizeof(Input)),
                         benchmark::Counter::kIsRate);
}

// Registers one benchmark per row of the included microkernel tables. Only the
// arch flags, the kernel symbol and the input datatype are consumed; the
// remaining table columns are accepted and ignored.
#define XNN_UKERNEL(flags, kernel_fn, batch_tile, vector_tile, input_type, \
                    output_type, params_type, init_fn)                     \
  BENCHMARK_CAPTURE(reduce, kernel_fn, flags, kernel_fn)                   \
      ->Apply(benchmark::utils::ReduceParameters<input_type>)              \
      ->UseRealTime();
// Kernel tables expanded through XNN_UKERNEL above.
#include "src/f16-f32acc-rsum2/f16-f32acc-rsum2.inc"
#include "src/f32-rsum2/f32-rsum2.inc"
#undef XNN_UKERNEL

#ifndef XNNPACK_BENCHMARK_NO_MAIN
XNN_BENCHMARK_MAIN();
#endif
2 changes: 1 addition & 1 deletion bench/subgraph/fp32-l2-norm.cc
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ xnn_subgraph_t FP32L2Norm(size_t m, size_t n, size_t k, uint32_t norm_mask) {

uint32_t inv_sqrt_sum_sq = XNN_INVALID_VALUE_ID;
status = xnn_define_tensor_value(
subgraph, xnn_datatype_fp32, dims.size(), dims.data(),
subgraph, xnn_datatype_fp32, reduction_dims.size(), reduction_dims.data(),
/*data=*/nullptr, XNN_INVALID_VALUE_ID, /*flags=*/0, &inv_sqrt_sum_sq);
if (status != xnn_status_success) {
std::cerr << "failed to create tensor inv_sqrt_sum_sq" << std::endl;
Expand Down
1 change: 0 additions & 1 deletion bench/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <string>

#include "src/xnnpack/common.h"
Expand Down
6 changes: 6 additions & 0 deletions build_srcs.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ OPERATOR_SRCS = [
"src/operators/dynamic-fully-connected-nc.c",
"src/operators/fully-connected-nc.c",
"src/operators/max-pooling-nhwc.c",
"src/operators/normalize-nc.c",
"src/operators/pack-lh.c",
"src/operators/reduce-nd.c",
"src/operators/resize-bilinear-nchw.c",
Expand Down Expand Up @@ -56,6 +57,7 @@ SUBGRAPH_SRCS = [
"src/subgraph/fully-connected-sparse.c",
"src/subgraph/fully-connected.c",
"src/subgraph/max-pooling-2d.c",
"src/subgraph/normalize.c",
"src/subgraph/pack-lh.c",
"src/subgraph/reshape-helpers.c",
"src/subgraph/rope.c",
Expand Down Expand Up @@ -99,7 +101,9 @@ MICROKERNEL_DEFS = [
"src/f16-dwconv/f16-dwconv-minmax.inc",
"src/f16-f32-vcvt/f16-f32-vcvt.inc",
"src/f16-f32acc-rdsum/f16-f32acc-rdsum.inc",
"src/f16-f32acc-rdsum2/f16-f32acc-rdsum2.inc",
"src/f16-f32acc-rsum/f16-f32acc-rsum.inc",
"src/f16-f32acc-rsum2/f16-f32acc-rsum2.inc",
"src/f16-maxpool/f16-maxpool-minmax.inc",
"src/f16-qs8-vcvt/f16-qs8-vcvt.inc",
"src/f16-qu8-vcvt/f16-qu8-vcvt.inc",
Expand Down Expand Up @@ -163,10 +167,12 @@ MICROKERNEL_DEFS = [
"src/f32-rdminmax/f32-rdmax.inc",
"src/f32-rdminmax/f32-rdmin.inc",
"src/f32-rdsum/f32-rdsum.inc",
"src/f32-rdsum2/f32-rdsum2.inc",
"src/f32-rminmax/f32-rmax.inc",
"src/f32-rminmax/f32-rmin.inc",
"src/f32-rminmax/f32-rminmax.inc",
"src/f32-rsum/f32-rsum.inc",
"src/f32-rsum2/f32-rsum2.inc",
"src/f32-spmm/f32-spmm-minmax.inc",
"src/f32-vabs/f32-vabs.inc",
"src/f32-vapproxgelu/f32-vapproxgelu.inc",
Expand Down
2 changes: 2 additions & 0 deletions cmake/gen/avx512f_microkernels.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,12 @@ SET(PROD_AVX512F_MICROKERNEL_SRCS
src/f32-rdminmax/gen/f32-rdmax-2p2x-avx512f-u32.c
src/f32-rdminmax/gen/f32-rdmin-2p2x-avx512f-u32.c
src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx512f-u64.c
src/f32-rdsum2/gen/f32-rdsum2-7p7x-minmax-avx512f.c
src/f32-rminmax/gen/f32-rmax-avx512f-u64-acc4.c
src/f32-rminmax/gen/f32-rmin-avx512f-u64-acc4.c
src/f32-rminmax/gen/f32-rminmax-avx512f-u64-acc4.c
src/f32-rsum/gen/f32-rsum-avx512f-u32-acc2.c
src/f32-rsum2/gen/f32-rsum2-avx512f-u16.c
src/f32-vapproxgelu/gen/f32-vapproxgelu-avx512f-rational-12-10-div.c
src/f32-vbinary/gen/f32-vadd-avx512f-u32.c
src/f32-vbinary/gen/f32-vaddc-avx512f-u32.c
Expand Down
2 changes: 2 additions & 0 deletions cmake/gen/avx512skx_microkernels.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@
SET(PROD_AVX512SKX_MICROKERNEL_SRCS
src/f16-f32-vcvt/gen/f16-f32-vcvt-avx512skx-u16.c
src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-avx512skx-u64.c
src/f16-f32acc-rdsum2/gen/f16-f32acc-rdsum2-7p7x-avx512skx.c
src/f16-f32acc-rsum/gen/f16-f32acc-rsum-avx512skx-u32-acc2.c
src/f16-f32acc-rsum2/gen/f16-f32acc-rsum2-avx512skx.c
src/f16-rminmax/gen/f16-rmax-avx512skx-u64-acc4.c
src/f16-rminmax/gen/f16-rmin-avx512skx-u64-acc4.c
src/f16-rminmax/gen/f16-rminmax-avx512skx-u64-acc4.c
Expand Down
2 changes: 2 additions & 0 deletions cmake/gen/avx_microkernels.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,12 @@ SET(PROD_AVX_MICROKERNEL_SRCS
src/f32-rdminmax/gen/f32-rdmax-2p2x-avx-u32.c
src/f32-rdminmax/gen/f32-rdmin-2p2x-avx-u32.c
src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx-u32.c
src/f32-rdsum2/gen/f32-rdsum2-7p7x-minmax-avx.c
src/f32-rminmax/gen/f32-rmax-avx-u32-acc4.c
src/f32-rminmax/gen/f32-rmin-avx-u32-acc4.c
src/f32-rminmax/gen/f32-rminmax-avx-u32-acc4.c
src/f32-rsum/gen/f32-rsum-avx-u32-acc4.c
src/f32-rsum2/gen/f32-rsum2-avx-u8.c
src/f32-vapproxgelu/gen/f32-vapproxgelu-avx-rational-12-10-div.c
src/f32-vbinary/gen/f32-vadd-avx-u16.c
src/f32-vbinary/gen/f32-vaddc-avx-u16.c
Expand Down
2 changes: 2 additions & 0 deletions cmake/gen/f16c_microkernels.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@ SET(PROD_F16C_MICROKERNEL_SRCS
src/f16-avgpool/gen/f16-avgpool-9p-minmax-f16c.c
src/f16-f32-vcvt/gen/f16-f32-vcvt-f16c-u16.c
src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-f16c-u32.c
src/f16-f32acc-rdsum2/gen/f16-f32acc-rdsum2-7p7x-f16c.c
src/f16-f32acc-rsum/gen/f16-f32acc-rsum-f16c-u32-acc4.c
src/f16-f32acc-rsum2/gen/f16-f32acc-rsum2-f16c.c
src/f16-rminmax/f16-rmax-f16c-u32.c
src/f16-vbinary/gen/f16-vadd-f16c-u16.c
src/f16-vbinary/gen/f16-vaddc-f16c-u16.c
Expand Down
1 change: 1 addition & 0 deletions cmake/gen/hvx_microkernels.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ SET(NON_PROD_HVX_MICROKERNEL_SRCS
src/f32-raddstoreexpminusmax/gen/f32-raddstoreexpminusmax-hvx-rr2-p5-u128-acc4.c
src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-hvx-u32.c
src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-hvx-u64.c
src/f32-rdsum2/gen/f32-rdsum2-7p7x-minmax-hvx.c
src/f32-rminmax/gen/f32-rmax-hvx-u32.c
src/f32-rminmax/gen/f32-rmax-hvx-u96-acc3.c
src/f32-rminmax/gen/f32-rmax-hvx-u128-acc2.c
Expand Down
2 changes: 2 additions & 0 deletions cmake/gen/neon_microkernels.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,12 @@ SET(PROD_NEON_MICROKERNEL_SRCS
src/f32-rdminmax/gen/f32-rdmax-2p2x-neon-u32.c
src/f32-rdminmax/gen/f32-rdmin-2p2x-neon-u32.c
src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-neon-u16.c
src/f32-rdsum2/gen/f32-rdsum2-7p7x-minmax-neon.c
src/f32-rminmax/gen/f32-rmax-neon-u16-acc4.c
src/f32-rminmax/gen/f32-rmin-neon-u16-acc4.c
src/f32-rminmax/gen/f32-rminmax-neon-u16-acc4.c
src/f32-rsum/gen/f32-rsum-neon-u16-acc4.c
src/f32-rsum2/gen/f32-rsum2-neon.c
src/f32-spmm/gen/f32-spmm-32x1-minmax-neon.c
src/f32-vapproxgelu/gen/f32-vapproxgelu-neon-rational-12-10-div.c
src/f32-vbinary/gen/f32-vadd-neon-u8.c
Expand Down
2 changes: 2 additions & 0 deletions cmake/gen/neonfp16arith_microkernels.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,9 @@ SET(PROD_NEONFP16ARITH_MICROKERNEL_SRCS
src/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-5x5p2-minmax-neonfp16arith-1x8.c
src/f16-dwconv2d-chw/gen/f16-dwconv2d-chw-5x5s2p2-minmax-neonfp16arith-1x8.c
src/f16-f32acc-rdsum/gen/f16-f32acc-rdsum-7p7x-minmax-neonfp16arith-u16.c
src/f16-f32acc-rdsum2/gen/f16-f32acc-rdsum2-7p7x-minmax-neonfp16arith.c
src/f16-f32acc-rsum/gen/f16-f32acc-rsum-neonfp16arith-u32-acc4.c
src/f16-f32acc-rsum2/gen/f16-f32acc-rsum2-neonfp16arith.c
src/f16-gemm/gen/f16-gemm-1x8-minmax-neonfp16arith-ld64.c
src/f16-gemm/gen/f16-gemm-1x16-minmax-neonfp16arith-ld64.c
src/f16-gemm/gen/f16-gemm-6x8-minmax-neonfp16arith-ld64.c
Expand Down
2 changes: 2 additions & 0 deletions cmake/gen/scalar_microkernels.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -76,10 +76,12 @@ SET(PROD_SCALAR_MICROKERNEL_SRCS
src/f32-rdminmax/gen/f32-rdmax-2p2x-scalar-u2.c
src/f32-rdminmax/gen/f32-rdmin-2p2x-scalar-u2.c
src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-scalar.c
src/f32-rdsum2/gen/f32-rdsum2-7p7x-minmax-scalar.c
src/f32-rminmax/gen/f32-rmax-scalar-u4-acc4.c
src/f32-rminmax/gen/f32-rmin-scalar-u4-acc4.c
src/f32-rminmax/gen/f32-rminmax-scalar-u4-acc4.c
src/f32-rsum/gen/f32-rsum-scalar-u4-acc4.c
src/f32-rsum2/gen/f32-rsum2-scalar-u1.c
src/f32-spmm/gen/f32-spmm-8x1-minmax-scalar.c
src/f32-spmm/gen/f32-spmm-8x2-minmax-scalar.c
src/f32-spmm/gen/f32-spmm-8x4-minmax-scalar.c
Expand Down
Loading
Loading