diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 6c3e63665..a932c103d 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -291,7 +291,7 @@ jobs:
          source ./env-setup/12.8_env_setup.sh
          source ./gpu-app-collection/src/setup_environment
          rm -rf ./hw_run/
-          ./util/tracer_nvbit/run_hw_trace.py -B rodinia_2.0-ft -D 7
+          ./util/tracer_nvbit/run_hw_trace.py -B rodinia_2.0-ft -D 7 --spinlock_handling none
      - name: generate-rodinia_2.0-ft-hw_stats
        run: |
          source ./env-setup/12.8_env_setup.sh
diff --git a/.github/workflows/weekly.yml b/.github/workflows/weekly.yml
index 91cc95911..573495be3 100644
--- a/.github/workflows/weekly.yml
+++ b/.github/workflows/weekly.yml
@@ -34,7 +34,7 @@ jobs:
          source ./env-setup/12.8_env_setup.sh
          export PATH=/home/tgrogers-raid/a/common/python2:$PATH
          rm -rf ./gpu-app-collection/
-          git clone git@github.com:accel-sim/gpu-app-collection.git
+          git clone --recursive git@github.com:accel-sim/gpu-app-collection.git
          source ./gpu-app-collection/src/setup_environment
          ln -s /home/tgrogers-raid/a/common/data_dirs ./gpu-app-collection/
          make -j8 -C ./gpu-app-collection/src rodinia-3.1
@@ -53,6 +53,24 @@ jobs:
          ln -s /scratch/tgrogers-disk01/a/common/for-sharing/$USER/nightly-traces ./hw_run
          ./util/tracer_nvbit/run_hw_trace.py -B rodinia-3.1,GPU_Microbenchmark -D 7
          # ./util/tracer_nvbit/run_hw_trace.py -B rodinia-3.1,GPU_Microbenchmark,parboil,polybench,cutlass_5_trace,Deepbench_nvidia_tencore,Deepbench_nvidia_normal -D 7
+      - name: generate-spinlock-traces-spinlock_handling
+        run: |
+          source ./env-setup/12.8_env_setup.sh
+          source ./gpu-app-collection/src/setup_environment
+          rm -rf ./hw_run/
+          ./util/tracer_nvbit/run_hw_trace.py -B Spinlock -D 7 --spinlock_handling fast_forward
+          mv ./hw_run ./hw_run_fast_forward
+          ./util/tracer_nvbit/run_hw_trace.py -B Spinlock -D 7 --spinlock_handling none
+          mv ./hw_run ./hw_run_none
+      - name: test-new-traces-spinlock_handling
+        # Test only the fast-forwarded traces; the traces generated with --spinlock_handling none take too long to run (~2-3 hr)
+        run: |
+          source ./env-setup/12.8_env_setup.sh
+          source ./gpu-simulator/setup_environment.sh
+          ./util/job_launching/run_simulations.py -B Spinlock -C QV100-SASS -T ./hw_run_fast_forward/traces/device-7/ -N spinlock-microbenchmark-$$-fast_forward
+          ./util/job_launching/monitor_func_test.py -I -v -s spinlock-stats-per-app.csv -N spinlock-microbenchmark-$$-fast_forward
+          # ./util/job_launching/run_simulations.py -B Spinlock -C QV100-SASS -T ./hw_run_none/traces/device-7/ -N spinlock-microbenchmark-$$-none
+          # ./util/job_launching/monitor_func_test.py -I -v -s spinlock-stats-per-app.csv -N spinlock-microbenchmark-$$-none
  SASS-Weekly:
    needs: [Tracer-Weekly]
    if: github.repository == 'accel-sim/accel-sim-framework'
diff --git a/.gitignore b/.gitignore
index 7a3451053..877bb3f22 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,4 +13,6 @@ gpu-simulator/gpgpu-sim
 extern
 gpu-simulator/accel_sim.pyi
 compile_commands.json
-.cache
\ No newline at end of file
+.cache
+.cursorrules
+CLAUDE.md
\ No newline at end of file
diff --git a/README.md b/README.md
index f396ff79c..d339b8b56 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,7 @@
   - [Accel-Sim Components](#accel-sim-components)
     - [Accel-Sim Tracer](#accel-sim-tracer)
       - [A simple example](#a-simple-example)
+      - [Spinlock handling](#spinlock-handling)
       - [Pre-traced applications](#pre-traced-applications)
     - [Accel-Sim SASS Frontend and Simulation Engine](#accel-sim-sass-frontend-and-simulation-engine)
     - [Accel-Sim Correlator](#accel-sim-correlator)
@@ -113,6 +114,18 @@
 That's it. The traces for the short-running rodinia tests will be generated in:
 To extend the tracer, use other apps and understand what, exactly, is going on, read [this](https://github.com/accel-sim/accel-sim-framework/blob/dev/util/tracer_nvbit/README.md).
+
+#### Spinlock handling
+
+If your application contains spinlock instructions, you can have the tracer handle them by using the following command:
+
+```bash
+./util/tracer_nvbit/run_hw_trace.py -B rodinia_2.0-ft -D 7 --spinlock_handling fast_forward
+```
+
+This will fast-forward the spinlock instructions, keeping them in the trace only for the number of iterations specified by the `--spinlock_fast_forward_iterations` option.
+
+The tool for spinlock detection is in `./util/tracer_nvbit/others/spinlock_tool/`.
+
 #### Pre-traced applications
 For convenience, we have included a repository of pre-traced applications - to get all those traces, simply run:
 ```bash
diff --git a/util/job_launching/apps/define-all-apps.yml b/util/job_launching/apps/define-all-apps.yml
index d6f8a405b..3d0a6e416 100644
--- a/util/job_launching/apps/define-all-apps.yml
+++ b/util/job_launching/apps/define-all-apps.yml
@@ -126,6 +126,14 @@ GPU_Atomic:
      - args: 16
        accel-sim-mem: 1G
 
+Spinlock:
+    exec_dir: "$GPUAPPS_ROOT/bin/$CUDA_VERSION/release/"
+    data_dirs: "$GPUAPPS_ROOT/data_dirs/"
+    execs:
+        - spinlock_simple:
+            - args:
+              accel-sim-mem: 1G
+
 Atomic_Profile:
    exec_dir: "$GPUAPPS_ROOT/bin/$CUDA_VERSION/release/"
    data_dirs: "$GPUAPPS_ROOT/data_dirs/"
diff --git a/util/tracer_nvbit/.gitignore b/util/tracer_nvbit/.gitignore
index c1adccf67..e26001f3f 100644
--- a/util/tracer_nvbit/.gitignore
+++ b/util/tracer_nvbit/.gitignore
@@ -1,7 +1,4 @@
 nvbit_release/
-silicon_checkpoint_tool/checkpoint/checkpoint.o
-silicon_checkpoint_tool/checkpoint/checkpoint.so
-tracer_tool/tracer_tool.o
-tracer_tool/tracer_tool.so
-tracer_tool/inject_funcs.o
+*.o
+*.so
 tracer_tool/traces-processing/post-traces-processing
diff --git a/util/tracer_nvbit/Makefile b/util/tracer_nvbit/Makefile
index a3433c74f..277e21cf6 100644
--- a/util/tracer_nvbit/Makefile
+++ b/util/tracer_nvbit/Makefile
@@ -1,9 +1,10 @@
 all:
-	make -C tracer_tool
-	make -C tracer_tool/traces-processing
-	#make -C silicon_checkpoint_tool
+	$(MAKE) -C tracer_tool
+	$(MAKE) -C tracer_tool/traces-processing
+	$(MAKE) -C others/spinlock_tool
+#$(MAKE) -C silicon_checkpoint_tool
 
 clean:
-	make clean -C tracer_tool
-	make clean -C tracer_tool/traces-processing
+	$(MAKE) clean -C tracer_tool
+	$(MAKE) clean -C tracer_tool/traces-processing
diff --git a/util/tracer_nvbit/others/spinlock_tool/Makefile b/util/tracer_nvbit/others/spinlock_tool/Makefile
new file mode 100644
index 000000000..cdccff25f
--- /dev/null
+++ b/util/tracer_nvbit/others/spinlock_tool/Makefile
@@ -0,0 +1,79 @@
+# SPDX-FileCopyrightText: Copyright (c) 2017 NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+#    contributors may be used to endorse or promote products derived from
+#    this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+NVCC=nvcc -ccbin=$(CXX) -D_FORCE_INLINES
+PTXAS=ptxas
+
+NVCC_VER_REQ=10.1
+NVCC_VER=$(shell $(NVCC) --version | grep release | cut -f2 -d, | cut -f3 -d' ')
+NVCC_VER_CHECK=$(shell echo "${NVCC_VER} >= $(NVCC_VER_REQ)" | bc)
+
+ifeq ($(NVCC_VER_CHECK),0)
+$(error ERROR: nvcc version >= $(NVCC_VER_REQ) required to compile an nvbit tool! Instrumented applications can still use lower versions of nvcc.)
+endif
+
+PTXAS_VER_ADD_FLAG=12.3
+PTXAS_VER=$(shell $(PTXAS) --version | grep release | cut -f2 -d, | cut -f3 -d' ')
+PTXAS_VER_CHECK=$(shell echo "${PTXAS_VER} >= $(PTXAS_VER_ADD_FLAG)" | bc)
+
+ifeq ($(PTXAS_VER_CHECK), 0)
+MAXRREGCOUNT_FLAG=-maxrregcount=24
+else
+MAXRREGCOUNT_FLAG=
+endif
+
+NVBIT_PATH=../../nvbit_release/core
+INCLUDES=-I$(NVBIT_PATH)
+
+LIBS=-L$(NVBIT_PATH) -lnvbit
+NVCC_PATH=-L $(subst bin/nvcc,lib64,$(shell which nvcc | tr -s /))
+
+SOURCES=$(wildcard *.cu)
+
+OBJECTS=$(SOURCES:.cu=.o)
+ARCH?=all
+
+mkfile_path := $(abspath $(lastword $(MAKEFILE_LIST)))
+current_dir := $(notdir $(patsubst %/,%,$(dir $(mkfile_path))))
+
+NVBIT_TOOL=$(current_dir).so
+
+all: $(NVBIT_TOOL)
+
+$(NVBIT_TOOL): $(OBJECTS) $(NVBIT_PATH)/libnvbit.a
+	$(NVCC) -arch=$(ARCH) -O3 $(OBJECTS) $(LIBS) $(NVCC_PATH) -lcuda -lcudart_static -shared -o $@
+
+%.o: %.cu
+	$(NVCC) -dc -c -std=c++17 $(INCLUDES) -Xptxas -cloning=no -Xcompiler -Wall -arch=$(ARCH) -O3 -Xcompiler -fPIC $< -o $@
+
+inject_funcs.o: inject_funcs.cu
+	$(NVCC) $(INCLUDES) $(MAXRREGCOUNT_FLAG) -Xptxas -astoolspatch --keep-device-functions -arch=$(ARCH) -Xcompiler -Wall -Xcompiler -fPIC -c $< -o $@
+
+clean:
+	rm -f *.so *.o
diff --git a/util/tracer_nvbit/others/spinlock_tool/README.md b/util/tracer_nvbit/others/spinlock_tool/README.md
new file mode 100644
index 000000000..fb4381462
--- /dev/null
+++ b/util/tracer_nvbit/others/spinlock_tool/README.md
@@ -0,0 +1,20 @@
+# Spinlock tool
+
+## Description
+
+This tool is used to detect spinlocks in the kernel code.
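+
+As an illustration, a kernel like the hypothetical one below contains a spin
+loop whose per-instruction execution counts vary from run to run, while the
+rest of the kernel is deterministic; that difference between two profiled runs
+is the signal this tool looks for (`spin_kernel` is an assumed example, not
+part of the tool):
+
+```cuda
+// Hypothetical spinlock kernel, only for illustrating what gets detected.
+__global__ void spin_kernel(int *lock, int *data) {
+    if (threadIdx.x == 0) {
+        // Acquire: how many times this loop executes depends on scheduling,
+        // so its instruction counts differ between otherwise identical runs.
+        while (atomicCAS(lock, 0, 1) != 0)
+            ;
+        *data += 1;           // critical section: runs exactly once per block
+        atomicExch(lock, 0);  // release
+    }
+}
+```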
+
+## Usage
+
+```bash
+# Run the program a first time to get an instruction histogram of its kernels
+SPINLOCK_PHASE=0 CUDA_INJECTION64_PATH=PATH/TO/spinlock_tool.so program
+
+# Run the program a second time to get another instruction histogram of its kernels.
+# At tool termination, this run generates a file named spinlock_detection/spinlock_instructions.txt
+# containing the indices of the spinlock instructions in the program's kernels.
+SPINLOCK_PHASE=1 CUDA_INJECTION64_PATH=PATH/TO/spinlock_tool.so program
+
+# To fast-forward the spinlock instructions with the accel-sim tracer, use the following command
+ENABLE_SPINLOCK_FAST_FORWARD=1 CUDA_INJECTION64_PATH=PATH/TO/tracer_tool.so program
+```
diff --git a/util/tracer_nvbit/others/spinlock_tool/common.h b/util/tracer_nvbit/others/spinlock_tool/common.h
new file mode 100644
index 000000000..74fe3c696
--- /dev/null
+++ b/util/tracer_nvbit/others/spinlock_tool/common.h
@@ -0,0 +1,227 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2019 NVIDIA CORPORATION & AFFILIATES.
+ * All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <string>
+#include <map>
+#include <utility>
+#include <fstream>
+#include <sstream>
+#include <regex>
+#include <iostream>
+
+/* information collected in the instrumentation function and passed
+ * on the channel from the GPU to the CPU */
+typedef struct {
+    uint32_t instr_idx;
+    uint32_t count;
+} instr_count_t;
+
+
+/* Class to hold kernel instruction histogram */
+class KernelInstructionHistogram {
+public:
+    KernelInstructionHistogram()
+        : id(0), name("dummy"), histogram(std::map<uint32_t, uint64_t>()) {
+    }
+
+    KernelInstructionHistogram(uint32_t id, std::string name)
+        : id(id), name(name), histogram(std::map<uint32_t, uint64_t>()) {
+    }
+
+    void add(uint32_t instr_idx, uint64_t count) {
+        if (histogram.find(instr_idx) == histogram.end()) {
+            histogram[instr_idx] = count;
+        } else {
+            histogram[instr_idx] += count;
+        }
+    }
+
+    void merge(const KernelInstructionHistogram& other, bool use_hash = false) {
+        for (const auto& [instr_idx, count] : other.histogram) {
+            if (use_hash) {
+                // Simple modulo hash operation
+                add(instr_idx, count % hash_prime);
+                histogram[instr_idx] %= hash_prime;
+            } else {
+                add(instr_idx, count);
+            }
+        }
+    }
+
+    void reinit(uint32_t id, std::string name) {
+        this->id = id;
+        this->name = name;
+        histogram.clear();
+    }
+
+    std::map<uint32_t, std::pair<uint64_t, uint64_t>> findSpinlock(const KernelInstructionHistogram& other) {
+        // Find instructions that have different execution counts between two runs
+        // These are likely part of spinlock sections
+        std::map<uint32_t, std::pair<uint64_t, uint64_t>> spinlockInstructions;
+
+        // Check all instructions in this histogram
+        for (const auto& [instrIdx, count] : histogram) {
+            auto otherIt = other.histogram.find(instrIdx);
+            if (otherIt != other.histogram.end()) {
+                // Instruction exists in both histograms
+                if (count != otherIt->second) {
+                    // Different execution counts - likely spinlock
+                    spinlockInstructions[instrIdx] = {count, otherIt->second};
+                }
+            } else {
+                // Instruction only exists in this histogram
+                spinlockInstructions[instrIdx] = {count, 0};
+            }
+        }
+
+        // Check instructions that only exist in the other histogram
+        for (const auto& [instrIdx, count] : other.histogram) {
+            if (histogram.find(instrIdx) == histogram.end()) {
+                // Instruction only exists in other histogram
+                spinlockInstructions[instrIdx] = {0, count};  // Mark as 0 in this run
+            }
+        }
+
+        return spinlockInstructions;
+    }
+
+    // Save histogram to file
+    bool saveToFile(const std::string& filename) const {
+        std::ofstream file(filename);
+        if (!file.is_open()) {
+            return false;
+        }
+        file << serialize();
+        file.close();
+        return true;
+    }
+
+    // Load histogram from file
+    bool loadFromFile(const std::string& filename) {
+        std::ifstream file(filename);
+        if (!file.is_open()) {
+            return false;
+        }
+
+        std::stringstream buffer;
+        buffer << file.rdbuf();
+        file.close();
+
+        deserialize(buffer.str());
+        return true;
+    }
+
+    // Get total instruction count
+    uint64_t getTotalInstructionCount() const {
+        uint64_t total = 0;
+        for (const auto& [instrIdx, count] : histogram) {
+            total += count;
+        }
+        return total;
+    }
+
+    // Get number of unique instructions
+    size_t getUniqueInstructionCount() const {
+        return histogram.size();
+    }
+
+    // Check if histogram is empty
+    bool isEmpty() const {
+        return histogram.empty();
+    }
+
+    // Clear histogram
+    void clear() {
+        histogram.clear();
+    }
+
+    std::string serialize() const {
+        std::stringstream ss;
+        ss << "Kernel: " << name << " (ID: " << id << ")" << std::endl;
+        for (const auto &[instr_idx, count] : histogram) {
+            ss << instr_idx << ": " << count << std::endl;
+        }
+        return ss.str();
+    }
+
+    void deserialize(const std::string& data) {
+        // Deserialize the data following the serialize format
+        //   Kernel: <name> (ID: <id>)
+        //   <instr_idx>: <count>
+        //   <instr_idx>: <count>
+        //   ...
+        std::stringstream ss(data);
+        std::string line;
+
+        // Clear existing histogram
+        histogram.clear();
+
+        // Regex patterns for parsing
+        std::regex headerPattern(R"(Kernel:\s*(.+?)\s*\(ID:\s*(\d+)\))");
+        std::regex instructionPattern(R"(\s*(\d+)\s*:\s*(\d+)\s*)");
+
+        // Parse header line: "Kernel: <name> (ID: <id>)"
+        if (std::getline(ss, line)) {
+            std::smatch headerMatch;
+            if (std::regex_match(line, headerMatch, headerPattern)) {
+                if (headerMatch.size() >= 3) {
+                    name = headerMatch[1].str();
+                    id = std::stoul(headerMatch[2].str());
+                }
+            }
+        }
+
+        // Parse instruction count lines: "<instr_idx>: <count>"
+        while (std::getline(ss, line)) {
+            if (line.empty()) continue;
+
+            std::smatch instructionMatch;
+            if (std::regex_match(line, instructionMatch, instructionPattern)) {
+                if (instructionMatch.size() >= 3) {
+                    try {
+                        uint32_t instrIdx = std::stoul(instructionMatch[1].str());
+                        uint32_t count = std::stoul(instructionMatch[2].str());
+                        histogram[instrIdx] = count;
+                    } catch (const std::exception& e) {
+                        // Skip malformed lines
+                        continue;
+                    }
+                }
+            }
+        }
+    }
+
+    uint32_t id;
+    std::string name;
+    std::map<uint32_t, uint64_t> histogram;
+    // A large 30-bit prime number for hashing to avoid overflow
+    static constexpr uint32_t hash_prime = 1073741789;
+};
\ No newline at end of file
diff --git a/util/tracer_nvbit/others/spinlock_tool/inject_funcs.cu b/util/tracer_nvbit/others/spinlock_tool/inject_funcs.cu
new file mode 100644
index 000000000..a18a6f475
--- /dev/null
+++ b/util/tracer_nvbit/others/spinlock_tool/inject_funcs.cu
@@ -0,0 +1,66 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2019 NVIDIA CORPORATION & AFFILIATES.
+ * All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+
+#include "utils/utils.h"
+
+/* for channel */
+#include "utils/channel.hpp"
+
+/* contains definition of the instr_count_t structure */
+#include "common.h"
+
+extern "C" __device__ __noinline__ void count_instr(int predicate, uint32_t instr_idx,
+                                                    uint64_t grid_launch_id,
+                                                    uint64_t pchannel_dev) {
+    const int active_mask = __ballot_sync(__activemask(), 1);
+    const int predicate_mask = __ballot_sync(__activemask(), predicate);
+
+    const int laneid = get_laneid();
+    const int first_laneid = __ffs(active_mask) - 1;
+
+    // Count instruction for all threads
+    const int count = __popc(predicate_mask);
+    instr_count_t packet;
+
+    // Also count the warp-level instruction itself (+1)
+    packet.instr_idx = instr_idx;
+    packet.count = count + 1;
+
+    /* first active lane pushes information on the channel */
+    if (first_laneid == laneid) {
+        ChannelDev* channel_dev = (ChannelDev*)pchannel_dev;
+        channel_dev->push(&packet, sizeof(instr_count_t));
+    }
+}
+
\ No newline at end of file
diff --git a/util/tracer_nvbit/others/spinlock_tool/spinlock_tool.cu b/util/tracer_nvbit/others/spinlock_tool/spinlock_tool.cu
new file mode 100644
index 000000000..01a5625f5
--- /dev/null
+++ b/util/tracer_nvbit/others/spinlock_tool/spinlock_tool.cu
@@ -0,0 +1,781 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2019 NVIDIA CORPORATION & AFFILIATES.
+ * All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * A tool to identify spinlock sections in the SASS kernel.
+ * The idea is to run the tool twice and count the number of executions of
+ * each instruction. For the non-deterministic parts (aka the spinlock in our
+ * GEMM app), the execution counts will differ between the two runs, thus we
+ * can identify the spinlock section.
+ *
+ * You will need to pass SPINLOCK_PHASE=0 for the first run and SPINLOCK_PHASE=1 for the second run.
+ * Each run will generate a folder with the name of ctx_<ctx id>/spinlock_run_<phase> and a spinlock_run_<phase>_merged folder.
+ * Each folder will contain a file with the name of kernel-<kernel id>.histogram.
+ * The spinlock_run_<phase>_merged folder will contain a file for each kernel with the name of kernel-<kernel id>.histogram.
+ * When SPINLOCK_PHASE=1, the tool will also check for spinlock instructions during context termination,
+ * and generate a file with the name of spinlock_instructions.txt, with each
+ * line containing the kernel id, kernel name, and the indices of spinlock instructions.
+ * The indices are the instruction indices in the kernel function.
+ */
+
+#include <assert.h>
+#include <pthread.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string>
+#include <map>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+#include <filesystem>
+
+/* every tool needs to include this once */
+#include "nvbit_tool.h"
+
+/* nvbit interface file */
+#include "nvbit.h"
+
+/* for channel */
+#include "utils/channel.hpp"
+
+/* contains definition of the instr_count_t structure */
+#include "common.h"
+
+#define HEX(x)                                                            \
+    "0x" << std::setfill('0') << std::setw(16) << std::hex << (uint64_t)x \
+         << std::dec
+
+#define CHANNEL_SIZE (1l << 20)
+
+enum class RecvThreadState {
+    WORKING,
+    STOP,
+    FINISHED,
+};
+
+struct CTXstate {
+    /* context id */
+    int id;
+
+    /* Channel used to communicate from GPU to CPU receiving thread */
+    ChannelDev* channel_dev;
+    ChannelHost channel_host;
+
+    // After initialization, set it to WORKING to make recv thread get data,
+    // parent thread sets it to STOP to make recv thread stop working.
+    // recv thread sets it to FINISHED when it cleans up.
+    // parent thread should wait until the state becomes FINISHED to clean up.
+    volatile RecvThreadState recv_thread_done = RecvThreadState::STOP;
+
+    // Kernel receiving state
+    volatile bool kernel_receiving_done = false;
+
+    // Kernel instruction histogram
+    KernelInstructionHistogram* instr_histogram = nullptr;
+};
+
+/* lock */
+pthread_mutex_t mutex;
+pthread_mutex_t cuda_event_mutex;
+
+/* map to store context state */
+std::unordered_map<CUcontext, CTXstate*> ctx_state_map;
+
+/* skip flag used to avoid re-entry on the nvbit_callback when issuing
+ * flush_channel kernel call */
+bool skip_callback_flag = false;
+
+/* global control variables for this tool */
+uint32_t instr_begin_interval = 0;
+uint32_t instr_end_interval = UINT32_MAX;
+int verbose = 0;
+
+#define DPRINTF(fmt, ...) {if (verbose) printf(fmt, ##__VA_ARGS__);}
+
+/* opcode to id map and reverse map */
+std::map<std::string, int> opcode_to_id_map;
+std::map<int, std::string> id_to_opcode_map;
+
+/* grid launch id, incremented at every launch */
+uint64_t global_grid_launch_id = 0;
+
+/* Spinlock phase */
+int spinlock_phase = 0;
+// At the end of phase SPINLOCK_PHASE_CHECK, we will compare the merged
+// histograms from the different runs to output a file containing the
+// instructions of each kernel that are nondeterministic.
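+// A hypothetical example of the resulting spinlock_instructions.txt (the
+// format follows spinlock_check() below: "<kernel id>, <kernel name>: "
+// followed by space-separated instruction indices; names and indices here
+// are made up for illustration):
+//   0, _Z11spin_kernelPiPi: 12 13 14
+//   1, _Z12other_kernelPf: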
+const int SPINLOCK_PHASE_CHECK = 1;
+std::string spinlock_run_dir = "./";
+int spinlock_keep_intermediate_files = 0;
+void spinlock_check();
+
+void* recv_thread_fun(void* args);
+
+void nvbit_at_init() {
+    setenv("CUDA_MANAGED_FORCE_DEVICE_ALLOC", "1", 1);
+    GET_VAR_INT(
+        instr_begin_interval, "INSTR_BEGIN", 0,
+        "Beginning of the instruction interval where to apply instrumentation");
+    GET_VAR_INT(
+        instr_end_interval, "INSTR_END", UINT32_MAX,
+        "End of the instruction interval where to apply instrumentation");
+    GET_VAR_INT(verbose, "TOOL_VERBOSE", 0, "Enable verbosity inside the tool");
+    GET_VAR_INT(spinlock_phase, "SPINLOCK_PHASE", 0, "Spinlock phase");
+    GET_VAR_STR(spinlock_run_dir, "TRACES_FOLDER", "Spinlock detection base directory, use the same as the traces folder");
+    GET_VAR_INT(spinlock_keep_intermediate_files, "SPINLOCK_KEEP_INTERMEDIATE_FILES", 0, "Keep intermediate files");
+    std::string pad(100, '-');
+    printf("%s\n", pad.c_str());
+
+    /* set mutex as recursive */
+    pthread_mutexattr_t attr;
+    pthread_mutexattr_init(&attr);
+    pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
+    pthread_mutex_init(&mutex, &attr);
+
+    pthread_mutex_init(&cuda_event_mutex, &attr);
+
+    // Add trailing slash if not empty
+    if (!spinlock_run_dir.empty()) {
+        spinlock_run_dir += "/";
+    }
+}
+
+/**
+ * This function is called when the program terminates.
+ * We will use this to merge all contexts' kernel histograms by kernel name
+ * so that we can identify all the spinlock/non-deterministic sections of
+ * launched kernels instead of by launched kernel instances, whose context order
+ * is not guaranteed.
+ */
+void nvbit_at_term() {
+    // Read the spinlock_run_<phase> dir under each ctx_<id> dir; for each
+    // unique kernel name, we will have a vector of kernel histograms
+    using HistogramMapByName = std::map<std::string, std::vector<KernelInstructionHistogram*>>;
+    HistogramMapByName map;
+
+    // Build the histogram map by reading the spinlock_run_<phase> dir under ctx_<id>
+    // iterate the ctx_<id> dirs under the spinlock_detection folder
+    for (auto& folder : std::filesystem::directory_iterator(spinlock_run_dir + "spinlock_detection")) {
+        // If the folder is not a ctx_<id> dir, skip
+        if (folder.path().filename().string().find("ctx_") == std::string::npos) {
+            continue;
+        }
+
+        // Now we iterate the spinlock_run_<phase> dir under the ctx_<id> folder
+        std::string context_run_dir = folder.path().string() + "/spinlock_run_" + std::to_string(spinlock_phase);
+
+        // Build this histogram vector for this context
+        for (auto& file : std::filesystem::directory_iterator(context_run_dir)) {
+            if (file.path().extension() == ".histogram") {
+                KernelInstructionHistogram* histogram = new KernelInstructionHistogram();
+                histogram->loadFromFile(file.path().string());
+                map[histogram->name].push_back(histogram);
+            }
+        }
+    }
+
+    // Now, we merge all the histograms for each kernel name
+    std::vector<KernelInstructionHistogram*> merged_histograms;
+    size_t id = 0;
+    for (auto& [kernel_name, histograms] : map) {
+        KernelInstructionHistogram* merged_histogram = new KernelInstructionHistogram();
+        // Set the name to the kernel name
+        merged_histogram->name = kernel_name;
+        merged_histogram->id = id;
+        id++;
+        for (auto& histogram : histograms) {
+            // Use hash to merge the histograms to avoid overflow
+            merged_histogram->merge(*histogram, true);
+        }
+        merged_histograms.push_back(merged_histogram);
+    }
+
+    // For each merged histogram, save under the spinlock_run_<phase>_merged dir
+    std::string merged_run_dir = spinlock_run_dir + "spinlock_detection/spinlock_run_" + std::to_string(spinlock_phase) + "_merged";
+    std::error_code error_code;
+    bool success = std::filesystem::create_directories(merged_run_dir, error_code);
+    if (error_code) {
+        printf("Spinlock: Failed to create folder %s: %s\n", merged_run_dir.c_str(), error_code.message().c_str());
+        assert(false);
+    }
+
+    for (auto& histogram : merged_histograms) {
+        histogram->saveToFile(merged_run_dir + "/kernel-" + std::to_string(histogram->id) + ".histogram");
+    }
+
+    // Clean up
+    for (auto& histogram : merged_histograms) {
+        delete histogram;
+    }
+    for (auto& [name, histograms] : map) {
+        for (auto& histogram : histograms) {
+            delete histogram;
+        }
+    }
+
+    // Check for spinlock
+    if (spinlock_phase == SPINLOCK_PHASE_CHECK) {
+        spinlock_check();
+    }
+}
+
+/* Set used to avoid re-instrumenting the same functions multiple times */
+std::unordered_set<CUfunction> already_instrumented;
+
+void instrument_function_if_needed(CUcontext ctx, CUfunction func) {
+    assert(ctx_state_map.find(ctx) != ctx_state_map.end());
+    CTXstate* ctx_state = ctx_state_map[ctx];
+
+    /* Get related functions of the kernel (device function that can be
+     * called by the kernel) */
+    std::vector<CUfunction> related_functions =
+        nvbit_get_related_functions(ctx, func);
+
+    /* add kernel itself to the related function vector */
+    related_functions.push_back(func);
+
+    /* iterate on function */
+    for (auto f : related_functions) {
+        /* "recording" function was instrumented, if set insertion failed
+         * we have already encountered this function */
+        if (!already_instrumented.insert(f).second) {
+            continue;
+        }
+
+        /* get vector of instructions of function "f" */
+        const std::vector<Instr*>& instrs = nvbit_get_instrs(ctx, f);
+
+        DPRINTF(
+            "Spinlock: CTX %p, Inspecting CUfunction %p name %s at address "
+            "0x%lx\n",
+            ctx, f, nvbit_get_func_name(ctx, f), nvbit_get_func_addr(ctx, f));
+
+        uint32_t cnt = 0;
+        /* iterate on all the static instructions in the function */
+        for (auto instr : instrs) {
+            if (cnt < instr_begin_interval || cnt >= instr_end_interval) {
+                cnt++;
+                continue;
+            }
+            if (verbose) {
+                instr->printDecoded();
+            }
+
+            if (opcode_to_id_map.find(instr->getOpcode()) ==
+                opcode_to_id_map.end()) {
+                int opcode_id = opcode_to_id_map.size();
+                opcode_to_id_map[instr->getOpcode()] = opcode_id;
+                id_to_opcode_map[opcode_id] = std::string(instr->getOpcode());
+            }
+
+            int opcode_id = opcode_to_id_map[instr->getOpcode()];
+
+            // insert instrumentation function
+            nvbit_insert_call(instr, "count_instr", IPOINT_BEFORE);
+
+            // pass arguments
+            nvbit_add_call_arg_guard_pred_val(instr);
+            nvbit_add_call_arg_const_val32(instr, (uint32_t)instr->getIdx());
+            nvbit_add_call_arg_launch_val64(instr, 0);
+            nvbit_add_call_arg_const_val64(instr, (uint64_t)ctx_state->channel_dev);
+
+            cnt++;
+        }
+    }
+}
+
+/* flush channel */
+__global__ void flush_channel(ChannelDev* ch_dev) {
+    instr_count_t packet;
+    // Sentinel value to indicate the end of the histogram
+    packet.instr_idx = UINT32_MAX;
+    packet.count = 0;
+    ch_dev->push(&packet, sizeof(instr_count_t));
+    ch_dev->flush();
+}
+
+void init_context_state(CUcontext ctx) {
+    CTXstate* ctx_state = ctx_state_map[ctx];
+    ctx_state->id = (int)ctx_state_map.size() - 1;
+    ctx_state->recv_thread_done = RecvThreadState::WORKING;
+    cudaMallocManaged(&ctx_state->channel_dev, sizeof(ChannelDev));
+    ctx_state->channel_host.init(ctx_state->id, CHANNEL_SIZE,
+                                 ctx_state->channel_dev, recv_thread_fun, ctx);
+    nvbit_set_tool_pthread(ctx_state->channel_host.get_thread());
+}
+
+static void enter_kernel_launch(CUcontext ctx, CUfunction func,
+                                uint64_t &grid_launch_id, nvbit_api_cuda_t cbid, void* params,
+                                bool stream_capture = false, bool build_graph = false) {
+    CTXstate* ctx_state = ctx_state_map[ctx];
+
+    // no need to sync during stream capture or manual graph build, since no
+    // kernel is actually launched.
+    if (!stream_capture && !build_graph) {
+        /* Make sure GPU is idle */
+        cudaDeviceSynchronize();
+        assert(cudaGetLastError() == cudaSuccess);
+    }
+
+    // Initialize kernel instruction histogram map
+    if (ctx_state->instr_histogram == nullptr) {
+        ctx_state->instr_histogram = new KernelInstructionHistogram(grid_launch_id, nvbit_get_func_name(ctx, func, true));
+    } else {
+        ctx_state->instr_histogram->reinit(grid_launch_id, nvbit_get_func_name(ctx, func, true));
+    }
+
+    /* instrument */
+    instrument_function_if_needed(ctx, func);
+
+    int nregs = 0;
+    CUDA_SAFECALL(
+        cuFuncGetAttribute(&nregs, CU_FUNC_ATTRIBUTE_NUM_REGS, func));
+
+    int shmem_static_nbytes = 0;
+    CUDA_SAFECALL(
+        cuFuncGetAttribute(&shmem_static_nbytes,
+                           CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, func));
+
+    /* get function name and pc */
+    const char* func_name = nvbit_get_func_name(ctx, func);
+    uint64_t pc = nvbit_get_func_addr(ctx, func);
+
+    // during stream capture or manual graph build, no kernel is launched, so
+    // do not set launch argument, do not print kernel info, do not increase
+    // grid_launch_id. All these should be done at graph node launch time.
+    if (!stream_capture && !build_graph) {
+        /* set grid launch id at launch time */
+        nvbit_set_at_launch(ctx, func, (uint64_t)grid_launch_id);
+
+        if (cbid == API_CUDA_cuLaunchKernelEx_ptsz ||
+            cbid == API_CUDA_cuLaunchKernelEx) {
+            cuLaunchKernelEx_params* p = (cuLaunchKernelEx_params*)params;
+            printf(
+                "Spinlock: CTX 0x%016lx - LAUNCH - Kernel pc 0x%016lx - "
+                "Kernel name %s - grid launch id %ld - grid size %d,%d,%d "
+                "- block size %d,%d,%d - nregs %d - shmem %d - cuda stream "
+                "id %ld\n",
+                (uint64_t)ctx, pc, func_name, grid_launch_id,
+                p->config->gridDimX, p->config->gridDimY,
+                p->config->gridDimZ, p->config->blockDimX,
+                p->config->blockDimY, p->config->blockDimZ, nregs,
+                shmem_static_nbytes + p->config->sharedMemBytes,
+                (uint64_t)p->config->hStream);
+        } else {
+            cuLaunchKernel_params* p = (cuLaunchKernel_params*)params;
+            printf(
+                "Spinlock: CTX 0x%016lx - LAUNCH - Kernel pc 0x%016lx - "
+                "Kernel name %s - grid launch id %ld - grid size %d,%d,%d "
+                "- block size %d,%d,%d - nregs %d - shmem %d - cuda stream "
+                "id %ld\n",
+                (uint64_t)ctx, pc, func_name, grid_launch_id, p->gridDimX,
+                p->gridDimY, p->gridDimZ, p->blockDimX, p->blockDimY,
+                p->blockDimZ, nregs,
+                shmem_static_nbytes + p->sharedMemBytes,
+                (uint64_t)p->hStream);
+        }
+
+        // increment grid launch id for next launch
+        // grid id can be changed here, since nvbit_set_at_launch() has copied
+        // its value above.
+        grid_launch_id++;
+    }
+
+    /* enable instrumented code to run */
+    nvbit_enable_instrumented(ctx, func, true);
+
+    // Reset the kernel receiving done flag for new kernel launch
+    ctx_state->kernel_receiving_done = false;
+}
+
+// the function is only called for non cuda graph launch cases.
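+// Flow (see the body below): synchronize so the user kernel finishes, launch
+// flush_channel() to push the UINT32_MAX sentinel packet, wait for the
+// receiver thread to mark this kernel's histogram complete, then dump the
+// histogram to spinlock_detection/ctx_<id>/spinlock_run_<phase>/.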
+static void leave_kernel_launch(CTXstate *ctx_state, uint64_t &grid_launch_id) {
+    // make sure user kernel finishes to avoid deadlock
+    cudaDeviceSynchronize();
+    /* push a flush channel kernel */
+    flush_channel<<<1, 1>>>(ctx_state->channel_dev);
+
+    /* Make sure GPU is idle */
+    cudaDeviceSynchronize();
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        printf("cuda error: %s\n", cudaGetErrorName(err));
+    }
+    assert(err == cudaSuccess);
+
+    /* wait here until the receiving thread has finished with the
+     * current kernel */
+    while (!ctx_state->kernel_receiving_done) {
+        pthread_yield();
+    }
+
+    // Dump the histogram to file
+    // Make a folder for the histogram
+    std::string folder_name = spinlock_run_dir + "spinlock_detection/ctx_" + std::to_string(ctx_state->id) + "/spinlock_run_" + std::to_string(spinlock_phase);
+
+    std::error_code error_code;
+    std::filesystem::create_directories(folder_name, error_code);
+    if (error_code) {
+        printf("Spinlock: Failed to create folder %s: %s\n", folder_name.c_str(), error_code.message().c_str());
+        assert(false);
+    }
+
+    // Save the histogram to file in the form kernel-<kernel id>.histogram
+    bool success = ctx_state->instr_histogram->saveToFile( folder_name + "/" + "kernel-" + std::to_string(ctx_state->instr_histogram->id) + ".histogram");
+    assert(success);
+}
+
+void nvbit_at_cuda_event(CUcontext ctx, int is_exit, nvbit_api_cuda_t cbid,
+                         const char* name, void* params, CUresult* pStatus) {
+    pthread_mutex_lock(&cuda_event_mutex);
+
+    /* we prevent re-entry on this callback when issuing CUDA functions inside
+     * this function */
+    if (skip_callback_flag) {
+        pthread_mutex_unlock(&cuda_event_mutex);
+        return;
+    }
+    skip_callback_flag = true;
+
+    CTXstate* ctx_state = ctx_state_map[ctx];
+
+    switch (cbid) {
+        // Identify all the possible CUDA launch events without stream
+        // parameters, they will not get involved with cuda graph
+        case API_CUDA_cuLaunch:
+        case API_CUDA_cuLaunchGrid:
+        {
+            cuLaunch_params *p = (cuLaunch_params *)params;
+            CUfunction func = p->f;
+            if (!is_exit) {
+                enter_kernel_launch(ctx, func, global_grid_launch_id, cbid,
+                                    params);
+            } else {
+                leave_kernel_launch(ctx_state, global_grid_launch_id);
+            }
+        } break;
+        // To support kernels launched by cuda graph (in addition to the
+        // existing kernel launch method), we need to do:
+        //
+        // 1. instrument kernels at cudaGraphAddKernelNode event. This is for cases
+        // that kernels are manually added to a cuda graph.
+        // 2. distinguish captured kernels when kernels are recorded to a graph
+        // using stream capture. cudaStreamIsCapturing() tells us whether a stream
+        // is capturing.
+        // 3. per-kernel instruction counters, since cuda graph can launch multiple
+        // kernels at the same time.
+        //
+        // Three cases:
+        //
+        // 1. original kernel launch:
+        //    1a. for any kernel launch without using a stream, we instrument it
+        //    before it is launched, call cudaDeviceSynchronize after it is
+        //    launched and read the instruction counter of the kernel.
+        //    1b. for any kernel launch using a stream, but the stream is not
+        //    capturing, we do the same thing as 1a.
+        //
+        // 2. cuda graph using stream capturing: if a kernel is launched in a
+        // stream and the stream is capturing, we instrument the kernel before it
+        // is launched and do nothing after it is launched, because the kernel is
+        // not running until cudaGraphLaunch. Instead, we issue a
+        // cudaStreamSynchronize after cudaGraphLaunch is done and reset the
+        // instruction counters, since a cloned graph might be launched afterwards.
+        //
+        // 3. cuda graph manual: we instrument the kernel added by
+        // cudaGraphAddKernelNode and do the same thing for cudaGraphLaunch as 2.
+        //
+        // The above method should handle most of the cuda graph launch cases.
+        // kernel launches with stream parameter, they can be used for cuda graph
+        case API_CUDA_cuLaunchKernel_ptsz:
+        case API_CUDA_cuLaunchKernel:
+        case API_CUDA_cuLaunchCooperativeKernel:
+        case API_CUDA_cuLaunchCooperativeKernel_ptsz:
+        case API_CUDA_cuLaunchKernelEx:
+        case API_CUDA_cuLaunchKernelEx_ptsz:
+        case API_CUDA_cuLaunchGridAsync:
+        {
+            CUfunction func;
+            CUstream hStream;
+
+            if (cbid == API_CUDA_cuLaunchKernelEx_ptsz ||
+                cbid == API_CUDA_cuLaunchKernelEx) {
+                cuLaunchKernelEx_params* p =
+                    (cuLaunchKernelEx_params*)params;
+                func = p->f;
+                hStream = p->config->hStream;
+            } else if (cbid == API_CUDA_cuLaunchKernel_ptsz ||
+                       cbid == API_CUDA_cuLaunchKernel ||
+                       cbid == API_CUDA_cuLaunchCooperativeKernel_ptsz ||
+                       cbid == API_CUDA_cuLaunchCooperativeKernel) {
+                cuLaunchKernel_params* p = (cuLaunchKernel_params*)params;
+                func = p->f;
+                hStream = p->hStream;
+            } else {
+                cuLaunchGridAsync_params* p =
+                    (cuLaunchGridAsync_params*)params;
+                func = p->f;
+                hStream = p->hStream;
+            }
+
+            cudaStreamCaptureStatus streamStatus;
+            /* check if the stream is capturing, if yes, do not sync */
+            CUDA_SAFECALL(cudaStreamIsCapturing(hStream, &streamStatus));
+            if (!is_exit) {
+                bool stream_capture = (streamStatus == cudaStreamCaptureStatusActive);
+                enter_kernel_launch(ctx, func, global_grid_launch_id, cbid, params, stream_capture);
+            } else {
+                if (streamStatus != cudaStreamCaptureStatusActive) {
+                    if (verbose >= 1) {
+                        printf("kernel %s not captured by cuda graph\n", nvbit_get_func_name(ctx, func));
+                    }
+                    leave_kernel_launch(ctx_state, global_grid_launch_id);
+                } else {
+                    if (verbose >= 1) {
+                        printf("kernel %s captured by cuda graph\n", nvbit_get_func_name(ctx, func));
+                    }
+                }
+            }
+        } break;
+        case API_CUDA_cuGraphAddKernelNode:
+        {
+            cuGraphAddKernelNode_params *p = (cuGraphAddKernelNode_params *)params;
+            CUfunction func = p->nodeParams->func;
+
+            if (!is_exit) {
+                // cuGraphAddKernelNode_params->nodeParams is the same as
+                // cuLaunchKernel_params up to sharedMemBytes
+                enter_kernel_launch(ctx, func, global_grid_launch_id, cbid, (void*)p->nodeParams, false, true);
+            }
+        } break;
+        case API_CUDA_cuGraphLaunch:
+        {
+            // if we are exiting a cuda graph launch:
+            // Wait until the graph is completed using
+            // cudaStreamSynchronize()
+            if (is_exit) {
+                cuGraphLaunch_params *p = (cuGraphLaunch_params *)params;
+
+                CUDA_SAFECALL(cudaStreamSynchronize(p->hStream));
+                assert(cudaGetLastError() == cudaSuccess);
+                /* push a flush channel kernel */
+                flush_channel<<<1, 1, 0, p->hStream>>>(ctx_state->channel_dev);
+                CUDA_SAFECALL(cudaStreamSynchronize(p->hStream));
+                assert(cudaGetLastError() == cudaSuccess);
+            }
+
+        } break;
+        default:
+            break;
+    };
+
+
+    skip_callback_flag = false;
+    pthread_mutex_unlock(&cuda_event_mutex);
+}
+
+void* recv_thread_fun(void* args) {
+    CUcontext ctx = (CUcontext)args;
+
+    pthread_mutex_lock(&mutex);
+    /* get context state from map */
+    assert(ctx_state_map.find(ctx) != ctx_state_map.end());
+    CTXstate* ctx_state = ctx_state_map[ctx];
+
+    ChannelHost* ch_host = &ctx_state->channel_host;
+    pthread_mutex_unlock(&mutex);
+    char* recv_buffer = (char*)malloc(CHANNEL_SIZE);
+
+    while (ctx_state->recv_thread_done == RecvThreadState::WORKING) {
+        /* receive buffer from channel */
+        uint32_t num_recv_bytes = 0;
+        if (!ctx_state->kernel_receiving_done &&
+            (num_recv_bytes = ch_host->recv(recv_buffer, CHANNEL_SIZE)) > 0) {
+            uint32_t num_processed_bytes = 0;
+            while (num_processed_bytes < num_recv_bytes) {
+                instr_count_t* packet =
+                    (instr_count_t*)&recv_buffer[num_processed_bytes];
+
+                // Sentinel value to indicate the end of the histogram
+                if (packet->instr_idx == UINT32_MAX) {
+                    ctx_state->kernel_receiving_done = true;
+                    break;
+                }
+
+                // Increment the instruction count
+                ctx_state->instr_histogram->add(packet->instr_idx, packet->count);
+                DPRINTF("Spinlock: Incrementing instruction count for instruction %d by %d\n", packet->instr_idx, packet->count);
+
+                num_processed_bytes += sizeof(instr_count_t);
+            }
+        }
+    }
+    free(recv_buffer);
+    ctx_state->recv_thread_done = RecvThreadState::FINISHED;
+    return NULL;
+}
+
+void nvbit_at_ctx_init(CUcontext ctx) {
+    pthread_mutex_lock(&mutex);
+    DPRINTF("Spinlock: STARTING CONTEXT %p\n", ctx);
+    assert(ctx_state_map.find(ctx) == ctx_state_map.end());
+    CTXstate* ctx_state = new CTXstate;
+    ctx_state_map[ctx] = ctx_state;
+    pthread_mutex_unlock(&mutex);
+}
+
+void nvbit_tool_init(CUcontext ctx) {
+    pthread_mutex_lock(&mutex);
+    assert(ctx_state_map.find(ctx) != ctx_state_map.end());
+    init_context_state(ctx);
+    pthread_mutex_unlock(&mutex);
+}
+
+void nvbit_at_ctx_term(CUcontext ctx) {
+    pthread_mutex_lock(&mutex);
+    skip_callback_flag = true;
+    DPRINTF("Spinlock: TERMINATING CONTEXT %p\n", ctx);
+    /* get context state from map */
+    assert(ctx_state_map.find(ctx) != ctx_state_map.end());
+    CTXstate* ctx_state = ctx_state_map[ctx];
+
+    /* Notify receiver thread and wait for receiver thread to
+     * notify back */
+    ctx_state->recv_thread_done = RecvThreadState::STOP;
+    while (ctx_state->recv_thread_done != RecvThreadState::FINISHED)
+        ;
+
+    ctx_state->channel_host.destroy(false);
+    cudaFree(ctx_state->channel_dev);
+    skip_callback_flag = false;
+    delete ctx_state;
+    pthread_mutex_unlock(&mutex);
+}
+
+void nvbit_at_graph_node_launch(CUcontext ctx, CUfunction func,
+                                CUstream stream,
+                                uint64_t launch_handle) {
+    func_config_t config = {0};
+    const char* func_name = nvbit_get_func_name(ctx, func);
+    uint64_t pc = nvbit_get_func_addr(ctx, func);
+
+    pthread_mutex_lock(&mutex);
+    nvbit_set_at_launch(ctx, func, (uint64_t)global_grid_launch_id, stream,
+                        launch_handle);
+    nvbit_get_func_config(ctx, func, &config);
+
+    printf(
+        "Spinlock: CTX 0x%016lx - LAUNCH - Kernel pc 0x%016lx - "
+        "Kernel name %s - grid launch id %ld - grid size %d,%d,%d "
+        "- block size %d,%d,%d - nregs %d - shmem %d - cuda stream "
+        "id %ld\n",
+        (uint64_t)ctx, pc, func_name, global_grid_launch_id, config.gridDimX,
+        config.gridDimY, config.gridDimZ, config.blockDimX, config.blockDimY,
+        config.blockDimZ, config.num_registers,
+        config.shmem_static_nbytes + config.shmem_dynamic_nbytes,
+        (uint64_t)stream);
+    // grid id can be changed here, since nvbit_set_at_launch() has copied its
+    // value above.
+    global_grid_launch_id++;
+    pthread_mutex_unlock(&mutex);
+}
+
+void spinlock_check() {
+    // Get spinlock run folders
+    std::string spinlock_run0_folder = spinlock_run_dir + "spinlock_detection/spinlock_run_0_merged";
+    std::string spinlock_run1_folder = spinlock_run_dir + "spinlock_detection/spinlock_run_1_merged";
+
+    // Get the map from kernel name to histogram
+    std::map<std::string, KernelInstructionHistogram*> spinlock_run0_histograms;
+    std::map<std::string, KernelInstructionHistogram*> spinlock_run1_histograms;
+
+    // Load the histogram files
+    DPRINTF("Spinlock: Loading histograms from %s and %s\n", spinlock_run0_folder.c_str(), spinlock_run1_folder.c_str());
+    std::map<std::string, std::map<std::string, KernelInstructionHistogram*>*> zipped_folders = {{spinlock_run0_folder, &spinlock_run0_histograms}, {spinlock_run1_folder, &spinlock_run1_histograms}};
+    for (const auto& iter : zipped_folders) {
+        auto spinlock_run_folder = iter.first;
+        auto histograms = iter.second;
+        DPRINTF("Spinlock: Loading histograms from %s\n", spinlock_run_folder.c_str());
+        assert(std::filesystem::exists(spinlock_run_folder));
+        for (const auto& entry : std::filesystem::directory_iterator(spinlock_run_folder)) {
+            if (entry.is_regular_file() && entry.path().extension().compare(".histogram") == 0) {
+                DPRINTF("Spinlock: Loading histogram from %s\n", entry.path().string().c_str());
+                KernelInstructionHistogram *histogram = new KernelInstructionHistogram();
+                histogram->loadFromFile(entry.path().string());
+                histograms->insert({histogram->name, histogram});
+            }
+        }
+    }
+    DPRINTF("Spinlock: Loaded %zu histograms from %s and %zu histograms from %s\n", spinlock_run0_histograms.size(), spinlock_run0_folder.c_str(), spinlock_run1_histograms.size(), spinlock_run1_folder.c_str());
+    // Check if the kernel counts are the same
+    assert(spinlock_run0_histograms.size() == spinlock_run1_histograms.size());
+
+    DPRINTF("Spinlock: Comparing the two histograms\n");
+    // Now compare the two histograms and generate output of spinlock instructions per context
+    // Each row will be kernel id, kernel name, and indices of spinlock instructions
+    std::string output_file = spinlock_run_dir + "spinlock_detection/spinlock_instructions.txt";
+    std::ofstream output_file_stream(output_file);
+    DPRINTF("Spinlock: Generating output file %s\n", output_file.c_str());
+    for (auto [kernel_name, run0_histogram] : spinlock_run0_histograms) {
+        DPRINTF("Spinlock: Comparing histogram %d %s\n", run0_histogram->id, kernel_name.c_str());
+        auto run1_histogram = spinlock_run1_histograms.at(kernel_name);
+        auto spinlock_instructions = run0_histogram->findSpinlock(*run1_histogram);
+        DPRINTF("Spinlock: Found %zu spinlock instructions\n", spinlock_instructions.size());
+        output_file_stream << run0_histogram->id << ", " << kernel_name << ": ";
+        for (auto [instr_idx, counts] : spinlock_instructions) {
+            // Write to output file
+            output_file_stream << instr_idx << " ";
+        }
+        output_file_stream << "\n";
+    }
+    output_file_stream.close();
+    DPRINTF("Spinlock: Generated output file %s\n", output_file.c_str());
+    // Free the histograms
+    for (auto [kernel_name, histogram] : spinlock_run0_histograms) {
+        delete histogram;
+    }
+    for (auto [kernel_name, histogram] : spinlock_run1_histograms) {
+        delete histogram;
+    }
+
+    // Clean up intermediate files
+    if (!spinlock_keep_intermediate_files) {
+        // Remove the ctx_<id> and spinlock_run_<phase>_merged folders
+        for (auto& folder : std::filesystem::directory_iterator(spinlock_run_dir + "spinlock_detection")) {
+            if (folder.path().filename().string().find("ctx_") != std::string::npos) {
+                std::filesystem::remove_all(folder.path());
+            } else if (folder.path().filename().string().find("spinlock_run_") != std::string::npos) {
+                std::filesystem::remove_all(folder.path());
+            }
+        }
+    }
+    return;
+}
diff --git a/util/tracer_nvbit/run_hw_trace.py b/util/tracer_nvbit/run_hw_trace.py
index f6153eafb..07d22c41a 100755
--- a/util/tracer_nvbit/run_hw_trace.py
+++ b/util/tracer_nvbit/run_hw_trace.py
@@ -53,6 +53,20 @@
     action="store_true",
     help="Once the kernel limit is reached, terminate the tracing process",
 )
+parser.add_option(
+    "--spinlock_handling",
+    dest="spinlock_handling",
+    choices=["none", "fast_forward"],
+    default="none",
+    help="How to handle spinlock instructions",
+)
+parser.add_option(
+    "--spinlock_fast_forward_iterations",
+    dest="spinlock_fast_forward_iterations",
+    type=int,
+    default=1,
+    help="Number of iterations to keep for spinlock fast forwarding. Only used if spinlock_handling is fast_forward",
+)
 
 (options, args) = parser.parse_args()
 
@@ -68,6 +82,7 @@
 logfile = day_string + "--" + time_string + ".csv"
 
 nvbit_tracer_path = os.path.join(this_directory, "tracer_tool")
+nvbit_spinlock_path = os.path.join(this_directory, "others", "spinlock_tool")
 
 for bench in benchmarks:
     edir, ddir, exe, argslist = bench
@@ -143,13 +158,14 @@
            + '"; export CUDA_VISIBLE_DEVICES="'
            + options.device_num
            + '" ; '
+        )
+
+        tracer_contents = (
+            sh_contents
            + "\nrm -f traces/*"
            + "\nexport TRACES_FOLDER="
            + this_run_dir
-            + "; CUDA_INJECTION64_PATH="
-            + os.path.join(nvbit_tracer_path, "tracer_tool.so")
-            + " "
-            + " LD_PRELOAD="
+            + f"; ENABLE_SPINLOCK_FAST_FORWARD={1 if options.spinlock_handling == 'fast_forward' else 0} SPINLOCK_ITER_TO_KEEP={options.spinlock_fast_forward_iterations} CUDA_INJECTION64_PATH="
            + os.path.join(nvbit_tracer_path, "tracer_tool.so")
            + " "
            + exec_path
@@ -167,16 +183,46 @@
            + this_trace_folder
            + "/kernelslist "
        )
+
+        # Spinlock tool run script
+        # will run twice, once for phase 0 and once for phase 1
+        spinlock_contents = (
+            sh_contents
+            + "\nrm -f spinlock_detection/*"
+            + "\nexport TRACES_FOLDER="
+            + this_run_dir
+            + "; SPINLOCK_PHASE=0 CUDA_INJECTION64_PATH="
+            + os.path.join(nvbit_spinlock_path, "spinlock_tool.so")
+            + " "
+            + exec_path
+            + " "
+            + str(args)
+            + " ; "
+            + " SPINLOCK_PHASE=1 CUDA_INJECTION64_PATH="
+            + os.path.join(nvbit_spinlock_path, "spinlock_tool.so")
+            + " "
+            + exec_path
+            + " "
+            + str(args)
+            + " ; "
+        )
+
+        for path, content in [("run.sh", tracer_contents), ("run_spinlock_detection.sh", spinlock_contents)]:
+            open(os.path.join(this_run_dir, path), "w").write(content)
+            if subprocess.call(["chmod", "u+x", os.path.join(this_run_dir, path)]) != 0:
+                exit(f"Error chmod {path} runfile")
 
-        open(os.path.join(this_run_dir, "run.sh"), "w").write(sh_contents)
-        if subprocess.call(["chmod", "u+x", os.path.join(this_run_dir, "run.sh")]) != 0:
-            exit("Error chmod runfile")
        if not options.norun:
            saved_dir = os.getcwd()
            os.chdir(this_run_dir)
            print("Running {0}".format(exe))
+
+            # Call the spinlock detection script
+            if options.spinlock_handling == 'fast_forward':
+                if subprocess.call(["bash", "run_spinlock_detection.sh"]) != 0:
+                    sys.exit(f"Error invoking spinlock detection on {this_run_dir}")
+
            if subprocess.call(["bash", "run.sh"]) != 0:
-                sys.exit("Error invoking nvbit on {0}".format(this_run_dir))
+                sys.exit(f"Error invoking nvbit on {this_run_dir}")
            os.chdir(saved_dir)
diff --git a/util/tracer_nvbit/tracer_tool/Makefile b/util/tracer_nvbit/tracer_tool/Makefile
index 950639750..f4bff9fe5 100644
--- a/util/tracer_nvbit/tracer_tool/Makefile
+++ b/util/tracer_nvbit/tracer_tool/Makefile
@@ -49,7 +49,7 @@ $(NVBIT_TOOL): $(OBJECTS) $(NVBIT_PATH)/libnvbit.a
 	$(NVCC) -arch=$(ARCH) -O3 $(OBJECTS) $(LIBS) $(NVCC_PATH) -lcuda -lcudart_static -shared -o $@
 
 %.o: %.cu common.h
-	$(NVCC) -dc -c -std=c++11 $(INCLUDES) -Xptxas -cloning=no -Xcompiler -Wall -arch=$(ARCH) -O3 -Xcompiler -fPIC $< -o $@
+	$(NVCC) -dc -c -std=c++17 $(INCLUDES) -Xptxas -cloning=no -Xcompiler -Wall -arch=$(ARCH) -O3 -Xcompiler -fPIC $< -o $@
 
 inject_funcs.o: inject_funcs.cu common.h
 	$(NVCC) $(INCLUDES) $(MAXRREGCOUNT_FLAG) -Xptxas -astoolspatch --keep-device-functions -arch=$(ARCH) -Xcompiler -Wall -Xcompiler -fPIC -c $< -o $@
diff --git a/util/tracer_nvbit/tracer_tool/common.h b/util/tracer_nvbit/tracer_tool/common.h
index 7eff0feb7..9da0a7cbd 100644
--- a/util/tracer_nvbit/tracer_tool/common.h
+++ b/util/tracer_nvbit/tracer_tool/common.h
@@ -21,6 +21,7 @@ typedef struct {
   int opcode_id;
   uint64_t addrs[32];
   uint32_t line_num;
+  uint32_t instr_idx;
   uint32_t vpc;
   bool is_mem;
   int32_t GPRDst;
diff --git a/util/tracer_nvbit/tracer_tool/inject_funcs.cu b/util/tracer_nvbit/tracer_tool/inject_funcs.cu
index dfc3f8769..015d108f0 100644
--- a/util/tracer_nvbit/tracer_tool/inject_funcs.cu
+++ b/util/tracer_nvbit/tracer_tool/inject_funcs.cu
@@ -25,7 +25,7 @@
 instrument_inst(int pred, int opcode_id, int32_t vpc, bool is_mem,
                 int32_t srcReg5, int32_t srcNum, uint64_t immediate,
                 uint64_t pchannel_dev, uint64_t ptotal_dynamic_instr_counter,
                 uint64_t preported_dynamic_instr_counter, uint64_t pstop_report,
-                uint32_t line_num) {
+                uint32_t line_num, uint32_t instr_idx) {
   const int active_mask = __ballot_sync(__activemask(), 1);
   const int predicate_mask = __ballot_sync(__activemask(), pred);
   const int laneid = get_laneid();
@@ -55,6 +55,7 @@
   int uniqe_threadId = threadIdx.z * blockDim.y * blockDim.x +
                        threadIdx.y * blockDim.x + threadIdx.x;
   ma.line_num = line_num;
+  ma.instr_idx = instr_idx;
   ma.warpid_tb = uniqe_threadId / 32;
 
   ma.cta_id_x = cta.x;
diff --git a/util/tracer_nvbit/tracer_tool/tracer_tool.cu b/util/tracer_nvbit/tracer_tool/tracer_tool.cu
index f4649c5ee..b36c33b18 100644
--- a/util/tracer_nvbit/tracer_tool/tracer_tool.cu
+++ b/util/tracer_nvbit/tracer_tool/tracer_tool.cu
@@ -10,6 +10,7 @@
 #include
 #include
+#include
 #include
 #include
 #include
@@ -80,6 +81,7 @@
 std::unordered_map ctx_kernelslist;
 std::unordered_map ctx_stats_location;
 std::unordered_map ctx_kernelid;
 std::unordered_map ctx_resultsFile;
+std::unordered_map<CUcontext, std::string> ctx_current_kernel_name;
 
 std::string kernel_ranges = "";
 
@@ -200,6 +202,14 @@
 static bool first_call = true;
 
 unsigned old_total_insts = 0;
 unsigned old_total_reported_insts = 0;
 
+/* Spinlock fast forward control */
+int enable_spinlock_fast_forward = 0;
+int spinlock_iter_to_keep = 0;
+// Map from kernel name to spinlock instruction indices
+std::map<std::string, std::vector<uint32_t> *> spinlock_instr_map;
+std::pair<std::string, std::vector<uint32_t>>
+parse_spinlock_instructions(const std::string &line);
+
 void nvbit_at_init() {
   setenv("CUDA_MANAGED_FORCE_DEVICE_ALLOC", "1", 1);
   GET_VAR_INT(
@@ -241,6 +251,10 @@ void nvbit_at_init() {
   GET_VAR_INT(xz_compress_trace, "TRACE_FILE_COMPRESS", 1,
               "Create xz-compressed trace"
               "file");
+  GET_VAR_INT(enable_spinlock_fast_forward, "ENABLE_SPINLOCK_FAST_FORWARD", 0,
+              "Enable spinlock fast forwarding");
+  GET_VAR_INT(spinlock_iter_to_keep, "SPINLOCK_ITER_TO_KEEP", 1,
+              "Number of iterations to keep for spinlock fast forwarding");
   std::string pad(100, '-');
   printf("%s\n", pad.c_str());
 
@@ -249,6 +263,20 @@ void nvbit_at_init() {
   if (usr_defined_folder != NULL)
     user_folder = usr_defined_folder;
 
   parse_kernel_ranges_from_env();
+
+  // Read in the spinlock_instructions.txt and build a map from kernel name to
+  // spinlock instruction indices
+  if (enable_spinlock_fast_forward) {
+    std::string spinlock_instr_file =
+        user_folder + "/spinlock_detection/spinlock_instructions.txt";
+    std::ifstream instr_fs(spinlock_instr_file);
+    std::string line;
+    while (std::getline(instr_fs, line)) {
+      auto [kernel_name, indices] = parse_spinlock_instructions(line);
+      spinlock_instr_map[kernel_name] = new std::vector<uint32_t>(indices);
+    }
+    instr_fs.close();
+  }
 }
 
 /* Set used to avoid re-instrumenting the same functions multiple times */
@@ -400,6 +428,8 @@ void instrument_function_if_needed(CUcontext ctx, CUfunction func) {
           nvbit_add_call_arg_const_val64(instr, (uint64_t)&stop_report);
           /* Add Source code line number for current instr */
           nvbit_add_call_arg_const_val32(instr, (int)line_num);
+          /* Add instruction index for current instr (spinlock detection) */
+          nvbit_add_call_arg_const_val32(instr, (uint32_t)instr->getIdx());
 
           mem_oper_idx--;
         } while (mem_oper_idx >= 0);
@@ -565,6 +595,8 @@ static void enter_kernel_launch(CUcontext ctx, CUfunction func,
     fclose(statsFile);
 
     ctx_kernelid[ctx]++;
+    ctx_current_kernel_name[ctx] =
+        std::string(nvbit_get_func_name(ctx, func, true));
 
     recv_thread_receiving = true;
   }
@@ -888,9 +920,52 @@ void base_delta_compress(const uint64_t *addrs, const std::bitset<32> &mask,
     }
 }
 
+void trim_string(std::string &str) {
+  // Remove the leading and trailing spaces
+  str.erase(0, str.find_first_not_of(' '));
+  str.erase(str.find_last_not_of(' ') + 1);
+}
+
+typedef std::map<uint32_t, uint32_t> counter_t;
+typedef std::tuple<int, int, int, int> warp_key_t;
+
+counter_t create_counter(const std::vector<uint32_t> &indices) {
+  counter_t counter;
+  for (auto instr_idx : indices) {
+    counter[instr_idx] = 0;
+  }
+  return counter;
+}
+
+std::pair<std::string, std::vector<uint32_t>>
+parse_spinlock_instructions(const std::string &line) {
+  std::vector<uint32_t> indices;
+  // Each line is of the form: <kernel id>, <kernel name>: <indices>
+  // Though the kernel id is not used
+  // -2 for the comma and the space
+  size_t name_length = line.find(':') - line.find(',') - 2;
+  std::string kernel_name = line.substr(line.find(',') + 2, name_length);
+  std::string indices_str = line.substr(line.find(':') + 1);
+  trim_string(indices_str);
+  std::stringstream ss(indices_str);
+  std::string instr_idx;
+  while (std::getline(ss, instr_idx, ' ')) {
+    indices.push_back(std::stoi(instr_idx));
+  }
+  return {kernel_name, indices};
+}
+
 void *recv_thread_fun(void *args) {
   CUcontext ctx = (CUcontext)args;
   char *recv_buffer = (char *)malloc(CHANNEL_SIZE);
+
+  // This counter map will keep track of the spinlock instruction
+  // count in the current detected spinlock loop for each warp.
+  // Detection starts when a spinlock instruction is encountered (start to
+  // increment the counter) and ends when a non-spinlock instruction is
+  // encountered (clear the counter)
+  std::map<warp_key_t, counter_t> warp_counter_map;
+
   while (recv_thread_started) {
     uint32_t num_recv_bytes = 0;
     if (recv_thread_receiving &&
@@ -903,9 +978,52 @@
          */
         if (ma->cta_id_x == -1) {
           recv_thread_receiving = false;
+          if (enable_spinlock_fast_forward) {
+            // Clear the counter map for all warps as we are starting a new
+            // kernel
+            warp_counter_map.clear();
+          }
           break;
         }
 
+        /* Spinlock fast forwarding */
+        if (enable_spinlock_fast_forward) {
+          // Check if this warp is in the warp_counter_map
+          warp_key_t warp_key = std::make_tuple(ma->cta_id_x, ma->cta_id_y,
+                                                ma->cta_id_z, ma->warpid_tb);
+          if (warp_counter_map.find(warp_key) == warp_counter_map.end()) {
+            // This warp is not in the warp_counter_map, so we create a counter
+            // for this warp using the spinlock instruction indices for the
+            // current kernel
+            std::vector<uint32_t> &indices =
+                *(spinlock_instr_map[ctx_current_kernel_name[ctx]]);
+            warp_counter_map[warp_key] = create_counter(indices);
+          }
+
+          // Get the counter map for this warp
+          auto &counter = warp_counter_map[warp_key];
+
+          // Now check if we should start spinlock fast forwarding for this warp
+          if (counter.find(ma->instr_idx) != counter.end()) {
+            // We are still in a spinlock loop, so we increment the counter
+            counter[ma->instr_idx]++;
+            if (counter[ma->instr_idx] > spinlock_iter_to_keep) {
+              // This spinlock instruction has executed more than the threshold,
+              // so we fast-forward (skip) it in the output trace.
+              // Note we are only fast-forwarding the innermost spinlock loop
+              num_processed_bytes += sizeof(inst_trace_t);
+              continue;
+            }
+          } else {
+            // We are exiting the innermost spinlock loop, so we reset the
+            // counter map for this warp
+            for (auto &[instr_idx, count] : counter) {
+              count = 0;
+            }
+          }
+        }
+
+        /* Dump the instruction trace information */
         fprintf(ctx_resultsFile[ctx], "%d ", ma->cta_id_x);
         fprintf(ctx_resultsFile[ctx], "%d ", ma->cta_id_y);
         fprintf(ctx_resultsFile[ctx], "%d ", ma->cta_id_z);
@@ -1000,10 +1118,12 @@
       }
     }
   }
   free(recv_buffer);
+
+  return NULL;
 }
 
 void nvbit_tool_init(CUcontext ctx) {
+  ctx_current_kernel_name[ctx] = "";
   recv_thread_started = true;
   channel_host.init(0, CHANNEL_SIZE, &channel_dev, NULL);
   pthread_create(&recv_thread, NULL, recv_thread_fun, ctx);