diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 6c3e63665..a932c103d 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -291,7 +291,7 @@ jobs:
          source ./env-setup/12.8_env_setup.sh
          source ./gpu-app-collection/src/setup_environment
          rm -rf ./hw_run/
-          ./util/tracer_nvbit/run_hw_trace.py -B rodinia_2.0-ft -D 7
+          ./util/tracer_nvbit/run_hw_trace.py -B rodinia_2.0-ft -D 7 --spinlock_handling none
      - name: generate-rodinia_2.0-ft-hw_stats
        run: |
          source ./env-setup/12.8_env_setup.sh
diff --git a/.github/workflows/weekly.yml b/.github/workflows/weekly.yml
index 91cc95911..573495be3 100644
--- a/.github/workflows/weekly.yml
+++ b/.github/workflows/weekly.yml
@@ -34,7 +34,7 @@ jobs:
          source ./env-setup/12.8_env_setup.sh
          export PATH=/home/tgrogers-raid/a/common/python2:$PATH
          rm -rf ./gpu-app-collection/
-          git clone git@github.com:accel-sim/gpu-app-collection.git
+          git clone --recursive git@github.com:accel-sim/gpu-app-collection.git
          source ./gpu-app-collection/src/setup_environment
          ln -s /home/tgrogers-raid/a/common/data_dirs ./gpu-app-collection/
          make -j8 -C ./gpu-app-collection/src rodinia-3.1
@@ -53,6 +53,24 @@ jobs:
          ln -s /scratch/tgrogers-disk01/a/common/for-sharing/$USER/nightly-traces ./hw_run
          ./util/tracer_nvbit/run_hw_trace.py -B rodinia-3.1,GPU_Microbenchmark -D 7
          # ./util/tracer_nvbit/run_hw_trace.py -B rodinia-3.1,GPU_Microbenchmark,parboil,polybench,cutlass_5_trace,Deepbench_nvidia_tencore,Deepbench_nvidia_normal -D 7
+      - name: generate-spinlock-traces-spinlock_handling
+        run: |
+          source ./env-setup/12.8_env_setup.sh
+          source ./gpu-app-collection/src/setup_environment
+          rm -rf ./hw_run/
+          ./util/tracer_nvbit/run_hw_trace.py -B Spinlock -D 7 --spinlock_handling fast_forward
+          mv ./hw_run ./hw_run_fast_forward
+          ./util/tracer_nvbit/run_hw_trace.py -B Spinlock -D 7 --spinlock_handling none
+          mv ./hw_run ./hw_run_none
+      - name: test-new-traces-spinlock_handling
+        # Test only the fast-forwarded traces; the traces generated with --spinlock_handling none take too long to run (~2-3 hr)
+        run: |
+          source ./env-setup/12.8_env_setup.sh
+          source ./gpu-simulator/setup_environment.sh
+          ./util/job_launching/run_simulations.py -B Spinlock -C QV100-SASS -T ./hw_run_fast_forward/traces/device-7/ -N spinlock-microbenchmark-$$-fast_forward
+          ./util/job_launching/monitor_func_test.py -I -v -s spinlock-stats-per-app.csv -N spinlock-microbenchmark-$$-fast_forward
+          # ./util/job_launching/run_simulations.py -B Spinlock -C QV100-SASS -T ./hw_run_none/traces/device-7/ -N spinlock-microbenchmark-$$-none
+          # ./util/job_launching/monitor_func_test.py -I -v -s spinlock-stats-per-app.csv -N spinlock-microbenchmark-$$-none
  SASS-Weekly:
    needs: [Tracer-Weekly]
    if: github.repository == 'accel-sim/accel-sim-framework'
diff --git a/.gitignore b/.gitignore
index 7a3451053..877bb3f22 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,4 +13,6 @@ gpu-simulator/gpgpu-sim
 extern
 gpu-simulator/accel_sim.pyi
 compile_commands.json
-.cache
\ No newline at end of file
+.cache
+.cursorrules
+CLAUDE.md
\ No newline at end of file
diff --git a/README.md b/README.md
index f396ff79c..d339b8b56 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,7 @@
   - [Accel-Sim Components](#accel-sim-components)
     - [Accel-Sim Tracer](#accel-sim-tracer)
       - [A simple example](#a-simple-example)
+      - [Spinlock handling](#spinlock-handling)
       - [Pre-traced applications](#pre-traced-applications)
     - [Accel-Sim SASS Frontend and Simulation Engine](#accel-sim-sass-frontend-and-simulation-engine)
     - [Accel-Sim Correlator](#accel-sim-correlator)
@@ -113,6 +114,18 @@
 That's it. The traces for the short-running rodinia tests will be generated in:
 To extend the tracer, use other apps and understand what, exactly, is going on, read [this](https://github.com/accel-sim/accel-sim-framework/blob/dev/util/tracer_nvbit/README.md).
+
+#### Spinlock handling
+
+If your application contains spinlock instructions, you can have the tracer handle them by using the following command:
+
+```bash
+./util/tracer_nvbit/run_hw_trace.py -B rodinia_2.0-ft -D 7 --spinlock_handling fast_forward
+```
+
+This will fast-forward the spinlock instructions, keeping them in the trace only for the number of iterations specified by the `--spinlock_fast_forward_iterations` option.
+
+The tool for spinlock detection is in `./util/tracer_nvbit/others/spinlock_tool/`.
+
 #### Pre-traced applications
 For convenience, we have included a repository of pre-traced applications - to get all those traces, simply run:
 ```bash
diff --git a/util/job_launching/apps/define-all-apps.yml b/util/job_launching/apps/define-all-apps.yml
index d6f8a405b..3d0a6e416 100644
--- a/util/job_launching/apps/define-all-apps.yml
+++ b/util/job_launching/apps/define-all-apps.yml
@@ -126,6 +126,14 @@ GPU_Atomic:
      - args: 16
        accel-sim-mem: 1G
 
+Spinlock:
+    exec_dir: "$GPUAPPS_ROOT/bin/$CUDA_VERSION/release/"
+    data_dirs: "$GPUAPPS_ROOT/data_dirs/"
+    execs:
+        - spinlock_simple:
+            - args:
+              accel-sim-mem: 1G
+
 Atomic_Profile:
    exec_dir: "$GPUAPPS_ROOT/bin/$CUDA_VERSION/release/"
    data_dirs: "$GPUAPPS_ROOT/data_dirs/"
diff --git a/util/tracer_nvbit/.gitignore b/util/tracer_nvbit/.gitignore
index c1adccf67..e26001f3f 100644
--- a/util/tracer_nvbit/.gitignore
+++ b/util/tracer_nvbit/.gitignore
@@ -1,7 +1,4 @@
 nvbit_release/
-silicon_checkpoint_tool/checkpoint/checkpoint.o
-silicon_checkpoint_tool/checkpoint/checkpoint.so
-tracer_tool/tracer_tool.o
-tracer_tool/tracer_tool.so
-tracer_tool/inject_funcs.o
+*.o
+*.so
 tracer_tool/traces-processing/post-traces-processing
diff --git a/util/tracer_nvbit/Makefile b/util/tracer_nvbit/Makefile
index a3433c74f..277e21cf6 100644
--- a/util/tracer_nvbit/Makefile
+++ b/util/tracer_nvbit/Makefile
@@ -1,9 +1,10 @@
 all:
-	make -C tracer_tool
-	make -C tracer_tool/traces-processing
-	#make -C silicon_checkpoint_tool
+	$(MAKE) -C tracer_tool
+	$(MAKE) -C tracer_tool/traces-processing
+	$(MAKE) -C others/spinlock_tool
+#$(MAKE) -C silicon_checkpoint_tool
 
 clean:
-	make clean -C tracer_tool
-	make clean -C tracer_tool/traces-processing
+	$(MAKE) clean -C tracer_tool
+	$(MAKE) clean -C tracer_tool/traces-processing
diff --git a/util/tracer_nvbit/others/spinlock_tool/Makefile b/util/tracer_nvbit/others/spinlock_tool/Makefile
new file mode 100644
index 000000000..cdccff25f
--- /dev/null
+++ b/util/tracer_nvbit/others/spinlock_tool/Makefile
@@ -0,0 +1,79 @@
+# SPDX-FileCopyrightText: Copyright (c) 2017 NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+#    list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+#    this list of conditions and the following disclaimer in the documentation
+#    and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+#    contributors may be used to endorse or promote products derived from
+#    this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+NVCC=nvcc -ccbin=$(CXX) -D_FORCE_INLINES
+PTXAS=ptxas
+
+NVCC_VER_REQ=10.1
+NVCC_VER=$(shell $(NVCC) --version | grep release | cut -f2 -d, | cut -f3 -d' ')
+NVCC_VER_CHECK=$(shell echo "${NVCC_VER} >= $(NVCC_VER_REQ)" | bc)
+
+ifeq ($(NVCC_VER_CHECK),0)
+$(error ERROR: nvcc version >= $(NVCC_VER_REQ) required to compile an nvbit tool! Instrumented applications can still use lower versions of nvcc.)
+endif
+
+PTXAS_VER_ADD_FLAG=12.3
+PTXAS_VER=$(shell $(PTXAS) --version | grep release | cut -f2 -d, | cut -f3 -d' ')
+PTXAS_VER_CHECK=$(shell echo "${PTXAS_VER} >= $(PTXAS_VER_ADD_FLAG)" | bc)
+
+ifeq ($(PTXAS_VER_CHECK), 0)
+MAXRREGCOUNT_FLAG=-maxrregcount=24
+else
+MAXRREGCOUNT_FLAG=
+endif
+
+NVBIT_PATH=../../nvbit_release/core
+INCLUDES=-I$(NVBIT_PATH)
+
+LIBS=-L$(NVBIT_PATH) -lnvbit
+NVCC_PATH=-L $(subst bin/nvcc,lib64,$(shell which nvcc | tr -s /))
+
+SOURCES=$(wildcard *.cu)
+
+OBJECTS=$(SOURCES:.cu=.o)
+ARCH?=all
+
+mkfile_path := $(abspath $(lastword $(MAKEFILE_LIST)))
+current_dir := $(notdir $(patsubst %/,%,$(dir $(mkfile_path))))
+
+NVBIT_TOOL=$(current_dir).so
+
+all: $(NVBIT_TOOL)
+
+$(NVBIT_TOOL): $(OBJECTS) $(NVBIT_PATH)/libnvbit.a
+	$(NVCC) -arch=$(ARCH) -O3 $(OBJECTS) $(LIBS) $(NVCC_PATH) -lcuda -lcudart_static -shared -o $@
+
+%.o: %.cu
+	$(NVCC) -dc -c -std=c++17 $(INCLUDES) -Xptxas -cloning=no -Xcompiler -Wall -arch=$(ARCH) -O3 -Xcompiler -fPIC $< -o $@
+
+inject_funcs.o: inject_funcs.cu
+	$(NVCC) $(INCLUDES) $(MAXRREGCOUNT_FLAG) -Xptxas -astoolspatch --keep-device-functions -arch=$(ARCH) -Xcompiler -Wall -Xcompiler -fPIC -c $< -o $@
+
+clean:
+	rm -f *.so *.o
diff --git a/util/tracer_nvbit/others/spinlock_tool/README.md b/util/tracer_nvbit/others/spinlock_tool/README.md
new file mode 100644
index 000000000..fb4381462
--- /dev/null
+++ b/util/tracer_nvbit/others/spinlock_tool/README.md
@@ -0,0 +1,20 @@
+# Spinlock tool
+
+## Description
+
+This tool is used to detect spinlocks in the kernel code.
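+
+As an illustration, a kernel like the hypothetical one below contains a spin
+loop whose per-instruction execution counts vary from run to run, while the
+rest of the kernel is deterministic; that difference between two profiled runs
+is the signal this tool looks for (`spin_kernel` is an assumed example, not
+part of the tool):
+
+```cuda
+// Hypothetical spinlock kernel, only for illustrating what gets detected.
+__global__ void spin_kernel(int *lock, int *data) {
+    if (threadIdx.x == 0) {
+        // Acquire: how many times this loop executes depends on scheduling,
+        // so its instruction counts differ between otherwise identical runs.
+        while (atomicCAS(lock, 0, 1) != 0)
+            ;
+        *data += 1;           // critical section: runs exactly once per block
+        atomicExch(lock, 0);  // release
+    }
+}
+```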
+
+## Usage
+
+```bash
+# Run the program a first time to get an instruction histogram of its kernels
+SPINLOCK_PHASE=0 CUDA_INJECTION64_PATH=PATH/TO/spinlock_tool.so program
+
+# Run the program a second time to get another instruction histogram of its kernels.
+# At tool termination, this run generates a file named spinlock_detection/spinlock_instructions.txt
+# containing the indices of the spinlock instructions in the program's kernels.
+SPINLOCK_PHASE=1 CUDA_INJECTION64_PATH=PATH/TO/spinlock_tool.so program
+
+# To fast-forward the spinlock instructions with the accel-sim tracer, use the following command
+ENABLE_SPINLOCK_FAST_FORWARD=1 CUDA_INJECTION64_PATH=PATH/TO/tracer_tool.so program
+```
diff --git a/util/tracer_nvbit/others/spinlock_tool/common.h b/util/tracer_nvbit/others/spinlock_tool/common.h
new file mode 100644
index 000000000..74fe3c696
--- /dev/null
+++ b/util/tracer_nvbit/others/spinlock_tool/common.h
@@ -0,0 +1,227 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2019 NVIDIA CORPORATION & AFFILIATES.
+ * All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <string>
+#include <map>
+#include <utility>
+#include <fstream>
+#include <sstream>
+#include <regex>
+#include <iostream>
+
+/* information collected in the instrumentation function and passed
+ * on the channel from the GPU to the CPU */
+typedef struct {
+    uint32_t instr_idx;
+    uint32_t count;
+} instr_count_t;
+
+
+/* Class to hold kernel instruction histogram */
+class KernelInstructionHistogram {
+public:
+    KernelInstructionHistogram()
+        : id(0), name("dummy"), histogram(std::map<uint32_t, uint64_t>()) {
+    }
+
+    KernelInstructionHistogram(uint32_t id, std::string name)
+        : id(id), name(name), histogram(std::map<uint32_t, uint64_t>()) {
+    }
+
+    void add(uint32_t instr_idx, uint64_t count) {
+        if (histogram.find(instr_idx) == histogram.end()) {
+            histogram[instr_idx] = count;
+        } else {
+            histogram[instr_idx] += count;
+        }
+    }
+
+    void merge(const KernelInstructionHistogram& other, bool use_hash = false) {
+        for (const auto& [instr_idx, count] : other.histogram) {
+            if (use_hash) {
+                // Simple modulo hash operation
+                add(instr_idx, count % hash_prime);
+                histogram[instr_idx] %= hash_prime;
+            } else {
+                add(instr_idx, count);
+            }
+        }
+    }
+
+    void reinit(uint32_t id, std::string name) {
+        this->id = id;
+        this->name = name;
+        histogram.clear();
+    }
+
+    std::map<uint32_t, std::pair<uint64_t, uint64_t>> findSpinlock(const KernelInstructionHistogram& other) {
+        // Find instructions that have different execution counts between two runs
+        // These are likely part of spinlock sections
+        std::map<uint32_t, std::pair<uint64_t, uint64_t>> spinlockInstructions;
+
+        // Check all instructions in this histogram
+        for (const auto& [instrIdx, count] : histogram) {
+            auto otherIt = other.histogram.find(instrIdx);
+            if (otherIt != other.histogram.end()) {
+                // Instruction exists in both histograms
+                if (count != otherIt->second) {
+                    // Different execution counts - likely spinlock
+                    spinlockInstructions[instrIdx] = {count, otherIt->second};
+                }
+            } else {
+                // Instruction only exists in this histogram
+                spinlockInstructions[instrIdx] = {count, 0};
+            }
+        }
+
+        // Check instructions that only exist in the other histogram
+        for (const auto& [instrIdx, count] : other.histogram) {
+            if (histogram.find(instrIdx) == histogram.end()) {
+                // Instruction only exists in other histogram
+                spinlockInstructions[instrIdx] = {0, count};  // Mark as 0 in this run
+            }
+        }
+
+        return spinlockInstructions;
+    }
+
+    // Save histogram to file
+    bool saveToFile(const std::string& filename) const {
+        std::ofstream file(filename);
+        if (!file.is_open()) {
+            return false;
+        }
+        file << serialize();
+        file.close();
+        return true;
+    }
+
+    // Load histogram from file
+    bool loadFromFile(const std::string& filename) {
+        std::ifstream file(filename);
+        if (!file.is_open()) {
+            return false;
+        }
+
+        std::stringstream buffer;
+        buffer << file.rdbuf();
+        file.close();
+
+        deserialize(buffer.str());
+        return true;
+    }
+
+    // Get total instruction count
+    uint64_t getTotalInstructionCount() const {
+        uint64_t total = 0;
+        for (const auto& [instrIdx, count] : histogram) {
+            total += count;
+        }
+        return total;
+    }
+
+    // Get number of unique instructions
+    size_t getUniqueInstructionCount() const {
+        return histogram.size();
+    }
+
+    // Check if histogram is empty
+    bool isEmpty() const {
+        return histogram.empty();
+    }
+
+    // Clear histogram
+    void clear() {
+        histogram.clear();
+    }
+
+    std::string serialize() const {
+        std::stringstream ss;
+        ss << "Kernel: " << name << " (ID: " << id << ")" << std::endl;
+        for (const auto &[instr_idx, count] : histogram) {
+            ss << instr_idx << ": " << count << std::endl;
+        }
+        return ss.str();
+    }
+
+    void deserialize(const std::string& data) {
+        // Deserialize the data following the serialize format
+        //   Kernel: <name> (ID: <id>)
+        //   <instr_idx>: <count>
+        //   <instr_idx>: <count>
+        //   ...
+        std::stringstream ss(data);
+        std::string line;
+
+        // Clear existing histogram
+        histogram.clear();
+
+        // Regex patterns for parsing
+        std::regex headerPattern(R"(Kernel:\s*(.+?)\s*\(ID:\s*(\d+)\))");
+        std::regex instructionPattern(R"(\s*(\d+)\s*:\s*(\d+)\s*)");
+
+        // Parse header line: "Kernel: <name> (ID: <id>)"
+        if (std::getline(ss, line)) {
+            std::smatch headerMatch;
+            if (std::regex_match(line, headerMatch, headerPattern)) {
+                if (headerMatch.size() >= 3) {
+                    name = headerMatch[1].str();
+                    id = std::stoul(headerMatch[2].str());
+                }
+            }
+        }
+
+        // Parse instruction count lines: "<instr_idx>: <count>"
+        while (std::getline(ss, line)) {
+            if (line.empty()) continue;
+
+            std::smatch instructionMatch;
+            if (std::regex_match(line, instructionMatch, instructionPattern)) {
+                if (instructionMatch.size() >= 3) {
+                    try {
+                        uint32_t instrIdx = std::stoul(instructionMatch[1].str());
+                        uint32_t count = std::stoul(instructionMatch[2].str());
+                        histogram[instrIdx] = count;
+                    } catch (const std::exception& e) {
+                        // Skip malformed lines
+                        continue;
+                    }
+                }
+            }
+        }
+    }
+
+    uint32_t id;
+    std::string name;
+    std::map<uint32_t, uint64_t> histogram;
+    // A large 30-bit prime number for hashing to avoid overflow
+    static constexpr uint32_t hash_prime = 1073741789;
+};
\ No newline at end of file
diff --git a/util/tracer_nvbit/others/spinlock_tool/inject_funcs.cu b/util/tracer_nvbit/others/spinlock_tool/inject_funcs.cu
new file mode 100644
index 000000000..a18a6f475
--- /dev/null
+++ b/util/tracer_nvbit/others/spinlock_tool/inject_funcs.cu
@@ -0,0 +1,66 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2019 NVIDIA CORPORATION & AFFILIATES.
+ * All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+
+#include "utils/utils.h"
+
+/* for channel */
+#include "utils/channel.hpp"
+
+/* contains definition of the instr_count_t structure */
+#include "common.h"
+
+extern "C" __device__ __noinline__ void count_instr(int predicate, uint32_t instr_idx,
+                                                    uint64_t grid_launch_id,
+                                                    uint64_t pchannel_dev) {
+    const int active_mask = __ballot_sync(__activemask(), 1);
+    const int predicate_mask = __ballot_sync(__activemask(), predicate);
+
+    const int laneid = get_laneid();
+    const int first_laneid = __ffs(active_mask) - 1;
+
+    // Count instruction for all threads
+    const int count = __popc(predicate_mask);
+    instr_count_t packet;
+
+    // Also count the warp-level instruction itself (+1)
+    packet.instr_idx = instr_idx;
+    packet.count = count + 1;
+
+    /* first active lane pushes information on the channel */
+    if (first_laneid == laneid) {
+        ChannelDev* channel_dev = (ChannelDev*)pchannel_dev;
+        channel_dev->push(&packet, sizeof(instr_count_t));
+    }
+}
+
\ No newline at end of file
diff --git a/util/tracer_nvbit/others/spinlock_tool/spinlock_tool.cu b/util/tracer_nvbit/others/spinlock_tool/spinlock_tool.cu
new file mode 100644
index 000000000..01a5625f5
--- /dev/null
+++ b/util/tracer_nvbit/others/spinlock_tool/spinlock_tool.cu
@@ -0,0 +1,781 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2019 NVIDIA CORPORATION & AFFILIATES.
+ * All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/**
+ * A tool to identify spinlock sections in the SASS kernel.
+ * The idea is to run the tool twice and count the number of executions of
+ * each instruction. For the non-deterministic parts (aka the spinlock in our
+ * GEMM app), the execution counts will differ between the two runs, thus we
+ * can identify the spinlock section.
+ *
+ * You will need to pass SPINLOCK_PHASE=0 for the first run and SPINLOCK_PHASE=1 for the second run.
+ * Each run will generate a folder with the name of ctx_<ctx id>/spinlock_run_<phase> and a spinlock_run_<phase>_merged folder.
+ * Each folder will contain a file with the name of kernel-<kernel id>.histogram.
+ * The spinlock_run_<phase>_merged folder will contain a file for each kernel with the name of kernel-<kernel id>.histogram.
+ * When SPINLOCK_PHASE=1, the tool will also check for spinlock instructions during context termination,
+ * and generate a file with the name of spinlock_instructions.txt, with each
+ * line containing the kernel id, kernel name, and the indices of spinlock instructions.
+ * The indices are the instruction indices in the kernel function.
+ */
+
+#include <assert.h>
+#include <pthread.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string>
+#include <map>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+#include <filesystem>
+
+/* every tool needs to include this once */
+#include "nvbit_tool.h"
+
+/* nvbit interface file */
+#include "nvbit.h"
+
+/* for channel */
+#include "utils/channel.hpp"
+
+/* contains definition of the instr_count_t structure */
+#include "common.h"
+
+#define HEX(x)                                                            \
+    "0x" << std::setfill('0') << std::setw(16) << std::hex << (uint64_t)x \
+         << std::dec
+
+#define CHANNEL_SIZE (1l << 20)
+
+enum class RecvThreadState {
+    WORKING,
+    STOP,
+    FINISHED,
+};
+
+struct CTXstate {
+    /* context id */
+    int id;
+
+    /* Channel used to communicate from GPU to CPU receiving thread */
+    ChannelDev* channel_dev;
+    ChannelHost channel_host;
+
+    // After initialization, set it to WORKING to make recv thread get data,
+    // parent thread sets it to STOP to make recv thread stop working.
+    // recv thread sets it to FINISHED when it cleans up.
+    // parent thread should wait until the state becomes FINISHED to clean up.
+    volatile RecvThreadState recv_thread_done = RecvThreadState::STOP;
+
+    // Kernel receiving state
+    volatile bool kernel_receiving_done = false;
+
+    // Kernel instruction histogram
+    KernelInstructionHistogram* instr_histogram = nullptr;
+};
+
+/* lock */
+pthread_mutex_t mutex;
+pthread_mutex_t cuda_event_mutex;
+
+/* map to store context state */
+std::unordered_map<CUcontext, CTXstate*> ctx_state_map;
+
+/* skip flag used to avoid re-entry on the nvbit_callback when issuing
+ * flush_channel kernel call */
+bool skip_callback_flag = false;
+
+/* global control variables for this tool */
+uint32_t instr_begin_interval = 0;
+uint32_t instr_end_interval = UINT32_MAX;
+int verbose = 0;
+
+#define DPRINTF(fmt, ...) {if (verbose) printf(fmt, ##__VA_ARGS__);}
+
+/* opcode to id map and reverse map */
+std::map<std::string, int> opcode_to_id_map;
+std::map<int, std::string> id_to_opcode_map;
+
+/* grid launch id, incremented at every launch */
+uint64_t global_grid_launch_id = 0;
+
+/* Spinlock phase */
+int spinlock_phase = 0;
+// At the end of phase SPINLOCK_PHASE_CHECK, we will compare the merged
+// histograms from the different runs to output a file containing the
+// instructions of each kernel that are nondeterministic.
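+// A hypothetical example of the resulting spinlock_instructions.txt (the
+// format follows spinlock_check() below: "<kernel id>, <kernel name>: "
+// followed by space-separated instruction indices; names and indices here
+// are made up for illustration):
+//   0, _Z11spin_kernelPiPi: 12 13 14
+//   1, _Z12other_kernelPf: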
+const int SPINLOCK_PHASE_CHECK = 1;
+std::string spinlock_run_dir = "./";
+int spinlock_keep_intermediate_files = 0;
+void spinlock_check();
+
+void* recv_thread_fun(void* args);
+
+void nvbit_at_init() {
+    setenv("CUDA_MANAGED_FORCE_DEVICE_ALLOC", "1", 1);
+    GET_VAR_INT(
+        instr_begin_interval, "INSTR_BEGIN", 0,
+        "Beginning of the instruction interval where to apply instrumentation");
+    GET_VAR_INT(
+        instr_end_interval, "INSTR_END", UINT32_MAX,
+        "End of the instruction interval where to apply instrumentation");
+    GET_VAR_INT(verbose, "TOOL_VERBOSE", 0, "Enable verbosity inside the tool");
+    GET_VAR_INT(spinlock_phase, "SPINLOCK_PHASE", 0, "Spinlock phase");
+    GET_VAR_STR(spinlock_run_dir, "TRACES_FOLDER", "Spinlock detection base directory, use the same as the traces folder");
+    GET_VAR_INT(spinlock_keep_intermediate_files, "SPINLOCK_KEEP_INTERMEDIATE_FILES", 0, "Keep intermediate files");
+    std::string pad(100, '-');
+    printf("%s\n", pad.c_str());
+
+    /* set mutex as recursive */
+    pthread_mutexattr_t attr;
+    pthread_mutexattr_init(&attr);
+    pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
+    pthread_mutex_init(&mutex, &attr);
+
+    pthread_mutex_init(&cuda_event_mutex, &attr);
+
+    // Add trailing slash if not empty
+    if (!spinlock_run_dir.empty()) {
+        spinlock_run_dir += "/";
+    }
+}
+
+/**
+ * This function is called when the program terminates.
+ * We will use this to merge all contexts' kernel histograms by kernel name
+ * so that we can identify all the spinlock/non-deterministic sections of
+ * launched kernels instead of by launched kernel instances, whose context order
+ * is not guaranteed.
+ */
+void nvbit_at_term() {
+    // Read the spinlock_run_<phase> dir under each ctx_<id> dir; for each
+    // unique kernel name, we will have a vector of kernel histograms
+    using HistogramMapByName = std::map<std::string, std::vector<KernelInstructionHistogram*>>;
+    HistogramMapByName map;
+
+    // Build the histogram map by reading the spinlock_run_<phase> dir under ctx_<id>
+    // iterate the ctx_<id> dirs under the spinlock_detection folder
+    for (auto& folder : std::filesystem::directory_iterator(spinlock_run_dir + "spinlock_detection")) {
+        // If the folder is not a ctx_<id> dir, skip
+        if (folder.path().filename().string().find("ctx_") == std::string::npos) {
+            continue;
+        }
+
+        // Now we iterate the spinlock_run_<phase> dir under the ctx_<id> folder
+        std::string context_run_dir = folder.path().string() + "/spinlock_run_" + std::to_string(spinlock_phase);
+
+        // Build this histogram vector for this context
+        for (auto& file : std::filesystem::directory_iterator(context_run_dir)) {
+            if (file.path().extension() == ".histogram") {
+                KernelInstructionHistogram* histogram = new KernelInstructionHistogram();
+                histogram->loadFromFile(file.path().string());
+                map[histogram->name].push_back(histogram);
+            }
+        }
+    }
+
+    // Now, we merge all the histograms for each kernel name
+    std::vector<KernelInstructionHistogram*> merged_histograms;
+    size_t id = 0;
+    for (auto& [kernel_name, histograms] : map) {
+        KernelInstructionHistogram* merged_histogram = new KernelInstructionHistogram();
+        // Set the name to the kernel name
+        merged_histogram->name = kernel_name;
+        merged_histogram->id = id;
+        id++;
+        for (auto& histogram : histograms) {
+            // Use hash to merge the histograms to avoid overflow
+            merged_histogram->merge(*histogram, true);
+        }
+        merged_histograms.push_back(merged_histogram);
+    }
+
+    // For each merged histogram, save under the spinlock_run_<phase>_merged dir
+    std::string merged_run_dir = spinlock_run_dir + "spinlock_detection/spinlock_run_" + std::to_string(spinlock_phase) + "_merged";
+    std::error_code error_code;
+    bool success = std::filesystem::create_directories(merged_run_dir, error_code);
+    if (error_code) {
+        printf("Spinlock: Failed to create folder %s: %s\n", merged_run_dir.c_str(), error_code.message().c_str());
+        assert(false);
+    }
+
+    for (auto& histogram : merged_histograms) {
+        histogram->saveToFile(merged_run_dir + "/kernel-" + std::to_string(histogram->id) + ".histogram");
+    }
+
+    // Clean up
+    for (auto& histogram : merged_histograms) {
+        delete histogram;
+    }
+    for (auto& [name, histograms] : map) {
+        for (auto& histogram : histograms) {
+            delete histogram;
+        }
+    }
+
+    // Check for spinlock
+    if (spinlock_phase == SPINLOCK_PHASE_CHECK) {
+        spinlock_check();
+    }
+}
+
+/* Set used to avoid re-instrumenting the same functions multiple times */
+std::unordered_set<CUfunction> already_instrumented;
+
+void instrument_function_if_needed(CUcontext ctx, CUfunction func) {
+    assert(ctx_state_map.find(ctx) != ctx_state_map.end());
+    CTXstate* ctx_state = ctx_state_map[ctx];
+
+    /* Get related functions of the kernel (device function that can be
+     * called by the kernel) */
+    std::vector<CUfunction> related_functions =
+        nvbit_get_related_functions(ctx, func);
+
+    /* add kernel itself to the related function vector */
+    related_functions.push_back(func);
+
+    /* iterate on function */
+    for (auto f : related_functions) {
+        /* "recording" function was instrumented, if set insertion failed
+         * we have already encountered this function */
+        if (!already_instrumented.insert(f).second) {
+            continue;
+        }
+
+        /* get vector of instructions of function "f" */
+        const std::vector<Instr*>& instrs = nvbit_get_instrs(ctx, f);
+
+        DPRINTF(
+            "Spinlock: CTX %p, Inspecting CUfunction %p name %s at address "
+            "0x%lx\n",
+            ctx, f, nvbit_get_func_name(ctx, f), nvbit_get_func_addr(ctx, f));
+
+        uint32_t cnt = 0;
+        /* iterate on all the static instructions in the function */
+        for (auto instr : instrs) {
+            if (cnt < instr_begin_interval || cnt >= instr_end_interval) {
+                cnt++;
+                continue;
+            }
+            if (verbose) {
+                instr->printDecoded();
+            }
+
+            if (opcode_to_id_map.find(instr->getOpcode()) ==
+                opcode_to_id_map.end()) {
+                int opcode_id = opcode_to_id_map.size();
+                opcode_to_id_map[instr->getOpcode()] = opcode_id;
+                id_to_opcode_map[opcode_id] = std::string(instr->getOpcode());
+            }
+
+            int opcode_id = opcode_to_id_map[instr->getOpcode()];
+
+            // insert instrumentation function
+            nvbit_insert_call(instr, "count_instr", IPOINT_BEFORE);
+
+            // pass arguments
+            nvbit_add_call_arg_guard_pred_val(instr);
+            nvbit_add_call_arg_const_val32(instr, (uint32_t)instr->getIdx());
+            nvbit_add_call_arg_launch_val64(instr, 0);
+            nvbit_add_call_arg_const_val64(instr, (uint64_t)ctx_state->channel_dev);
+
+            cnt++;
+        }
+    }
+}
+
+/* flush channel */
+__global__ void flush_channel(ChannelDev* ch_dev) {
+    instr_count_t packet;
+    // Sentinel value to indicate the end of the histogram
+    packet.instr_idx = UINT32_MAX;
+    packet.count = 0;
+    ch_dev->push(&packet, sizeof(instr_count_t));
+    ch_dev->flush();
+}
+
+void init_context_state(CUcontext ctx) {
+    CTXstate* ctx_state = ctx_state_map[ctx];
+    ctx_state->id = (int)ctx_state_map.size() - 1;
+    ctx_state->recv_thread_done = RecvThreadState::WORKING;
+    cudaMallocManaged(&ctx_state->channel_dev, sizeof(ChannelDev));
+    ctx_state->channel_host.init(ctx_state->id, CHANNEL_SIZE,
+                                 ctx_state->channel_dev, recv_thread_fun, ctx);
+    nvbit_set_tool_pthread(ctx_state->channel_host.get_thread());
+}
+
+static void enter_kernel_launch(CUcontext ctx, CUfunction func,
+                                uint64_t &grid_launch_id, nvbit_api_cuda_t cbid, void* params,
+                                bool stream_capture = false, bool build_graph = false) {
+    CTXstate* ctx_state = ctx_state_map[ctx];
+
+    // no need to sync during stream capture or manual graph build, since no
+    // kernel is actually launched.
+    if (!stream_capture && !build_graph) {
+        /* Make sure GPU is idle */
+        cudaDeviceSynchronize();
+        assert(cudaGetLastError() == cudaSuccess);
+    }
+
+    // Initialize kernel instruction histogram map
+    if (ctx_state->instr_histogram == nullptr) {
+        ctx_state->instr_histogram = new KernelInstructionHistogram(grid_launch_id, nvbit_get_func_name(ctx, func, true));
+    } else {
+        ctx_state->instr_histogram->reinit(grid_launch_id, nvbit_get_func_name(ctx, func, true));
+    }
+
+    /* instrument */
+    instrument_function_if_needed(ctx, func);
+
+    int nregs = 0;
+    CUDA_SAFECALL(
+        cuFuncGetAttribute(&nregs, CU_FUNC_ATTRIBUTE_NUM_REGS, func));
+
+    int shmem_static_nbytes = 0;
+    CUDA_SAFECALL(
+        cuFuncGetAttribute(&shmem_static_nbytes,
+                           CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, func));
+
+    /* get function name and pc */
+    const char* func_name = nvbit_get_func_name(ctx, func);
+    uint64_t pc = nvbit_get_func_addr(ctx, func);
+
+    // during stream capture or manual graph build, no kernel is launched, so
+    // do not set launch argument, do not print kernel info, do not increase
+    // grid_launch_id. All these should be done at graph node launch time.
+    if (!stream_capture && !build_graph) {
+        /* set grid launch id at launch time */
+        nvbit_set_at_launch(ctx, func, (uint64_t)grid_launch_id);
+
+        if (cbid == API_CUDA_cuLaunchKernelEx_ptsz ||
+            cbid == API_CUDA_cuLaunchKernelEx) {
+            cuLaunchKernelEx_params* p = (cuLaunchKernelEx_params*)params;
+            printf(
+                "Spinlock: CTX 0x%016lx - LAUNCH - Kernel pc 0x%016lx - "
+                "Kernel name %s - grid launch id %ld - grid size %d,%d,%d "
+                "- block size %d,%d,%d - nregs %d - shmem %d - cuda stream "
+                "id %ld\n",
+                (uint64_t)ctx, pc, func_name, grid_launch_id,
+                p->config->gridDimX, p->config->gridDimY,
+                p->config->gridDimZ, p->config->blockDimX,
+                p->config->blockDimY, p->config->blockDimZ, nregs,
+                shmem_static_nbytes + p->config->sharedMemBytes,
+                (uint64_t)p->config->hStream);
+        } else {
+            cuLaunchKernel_params* p = (cuLaunchKernel_params*)params;
+            printf(
+                "Spinlock: CTX 0x%016lx - LAUNCH - Kernel pc 0x%016lx - "
+                "Kernel name %s - grid launch id %ld - grid size %d,%d,%d "
+                "- block size %d,%d,%d - nregs %d - shmem %d - cuda stream "
+                "id %ld\n",
+                (uint64_t)ctx, pc, func_name, grid_launch_id, p->gridDimX,
+                p->gridDimY, p->gridDimZ, p->blockDimX, p->blockDimY,
+                p->blockDimZ, nregs,
+                shmem_static_nbytes + p->sharedMemBytes,
+                (uint64_t)p->hStream);
+        }
+
+        // increment grid launch id for next launch
+        // grid id can be changed here, since nvbit_set_at_launch() has copied
+        // its value above.
+        grid_launch_id++;
+    }
+
+    /* enable instrumented code to run */
+    nvbit_enable_instrumented(ctx, func, true);
+
+    // Reset the kernel receiving done flag for new kernel launch
+    ctx_state->kernel_receiving_done = false;
+}
+
+// the function is only called for non cuda graph launch cases.
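+// Flow (see the body below): synchronize so the user kernel finishes, launch
+// flush_channel() to push the UINT32_MAX sentinel packet, wait for the
+// receiver thread to mark this kernel's histogram complete, then dump the
+// histogram to spinlock_detection/ctx_<id>/spinlock_run_<phase>/.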
+static void leave_kernel_launch(CTXstate *ctx_state, uint64_t &grid_launch_id) {
+    // make sure user kernel finishes to avoid deadlock
+    cudaDeviceSynchronize();
+    /* push a flush channel kernel */
+    flush_channel<<<1, 1>>>(ctx_state->channel_dev);
+
+    /* Make sure GPU is idle */
+    cudaDeviceSynchronize();
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        printf("cuda error: %s\n", cudaGetErrorName(err));
+    }
+    assert(err == cudaSuccess);
+
+    /* wait here until the receiving thread has finished with the
+     * current kernel */
+    while (!ctx_state->kernel_receiving_done) {
+        pthread_yield();
+    }
+
+    // Dump the histogram to file
+    // Make a folder for the histogram
+    std::string folder_name = spinlock_run_dir + "spinlock_detection/ctx_" + std::to_string(ctx_state->id) + "/spinlock_run_" + std::to_string(spinlock_phase);
+
+    std::error_code error_code;
+    std::filesystem::create_directories(folder_name, error_code);
+    if (error_code) {
+        printf("Spinlock: Failed to create folder %s: %s\n", folder_name.c_str(), error_code.message().c_str());
+        assert(false);
+    }
+
+    // Save the histogram to file in the form kernel-<kernel id>.histogram
+    bool success = ctx_state->instr_histogram->saveToFile( folder_name + "/" + "kernel-" + std::to_string(ctx_state->instr_histogram->id) + ".histogram");
+    assert(success);
+}
+
+void nvbit_at_cuda_event(CUcontext ctx, int is_exit, nvbit_api_cuda_t cbid,
+                         const char* name, void* params, CUresult* pStatus) {
+    pthread_mutex_lock(&cuda_event_mutex);
+
+    /* we prevent re-entry on this callback when issuing CUDA functions inside
+     * this function */
+    if (skip_callback_flag) {
+        pthread_mutex_unlock(&cuda_event_mutex);
+        return;
+    }
+    skip_callback_flag = true;
+
+    CTXstate* ctx_state = ctx_state_map[ctx];
+
+    switch (cbid) {
+        // Identify all the possible CUDA launch events without stream
+        // parameters, they will not get involved with cuda graph
+        case API_CUDA_cuLaunch:
+        case API_CUDA_cuLaunchGrid:
+        {
+            cuLaunch_params *p = (cuLaunch_params *)params;
+            CUfunction func = p->f;
+            if (!is_exit) {
+                enter_kernel_launch(ctx, func, global_grid_launch_id, cbid,
+                                    params);
+            } else {
+                leave_kernel_launch(ctx_state, global_grid_launch_id);
+            }
+        } break;
+        // To support kernels launched by cuda graph (in addition to the
+        // existing kernel launch method), we need to do:
+        //
+        // 1. instrument kernels at cudaGraphAddKernelNode event. This is for cases
+        // that kernels are manually added to a cuda graph.
+        // 2. distinguish captured kernels when kernels are recorded to a graph
+        // using stream capture. cudaStreamIsCapturing() tells us whether a stream
+        // is capturing.
+        // 3. per-kernel instruction counters, since cuda graph can launch multiple
+        // kernels at the same time.
+        //
+        // Three cases:
+        //
+        // 1. original kernel launch:
+        //    1a. for any kernel launch without using a stream, we instrument it
+        //    before it is launched, call cudaDeviceSynchronize after it is
+        //    launched and read the instruction counter of the kernel.
+        //    1b. for any kernel launch using a stream, but the stream is not
+        //    capturing, we do the same thing as 1a.
+        //
+        // 2. cuda graph using stream capturing: if a kernel is launched in a
+        // stream and the stream is capturing, we instrument the kernel before it
+        // is launched and do nothing after it is launched, because the kernel is
+        // not running until cudaGraphLaunch. Instead, we issue a
+        // cudaStreamSynchronize after cudaGraphLaunch is done and reset the
+        // instruction counters, since a cloned graph might be launched afterwards.
+        //
+        // 3. cuda graph manual: we instrument the kernel added by
+        // cudaGraphAddKernelNode and do the same thing for cudaGraphLaunch as 2.
+        //
+        // The above method should handle most of the cuda graph launch cases.
+        // kernel launches with stream parameter, they can be used for cuda graph
+        case API_CUDA_cuLaunchKernel_ptsz:
+        case API_CUDA_cuLaunchKernel:
+        case API_CUDA_cuLaunchCooperativeKernel:
+        case API_CUDA_cuLaunchCooperativeKernel_ptsz:
+        case API_CUDA_cuLaunchKernelEx:
+        case API_CUDA_cuLaunchKernelEx_ptsz:
+        case API_CUDA_cuLaunchGridAsync:
+        {
+            CUfunction func;
+            CUstream hStream;
+
+            if (cbid == API_CUDA_cuLaunchKernelEx_ptsz ||
+                cbid == API_CUDA_cuLaunchKernelEx) {
+                cuLaunchKernelEx_params* p =
+                    (cuLaunchKernelEx_params*)params;
+                func = p->f;
+                hStream = p->config->hStream;
+            } else if (cbid == API_CUDA_cuLaunchKernel_ptsz ||
+                       cbid == API_CUDA_cuLaunchKernel ||
+                       cbid == API_CUDA_cuLaunchCooperativeKernel_ptsz ||
+                       cbid == API_CUDA_cuLaunchCooperativeKernel) {
+                cuLaunchKernel_params* p = (cuLaunchKernel_params*)params;
+                func = p->f;
+                hStream = p->hStream;
+            } else {
+                cuLaunchGridAsync_params* p =
+                    (cuLaunchGridAsync_params*)params;
+                func = p->f;
+                hStream = p->hStream;
+            }
+
+            cudaStreamCaptureStatus streamStatus;
+            /* check if the stream is capturing, if yes, do not sync */
+            CUDA_SAFECALL(cudaStreamIsCapturing(hStream, &streamStatus));
+            if (!is_exit) {
+                bool stream_capture = (streamStatus == cudaStreamCaptureStatusActive);
+                enter_kernel_launch(ctx, func, global_grid_launch_id, cbid, params, stream_capture);
+            } else {
+                if (streamStatus != cudaStreamCaptureStatusActive) {
+                    if (verbose >= 1) {
+                        printf("kernel %s not captured by cuda graph\n", nvbit_get_func_name(ctx, func));
+                    }
+                    leave_kernel_launch(ctx_state, global_grid_launch_id);
+                } else {
+                    if (verbose >= 1) {
+                        printf("kernel %s captured by cuda graph\n", nvbit_get_func_name(ctx, func));
+                    }
+                }
+            }
+        } break;
+        case API_CUDA_cuGraphAddKernelNode:
+        {
+            cuGraphAddKernelNode_params *p = (cuGraphAddKernelNode_params *)params;
+            CUfunction func = p->nodeParams->func;
+
+            if (!is_exit) {
+                // cuGraphAddKernelNode_params->nodeParams is the same as
+                // cuLaunchKernel_params up to sharedMemBytes
+                enter_kernel_launch(ctx, func, global_grid_launch_id, cbid, (void*)p->nodeParams, false, true);
+            }
+        } break;
+        case API_CUDA_cuGraphLaunch:
+        {
+            // if we are exiting a cuda graph launch:
+            // Wait until the graph is completed using
+            // cudaStreamSynchronize()
+            if (is_exit) {
+                cuGraphLaunch_params *p = (cuGraphLaunch_params *)params;
+
+                CUDA_SAFECALL(cudaStreamSynchronize(p->hStream));
+                assert(cudaGetLastError() == cudaSuccess);
+                /* push a flush channel kernel */
+                flush_channel<<<1, 1, 0, p->hStream>>>(ctx_state->channel_dev);
+                CUDA_SAFECALL(cudaStreamSynchronize(p->hStream));
+                assert(cudaGetLastError() == cudaSuccess);
+            }
+
+        } break;
+        default:
+            break;
+    };
+
+
+    skip_callback_flag = false;
+    pthread_mutex_unlock(&cuda_event_mutex);
+}
+
+void* recv_thread_fun(void* args) {
+    CUcontext ctx = (CUcontext)args;
+
+    pthread_mutex_lock(&mutex);
+    /* get context state from map */
+    assert(ctx_state_map.find(ctx) != ctx_state_map.end());
+    CTXstate* ctx_state = ctx_state_map[ctx];
+
+    ChannelHost* ch_host = &ctx_state->channel_host;
+    pthread_mutex_unlock(&mutex);
+    char* recv_buffer = (char*)malloc(CHANNEL_SIZE);
+
+    while (ctx_state->recv_thread_done == RecvThreadState::WORKING) {
+        /* receive buffer from channel */
+        uint32_t num_recv_bytes = 0;
+        if (!ctx_state->kernel_receiving_done &&
+            (num_recv_bytes = ch_host->recv(recv_buffer, CHANNEL_SIZE)) > 0) {
+            uint32_t num_processed_bytes = 0;
+            while (num_processed_bytes < num_recv_bytes) {
+                instr_count_t* packet =
+                    (instr_count_t*)&recv_buffer[num_processed_bytes];
+
+                // Sentinel value to indicate the end of the histogram
+                if (packet->instr_idx == UINT32_MAX) {
+                    ctx_state->kernel_receiving_done = true;
+                    break;
+                }
+
+                // Increment the instruction count
+                ctx_state->instr_histogram->add(packet->instr_idx, packet->count);
+                DPRINTF("Spinlock: Incrementing instruction count for instruction %d by %d\n", packet->instr_idx, packet->count);
+
+                num_processed_bytes += sizeof(instr_count_t);
+            }
+        }
+    }
+    free(recv_buffer);
+    ctx_state->recv_thread_done = RecvThreadState::FINISHED;
+    return NULL;
+}
+
+void nvbit_at_ctx_init(CUcontext ctx) {
+    pthread_mutex_lock(&mutex);
+    DPRINTF("Spinlock: STARTING CONTEXT %p\n", ctx);
+    assert(ctx_state_map.find(ctx) == ctx_state_map.end());
+    CTXstate* ctx_state = new CTXstate;
+    ctx_state_map[ctx] = ctx_state;
+    pthread_mutex_unlock(&mutex);
+}
+
+void nvbit_tool_init(CUcontext ctx) {
+    pthread_mutex_lock(&mutex);
+    assert(ctx_state_map.find(ctx) != ctx_state_map.end());
+    init_context_state(ctx);
+    pthread_mutex_unlock(&mutex);
+}
+
+void nvbit_at_ctx_term(CUcontext ctx) {
+    pthread_mutex_lock(&mutex);
+    skip_callback_flag = true;
+    DPRINTF("Spinlock: TERMINATING CONTEXT %p\n", ctx);
+    /* get context state from map */
+    assert(ctx_state_map.find(ctx) != ctx_state_map.end());
+    CTXstate* ctx_state = ctx_state_map[ctx];
+
+    /* Notify receiver thread and wait for receiver thread to
+     * notify back */
+    ctx_state->recv_thread_done = RecvThreadState::STOP;
+    while (ctx_state->recv_thread_done != RecvThreadState::FINISHED)
+        ;
+
+    ctx_state->channel_host.destroy(false);
+    cudaFree(ctx_state->channel_dev);
+    skip_callback_flag = false;
+    delete ctx_state;
+    pthread_mutex_unlock(&mutex);
+}
+
+void nvbit_at_graph_node_launch(CUcontext ctx, CUfunction func,
+                                CUstream stream,
+                                uint64_t launch_handle) {
+    func_config_t config = {0};
+    const char* func_name = nvbit_get_func_name(ctx, func);
+    uint64_t pc = nvbit_get_func_addr(ctx, func);
+
+    pthread_mutex_lock(&mutex);
+    nvbit_set_at_launch(ctx, func, (uint64_t)global_grid_launch_id, stream,
+                        launch_handle);
+    nvbit_get_func_config(ctx, func, &config);
+
+    printf(
+        "Spinlock: CTX 0x%016lx - LAUNCH - Kernel pc 0x%016lx - "
+        "Kernel name %s - grid launch id %ld - grid size %d,%d,%d "
+        "- block size %d,%d,%d - nregs %d - shmem %d - cuda stream "
+        "id %ld\n",
+        (uint64_t)ctx, pc, func_name, global_grid_launch_id, config.gridDimX,
+        config.gridDimY, config.gridDimZ, config.blockDimX, config.blockDimY,
+        config.blockDimZ, config.num_registers,
+        config.shmem_static_nbytes + config.shmem_dynamic_nbytes,
+        (uint64_t)stream);
+    // grid id can be changed here, since nvbit_set_at_launch() has copied its
+    // value above.
+    global_grid_launch_id++;
+    pthread_mutex_unlock(&mutex);
+}
+
+void spinlock_check() {
+    // Get spinlock run folders
+    std::string spinlock_run0_folder = spinlock_run_dir + "spinlock_detection/spinlock_run_0_merged";
+    std::string spinlock_run1_folder = spinlock_run_dir + "spinlock_detection/spinlock_run_1_merged";
+
+    // Get the map from kernel name to histogram
+    std::map<std::string, KernelInstructionHistogram*> spinlock_run0_histograms;
+    std::map<std::string, KernelInstructionHistogram*> spinlock_run1_histograms;
+
+    // Load the histogram files
+    DPRINTF("Spinlock: Loading histograms from %s and %s\n", spinlock_run0_folder.c_str(), spinlock_run1_folder.c_str());
+    std::map<std::string, std::map<std::string, KernelInstructionHistogram*>*> zipped_folders = {{spinlock_run0_folder, &spinlock_run0_histograms}, {spinlock_run1_folder, &spinlock_run1_histograms}};
+    for (const auto& iter : zipped_folders) {
+        auto spinlock_run_folder = iter.first;
+        auto histograms = iter.second;
+        DPRINTF("Spinlock: Loading histograms from %s\n", spinlock_run_folder.c_str());
+        assert(std::filesystem::exists(spinlock_run_folder));
+        for (const auto& entry : std::filesystem::directory_iterator(spinlock_run_folder)) {
+            if (entry.is_regular_file() && entry.path().extension().compare(".histogram") == 0) {
+                DPRINTF("Spinlock: Loading histogram from %s\n", entry.path().string().c_str());
+                KernelInstructionHistogram *histogram = new KernelInstructionHistogram();
+                histogram->loadFromFile(entry.path().string());
+                histograms->insert({histogram->name, histogram});
+            }
+        }
+    }
+    DPRINTF("Spinlock: Loaded %zu histograms from %s and %zu histograms from %s\n", spinlock_run0_histograms.size(), spinlock_run0_folder.c_str(), spinlock_run1_histograms.size(), spinlock_run1_folder.c_str());
+    // Check if the kernel counts are the same
+    assert(spinlock_run0_histograms.size() == spinlock_run1_histograms.size());
+
+    DPRINTF("Spinlock: Comparing the two histograms\n");
+    // Now compare the two histograms and generate output of spinlock instructions per context
+    // Each row will be kernel id, kernel name, and indices of spinlock instructions
+    std::string output_file = spinlock_run_dir + "spinlock_detection/spinlock_instructions.txt";
+    std::ofstream output_file_stream(output_file);
+    DPRINTF("Spinlock: Generating output file %s\n", output_file.c_str());
+    for (auto [kernel_name, run0_histogram] : spinlock_run0_histograms) {
+        DPRINTF("Spinlock: Comparing histogram %d %s\n", run0_histogram->id, kernel_name.c_str());
+        auto run1_histogram = spinlock_run1_histograms.at(kernel_name);
+        auto spinlock_instructions = run0_histogram->findSpinlock(*run1_histogram);
+        DPRINTF("Spinlock: Found %zu spinlock instructions\n", spinlock_instructions.size());
+        output_file_stream << run0_histogram->id << ", " << kernel_name << ": ";
+        for (auto [instr_idx, counts] : spinlock_instructions) {
+            // Write to output file
+            output_file_stream << instr_idx << " ";
+        }
+        output_file_stream << "\n";
+    }
+    output_file_stream.close();
+    DPRINTF("Spinlock: Generated output file %s\n", output_file.c_str());
+    // Free the histograms
+    for (auto [kernel_name, histogram] : spinlock_run0_histograms) {
+        delete histogram;
+    }
+    for (auto [kernel_name, histogram] : spinlock_run1_histograms) {
+        delete histogram;
+    }
+
+    // Clean up intermediate files
+    if (!spinlock_keep_intermediate_files) {
+        // Remove the ctx_<id> and spinlock_run_<phase>_merged folders
+        for (auto& folder : std::filesystem::directory_iterator(spinlock_run_dir + "spinlock_detection")) {
+            if (folder.path().filename().string().find("ctx_") != std::string::npos) {
+                std::filesystem::remove_all(folder.path());
+            } else if (folder.path().filename().string().find("spinlock_run_") != std::string::npos) {
+                std::filesystem::remove_all(folder.path());
+            }
+        }
+    }
+    return;
+}
diff --git a/util/tracer_nvbit/run_hw_trace.py b/util/tracer_nvbit/run_hw_trace.py
index f6153eafb..07d22c41a 100755
--- a/util/tracer_nvbit/run_hw_trace.py
+++ b/util/tracer_nvbit/run_hw_trace.py
@@ -53,6 +53,20 @@
     action="store_true",
     help="Once the kernel limit is reached, terminate the tracing process",
 )
+parser.add_option(
+    "--spinlock_handling",
+    dest="spinlock_handling",
+    choices=["none", "fast_forward"],
+    default="none",
+    help="How to handle spinlock instructions",
+)
+parser.add_option(
+    "--spinlock_fast_forward_iterations",
+    dest="spinlock_fast_forward_iterations",
+    type=int,
+    default=1,
+    help="Number of iterations to keep for spinlock fast forwarding. Only used if spinlock_handling is fast_forward",
+)
 
 (options, args) = parser.parse_args()
 
@@ -68,6 +82,7 @@
 logfile = day_string + "--" + time_string + ".csv"
 
 nvbit_tracer_path = os.path.join(this_directory, "tracer_tool")
+nvbit_spinlock_path = os.path.join(this_directory, "others", "spinlock_tool")
 
 for bench in benchmarks:
     edir, ddir, exe, argslist = bench
@@ -143,13 +158,14 @@
            + '"; export CUDA_VISIBLE_DEVICES="'
            + options.device_num
            + '" ; '
+        )
+
+        tracer_contents = (
+            sh_contents
            + "\nrm -f traces/*"
            + "\nexport TRACES_FOLDER="
            + this_run_dir
-            + "; CUDA_INJECTION64_PATH="
-            + os.path.join(nvbit_tracer_path, "tracer_tool.so")
-            + " "
-            + " LD_PRELOAD="
+            + f"; ENABLE_SPINLOCK_FAST_FORWARD={1 if options.spinlock_handling == 'fast_forward' else 0} SPINLOCK_ITER_TO_KEEP={options.spinlock_fast_forward_iterations} CUDA_INJECTION64_PATH="
            + os.path.join(nvbit_tracer_path, "tracer_tool.so")
            + " "
            + exec_path
@@ -167,16 +183,46 @@
            + this_trace_folder
            + "/kernelslist "
        )
+
+        # Spinlock tool run script
+        # will run twice, once for phase 0 and once for phase 1
+        spinlock_contents = (
+            sh_contents
+            + "\nrm -f spinlock_detection/*"
+            + "\nexport TRACES_FOLDER="
+            + this_run_dir
+            + "; SPINLOCK_PHASE=0 CUDA_INJECTION64_PATH="
+            + os.path.join(nvbit_spinlock_path, "spinlock_tool.so")
+            + " "
+            + exec_path
+            + " "
+            + str(args)
+            + " ; "
+            + " SPINLOCK_PHASE=1 CUDA_INJECTION64_PATH="
+            + os.path.join(nvbit_spinlock_path, "spinlock_tool.so")
+            + " "
+            + exec_path
+            + " "
+            + str(args)
+            + " ; "
+        )
+
+        for path, content in [("run.sh", tracer_contents), ("run_spinlock_detection.sh", spinlock_contents)]:
+            open(os.path.join(this_run_dir, path), "w").write(content)
+            if subprocess.call(["chmod", "u+x", os.path.join(this_run_dir, path)]) != 0:
+                exit(f"Error chmod {path} runfile")
 
-        open(os.path.join(this_run_dir, "run.sh"), "w").write(sh_contents)
-        if subprocess.call(["chmod", "u+x", os.path.join(this_run_dir, "run.sh")]) != 0:
-            exit("Error chmod runfile")
        if not options.norun:
            saved_dir = os.getcwd()
            os.chdir(this_run_dir)
            print("Running {0}".format(exe))
+
+            # Call the spinlock detection script
+            if options.spinlock_handling == 'fast_forward':
+                if subprocess.call(["bash", "run_spinlock_detection.sh"]) != 0:
+                    sys.exit(f"Error invoking spinlock detection on {this_run_dir}")
+
            if subprocess.call(["bash", "run.sh"]) != 0:
-                sys.exit("Error invoking nvbit on {0}".format(this_run_dir))
+                sys.exit(f"Error invoking nvbit on {this_run_dir}")
            os.chdir(saved_dir)
diff --git a/util/tracer_nvbit/tracer_tool/Makefile b/util/tracer_nvbit/tracer_tool/Makefile
index 950639750..f4bff9fe5 100644
--- a/util/tracer_nvbit/tracer_tool/Makefile
+++ b/util/tracer_nvbit/tracer_tool/Makefile
@@ -49,7 +49,7 @@ $(NVBIT_TOOL): $(OBJECTS) $(NVBIT_PATH)/libnvbit.a
 	$(NVCC) -arch=$(ARCH) -O3 $(OBJECTS) $(LIBS) $(NVCC_PATH) -lcuda -lcudart_static -shared -o $@
 
 %.o: %.cu common.h
-	$(NVCC) -dc -c -std=c++11 $(INCLUDES) -Xptxas -cloning=no -Xcompiler -Wall -arch=$(ARCH) -O3 -Xcompiler -fPIC $< -o $@
+	$(NVCC) -dc -c -std=c++17 $(INCLUDES) -Xptxas -cloning=no -Xcompiler -Wall -arch=$(ARCH) -O3 -Xcompiler -fPIC $< -o $@
 
 inject_funcs.o: inject_funcs.cu common.h
 	$(NVCC) $(INCLUDES) $(MAXRREGCOUNT_FLAG) -Xptxas -astoolspatch --keep-device-functions -arch=$(ARCH) -Xcompiler -Wall -Xcompiler -fPIC -c $< -o $@
diff --git a/util/tracer_nvbit/tracer_tool/common.h b/util/tracer_nvbit/tracer_tool/common.h
index 7eff0feb7..9da0a7cbd 100644
--- a/util/tracer_nvbit/tracer_tool/common.h
+++ b/util/tracer_nvbit/tracer_tool/common.h
@@ -21,6 +21,7 @@ typedef struct {
   int opcode_id;
   uint64_t addrs[32];
   uint32_t line_num;
+  uint32_t instr_idx;
   uint32_t vpc;
   bool is_mem;
   int32_t GPRDst;
diff --git a/util/tracer_nvbit/tracer_tool/inject_funcs.cu b/util/tracer_nvbit/tracer_tool/inject_funcs.cu
index dfc3f8769..015d108f0 100644
--- a/util/tracer_nvbit/tracer_tool/inject_funcs.cu
+++ b/util/tracer_nvbit/tracer_tool/inject_funcs.cu
@@ -25,7 +25,7 @@
 instrument_inst(int pred, int opcode_id, int32_t vpc, bool is_mem,
                 int32_t srcReg5, int32_t srcNum, uint64_t immediate,
                 uint64_t pchannel_dev, uint64_t ptotal_dynamic_instr_counter,
                 uint64_t preported_dynamic_instr_counter, uint64_t pstop_report,
-                uint32_t line_num) {
+                uint32_t line_num, uint32_t instr_idx) {
   const int active_mask = __ballot_sync(__activemask(), 1);
   const int predicate_mask = __ballot_sync(__activemask(), pred);
   const int laneid = get_laneid();
@@ -55,6 +55,7 @@
   int uniqe_threadId = threadIdx.z * blockDim.y * blockDim.x +
                        threadIdx.y * blockDim.x + threadIdx.x;
   ma.line_num = line_num;
+  ma.instr_idx = instr_idx;
   ma.warpid_tb = uniqe_threadId / 32;
 
   ma.cta_id_x = cta.x;
diff --git a/util/tracer_nvbit/tracer_tool/tracer_tool.cu b/util/tracer_nvbit/tracer_tool/tracer_tool.cu
index f4649c5ee..b36c33b18 100644
--- a/util/tracer_nvbit/tracer_tool/tracer_tool.cu
+++ b/util/tracer_nvbit/tracer_tool/tracer_tool.cu
@@ -10,6 +10,7 @@
 #include
 #include
+#include
 #include
 #include
 #include
@@ -80,6 +81,7 @@
 std::unordered_map ctx_kernelslist;
 std::unordered_map ctx_stats_location;
 std::unordered_map ctx_kernelid;
 std::unordered_map ctx_resultsFile;
+std::unordered_map<CUcontext, std::string> ctx_current_kernel_name;
 
 std::string kernel_ranges = "";
 
@@ -200,6 +202,14 @@
 static bool first_call = true;
 
 unsigned old_total_insts = 0;
 unsigned old_total_reported_insts = 0;
 
+/* Spinlock fast forward control */
+int enable_spinlock_fast_forward = 0;
+int spinlock_iter_to_keep = 0;
+// Map from kernel name to spinlock instruction indices
+std::map<std::string, std::vector<uint32_t> *> spinlock_instr_map;
+std::pair<std::string, std::vector<uint32_t>>
+parse_spinlock_instructions(const std::string &line);
+
 void nvbit_at_init() {
   setenv("CUDA_MANAGED_FORCE_DEVICE_ALLOC", "1", 1);
   GET_VAR_INT(
@@ -241,6 +251,10 @@ void nvbit_at_init() {
   GET_VAR_INT(xz_compress_trace, "TRACE_FILE_COMPRESS", 1,
               "Create xz-compressed trace"
               "file");
+  GET_VAR_INT(enable_spinlock_fast_forward, "ENABLE_SPINLOCK_FAST_FORWARD", 0,
+              "Enable spinlock fast forwarding");
+  GET_VAR_INT(spinlock_iter_to_keep, "SPINLOCK_ITER_TO_KEEP", 1,
+              "Number of iterations to keep for spinlock fast forwarding");
   std::string pad(100, '-');
   printf("%s\n", pad.c_str());
 
@@ -249,6 +263,20 @@ void nvbit_at_init() {
   if (usr_defined_folder != NULL)
     user_folder = usr_defined_folder;
 
   parse_kernel_ranges_from_env();
+
+  // Read in the spinlock_instructions.txt and build a map from kernel name to
+  // spinlock instruction indices
+  if (enable_spinlock_fast_forward) {
+    std::string spinlock_instr_file =
+        user_folder + "/spinlock_detection/spinlock_instructions.txt";
+    std::ifstream instr_fs(spinlock_instr_file);
+    std::string line;
+    while (std::getline(instr_fs, line)) {
+      auto [kernel_name, indices] = parse_spinlock_instructions(line);
+      spinlock_instr_map[kernel_name] = new std::vector<uint32_t>(indices);
+    }
+    instr_fs.close();
+  }
 }
 
 /* Set used to avoid re-instrumenting the same functions multiple times */
@@ -400,6 +428,8 @@ void instrument_function_if_needed(CUcontext ctx, CUfunction func) {
           nvbit_add_call_arg_const_val64(instr, (uint64_t)&stop_report);
           /* Add Source code line number for current instr */
           nvbit_add_call_arg_const_val32(instr, (int)line_num);
+          /* Add instruction index for current instr (spinlock detection) */
+          nvbit_add_call_arg_const_val32(instr, (uint32_t)instr->getIdx());
 
           mem_oper_idx--;
         } while (mem_oper_idx >= 0);
@@ -565,6 +595,8 @@ static void enter_kernel_launch(CUcontext ctx, CUfunction func,
     fclose(statsFile);
 
     ctx_kernelid[ctx]++;
+    ctx_current_kernel_name[ctx] =
+        std::string(nvbit_get_func_name(ctx, func, true));
 
     recv_thread_receiving = true;
   }
@@ -888,9 +920,52 @@ void base_delta_compress(const uint64_t *addrs, const std::bitset<32> &mask,
     }
 }
 
+void trim_string(std::string &str) {
+  // Remove the leading and trailing spaces
+  str.erase(0, str.find_first_not_of(' '));
+  str.erase(str.find_last_not_of(' ') + 1);
+}
+
+typedef std::map<uint32_t, uint32_t> counter_t;
+typedef std::tuple<int, int, int, int> warp_key_t;
+
+counter_t create_counter(const std::vector<uint32_t> &indices) {
+  counter_t counter;
+  for (auto instr_idx : indices) {
+    counter[instr_idx] = 0;
+  }
+  return counter;
+}
+
+std::pair<std::string, std::vector<uint32_t>>
+parse_spinlock_instructions(const std::string &line) {
+  std::vector<uint32_t> indices;
+  // Each line is of the form: <kernel id>, <kernel name>: <indices>
+  // Though the kernel id is not used
+  // -2 for the comma and the space
+  size_t name_length = line.find(':') - line.find(',') - 2;
+  std::string kernel_name = line.substr(line.find(',') + 2, name_length);
+  std::string indices_str = line.substr(line.find(':') + 1);
+  trim_string(indices_str);
+  std::stringstream ss(indices_str);
+  std::string instr_idx;
+  while (std::getline(ss, instr_idx, ' ')) {
+    indices.push_back(std::stoi(instr_idx));
+  }
+  return {kernel_name, indices};
+}
+
 void *recv_thread_fun(void *args) {
   CUcontext ctx = (CUcontext)args;
   char *recv_buffer = (char *)malloc(CHANNEL_SIZE);
+
+  // This counter map will keep track of the spinlock instruction
+  // count in the current detected spinlock loop for each warp.
+  // Detection starts when a spinlock instruction is encountered (start to
+  // increment the counter) and ends when a non-spinlock instruction is
+  // encountered (clear the counter)
+  std::map<warp_key_t, counter_t> warp_counter_map;
+
   while (recv_thread_started) {
     uint32_t num_recv_bytes = 0;
     if (recv_thread_receiving &&
@@ -903,9 +978,52 @@
          */
         if (ma->cta_id_x == -1) {
           recv_thread_receiving = false;
+          if (enable_spinlock_fast_forward) {
+            // Clear the counter map for all warps as we are starting a new
+            // kernel
+            warp_counter_map.clear();
+          }
           break;
         }
 
+        /* Spinlock fast forwarding */
+        if (enable_spinlock_fast_forward) {
+          // Check if this warp is in the warp_counter_map
+          warp_key_t warp_key = std::make_tuple(ma->cta_id_x, ma->cta_id_y,
+                                                ma->cta_id_z, ma->warpid_tb);
+          if (warp_counter_map.find(warp_key) == warp_counter_map.end()) {
+            // This warp is not in the warp_counter_map, so we create a counter
+            // for this warp using the spinlock instruction indices for the
+            // current kernel
+            std::vector<uint32_t> &indices =
+                *(spinlock_instr_map[ctx_current_kernel_name[ctx]]);
+            warp_counter_map[warp_key] = create_counter(indices);
+          }
+
+          // Get the counter map for this warp
+          auto &counter = warp_counter_map[warp_key];
+
+          // Now check if we should start spinlock fast forwarding for this warp
+          if (counter.find(ma->instr_idx) != counter.end()) {
+            // We are still in a spinlock loop, so we increment the counter
+            counter[ma->instr_idx]++;
+            if (counter[ma->instr_idx] > spinlock_iter_to_keep) {
+              // This spinlock instruction has executed more than the threshold,
+              // so we fast-forward (skip) it in the output trace.
+              // Note we are only fast-forwarding the innermost spinlock loop
+              num_processed_bytes += sizeof(inst_trace_t);
+              continue;
+            }
+          } else {
+            // We are exiting the innermost spinlock loop, so we reset the
+            // counter map for this warp
+            for (auto &[instr_idx, count] : counter) {
+              count = 0;
+            }
+          }
+        }
+
+        /* Dump the instruction trace information */
         fprintf(ctx_resultsFile[ctx], "%d ", ma->cta_id_x);
         fprintf(ctx_resultsFile[ctx], "%d ", ma->cta_id_y);
         fprintf(ctx_resultsFile[ctx], "%d ", ma->cta_id_z);
@@ -1000,10 +1118,12 @@
       }
     }
   }
   free(recv_buffer);
+
+  return NULL;
 }
 
 void nvbit_tool_init(CUcontext ctx) {
+  ctx_current_kernel_name[ctx] = "";
   recv_thread_started = true;
   channel_host.init(0, CHANNEL_SIZE, &channel_dev, NULL);
   pthread_create(&recv_thread, NULL, recv_thread_fun, ctx);