diff --git a/PIMbench/lnorm/Makefile b/PIMbench/lnorm/Makefile new file mode 100644 index 00000000..b3e4ce58 --- /dev/null +++ b/PIMbench/lnorm/Makefile @@ -0,0 +1,16 @@ +# Makefile: C++ version of layer normalization +# Copyright (c) 2024 University of Virginia +# This file is licensed under the MIT License. +# See the LICENSE file in the root of this repository for more details. + +SUBDIRS := PIM + +.PHONY: debug perf dramsim3_integ clean $(SUBDIRS) +.DEFAULT_GOAL := perf + +USE_OPENMP ?= 0 + +debug perf dramsim3_integ clean: $(SUBDIRS) + +$(SUBDIRS): + $(MAKE) -C $@ $(MAKECMDGOALS) USE_OPENMP=$(USE_OPENMP) diff --git a/PIMbench/lnorm/PIM/Makefile b/PIMbench/lnorm/PIM/Makefile new file mode 100644 index 00000000..e637f51d --- /dev/null +++ b/PIMbench/lnorm/PIM/Makefile @@ -0,0 +1,24 @@ +# Makefile: C++ version of layer normalization +# Copyright (c) 2024 University of Virginia +# This file is licensed under the MIT License. +# See the LICENSE file in the root of this repository for more details. + +PROJ_ROOT = ../../.. +include ${PROJ_ROOT}/Makefile.common + +# make USE_OPENMP=1 +USE_OPENMP ?= 0 +ifeq ($(USE_OPENMP),1) + CXXFLAGS += -fopenmp +endif + +EXEC := lnorm.out +SRC := lnorm.cpp + +debug perf dramsim3_integ: $(EXEC) + +$(EXEC): $(SRC) $(DEPS) + $(CXX) $< $(CXXFLAGS) -o $@ + +clean: + rm -rf $(EXEC) *.dSYM diff --git a/PIMbench/lnorm/PIM/lnorm.cpp b/PIMbench/lnorm/PIM/lnorm.cpp new file mode 100644 index 00000000..75b7375f --- /dev/null +++ b/PIMbench/lnorm/PIM/lnorm.cpp @@ -0,0 +1,291 @@ +// Test: C++ version of layer normalization +// Copyright (c) 2024 University of Virginia +// This file is licensed under the MIT License. +// See the LICENSE file in the root of this repository for more details. 
+ +#include +#include +#include +#include +#include +#if defined(_OPENMP) +#include +#endif + +#include "util.h" +#include "libpimeval.h" +#include +#include + +std::chrono::duration hostElapsedTime = std::chrono::duration::zero(); +//auto start_cpu, stop_cpu; + +// Params --------------------------------------------------------------------- +typedef struct Params +{ + uint64_t vectorLength; + char *configFile; + char *inputFile; + bool shouldVerify; +} Params; + +void usage() +{ + fprintf(stderr, + "\nUsage: ./lnorm.out [options]" + "\n" + "\n -l vectorLength (default=128 elements)" + "\n -c dramsim config file" + "\n -i input file containing two vectors (default=generates vector with random numbers)" + "\n -v t = verifies PIM output with host output. (default=false)" + "\n"); +} + +struct Params getInputParams(int argc, char **argv) +{ + struct Params p; + p.vectorLength = 128; + p.configFile = nullptr; + p.inputFile = nullptr; + p.shouldVerify = false; + + int opt; + while ((opt = getopt(argc, argv, "h:l:c:i:v:")) >= 0) + { + switch (opt) + { + case 'h': + usage(); + exit(0); + break; + case 'l': + p.vectorLength = strtoull(optarg, NULL, 0); + break; + case 'c': + p.configFile = optarg; + break; + case 'i': + p.inputFile = optarg; + break; + case 'v': + p.shouldVerify = (*optarg == 't') ? 
true : false; + break; + default: + fprintf(stderr, "\nUnrecognized option!\n"); + usage(); + exit(0); + } + } + return p; +} + +// Newton-Raphson iterative integer square root +uint32_t newton_sqrt(uint32_t x) { + if (x == 0) return 0; // Handle zero case + + uint32_t guess = x; // Initial guess + uint32_t prev_guess = 0; + + while (guess != prev_guess) { // Continue until convergence + prev_guess = guess; + guess = (guess + x / guess) / 2; // Newton-Raphson iteration + } + + //std::cout << "newton sqrt: " << guess << std::endl; + return guess; +} + +void lnorm(uint64_t vectorLength, std::vector &srcVector, std::vector &dst) +{ + PimObjId srcObj1 = pimAlloc(PIM_ALLOC_AUTO, vectorLength, PIM_INT32); + if (srcObj1 == -1) + { + std::cout << "Abort" << std::endl; + return; + } + + PimObjId tempObj1 = pimAllocAssociated(srcObj1, PIM_INT32); + if (tempObj1 == -1) + { + std::cout << "Abort" << std::endl; + return; + } + + PimObjId dstObj = pimAllocAssociated(srcObj1, PIM_INT32); + if (dstObj == -1) + { + std::cout << "Abort" << std::endl; + return; + } + + PimStatus status; + + status = pimCopyHostToDevice((void *)srcVector.data(), srcObj1); + if (status != PIM_OK) + { + std::cout << "Abort" << std::endl; + return; + } + + //mean + int32_t sum = 0; + status = pimRedSum(srcObj1, static_cast(&sum), 0, vectorLength); + if (status != PIM_OK) + { + std::cout << "Abort" << std::endl; + return; + } + auto start_cpu = std::chrono::high_resolution_clock::now(); + int32_t mean = sum/vectorLength; + std::cout << "mean " << mean << " sum " << sum <(&sum2), 0, vectorLength); + if (status != PIM_OK) + { + std::cout << "Abort" << std::endl; + return; + } + + start_cpu = std::chrono::high_resolution_clock::now(); + + int32_t variance = sum2/vectorLength; + int32_t sqrt_var = newton_sqrt(variance + 1); + std::cout << "sqrt_var " << sqrt_var << " var " << variance < srcVector (params.vectorLength, 1), resultVector; + + if (params.shouldVerify) { + if (params.inputFile == nullptr) + { + 
getVector(params.vectorLength, srcVector); + } + else + { + std::cout << "Reading from input file is not implemented yet." << std::endl; + return 1; + } + } + + + if (!createDevice(params.configFile)) + { + return 1; + } + + // TODO: Check if vector can fit in one iteration. Otherwise need to run in multiple iteration. + lnorm(params.vectorLength, srcVector, resultVector); + + if (params.shouldVerify) + { + bool shouldBreak = false; // shared flag variable + + // verify result + + std::vector result (params.vectorLength, 0); + std::vector src_minus_mean (params.vectorLength, 0); + std::vector sq_src_minus_mean (params.vectorLength, 0); + + int32_t sum = 0; + + for (size_t i = 0; i < params.vectorLength; i++) { + sum += srcVector[i]; + } + + int32_t mean = sum / params.vectorLength; + + for (size_t i = 0; i < params.vectorLength; i++) { + src_minus_mean[i] = srcVector[i] - mean; + } + + for (size_t i = 0; i < params.vectorLength; i++) { + sq_src_minus_mean[i] = (int32_t)(src_minus_mean[i]*src_minus_mean[i]); + } + + int32_t sum2 = 0; + for (size_t i = 0; i < params.vectorLength; i++) { + sum2 += sq_src_minus_mean[i]; + } + + int32_t var = sum2/params.vectorLength; + + int32_t sqrt_var = newton_sqrt(var+1); + if(sqrt_var==0){ + sqrt_var = 1; + } + + // layer norm + for (size_t i = 0; i < params.vectorLength; i++) { + result[i] = src_minus_mean[i] / (sqrt_var); // Prevent division by zero + } + + for (size_t i = 0; i < params.vectorLength; i++) + { + if (result[i] != resultVector[i]) + { + #pragma omp critical + { + if (!shouldBreak) + { // check the flag again in a critical section + std::cout << "Wrong answer: " << resultVector[i] << " (expected " << result[i] << ")" << std::endl; + shouldBreak = true; // set the flag to true + } + } + } + } + + + if (!shouldBreak) { + std::cout << "\n\nCorrect Answer!!\n\n"; + } + } + + pimShowStats(); + std::cout << "Host elapsed time: " << std::fixed << std::setprecision(3) << hostElapsedTime.count() << " ms." 
<< endl; + + return 0; +} diff --git a/PIMbench/lnorm/PIM/run_lnorm.sh b/PIMbench/lnorm/PIM/run_lnorm.sh new file mode 100755 index 00000000..e7aaba83 --- /dev/null +++ b/PIMbench/lnorm/PIM/run_lnorm.sh @@ -0,0 +1 @@ +./lnorm.out -v t -c ../../../configs/hbm/PIMeval_Bank_Rank1.cfg -l 128 diff --git a/PIMbench/lnorm/README.md b/PIMbench/lnorm/README.md new file mode 100644 index 00000000..6c9f4303 --- /dev/null +++ b/PIMbench/lnorm/README.md @@ -0,0 +1,95 @@ +# Layer Normalization (LNORM) + +The LayerNorm is a normalization function mostly used in AI models. + + +For a detailed description of LayerNorm, you can refer to the [torch.nn.LayerNorm](https://pytorch.org/docs/stable/generated/torch.nn.LayerNorm.html) or the [paper](https://arxiv.org/pdf/1607.06450) + +## Directory Structure + +``` +lnorm/ +├── PIM/ +│ ├── Makefile +│ ├── lnorm.cpp +├── baselines/ +│ ├── CPU/ +│ │ ├── Makefile +│ │ ├── lnorm.cpp +│ ├── GPU/ **TODO** +│ │ ├── Makefile +│ │ ├── lnorm.cu +├── README.md +├── Makefile +``` + +## Implementation Description + +This repository contains three different implementations of the LNORM benchmark: + +1. CPU +2. GPU **TODO** +3. PIM + +### Baseline Implementation + +CPU and GPU have been used as baselines. + +#### CPU + +The CPU variant ... + +#### GPU + +The GPU variant (**TODO** Try torch LayerNorm) + +### PIM Implementation + +The PIM variant is implemented using C++ and three different PIM architectures can be tested with this. + +## Compilation Instructions for Specific Variants + +### CPU Variant + +To compile for the CPU variant, use: + +```bash +cd baselines/CPU +make +``` + +### GPU Variant + +To compile for the GPU variant, use: + +```bash +cd baselines/GPU +make +``` + +*Note that the GPU Makefile currently uses SM_80, which is compatible with the A100. To run it on a different GPU, please manually change this in the makefile. 
+ +### PIM Variant + +To compile for the PIM variant, use: + +```bash +cd PIM +make -j USE_OPENMP=1 +``` + +## Execution Instructions + +### Running the Executable + +After compiling, run each executable with the following command, which uses the default parameters: + +```bash +./lnorm.out +``` + +To see help text on all usages and how to modify any of the input parameters, use the following command: + +```bash +./lnorm.out -h +``` diff --git a/PIMbench/lnorm/baselines/CPU/Makefile b/PIMbench/lnorm/baselines/CPU/Makefile new file mode 100644 index 00000000..5a45f922 --- /dev/null +++ b/PIMbench/lnorm/baselines/CPU/Makefile @@ -0,0 +1,24 @@ +# Compiler +CXX := g++ + +# Compiler flags +CXXFLAGS := -Wall -Wextra -Werror -std=c++17 -O3 -fopenmp +LDFLAGS = -lopenblas + +# Executable name +EXEC := lnorm.out + +# Source files +SRC_FILES := $(wildcard *.cpp) + + +.PHONY: all clean + +all: $(EXEC) + +$(EXEC): $(SRC_FILES) | + $(CXX) $(CXXFLAGS) -o $@ $^ $(LDFLAGS) $(LDFLAGS) + +clean: + rm -rf $(EXEC) + diff --git a/PIMbench/lnorm/baselines/CPU/lnorm.cpp b/PIMbench/lnorm/baselines/CPU/lnorm.cpp new file mode 100644 index 00000000..5e1124b7 --- /dev/null +++ b/PIMbench/lnorm/baselines/CPU/lnorm.cpp @@ -0,0 +1,135 @@ +/** + * @file lnorm.cpp + * @brief LNORM. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../../../../util/utilBaselines.h" + +using namespace std; + +// Global Vectors +vector A; +vector B; + + +// Params --------------------------------------------------------------------- +typedef struct Params +{ + uint64_t vectorLength; +} Params; + +void usage() +{ + fprintf(stderr, + "\nUsage: ./lnorm.out [options]" + "\n" + "\n -l vector size (default=128 elements)" + "\n -v t = verifies PIM output with host output. 
(default=false)" + "\n"); +} + +/** + * @brief Parses command line input parameters + * @param argc Number of command line arguments + * @param argv Array of command line arguments + * @return Parsed parameters + */ +struct Params parseParams(int argc, char **argv) +{ + struct Params p; + p.vectorLength = 128; + + int opt; + while ((opt = getopt(argc, argv, ":l:h:")) >= 0) + { + switch (opt) + { + case 'h': + usage(); + exit(0); + case 'l': + p.vectorLength = stoull(optarg); + break; + default: + cerr << "\nUnrecognized option: " << opt << "\n"; + usage(); + exit(1); + } + } + return p; +} + +void lnorm(uint64_t vectorLength, std::vector &srcVector, std::vector &dst) +{ + + std::vector src_minus_mean (vectorLength, 0); + std::vector sq_src_minus_mean (vectorLength, 0); + + int32_t sum = 0; + + for (size_t i = 0; i < vectorLength; i++) { + sum += srcVector[i]; + } + + int32_t mean = sum / vectorLength; + + for (size_t i = 0; i < vectorLength; i++) { + src_minus_mean[i] = srcVector[i] - mean; + } + + for (size_t i = 0; i < vectorLength; i++) { + sq_src_minus_mean[i] = (int32_t)(src_minus_mean[i]*src_minus_mean[i]); + } + + int32_t sum2 = 0; + for (size_t i = 0; i < vectorLength; i++) { + sum2 += sq_src_minus_mean[i]; + } + + int32_t var = sum2/vectorLength; + + int32_t sqrt_var = sqrt(var+1); + + // layer norm + for (size_t i = 0; i < vectorLength; i++) { + dst[i] = src_minus_mean[i] / (sqrt_var + 1); // Prevent division by zero + } +} + +/** + * @brief Main function. + */ +int main(int argc, char **argv) +{ + // Parse input parameters + Params params = parseParams(argc, argv); + uint64_t vectorLength = params.vectorLength; + + // Initialize vectors + getVector(vectorLength, A); + B.resize(vectorLength); + std::cout << "Done initialization." 
<< std::endl; + + auto start = chrono::high_resolution_clock::now(); + + for (int32_t i = 0; i < WARMUP; i++) + { + lnorm(vectorLength, A, B); + } + + auto end = chrono::high_resolution_clock::now(); + + chrono::duration elapsedTime = (end - start) / WARMUP; + cout << "Duration: " << fixed << setprecision(3) << elapsedTime.count() << " ms." << endl; + + return 0; +} diff --git a/PIMbench/rmsnorm/Makefile b/PIMbench/rmsnorm/Makefile new file mode 100644 index 00000000..b3e4ce58 --- /dev/null +++ b/PIMbench/rmsnorm/Makefile @@ -0,0 +1,16 @@ +# Makefile: C++ version of matrix vector multiplication +# Copyright (c) 2024 University of Virginia +# This file is licensed under the MIT License. +# See the LICENSE file in the root of this repository for more details. + +SUBDIRS := PIM + +.PHONY: debug perf dramsim3_integ clean $(SUBDIRS) +.DEFAULT_GOAL := perf + +USE_OPENMP ?= 0 + +debug perf dramsim3_integ clean: $(SUBDIRS) + +$(SUBDIRS): + $(MAKE) -C $@ $(MAKECMDGOALS) USE_OPENMP=$(USE_OPENMP) diff --git a/PIMbench/rmsnorm/PIM/Makefile b/PIMbench/rmsnorm/PIM/Makefile new file mode 100644 index 00000000..53225bf0 --- /dev/null +++ b/PIMbench/rmsnorm/PIM/Makefile @@ -0,0 +1,24 @@ +# Makefile: C++ version of matrix vector multiplication +# Copyright (c) 2024 University of Virginia +# This file is licensed under the MIT License. +# See the LICENSE file in the root of this repository for more details. + +PROJ_ROOT = ../../.. 
+include ${PROJ_ROOT}/Makefile.common + +# make USE_OPENMP=1 +USE_OPENMP ?= 0 +ifeq ($(USE_OPENMP),1) + CXXFLAGS += -fopenmp +endif + +EXEC := rmsnorm.out +SRC := rmsnorm.cpp + +debug perf dramsim3_integ: $(EXEC) + +$(EXEC): $(SRC) $(DEPS) + $(CXX) $< $(CXXFLAGS) -o $@ + +clean: + rm -rf $(EXEC) *.dSYM diff --git a/PIMbench/rmsnorm/PIM/rmsnorm.cpp b/PIMbench/rmsnorm/PIM/rmsnorm.cpp new file mode 100644 index 00000000..07a09547 --- /dev/null +++ b/PIMbench/rmsnorm/PIM/rmsnorm.cpp @@ -0,0 +1,250 @@ +// Test: C++ version of matrix vector multiplication +// Copyright (c) 2024 University of Virginia +// This file is licensed under the MIT License. +// See the LICENSE file in the root of this repository for more details. + +#include +#include +#include +#include +#include +#if defined(_OPENMP) +#include +#endif + +#include "util.h" +#include "libpimeval.h" +#include +#include + +std::chrono::duration hostElapsedTime = std::chrono::duration::zero(); + +// Params --------------------------------------------------------------------- +typedef struct Params +{ + uint64_t vectorLength; + char *configFile; + char *inputFile; + bool shouldVerify; +} Params; + +void usage() +{ + fprintf(stderr, + "\nUsage: ./rmsnorm.out [options]" + "\n" + "\n -l vectorLength (default=128 elements)" + "\n -c dramsim config file" + "\n -i input file containing two vectors (default=generates vector with random numbers)" + "\n -v t = verifies PIM output with host output. 
(default=false)" + "\n"); +} + +struct Params getInputParams(int argc, char **argv) +{ + struct Params p; + p.vectorLength = 128; + p.configFile = nullptr; + p.inputFile = nullptr; + p.shouldVerify = false; + + int opt; + while ((opt = getopt(argc, argv, "h:l:c:i:v:")) >= 0) + { + switch (opt) + { + case 'h': + usage(); + exit(0); + break; + case 'l': + p.vectorLength = strtoull(optarg, NULL, 0); + break; + case 'c': + p.configFile = optarg; + break; + case 'i': + p.inputFile = optarg; + break; + case 'v': + p.shouldVerify = (*optarg == 't') ? true : false; + break; + default: + fprintf(stderr, "\nUnrecognized option!\n"); + usage(); + exit(0); + } + } + return p; +} + +// Newton-Raphson iterative integer square root +uint32_t newton_sqrt(uint32_t x) { + if (x == 0) return 0; // Handle zero case + + uint32_t guess = x; // Initial guess + uint32_t prev_guess = 0; + + while (guess != prev_guess) { // Continue until convergence + prev_guess = guess; + guess = (guess + x / guess) / 2; // Newton-Raphson iteration + } + + //std::cout << "newton sqrt: " << guess << std::endl; + return guess; +} + +void rmsnorm(uint64_t vectorLength, std::vector &srcVector, std::vector &dst) +{ + PimObjId srcObj1 = pimAlloc(PIM_ALLOC_AUTO, vectorLength, PIM_INT32); + if (srcObj1 == -1) + { + std::cout << "Abort" << std::endl; + return; + } + + PimObjId dstObj = pimAllocAssociated(srcObj1, PIM_INT32); + if (dstObj == -1) + { + std::cout << "Abort" << std::endl; + return; + } + + PimStatus status; + + status = pimCopyHostToDevice((void *)srcVector.data(), srcObj1); + if (status != PIM_OK) + { + std::cout << "Abort" << std::endl; + return; + } + + // Square the element of the vector + status = pimMul(srcObj1, srcObj1, dstObj); //TODO: How to take care of overflow? 
+ if (status != PIM_OK) + { + std::cout << "Abort" << std::endl; + return; + } + + + // Sum of the squared elements - reduction + uint32_t sum = 0; + status = pimRedSum(dstObj, static_cast(&sum), 0, vectorLength); + if (status != PIM_OK) + { + std::cout << "Abort" << std::endl; + return; + } + + + auto start_cpu = std::chrono::high_resolution_clock::now(); + // divide to get mean + uint32_t mean = sum/vectorLength; + + // Compute RMS using Newton-Raphson square root + uint32_t rms = newton_sqrt(mean + 1); // +1 to prevent division by zero + //uint32_t rms = 0; + auto stop_cpu = std::chrono::high_resolution_clock::now(); + hostElapsedTime += (stop_cpu - start_cpu); + + // Scale srcVector + status = pimDivScalar(srcObj1, dstObj, rms+1); + if (status != PIM_OK) + { + std::cout << "Abort" << std::endl; + return; + } + + dst.resize(vectorLength); + status = pimCopyDeviceToHost(dstObj, (void *)dst.data()); + if (status != PIM_OK) + { + std::cout << "Abort" << std::endl; + } + pimFree(srcObj1); + pimFree(dstObj); +} + +int main(int argc, char *argv[]) +{ + struct Params params = getInputParams(argc, argv); + std::cout << "Running RMSNORM for vector of size: " << params.vectorLength << std::endl; + + std::vector srcVector (params.vectorLength, 1), resultVector; + + if (params.shouldVerify) { + if (params.inputFile == nullptr) + { + getVector(params.vectorLength, srcVector); + } + else + { + std::cout << "Reading from input file is not implemented yet." << std::endl; + return 1; + } + } + + if (!createDevice(params.configFile)) + { + return 1; + } + + // TODO: Check if vector can fit in one iteration. Otherwise need to run in multiple iteration. 
+ rmsnorm(params.vectorLength, srcVector, resultVector); + + if (params.shouldVerify) + { + bool shouldBreak = false; // shared flag variable + + // verify result + + std::vector result (params.vectorLength, 0); + + //rms norm + uint32_t sum_sq = 0; + + // Compute sum of squares + for (size_t i = 0; i < params.vectorLength; i++) { + sum_sq += (uint32_t)(srcVector[i] * srcVector[i]); // Prevent overflow + } + + // Compute mean squared value + uint32_t mean_sq = sum_sq / params.vectorLength; // Integer division + + // Compute RMS using Newton-Raphson square root + //uint32_t rms = newton_sqrt(mean_sq + 1); // +1 to prevent division by zero + uint32_t rms = sqrt(mean_sq+1); + //std::cout << "sqrt(): " << rms << std::endl; + + // Normalize each element: Y[i] = X[i] / RMS + for (size_t i = 0; i < params.vectorLength; i++) { + result[i] = srcVector[i] / (rms + 1); // Prevent division by zero + } + + for (size_t i = 0; i < params.vectorLength; i++) + { + if (result[i] != resultVector[i]) + { + #pragma omp critical + { + if (!shouldBreak) + { // check the flag again in a critical section + std::cout << "Wrong answer: " << resultVector[i] << " (expected " << result[i] << ")" << std::endl; + shouldBreak = true; // set the flag to true + } + } + } + } + + + if (!shouldBreak) { + std::cout << "\n\nCorrect Answer!!\n\n"; + } + } + + pimShowStats(); + std::cout << "Host elapsed time: " << std::fixed << std::setprecision(3) << hostElapsedTime.count() << " ms." 
<< endl; + + return 0; +} diff --git a/PIMbench/rmsnorm/PIM/run_rmsnorm.sh b/PIMbench/rmsnorm/PIM/run_rmsnorm.sh new file mode 100755 index 00000000..3d3d8b46 --- /dev/null +++ b/PIMbench/rmsnorm/PIM/run_rmsnorm.sh @@ -0,0 +1 @@ +./rmsnorm.out -v t -c ../../../configs/hbm/PIMeval_Bank_Rank1.cfg -l 12000 diff --git a/PIMbench/rmsnorm/README.md b/PIMbench/rmsnorm/README.md new file mode 100644 index 00000000..63c8cc12 --- /dev/null +++ b/PIMbench/rmsnorm/README.md @@ -0,0 +1,95 @@ +# Root Mean Square Normalization (RMSNorm) + +The RMSNorm is a normalization function mostly used in AI models + + +For a detailed description of RMSNorm, you can refer to the [torch.nn.RMSNorm](https://pytorch.org/docs/stable/generated/torch.nn.RMSNorm.html) or the [paper](https://dl.acm.org/doi/pdf/10.5555/3454287.3455397) + +## Directory Structure + +``` +rmsnorm/ +├── PIM/ +│ ├── Makefile +│ ├── rmsnorm.cpp +├── baselines/ +│ ├── CPU/ +│ │ ├── Makefile +│ │ ├── rmsnorm.cpp +│ ├── GPU/ **TODO** +│ │ ├── Makefile +│ │ ├── rmsnorm.cu +├── README.md +├── Makefile +``` + +## Implementation Description + +This repository contains three different implementations of the RMSNORM benchmark: + +1. CPU +2. GPU **TODO** +3. PIM + +### Baseline Implementation + +CPU and GPU have been used as baselines. + +#### CPU + +The CPU variant ... + +#### GPU + +The GPU variant (**TODO** Try torch rmsnorm) + +### PIM Implementation + +The PIM variant is implemented using C++ and three different PIM architectures can be tested with this. + +## Compilation Instructions for Specific Variants + +### CPU Variant + +To compile for the CPU variant, use: + +```bash +cd baselines/CPU +make +``` + +### GPU Variant + +To compile for the GPU variant, use: + +```bash +cd baselines/GPU +make +``` + +*Note that the GPU Makefile currently uses SM_80, which is compatible with the A100. To run it on a different GPU, please manually change this in the makefile. 
+ +### PIM Variant + +To compile for the PIM variant, use: + +```bash +cd PIM +make -j USE_OPENMP=1 +``` + +## Execution Instructions + +### Running the Executable + +After compiling, run each executable with the following command, which uses the default parameters: + +```bash +./rmsnorm.out +``` + +To see help text on all usages and how to modify any of the input parameters, use the following command: + +```bash +./rmsnorm.out -h +``` diff --git a/PIMbench/rmsnorm/baselines/CPU/Makefile b/PIMbench/rmsnorm/baselines/CPU/Makefile new file mode 100644 index 00000000..ac1a058c --- /dev/null +++ b/PIMbench/rmsnorm/baselines/CPU/Makefile @@ -0,0 +1,24 @@ +# Compiler +CXX := g++ + +# Compiler flags +CXXFLAGS := -Wall -Wextra -Werror -std=c++17 -O3 -fopenmp +LDFLAGS = -lopenblas + +# Executable name +EXEC := rmsnorm.out + +# Source files +SRC_FILES := $(wildcard *.cpp) + + +.PHONY: all clean + +all: $(EXEC) + +$(EXEC): $(SRC_FILES) | + $(CXX) $(CXXFLAGS) -o $@ $^ $(LDFLAGS) $(LDFLAGS) + +clean: + rm -rf $(EXEC) + diff --git a/PIMbench/rmsnorm/baselines/CPU/rmsnorm.cpp b/PIMbench/rmsnorm/baselines/CPU/rmsnorm.cpp new file mode 100644 index 00000000..8ddc4e0b --- /dev/null +++ b/PIMbench/rmsnorm/baselines/CPU/rmsnorm.cpp @@ -0,0 +1,112 @@ +/** + * @file rmsnorm.cpp + * @brief RMSNORM. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../../../../util/utilBaselines.h" + +using namespace std; + +// Global Vectors +vector A; +vector B; + +// Params --------------------------------------------------------------------- +typedef struct Params +{ + uint64_t vectorLength; +} Params; + +void usage() +{ + fprintf(stderr, + "\nUsage: ./rmsnorm.out [options]" + "\n" + "\n -l vector size (default=128 elements)" + "\n -v t = verifies PIM output with host output. 
(default=false)" + "\n"); +} + +/** + * @brief Parses command line input parameters + * @param argc Number of command line arguments + * @param argv Array of command line arguments + * @return Parsed parameters + */ +struct Params parseParams(int argc, char **argv) +{ + struct Params p; + p.vectorLength = 128; + + int opt; + while ((opt = getopt(argc, argv, ":l:h:")) >= 0) + { + switch (opt) + { + case 'h': + usage(); + exit(0); + case 'l': + p.vectorLength = stoull(optarg); + break; + default: + cerr << "\nUnrecognized option: " << opt << "\n"; + usage(); + exit(1); + } + } + return p; +} + +void rmsnorm(uint64_t vectorLength, std::vector &srcVector, std::vector &dst) +{ +uint32_t sum_sq = 0; +for (size_t i = 0; i < vectorLength; i++) +{ + sum_sq += (uint32_t)(srcVector[i] * srcVector[i]); // Prevent overflow +} +uint32_t mean_sq = sum_sq / vectorLength; +uint32_t rms = sqrt(mean_sq+1); +for (size_t i = 0; i < vectorLength; i++) +{ + dst[i] = srcVector[i] / (rms + 1); // Prevent division by zero +} +} + +/** + * @brief Main function. + */ +int main(int argc, char **argv) +{ + // Parse input parameters + Params params = parseParams(argc, argv); + uint64_t vectorLength = params.vectorLength; + + // Initialize vectors + getVector(vectorLength, A); + B.resize(vectorLength); + std::cout << "Done initialization." << std::endl; + + auto start = chrono::high_resolution_clock::now(); + + for (int32_t i = 0; i < WARMUP; i++) + { + rmsnorm(vectorLength, A, B); + } + + auto end = chrono::high_resolution_clock::now(); + + chrono::duration elapsedTime = (end - start) / WARMUP; + cout << "Duration: " << fixed << setprecision(3) << elapsedTime.count() << " ms." << endl; + + return 0; +}