diff --git a/PIMbench/lnorm/Makefile b/PIMbench/lnorm/Makefile
new file mode 100644
index 00000000..b3e4ce58
--- /dev/null
+++ b/PIMbench/lnorm/Makefile
@@ -0,0 +1,16 @@
+# Makefile: top-level build driver for the layer normalization (lnorm) benchmark
+# Copyright (c) 2024 University of Virginia
+# This file is licensed under the MIT License.
+# See the LICENSE file in the root of this repository for more details.
+# Goals given on the command line are forwarded to each SUBDIRS entry via $(MAKECMDGOALS).
+SUBDIRS := PIM
+
+.PHONY: debug perf dramsim3_integ clean $(SUBDIRS)
+.DEFAULT_GOAL := perf
+# USE_OPENMP can be overridden on the command line; its value is passed to the sub-make.
+USE_OPENMP ?= 0
+
+debug perf dramsim3_integ clean: $(SUBDIRS)
+
+$(SUBDIRS):
+	$(MAKE) -C $@ $(MAKECMDGOALS) USE_OPENMP=$(USE_OPENMP)
diff --git a/PIMbench/lnorm/PIM/Makefile b/PIMbench/lnorm/PIM/Makefile
new file mode 100644
index 00000000..e637f51d
--- /dev/null
+++ b/PIMbench/lnorm/PIM/Makefile
@@ -0,0 +1,24 @@
+# Makefile: builds the PIM version of the layer normalization (lnorm) benchmark
+# Copyright (c) 2024 University of Virginia
+# This file is licensed under the MIT License.
+# See the LICENSE file in the root of this repository for more details.
+
+PROJ_ROOT = ../../..
+include ${PROJ_ROOT}/Makefile.common
+
+# Pass USE_OPENMP=1 on the make command line to add -fopenmp to CXXFLAGS
+USE_OPENMP ?= 0
+ifeq ($(USE_OPENMP),1)
+	CXXFLAGS += -fopenmp
+endif
+
+EXEC := lnorm.out
+SRC := lnorm.cpp
+
+debug perf dramsim3_integ: $(EXEC)
+
+$(EXEC): $(SRC) $(DEPS)
+	$(CXX) $< $(CXXFLAGS) -o $@
+
+clean:
+	rm -rf $(EXEC) *.dSYM
diff --git a/PIMbench/lnorm/PIM/lnorm.cpp b/PIMbench/lnorm/PIM/lnorm.cpp
new file mode 100644
index 00000000..75b7375f
--- /dev/null
+++ b/PIMbench/lnorm/PIM/lnorm.cpp
@@ -0,0 +1,291 @@
+// Test: C++ version of layer normalization (LNORM)
+// Copyright (c) 2024 University of Virginia
+// This file is licensed under the MIT License.
+// See the LICENSE file in the root of this repository for more details.
+
+#include <iostream>
+#include <vector>
+#include <getopt.h>
+#include <stdint.h>
+#include <iomanip>
+#if defined(_OPENMP)
+#include <omp.h>
+#endif
+
+#include "util.h"
+#include "libpimeval.h"
+#include <chrono>
+#include <cmath>
+// Accumulated host-side time in milliseconds; lnorm() adds to it and main() prints it.
+std::chrono::duration<double, std::milli> hostElapsedTime = std::chrono::duration<double, std::milli>::zero();
+
+
+// Params ---------------------------------------------------------------------
+struct Params
+{
+  uint64_t vectorLength; // element count (-l)
+  char *configFile;      // -c argument, nullptr when not given
+  char *inputFile;       // -i argument, nullptr when not given
+  bool shouldVerify;     // set by -v
+};
+
+// Prints the command-line help text to stderr.
+void usage()
+{
+  fprintf(stderr,
+          "\nUsage:  ./lnorm.out [options]\n"
+          "\n    -h    print this help text and exit"
+          "\n    -l    vectorLength (default=128 elements)"
+          "\n    -c    dramsim config file"
+          "\n    -i    input file containing the source vector (not implemented yet)"
+          "\n    -v    t = verifies PIM output with host output. (default=false)\n");
+}
+
+// Parses command-line options into Params; -h prints usage and exits 0, an invalid option exits 1.
+struct Params getInputParams(int argc, char **argv)
+{
+  struct Params p;
+  p.vectorLength = 128;
+  p.configFile = nullptr;
+  p.inputFile = nullptr;
+  p.shouldVerify = false;
+
+  int opt;
+  while ((opt = getopt(argc, argv, "hl:c:i:v:")) >= 0) // -h takes no argument
+  {
+    switch (opt)
+    {
+    case 'h':
+      usage();
+      exit(0);
+    case 'l':
+      p.vectorLength = strtoull(optarg, NULL, 0);
+      break;
+    case 'c':
+      p.configFile = optarg;
+      break;
+    case 'i':
+      p.inputFile = optarg;
+      break;
+    case 'v':
+      p.shouldVerify = (*optarg == 't') ? true : false;
+      break;
+    default:
+      fprintf(stderr, "\nUnrecognized option!\n");
+      usage();
+      exit(1); // a usage error must not look like success to the calling script
+    }
+  }
+  return p;
+}
+
+// Newton-Raphson integer square root: returns floor(sqrt(x)) for every uint32_t x.
+uint32_t newton_sqrt(uint32_t x) {
+  if (x == 0) return 0;  // Handle zero case (also avoids dividing by zero below)
+
+  uint32_t guess = x;                 // Initial guess, never below the true root
+  uint32_t next = x / 2 + (x & 1u);   // (x + x / x) / 2 computed without overflowing
+
+  // Iterates strictly decrease until the floor root is reached; stopping at the first
+  // non-decrease avoids the endless 1 <-> 2 oscillation that "until unchanged" hits for x = 3.
+  while (next < guess) {
+      guess = next;
+      next = (guess + x / guess) / 2; // Newton-Raphson iteration
+  }
+  return guess;
+}
+
+// Layer normalization on the PIM device, in 32-bit integer arithmetic:
+//   dst[i] = (srcVector[i] - mean) / newton_sqrt(variance + 1)
+// where mean and variance are integer averages truncated toward zero.
+// On a PIM failure "Abort" is printed and every device object allocated so far is freed.
+void lnorm(uint64_t vectorLength, std::vector<int> &srcVector, std::vector<int> &dst)
+{
+  PimObjId srcObj1 = -1, tempObj1 = -1, dstObj = -1;
+
+  // Releases whichever device objects exist; shared by the success and failure paths.
+  auto freeAll = [&]() {
+    if (srcObj1 != -1) pimFree(srcObj1);
+    if (tempObj1 != -1) pimFree(tempObj1);
+    if (dstObj != -1) pimFree(dstObj);
+  };
+  // Reports a failed step and cleans up; returns true when the caller must bail out.
+  auto failed = [&](bool isError) {
+    if (isError)
+    {
+      std::cout << "Abort" << std::endl;
+      freeAll();
+    }
+    return isError;
+  };
+
+  // Allocate the source plus two associated objects (scratch and destination).
+  srcObj1 = pimAlloc(PIM_ALLOC_AUTO, vectorLength, PIM_INT32);
+  if (failed(srcObj1 == -1))
+    return;
+
+  tempObj1 = pimAllocAssociated(srcObj1, PIM_INT32);
+  if (failed(tempObj1 == -1))
+    return;
+
+  dstObj = pimAllocAssociated(srcObj1, PIM_INT32);
+  if (failed(dstObj == -1))
+    return;
+
+  // Upload the input.
+  PimStatus status = pimCopyHostToDevice((void *)srcVector.data(), srcObj1);
+  if (failed(status != PIM_OK))
+    return;
+
+  // Mean: reduce on the device, divide on the host.
+  int32_t sum = 0;
+  status = pimRedSum(srcObj1, static_cast<void*>(&sum), 0, vectorLength);
+  if (failed(status != PIM_OK))
+    return;
+
+  auto start_cpu = std::chrono::high_resolution_clock::now();
+  // Divide as signed 64-bit: "sum / vectorLength" would convert a negative sum to
+  // uint64_t before dividing and yield a meaningless mean.
+  const int64_t count = static_cast<int64_t>(vectorLength);
+  int32_t mean = static_cast<int32_t>(sum / count);
+  std::cout << "mean " << mean << " sum " << sum <<std::endl;
+  auto stop_cpu = std::chrono::high_resolution_clock::now();
+  hostElapsedTime += (stop_cpu - start_cpu);
+
+  // tempObj1 = src - mean
+  status = pimSubScalar(srcObj1, tempObj1, mean);
+  if (failed(status != PIM_OK))
+    return;
+
+  // dstObj = (src - mean)^2
+  status = pimMul(tempObj1, tempObj1, dstObj);
+  if (failed(status != PIM_OK))
+    return;
+
+  // Variance: reduce the squares on the device, divide on the host.
+  int32_t sum2 = 0;
+  status = pimRedSum(dstObj, static_cast<void*>(&sum2), 0, vectorLength);
+  if (failed(status != PIM_OK))
+    return;
+
+  start_cpu = std::chrono::high_resolution_clock::now();
+
+  int32_t variance = static_cast<int32_t>(sum2 / count);
+  int32_t sqrt_var = newton_sqrt(variance + 1); // +1 keeps the divisor non-zero when variance is 0
+  std::cout << "sqrt_var " << sqrt_var << " var " << variance <<std::endl;
+
+  stop_cpu = std::chrono::high_resolution_clock::now();
+  hostElapsedTime += (stop_cpu - start_cpu);
+
+  // Scale by sqrt_var: dstObj = (src - mean) / sqrt_var
+  status = pimDivScalar(tempObj1, dstObj, sqrt_var);
+  if (failed(status != PIM_OK))
+    return;
+
+  // Download the result.
+  dst.resize(vectorLength);
+  status = pimCopyDeviceToHost(dstObj, (void *)dst.data());
+  if (status != PIM_OK)
+  {
+    std::cout << "Abort" << std::endl;
+  }
+  freeAll();
+}
+
+// Entry point: parses options, runs lnorm() on the PIM device and, with -v t, checks it against a host computation.
+int main(int argc, char *argv[])
+{
+  struct Params params = getInputParams(argc, argv);
+  std::cout << "Running LNORM for vector of size: " << params.vectorLength << std::endl;
+
+  std::vector<int> srcVector (params.vectorLength, 1), resultVector;
+
+  if (params.shouldVerify) {
+    if (params.inputFile == nullptr)
+    {
+      getVector(params.vectorLength, srcVector);
+    }
+    else
+    {
+      std::cout << "Reading from input file is not implemented yet." << std::endl;
+      return 1;
+    }
+  }
+
+  if (!createDevice(params.configFile))
+  {
+    return 1;
+  }
+
+  // TODO: Check if vector can fit in one iteration. Otherwise need to run in multiple iteration.
+  lnorm(params.vectorLength, srcVector, resultVector);
+
+  if (params.shouldVerify)
+  {
+    bool shouldBreak = false; // set once a problem has been reported
+
+    // verify result: recompute the normalization on the host with the same integer arithmetic
+
+      std::vector<int> result (params.vectorLength, 0);
+      std::vector<int> src_minus_mean (params.vectorLength, 0);
+      std::vector<int> sq_src_minus_mean (params.vectorLength, 0);
+
+      // Signed element count: int32_t / uint64_t would convert a negative sum to unsigned.
+      const int64_t count = static_cast<int64_t>(params.vectorLength);
+      int32_t sum = 0;
+
+      for (size_t i = 0; i < params.vectorLength; i++) {
+          sum += srcVector[i];
+      }
+
+      int32_t mean = static_cast<int32_t>(sum / count);
+
+      for (size_t i = 0; i < params.vectorLength; i++) {
+        src_minus_mean[i] = srcVector[i] - mean;
+      }
+
+      for (size_t i = 0; i < params.vectorLength; i++) {
+        sq_src_minus_mean[i] = (int32_t)(src_minus_mean[i]*src_minus_mean[i]);
+      }
+
+      int32_t sum2 = 0;
+      for (size_t i = 0; i < params.vectorLength; i++) {
+        sum2 += sq_src_minus_mean[i];
+      }
+
+      int32_t var = static_cast<int32_t>(sum2 / count);
+
+      int32_t sqrt_var = newton_sqrt(var+1);
+      if(sqrt_var==0){
+        sqrt_var = 1;
+      }
+
+      // layer norm
+      for (size_t i = 0; i < params.vectorLength; i++) {
+          result[i] = src_minus_mean[i] / (sqrt_var);  // Prevent division by zero
+      }
+
+    if (resultVector.size() != result.size())
+    { // lnorm() aborted before filling resultVector; indexing it below would be out of bounds
+      std::cout << "Wrong answer: PIM produced no result" << std::endl;
+      shouldBreak = true;
+    }
+    for (size_t i = 0; i < params.vectorLength && !shouldBreak; i++)
+    {
+      if (result[i] != resultVector[i])
+      {
+        std::cout << "Wrong answer: " << resultVector[i] << " (expected " << result[i] << ")" << std::endl;
+        shouldBreak = true; // only the first mismatch is reported
+      }
+    }
+
+    if (!shouldBreak) {
+      std::cout << "\n\nCorrect Answer!!\n\n";
+    }
+  }
+
+  pimShowStats();
+  std::cout << "Host elapsed time: " << std::fixed << std::setprecision(3) << hostElapsedTime.count() << " ms." << std::endl;
+
+  return 0;
+}
diff --git a/PIMbench/lnorm/PIM/run_lnorm.sh b/PIMbench/lnorm/PIM/run_lnorm.sh
new file mode 100755
index 00000000..e7aaba83
--- /dev/null
+++ b/PIMbench/lnorm/PIM/run_lnorm.sh
@@ -0,0 +1 @@
+./lnorm.out -v t -c ../../../configs/hbm/PIMeval_Bank_Rank1.cfg -l 128 # 128-element run with host verification enabled
diff --git a/PIMbench/lnorm/README.md b/PIMbench/lnorm/README.md
new file mode 100644
index 00000000..6c9f4303
--- /dev/null
+++ b/PIMbench/lnorm/README.md
@@ -0,0 +1,95 @@
+# Layer Normalization (LNORM)
+
+Layer normalization (LayerNorm) is a normalization function widely used in AI models.
+
+
+For a detailed description of LayerNorm, you can refer to the [torch.nn.LayerNorm](https://pytorch.org/docs/stable/generated/torch.nn.LayerNorm.html) documentation or the [paper](https://arxiv.org/pdf/1607.06450).
+
+## Directory Structure
+
+```
+lnorm/
+├── PIM/
+│   ├── Makefile
+│   ├── lnorm.cpp
+├── baselines/
+│   ├── CPU/
+│   │   ├── Makefile
+│   │   ├── lnorm.cpp
+│   ├── GPU/ **TODO**
+│   │   ├── Makefile
+│   │   ├── lnorm.cu 
+├── README.md
+├── Makefile
+```
+
+## Implementation Description
+
+This repository contains three different implementations of the LNORM benchmark:
+
+1. CPU
+2. GPU **TODO**
+3. PIM
+
+### Baseline Implementation
+
+CPU and GPU have been used as baselines.
+
+#### CPU
+
+The CPU variant ...
+
+#### GPU
+
+The GPU variant (**TODO** Try torch LayerNorm)
+
+### PIM Implementation
+
+The PIM variant is implemented using C++ and three different PIM architectures can be tested with this.
+
+## Compilation Instructions for Specific Variants
+
+### CPU Variant
+
+To compile for the CPU variant, use:
+
+```bash
+cd baselines/CPU
+make
+```
+
+### GPU Variant 
+
+To compile for the GPU variant, use:
+
+```bash
+cd baselines/GPU
+make
+```
+
+*Note that the GPU Makefile currently uses SM_80, which is compatible with the A100. To run it on a different GPU, please manually change this in the makefile.
+
+### PIM Variant
+
+To compile for the PIM variant, use:
+
+```bash
+cd PIM
+make -j USE_OPENMP=1
+```
+
+## Execution Instructions
+
+### Running the Executable
+
+After compiling, run each executable with the following command, which uses the default parameters:
+
+```bash
+./lnorm.out
+```
+
+To see the help text describing all options and how to modify the input parameters, use the following command:
+
+```bash
+./lnorm.out -h
+```
diff --git a/PIMbench/lnorm/baselines/CPU/Makefile b/PIMbench/lnorm/baselines/CPU/Makefile
new file mode 100644
index 00000000..5a45f922
--- /dev/null
+++ b/PIMbench/lnorm/baselines/CPU/Makefile
@@ -0,0 +1,24 @@
+# Compiler
+CXX := g++
+
+# Compiler flags
+CXXFLAGS := -Wall -Wextra -Werror -std=c++17 -O3 -fopenmp
+LDFLAGS = -lopenblas
+
+# Executable name
+EXEC := lnorm.out
+
+# Source files
+SRC_FILES := $(wildcard *.cpp)
+
+# Targets: "all" (default) builds the executable, "clean" removes it
+.PHONY: all clean
+
+all: $(EXEC)
+
+$(EXEC): $(SRC_FILES)
+	$(CXX) $(CXXFLAGS) -o $@ $^ $(LDFLAGS)
+
+clean:
+	rm -rf $(EXEC)
+	
diff --git a/PIMbench/lnorm/baselines/CPU/lnorm.cpp b/PIMbench/lnorm/baselines/CPU/lnorm.cpp
new file mode 100644
index 00000000..5e1124b7
--- /dev/null
+++ b/PIMbench/lnorm/baselines/CPU/lnorm.cpp
@@ -0,0 +1,135 @@
+/**
+ * @file lnorm.cpp
+ * @brief LNORM.
+ */
+
+#include <iostream>
+#include <vector>
+#include <cstdlib>
+#include <unistd.h>
+#include <getopt.h>
+#include <chrono>
+#include <cblas.h>
+#include <cmath>
+
+#include "../../../../util/utilBaselines.h"
+
+using namespace std;
+
+// Global vectors: main() fills A as the input and passes B to lnorm() as the output
+vector<int> A;
+vector<int> B;
+
+
+// Params ---------------------------------------------------------------------
+struct Params
+{
+  uint64_t vectorLength; // element count (-l)
+};
+
+void usage()
+{
+  fprintf(stderr,
+          "\nUsage:  ./lnorm.out [options]"
+          "\n"
+          "\n    -l    vector size (default=128 elements)"
+          "\n    -h    print this help text and exit"
+          "\n");
+}
+
+/**
+ * @brief Parses command line input parameters
+ * @param argc Number of command line arguments
+ * @param argv Array of command line arguments
+ * @return Parsed parameters; exits with 0 after -h and with 1 on an invalid option
+ */
+struct Params parseParams(int argc, char **argv)
+{
+  struct Params p;
+  p.vectorLength = 128;
+
+  int opt;
+  while ((opt = getopt(argc, argv, ":l:h")) >= 0) // -h takes no argument
+  {
+    switch (opt)
+    {
+    case 'h':
+      usage();
+      exit(0);
+    case 'l':
+      p.vectorLength = stoull(optarg);
+      break;
+    default:
+      cerr << "\nUnrecognized option: " << static_cast<char>(optopt) << "\n"; // optopt holds the offending option character
+      usage();
+      exit(1);
+    }
+  }
+  return p;
+}
+
+// Integer layer normalization on the host: dst[i] = (src[i] - mean) / (floor(sqrt(var + 1)) + 1).
+void lnorm(uint64_t vectorLength, std::vector<int> &srcVector, std::vector<int> &dst)
+{
+  dst.resize(vectorLength); // tolerate callers that did not pre-size the output
+  if (vectorLength == 0) return; // nothing to normalize; also avoids dividing by zero below
+  // Divide by a signed count: int32_t / uint64_t would convert a negative sum to unsigned.
+  const int64_t count = static_cast<int64_t>(vectorLength);
+
+  std::vector<int> src_minus_mean (vectorLength, 0);
+  std::vector<int> sq_src_minus_mean (vectorLength, 0);
+  int32_t sum = 0;
+  for (size_t i = 0; i < vectorLength; i++) {
+      sum += srcVector[i];
+  }
+  int32_t mean = static_cast<int32_t>(sum / count);
+
+  for (size_t i = 0; i < vectorLength; i++) {
+    src_minus_mean[i] = srcVector[i] - mean;
+  }
+
+  for (size_t i = 0; i < vectorLength; i++) {
+    sq_src_minus_mean[i] = (int32_t)(src_minus_mean[i]*src_minus_mean[i]);
+  }
+
+  int32_t sum2 = 0;
+  for (size_t i = 0; i < vectorLength; i++) {
+    sum2 += sq_src_minus_mean[i];
+  }
+  int32_t var = static_cast<int32_t>(sum2 / count);
+  int32_t sqrt_var = sqrt(var+1); // truncated toward zero
+
+  // layer norm. NOTE(review): the PIM version divides by sqrt_var without the extra +1 - confirm which is intended.
+  for (size_t i = 0; i < vectorLength; i++) {
+      dst[i] = src_minus_mean[i] / (sqrt_var + 1);  // Prevent division by zero
+  }
+}
+
+/**
+ * @brief Times WARMUP calls of lnorm() on a generated vector and prints the mean time per call.
+ */
+int main(int argc, char **argv)
+{
+  // Read the vector length from the command line
+  const Params params = parseParams(argc, argv);
+  const uint64_t vectorLength = params.vectorLength;
+
+  // Fill the input and size the output
+  getVector(vectorLength, A);
+  B.resize(vectorLength);
+  std::cout << "Done initialization." << std::endl;
+
+  const auto begin = chrono::high_resolution_clock::now();
+
+  for (int32_t iter = 0; iter < WARMUP; ++iter)
+  {
+    lnorm(vectorLength, A, B);
+  }
+
+  const auto finish = chrono::high_resolution_clock::now();
+
+  const chrono::duration<double, milli> perCall = (finish - begin) / WARMUP;
+  cout << "Duration: " << fixed << setprecision(3) << perCall.count() << " ms." << endl;
+
+  return 0;
+}
diff --git a/PIMbench/rmsnorm/Makefile b/PIMbench/rmsnorm/Makefile
new file mode 100644
index 00000000..b3e4ce58
--- /dev/null
+++ b/PIMbench/rmsnorm/Makefile
@@ -0,0 +1,16 @@
+# Makefile: top-level build driver for the RMS normalization (rmsnorm) benchmark
+# Copyright (c) 2024 University of Virginia
+# This file is licensed under the MIT License.
+# See the LICENSE file in the root of this repository for more details.
+# Goals given on the command line are forwarded to each SUBDIRS entry via $(MAKECMDGOALS).
+SUBDIRS := PIM
+
+.PHONY: debug perf dramsim3_integ clean $(SUBDIRS)
+.DEFAULT_GOAL := perf
+# USE_OPENMP can be overridden on the command line; its value is passed to the sub-make.
+USE_OPENMP ?= 0
+
+debug perf dramsim3_integ clean: $(SUBDIRS)
+
+$(SUBDIRS):
+	$(MAKE) -C $@ $(MAKECMDGOALS) USE_OPENMP=$(USE_OPENMP)
diff --git a/PIMbench/rmsnorm/PIM/Makefile b/PIMbench/rmsnorm/PIM/Makefile
new file mode 100644
index 00000000..53225bf0
--- /dev/null
+++ b/PIMbench/rmsnorm/PIM/Makefile
@@ -0,0 +1,24 @@
+# Makefile: builds the PIM version of the RMS normalization (rmsnorm) benchmark
+# Copyright (c) 2024 University of Virginia
+# This file is licensed under the MIT License.
+# See the LICENSE file in the root of this repository for more details.
+
+PROJ_ROOT = ../../..
+include ${PROJ_ROOT}/Makefile.common
+
+# Pass USE_OPENMP=1 on the make command line to add -fopenmp to CXXFLAGS
+USE_OPENMP ?= 0
+ifeq ($(USE_OPENMP),1)
+	CXXFLAGS += -fopenmp
+endif
+
+EXEC := rmsnorm.out
+SRC := rmsnorm.cpp
+
+debug perf dramsim3_integ: $(EXEC)
+
+$(EXEC): $(SRC) $(DEPS)
+	$(CXX) $< $(CXXFLAGS) -o $@
+
+clean:
+	rm -rf $(EXEC) *.dSYM
diff --git a/PIMbench/rmsnorm/PIM/rmsnorm.cpp b/PIMbench/rmsnorm/PIM/rmsnorm.cpp
new file mode 100644
index 00000000..07a09547
--- /dev/null
+++ b/PIMbench/rmsnorm/PIM/rmsnorm.cpp
@@ -0,0 +1,250 @@
+// Test: C++ version of root mean square normalization (RMSNORM)
+// Copyright (c) 2024 University of Virginia
+// This file is licensed under the MIT License.
+// See the LICENSE file in the root of this repository for more details.
+
+#include <iostream>
+#include <vector>
+#include <getopt.h>
+#include <stdint.h>
+#include <iomanip>
+#if defined(_OPENMP)
+#include <omp.h>
+#endif
+
+#include "util.h"
+#include "libpimeval.h"
+#include <chrono>
+#include <cmath>
+// Accumulated host-side time in milliseconds; rmsnorm() adds to it and main() prints it.
+std::chrono::duration<double, std::milli> hostElapsedTime = std::chrono::duration<double, std::milli>::zero();
+
+// Params ---------------------------------------------------------------------
+struct Params
+{
+  uint64_t vectorLength; // element count (-l)
+  char *configFile;      // -c argument, nullptr when not given
+  char *inputFile;       // -i argument, nullptr when not given
+  bool shouldVerify;     // set by -v
+};
+
+// Prints the command-line help text to stderr.
+void usage()
+{
+  fprintf(stderr,
+          "\nUsage:  ./rmsnorm.out [options]\n"
+          "\n    -h    print this help text and exit"
+          "\n    -l    vectorLength (default=128 elements)"
+          "\n    -c    dramsim config file"
+          "\n    -i    input file containing the source vector (not implemented yet)"
+          "\n    -v    t = verifies PIM output with host output. (default=false)\n");
+}
+
+// Parses command-line options into Params; -h prints usage and exits 0, an invalid option exits 1.
+struct Params getInputParams(int argc, char **argv)
+{
+  struct Params p;
+  p.vectorLength = 128;
+  p.configFile = nullptr;
+  p.inputFile = nullptr;
+  p.shouldVerify = false;
+
+  int opt;
+  while ((opt = getopt(argc, argv, "hl:c:i:v:")) >= 0) // -h takes no argument
+  {
+    switch (opt)
+    {
+    case 'h':
+      usage();
+      exit(0);
+    case 'l':
+      p.vectorLength = strtoull(optarg, NULL, 0);
+      break;
+    case 'c':
+      p.configFile = optarg;
+      break;
+    case 'i':
+      p.inputFile = optarg;
+      break;
+    case 'v':
+      p.shouldVerify = (*optarg == 't') ? true : false;
+      break;
+    default:
+      fprintf(stderr, "\nUnrecognized option!\n");
+      usage();
+      exit(1); // a usage error must not look like success to the calling script
+    }
+  }
+  return p;
+}
+
+// Newton-Raphson integer square root: returns floor(sqrt(x)) for every uint32_t x.
+uint32_t newton_sqrt(uint32_t x) {
+  if (x == 0) return 0;  // Handle zero case (also avoids dividing by zero below)
+
+  uint32_t guess = x;                 // Initial guess, never below the true root
+  uint32_t next = x / 2 + (x & 1u);   // (x + x / x) / 2 computed without overflowing
+
+  // Iterates strictly decrease until the floor root is reached; stopping at the first
+  // non-decrease avoids the endless 1 <-> 2 oscillation that "until unchanged" hits for x = 3.
+  while (next < guess) {
+      guess = next;
+      next = (guess + x / guess) / 2; // Newton-Raphson iteration
+  }
+  return guess;
+}
+
+// RMS normalization on the PIM device, in 32-bit integer arithmetic:
+//   dst[i] = srcVector[i] / (newton_sqrt(mean + 1) + 1)
+// where mean is the truncated integer average of the squared elements.
+// On a PIM failure "Abort" is printed and every device object allocated so far is freed.
+void rmsnorm(uint64_t vectorLength, std::vector<int> &srcVector, std::vector<int> &dst)
+{
+  PimObjId srcObj1 = -1, dstObj = -1;
+
+  // Releases whichever device objects exist; shared by the success and failure paths.
+  auto freeAll = [&]() {
+    if (srcObj1 != -1) pimFree(srcObj1);
+    if (dstObj != -1) pimFree(dstObj);
+  };
+  // Reports a failed step and cleans up; returns true when the caller must bail out.
+  auto failed = [&](bool isError) {
+    if (isError)
+    {
+      std::cout << "Abort" << std::endl;
+      freeAll();
+    }
+    return isError;
+  };
+
+  // Allocate the source and an associated destination object.
+  srcObj1 = pimAlloc(PIM_ALLOC_AUTO, vectorLength, PIM_INT32);
+  if (failed(srcObj1 == -1))
+    return;
+
+  dstObj = pimAllocAssociated(srcObj1, PIM_INT32);
+  if (failed(dstObj == -1))
+    return;
+
+  // Upload the input.
+  PimStatus status = pimCopyHostToDevice((void *)srcVector.data(), srcObj1);
+  if (failed(status != PIM_OK))
+    return;
+
+  // Square the element of the vector
+  status = pimMul(srcObj1, srcObj1, dstObj); //TODO: How to take care of overflow?
+  if (failed(status != PIM_OK))
+    return;
+
+  // Sum of the squared elements - reduction
+  uint32_t sum = 0;
+  status = pimRedSum(dstObj, static_cast<void*>(&sum), 0, vectorLength);
+  if (failed(status != PIM_OK))
+    return;
+
+  auto start_cpu = std::chrono::high_resolution_clock::now();
+  // divide to get mean
+  uint32_t mean = sum/vectorLength;
+
+  // Compute RMS using Newton-Raphson square root
+  uint32_t rms = newton_sqrt(mean + 1); // +1 to prevent division by zero
+  auto stop_cpu = std::chrono::high_resolution_clock::now();
+  hostElapsedTime += (stop_cpu - start_cpu);
+
+  // Scale srcVector
+  status = pimDivScalar(srcObj1, dstObj, rms+1);
+  if (failed(status != PIM_OK))
+    return;
+
+  // Download the result.
+  dst.resize(vectorLength);
+  status = pimCopyDeviceToHost(dstObj, (void *)dst.data());
+  if (status != PIM_OK)
+  {
+    std::cout << "Abort" << std::endl;
+  }
+  freeAll();
+}
+
+// Entry point: parses options, runs rmsnorm() on the PIM device and, with -v t, checks it against a host computation.
+int main(int argc, char *argv[])
+{
+  struct Params params = getInputParams(argc, argv);
+  std::cout << "Running RMSNORM for vector of size: " << params.vectorLength << std::endl;
+
+  std::vector<int> srcVector (params.vectorLength, 1), resultVector;
+
+  if (params.shouldVerify) {
+    if (params.inputFile == nullptr)
+    {
+      getVector(params.vectorLength, srcVector);
+    }
+    else
+    {
+      std::cout << "Reading from input file is not implemented yet." << std::endl;
+      return 1;
+    }
+  }
+
+  if (!createDevice(params.configFile))
+  {
+    return 1;
+  }
+
+  // TODO: Check if vector can fit in one iteration. Otherwise need to run in multiple iteration.
+  rmsnorm(params.vectorLength, srcVector, resultVector);
+
+  if (params.shouldVerify)
+  {
+    bool shouldBreak = false; // set once a problem has been reported
+
+    // verify result: recompute the normalization on the host
+
+      std::vector<int> result (params.vectorLength, 0);
+
+      //rms norm
+      uint32_t sum_sq = 0;
+
+      // Compute sum of squares; the operands are made unsigned first so the product wraps
+      // instead of overflowing a signed int.
+      for (size_t i = 0; i < params.vectorLength; i++) {
+          sum_sq += static_cast<uint32_t>(srcVector[i]) * static_cast<uint32_t>(srcVector[i]);
+      }
+
+      // Compute mean squared value
+      uint32_t mean_sq = sum_sq / params.vectorLength; // Integer division
+
+      // Compute RMS with the library square root rather than newton_sqrt()
+      uint32_t rms = sqrt(mean_sq+1); // +1 to prevent division by zero
+      // Signed divisor: dividing an int by a uint32_t would convert negative elements to unsigned.
+      const int divisor = static_cast<int>(rms + 1);
+
+      // Normalize each element: Y[i] = X[i] / (RMS + 1)
+      for (size_t i = 0; i < params.vectorLength; i++) {
+          result[i] = srcVector[i] / divisor;
+      }
+
+    if (resultVector.size() != result.size())
+    { // rmsnorm() aborted before filling resultVector; indexing it below would be out of bounds
+      std::cout << "Wrong answer: PIM produced no result" << std::endl;
+      shouldBreak = true;
+    }
+    for (size_t i = 0; i < params.vectorLength && !shouldBreak; i++)
+    {
+      if (result[i] != resultVector[i])
+      {
+        std::cout << "Wrong answer: " << resultVector[i] << " (expected " << result[i] << ")" << std::endl;
+        shouldBreak = true; // only the first mismatch is reported
+      }
+    }
+
+    if (!shouldBreak) {
+      std::cout << "\n\nCorrect Answer!!\n\n";
+    }
+  }
+
+  pimShowStats();
+  std::cout << "Host elapsed time: " << std::fixed << std::setprecision(3) << hostElapsedTime.count() << " ms." << std::endl;
+
+  return 0;
+}
diff --git a/PIMbench/rmsnorm/PIM/run_rmsnorm.sh b/PIMbench/rmsnorm/PIM/run_rmsnorm.sh
new file mode 100755
index 00000000..3d3d8b46
--- /dev/null
+++ b/PIMbench/rmsnorm/PIM/run_rmsnorm.sh
@@ -0,0 +1 @@
+./rmsnorm.out -v t -c ../../../configs/hbm/PIMeval_Bank_Rank1.cfg -l 12000 # 12000-element run with host verification enabled
diff --git a/PIMbench/rmsnorm/README.md b/PIMbench/rmsnorm/README.md
new file mode 100644
index 00000000..63c8cc12
--- /dev/null
+++ b/PIMbench/rmsnorm/README.md
@@ -0,0 +1,95 @@
+# Root Mean Square Normalization (RMSNorm)
+
+Root mean square normalization (RMSNorm) is a normalization function widely used in AI models.
+
+
+For a detailed description of RMSNorm, you can refer to the [torch.nn.RMSNorm](https://pytorch.org/docs/stable/generated/torch.nn.RMSNorm.html) or the [paper](https://dl.acm.org/doi/pdf/10.5555/3454287.3455397)
+
+## Directory Structure
+
+```
+rmsnorm/
+├── PIM/
+│   ├── Makefile
+│   ├── rmsnorm.cpp
+├── baselines/
+│   ├── CPU/
+│   │   ├── Makefile
+│   │   ├── rmsnorm.cpp
+│   ├── GPU/ **TODO**
+│   │   ├── Makefile
+│   │   ├── rmsnorm.cu 
+├── README.md
+├── Makefile
+```
+
+## Implementation Description
+
+This repository contains three different implementations of the RMSNORM benchmark:
+
+1. CPU
+2. GPU **TODO**
+3. PIM
+
+### Baseline Implementation
+
+CPU and GPU have been used as baselines.
+
+#### CPU
+
+The CPU variant ...
+
+#### GPU
+
+The GPU variant (**TODO** Try torch rmsnorm)
+
+### PIM Implementation
+
+The PIM variant is implemented using C++ and three different PIM architectures can be tested with this.
+
+## Compilation Instructions for Specific Variants
+
+### CPU Variant
+
+To compile for the CPU variant, use:
+
+```bash
+cd baselines/CPU
+make
+```
+
+### GPU Variant 
+
+To compile for the GPU variant, use:
+
+```bash
+cd baselines/GPU
+make
+```
+
+*Note that the GPU Makefile currently uses SM_80, which is compatible with the A100. To run it on a different GPU, please manually change this in the makefile.
+
+### PIM Variant
+
+To compile for the PIM variant, use:
+
+```bash
+cd PIM
+make -j USE_OPENMP=1
+```
+
+## Execution Instructions
+
+### Running the Executable
+
+After compiling, run each executable with the following command, which uses the default parameters:
+
+```bash
+./rmsnorm.out
+```
+
+To see the help text describing all options and how to modify the input parameters, use the following command:
+
+```bash
+./rmsnorm.out -h
+```
diff --git a/PIMbench/rmsnorm/baselines/CPU/Makefile b/PIMbench/rmsnorm/baselines/CPU/Makefile
new file mode 100644
index 00000000..ac1a058c
--- /dev/null
+++ b/PIMbench/rmsnorm/baselines/CPU/Makefile
@@ -0,0 +1,24 @@
+# Compiler
+CXX := g++
+
+# Compiler flags
+CXXFLAGS := -Wall -Wextra -Werror -std=c++17 -O3 -fopenmp
+LDFLAGS = -lopenblas
+
+# Executable name
+EXEC := rmsnorm.out
+
+# Source files
+SRC_FILES := $(wildcard *.cpp)
+
+# Targets: "all" (default) builds the executable, "clean" removes it
+.PHONY: all clean
+
+all: $(EXEC)
+
+$(EXEC): $(SRC_FILES)
+	$(CXX) $(CXXFLAGS) -o $@ $^ $(LDFLAGS)
+
+clean:
+	rm -rf $(EXEC)
+	
diff --git a/PIMbench/rmsnorm/baselines/CPU/rmsnorm.cpp b/PIMbench/rmsnorm/baselines/CPU/rmsnorm.cpp
new file mode 100644
index 00000000..8ddc4e0b
--- /dev/null
+++ b/PIMbench/rmsnorm/baselines/CPU/rmsnorm.cpp
@@ -0,0 +1,112 @@
+/**
+ * @file rmsnorm.cpp
+ * @brief RMSNORM.
+ */
+
+#include <iostream>
+#include <vector>
+#include <cstdlib>
+#include <unistd.h>
+#include <getopt.h>
+#include <chrono>
+#include <cblas.h>
+#include <cmath>
+
+#include "../../../../util/utilBaselines.h"
+
+using namespace std;
+
+// Global vectors: main() fills A as the input and passes B to rmsnorm() as the output
+vector<int> A;
+vector<int> B;
+
+// Params ---------------------------------------------------------------------
+struct Params
+{
+  uint64_t vectorLength; // element count (-l)
+};
+
+void usage()
+{
+  fprintf(stderr,
+          "\nUsage:  ./rmsnorm.out [options]"
+          "\n"
+          "\n    -l    vector size (default=128 elements)"
+          "\n    -h    print this help text and exit"
+          "\n");
+}
+
+/**
+ * @brief Parses command line input parameters
+ * @param argc Number of command line arguments
+ * @param argv Array of command line arguments
+ * @return Parsed parameters; exits with 0 after -h and with 1 on an invalid option
+ */
+struct Params parseParams(int argc, char **argv)
+{
+  struct Params p;
+  p.vectorLength = 128;
+
+  int opt;
+  while ((opt = getopt(argc, argv, ":l:h")) >= 0) // -h takes no argument
+  {
+    switch (opt)
+    {
+    case 'h':
+      usage();
+      exit(0);
+    case 'l':
+      p.vectorLength = stoull(optarg);
+      break;
+    default:
+      cerr << "\nUnrecognized option: " << static_cast<char>(optopt) << "\n"; // optopt holds the offending option character
+      usage();
+      exit(1);
+    }
+  }
+  return p;
+}
+
+// Integer RMS normalization on the host: dst[i] = src[i] / (floor(sqrt(mean(src^2) + 1)) + 1).
+void rmsnorm(uint64_t vectorLength, std::vector<int> &srcVector, std::vector<int> &dst)
+{
+  dst.resize(vectorLength); // tolerate callers that did not pre-size the output
+  if (vectorLength == 0) return; // nothing to normalize; also avoids dividing by zero below
+  uint32_t sum_sq = 0;
+  for (size_t i = 0; i < vectorLength; i++) {
+    sum_sq += static_cast<uint32_t>(srcVector[i]) * static_cast<uint32_t>(srcVector[i]); // wraps instead of signed overflow
+  }
+  const int32_t divisor = static_cast<int32_t>(sqrt(sum_sq / vectorLength + 1)) + 1; // signed, so negative elements divide correctly
+  for (size_t i = 0; i < vectorLength; i++) {
+    dst[i] = srcVector[i] / divisor;
+  }
+}
+
+/**
+ * @brief Times WARMUP calls of rmsnorm() on a generated vector and prints the mean time per call.
+ */
+int main(int argc, char **argv)
+{
+  // Read the vector length from the command line
+  const Params params = parseParams(argc, argv);
+  const uint64_t vectorLength = params.vectorLength;
+
+  // Fill the input and size the output
+  getVector(vectorLength, A);
+  B.resize(vectorLength);
+  std::cout << "Done initialization." << std::endl;
+
+  const auto begin = chrono::high_resolution_clock::now();
+
+  for (int32_t iter = 0; iter < WARMUP; ++iter)
+  {
+    rmsnorm(vectorLength, A, B);
+  }
+
+  const auto finish = chrono::high_resolution_clock::now();
+
+  const chrono::duration<double, milli> perCall = (finish - begin) / WARMUP;
+  cout << "Duration: " << fixed << setprecision(3) << perCall.count() << " ms." << endl;
+
+  return 0;
+}