diff --git a/misc-bench/Makefile b/misc-bench/Makefile index bc6fc42b..798a5f9d 100644 --- a/misc-bench/Makefile +++ b/misc-bench/Makefile @@ -1,8 +1,42 @@ +<<<<<<< HEAD # Makefile for PIMeval Simulator - Applications +======= +<<<<<<< HEAD +# Makefile for PIMeval / PIMbench Framework +======= +# Makefile for PIMeval Simulator - Applications +>>>>>>> 556bf46 (host_PIM_Prefixsum) +>>>>>>> 14b1d2b (host_PIM_Prefixsum) # Copyright (c) 2024 University of Virginia # This file is licensed under the MIT License. # See the LICENSE file in the root of this repository for more details. +<<<<<<< HEAD +SUBDIRS := $(wildcard */.) +======= +<<<<<<< HEAD +LIBDIR := libpimeval +BITSERIALDIR := bit-serial +APPDIR := PIMbench +TESTDIR := misc-bench tests +ALLDIRS := $(LIBDIR) $(BITSERIALDIR) $(APPDIR) $(TESTDIR) +>>>>>>> 14b1d2b (host_PIM_Prefixsum) + +.PHONY: debug perf dramsim3_integ clean $(SUBDIRS) +.DEFAULT_GOAL := perf + +USE_OPENMP ?= 0 + +COMPILE_WITH_JPEG ?= 0 + +debug: $(SUBDIRS) + @echo "INFO: apps target = debug" + +<<<<<<< HEAD +======= +$(BITSERIALDIR) $(APPDIR) $(TESTDIR): $(DEP_LIBPIMEVAL) + $(MAKE) -C $@ $(MAKECMDGOALS) PIM_SIM_TARGET=$(PIM_SIM_TARGET) USE_OPENMP=$(USE_OPENMP) COMPILE_WITH_JPEG=$(COMPILE_WITH_JPEG) +======= SUBDIRS := $(wildcard */.) .PHONY: debug perf dramsim3_integ clean $(SUBDIRS) @@ -15,6 +49,7 @@ COMPILE_WITH_JPEG ?= 0 debug: $(SUBDIRS) @echo "INFO: apps target = debug" +>>>>>>> 14b1d2b (host_PIM_Prefixsum) perf: $(SUBDIRS) @echo "INFO: apps target = perf" @@ -25,4 +60,8 @@ clean: $(SUBDIRS) $(SUBDIRS): $(MAKE) -C $@ $(MAKECMDGOALS) USE_OPENMP=$(USE_OPENMP) COMPILE_WITH_JPEG=$(COMPILE_WITH_JPEG) +<<<<<<< HEAD +======= +>>>>>>> 556bf46 (host_PIM_Prefixsum) +>>>>>>> 14b1d2b (host_PIM_Prefixsum) diff --git a/misc-bench/cpp-prefix-sum/prefix-sum.cpp b/misc-bench/cpp-prefix-sum/prefix-sum.cpp index 942c247e..be5ecbc9 100644 --- a/misc-bench/cpp-prefix-sum/prefix-sum.cpp +++ b/misc-bench/cpp-prefix-sum/prefix-sum.cpp @@ -7,11 +7,15 @@ #include #include #include -#include "util.h" +#include "../../util/util.h" #include "libpimeval.h" #include #include #include +#include + +std::chrono::duration hostElapsedTime = std::chrono::duration::zero(); + #if defined(_OPENMP) #include @@ -77,138 +81,274 @@ struct Params getInputParams(int argc, char **argv) return p; } -void prefixSum(vector &input, vector &deviceoutput, uint64_t len) +void prefixSum(vector &right, vector &left,uint64_t len) { - std::vector temp(len); - std::vector acc(len); - - PimObjId inputObj = pimAlloc(PIM_ALLOC_AUTO, len, PIM_INT32); - if (inputObj == -1) + + PimObjId rightObj = pimAlloc(PIM_ALLOC_AUTO, len, PIM_INT32); + if (rightObj == -1) { std::cerr << "Abort: Failed to allocate memory on PIM." << std::endl; return; } - - PimStatus status = pimCopyHostToDevice((void *)input.data(), inputObj); + PimStatus status = pimCopyHostToDevice((void *)right.data(), rightObj); if (status != PIM_OK) { std::cerr << "Abort: Failed to copy data to PIM." << std::endl; return; } - PimObjId tempObj = pimAllocAssociated(inputObj, PIM_INT32); - if (tempObj == -1) + + + PimObjId leftObj = pimAllocAssociated(rightObj , PIM_INT32); + if (leftObj == -1) { std::cerr << "Abort: Failed to allocate memory on PIM." << std::endl; return; } - status = pimCopyHostToDevice((void *)input.data(), tempObj); + status = pimCopyHostToDevice((void *)left.data(), leftObj); if (status != PIM_OK) { std::cerr << "Abort: Failed to copy data to PIM." << std::endl; + } + + + //PIM Add + status = pimAdd(rightObj, leftObj, rightObj); + if (status != PIM_OK) + { + std::cerr << "Abort: Failed to perform PIM addition." << std::endl; return; } - PimObjId accObj = pimAllocAssociated(inputObj, PIM_INT32); - if (accObj == -1) + //Copy results back to Host + status = pimCopyDeviceToHost(rightObj, (void *)right.data()); + if (status != PIM_OK) { - std::cerr << "Abort: Failed to allocate memory on PIM." << std::endl; + std::cerr << "Abort: Failed to copy prefix sum result from PIM." << std::endl; return; } - status = pimCopyHostToDevice((void *)input.data(), accObj); + // Clean up PIM objects + pimFree(rightObj); + pimFree(leftObj); +} + + +void downsweep(vector &left, vector &right, uint64_t len) +{ + PimObjId rightObj = pimAlloc(PIM_ALLOC_AUTO, len, PIM_INT32); + if (rightObj == -1) + { + std::cerr << "Abort: Failed to allocate memory on PIM." << std::endl; + return; + } + PimStatus status = pimCopyHostToDevice((void *)right.data(), rightObj); if (status != PIM_OK) { std::cerr << "Abort: Failed to copy data to PIM." << std::endl; return; } - PimObjId outputObj = pimAllocAssociated(inputObj, PIM_INT32); - if (outputObj == -1) + PimObjId leftObj = pimAllocAssociated(rightObj, PIM_INT32); + if (rightObj == -1) { std::cerr << "Abort: Failed to allocate memory on PIM." << std::endl; return; } - - status = pimCopyHostToDevice((void *)deviceoutput.data(), outputObj); + status = pimCopyHostToDevice((void *)left.data(), leftObj); if (status != PIM_OK) { std::cerr << "Abort: Failed to copy data to PIM." << std::endl; - return; - } + } - while (len > 0) + //PIM Add + status = pimAdd(rightObj, leftObj, rightObj); + if (status != PIM_OK) { - pimShiftElementsRight(tempObj); - status = pimAdd(tempObj, accObj, accObj); - if (status != PIM_OK) - { - std::cerr << "Abort: Failed to perform PIM addition." << std::endl; - return; - } - len--; + std::cerr << "Abort: Failed to perform PIM addition." << std::endl; + return; } - status = pimCopyDeviceToHost(accObj, (void *)deviceoutput.data()); + //Copy results back to Host + status = pimCopyDeviceToHost(rightObj, (void *)right.data()); if (status != PIM_OK) { std::cerr << "Abort: Failed to copy prefix sum result from PIM." << std::endl; return; - } + } - // Clean up PIM objects - pimFree(accObj); - pimFree(tempObj); - pimFree(inputObj); - pimFree(outputObj); + //Clean up PIM objects + pimFree(rightObj); + pimFree(leftObj); } -int main(int argc, char *argv[]) -{ - struct Params params = getInputParams(argc, argv); - vector input; - if (params.inputFile == nullptr) - { - getVector(params.vectorLength, input); - } - else - { - std::cout << "Reading from input file is not implemented yet." << std::endl; - return 1; - } +bool isPowerofTwo(int n) { + if (n <= 0) + return false; - uint64_t len = input.size(); - vector deviceoutput; - vector hostoutput(len); + int logValue = log2(n); + return pow(2, logValue) == n; +} - for (uint64_t i = 0; i < input.size(); i++) - { - deviceoutput.push_back(0); - } +int nextPowerOfTwo(int n) { + int power = 1; + while (power < n) { + power *= 2; + } + return power; +} - hostoutput[0] = input[0]; - for (uint64_t i = 0; i < input.size(); i++) - { - hostoutput[i + 1] = hostoutput[i] + input[i + 1]; - } +int main(int argc, char *argv[]) { + struct Params params = getInputParams(argc, argv); + std::vector input; - if (!createDevice(params.configFile)) - return 1; - prefixSum(input, deviceoutput, len); + if (params.inputFile == nullptr) { + getVector(params.vectorLength, input); + } else { + std::cout << "Reading from input file is not implemented yet." << std::endl; + return 1; + } - // Verification of Results hostresults vs deviceresults - if (params.shouldVerify) - { + int len = input.size(); + if (!isPowerofTwo(len)) { + int result = nextPowerOfTwo(len); + int padding = result + len; + input.resize(padding, 0); + len = input.size(); + } else { + std::cout << "Input size is already a power of two. " << std::endl; + } + + std::vector host_device_merged(len); + std::vector hostoutput(len); + + auto start_cpu = std::chrono::high_resolution_clock::now(); + hostoutput[0] = input[0]; + for (uint64_t i = 0; i < input.size(); i++) { + hostoutput[i] = input[i]+ hostoutput[i-1]; + host_device_merged[i]=input[i]; + } + + int max = 0; + int it = 0; + std::vector right(len); + std::vector left(len); + size_t iterations = input.size(); + +// UpSweep +while (iterations > 1) { + size_t num_right = (iterations + 1) / 2; + size_t num_left = iterations / 2; + int position = std::pow(2, it); + + right.resize(num_right); + left.resize(num_left); + + size_t right_idx = 0, left_idx = 0; + + for (size_t i = 0; i < right.size(); i++) { + + size_t running_indexex_right = position * (2 * i) + (position - 1); + size_t running_indexex_left = position * (2 * i + 1) + (position - 1); + + right[right_idx++] = input[running_indexex_right]; + left[left_idx++] = input[running_indexex_left]; + } + + + auto stop_cpu = std::chrono::high_resolution_clock::now(); + hostElapsedTime += (stop_cpu - start_cpu); + + if (!createDevice(params.configFile)) + return 1; + + prefixSum(right, left, right.size()); + + auto start_cpu2 = std::chrono::high_resolution_clock::now(); + it++; + int running_index = std::pow(2, it); + iterations = right.size(); + + for (uint64_t i = 0; i < right.size(); ++i) { + int running_indexex = running_index * i + (running_index - 1); + input[running_indexex] = right[i]; + } + + max = running_index; + + auto stop_cpu2 = std::chrono::high_resolution_clock::now(); + hostElapsedTime += (stop_cpu2 - start_cpu2); +} + +auto start_cpu3 = std::chrono::high_resolution_clock::now(); + +// DownSweep +input[max - 1] = input[(max / 2) - 1]; // Clear last element +input[(max / 2) - 1] = 0; +max = static_cast(std::log2(max)); // eliminate the looping for first two steps +max -= 2; + +while (max >= 0) { + int position = std::pow(2, max); + int val = 0; + size_t partitions = 0; + + for (uint64_t i = position - 1; i < input.size(); i += position) + partitions++; + size_t num_right = (partitions + 1) / 2; + size_t num_left = partitions / 2; + + right.resize(num_right); + left.resize(num_left); + size_t right_idx = 0, left_idx = 0; + + for (uint64_t i = position - 1; i < input.size(); i+= position) { + + if (val % 2 == 0) + right[right_idx++] = input[i]; + else + left[left_idx++] = input[i]; + val++; + } + + auto stop_cpu3 = std::chrono::high_resolution_clock::now(); + hostElapsedTime += (stop_cpu3 - start_cpu3); + + // PIM kernel + downsweep(left, right, right.size()); + + for (size_t i = 0; i < right.size(); i++) { + + size_t running_indexex_left = position * (2 * i) + (position - 1); + size_t running_indexex_right = position * (2 * i + 1) + (position - 1); + + if (i < left.size()) + input[running_indexex_left] = left[i]; + input[running_indexex_right] = right[i]; + } + + max--; +} + +for (uint64_t i = 0; i < host_device_merged.size(); i++) { // Merge results + host_device_merged[i] += input[i]; +} + +//Verification of Results hostresults vs deviceresults +if (params.shouldVerify) +{ // verify result #pragma omp parallel for for (uint64_t i = 0; i < len; ++i) { - if (hostoutput[i] != deviceoutput[i]) + if (hostoutput[i] != host_device_merged[i]) { - std::cout << "Wrong answer for Prefixsum: " << hostoutput[i] << " != " << deviceoutput[i] << std::endl; + std::cout << "Wrong answer for Prefixsum: " << hostoutput[i] << " != " << host_device_merged[i] << std::endl; } } - } +} + +pimShowStats(); +cout << "Host elapsed time: " << std::fixed << std::setprecision(3) << hostElapsedTime.count() << " ms." << endl; - pimShowStats(); return 0; } diff --git a/misc-bench/cpp-prefix-sum/slurm.sh b/misc-bench/cpp-prefix-sum/slurm.sh new file mode 100644 index 00000000..ef7a55e3 --- /dev/null +++ b/misc-bench/cpp-prefix-sum/slurm.sh @@ -0,0 +1,16 @@ +#!/bin/bash +#SBATCH -n 1 +#SBATCH -t 1-00:00:00 +#SBATCH -p cpu +#SBATCH --job-name=lr_BitSerial +#SBATCH --mem=16GB +#SBATCH --cpus-per-task=12 +#SBATCH --output=/u/bg9qq/beenishPIM/PIMeval-PIMbench/misc-bench/cpp-prefix-sum/out/out.txt +#SBATCH --error=/u/bg9qq/beenishPIM/PIMeval-PIMbench/misc-bench/cpp-prefix-sum/out/error.txt + + +./prefix-sum.out -l 16 -c ../../configs/taco/PIMeval_BitSerial_Rank16.cfg > ./out/BitSerial_Rank_16.txt +#./prefix-sum.out -l 8192 -c ../../configs/taco/PIMeval_Fulcrum_Rank16.cfg > ./out/PIMeval_Fulcrum_Rank16.txt + + +#./prefix-sum.out -l 10 -c ../../configs/taco/PIMeval_Bank_Rank16.cfg > ./out/PIMeval_Bank_Rank16.txt \ No newline at end of file