From 799f8a94f4be4c8112135adb22524cec911f8dd9 Mon Sep 17 00:00:00 2001 From: Beenish Gul Date: Sun, 18 May 2025 16:08:16 -0400 Subject: [PATCH] updated prefixsum --- misc-bench/cpp-prefix-sum/prefix-sum.cpp | 263 +++++++++++++++++------ 1 file changed, 196 insertions(+), 67 deletions(-) diff --git a/misc-bench/cpp-prefix-sum/prefix-sum.cpp b/misc-bench/cpp-prefix-sum/prefix-sum.cpp index 942c247e..54e89d19 100644 --- a/misc-bench/cpp-prefix-sum/prefix-sum.cpp +++ b/misc-bench/cpp-prefix-sum/prefix-sum.cpp @@ -7,11 +7,15 @@ #include #include #include -#include "util.h" +#include "../../util/util.h" #include "libpimeval.h" #include #include #include +#include + +std::chrono::duration hostElapsedTime = std::chrono::duration::zero(); + #if defined(_OPENMP) #include @@ -77,138 +81,263 @@ struct Params getInputParams(int argc, char **argv) return p; } -void prefixSum(vector &input, vector &deviceoutput, uint64_t len) +void prefixSum(vector &even, vector &odd,vector &deviceoutput, uint64_t len) { - std::vector temp(len); - std::vector acc(len); - - PimObjId inputObj = pimAlloc(PIM_ALLOC_AUTO, len, PIM_INT32); - if (inputObj == -1) + + PimObjId evenObj = pimAlloc(PIM_ALLOC_AUTO, len, PIM_INT32); + if (evenObj == -1) { std::cerr << "Abort: Failed to allocate memory on PIM." << std::endl; return; } - PimStatus status = pimCopyHostToDevice((void *)input.data(), inputObj); + PimStatus status = pimCopyHostToDevice((void *)even.data(), evenObj); if (status != PIM_OK) { std::cerr << "Abort: Failed to copy data to PIM." << std::endl; return; } - PimObjId tempObj = pimAllocAssociated(inputObj, PIM_INT32); - if (tempObj == -1) + + PimObjId oddObj = pimAllocAssociated(evenObj , PIM_INT32); + if (oddObj == -1) { std::cerr << "Abort: Failed to allocate memory on PIM." << std::endl; return; } - status = pimCopyHostToDevice((void *)input.data(), tempObj); + + status = pimCopyHostToDevice((void *)odd.data(), oddObj); if (status != PIM_OK) { std::cerr << "Abort: Failed to copy data to PIM." << std::endl; - return; - } + } - PimObjId accObj = pimAllocAssociated(inputObj, PIM_INT32); - if (accObj == -1) + PimObjId outObj = pimAllocAssociated(evenObj, PIM_INT32); + if (outObj == -1) { std::cerr << "Abort: Failed to allocate memory on PIM." << std::endl; return; } - status = pimCopyHostToDevice((void *)input.data(), accObj); + status = pimCopyHostToDevice((void *)deviceoutput.data(), outObj); if (status != PIM_OK) { std::cerr << "Abort: Failed to copy data to PIM." << std::endl; + } + + //PIM Add + status = pimAdd(evenObj, oddObj, outObj); + if (status != PIM_OK) + { + std::cerr << "Abort: Failed to perform PIM addition." << std::endl; return; } + //Copy results back to Host + status = pimCopyDeviceToHost(outObj, (void *)deviceoutput.data()); + if (status != PIM_OK) + { + std::cerr << "Abort: Failed to copy prefix sum result from PIM." << std::endl; + return; + } + + // Clean up PIM objects + pimFree(evenObj); + pimFree(oddObj); + pimFree(outObj); +} + - PimObjId outputObj = pimAllocAssociated(inputObj, PIM_INT32); - if (outputObj == -1) +void downsweep(vector &odd2, vector &even2, uint64_t len) +{ + + PimObjId d_evenObj = pimAlloc(PIM_ALLOC_AUTO, len, PIM_INT32); + if (d_evenObj == -1) { std::cerr << "Abort: Failed to allocate memory on PIM." << std::endl; return; } - status = pimCopyHostToDevice((void *)deviceoutput.data(), outputObj); + PimStatus status = pimCopyHostToDevice((void *)even2.data(), d_evenObj); if (status != PIM_OK) { std::cerr << "Abort: Failed to copy data to PIM." << std::endl; return; } - while (len > 0) + PimObjId d_oddObj = pimAllocAssociated(d_evenObj, PIM_INT32); + if (d_evenObj == -1) { - pimShiftElementsRight(tempObj); - status = pimAdd(tempObj, accObj, accObj); - if (status != PIM_OK) - { - std::cerr << "Abort: Failed to perform PIM addition." << std::endl; - return; - } - len--; + std::cerr << "Abort: Failed to allocate memory on PIM." << std::endl; + return; } - status = pimCopyDeviceToHost(accObj, (void *)deviceoutput.data()); + status = pimCopyHostToDevice((void *)odd2.data(), d_oddObj); + if (status != PIM_OK) + { + std::cerr << "Abort: Failed to copy data to PIM." << std::endl; + } + + //PIM Add + status = pimAdd(d_evenObj, d_oddObj, d_evenObj); + if (status != PIM_OK) + { + std::cerr << "Abort: Failed to perform PIM addition." << std::endl; + return; + } + + //Copy results back to Host + status = pimCopyDeviceToHost(d_evenObj, (void *)even2.data()); if (status != PIM_OK) { std::cerr << "Abort: Failed to copy prefix sum result from PIM." << std::endl; return; } - + // Clean up PIM objects - pimFree(accObj); - pimFree(tempObj); - pimFree(inputObj); - pimFree(outputObj); + pimFree(d_evenObj); + pimFree(d_oddObj); } + int main(int argc, char *argv[]) { - struct Params params = getInputParams(argc, argv); - vector input; - if (params.inputFile == nullptr) - { - getVector(params.vectorLength, input); - } - else - { - std::cout << "Reading from input file is not implemented yet." << std::endl; - return 1; - } + struct Params params = getInputParams(argc, argv); + std::vector input; - uint64_t len = input.size(); - vector deviceoutput; - vector hostoutput(len); + if (params.inputFile == nullptr) { + getVector(params.vectorLength, input); + } else { + std::cout << "Reading from input file is not implemented yet." << std::endl; + return 1; + } - for (uint64_t i = 0; i < input.size(); i++) - { - deviceoutput.push_back(0); - } + uint64_t len = input.size(); + std::vector deviceoutput; + std::vector hostoutput(len); + std::vector intermeadiate_results; + std::vector host_device_merged; - hostoutput[0] = input[0]; - for (uint64_t i = 0; i < input.size(); i++) - { - hostoutput[i + 1] = hostoutput[i] + input[i + 1]; - } + auto start_cpu = std::chrono::high_resolution_clock::now(); - if (!createDevice(params.configFile)) - return 1; - prefixSum(input, deviceoutput, len); + for (uint64_t i = 0; i < input.size(); i++) { + deviceoutput.push_back(0); + } - // Verification of Results hostresults vs deviceresults - if (params.shouldVerify) - { + hostoutput[0] = input[0]; + for (uint64_t i = 0; i < input.size(); i++) { + hostoutput[i + 1] = hostoutput[i] + input[i + 1]; + intermeadiate_results.push_back(input[i]); + host_device_merged.push_back(input[i]); + } + + int max = 0; + int it = 0; + + // UpSweep + while (intermeadiate_results.size() > 1) { + std::vector even, odd; + + for (uint64_t i = 0; i < intermeadiate_results.size(); ++i) { + if (i % 2 == 0) + even.push_back(intermeadiate_results[i]); + else + odd.push_back(intermeadiate_results[i]); + } + + size_t maxSize = std::max(even.size(), odd.size()); + even.resize(maxSize, 0); + odd.resize(maxSize, 0); + + + auto stop_cpu = std::chrono::high_resolution_clock::now(); + hostElapsedTime += (stop_cpu - start_cpu); + + if (!createDevice(params.configFile)) + return 1; + + prefixSum(even, odd, deviceoutput, maxSize); + + auto start_cpu2 = std::chrono::high_resolution_clock::now(); + it++; + int ind = std::pow(2, it); + intermeadiate_results = deviceoutput; + intermeadiate_results.resize(maxSize); + + for (uint64_t i = 0; i < maxSize; ++i) { + int index = ind * i + (ind - 1); + input[index] = deviceoutput[i]; + } + + max = ind; + + auto stop_cpu2 = std::chrono::high_resolution_clock::now(); + hostElapsedTime += (stop_cpu2 - start_cpu2); + } + + std::cout << "Host elapsed time before downsweep: " + << std::fixed << std::setprecision(3) + << hostElapsedTime.count() << " ms." << std::endl; + + auto start_cpu3 = std::chrono::high_resolution_clock::now(); + + // Clear last element + input[max - 1] = input[(max / 2) - 1]; + input[(max / 2) - 1] = 0; + max = static_cast(std::log2(max)); //eliminate the looping for first two steps + max -= 2; + + + // DownSweep + while (max >= 0) { + int ind2 = std::pow(2, max); + int val = 0; + std::vector even2, odd2, result; + + for (uint64_t i = ind2 - 1; i < input.size(); i += ind2) { + if (val % 2 == 0) + even2.push_back(input[i]); + else + odd2.push_back(input[i]); + val++; + } + auto stop_cpu3 = std::chrono::high_resolution_clock::now(); + hostElapsedTime += (stop_cpu3 - start_cpu3); + //PIM kernel + downsweep(odd2, even2, even2.size()); + + auto start_cpu4 = std::chrono::high_resolution_clock::now(); + for (uint64_t i = 0; i < even2.size(); i++) { + result.push_back(odd2[i]); + result.push_back(even2[i]); + } + + for (uint64_t i = 0; i < result.size(); i++) { + int index2 = ind2 * i + (ind2 - 1); + input[index2] = result[i]; + } + max--; + } + + + for (uint64_t i = 0; i < host_device_merged.size(); i++) { // Merge results + host_device_merged[i] += input[i]; + } + +//Verification of Results hostresults vs deviceresults +if (params.shouldVerify) +{ // verify result #pragma omp parallel for for (uint64_t i = 0; i < len; ++i) { - if (hostoutput[i] != deviceoutput[i]) + if (hostoutput[i] != host_device_merged[i]) { - std::cout << "Wrong answer for Prefixsum: " << hostoutput[i] << " != " << deviceoutput[i] << std::endl; + std::cout << "Wrong answer for Prefixsum: " << hostoutput[i] << " != " << host_device_merged[i] << std::endl; } } - } +} + +pimShowStats(); +cout << "Host elapsed time: " << std::fixed << std::setprecision(3) << hostElapsedTime.count() << " ms." << endl; - pimShowStats(); return 0; }