-
Couldn't load subscription status.
- Fork 25
updated prefixsum #285
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
updated prefixsum #285
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -7,11 +7,15 @@ | |
| #include <vector> | ||
| #include <cmath> | ||
| #include <algorithm> | ||
| #include "util.h" | ||
| #include "../../util/util.h" | ||
| #include "libpimeval.h" | ||
| #include <getopt.h> | ||
| #include <stdint.h> | ||
| #include <iomanip> | ||
| #include <chrono> | ||
|
|
||
| std::chrono::duration<double, std::milli> hostElapsedTime = std::chrono::duration<double, std::milli>::zero(); | ||
|
|
||
|
|
||
| #if defined(_OPENMP) | ||
| #include <omp.h> | ||
|
|
@@ -77,138 +81,263 @@ struct Params getInputParams(int argc, char **argv) | |
| return p; | ||
| } | ||
|
|
||
| void prefixSum(vector<int> &input, vector<int> &deviceoutput, uint64_t len) | ||
| void prefixSum(vector<int> &even, vector<int> &odd,vector<int> &deviceoutput, uint64_t len) | ||
| { | ||
| std::vector<PimObjId> temp(len); | ||
| std::vector<PimObjId> acc(len); | ||
|
|
||
| PimObjId inputObj = pimAlloc(PIM_ALLOC_AUTO, len, PIM_INT32); | ||
| if (inputObj == -1) | ||
|
|
||
| PimObjId evenObj = pimAlloc(PIM_ALLOC_AUTO, len, PIM_INT32); | ||
| if (evenObj == -1) | ||
| { | ||
| std::cerr << "Abort: Failed to allocate memory on PIM." << std::endl; | ||
| return; | ||
| } | ||
|
|
||
| PimStatus status = pimCopyHostToDevice((void *)input.data(), inputObj); | ||
| PimStatus status = pimCopyHostToDevice((void *)even.data(), evenObj); | ||
| if (status != PIM_OK) | ||
| { | ||
| std::cerr << "Abort: Failed to copy data to PIM." << std::endl; | ||
| return; | ||
| } | ||
| PimObjId tempObj = pimAllocAssociated(inputObj, PIM_INT32); | ||
| if (tempObj == -1) | ||
|
|
||
| PimObjId oddObj = pimAllocAssociated(evenObj , PIM_INT32); | ||
| if (oddObj == -1) | ||
| { | ||
| std::cerr << "Abort: Failed to allocate memory on PIM." << std::endl; | ||
| return; | ||
| } | ||
| status = pimCopyHostToDevice((void *)input.data(), tempObj); | ||
|
|
||
| status = pimCopyHostToDevice((void *)odd.data(), oddObj); | ||
| if (status != PIM_OK) | ||
| { | ||
| std::cerr << "Abort: Failed to copy data to PIM." << std::endl; | ||
| return; | ||
| } | ||
| } | ||
|
|
||
| PimObjId accObj = pimAllocAssociated(inputObj, PIM_INT32); | ||
| if (accObj == -1) | ||
| PimObjId outObj = pimAllocAssociated(evenObj, PIM_INT32); | ||
| if (outObj == -1) | ||
| { | ||
| std::cerr << "Abort: Failed to allocate memory on PIM." << std::endl; | ||
| return; | ||
| } | ||
|
|
||
| status = pimCopyHostToDevice((void *)input.data(), accObj); | ||
| status = pimCopyHostToDevice((void *)deviceoutput.data(), outObj); | ||
| if (status != PIM_OK) | ||
| { | ||
| std::cerr << "Abort: Failed to copy data to PIM." << std::endl; | ||
| } | ||
|
|
||
| //PIM Add | ||
| status = pimAdd(evenObj, oddObj, outObj); | ||
| if (status != PIM_OK) | ||
| { | ||
| std::cerr << "Abort: Failed to perform PIM addition." << std::endl; | ||
| return; | ||
| } | ||
| //Copy results back to Host | ||
| status = pimCopyDeviceToHost(outObj, (void *)deviceoutput.data()); | ||
| if (status != PIM_OK) | ||
| { | ||
| std::cerr << "Abort: Failed to copy prefix sum result from PIM." << std::endl; | ||
| return; | ||
| } | ||
|
|
||
| // Clean up PIM objects | ||
| pimFree(evenObj); | ||
| pimFree(oddObj); | ||
| pimFree(outObj); | ||
| } | ||
|
|
||
|
|
||
| PimObjId outputObj = pimAllocAssociated(inputObj, PIM_INT32); | ||
| if (outputObj == -1) | ||
| void downsweep(vector<int> &odd2, vector<int> &even2, uint64_t len) | ||
| { | ||
|
|
||
| PimObjId d_evenObj = pimAlloc(PIM_ALLOC_AUTO, len, PIM_INT32); | ||
| if (d_evenObj == -1) | ||
| { | ||
| std::cerr << "Abort: Failed to allocate memory on PIM." << std::endl; | ||
| return; | ||
| } | ||
|
|
||
| status = pimCopyHostToDevice((void *)deviceoutput.data(), outputObj); | ||
| PimStatus status = pimCopyHostToDevice((void *)even2.data(), d_evenObj); | ||
| if (status != PIM_OK) | ||
| { | ||
| std::cerr << "Abort: Failed to copy data to PIM." << std::endl; | ||
| return; | ||
| } | ||
|
|
||
| while (len > 0) | ||
| PimObjId d_oddObj = pimAllocAssociated(d_evenObj, PIM_INT32); | ||
| if (d_evenObj == -1) | ||
| { | ||
| pimShiftElementsRight(tempObj); | ||
| status = pimAdd(tempObj, accObj, accObj); | ||
| if (status != PIM_OK) | ||
| { | ||
| std::cerr << "Abort: Failed to perform PIM addition." << std::endl; | ||
| return; | ||
| } | ||
| len--; | ||
| std::cerr << "Abort: Failed to allocate memory on PIM." << std::endl; | ||
| return; | ||
| } | ||
|
|
||
| status = pimCopyDeviceToHost(accObj, (void *)deviceoutput.data()); | ||
| status = pimCopyHostToDevice((void *)odd2.data(), d_oddObj); | ||
| if (status != PIM_OK) | ||
| { | ||
| std::cerr << "Abort: Failed to copy data to PIM." << std::endl; | ||
| } | ||
|
|
||
| //PIM Add | ||
| status = pimAdd(d_evenObj, d_oddObj, d_evenObj); | ||
| if (status != PIM_OK) | ||
| { | ||
| std::cerr << "Abort: Failed to perform PIM addition." << std::endl; | ||
| return; | ||
| } | ||
|
|
||
| //Copy results back to Host | ||
| status = pimCopyDeviceToHost(d_evenObj, (void *)even2.data()); | ||
| if (status != PIM_OK) | ||
| { | ||
| std::cerr << "Abort: Failed to copy prefix sum result from PIM." << std::endl; | ||
| return; | ||
| } | ||
|
|
||
| // Clean up PIM objects | ||
| pimFree(accObj); | ||
| pimFree(tempObj); | ||
| pimFree(inputObj); | ||
| pimFree(outputObj); | ||
| pimFree(d_evenObj); | ||
| pimFree(d_oddObj); | ||
| } | ||
|
|
||
|
|
||
| int main(int argc, char *argv[]) | ||
| { | ||
| struct Params params = getInputParams(argc, argv); | ||
| vector<int> input; | ||
| if (params.inputFile == nullptr) | ||
| { | ||
| getVector(params.vectorLength, input); | ||
| } | ||
| else | ||
| { | ||
| std::cout << "Reading from input file is not implemented yet." << std::endl; | ||
| return 1; | ||
| } | ||
| struct Params params = getInputParams(argc, argv); | ||
| std::vector<int> input; | ||
|
|
||
| uint64_t len = input.size(); | ||
| vector<int> deviceoutput; | ||
| vector<int> hostoutput(len); | ||
| if (params.inputFile == nullptr) { | ||
| getVector(params.vectorLength, input); | ||
| } else { | ||
| std::cout << "Reading from input file is not implemented yet." << std::endl; | ||
| return 1; | ||
| } | ||
|
|
||
| for (uint64_t i = 0; i < input.size(); i++) | ||
| { | ||
| deviceoutput.push_back(0); | ||
| } | ||
| uint64_t len = input.size(); | ||
| std::vector<int> deviceoutput; | ||
| std::vector<int> hostoutput(len); | ||
| std::vector<int> intermeadiate_results; | ||
| std::vector<int> host_device_merged; | ||
|
|
||
| hostoutput[0] = input[0]; | ||
| for (uint64_t i = 0; i < input.size(); i++) | ||
| { | ||
| hostoutput[i + 1] = hostoutput[i] + input[i + 1]; | ||
| } | ||
| auto start_cpu = std::chrono::high_resolution_clock::now(); | ||
|
|
||
| if (!createDevice(params.configFile)) | ||
| return 1; | ||
| prefixSum(input, deviceoutput, len); | ||
| for (uint64_t i = 0; i < input.size(); i++) { | ||
| deviceoutput.push_back(0); | ||
| } | ||
|
|
||
| // Verification of Results hostresults vs deviceresults | ||
| if (params.shouldVerify) | ||
| { | ||
| hostoutput[0] = input[0]; | ||
| for (uint64_t i = 0; i < input.size(); i++) { | ||
| hostoutput[i + 1] = hostoutput[i] + input[i + 1]; | ||
| intermeadiate_results.push_back(input[i]); | ||
| host_device_merged.push_back(input[i]); | ||
|
Comment on lines
+228
to
+229
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why do we need to have three copies of the input vector? |
||
| } | ||
|
|
||
| int max = 0; | ||
| int it = 0; | ||
|
|
||
| // UpSweep | ||
| while (intermeadiate_results.size() > 1) { | ||
| std::vector<int> even, odd; | ||
|
|
||
| for (uint64_t i = 0; i < intermeadiate_results.size(); ++i) { | ||
| if (i % 2 == 0) | ||
| even.push_back(intermeadiate_results[i]); | ||
| else | ||
| odd.push_back(intermeadiate_results[i]); | ||
| } | ||
|
Comment on lines
+239
to
+244
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's recommended to avoid using There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. * |
||
|
|
||
| size_t maxSize = std::max(even.size(), odd.size()); | ||
| even.resize(maxSize, 0); | ||
| odd.resize(maxSize, 0); | ||
|
|
||
|
|
||
| auto stop_cpu = std::chrono::high_resolution_clock::now(); | ||
| hostElapsedTime += (stop_cpu - start_cpu); | ||
|
|
||
| if (!createDevice(params.configFile)) | ||
| return 1; | ||
|
|
||
| prefixSum(even, odd, deviceoutput, maxSize); | ||
|
|
||
| auto start_cpu2 = std::chrono::high_resolution_clock::now(); | ||
| it++; | ||
| int ind = std::pow(2, it); | ||
| intermeadiate_results = deviceoutput; | ||
| intermeadiate_results.resize(maxSize); | ||
|
|
||
| for (uint64_t i = 0; i < maxSize; ++i) { | ||
| int index = ind * i + (ind - 1); | ||
| input[index] = deviceoutput[i]; | ||
| } | ||
|
|
||
| max = ind; | ||
|
|
||
| auto stop_cpu2 = std::chrono::high_resolution_clock::now(); | ||
| hostElapsedTime += (stop_cpu2 - start_cpu2); | ||
| } | ||
|
|
||
| std::cout << "Host elapsed time before downsweep: " | ||
| << std::fixed << std::setprecision(3) | ||
| << hostElapsedTime.count() << " ms." << std::endl; | ||
|
|
||
| auto start_cpu3 = std::chrono::high_resolution_clock::now(); | ||
|
|
||
| // Clear last element | ||
| input[max - 1] = input[(max / 2) - 1]; | ||
| input[(max / 2) - 1] = 0; | ||
| max = static_cast<int>(std::log2(max)); //eliminate the looping for first two steps | ||
| max -= 2; | ||
|
|
||
|
|
||
| // DownSweep | ||
| while (max >= 0) { | ||
| int ind2 = std::pow(2, max); | ||
| int val = 0; | ||
| std::vector<int> even2, odd2, result; | ||
|
|
||
| for (uint64_t i = ind2 - 1; i < input.size(); i += ind2) { | ||
| if (val % 2 == 0) | ||
| even2.push_back(input[i]); | ||
| else | ||
| odd2.push_back(input[i]); | ||
| val++; | ||
| } | ||
| auto stop_cpu3 = std::chrono::high_resolution_clock::now(); | ||
| hostElapsedTime += (stop_cpu3 - start_cpu3); | ||
| //PIM kernel | ||
| downsweep(odd2, even2, even2.size()); | ||
|
|
||
| auto start_cpu4 = std::chrono::high_resolution_clock::now(); | ||
| for (uint64_t i = 0; i < even2.size(); i++) { | ||
| result.push_back(odd2[i]); | ||
| result.push_back(even2[i]); | ||
| } | ||
|
|
||
| for (uint64_t i = 0; i < result.size(); i++) { | ||
| int index2 = ind2 * i + (ind2 - 1); | ||
| input[index2] = result[i]; | ||
| } | ||
| max--; | ||
| } | ||
|
|
||
|
|
||
| for (uint64_t i = 0; i < host_device_merged.size(); i++) { // Merge results | ||
| host_device_merged[i] += input[i]; | ||
| } | ||
|
|
||
| //Verification of Results hostresults vs deviceresults | ||
| if (params.shouldVerify) | ||
| { | ||
| // verify result | ||
| #pragma omp parallel for | ||
| for (uint64_t i = 0; i < len; ++i) | ||
| { | ||
| if (hostoutput[i] != deviceoutput[i]) | ||
| if (hostoutput[i] != host_device_merged[i]) | ||
| { | ||
| std::cout << "Wrong answer for Prefixsum: " << hostoutput[i] << " != " << deviceoutput[i] << std::endl; | ||
| std::cout << "Wrong answer for Prefixsum: " << hostoutput[i] << " != " << host_device_merged[i] << std::endl; | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| pimShowStats(); | ||
| cout << "Host elapsed time: " << std::fixed << std::setprecision(3) << hostElapsedTime.count() << " ms." << endl; | ||
|
|
||
| pimShowStats(); | ||
| return 0; | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is an out-of-bounds access. Depending on the compiler and operating system, it may either get segmentation fault, or produce error-prone results.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.