Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
263 changes: 196 additions & 67 deletions misc-bench/cpp-prefix-sum/prefix-sum.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,15 @@
#include <vector>
#include <cmath>
#include <algorithm>
#include "util.h"
#include "../../util/util.h"
#include "libpimeval.h"
#include <getopt.h>
#include <stdint.h>
#include <iomanip>
#include <chrono>

std::chrono::duration<double, std::milli> hostElapsedTime = std::chrono::duration<double, std::milli>::zero();


#if defined(_OPENMP)
#include <omp.h>
Expand Down Expand Up @@ -77,138 +81,263 @@ struct Params getInputParams(int argc, char **argv)
return p;
}

void prefixSum(vector<int> &input, vector<int> &deviceoutput, uint64_t len)
void prefixSum(vector<int> &even, vector<int> &odd,vector<int> &deviceoutput, uint64_t len)
{
std::vector<PimObjId> temp(len);
std::vector<PimObjId> acc(len);

PimObjId inputObj = pimAlloc(PIM_ALLOC_AUTO, len, PIM_INT32);
if (inputObj == -1)

PimObjId evenObj = pimAlloc(PIM_ALLOC_AUTO, len, PIM_INT32);
if (evenObj == -1)
{
std::cerr << "Abort: Failed to allocate memory on PIM." << std::endl;
return;
}

PimStatus status = pimCopyHostToDevice((void *)input.data(), inputObj);
PimStatus status = pimCopyHostToDevice((void *)even.data(), evenObj);
if (status != PIM_OK)
{
std::cerr << "Abort: Failed to copy data to PIM." << std::endl;
return;
}
PimObjId tempObj = pimAllocAssociated(inputObj, PIM_INT32);
if (tempObj == -1)

PimObjId oddObj = pimAllocAssociated(evenObj , PIM_INT32);
if (oddObj == -1)
{
std::cerr << "Abort: Failed to allocate memory on PIM." << std::endl;
return;
}
status = pimCopyHostToDevice((void *)input.data(), tempObj);

status = pimCopyHostToDevice((void *)odd.data(), oddObj);
if (status != PIM_OK)
{
std::cerr << "Abort: Failed to copy data to PIM." << std::endl;
return;
}
}

PimObjId accObj = pimAllocAssociated(inputObj, PIM_INT32);
if (accObj == -1)
PimObjId outObj = pimAllocAssociated(evenObj, PIM_INT32);
if (outObj == -1)
{
std::cerr << "Abort: Failed to allocate memory on PIM." << std::endl;
return;
}

status = pimCopyHostToDevice((void *)input.data(), accObj);
status = pimCopyHostToDevice((void *)deviceoutput.data(), outObj);
if (status != PIM_OK)
{
std::cerr << "Abort: Failed to copy data to PIM." << std::endl;
}

//PIM Add
status = pimAdd(evenObj, oddObj, outObj);
if (status != PIM_OK)
{
std::cerr << "Abort: Failed to perform PIM addition." << std::endl;
return;
}
//Copy results back to Host
status = pimCopyDeviceToHost(outObj, (void *)deviceoutput.data());
if (status != PIM_OK)
{
std::cerr << "Abort: Failed to copy prefix sum result from PIM." << std::endl;
return;
}

// Clean up PIM objects
pimFree(evenObj);
pimFree(oddObj);
pimFree(outObj);
}


PimObjId outputObj = pimAllocAssociated(inputObj, PIM_INT32);
if (outputObj == -1)
void downsweep(vector<int> &odd2, vector<int> &even2, uint64_t len)
{

PimObjId d_evenObj = pimAlloc(PIM_ALLOC_AUTO, len, PIM_INT32);
if (d_evenObj == -1)
{
std::cerr << "Abort: Failed to allocate memory on PIM." << std::endl;
return;
}

status = pimCopyHostToDevice((void *)deviceoutput.data(), outputObj);
PimStatus status = pimCopyHostToDevice((void *)even2.data(), d_evenObj);
if (status != PIM_OK)
{
std::cerr << "Abort: Failed to copy data to PIM." << std::endl;
return;
}

while (len > 0)
PimObjId d_oddObj = pimAllocAssociated(d_evenObj, PIM_INT32);
if (d_evenObj == -1)
{
pimShiftElementsRight(tempObj);
status = pimAdd(tempObj, accObj, accObj);
if (status != PIM_OK)
{
std::cerr << "Abort: Failed to perform PIM addition." << std::endl;
return;
}
len--;
std::cerr << "Abort: Failed to allocate memory on PIM." << std::endl;
return;
}

status = pimCopyDeviceToHost(accObj, (void *)deviceoutput.data());
status = pimCopyHostToDevice((void *)odd2.data(), d_oddObj);
if (status != PIM_OK)
{
std::cerr << "Abort: Failed to copy data to PIM." << std::endl;
}

//PIM Add
status = pimAdd(d_evenObj, d_oddObj, d_evenObj);
if (status != PIM_OK)
{
std::cerr << "Abort: Failed to perform PIM addition." << std::endl;
return;
}

//Copy results back to Host
status = pimCopyDeviceToHost(d_evenObj, (void *)even2.data());
if (status != PIM_OK)
{
std::cerr << "Abort: Failed to copy prefix sum result from PIM." << std::endl;
return;
}

// Clean up PIM objects
pimFree(accObj);
pimFree(tempObj);
pimFree(inputObj);
pimFree(outputObj);
pimFree(d_evenObj);
pimFree(d_oddObj);
}


int main(int argc, char *argv[])
{
struct Params params = getInputParams(argc, argv);
vector<int> input;
if (params.inputFile == nullptr)
{
getVector(params.vectorLength, input);
}
else
{
std::cout << "Reading from input file is not implemented yet." << std::endl;
return 1;
}
struct Params params = getInputParams(argc, argv);
std::vector<int> input;

uint64_t len = input.size();
vector<int> deviceoutput;
vector<int> hostoutput(len);
if (params.inputFile == nullptr) {
getVector(params.vectorLength, input);
} else {
std::cout << "Reading from input file is not implemented yet." << std::endl;
return 1;
}

for (uint64_t i = 0; i < input.size(); i++)
{
deviceoutput.push_back(0);
}
uint64_t len = input.size();
std::vector<int> deviceoutput;
std::vector<int> hostoutput(len);
std::vector<int> intermeadiate_results;
std::vector<int> host_device_merged;

hostoutput[0] = input[0];
for (uint64_t i = 0; i < input.size(); i++)
{
hostoutput[i + 1] = hostoutput[i] + input[i + 1];
}
auto start_cpu = std::chrono::high_resolution_clock::now();

if (!createDevice(params.configFile))
return 1;
prefixSum(input, deviceoutput, len);
for (uint64_t i = 0; i < input.size(); i++) {
deviceoutput.push_back(0);
}

// Verification of Results hostresults vs deviceresults
if (params.shouldVerify)
{
hostoutput[0] = input[0];
for (uint64_t i = 0; i < input.size(); i++) {
hostoutput[i + 1] = hostoutput[i] + input[i + 1];
Comment on lines +226 to +227
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is an out-of-bounds access. Depending on the compiler and operating system, it may either get segmentation fault, or produce error-prone results.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

prefix-sum.out(9981,0x1f217cf40) malloc: Incorrect checksum for freed object 0x12200c800: probably modified after being freed.

Corrupt value: 0xffc00

prefix-sum.out(9981,0x1f217cf40) malloc: *** set a breakpoint in malloc_error_break to debug

Abort trap: 6

intermeadiate_results.push_back(input[i]);
host_device_merged.push_back(input[i]);
Comment on lines +228 to +229
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need to have three copies of the input vector?

}

int max = 0;
int it = 0;

// UpSweep
while (intermeadiate_results.size() > 1) {
std::vector<int> even, odd;

for (uint64_t i = 0; i < intermeadiate_results.size(); ++i) {
if (i % 2 == 0)
even.push_back(intermeadiate_results[i]);
else
odd.push_back(intermeadiate_results[i]);
}
Comment on lines +239 to +244
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's recommended to avoid using push_back for efficiency purpose, because this allocates memory dynamically. Since the vector size is known beforehand, we can use reside and indexing.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

*resize


size_t maxSize = std::max(even.size(), odd.size());
even.resize(maxSize, 0);
odd.resize(maxSize, 0);


auto stop_cpu = std::chrono::high_resolution_clock::now();
hostElapsedTime += (stop_cpu - start_cpu);

if (!createDevice(params.configFile))
return 1;

prefixSum(even, odd, deviceoutput, maxSize);

auto start_cpu2 = std::chrono::high_resolution_clock::now();
it++;
int ind = std::pow(2, it);
intermeadiate_results = deviceoutput;
intermeadiate_results.resize(maxSize);

for (uint64_t i = 0; i < maxSize; ++i) {
int index = ind * i + (ind - 1);
input[index] = deviceoutput[i];
}

max = ind;

auto stop_cpu2 = std::chrono::high_resolution_clock::now();
hostElapsedTime += (stop_cpu2 - start_cpu2);
}

std::cout << "Host elapsed time before downsweep: "
<< std::fixed << std::setprecision(3)
<< hostElapsedTime.count() << " ms." << std::endl;

auto start_cpu3 = std::chrono::high_resolution_clock::now();

// Clear last element
input[max - 1] = input[(max / 2) - 1];
input[(max / 2) - 1] = 0;
max = static_cast<int>(std::log2(max)); //eliminate the looping for first two steps
max -= 2;


// DownSweep
while (max >= 0) {
int ind2 = std::pow(2, max);
int val = 0;
std::vector<int> even2, odd2, result;

for (uint64_t i = ind2 - 1; i < input.size(); i += ind2) {
if (val % 2 == 0)
even2.push_back(input[i]);
else
odd2.push_back(input[i]);
val++;
}
auto stop_cpu3 = std::chrono::high_resolution_clock::now();
hostElapsedTime += (stop_cpu3 - start_cpu3);
//PIM kernel
downsweep(odd2, even2, even2.size());

auto start_cpu4 = std::chrono::high_resolution_clock::now();
for (uint64_t i = 0; i < even2.size(); i++) {
result.push_back(odd2[i]);
result.push_back(even2[i]);
}

for (uint64_t i = 0; i < result.size(); i++) {
int index2 = ind2 * i + (ind2 - 1);
input[index2] = result[i];
}
max--;
}


for (uint64_t i = 0; i < host_device_merged.size(); i++) { // Merge results
host_device_merged[i] += input[i];
}

//Verification of Results hostresults vs deviceresults
if (params.shouldVerify)
{
// verify result
#pragma omp parallel for
for (uint64_t i = 0; i < len; ++i)
{
if (hostoutput[i] != deviceoutput[i])
if (hostoutput[i] != host_device_merged[i])
{
std::cout << "Wrong answer for Prefixsum: " << hostoutput[i] << " != " << deviceoutput[i] << std::endl;
std::cout << "Wrong answer for Prefixsum: " << hostoutput[i] << " != " << host_device_merged[i] << std::endl;
}
}
}
}

pimShowStats();
cout << "Host elapsed time: " << std::fixed << std::setprecision(3) << hostElapsedTime.count() << " ms." << endl;

pimShowStats();
return 0;
}