@@ -98,62 +98,6 @@ __global__ void initVectors(float *rhs, float *x, int N) {
   }
 }

-__global__ void gpuDotProduct(float *vecA, float *vecB, float *result,
-                              int size) {
-  cg::thread_block cta = cg::this_thread_block();
-
-  int gid = blockIdx.x * blockDim.x + threadIdx.x;
-  extern __shared__ double tmp[];
-
-  double temp_sum = 0.0;
-  for (int i = gid; i < size; i += gridDim.x * blockDim.x) {
-    temp_sum += (double)(vecA[i] * vecB[i]);
-  }
-  tmp[cta.thread_rank()] = temp_sum;
-
-  cg::sync(cta);
-
-  cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);
-
-  double beta = temp_sum;
-  double temp;
-
-  for (int i = tile32.size() / 2; i > 0; i >>= 1) {
-    if (tile32.thread_rank() < i) {
-      temp = tmp[cta.thread_rank() + i];
-      beta += temp;
-      tmp[cta.thread_rank()] = beta;
-    }
-    cg::sync(tile32);
-  }
-  cg::sync(cta);
-
-  if (cta.thread_rank() == 0) {
-    beta = 0.0;
-    for (int i = 0; i < cta.size(); i += tile32.size()) {
-      beta += tmp[i];
-    }
-    atomicAdd(result, (float)beta);
-  }
-}
-
-__global__ void gpuSpMV(int *I, int *J, float *val, int nnz, int num_rows,
-                        float alpha, float *inputVecX, float *outputVecY) {
-  size_t gid = blockIdx.x * blockDim.x + threadIdx.x;
-  for (size_t i = gid; i < num_rows; i += blockDim.x * gridDim.x) {
-    int row_elem = I[i];
-    int next_row_elem = I[i + 1];
-    int num_elems_this_row = next_row_elem - row_elem;
-
-    float output = 0.0;
-    for (int j = 0; j < num_elems_this_row; j++) {
-      output += alpha * val[row_elem + j] * inputVecX[J[row_elem + j]];
-    }
-
-    outputVecY[i] = output;
-  }
-}
-
 __global__ void r1_div_x(float *r1, float *r0, float *b) {
   int gid = blockIdx.x * blockDim.x + threadIdx.x;
   if (gid == 0) {
@@ -255,7 +199,7 @@ int main(int argc, char **argv) {
   checkCudaErrors(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL));
   checkCudaErrors(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO));

-  int numBlocks = 0, blockSize = 0, numBlocks2 = 0, blockSize2 = 0;
+  int numBlocks = 0, blockSize = 0;
   checkCudaErrors(
       cudaOccupancyMaxPotentialBlockSize(&numBlocks, &blockSize, initVectors));

@@ -268,11 +212,6 @@ int main(int argc, char **argv) {

   initVectors<<<numBlocks, blockSize, 0, stream1>>>(d_r, d_x, N);

-  checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(&numBlocks2, &blockSize2,
-                                                     gpuSpMV));
-  checkCudaErrors(cudaOccupancyMaxPotentialBlockSize(&numBlocks, &blockSize,
-                                                     gpuDotProduct));
-
   alpha = 1.0;
   alpham1 = -1.0;
   beta = 0.0;
@@ -332,22 +271,14 @@ int main(int argc, char **argv) {
   checkCudaErrors(cublasSaxpy(cublasHandle, N, &alpha, d_r, 1, d_p, 1));
   cublasSetPointerMode(cublasHandle, CUBLAS_POINTER_MODE_DEVICE);

-#if 0  // Use cusparseScsrmv API when it is cuda graph compliant
   checkCudaErrors(
       cusparseSetPointerMode(cusparseHandle, CUSPARSE_POINTER_MODE_HOST));
   checkCudaErrors(
       cusparseScsrmv(cusparseHandle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, nz,
                      &alpha, descr, d_val, d_row, d_col, d_p, &beta, d_Ax));
-#else
-  gpuSpMV<<<numBlocks2, blockSize2, 0, stream1>>>(d_row, d_col, d_val, nz,
-                                                  N, alpha, d_p, d_Ax);
-#endif

   checkCudaErrors(cudaMemsetAsync(d_dot, 0, sizeof(float), stream1));
-  // Use cublasSdot API when it is cuda graph compliant.
-  // checkCudaErrors(cublasSdot(cublasHandle, N, d_p, 1, d_Ax, 1, d_dot));
-  gpuDotProduct<<<numBlocks, blockSize, blockSize * sizeof(double), stream1>>>(
-      d_p, d_Ax, d_dot, N);
+  checkCudaErrors(cublasSdot(cublasHandle, N, d_p, 1, d_Ax, 1, d_dot));

   r1_div_x<<<1, 1, 0, stream1>>>(d_r1, d_dot, d_a);

@@ -360,10 +291,9 @@ int main(int argc, char **argv) {
   checkCudaErrors(cudaMemcpyAsync(d_r0, d_r1, sizeof(float),
                                   cudaMemcpyDeviceToDevice, stream1));
   checkCudaErrors(cudaMemsetAsync(d_r1, 0, sizeof(float), stream1));
-  // Use cublasSdot API when it is cuda graph compliant.
-  // checkCudaErrors(cublasSdot(cublasHandle, N, d_r, 1, d_r, 1, d_r1));
-  gpuDotProduct<<<numBlocks, blockSize, blockSize * sizeof(double), stream1>>>(
-      d_r, d_r, d_r1, N);
+
+  checkCudaErrors(cublasSdot(cublasHandle, N, d_r, 1, d_r, 1, d_r1));
+
   checkCudaErrors(cudaMemcpyAsync((float *)&r1, d_r1, sizeof(float),
                                   cudaMemcpyDeviceToHost, stream1));

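Note on the change: dropping the hand-rolled gpuSpMV / gpuDotProduct kernels in favor of cusparseScsrmv and cublasSdot assumes both library calls can be captured into the CUDA graph that drives the CG loop. That in turn requires the handles to issue work on the capturing stream and, for cublasSdot, device pointer mode so the result never round-trips through the host. Below is a minimal sketch of that capture pattern; the helper name captureDotIntoGraph and its parameter list are illustrative only, not part of this diff, while the cuBLAS and CUDA runtime calls are the standard (pre-CUDA-12) APIs this sample already uses.

// Sketch only, not part of this diff. Assumes the handle, stream, and
// device buffers are already created, as in the sample's main().
#include <cublas_v2.h>
#include <cuda_runtime.h>

static void captureDotIntoGraph(cublasHandle_t cublasHandle,
                                cudaStream_t stream1, int N, float *d_p,
                                float *d_Ax, float *d_dot,
                                cudaGraphExec_t *graphExec) {
  cudaGraph_t graph;

  // The library must enqueue its work on the stream being captured.
  cublasSetStream(cublasHandle, stream1);
  // Device pointer mode keeps the dot-product result in d_dot on the GPU,
  // so no host-side read breaks the capture.
  cublasSetPointerMode(cublasHandle, CUBLAS_POINTER_MODE_DEVICE);

  cudaStreamBeginCapture(stream1, cudaStreamCaptureModeGlobal);
  cudaMemsetAsync(d_dot, 0, sizeof(float), stream1);    // clear accumulator
  cublasSdot(cublasHandle, N, d_p, 1, d_Ax, 1, d_dot);  // d_dot = p . Ax
  cudaStreamEndCapture(stream1, &graph);

  // Pre-CUDA-12 instantiate signature (error node / log buffer unused).
  cudaGraphInstantiate(graphExec, graph, NULL, NULL, 0);
  cudaGraphDestroy(graph);
}

Each CG iteration then replays the captured work with cudaGraphLaunch(*graphExec, stream1) instead of re-issuing the calls, which is why everything inside the loop, including both dot products above, has to be graph compliant.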