OpenACC 与 CUDA 的相互调用

▶ 按照书上的代码完成了 OpenACC 与 CUDA 的相互调用，以及 OpenACC 调用 cuBLAS。编译过程中遇到了很多问题，诸如 CUDA 版本、代码版本、计算能力指定等，先放在这里，以后填坑。

● 代码，OpenACC 调用 CUDA

 // kernel.cu

 // SAXPY device kernel: each thread updates at most one element,
 // y[i] += a * x[i], for indices i in [0, n).
 // The element index is built from the x components of blockIdx, blockDim
 // and threadIdx only. x and y are dereferenced on the device.
 __global__ void saxpy_kernel(const int n, const float a, float *x, float *y)
 {
     const int tid = threadIdx.x + blockDim.x * blockIdx.x;

     // Threads that fall past the end of the arrays have nothing to do.
     if (tid >= n)
         return;

     y[tid] += a * x[tid];
 }

 // Host-side launcher for saxpy_kernel, exported with C linkage so that a
 // C translation unit (e.g. OpenACC code) can call it.
 //   n    - number of elements to process
 //   a    - scalar multiplier
 //   x, y - pointers handed straight to the kernel, so they must be
 //          addresses that are valid on the device
 // The launch is asynchronous and nothing here waits for it to finish.
 extern "C" void saxpy(const int n, const float a, float *x, float *y)
 {
     const int threadsPerBlock = 256;

     // A grid with zero blocks is an invalid launch configuration.
     if (n <= 0)
         return;

     // Ceiling division so the tail of the array is covered when n is not a
     // multiple of the block size; the kernel bounds-checks the extra threads.
     const int blocks = (n + threadsPerBlock - 1) / threadsPerBlock;

     saxpy_kernel<<<blocks, threadsPerBlock>>>(n, a, x, y);
 }

 // main.c

 #include <stdio.h>

 #include <stdlib.h>

 #define N   1024

 #pragma acc routine seq

 extern void saxpy(int n, float a, float *x, float *y);

 // OpenACC driver: fills x and y on the device inside an OpenACC data
 // region, then passes the device addresses of both arrays to the external
 // saxpy() routine and prints y[0] after the region has copied y back.
 // Returns 0 on success, 1 if the host allocations fail.
 int main()
 {
     float *x = (float *)malloc(sizeof(float) * N);
     float *y = (float *)malloc(sizeof(float) * N);

     if (x == NULL || y == NULL)
     {
         fprintf(stderr, "host allocation failed\n");
         free(x);
         free(y);
         return 1;
     }

     // x exists only on the device (create); y is copied back to the host
     // when the data region ends (copyout).
 #pragma acc data create(x[0:N]) copyout(y[0:N])
     {
 #pragma acc kernels
 #pragma acc loop independent
         for (int i = 0; i < N; i++)
         {
             x[i] = 1.0f;
             y[i] = 4.0f;
         }

         // Inside host_data, x and y evaluate to their device addresses,
         // which is what saxpy() receives.
 #pragma acc host_data use_device(x, y)
         {
             saxpy(N, 2.0f, x, y);
         }
     }

     // Wait for outstanding OpenACC asynchronous work before reading y.
 #pragma acc wait
     printf("\ny[0] = %f\n", y[0]);

     free(x);
     free(y);
     //getchar();
     return 0;
 }

● 输出结果，代码在 win10上不能链接，报错：LINK : fatal error LNK1104: 无法打开文件“libcudapgi.lib”，WSL上输出结果不正确，在 Ubuntu 中报链接错误。参考了 https://blog.****.net/wcj0626/article/details/12611689?locationNum=12&fps=1 和 https://*.com/questions/31737024/openacc-calling-cuda-device-kernel-from-openacc-parallel-loop，还是没有解决问题

WSL:

cuan@CUAN:/mnt/d/Code/CUDA/cudaProject/cudaProject$ nvcc -c kernel.cu -rdc=true

cuan@CUAN:/mnt/d/Code/CUDA/cudaProject/cudaProject$ pgcc -acc -c main.c

cuan@CUAN:/mnt/d/Code/CUDA/cudaProject/cudaProject$ pgcc -ta=tesla:rdc,cuda9. -Mcuda -o acc.exe main.o kernel.o

cuan@CUAN:/mnt/d/Code/CUDA/cudaProject/cudaProject$ ./acc.exe

y[0] = 4.000000

Ubuntu:

@E@nvlink fatal   : elfLink fatbinary error

pgacclnk: child process exit status : /usr/local/pgi/linux86-64/18.4/bin/pgnvd

● 代码，CUDA 调用 OpenACC

 // fun.c

 // Fill the first n elements of x with the constant c.
 //   n - number of elements to write (nothing is written when n <= 0)
 //   c - value stored into every element
 //   x - destination array; the deviceptr clause tells OpenACC that x is
 //       already a device address, so no data movement is generated for it
 void set(const int n, const float c, float *x)
 {
 #pragma acc kernels deviceptr(x)
     for (int i = 0; i < n; i++)
         x[i] = c;
 }

 // SAXPY: y[i] += a * x[i] for i in [0, n).
 //   n    - number of elements (no work is done when n <= 0)
 //   a    - scalar multiplier
 //   x, y - input and in/out arrays; declared restrict, so they must not
 //          overlap. The deviceptr clause tells OpenACC that both are
 //          already device addresses, so no data movement is generated.
 void saxpy(const int n, const float a, float *restrict x, float *restrict y)
 {
 #pragma acc kernels deviceptr(x, y)
     for (int i = 0; i < n; i++)
         y[i] += a * x[i];
 }

 // main.cu

 #include <stdio.h>

 #include <cuda.h>

 #include "cuda_runtime.h"

 #include "device_launch_parameters.h"

 #define N   1024

 extern "C" void set(int, float, float *);

 extern "C" void saxpy(int, float, float *, float *);

 // Print a diagnostic for a failed CUDA runtime call.
 // Returns 0 when err is cudaSuccess and 1 otherwise.
 static int cudaFailed(cudaError_t err, const char *what)
 {
     if (err == cudaSuccess)
         return 0;
     fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
     return 1;
 }

 // CUDA driver: allocates x and y with cudaMalloc, passes the raw device
 // pointers to the external set()/saxpy() routines, then copies y[0] back
 // to the host and prints it.
 // Returns 0 on success, 1 if an allocation or the copy fails.
 int main()
 {
     float *x = NULL, *y = NULL, y0 = 0.0f;

     if (cudaFailed(cudaMalloc((void**)&x, sizeof(float)*N), "cudaMalloc(x)") ||
         cudaFailed(cudaMalloc((void**)&y, sizeof(float)*N), "cudaMalloc(y)"))
     {
         // cudaFree accepts a null pointer, so partial allocation is fine.
         cudaFree(x);
         cudaFree(y);
         return 1;
     }

     set(N, 1.0f, x);
     set(N, 0.0f, y);
     saxpy(N, 2.0f, x, y);

     // Only the first element is brought back for display.
     int status = cudaFailed(
         cudaMemcpy(&y0, y, sizeof(float), cudaMemcpyDeviceToHost),
         "cudaMemcpy(y0)");
     if (status == 0)
         printf("\ny[0] = %f\n", y0);

     cudaFree(x);
     cudaFree(y);
     //getchar();
     return status;
 }

● 输出结果，代码在 win10上不能链接，WSL 上运行时报段错误（Segmentation fault），Ubuntu 中报链接错误

WSL:

cuan@CUAN:/mnt/d/Code/CUDA/cudaProject/cudaProject$ nvcc -c main.cu -rdc=true

cuan@CUAN:/mnt/d/Code/CUDA/cudaProject/cudaProject$ pgcc -acc -c fun.c

cuan@CUAN:/mnt/d/Code/CUDA/cudaProject/cudaProject$ pgcc -ta=tesla:rdc,cuda9. -Mcuda -o acc.exe main.o fun.o

cuan@CUAN:/mnt/d/Code/CUDA/cudaProject/cudaProject$ ./acc.exe

Segmentation fault (core dumped)

Ubuntu:

cuan@CUAN:~/Temp$ nvcc -c main.cu -rdc=true

cuan@CUAN:~/Temp$ pgcc -acc -c fun.c

cuan@CUAN:~/Temp$ pgcc -ta=tesla:rdc,cuda9. -Mcuda -o acc.exe main.o fun.o

@E@nvlink fatal   : elfLink fatbinary error

pgacclnk: child process exit status : /usr/local/pgi/linux86-64/18.4/bin/pgnvd

● 代码，CUDA 调用 OpenACC，捆绑变量地址

 // fun.c

 #include <openacc.h>

 // C-callable wrapper around the OpenACC runtime call acc_map_data().
 //   pHost    - host address passed as the first argument
 //   pDevice  - device address passed as the second argument
 //   sizeByte - length in bytes passed as the third argument
 void map(float *restrict pHost, float *restrict pDevice, int sizeByte)
 {
     void *hostAddr   = pHost;
     void *deviceAddr = pDevice;

     acc_map_data(hostAddr, deviceAddr, sizeByte);
 }

 // Fill the first n elements of x with the constant c.
 //   n - number of elements to write (nothing is written when n <= 0)
 //   c - value stored into every element
 //   x - destination array; the present clause states that OpenACC already
 //       has device data associated with this address, so no transfer is
 //       generated here
 void set(int n, float c, float *x)
 {
 #pragma acc kernels present(x)
     for (int i = 0; i < n; i++)
         x[i] = c;
 }

 // SAXPY: y[i] += a * x[i] for i in [0, n).
 //   n    - number of elements (no work is done when n <= 0)
 //   a    - scalar multiplier
 //   x, y - input and in/out arrays; declared restrict, so they must not
 //          overlap. The deviceptr clause tells OpenACC that both are
 //          already device addresses, so callers must pass device pointers
 //          rather than host addresses.
 void saxpy(int n, float a, float *restrict x, float *restrict y)
 {
 #pragma acc kernels deviceptr(x,y)
     for (int i = 0; i < n; i++)
         y[i] += a * x[i];
 }

 // main.cu

 #include <stdio.h>

 #include <stdlib.h>

 #define N   1024

 extern "C" void map(float *, float *, int);

 extern "C" void set(int, float, float *);

 extern "C" void saxpy(int, float, float *, float *);

 // CUDA driver for the acc_map_data example: allocates host arrays x/y and
 // device arrays dx/dy, associates each host array with its device array
 // through map(), initialises the data with set(), runs saxpy(), then
 // copies the first result element back and prints it.
 // Returns 0 on success, 1 if an allocation or the copy fails.
 int main()
 {
     const size_t bytes = sizeof(float) * N;
     float *x = (float *)malloc(bytes);
     float *y = (float *)malloc(bytes);
     float *dx = NULL, *dy = NULL, y0 = 0.0f;
     int status = 1;

     if (x != NULL && y != NULL &&
         cudaMalloc((void**)&dx, bytes) == cudaSuccess &&
         cudaMalloc((void**)&dy, bytes) == cudaSuccess)
     {
         map(x, dx, (int)bytes);
         map(y, dy, (int)bytes);

         // set() uses present(x), so it takes the mapped host addresses.
         set(N, 1.0f, x);
         set(N, 4.0f, y);

         // saxpy() uses deviceptr(x,y), so it takes the device pointers.
         saxpy(N, 2.0f, dx, dy);

         // The result lives in device memory: the copy source is dy.
         if (cudaMemcpy(&y0, dy, sizeof(float), cudaMemcpyDeviceToHost) == cudaSuccess)
         {
             printf("\ny[0] = %f\n", y0);
             status = 0;
         }
     }

     if (status != 0)
         fprintf(stderr, "allocation or device-to-host copy failed\n");

     // NOTE(review): the host/device association made by map() is never
     // removed; consider an acc_unmap_data wrapper before releasing memory.
     cudaFree(dx);   // cudaFree accepts a null pointer
     cudaFree(dy);
     free(x);
     free(y);
     //getchar();
     return status;
 }

● 输出结果，代码在 win10上不能链接，在 WSL 上结果正确，在 Ubuntu 中未尝试

WSL:

cuan@CUAN:/mnt/d/Code/CUDA/cudaProject/cudaProject$ nvcc -c main.cu -rdc=true

cuan@CUAN:/mnt/d/Code/CUDA/cudaProject/cudaProject$ pgcc -c fun.c -acc

cuan@CUAN:/mnt/d/Code/CUDA/cudaProject/cudaProject$ pgcc -acc -Mcuda -o acc.exe main.o fun.o -ta=tesla:rdc,cuda9.

cuan@CUAN:/mnt/d/Code/CUDA/cudaProject/cudaProject$ ./acc.exe

y[0] = 6.000000

● 代码，OpenACC 调用 cuBLAS

 #include <stdio.h>

 #include <stdlib.h>

 #define N   1024

 extern void cublasSaxpy(int, float, float *, int, float *, int);

 // OpenACC + cuBLAS driver: fills x and y on the device inside an OpenACC
 // data region, then calls cublasSaxpy with the device addresses of both
 // arrays and prints y[0] after the region has copied y back to the host.
 // Returns 0 on success, 1 if the host allocations fail.
 int main()
 {
     float *x = (float *)malloc(sizeof(float) * N);
     float *y = (float *)malloc(sizeof(float) * N);

     if (x == NULL || y == NULL)
     {
         fprintf(stderr, "host allocation failed\n");
         free(x);
         free(y);
         return 1;
     }

     // x exists only on the device (create); y is copied back to the host
     // when the data region ends (copyout).
 #pragma acc data create(x[0:N]) copyout(y[0:N])
     {
 #pragma acc kernels
         for (int i = 0; i < N; i++)
         {
             x[i] = 1.0f;
             y[i] = 4.0f;
         }

         // Inside host_data, x and y evaluate to their device addresses.
 #pragma acc host_data use_device(x,y)
         {
             // Both vectors are contiguous, so incx and incy are 1.
             cublasSaxpy(N, 2.0f, x, 1, y, 1);
         }
     }

     printf("\ny[0] = %f\n", y[0]);

     free(x);
     free(y);
     //getchar();
     return 0;
 }

● 输出结果，代码在 win10上不能链接，在 WSL 中结果错误，在 Ubuntu 中结果正确

WSL:

cuan@CUAN:/mnt/d/Code/CUDA/cudaProject/cudaProject$ nvcc -c fun.c -rdc=true

cuan@CUAN:/mnt/d/Code/CUDA/cudaProject/cudaProject$ pgcc -acc -c main.c

cuan@CUAN:/mnt/d/Code/CUDA/cudaProject/cudaProject$ pgcc -acc -Mcuda -lcublas -o acc.exe main.o

cuan@CUAN:/mnt/d/Code/CUDA/cudaProject/cudaProject$ ./acc.exe

y[0] = 4.000000

Ubuntu:

cuan@CUAN:~/Temp$ nvcc -c fun.c -rdc=true

cuan@CUAN:~/Temp$ pgcc -acc -c main.c

cuan@CUAN:~/Temp$ pgcc -acc -Mcuda -lcublas -o acc.exe main.o

cuan@CUAN:~/Temp$ ./acc.exe

y[0] = 6.000000

秒客网

OpenACC 与 CUDA 的相互调用

相关文章