Caffe 初学拾遗(五) CUDA 并行化示例

时间:2023-01-12 23:43:58

Original Source:

http://blog.csdn.net/augusdi/article/details/12833235

一些CUDA编程的简单示例程序,笔者在此进行了整理说明


在此之前,关于GPU选择(有提到流处理簇内容)以及Intel与Nvidia争论,读者有兴趣也可以一看。


Note:

1. 没有任何前缀的函数,都是Host程序。Host程序可以用__host__前缀进行显式声明。

2. Device程序需要由NVCC进行编译,而Host程序只需要由Host编译器(GCC)编译。

3. Host程序主要完成设备环境初始化,数据传输等必备过程,Device程序只负责计算。

4. Host程序中,“cuda”开头的函数,都是CUDA Runtime API,即运行时函数,主要负责完成Device的初始化、内存分配、内存拷贝等任务。



1.获取设备数目及属性:

// Add vectors in parallel.  
cudaError_t cudaStatus;
int num = 0;
cudaDeviceProp prop;
cudaStatus = cudaGetDeviceCount(&num);
for(int i = 0;i<num;i++)
{
cudaGetDeviceProperties(&prop,i);
}
cudaError_t是cuda错误类型,取值为整数。
cudaDeviceProp为设备属性结构体,其定义如下:

/** 
* CUDA device properties
*/
struct __device_builtin__ cudaDeviceProp
{
char name[256]; /**< ASCII string identifying device */
size_t totalGlobalMem; /**< Global memory available on device in bytes */
size_t sharedMemPerBlock; /**< Shared memory available per block in bytes */
int regsPerBlock; /**< 32-bit registers available per block */
int warpSize; /**< Warp size in threads */
size_t memPitch; /**< Maximum pitch in bytes allowed by memory copies */
int maxThreadsPerBlock; /**< Maximum number of threads per block */
int maxThreadsDim[3]; /**< Maximum size of each dimension of a block */
int maxGridSize[3]; /**< Maximum size of each dimension of a grid */
int clockRate; /**< Clock frequency in kilohertz */
size_t totalConstMem; /**< Constant memory available on device in bytes */
int major; /**< Major compute capability */
int minor; /**< Minor compute capability */
size_t textureAlignment; /**< Alignment requirement for textures */
size_t texturePitchAlignment; /**< Pitch alignment requirement for texture references bound to pitched memory */
int deviceOverlap; /**< Device can concurrently copy memory and execute a kernel. Deprecated. Use instead asyncEngineCount. */
int multiProcessorCount; /**< Number of multiprocessors on device */
int kernelExecTimeoutEnabled; /**< Specified whether there is a run time limit on kernels */
int integrated; /**< Device is integrated as opposed to discrete */
int canMapHostMemory; /**< Device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer */
int computeMode; /**< Compute mode (See ::cudaComputeMode) */
int maxTexture1D; /**< Maximum 1D texture size */
int maxTexture1DMipmap; /**< Maximum 1D mipmapped texture size */
int maxTexture1DLinear; /**< Maximum size for 1D textures bound to linear memory */
int maxTexture2D[2]; /**< Maximum 2D texture dimensions */
int maxTexture2DMipmap[2]; /**< Maximum 2D mipmapped texture dimensions */
int maxTexture2DLinear[3]; /**< Maximum dimensions (width, height, pitch) for 2D textures bound to pitched memory */
int maxTexture2DGather[2]; /**< Maximum 2D texture dimensions if texture gather operations have to be performed */
int maxTexture3D[3]; /**< Maximum 3D texture dimensions */
int maxTextureCubemap; /**< Maximum Cubemap texture dimensions */
int maxTexture1DLayered[2]; /**< Maximum 1D layered texture dimensions */
int maxTexture2DLayered[3]; /**< Maximum 2D layered texture dimensions */
int maxTextureCubemapLayered[2];/**< Maximum Cubemap layered texture dimensions */
int maxSurface1D; /**< Maximum 1D surface size */
int maxSurface2D[2]; /**< Maximum 2D surface dimensions */
int maxSurface3D[3]; /**< Maximum 3D surface dimensions */
int maxSurface1DLayered[2]; /**< Maximum 1D layered surface dimensions */
int maxSurface2DLayered[3]; /**< Maximum 2D layered surface dimensions */
int maxSurfaceCubemap; /**< Maximum Cubemap surface dimensions */
int maxSurfaceCubemapLayered[2];/**< Maximum Cubemap layered surface dimensions */
size_t surfaceAlignment; /**< Alignment requirements for surfaces */
int concurrentKernels; /**< Device can possibly execute multiple kernels concurrently */
int ECCEnabled; /**< Device has ECC support enabled */
int pciBusID; /**< PCI bus ID of the device */
int pciDeviceID; /**< PCI device ID of the device */
int pciDomainID; /**< PCI domain ID of the device */
int tccDriver; /**< 1 if device is a Tesla device using TCC driver, 0 otherwise */
int asyncEngineCount; /**< Number of asynchronous engines */
int unifiedAddressing; /**< Device shares a unified address space with the host */
int memoryClockRate; /**< Peak memory clock frequency in kilohertz */
int memoryBusWidth; /**< Global memory bus width in bits */
int l2CacheSize; /**< Size of L2 cache in bytes */
int maxThreadsPerMultiProcessor;/**< Maximum resident threads per multiprocessor */
};
name:设备名称;
totalGlobalMem:显存大小;
clockRate:GPU时钟频率
multiProcessorCount:GPU流多处理器数目,SM,Stream-Multiprocessor{一个SM包含多个流处理器(SP,Stream-Processor)};


查看SP数目:

// Beginning of GPU Architecture definitions  
inline int _ConvertSMVer2Cores(int major, int minor)
{
// Defines for GPU Architecture types (using the SM version to determine the # of cores per SM
typedef struct
{
int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version
int Cores;
} sSMtoCores;

sSMtoCores nGpuArchCoresPerSM[] =
{
{ 0x10, 8 }, // Tesla Generation (SM 1.0) G80 class
{ 0x11, 8 }, // Tesla Generation (SM 1.1) G8x class
{ 0x12, 8 }, // Tesla Generation (SM 1.2) G9x class
{ 0x13, 8 }, // Tesla Generation (SM 1.3) GT200 class
{ 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class
{ 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class
{ 0x30, 192}, // Kepler Generation (SM 3.0) GK10x class
{ 0x35, 192}, // Kepler Generation (SM 3.5) GK11x class
{ -1, -1 }
};

int index = 0;

while (nGpuArchCoresPerSM[index].SM != -1)
{
if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor))
{
return nGpuArchCoresPerSM[index].Cores;
}

index++;
}

// If we don't find the values, we default use the previous one to run properly
printf("MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM\n", major, minor, nGpuArchCoresPerSM[7].Cores);
return nGpuArchCoresPerSM[7].Cores;
}
// end of GPU Architecture definitions


2.线程并行:

CPU Host中进程是资源分配的基本单元,线程是CPU时间调度的基本单元。

GPU Device中线程是执行CUDA程序的最小单元。GPU上线程没有优先级概念,所有线程机会均等,线程状态只有等待资源和执行两种状态

如果资源未就绪,那么就等待;一旦就绪,立即执行。

当GPU资源很充裕时,所有线程都是并发执行的,这样加速效果很接近理论加速比;

而GPU资源少于总线程个数时,有一部分线程就会等待前面执行的线程释放资源,从而变为串行化执行

#include "cuda_runtime.h"           //CUDA运行时API  
#include "device_launch_parameters.h"
#include <stdio.h>
cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size);
__global__ void addKernel(int *c, const int *a, const int *b)
{
int i = threadIdx.x; // 每个线程获得自身ID 由于dim3结构体只传入了1维参数 x,y,z三维参数只有x维是有效的
c[i] = a[i] + b[i];
}
int main()
{
const int arraySize = 5;
const int a[arraySize] = { 1, 2, 3, 4, 5 };
const int b[arraySize] = { 10, 20, 30, 40, 50 };
int c[arraySize] = { 0 };
// Add vectors in parallel.
cudaError_t cudaStatus;
int num = 0;
cudaDeviceProp prop;
cudaStatus = cudaGetDeviceCount(&num);
for(int i = 0;i<num;i++)
{
cudaGetDeviceProperties(&prop,i);
}
cudaStatus = addWithCuda(c, a, b, arraySize);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "addWithCuda failed!");
return 1;
}
printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",c[0],c[1],c[2],c[3],c[4]);
// cudaThreadExit must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
cudaStatus = cudaThreadExit();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaThreadExit failed!");
return 1;
}
return 0;
}

cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size)
{
int *dev_a = 0; //GPU设备端数据指针
int *dev_b = 0;
int *dev_c = 0;
cudaError_t cudaStatus; //状态指示

// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0); // GPU 0
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
// 显存空间申请
cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
// 拷贝CPU Host数据到GPU Device
cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
// 运行核函数
addKernel<<<1, size>>>(dev_c, dev_a, dev_b);
// 1表示只分配一个Block
// size表示每个Block有size个线程(Thread)
// cudaThreadSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaThreadSynchronize(); // 同步线程
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaThreadSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
goto Error;
}
// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost); //拷贝结果回主机
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
Error:
cudaFree(dev_c); //释放GPU设备端内存
cudaFree(dev_a);
cudaFree(dev_b);
return cudaStatus;
}


3.块并行

将线程并行代码中的:

addKernel<<<1,size >>>(dev_c, dev_a, dev_b);
// 改为
addKernel<<<size,1 >>>(dev_c, dev_a, dev_b);
另将
__global__ void addKernel(int *c, const int *a, const int *b)  
{
int i = blockIdx.x; // 原为threadIdx.x
c[i] = a[i] + b[i];
}
线程并行是细粒度并行,调度效率高;

块并行是粗粒度并行,每次调度都要重新分配资源,由于资源限制,可能需要串行执行。


4.流并行

线程并行为细粒度的并行,而块并行为粗粒度的并行。

一组Thread并行处理可以组织为一个block,而一组block并行处理可以组织为一个Grid。

利用多个Grid来完成并行处理即流并行


流可以实现在一个Device上运行多个核函数。

块并行线程并行运行的核函数都是相同的(代码相同)

而流并行,可以执行不同的核函数,也可以实现对同一个核函数传递不同的参数,实现任务级别的并行。

CUDA中的流用cudaStream_t类型实现,用到以下几个API:

cudaStreamCreate(cudaStream_t * s)// 用于创建流;
cudaStreamDestroy(cudaStream_t s)// 用于销毁流;
cudaStreamSynchronize()// 用于单个流同步;
cudaDeviceSynchronize()// 用于整个设备上的所有流同步;
cudaStreamQuery()// 用于查询一个流的任务是否已经完成。
E.g.

#include "cuda_runtime.h"  
#include "device_launch_parameters.h"
#include <stdio.h>
cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size);
__global__ void addKernel(int *c, const int *a, const int *b)
{
int i = blockIdx.x;
c[i] = a[i] + b[i];
}
int main()
{
const int arraySize = 5;
const int a[arraySize] = { 1, 2, 3, 4, 5 };
const int b[arraySize] = { 10, 20, 30, 40, 50 };
int c[arraySize] = { 0 };
// Add vectors in parallel.
cudaError_t cudaStatus;
int num = 0;
cudaDeviceProp prop;
cudaStatus = cudaGetDeviceCount(&num);
for(int i = 0;i<num;i++)
{
cudaGetDeviceProperties(&prop,i);
}
cudaStatus = addWithCuda(c, a, b, arraySize);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "addWithCuda failed!");
return 1;
}
printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",c[0],c[1],c[2],c[3],c[4]);
// cudaThreadExit must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
cudaStatus = cudaThreadExit();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaThreadExit failed!");
return 1;
}
return 0;
}
// Helper function for using CUDA to add vectors in parallel.
cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size)
{
int *dev_a = 0;
int *dev_b = 0;
int *dev_c = 0;
cudaError_t cudaStatus;

// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
// Allocate GPU buffers for three vectors (two input, one output) .
cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}

cudaStream_t stream[5];
for(int i = 0; i < 5; i++)
{
cudaStreamCreate(&stream[i]); //创建流
}
// Launch a kernel on the GPU with one thread for each element.
for(int i = 0; i < 5; i++)
{
addKernel<<<1,1,0,stream[i]>>>(dev_c+i, dev_a+i, dev_b+i); //执行流
// 1 : block 数目
// 1 : block 中 thread 数目
// 0 : block 中用到的共享内存大小
// stream[i] : 当前核函数在哪个流上运行 每个流上都拥有一个核函数
// 每个核函数作用的对象不同就实现了任务级别的并行
// 当有多个互不相关的任务时 可以写多个核函数
// 在资源允许的情况下 将这些核函数装载到不同流上执行
}
cudaDeviceSynchronize();
// cudaThreadSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
cudaStatus = cudaThreadSynchronize();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaThreadSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
goto Error;
}
// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
for(int i = 0;i<5;i++)
{
cudaStreamDestroy(stream[i]); //销毁流
}
cudaFree(dev_c);
cudaFree(dev_a);
cudaFree(dev_b);
return cudaStatus;
}