Caffe 初学拾遗(五) CUDA 并行化示例

Original Source:

http://blog.****.net/augusdi/article/details/12833235

一些CUDA编程的简单示例程序，笔者在此进行了整理说明：

在此之前，关于GPU选择（有提到流处理簇内容）以及Intel与Nvidia的争论，读者有兴趣也可以一看。

Note:

1. 没有任何前缀的函数，都是Host程序。Host程序可以用__host__前缀进行显式声明。

2. Device程序需要由NVCC进行编译，而Host程序只需要由Host编译器（GCC）编译。

3. Host程序主要完成设备环境初始化，数据传输等必备过程，Device程序只负责计算。

4. Host程序中，“cuda”开头的函数，都是CUDA Runtime API，即运行时函数，主要负责完成Device的初始化、内存分配、内存拷贝等任务。

1.获取设备数目及属性：

// Query how many CUDA devices are visible and read the properties of each one.
cudaError_t cudaStatus;
int num = 0;            // receives the device count
cudaDeviceProp prop;    // overwritten on every iteration; after the loop it holds the last device queried
cudaStatus = cudaGetDeviceCount(&num);
// NOTE(review): cudaStatus is never inspected here, and the return value of
// cudaGetDeviceProperties below is discarded.
for(int i = 0;i<num;i++)
{
    cudaGetDeviceProperties(&prop,i);
}

cudaError_t是CUDA的错误码类型（枚举类型，取值为整数），其中cudaSuccess表示调用成功。
cudaDeviceProp为设备属性结构体，其定义如下：

/**
 * CUDA device properties
 *
 * This is the structure that cudaGetDeviceProperties() fills in for one device.
 * NOTE(review): the field list below is quoted as-is in this article; it presumably
 * matches an older CUDA toolkit header -- confirm against the driver_types.h of the
 * toolkit actually installed before relying on the exact layout.
 */
struct __device_builtin__ cudaDeviceProp
{
    char   name[256];                  /**< ASCII string identifying device */
    size_t totalGlobalMem;             /**< Global memory available on device in bytes */
    size_t sharedMemPerBlock;          /**< Shared memory available per block in bytes */
    int    regsPerBlock;               /**< 32-bit registers available per block */
    int    warpSize;                   /**< Warp size in threads */
    size_t memPitch;                   /**< Maximum pitch in bytes allowed by memory copies */
    int    maxThreadsPerBlock;         /**< Maximum number of threads per block */
    int    maxThreadsDim[3];           /**< Maximum size of each dimension of a block */
    int    maxGridSize[3];             /**< Maximum size of each dimension of a grid */
    int    clockRate;                  /**< Clock frequency in kilohertz */
    size_t totalConstMem;              /**< Constant memory available on device in bytes */
    int    major;                      /**< Major compute capability */
    int    minor;                      /**< Minor compute capability */
    size_t textureAlignment;           /**< Alignment requirement for textures */
    size_t texturePitchAlignment;      /**< Pitch alignment requirement for texture references bound to pitched memory */
    int    deviceOverlap;              /**< Device can concurrently copy memory and execute a kernel. Deprecated. Use instead asyncEngineCount. */
    int    multiProcessorCount;        /**< Number of multiprocessors on device */
    int    kernelExecTimeoutEnabled;   /**< Specifies whether there is a run time limit on kernels */
    int    integrated;                 /**< Device is integrated as opposed to discrete */
    int    canMapHostMemory;           /**< Device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer */
    int    computeMode;                /**< Compute mode (See ::cudaComputeMode) */
    int    maxTexture1D;               /**< Maximum 1D texture size */
    int    maxTexture1DMipmap;         /**< Maximum 1D mipmapped texture size */
    int    maxTexture1DLinear;         /**< Maximum size for 1D textures bound to linear memory */
    int    maxTexture2D[2];            /**< Maximum 2D texture dimensions */
    int    maxTexture2DMipmap[2];      /**< Maximum 2D mipmapped texture dimensions */
    int    maxTexture2DLinear[3];      /**< Maximum dimensions (width, height, pitch) for 2D textures bound to pitched memory */
    int    maxTexture2DGather[2];      /**< Maximum 2D texture dimensions if texture gather operations have to be performed */
    int    maxTexture3D[3];            /**< Maximum 3D texture dimensions */
    int    maxTextureCubemap;          /**< Maximum Cubemap texture dimensions */
    int    maxTexture1DLayered[2];     /**< Maximum 1D layered texture dimensions */
    int    maxTexture2DLayered[3];     /**< Maximum 2D layered texture dimensions */
    int    maxTextureCubemapLayered[2];/**< Maximum Cubemap layered texture dimensions */
    int    maxSurface1D;               /**< Maximum 1D surface size */
    int    maxSurface2D[2];            /**< Maximum 2D surface dimensions */
    int    maxSurface3D[3];            /**< Maximum 3D surface dimensions */
    int    maxSurface1DLayered[2];     /**< Maximum 1D layered surface dimensions */
    int    maxSurface2DLayered[3];     /**< Maximum 2D layered surface dimensions */
    int    maxSurfaceCubemap;          /**< Maximum Cubemap surface dimensions */
    int    maxSurfaceCubemapLayered[2];/**< Maximum Cubemap layered surface dimensions */
    size_t surfaceAlignment;           /**< Alignment requirements for surfaces */
    int    concurrentKernels;          /**< Device can possibly execute multiple kernels concurrently */
    int    ECCEnabled;                 /**< Device has ECC support enabled */
    int    pciBusID;                   /**< PCI bus ID of the device */
    int    pciDeviceID;                /**< PCI device ID of the device */
    int    pciDomainID;                /**< PCI domain ID of the device */
    int    tccDriver;                  /**< 1 if device is a Tesla device using TCC driver, 0 otherwise */
    int    asyncEngineCount;           /**< Number of asynchronous engines */
    int    unifiedAddressing;          /**< Device shares a unified address space with the host */
    int    memoryClockRate;            /**< Peak memory clock frequency in kilohertz */
    int    memoryBusWidth;             /**< Global memory bus width in bits */
    int    l2CacheSize;                /**< Size of L2 cache in bytes */
    int    maxThreadsPerMultiProcessor;/**< Maximum resident threads per multiprocessor */
};

name：设备名称；
totalGlobalMem：显存大小；
clockRate：GPU时钟频率；
multiProcessorCount：GPU流式多处理器（SM，Streaming Multiprocessor）的数目；一个SM包含多个流处理器（SP，Streaming Processor，即CUDA Core）；

查看SP数目：

// Beginning of GPU Architecture definitions
/**
 * Map a compute capability (SM version) to the number of CUDA cores per SM.
 *
 * @param major  Major compute capability (cudaDeviceProp::major).
 * @param minor  Minor compute capability (cudaDeviceProp::minor).
 * @return Cores per multiprocessor for the given SM version. If the version is
 *         not in the table, a diagnostic is printed to stdout and the value of
 *         the last (newest) table entry is returned so callers can keep running.
 *
 * Multiply the result by cudaDeviceProp::multiProcessorCount to get the total
 * core count of a device.
 */
inline int _ConvertSMVer2Cores(int major, int minor)
{
    // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM)
    typedef struct
    {
        int SM; // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
        int Cores;
    } sSMtoCores;

    // Built once (static const) instead of on every call. The {-1, -1} entry is
    // the end-of-table sentinel and must stay last.
    static const sSMtoCores nGpuArchCoresPerSM[] =
    {
        { 0x10,   8 }, // Tesla Generation (SM 1.0) G80 class
        { 0x11,   8 }, // Tesla Generation (SM 1.1) G8x class
        { 0x12,   8 }, // Tesla Generation (SM 1.2) G9x class
        { 0x13,   8 }, // Tesla Generation (SM 1.3) GT200 class
        { 0x20,  32 }, // Fermi Generation (SM 2.0) GF100 class
        { 0x21,  48 }, // Fermi Generation (SM 2.1) GF10x class
        { 0x30, 192 }, // Kepler Generation (SM 3.0) GK10x class
        { 0x32, 192 }, // Kepler Generation (SM 3.2)
        { 0x35, 192 }, // Kepler Generation (SM 3.5) GK11x class
        { 0x37, 192 }, // Kepler Generation (SM 3.7)
        { 0x50, 128 }, // Maxwell Generation (SM 5.0)
        { 0x52, 128 }, // Maxwell Generation (SM 5.2)
        { 0x53, 128 }, // Maxwell Generation (SM 5.3)
        { 0x60,  64 }, // Pascal Generation (SM 6.0)
        { 0x61, 128 }, // Pascal Generation (SM 6.1)
        { 0x62, 128 }, // Pascal Generation (SM 6.2)
        { 0x70,  64 }, // Volta Generation (SM 7.0)
        { 0x72,  64 }, // Xavier Generation (SM 7.2)
        { 0x75,  64 }, // Turing Generation (SM 7.5)
        { 0x80,  64 }, // Ampere Generation (SM 8.0)
        { 0x86, 128 }, // Ampere Generation (SM 8.6)
        { 0x87, 128 }, // Ampere Generation (SM 8.7)
        { 0x89, 128 }, // Ada Generation (SM 8.9)
        { 0x90, 128 }, // Hopper Generation (SM 9.0)
        {   -1,  -1 }
    };

    // Pack major/minor the same way the table keys are written (0xMm).
    const int smVersion = (major << 4) + minor;

    int index = 0;

    while (nGpuArchCoresPerSM[index].SM != -1)
    {
        if (nGpuArchCoresPerSM[index].SM == smVersion)
        {
            return nGpuArchCoresPerSM[index].Cores;
        }

        index++;
    }

    // Not found: `index` now sits on the sentinel, so `index - 1` is the last real
    // entry. Deriving it from the loop (instead of a hard-coded subscript) keeps the
    // fallback correct whenever rows are added to the table.
    printf("MapSMtoCores for SM %d.%d is undefined.  Default to use %d Cores/SM\n", major, minor, nGpuArchCoresPerSM[index - 1].Cores);
    return nGpuArchCoresPerSM[index - 1].Cores;
}
// end of GPU Architecture definitions

2.线程并行：

CPU Host中进程是资源分配的基本单元，线程是CPU时间调度的基本单元。

GPU Device中线程是执行CUDA程序的最小单元。GPU上线程没有优先级概念，所有线程机会均等，线程状态只有等待资源和执行两种状态。

如果资源未就绪，那么就等待；一旦就绪，立即执行。

当GPU资源很充裕时，所有线程都是并发执行的，这样加速效果很接近理论加速比；

而GPU资源少于总线程个数时，有一部分线程就会等待前面执行的线程释放资源，从而变为串行化执行。

#include "cuda_runtime.h"           //CUDA运行时API  
#include "device_launch_parameters.h"     
#include <stdio.h>  
cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size);  
/*
 * Element-wise integer vector add: c[k] = a[k] + b[k].
 *
 * Each thread handles the single element selected by its own threadIdx.x.
 * Only the x component is read, so the kernel is meant for a one-dimensional
 * block. There is no bounds check: the launch must not supply more threads
 * than there are elements in the arrays.
 */
__global__ void addKernel(int *c, const int *a, const int *b)
{
    const int tid = threadIdx.x;   // this thread's index inside its block
    const int sum = a[tid] + b[tid];
    c[tid] = sum;
}
/*
 * Entry point of the thread-parallel sample.
 *
 * Enumerates the CUDA devices, adds two 5-element vectors on the GPU via
 * addWithCuda(), prints the result and resets the device.
 * Returns 0 on success and 1 if any CUDA call fails.
 */
int main()
{
    const int arraySize = 5;
    const int a[arraySize] = { 1, 2, 3, 4, 5 };
    const int b[arraySize] = { 10, 20, 30, 40, 50 };
    int c[arraySize] = { 0 };

    // Enumerate the devices and make sure there is at least one to run on.
    int num = 0;
    cudaError_t cudaStatus = cudaGetDeviceCount(&num);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(cudaStatus));
        return 1;
    }
    if (num == 0)
    {
        fprintf(stderr, "No CUDA-capable device found.\n");
        return 1;
    }

    // Read the properties of every device; a failure here means the runtime
    // is not usable, so stop instead of silently carrying on.
    cudaDeviceProp prop;
    for (int i = 0; i < num; i++)
    {
        cudaStatus = cudaGetDeviceProperties(&prop, i);
        if (cudaStatus != cudaSuccess)
        {
            fprintf(stderr, "cudaGetDeviceProperties failed for device %d: %s\n", i, cudaGetErrorString(cudaStatus));
            return 1;
        }
    }

    // Add vectors in parallel.
    cudaStatus = addWithCuda(c, a, b, arraySize);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }
    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",c[0],c[1],c[2],c[3],c[4]);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    // (It replaces the deprecated cudaThreadExit.)
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }
    return 0;
}

/*
 * Add two host vectors on the GPU using thread-level parallelism.
 *
 * c     [out] host array receiving a[k] + b[k], `size` elements
 * a, b  [in]  host input arrays, `size` elements each
 * size        number of elements; also used as the thread count of the single
 *             block that is launched, so it must not exceed the device's
 *             maximum threads per block (otherwise the launch error is returned)
 *
 * Returns cudaSuccess, or the first CUDA error encountered. Device buffers are
 * always released before returning.
 */
cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size)
{
    int *dev_a = 0; // device-side data pointers
    int *dev_b = 0;
    int *dev_c = 0;
    cudaError_t cudaStatus;     // status of the most recent CUDA call

    // Nothing to compute; a launch with zero threads would be an invalid configuration.
    if (size == 0)
    {
        return cudaSuccess;
    }

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);  // GPU 0
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }
    // Allocate device memory for the output and the two inputs.
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    // Copy the input vectors from host memory to the device.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    // Launch the kernel:
    //   1    -> a single block
    //   size -> `size` threads in that block, one per element
    addKernel<<<1, (unsigned int)size>>>(dev_c, dev_a, dev_b);
    // A kernel launch returns nothing; configuration errors (e.g. too many
    // threads per block) are only visible through cudaGetLastError().
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }
    // cudaDeviceSynchronize waits for the kernel to finish and returns any
    // error raised while it was executing. (It replaces the deprecated
    // cudaThreadSynchronize.)
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }
    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
Error:
    // Release device memory on every path; cudaFree accepts null pointers.
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);
    return cudaStatus;
}

3.块并行

将线程并行代码中的：

// Thread-parallel launch: 1 block containing `size` threads.
addKernel<<<1,size >>>(dev_c, dev_a, dev_b);
// Change it to the block-parallel launch: `size` blocks containing 1 thread each.
addKernel<<<size,1 >>>(dev_c, dev_a, dev_b);

另将核函数改为（线程索引由threadIdx.x换成blockIdx.x）：

/*
 * Element-wise integer vector add, block-parallel variant: c[k] = a[k] + b[k].
 *
 * The element index is taken from blockIdx.x (the thread-parallel version
 * used threadIdx.x instead). There is no bounds check, so the grid must not
 * contain more blocks than there are elements.
 */
__global__ void addKernel(int *c, const int *a, const int *b)
{
    const int bid = blockIdx.x;    // was threadIdx.x in the thread-parallel version
    const int sum = a[bid] + b[bid];
    c[bid] = sum;
}

线程并行是细粒度并行，调度效率高；

块并行是粗粒度并行，每次调度都要重新分配资源，由于资源限制，可能需要串行执行。

4.流并行

线程并行为细粒度的并行，而块并行为粗粒度的并行。

一组Thread并行处理可以组织为一个block，而一组block并行处理可以组织为一个Grid。

利用多个Grid来完成并行处理即流并行。

流可以实现在一个Device上运行多个核函数。

块并行和线程并行中，所有线程运行的核函数都是相同的（代码相同）。

而流并行，可以执行不同的核函数，也可以实现对同一个核函数传递不同的参数，实现任务级别的并行。

CUDA中的流用cudaStream_t类型实现，用到以下几个API：

cudaStreamCreate(cudaStream_t * s)// 用于创建流；
cudaStreamDestroy(cudaStream_t s)// 用于销毁流；
cudaStreamSynchronize(cudaStream_t s)// 用于单个流同步；
cudaDeviceSynchronize()// 用于整个设备上的所有流同步；
cudaStreamQuery(cudaStream_t s)// 用于查询一个流的任务是否已经完成。

E.g.

#include "cuda_runtime.h"  
#include "device_launch_parameters.h"  
#include <stdio.h>  
cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size);  
/*
 * Element-wise integer add indexed by block: c[k] = a[k] + b[k], k = blockIdx.x.
 *
 * In this stream sample every launch uses one block and receives pointers that
 * are already offset to the element it should process, so blockIdx.x is 0 and
 * each launch adds exactly one pair of values.
 */
__global__ void addKernel(int *c, const int *a, const int *b)
{
    const int bid = blockIdx.x;
    const int sum = a[bid] + b[bid];
    c[bid] = sum;
}
/*
 * Entry point of the stream-parallel sample.
 *
 * Enumerates the CUDA devices, adds two 5-element vectors on the GPU via
 * addWithCuda(), prints the result and resets the device.
 * Returns 0 on success and 1 if any CUDA call fails.
 */
int main()
{
    const int arraySize = 5;
    const int a[arraySize] = { 1, 2, 3, 4, 5 };
    const int b[arraySize] = { 10, 20, 30, 40, 50 };
    int c[arraySize] = { 0 };

    // Enumerate the devices and make sure there is at least one to run on.
    int num = 0;
    cudaError_t cudaStatus = cudaGetDeviceCount(&num);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(cudaStatus));
        return 1;
    }
    if (num == 0)
    {
        fprintf(stderr, "No CUDA-capable device found.\n");
        return 1;
    }

    // Read the properties of every device; a failure here means the runtime
    // is not usable, so stop instead of silently carrying on.
    cudaDeviceProp prop;
    for (int i = 0; i < num; i++)
    {
        cudaStatus = cudaGetDeviceProperties(&prop, i);
        if (cudaStatus != cudaSuccess)
        {
            fprintf(stderr, "cudaGetDeviceProperties failed for device %d: %s\n", i, cudaGetErrorString(cudaStatus));
            return 1;
        }
    }

    // Add vectors in parallel.
    cudaStatus = addWithCuda(c, a, b, arraySize);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }
    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",c[0],c[1],c[2],c[3],c[4]);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    // (It replaces the deprecated cudaThreadExit.)
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }
    return 0;
}
// Helper function for using CUDA to add vectors in parallel, one kernel launch
// per element, spread over several CUDA streams (task-level parallelism).
//
//   c     [out] host array receiving a[k] + b[k], `size` elements
//   a, b  [in]  host input arrays, `size` elements each
//   size        number of elements (any value; 0 is a no-op)
//
// Returns cudaSuccess, or the first CUDA error encountered. Streams and device
// buffers are released on every path.
cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size)
{
    enum { kMaxStreams = 5 };           // upper bound on concurrently used streams

    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    cudaStream_t stream[kMaxStreams];
    // Never create more streams than there are elements to process.
    const int wantedStreams = (size < (size_t)kMaxStreams) ? (int)size : (int)kMaxStreams;
    int numStreams = 0;                 // streams actually created so far (for cleanup)
    cudaError_t cudaStatus;

    // Nothing to compute.
    if (size == 0)
    {
        return cudaSuccess;
    }

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }
    // Allocate GPU buffers for three vectors (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Create the streams.
    for (int i = 0; i < wantedStreams; i++)
    {
        cudaStatus = cudaStreamCreate(&stream[i]);
        if (cudaStatus != cudaSuccess)
        {
            fprintf(stderr, "cudaStreamCreate failed!");
            goto Error;
        }
        numStreams++;
    }

    // Launch one kernel per element, assigning the launches to the streams in
    // round-robin order. Launch parameters:
    //   1         -> number of blocks
    //   1         -> number of threads per block
    //   0         -> bytes of dynamic shared memory per block
    //   stream[k] -> the stream this launch is queued on
    // Each launch gets pointers already offset to its own element, so the
    // launches are independent tasks that may run concurrently when resources
    // allow. Unrelated kernels can be dispatched to different streams the same way.
    for (size_t i = 0; i < size; i++)
    {
        addKernel<<<1, 1, 0, stream[i % numStreams]>>>(dev_c + i, dev_a + i, dev_b + i);
        // Launch-configuration errors are only reported through cudaGetLastError().
        cudaStatus = cudaGetLastError();
        if (cudaStatus != cudaSuccess)
        {
            fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
            goto Error;
        }
    }

    // cudaDeviceSynchronize waits for the work in all streams to finish and
    // returns any error raised while the kernels were executing. (It replaces
    // the deprecated cudaThreadSynchronize.)
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }
    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
Error:
    // Destroy only the streams that were actually created.
    for (int i = 0; i < numStreams; i++)
    {
        cudaStreamDestroy(stream[i]);
    }
    // Release device memory on every path; cudaFree accepts null pointers.
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);
    return cudaStatus;
}

秒客网

Caffe 初学拾遗(五) CUDA 并行化示例

相关文章