Compute the occupancy of a kernel launch, and try using runtime functions to automatically tune the thread block size so as to improve occupancy.
▶ Source code.
#include <iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <helper_cuda.h>

const int manualBlockSize = 32;

// Kernel: squares each element of the input array and writes it back
__global__ void square(int *array, int arrayCount)
{
extern __shared__ int dynamicSmem[]; // unused in the computation; lets a dynamic shared memory request factor into occupancy
int idx = threadIdx.x + blockIdx.x * blockDim.x;

if (idx < arrayCount)
array[idx] *= array[idx];
}

// Launches the kernel, times it, and decides whether to use the runtime API to tune the block size
static int launchConfig(int *data, int size, bool automatic)
{
int blockSize;
int numBlocks;
int gridSize;
int minGridSize;
float elapsedTime;
double potentialOccupancy;
size_t dynamicSMemUsage = 0;

cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);
cudaEvent_t start;
cudaEvent_t end;
cudaEventCreate(&start);
cudaEventCreate(&end);

if (automatic) // if true, use the runtime API to tune the block size automatically
{
cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, (void*)square, dynamicSMemUsage, size);
printf("\n\tSuggested block size: %d, minimum grid size for maximum occupancy: %d\n", blockSize, minGridSize);
}
else
blockSize = manualBlockSize;

gridSize = (size + blockSize - 1) / blockSize;

cudaEventRecord(start);
square<<<gridSize, blockSize, dynamicSMemUsage>>>(data, size);
cudaEventRecord(end);
cudaDeviceSynchronize();
cudaEventElapsedTime(&elapsedTime, start, end);
printf("\n\tElapsed time: %4.2f ms\n", elapsedTime); // 依线程数计算占用率,分子分母同除以 prop.warpSize 即按活动线程束数计算,两者等价
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, square, blockSize, dynamicSMemUsage);
potentialOccupancy = (double)(numBlocks * blockSize) / (prop.maxThreadsPerMultiProcessor);
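// Equivalent warp-based form (illustrative comment only):
// activeWarps = numBlocks * blockSize / prop.warpSize;
// maxWarps = prop.maxThreadsPerMultiProcessor / prop.warpSize;
// potentialOccupancy = (double)activeWarps / maxWarps;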
printf("\n\tPotential occupancy: %4.2f %%\n", potentialOccupancy * ); return ;
}

// Handles memory management around the kernel launch and checks the result
static int test(bool automaticLaunchConfig, const int count = 1000000)
{
int size = count * sizeof(int);
int *h_data = (int *)malloc(size);
for (int i = 0; i < count; i++)
h_data[i] = i;
int *d_data;
cudaMalloc(&d_data, size);
cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice);
memset(h_data, 0, size);
launchConfig(d_data, count, automaticLaunchConfig);
cudaMemcpy(h_data, d_data, size, cudaMemcpyDeviceToHost);

for (int i = 0; i < count; i++)
{
if (h_data[i] != i * i)
{
printf("\n\tError at %d, d_data = %d\n", i, h_data[i]);
return -1;
}
}

free(h_data);
cudaFree(d_data);
return 0;
}

int main()
{
printf("\n\tStart.\n");

printf("\n\tManual configuration test, BlockSize = %d\n", manualBlockSize);
if (test(false))
{
printf("\n\tTest failed\n");
return -1;
} printf("\n\tAutomatic configuration\n");
if (test(true))
{
printf("\n\tTest failed\n");
return -1;
} printf("\n\tTest PASSED\n");
getchar();
return 0;
}
▶ Output
Start.
Manual configuration test, BlockSize = 32
Elapsed time: 0.13 ms
Potential occupancy: 50.00 %

Automatic configuration
Suggested block size: , minimum grid size for maximum occupancy: 
Elapsed time: 0.12 ms
Potential occupancy: 100.00 %

Test PASSED
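As a hedged illustration of where these percentages can come from (the exact figures depend on the GPU): on a hypothetical device with maxThreadsPerMultiProcessor = 2048 and at most 32 resident blocks per SM, the manual blockSize of 32 allows at most 32 × 32 / 2048 = 50% occupancy, whereas a suggested blockSize of, say, 1024 with 2 resident blocks reaches 2 × 1024 / 2048 = 100%.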
▶ Takeaways
● The runtime functions used here and how they relate to one another.
// driver_types.h
// Flags used by the block-size-tuning functions
#define cudaOccupancyDefault                0x00 // default flag
#define cudaOccupancyDisableCachingOverride 0x01 // assume global caching is enabled and cannot be turned off
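A minimal sketch of passing the second flag (assuming the square kernel above; the block size 256 and zero dynamic shared memory are arbitrary values for illustration):

// Query occupancy while telling the calculator to assume global caching stays enabled
int numBlocks = 0;
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(&numBlocks, square, 256, 0, cudaOccupancyDisableCachingOverride);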
// cuda_device_runtime_api.h
// Device-side stubs with the same names as the functions in cuda_runtime.h; they don't seem to be used here?
__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize)
{
return cudaErrorUnknown;
}

// Called by cudaOccupancyMaxActiveBlocksPerMultiprocessor and by cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags
__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags)
{
return cudaErrorUnknown;
}

// cuda_runtime.h
template<class T>
static __inline__ __host__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, T func, int blockSize, size_t dynamicSMemSize)
{
return ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks, (const void*)func, blockSize, dynamicSMemSize, cudaOccupancyDefault);
}

template<typename UnaryFunction, class T>
static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags
(
int* minGridSize, int* blockSize, T func, UnaryFunction blockSizeToDynamicSMemSize, int blockSizeLimit = 0, unsigned int flags = 0
)
{
cudaError_t status;

// Device and kernel attributes
int device;
struct cudaFuncAttributes attr;
int maxThreadsPerMultiProcessor;
int warpSize;
int devMaxThreadsPerBlock;
int multiProcessorCount;
int occupancyLimit;
int granularity;

// Best results found so far
int maxBlockSize = 0;
int numBlocks = 0;
int maxOccupancy = 0;

// Temporaries
int blockSizeToTryAligned;
int blockSizeToTry;
int occupancyInBlocks;
int occupancyInThreads;
size_t dynamicSMemSize;

// Validate the inputs
if (!minGridSize || !blockSize || !func)
return cudaErrorInvalidValue;

// Query device and kernel attributes
status = ::cudaGetDevice(&device);
if (status != cudaSuccess)
return status;
status = cudaDeviceGetAttribute(&maxThreadsPerMultiProcessor, cudaDevAttrMaxThreadsPerMultiProcessor, device);
if (status != cudaSuccess)
return status;
status = cudaDeviceGetAttribute(&warpSize, cudaDevAttrWarpSize, device);
if (status != cudaSuccess)
return status;
status = cudaDeviceGetAttribute(&devMaxThreadsPerBlock, cudaDevAttrMaxThreadsPerBlock, device);
if (status != cudaSuccess)
return status;
status = cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, device);
if (status != cudaSuccess)
return status;
status = cudaFuncGetAttributes(&attr, func);
if (status != cudaSuccess)
return status;

// Try candidate block sizes
occupancyLimit = maxThreadsPerMultiProcessor;
granularity = warpSize;

if (blockSizeLimit == 0 || blockSizeLimit > devMaxThreadsPerBlock)
blockSizeLimit = devMaxThreadsPerBlock;
if (blockSizeLimit > attr.maxThreadsPerBlock)
blockSizeLimit = attr.maxThreadsPerBlock;

for (blockSizeToTryAligned = ((blockSizeLimit + (warpSize - 1)) / warpSize) * warpSize; blockSizeToTryAligned > 0; blockSizeToTryAligned -= warpSize)
// blockSizeLimit is rounded up to a multiple of warpSize, then candidates are reduced in steps of warpSize
// If blockSizeLimit is smaller than blockSizeToTryAligned to begin with, the trial starts from blockSizeLimit instead (this only affects the first iteration)
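// e.g., warpSize = 32 and blockSizeLimit = 1000 (hypothetical numbers): aligned start = 1024,
// so the trials are min(1000, 1024) = 1000, then 992, 960, ..., down to 32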
{
blockSizeToTry = (blockSizeLimit < blockSizeToTryAligned) ? blockSizeLimit : blockSizeToTryAligned;
dynamicSMemSize = blockSizeToDynamicSMemSize(blockSizeToTry);

// The core occupancy computation
status = cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(&occupancyInBlocks, func, blockSizeToTry, dynamicSMemSize, flags);
if (status != cudaSuccess)
return status;

// Record the best result so far
if ((occupancyInThreads = blockSizeToTry * occupancyInBlocks) > maxOccupancy)
{
maxBlockSize = blockSizeToTry;
numBlocks = occupancyInBlocks;
maxOccupancy = occupancyInThreads;
}

// Occupancy has reached 100%, stop early
if (occupancyLimit == maxOccupancy)
break;
}

// Return the best result: the minimum grid size is the resident blocks per SM times the SM count
*minGridSize = numBlocks * multiProcessorCount;
*blockSize = maxBlockSize;
return status;
}

// Helper functor that maps any block size to a constant dynamic shared memory size
class __cudaOccupancyB2DHelper
{
size_t n;
public:
inline __host__ CUDART_DEVICE __cudaOccupancyB2DHelper(size_t n_) : n(n_) {}
inline __host__ CUDART_DEVICE size_t operator()(int)
{
return n;
}
};

// The runtime function that tunes the block size
// Parameters: output minimum grid size minGridSize, output block size blockSize, kernel func, dynamic shared memory size dynamicSMemSize, and block size cap blockSizeLimit (the sample passes the element count, since a block never needs more threads than elements)
template<class T>
static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSize
(
int *minGridSize, int *blockSize, T func, size_t dynamicSMemSize = 0, int blockSizeLimit = 0
)
{
return cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(minGridSize, blockSize, func, __cudaOccupancyB2DHelper(dynamicSMemSize), blockSizeLimit, cudaOccupancyDefault);
}
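To see why the UnaryFunction parameter exists: when the dynamic shared memory demand grows with the block size, cuda_runtime.h also offers cudaOccupancyMaxPotentialBlockSizeVariableSMem, which takes a user-supplied functor in place of the constant-size __cudaOccupancyB2DHelper. A minimal sketch, assuming a hypothetical variant of the square kernel that uses one int of dynamic shared memory per thread:

// Functor mapping a candidate block size to its dynamic shared memory demand
struct SmemPerThread
{
    __host__ __device__ size_t operator()(int blockSize) const
    {
        return blockSize * sizeof(int); // one int per thread
    }
};

int minGridSize = 0, blockSize = 0;
cudaOccupancyMaxPotentialBlockSizeVariableSMem(&minGridSize, &blockSize, square, SmemPerThread());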