CUDA内存(一) 寄存器

时间:2024-04-09 09:09:31

不同种类存储的读取速度

图1: 不同种类存储的读取速度
CUDA内存(一) 寄存器
将变量设置为局部变量, 编译器会将其放入寄存器中, 可以省去大量的内存读写操作.
CUDA内存(一) 寄存器

GPU 寄存器实现位包装

// Bit-packing with a register-resident accumulator.
// Each thread ORs the KERNEL_LOOP boolean flags into a local variable
// (kept in a register by the compiler) and writes the packed word once.
// Expects a 2D grid of 2D blocks; extra threads beyond num_elements exit.
__global__ void test_reg_kernel(Cuda32u* data, bool *packed_array, Cuda32u num_elements)
{
	const Cuda32u col = blockIdx.x * blockDim.x + threadIdx.x;
	const Cuda32u row = blockIdx.y * blockDim.y + threadIdx.y;
	const Cuda32u tid = row * (blockDim.x * gridDim.x) + col;
	if (tid >= num_elements)
		return;
	// Local accumulator: lives in a register, so the loop performs no
	// global-memory writes until the single store below.
	Cuda32u packed_bits = 0;
	for (int bit = 0; bit < KERNEL_LOOP; ++bit)
	{
		packed_bits |= (packed_array[bit] << bit);
	}
	data[tid] = packed_bits;
}
// Launch the register-based bit-packing kernel.
//
// Grid is hard-coded to 256x256 blocks of 8x8 threads (4M threads total);
// this must cover uArrayLen elements -- surplus threads are guarded off
// inside the kernel. NOTE(review): the launch configuration does not adapt
// to uArrayLen; confirm callers never pass a length above 2048*2048.
void test_reg(Cuda32u* d_puData, bool *d_pbPackArray, Cuda32u uArrayLen)
{
	dim3 thread_rect(8, 8);
	dim3 block_rect(256, 256);
	test_reg_kernel<<<block_rect, thread_rect>>>(d_puData, d_pbPackArray, uArrayLen);
	// Kernel launches return no status directly; surface bad-configuration
	// errors here instead of letting them appear at the next sync point.
	checkCudaErrors(cudaGetLastError());
}

GPU 全局内存实现位包装

// Accumulator placed in global memory, shared by every thread and
// persistent across kernel launches (it is never reset between runs).
__device__ static Cuda32u d_tmp = 0;
// Bit-packing using the global-memory accumulator above, for timing
// comparison against the register version.
// NOTE(review): every thread performs an unsynchronized read-modify-write
// (|=) on the single word d_tmp -- a data race. The demo only measures the
// global-memory traffic this generates; with all-true inputs the final OR
// mask is presumably the same regardless of interleaving, but confirm the
// inputs can never contain false values before relying on the output.
__global__ void test_gmem_kernel(Cuda32u* data, bool *packed_array, Cuda32u num_elements)
{
	Cuda32u idx = blockIdx.x*blockDim.x + threadIdx.x;
	Cuda32u idy = blockIdx.y*blockDim.y + threadIdx.y;
	Cuda32u tid = idy*blockDim.x*gridDim.x + idx;
	//Cuda32u tid = blockIdx.x*blockDim.x + threadIdx.x;
	if (tid < num_elements)
	{
		// Each iteration reads and writes global memory, unlike the
		// register version which stores only once per thread.
		for (int i = 0; i < KERNEL_LOOP; i++)
		{
			d_tmp |= (packed_array[i] << i);
		}
		data[tid] = d_tmp;
	}
}

// Launch the global-memory bit-packing kernel.
//
// Same hard-coded configuration as test_reg so the two timing runs are
// comparable: 256x256 blocks of 8x8 threads (4M threads), which must cover
// uArrayLen elements; surplus threads are guarded off inside the kernel.
void test_gmem(Cuda32u* d_puData, bool *d_pbPackArray, Cuda32u uArrayLen)
{
	dim3 thread_rect(8, 8);
	dim3 block_rect(256, 256);
	test_gmem_kernel<<<block_rect, thread_rect>>>(d_puData, d_pbPackArray, uArrayLen);
	// Kernel launches return no status directly; surface bad-configuration
	// errors here instead of letting them appear at the next sync point.
	checkCudaErrors(cudaGetLastError());
}

测试

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "sm_20_atomic_functions.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "helper_cuda.h"
#include "timer.h"
#include <ctime>
#include "Global.h"
#include "RegisterTest.h"
#include "CalHist.h"
// Benchmark driver: times iIterNum launches of the register-based packer
// (test_reg) and the global-memory packer (test_gmem) with CUDA events,
// then prints the checksum and the average per-launch time for each.
int main()
{
	// Allocate and zero the device output array (2048*2048 words).
	Cuda32u* d_puData = NULL;
	Cuda32u uArrayLen = 2048*2048;
	checkCudaErrors(cudaMalloc((void**)&d_puData, uArrayLen*sizeof(Cuda32u)));
	checkCudaErrors(cudaMemset((void*)d_puData, 0, uArrayLen*sizeof(Cuda32u)));
	// Host input: KERNEL_LOOP bool flags, every byte set to 1 (all true).
	bool *pbPackArray = (bool*)malloc(KERNEL_LOOP*sizeof(bool));
	memset((void*)pbPackArray, 1, KERNEL_LOOP*sizeof(bool));
	// Copy the input flags host -> device.
	bool *d_pbPackArray = NULL;
	checkCudaErrors(cudaMalloc((void**)&d_pbPackArray, KERNEL_LOOP*sizeof(bool)));
	checkCudaErrors(cudaMemcpy((void*)d_pbPackArray, (void*)pbPackArray,
		KERNEL_LOOP*sizeof(bool), cudaMemcpyHostToDevice));
	// Timing setup: one start/stop event pair, reused for both runs.
	Cuda32u iIterNum = 10;
	cudaEvent_t start, stop;
	Cuda32f elapsedTime = 0.0;
	cudaEventCreate(&start);
	cudaEventCreate(&stop);
	// Warm up the GPU so initialization cost is excluded from the timings.
	cudaWarmUp();
	// Start timing the register version.
	cudaEventRecord(start, 0);
	// GPU processing: iIterNum back-to-back launches, no sync in between.
	for (Cuda32u i = 0; i < iIterNum; i++)
	{
		// Reset between iterations -- intentionally disabled so the loop
		// times only the kernel launches.
		//checkCudaErrors(cudaMemset((void*)d_puData, 0, uArrayLen*sizeof(Cuda32u)));
		// GPU register bit packing.
		test_reg(d_puData, d_pbPackArray, uArrayLen);
	}
	// Stop timing; cudaEventSynchronize waits for the async launches too.
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);
	cudaEventElapsedTime(&elapsedTime, start, stop);
	// Copy results device -> host (blocking, so kernels are done here).
	Cuda32u* h_puData = (Cuda32u*)malloc(uArrayLen*sizeof(Cuda32u));
	checkCudaErrors(cudaMemcpy((void*)h_puData, (void*)d_puData,
		uArrayLen*sizeof(Cuda32u), cudaMemcpyDeviceToHost));
	// Checksum the output and report.
	// NOTE(review): with 4M elements this 32-bit sum will likely wrap;
	// acceptable as a checksum, but confirm it is not read as an exact sum.
	Cuda32u iSumData = 0;
	for (Cuda32u i = 0; i < uArrayLen; i++)
	{
		iSumData += h_puData[i];
	}
	// NOTE(review): %d is used for unsigned values below; the values fit
	// in int here, but %u would be the matching specifier.
	printf("\n%%%%%%%%%%%%%% GPU 寄存器 位包装:%%%%%%%%%%%%%%\n");
	printf("序列长度 = %d\n", uArrayLen);
	printf("重复次数 = %d\n", iIterNum);
	printf("序列求和 = %d\n", iSumData);
	printf("平均用时 = %f ms\n", elapsedTime / (Cuda32u)iIterNum);
	printf("%%%%%%%%%%%%%% GPU 寄存器 位包装:%%%%%%%%%%%%%%\n\n");
	// Start timing the global-memory version (events are reused).
	cudaEventRecord(start, 0);
	// GPU processing: iIterNum launches of the global-memory packer.
	for (Cuda32u i = 0; i < iIterNum; i++)
	{
		// Reset between iterations -- intentionally disabled, as above.
		//checkCudaErrors(cudaMemset((void*)d_puData, 0, uArrayLen*sizeof(Cuda32u)));
		// GPU global-memory bit packing.
		test_gmem(d_puData, d_pbPackArray, uArrayLen);
	}
	// Stop timing.
	cudaEventRecord(stop, 0);
	cudaEventSynchronize(stop);
	cudaEventElapsedTime(&elapsedTime, start, stop);
	// Copy results device -> host, reusing the host buffer from run 1.
	//Cuda32u* h_puData = (Cuda32u*)malloc(uArrayLen*sizeof(Cuda32u));
	checkCudaErrors(cudaMemcpy((void*)h_puData, (void*)d_puData,
		uArrayLen*sizeof(Cuda32u), cudaMemcpyDeviceToHost));
	// Checksum the output and report (same wrap caveat as above).
	Cuda32u iSumData2 = 0;
	for (Cuda32u i = 0; i < uArrayLen; i++)
	{
		iSumData2 += h_puData[i];
	}
	printf("\n%%%%%%%%%%%%%% GPU 全局内存 位包装:%%%%%%%%%%%%%%\n");
	printf("序列长度 = %d\n", uArrayLen);
	printf("重复次数 = %d\n", iIterNum);
	printf("序列求和 = %d\n", iSumData2);
	printf("平均用时 = %f ms\n", elapsedTime / (Cuda32u)iIterNum);
	printf("%%%%%%%%%%%%%% GPU 全局内存 位包装:%%%%%%%%%%%%%%\n\n");
	// Release device memory, host memory, and timing events.
	checkCudaErrors(cudaFree((void*)d_puData));
	checkCudaErrors(cudaFree((void*)d_pbPackArray));
	free(h_puData);
	free(pbPackArray);
	cudaEventDestroy(start);
	cudaEventDestroy(stop);

}

测试结果

使用寄存器比使用全局内存快很多. 节省了约75%的时间.
CUDA内存(一) 寄存器

参考文献


  1. Shane Cook. CUDA Programming: A developer’s guide to parallel computing with GPUs ↩︎