《GPU高性能编程CUDA实战》第八章 图形互操作性

时间:2023-03-09 08:15:35
《GPU高性能编程CUDA实战》第八章 图形互操作性

▶ OpenGL与DirectX,等待填坑。

● basic_interop

 #include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda.h"
#include "cuda_gl_interop.h"
#include "D:\Code\CUDA\book\common\book.h"
#include "D:\Code\CUDA\book\common\cpu_bitmap.h" PFNGLBINDBUFFERARBPROC glBindBuffer = NULL;
PFNGLDELETEBUFFERSARBPROC glDeleteBuffers = NULL;
PFNGLGENBUFFERSARBPROC glGenBuffers = NULL;
PFNGLBUFFERDATAARBPROC glBufferData = NULL; #define DIM 512 GLuint bufferObj;
cudaGraphicsResource *resource; // based on ripple code, but uses uchar4 which is the type of data
// graphic inter op uses. see screenshot - basic2.png
__global__ void kernel(uchar4 *ptr)
{
// map from threadIdx/BlockIdx to pixel position
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x; // now calculate the value at that position
float fx = x / (float)DIM - 0.5f;
float fy = y / (float)DIM - 0.5f;
unsigned char green = + * sin(abs(fx * ) - abs(fy * ); // accessing uchar4 vs unsigned char*
ptr[offset].x = ;
ptr[offset].y = green;
ptr[offset].z = ;
ptr[offset].w = ;
} static void key_func(unsigned char key, int x, int y)
{
switch (key)
{
case :
// clean up OpenGL and CUDA
cudaGraphicsUnregisterResource(resource);
glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, );
glDeleteBuffers(, &bufferObj);
exit();
}
} static void draw_func(void)
{
// we pass zero as the last parameter, because out bufferObj is now
// the source, and the field switches from being a pointer to a
// bitmap to now mean an offset into a bitmap object
glDrawPixels(DIM, DIM, GL_RGBA, GL_UNSIGNED_BYTE, );
glutSwapBuffers();
} int main(int argc, char **argv)
{
cudaDeviceProp prop;
int dev; memset(&prop, , sizeof(cudaDeviceProp));
prop.major = ;
prop.minor = ;
cudaChooseDevice(&dev, &prop); // tell CUDA which dev we will be using for graphic interop
// from the programming guide: Interoperability with OpenGL
// requires that the CUDA device be specified by
// cudaGLSetGLDevice() before any other runtime calls. cudaGLSetGLDevice(dev); // these GLUT calls need to be made before the other OpenGL
// calls, else we get a seg fault
glutInit(&argc, argv);
glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGBA);
glutInitWindowSize(DIM, DIM);
glutCreateWindow("bitmap");//初始化并创建一个窗口 //创建缓冲区
glBindBuffer = (PFNGLBINDBUFFERARBPROC)GET_PROC_ADDRESS("glBindBuffer");
glDeleteBuffers = (PFNGLDELETEBUFFERSARBPROC)GET_PROC_ADDRESS("glDeleteBuffers");
glGenBuffers = (PFNGLGENBUFFERSARBPROC)GET_PROC_ADDRESS("glGenBuffers");
glBufferData = (PFNGLBUFFERDATAARBPROC)GET_PROC_ADDRESS("glBufferData"); // the first three are standard OpenGL, the 4th is the CUDA reg
// of the bitmap these calls exist starting in OpenGL 1.5
glGenBuffers(, &bufferObj);// 将bufferObj注册为图形资源
glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, bufferObj);
glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, DIM * DIM * , NULL, GL_DYNAMIC_DRAW_ARB); cudaGraphicsGLRegisterBuffer(&resource, bufferObj, cudaGraphicsMapFlagsNone); // do work with the memory dst being on the GPU, gotten via mapping
cudaGraphicsMapResources(, &resource, NULL);
uchar4* devPtr;
size_t size;
cudaGraphicsResourceGetMappedPointer((void**)&devPtr,&size,resource); dim3 grids(DIM / , DIM / );
dim3 threads(, );
kernel << <grids, threads >> >(devPtr);
cudaGraphicsUnmapResources(, &resource, NULL); // set up GLUT and kick off main loop
glutKeyboardFunc(key_func);
glutDisplayFunc(draw_func);
glutMainLoop(); getchar();
return;
}

● ripple

 #include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda.h"
#include "cuda_gl_interop.h"
#include "D:\Code\CUDA\book\common\book.h"
#include "D:\Code\CUDA\book\common\gpu_anim.h" #define DIM 1024 __global__ void kernel(uchar4 *ptr, int ticks)
{
// map from threadIdx/BlockIdx to pixel position
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x; // now calculate the value at that position
float fx = x - DIM / ;
float fy = y - DIM / ;
float d = sqrtf(fx * fx + fy * fy);
unsigned char grey = (unsigned char)(128.0f + 127.0f *cos(d / 10.0f - ticks / 7.0f) / (d / 10.0f + 1.0f));
ptr[offset].x = grey;
ptr[offset].y = grey;
ptr[offset].z = grey;
ptr[offset].w = ;
} void generate_frame(uchar4 *pixels, void*, int ticks)
{
dim3 grids(DIM / , DIM / );
dim3 threads(, );
kernel << <grids, threads >> >(pixels, ticks);
} int main(void)
{
GPUAnimBitmap bitmap(DIM, DIM, NULL); bitmap.anim_and_exit((void(*)(uchar4*, void*, int))generate_frame, NULL); getchar();
return;
}

● heat

 #include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "cuda.h"
#include "cuda_gl_interop.h"
#include "D:\Code\CUDA\book\common\book.h"
#include "D:\Code\CUDA\book\common\gpu_anim.h" #define DIM 1024
#define MAX_TEMP 1.0f
#define MIN_TEMP 0.0001f
#define SPEED 0.25f // these exist on the GPU side
texture<float> texConstSrc;
texture<float> texIn;
texture<float> texOut; // this kernel takes in a 2-d array of floats
// it updates the value-of-interest by a scaled value based
// on itself and its nearest neighbors
__global__ void blend_kernel(float *dst,bool dstOut)
{
// map from threadIdx/BlockIdx to pixel position
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x; int left = offset - ;
int right = offset + ;
if (x == ) left++;
if (x == DIM - ) right--; int top = offset - DIM;
int bottom = offset + DIM;
if (y == ) top += DIM;
if (y == DIM - ) bottom -= DIM; float t, l, c, r, b;
if (dstOut) {
t = tex1Dfetch(texIn, top);
l = tex1Dfetch(texIn, left);
c = tex1Dfetch(texIn, offset);
r = tex1Dfetch(texIn, right);
b = tex1Dfetch(texIn, bottom); }
else {
t = tex1Dfetch(texOut, top);
l = tex1Dfetch(texOut, left);
c = tex1Dfetch(texOut, offset);
r = tex1Dfetch(texOut, right);
b = tex1Dfetch(texOut, bottom);
}
dst[offset] = c + SPEED * (t + b + r + l - * c);
} // NOTE - texOffsetConstSrc could either be passed as a
// parameter to this function, or passed in __constant__ memory
// if we declared it as a global above, it would be
// a parameter here:
// __global__ void copy_const_kernel( float *iptr,
// size_t texOffset )
__global__ void copy_const_kernel(float *iptr)
{
// map from threadIdx/BlockIdx to pixel position
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x; float c = tex1Dfetch(texConstSrc, offset);
if (c != )
iptr[offset] = c;
} // globals needed by the update routine
struct DataBlock
{
float *dev_inSrc;
float *dev_outSrc;
float *dev_constSrc;
cudaEvent_t start, stop;
float totalTime;
float frames;
}; void anim_gpu(uchar4* outputBitmap, DataBlock *d, int ticks)
{
cudaEventRecord(d->start, );
dim3 blocks(DIM / , DIM / );
dim3 threads(, ); // since tex is global and bound, we have to use a flag to
// select which is in/out per iteration
volatile bool dstOut = true;
for (int i = ; i<; i++)
{
float *in, *out;
if (dstOut)
{
in = d->dev_inSrc;
out = d->dev_outSrc;
}
else
{
out = d->dev_inSrc;
in = d->dev_outSrc;
}
copy_const_kernel << <blocks, threads >> >(in);
blend_kernel << <blocks, threads >> >(out, dstOut);
dstOut = !dstOut;
}
float_to_color << <blocks, threads >> >(outputBitmap,d->dev_inSrc); cudaEventRecord(d->stop, );
cudaEventSynchronize(d->stop);
float elapsedTime;
cudaEventElapsedTime(&elapsedTime,d->start, d->stop);
d->totalTime += elapsedTime;
++d->frames;
printf("Average Time per frame: %3.1f ms\n",d->totalTime / d->frames);
} // clean up memory allocated on the GPU
void anim_exit(DataBlock *d)
{
cudaUnbindTexture(texIn);
cudaUnbindTexture(texOut);
cudaUnbindTexture(texConstSrc);
cudaFree(d->dev_inSrc);
cudaFree(d->dev_outSrc);
cudaFree(d->dev_constSrc);
cudaEventDestroy(d->start);
cudaEventDestroy(d->stop);
} int main(void)
{
DataBlock data;
GPUAnimBitmap bitmap(DIM, DIM, &data);
data.totalTime = ;
data.frames = ;
cudaEventCreate(&data.start);
cudaEventCreate(&data.stop); int imageSize = bitmap.image_size(); cudaMalloc((void**)&data.dev_inSrc, imageSize);
cudaMalloc((void**)&data.dev_outSrc, imageSize);
cudaMalloc((void**)&data.dev_constSrc, imageSize);
cudaBindTexture(NULL, texConstSrc, data.dev_constSrc, imageSize);
cudaBindTexture(NULL, texIn, data.dev_inSrc, imageSize);
cudaBindTexture(NULL, texOut, data.dev_outSrc, imageSize); float *temp = (float*)malloc(imageSize);
for (int i = ; i < DIM*DIM; i++)// 恒温格点数据
{
temp[i] = ;
int x = i % DIM;
int y = i / DIM;
if ((x >= ) && (x < ) && (y >= ) && (y < ))
temp[i] = MAX_TEMP;
if ((x >= ) && (x < ) && (y >= ) && (y < ))
temp[i] = MIN_TEMP;
}
cudaMemcpy(data.dev_constSrc, temp,imageSize,cudaMemcpyHostToDevice); for (int i = ; i < DIM*DIM; i++)// 初始温度场数据
{
temp[i] = 0.5;
int x = i % DIM;
int y = i / DIM;
if ((x >= ) && (x < ) && (y >= ) && (y < ))
temp[i] = MAX_TEMP;
}
cudaMemcpy(data.dev_inSrc, temp,imageSize,cudaMemcpyHostToDevice);
free(temp); bitmap.anim_and_exit((void(*)(uchar4*, void*, int))anim_gpu,(void(*)(void*))anim_exit);
getchar();
return;
}