Optimizing Matrix Multiplication with OpenACC

Date: 2023-03-09 22:33:51

▶ Following the steps in the book, optimize matrix multiplication with different OpenACC directives

● The optimized code
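All six variants below share one data-management skeleton: the arrays are created on the device once, every compute region asserts they are already present, and the results come back in a single transfer at the end. A minimal sketch of that pattern, using the same directives as the full program:

#pragma acc enter data create(a, b, c)  // allocate device copies once, no host-device transfer
#pragma acc kernels present(a, b, c)    // each method reuses the resident device arrays
{
    /* one of the matrix-multiply variants */
}
#pragma acc exit data copyout(a, b, c)  // one device-to-host transfer, at the very end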

#include <iostream>
#include <cstdio>
#include <cstdlib>
#include <chrono>

#define SIZE 1024

using namespace std;
using namespace std::chrono;

// Declaring these four arrays inside main() overflows the stack and dies with
// "Segmentation fault (core dumped)", so they are file-scope globals.
double a[SIZE][SIZE], b[SIZE][SIZE], c[SIZE][SIZE], d[SIZE][SIZE];

int main()
{
    // int i, j, k; // Declaring i/j/k and tmp only where they are used inside the loops made the runs slower
    double tmp;

#pragma acc enter data create(a, b, c)
#pragma acc kernels present(a, b, c)
    {
        for (int i = 0; i < SIZE; i++) // initialize a and b
        {
            for (int j = 0; j < SIZE; j++)
                a[i][j] = (double)(i + j);
        }
        for (int i = 0; i < SIZE; i++)
        {
            for (int j = 0; j < SIZE; j++)
                b[i][j] = (double)(i - j);
        }
        for (int i = 0; i < SIZE; i++) // c must be cleared before each method
        {
            for (int j = 0; j < SIZE; j++)
                c[i][j] = 0.0;
        }
    }

    high_resolution_clock::time_point t1 = high_resolution_clock::now();
#pragma acc kernels present(a, b, c) // Method 1: let the compiler decide (auto) on every loop level
    {
#pragma acc loop auto
        for (int i = 0; i < SIZE; i++)
        {
#pragma acc loop auto
            for (int j = 0; j < SIZE; j++)
            {
#pragma acc loop auto
                for (int k = 0; k < SIZE; k++)
                    c[i][j] += a[i][k] * b[k][j];
            }
        }
    }
    high_resolution_clock::time_point t2 = high_resolution_clock::now();
    duration<double> time = duration_cast<duration<double>>(t2 - t1);
    printf("Time OpenACC - Auto: %.6lf s.\n\n", time.count());

#pragma acc kernels present(c)
    for (int i = 0; i < SIZE; i++)
    {
        for (int j = 0; j < SIZE; j++)
            c[i][j] = 0.0;
    }

    t1 = high_resolution_clock::now();
#pragma acc kernels present(a, b, c) // Method 2: independent on the two outer loops, innermost sequential
    {
#pragma acc loop independent
        for (int i = 0; i < SIZE; i++)
        {
#pragma acc loop independent
            for (int j = 0; j < SIZE; j++)
            {
                // Every k iteration accumulates into the same c[i][j], so the k
                // loop must not be marked independent; it runs seq on the device.
#pragma acc loop seq
                for (int k = 0; k < SIZE; k++)
                    c[i][j] += a[i][k] * b[k][j];
            }
        }
    }
    t2 = high_resolution_clock::now();
    time = duration_cast<duration<double>>(t2 - t1);
    printf("Time OpenACC - Independent Seq: %.6lf s.\n\n", time.count());

#pragma acc kernels present(c)
    for (int i = 0; i < SIZE; i++)
    {
        for (int j = 0; j < SIZE; j++)
            c[i][j] = 0.0;
    }

    t1 = high_resolution_clock::now();
#pragma acc kernels present(a, b, c) // Method 3: independent on the two outer loops, reduction on the innermost
    {
#pragma acc loop independent
        for (int i = 0; i < SIZE; i++)
        {
#pragma acc loop independent
            for (int j = 0; j < SIZE; j++)
            {
                tmp = 0.0;
#pragma acc loop reduction(+: tmp)
                for (int k = 0; k < SIZE; k++)
                    tmp += a[i][k] * b[k][j];
                c[i][j] = tmp;
            }
        }
    }
    t2 = high_resolution_clock::now();
    time = duration_cast<duration<double>>(t2 - t1);
    printf("Time OpenACC - Independent Reduction: %.6lf s.\n\n", time.count());

#pragma acc kernels present(c)
    for (int i = 0; i < SIZE; i++)
    {
        for (int j = 0; j < SIZE; j++)
            c[i][j] = 0.0;
    }

    t1 = high_resolution_clock::now();
#pragma acc kernels present(a, b, c) // Method 4: specify gang and vector sizes by hand
    {
#pragma acc loop gang(32)
        for (int i = 0; i < SIZE; i++)
        {
#pragma acc loop vector(16)
            for (int j = 0; j < SIZE; j++)
            {
                tmp = 0.0;
#pragma acc loop reduction(+: tmp)
                for (int k = 0; k < SIZE; k++)
                    tmp += a[i][k] * b[k][j];
                c[i][j] = tmp;
            }
        }
    }
    t2 = high_resolution_clock::now();
    time = duration_cast<duration<double>>(t2 - t1);
    printf("Time OpenACC - Gang Vector: %.6lf s.\n\n", time.count());

#pragma acc kernels present(c)
    for (int i = 0; i < SIZE; i++)
    {
        for (int j = 0; j < SIZE; j++)
            c[i][j] = 0.0;
    }

    t1 = high_resolution_clock::now();
#pragma acc kernels present(a, b, c) // Method 5: tile the two outer loops
    {
#pragma acc loop tile(32, 32)
        for (int i = 0; i < SIZE; i++)
        {
            for (int j = 0; j < SIZE; j++)
            {
                tmp = 0.0;
#pragma acc loop reduction(+: tmp)
                for (int k = 0; k < SIZE; ++k)
                    tmp += a[i][k] * b[k][j];
                c[i][j] = tmp;
            }
        }
    }
    t2 = high_resolution_clock::now();
    time = duration_cast<duration<double>>(t2 - t1);
    printf("Time OpenACC - tile: %.6lf s.\n\n", time.count());

#pragma acc kernels present(c)
    for (int i = 0; i < SIZE; i++)
    {
        for (int j = 0; j < SIZE; j++)
            c[i][j] = 0.0;
    }

    t1 = high_resolution_clock::now();
#pragma acc kernels present(a, b, c) // Method 6: collapse the two outer loops into one iteration space
    {
#pragma acc loop collapse(2) independent
        for (int i = 0; i < SIZE; i++)
        {
            for (int j = 0; j < SIZE; j++)
            {
                tmp = 0.0;
#pragma acc loop reduction(+: tmp)
                for (int k = 0; k < SIZE; k++)
                    tmp += a[i][k] * b[k][j];
                c[i][j] = tmp;
            }
        }
    }
    t2 = high_resolution_clock::now();
    time = duration_cast<duration<double>>(t2 - t1);
    printf("Time OpenACC - Collapse: %.6lf s.\n\n", time.count());

#pragma acc exit data copyout(a, b, c)

#pragma omp parallel for shared(d)
    for (int i = 0; i < SIZE; i++)
    {
        for (int j = 0; j < SIZE; j++)
            d[i][j] = 0.0;
    }

    t1 = high_resolution_clock::now();
#pragma omp parallel for default(none) shared(a, b, d) // OpenMP reference version on the CPU
    for (int i = 0; i < SIZE; i++)
    {
        for (int j = 0; j < SIZE; j++)
        {
            for (int k = 0; k < SIZE; k++)
                d[i][j] += a[i][k] * b[k][j];
        }
    }
    t2 = high_resolution_clock::now();
    time = duration_cast<duration<double>>(t2 - t1);
    printf("Time OpenMP: %.6lf s.\n\n", time.count());

    for (int i = 0; i < SIZE; i++) // verify the OpenACC result against the OpenMP result
    {
        for (int j = 0; j < SIZE; j++)
        {
            if (c[i][j] != d[i][j])
                printf("\nError at [%d, %d], c = %f, d = %f\n", i, j, c[i][j], d[i][j]);
        }
    }
    return 0;
}
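One caveat about the result check: c[i][j] != d[i][j] compares doubles exactly. It happens to pass here, but the compile log below reports FMA (fused multiply-add) instructions, and FMA contraction can make GPU and CPU sums differ in the last bits. A tolerance-based comparison is more robust; this is only a sketch, and the threshold is an arbitrary choice of mine, not something from the original code:

#include <cmath>

// Sketch: relative-tolerance comparison instead of exact equality.
// The 1e-9 factor is an assumed tolerance; tune it to the problem.
bool close_enough(double x, double y)
{
    return std::fabs(x - y) <= 1e-9 * (std::fabs(x) + std::fabs(y) + 1.0);
}

// usage: if (!close_enough(c[i][j], d[i][j])) printf("Error at [%d, %d]...\n", i, j);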

● Output (before the data-management optimization)

cuan@CUAN:/mnt/d/Code/OpenACC/OpenACCProject/OpenACCProject$ ./acc.exe

Time OpenACC - Auto: 4.589736 s.

Time OpenACC - Independent Seq: 4.823721 s.

Time OpenACC - Independent Reduction: 3.669336 s.

Time OpenACC - Gang Vector: 3.611391 s.

Time OpenACC - tile: 3.609573 s.

Time OpenACC - Collapse: 3.605792 s.

Time OpenMP: 4.345018 s.
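These numbers are dominated by data movement rather than arithmetic. Without the enter data / present directives, an aggregate variable in a kernels construct defaults to copy semantics, so every one of the six regions transfers a, b, and c to the device and back again. Conceptually, each method was paying for something like this (a sketch of the default behavior, not code from the program above):

#pragma acc kernels copy(a, b, c)  // implicit host-to-device copy before, device-to-host copy after
{
    /* matrix multiply */
}

Hoisting the transfers out with enter data create / exit data copyout is what collapses the times in the next section from seconds to milliseconds.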

● Output (after the data-management optimization)

cuan@CUAN:~/acc$ pgc++ main.cpp -std=c++11 -acc -mp -Minfo -o main.exe
main:
    include "chrono"
    include "chrono"
    Parallel region activated
    Parallel region terminated
    Parallel region activated
    Parallel region terminated
    Generating enter data create(b[:][:],c[:][:],a[:][:])
    Generating present(a[:][:],b[:][:],c[:][:])
    Loop is parallelizable
    Loop is parallelizable
    Generating Tesla code
        #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
        #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
    Loop is parallelizable
    Loop is parallelizable
    Generating Tesla code
        #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
        #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
    Loop is parallelizable
    Loop is parallelizable
    Generating Tesla code
        #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
        #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
    Memory zero idiom, loop replaced by call to __c_mzero8
    Generating present(a[:][:],c[:][:],b[:][:])
    Loop is parallelizable
    Loop is parallelizable
    Complex loop carried dependence of c prevents parallelization
    Loop carried dependence of c prevents parallelization
    Loop carried backward dependence of c prevents vectorization
    Inner sequential loop scheduled on accelerator
    Generating Tesla code
        #pragma acc loop gang /* blockIdx.y */
        #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
        #pragma acc loop seq
    Complex loop carried dependence of c prevents parallelization
    Loop carried backward dependence of c prevents vectorization
    Generating present(c[:][:])
    Loop is parallelizable
    Loop is parallelizable
    Generating Tesla code
        #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
        #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
    Memory zero idiom, loop replaced by call to __c_mzero8
    Generating present(a[:][:],c[:][:],b[:][:])
    Loop is parallelizable
    Loop is parallelizable
    Loop is parallelizable
    Generating Tesla code
        #pragma acc loop gang /* blockIdx.z */
        #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
        #pragma acc loop gang /* blockIdx.y */
    Generating present(c[:][:])
    Loop is parallelizable
    Loop is parallelizable
    Generating Tesla code
        #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
        #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
    Memory zero idiom, loop replaced by call to __c_mzero8
    Generating present(a[:][:],c[:][:],b[:][:])
    Loop is parallelizable
    Loop is parallelizable
    Generating Tesla code
        #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
        #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
        #pragma acc loop seq
    FMA (fused multiply-add) instruction(s) generated
    Loop is parallelizable
    Generating present(c[:][:])
    Loop is parallelizable
    Loop is parallelizable
    Generating Tesla code
        #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
        #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
    Memory zero idiom, loop replaced by call to __c_mzero8
    Generating present(a[:][:],c[:][:],b[:][:])
    Loop is parallelizable
    Loop is parallelizable
    Generating Tesla code
        #pragma acc loop gang(32), vector(8) /* blockIdx.y threadIdx.y */
        #pragma acc loop gang, vector(16) /* blockIdx.x threadIdx.x */
        #pragma acc loop seq
    Loop is parallelizable
    Generating present(c[:][:])
    Loop is parallelizable
    Loop is parallelizable
    Generating Tesla code
        #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
        #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
    Memory zero idiom, loop replaced by call to __c_mzero8
    Generating present(a[:][:],c[:][:],b[:][:])
    Loop is parallelizable
    Loop is parallelizable
    Generating Tesla code
        #pragma acc loop gang, vector tile(32,32) /* blockIdx.x threadIdx.x */
        /* blockIdx.x threadIdx.x tiled */
        #pragma acc loop seq
    Loop is parallelizable
    Generating present(c[:][:])
    Loop is parallelizable
    Loop is parallelizable
    Generating Tesla code
        #pragma acc loop gang, vector(4) /* blockIdx.y threadIdx.y */
        #pragma acc loop gang, vector(32) /* blockIdx.x threadIdx.x */
    Memory zero idiom, loop replaced by call to __c_mzero8
    Generating present(a[:][:],c[:][:],b[:][:])
    Loop is parallelizable
    Loop is parallelizable
    Generating Tesla code
        #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */
        /* blockIdx.x threadIdx.x collapsed */
        #pragma acc loop seq
    Loop is parallelizable
    Generating exit data copyout(c[:][:],b[:][:],a[:][:])
    Parallel loop activated with static block schedule
    Memory zero idiom, loop replaced by call to __c_mzero8
    Barrier
    Parallel loop activated with static block schedule
    FMA (fused multiply-add) instruction(s) generated
    Barrier
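Two messages in the log above are worth pausing on. For method 2, the compiler reports "Complex loop carried dependence of c prevents parallelization" and schedules the inner k loop seq: every k iteration accumulates into the same c[i][j], so that loop cannot be parallelized as written. Method 3 removes the dependence on c by accumulating into the scalar tmp with reduction(+: tmp), which lets the compiler fall back to the same 32x4 block schedule it chose for the initialization loops; that is why the reduction version below runs roughly three times faster than the sequential-inner version.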
cuan@CUAN:~/acc$ ./main.exe
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main grid=32x256 block=32x4
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main grid=32x256 block=32x4
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main grid=32x256 block=32x4
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main grid=8x1024

Time OpenACC - Auto: 0.018726 s.

launch CUDA kernel file=/home/cuan/acc/main.cpp function=main grid=32x256 block=32x4
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main grid=8x1024x4

Time OpenACC - Independent Seq: 0.040719 s.

launch CUDA kernel file=/home/cuan/acc/main.cpp function=main grid=32x256 block=32x4
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main grid=32x256 block=32x4

Time OpenACC - Independent Reduction: 0.012491 s.

launch CUDA kernel file=/home/cuan/acc/main.cpp function=main grid=32x256 block=32x4
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main grid=64x32 block=16x8

Time OpenACC - Gang Vector: 0.012314 s.

launch CUDA kernel file=/home/cuan/acc/main.cpp function=main grid=32x256 block=32x4
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main

Time OpenACC - tile: 0.013609 s.

launch CUDA kernel file=/home/cuan/acc/main.cpp function=main grid=32x256 block=32x4
launch CUDA kernel file=/home/cuan/acc/main.cpp function=main

Time OpenACC - Collapse: 0.012676 s.

Time OpenMP: 0.504436 s.

Accelerator Kernel Timing data
/home/cuan/acc/main.cpp
    (per-kernel timing table: each compute and data region listed with its grid/block shape and total/max/min/avg device and elapsed times in microseconds)
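A side note on reading this output: the "launch CUDA kernel" lines and the "Accelerator Kernel Timing data" table are not printed by the program itself; they come from the PGI runtime's built-in profiling hooks, controlled (as far as I can tell) by the ACC_NOTIFY and PGI_ACC_TIME environment variables, so unsetting those gives clean output.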