▶ 使用 kernels 导语并行化 for 循环
● 同一段代码,使用 kernels,parallel 和 parallel + loop 进行对比
1 #include <stdio.h>
2 #include <time.h>
3 #include <openacc.h>
4
5 const int row = 65536; // element count of each 1-D array
6 // Benchmark: element-wise add c[i] = a[i] + b[i], timed with clock(), built with or without OpenACC.
7 int main(void)
8 {
9 int i, a[row], b[row], c[row];
10 clock_t time;
11 for (i = 0; i < row; i++)
12 a[i] = b[i] = i;
13 // _OPENACC is predefined by the compiler only when OpenACC support is enabled (e.g. pgcc -acc).
14 #ifdef _OPENACC
15 time = clock();
16 #pragma acc kernels // use kernels, or parallel, or parallel + loop
17 // #pragma acc parallel
18 // #pragma acc loop
19 for (i = 0; i < row; i++)
20 c[i] = a[i] + b[i];
21 time = clock() - time;
22 printf("\nTime with acc:%ld ms\n", (long)(1000.0 * time / CLOCKS_PER_SEC)); // ticks -> ms; clock_t must not be passed to %d
23 #else
24 time = clock();
25 for (i = 0; i < row; i++)
26 c[i] = a[i] + b[i];
27 time = clock() - time;
28 printf("\nTime without acc:%ld ms\n", (long)(1000.0 * time / CLOCKS_PER_SEC)); // ticks -> ms; clock_t must not be passed to %d
29 #endif
30 getchar(); // block until a character is available on stdin before exiting
31 return 0;
32 }
● 输出结果
D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc -acc -Minfo main.c -o main_acc_kernels.exe // kernels
main:
16, Generating implicit copyin(b[:row])
Generating implicit copyout(c[:row])
Generating implicit copyin(a[:row])
19, Loop is parallelizable
Accelerator kernel generated
Generating Tesla code
19, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc -acc -Minfo main.c -o main_acc_parallel.exe // parallel
main:
17, Accelerator kernel generated
Generating Tesla code
19, #pragma acc loop vector(128) /* threadIdx.x */
17, Generating implicit copyout(c[:row])
Generating implicit copyin(b[:row],a[:row])
19, Loop is parallelizable
D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc -acc -Minfo main.c -o main_acc_parallel_loop.exe // parallel + loop
main:
17, Accelerator kernel generated
Generating Tesla code
19, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
17, Generating implicit copyout(c[:row])
Generating implicit copyin(b[:row],a[:row])
D:\Code\OpenACC\OpenACCProject\OpenACCProject>main_acc_kernels.exe
launch CUDA kernel file=C:/Program Files (x86)/Windows Kits/10/Include/10.0.16299.0/ucrt\time.h function=main
line=19 device=0 threadid=1 num_gangs=512 num_workers=1 vector_length=128 grid=512 block=128 // 多个 gang,自动配置,线程网格全都是一维的
Time with acc:243 ms
D:\Code\OpenACC\OpenACCProject\OpenACCProject>main_acc_parallel.exe
launch CUDA kernel file=C:/Program Files (x86)/Windows Kits/10/Include/10.0.16299.0/ucrt\time.h function=main
line=17 device=0 threadid=1 num_gangs=1 num_workers=1 vector_length=128 grid=1 block=128 // 一个 gang,gang冗余模式
Time with acc:245 ms
D:\Code\OpenACC\OpenACCProject\OpenACCProject>main_acc_parallel_loop.exe
launch CUDA kernel file=C:/Program Files (x86)/Windows Kits/10/Include/10.0.16299.0/ucrt\time.h function=main
line=17 device=0 threadid=1 num_gangs=512 num_workers=1 vector_length=128 grid=512 block=128 // 多个 gang,gang分裂模式
Time with acc:229 ms
● 二重循环,考虑是否在内层循环中使用 loop 导语
1 #include <stdio.h>
2 #include <time.h>
3 #include <openacc.h>
4
5 const int row = 1024, col = 64; // extents of the 2-D arrays
6 // Benchmark: element-wise add of two row x col arrays, timed with clock(), built with or without OpenACC.
7 int main(void)
8 {
9 int i, j, a[row][col], b[row][col], c[row][col];
10 clock_t time;
11 for (i = 0; i < row; i++)
12 {
13 for (j = 0; j < col; j++)
14 a[i][j] = b[i][j] = i + j;
15 }
16 // Uncomment the inner "#pragma acc loop" below to build the second variant being compared.
17 #ifdef _OPENACC
18 time = clock();
19 #pragma acc parallel
20 #pragma acc loop
21 for (i = 0; i < row; i++)
22 {
23 // #pragma acc loop
24 for (j = 0; j < col; j++)
25 c[i][j] = a[i][j] + b[i][j];
26 }
27 time = clock() - time;
28 printf("\nTime with acc:%ld ms\n", (long)(1000.0 * time / CLOCKS_PER_SEC)); // ticks -> ms; clock_t must not be passed to %d
29 #else
30 time = clock();
31 for (i = 0; i < row; i++)
32 {
33 for (j = 0; j < col; j++)
34 c[i][j] = a[i][j] + b[i][j];
35 }
36 time = clock() - time;
37 printf("\nTime without acc:%ld ms\n", (long)(1000.0 * time / CLOCKS_PER_SEC)); // ticks -> ms; clock_t must not be passed to %d
38 #endif
39 getchar(); // block until a character is available on stdin before exiting
40 return 0;
41 }
● 输出结果
D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc -acc -Minfo main.c -o main_acc_loop1.exe // 仅使用外层 loop
main:
19, Accelerator kernel generated
Generating Tesla code
21, #pragma acc loop gang /* blockIdx.x */
24, #pragma acc loop vector(128) /* threadIdx.x */
19, Generating implicit copyin(a[:row][:col])
Generating implicit copyout(c[:row][:col])
Generating implicit copyin(b[:row][:col])
24, Loop is parallelizable
D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc -acc -Minfo main.c -o main_acc_loop2.exe // 内外都使用 loop,优化结果完全相同
main:
19, Accelerator kernel generated
Generating Tesla code
21, #pragma acc loop gang /* blockIdx.x */
24, #pragma acc loop vector(128) /* threadIdx.x */
19, Generating implicit copyin(a[:row][:col])
Generating implicit copyout(c[:row][:col])
Generating implicit copyin(b[:row][:col])
24, Loop is parallelizable
D:\Code\OpenACC\OpenACCProject\OpenACCProject>main_acc_loop1.exe
launch CUDA kernel file=C:/Program Files (x86)/Windows Kits/10/Include/10.0.16299.0/ucrt\time.h function=main
line=19 device=0 threadid=1 num_gangs=1024 num_workers=1 vector_length=128 grid=1024 block=128
Time with acc:251 ms
D:\Code\OpenACC\OpenACCProject\OpenACCProject>main_acc_loop2.exe
launch CUDA kernel file=C:/Program Files (x86)/Windows Kits/10/Include/10.0.16299.0/ucrt\time.h function=main
line=19 device=0 threadid=1 num_gangs=1024 num_workers=1 vector_length=128 grid=1024 block=128 // 优化结果完全相同
Time with acc:234 ms
● 三重循环,无论仅使用外循环 loop、外中循环 loop,还是外中内循环 loop,获得的编译和运行结果都是相同的,只放上来一个进行讨论
1 #include <stdio.h>
2 #include <time.h>
3 #include <openacc.h>
4
5 const int row = 256, col = 64, page = 4; // extents of the 3-D arrays
6 // Benchmark: element-wise add of two row x col x page arrays, timed with clock(), built with or without OpenACC.
7 int main(void)
8 {
9 int i, j, k, a[row][col][page], b[row][col][page], c[row][col][page];
10 clock_t time;
11 for (i = 0; i < row; i++)
12 {
13 for (j = 0; j < col; j++)
14 {
15 for (k = 0; k < page; k++)
16 a[i][j][k] = b[i][j][k] = i + j + k;
17 }
18 }
19 // Uncomment the "#pragma acc loop" lines on the middle/inner loops to build the other variants being compared.
20 #ifdef _OPENACC
21 time = clock();
22 #pragma acc parallel
23 #pragma acc loop
24 for (i = 0; i < row; i++)
25 {
26 //#pragma acc loop
27 for (j = 0; j < col; j++)
28 {
29 //#pragma acc loop
30 for (k = 0; k < page; k++)
31 c[i][j][k] = a[i][j][k] + b[i][j][k];
32 }
33 }
34 time = clock() - time;
35 printf("\nTime with acc:%ld ms\n", (long)(1000.0 * time / CLOCKS_PER_SEC)); // ticks -> ms; clock_t must not be passed to %d
36 #else
37 time = clock();
38 for (i = 0; i < row; i++)
39 {
40 for (j = 0; j < col; j++)
41 {
42 for (k = 0; k < page; k++)
43 c[i][j][k] = a[i][j][k] + b[i][j][k];
44 }
45 }
46 time = clock() - time;
47 printf("\nTime without acc:%ld ms\n", (long)(1000.0 * time / CLOCKS_PER_SEC)); // ticks -> ms; clock_t must not be passed to %d
48 #endif
49 getchar(); // block until a character is available on stdin before exiting
50 return 0;
51 }
● 输出结果
D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc -acc -Minfo main.c -o main_acc_loop.exe
main:
22, Accelerator kernel generated
Generating Tesla code
24, #pragma acc loop gang /* blockIdx.x */ // 并行化了外层循环和内层循环,但是中间层使用的是串行
27, #pragma acc loop seq
30, #pragma acc loop vector(128) /* threadIdx.x */
22, Generating implicit copyout(c[:row][:col][:page])
Generating implicit copyin(b[:row][:col][:page],a[:row][:col][:page])
27, Loop is parallelizable
30, Loop is parallelizable
D:\Code\OpenACC\OpenACCProject\OpenACCProject>main_acc_loop.exe
launch CUDA kernel file=C:/Program Files (x86)/Windows Kits/10/Include/10.0.16299.0/ucrt\time.h function=main
line=22 device=0 threadid=1 num_gangs=256 num_workers=1 vector_length=128 grid=256 block=128
Time with acc:226 ms