OpenACC parallel

2018/08/03 13:59
阅读数 190

▶ 使用 kernels 导语并行化 for 循环

● 同一段代码,使用 kernels,parallel 和 parallel + loop 进行对比

 1 #include <stdio.h>
 2 #include <time.h>
 3 #include <openacc.h>
 4 
 5 const int row = 65536;
 6 
 7 int main(void)  // Times c = a + b over `row` ints; built with or without OpenACC
 8 {
 9     int i, a[row], b[row], c[row];
10     clock_t elapsed;  // clock() ticks; not named `time`, which would shadow time()
11     for (i = 0; i < row; i++)
12         a[i] = b[i] = i;
13 
14 #ifdef _OPENACC    
15     elapsed = clock();
16 #pragma acc kernels     // enable exactly one of: kernels, parallel, or parallel + loop
17 // #pragma acc parallel
18 // #pragma acc loop
19     for (i = 0; i < row; i++)
20         c[i] = a[i] + b[i];
21     elapsed = clock() - elapsed;
22     printf("\nTime with acc:%.0f ms\n", (double)elapsed * 1000.0 / CLOCKS_PER_SEC);  // ticks -> ms
23 #else
24     elapsed = clock();
25     for (i = 0; i < row; i++)
26         c[i] = a[i] + b[i];
27     elapsed = clock() - elapsed;
28     printf("\nTime without acc:%.0f ms\n", (double)elapsed * 1000.0 / CLOCKS_PER_SEC);  // ticks -> ms
29 #endif
30     getchar();  // wait for a key press before exiting
31     return 0;
32 }

● 输出结果

D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc -acc -Minfo main.c -o main_acc_kernels.exe       // kernels
main:
     16, Generating implicit copyin(b[:row])
         Generating implicit copyout(c[:row])
         Generating implicit copyin(a[:row])
     19, Loop is parallelizable
         Accelerator kernel generated
         Generating Tesla code
         19, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */

D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc -acc -Minfo main.c -o main_acc_parallel.exe      // parallel
main:
     17, Accelerator kernel generated
         Generating Tesla code
         19, #pragma acc loop vector(128) /* threadIdx.x */
     17, Generating implicit copyout(c[:row])
         Generating implicit copyin(b[:row],a[:row])
     19, Loop is parallelizable

D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc -acc -Minfo main.c -o main_acc_parallel_loop.exe // parallel + loop
main:
     17, Accelerator kernel generated
         Generating Tesla code
         19, #pragma acc loop gang, vector(128) /* blockIdx.x threadIdx.x */
     17, Generating implicit copyout(c[:row])
         Generating implicit copyin(b[:row],a[:row])

D:\Code\OpenACC\OpenACCProject\OpenACCProject>main_acc_kernels.exe
launch CUDA kernel  file=C:/Program Files (x86)/Windows Kits/10/Include/10.0.16299.0/ucrt\time.h function=main
line=19 device=0 threadid=1 num_gangs=512 num_workers=1 vector_length=128 grid=512 block=128    // 多个 gang,自动配置,线程网格全都是一维的

Time with acc:243 ms


D:\Code\OpenACC\OpenACCProject\OpenACCProject>main_acc_parallel.exe
launch CUDA kernel  file=C:/Program Files (x86)/Windows Kits/10/Include/10.0.16299.0/ucrt\time.h function=main
line=17 device=0 threadid=1 num_gangs=1 num_workers=1 vector_length=128 grid=1 block=128        // 一个 gang,gang冗余模式

Time with acc:245 ms


D:\Code\OpenACC\OpenACCProject\OpenACCProject>main_acc_parallel_loop.exe
launch CUDA kernel  file=C:/Program Files (x86)/Windows Kits/10/Include/10.0.16299.0/ucrt\time.h function=main
line=17 device=0 threadid=1 num_gangs=512 num_workers=1 vector_length=128 grid=512 block=128    // 多个 gang,gang分裂模式

Time with acc:229 ms

● 二重循环,考虑是否在内层循环中使用 loop 导语

 1 #include <stdio.h>
 2 #include <time.h>
 3 #include <openacc.h>
 4 
 5 const int row = 1024, col = 64;
 6 
 7 int main(void)  // Times c = a + b over a row x col int matrix; built with or without OpenACC
 8 {
 9     int i, j, a[row][col], b[row][col], c[row][col];
10     clock_t elapsed;  // clock() ticks; not named `time`, which would shadow time()
11     for (i = 0; i < row; i++)
12     {
13         for (j = 0; j < col; j++)
14             a[i][j] = b[i][j] = i + j;
15     }
16 
17 #ifdef _OPENACC    
18     elapsed = clock();
19 #pragma acc parallel
20 #pragma acc loop
21     for (i = 0; i < row; i++)
22     {
23 // #pragma acc loop
24         for (j = 0; j < col; j++)
25             c[i][j] = a[i][j] + b[i][j];
26     }
27     elapsed = clock() - elapsed;
28     printf("\nTime with acc:%.0f ms\n", (double)elapsed * 1000.0 / CLOCKS_PER_SEC);  // ticks -> ms
29 #else
30     elapsed = clock();
31     for (i = 0; i < row; i++)
32     {
33         for (j = 0; j < col; j++)
34             c[i][j] = a[i][j] + b[i][j];
35     }
36     elapsed = clock() - elapsed;
37     printf("\nTime without acc:%.0f ms\n", (double)elapsed * 1000.0 / CLOCKS_PER_SEC);  // ticks -> ms
38 #endif
39     getchar();  // wait for a key press before exiting
40     return 0;
41 }

● 输出结果

D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc -acc -Minfo main.c -o main_acc_loop1.exe // 仅使用外层 loop
main:
     19, Accelerator kernel generated
         Generating Tesla code
         21, #pragma acc loop gang /* blockIdx.x */
         24, #pragma acc loop vector(128) /* threadIdx.x */
     19, Generating implicit copyin(a[:row][:col])
         Generating implicit copyout(c[:row][:col])
         Generating implicit copyin(b[:row][:col])
     24, Loop is parallelizable

D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc -acc -Minfo main.c -o main_acc_loop2.exe // 内外都使用 loop,优化结果完全相同
main:
     19, Accelerator kernel generated
         Generating Tesla code
         21, #pragma acc loop gang /* blockIdx.x */
         24, #pragma acc loop vector(128) /* threadIdx.x */
     19, Generating implicit copyin(a[:row][:col])
         Generating implicit copyout(c[:row][:col])
         Generating implicit copyin(b[:row][:col])
     24, Loop is parallelizable

D:\Code\OpenACC\OpenACCProject\OpenACCProject>main_acc_loop1.exe
launch CUDA kernel  file=C:/Program Files (x86)/Windows Kits/10/Include/10.0.16299.0/ucrt\time.h function=main
line=19 device=0 threadid=1 num_gangs=1024 num_workers=1 vector_length=128 grid=1024 block=128

Time with acc:251 ms


D:\Code\OpenACC\OpenACCProject\OpenACCProject>main_acc_loop2.exe
launch CUDA kernel  file=C:/Program Files (x86)/Windows Kits/10/Include/10.0.16299.0/ucrt\time.h function=main
line=19 device=0 threadid=1 num_gangs=1024 num_workers=1 vector_length=128 grid=1024 block=128  // 优化结果完全相同

Time with acc:234 ms

● 三重循环,无论仅使用外循环 loop、外中循环 loop,还是外中内循环 loop,获得的编译和运行结果都是相同的,只放上来一个进行讨论

 1 #include <stdio.h>
 2 #include <time.h>
 3 #include <openacc.h>
 4 
 5 const int row = 256, col = 64, page = 4;
 6 
 7 int main(void)  // Times c = a + b over a row x col x page int array; built with or without OpenACC
 8 {
 9     int i, j, k, a[row][col][page], b[row][col][page], c[row][col][page];
10     clock_t elapsed;  // clock() ticks; not named `time`, which would shadow time()
11     for (i = 0; i < row; i++)
12     {
13         for (j = 0; j < col; j++)
14         {
15             for (k = 0; k < page; k++)
16                 a[i][j][k] = b[i][j][k] = i + j + k;
17         }
18     }
19 
20 #ifdef _OPENACC    
21     elapsed = clock();
22 #pragma acc parallel
23 #pragma acc loop
24     for (i = 0; i < row; i++)
25     {
26 //#pragma acc loop
27         for (j = 0; j < col; j++)
28         {
29 //#pragma acc loop
30             for (k = 0; k<page; k++)
31                 c[i][j][k] = a[i][j][k] + b[i][j][k];
32         }
33     }
34     elapsed = clock() - elapsed;
35     printf("\nTime with acc:%.0f ms\n", (double)elapsed * 1000.0 / CLOCKS_PER_SEC);  // ticks -> ms
36 #else
37     elapsed = clock();
38     for (i = 0; i < row; i++)
39     {
40         for (j = 0; j < col; j++)
41         {
42             for (k = 0; k<page; k++)
43                 c[i][j][k] = a[i][j][k] + b[i][j][k];
44         }
45     }
46     elapsed = clock() - elapsed;
47     printf("\nTime without acc:%.0f ms\n", (double)elapsed * 1000.0 / CLOCKS_PER_SEC);  // ticks -> ms
48 #endif
49     getchar();  // wait for a key press before exiting
50     return 0;
51 }

● 输出结果

D:\Code\OpenACC\OpenACCProject\OpenACCProject>pgcc -acc -Minfo main.c -o main_acc_loop.exe
main:
     22, Accelerator kernel generated
         Generating Tesla code
         24, #pragma acc loop gang /* blockIdx.x */ // 外层循环和内层循环被并行化,但中间层循环是串行执行的
         27, #pragma acc loop seq
         30, #pragma acc loop vector(128) /* threadIdx.x */
     22, Generating implicit copyout(c[:row][:col][:page])
         Generating implicit copyin(b[:row][:col][:page],a[:row][:col][:page])
     27, Loop is parallelizable
     30, Loop is parallelizable

D:\Code\OpenACC\OpenACCProject\OpenACCProject>main_acc_loop.exe
launch CUDA kernel  file=C:/Program Files (x86)/Windows Kits/10/Include/10.0.16299.0/ucrt\time.h function=main
line=22 device=0 threadid=1 num_gangs=256 num_workers=1 vector_length=128 grid=256 block=128

Time with acc:226 ms

 

展开阅读全文
加载中
点击引领话题📣 发布并加入讨论🔥
打赏
0 评论
0 收藏
0
分享
返回顶部
顶部