Hi, I am trying to accelerate a nested DO loop using loop tiling with the OpenACC tile() clause. However, I am not able to obtain the speedup that loop tiling is expected to give. I also tried tiling the loops manually, without using the built-in tile() clause of OpenACC; that did not help. I also experimented with several tile sizes, and that did not help either. I am even seeing a slowdown instead of a speedup. What could be the possible reason?
My program is as follows:
!> Micro-benchmark comparing three OpenACC schedules for the same stencil:
!>   setup 1  - the OpenACC tile() clause,
!>   setup 2  - hand-written tiling (gang loop over tiles, vector loop inside),
!>   baseline - a plain collapse(3) gang-vector loop.
!>
!> The stencil is the five-point difference
!>   2/3 * (f(+1) - f(-1)) - 1/12 * (f(+2) - f(-2))
!> applied along x, y and z to every component of `prim`.
!> Each variant issues one parallel construct per component and reports the
!> wall-clock time of the host loop that issues them.
program test_loop_tiling
  use, intrinsic :: iso_fortran_env, only: dp => real64, int64
  implicit none

  ! Grid extents and number of field components (4th array dimension).
  integer, parameter :: nx = 256, ny = 256, nz = 256
  integer, parameter :: nvar = 5
  ! Stencil half-width. The stencil reads index +-2, so the loops must stay
  ! ng points away from every array boundary; looping over 2..n-1 would read
  ! index 0 and n+1, which lie outside the allocated bounds 1..n.
  integer, parameter :: ng = 2
  ! Tile extents; tileX*tileY*tileZ iterations make up one tile.
  ! NOTE(review): 32*32*1 = 1024 iterations per tile is large; smaller tiles
  ! (e.g. 32x4x1) are worth trying - confirm with -Minfo=accel and a profiler.
  integer, parameter :: tileX = 32, tileY = 32, tileZ = 1
  ! Stencil weights.
  real(dp), parameter :: c1 = 2.0_dp / 3.0_dp
  real(dp), parameter :: c2 = 1.0_dp / 12.0_dp

  real(dp), allocatable :: prim(:,:,:,:), prim_x(:,:,:,:), prim_y(:,:,:,:), prim_z(:,:,:,:)
  integer :: i, j, k, ii, jj, kk, p
  ! 64-bit counters so the elapsed time is not limited by the default-integer clock.
  integer(int64) :: start_clock, end_clock, clock_rate
  real(dp) :: elapsed_time

  allocate(prim(nx,ny,nz,nvar), prim_x(nx,ny,nz,nvar), prim_y(nx,ny,nz,nvar), prim_z(nx,ny,nz,nvar))

  ! Dummy initialization: component p holds the constant value p.
  do p = 1, nvar
    prim(:,:,:,p) = real(p, dp)
  end do

  ! Place the data on the device before timing. Only `prim` carries input
  ! values; the derivative arrays are pure outputs, so they are created on
  ! the device instead of copying uninitialized host memory.
  !$acc data copyin(prim) create(prim_x, prim_y, prim_z)

  ! NOTE(review): every variant is timed exactly once, so each measurement may
  ! include one-time launch costs of its kernel; consider an untimed first
  ! pass or averaging over several repetitions before drawing conclusions.
  ! No async clause is used, so the host waits for each parallel construct
  ! and system_clock brackets the full execution.

  ! ********************* WITH TILING (tile clause) *********************
  call system_clock(start_clock, clock_rate)
  do p = 1, nvar
    !$acc parallel loop gang vector tile(tileX, tileY, tileZ) default(present)
    do k = 1+ng, nz-ng
      do j = 1+ng, ny-ng
        do i = 1+ng, nx-ng
          prim_x(i,j,k,p) = c1 * (prim(i+1,j,k,p) - prim(i-1,j,k,p)) &
                          - c2 * (prim(i+2,j,k,p) - prim(i-2,j,k,p))
          prim_y(i,j,k,p) = c1 * (prim(i,j+1,k,p) - prim(i,j-1,k,p)) &
                          - c2 * (prim(i,j+2,k,p) - prim(i,j-2,k,p))
          prim_z(i,j,k,p) = c1 * (prim(i,j,k+1,p) - prim(i,j,k-1,p)) &
                          - c2 * (prim(i,j,k+2,p) - prim(i,j,k-2,p))
        end do
      end do
    end do
  end do
  call system_clock(end_clock)
  elapsed_time = real(end_clock - start_clock, dp) / real(clock_rate, dp)
  print *, 'Loop execution time (s) with loop tiling (setup 1): ', elapsed_time

  ! ********************* WITH TILING (manual) *********************
  ! Outer loops step from tile origin to tile origin (gang); inner loops
  ! cover one tile (vector), clipped with min() at the upper interior edge.
  call system_clock(start_clock, clock_rate)
  do p = 1, nvar
    !$acc parallel loop collapse(3) gang default(present)
    do kk = 1+ng, nz-ng, tileZ
      do jj = 1+ng, ny-ng, tileY
        do ii = 1+ng, nx-ng, tileX
          !$acc loop collapse(3) vector
          do k = kk, min(kk+tileZ-1, nz-ng)
            do j = jj, min(jj+tileY-1, ny-ng)
              do i = ii, min(ii+tileX-1, nx-ng)
                prim_x(i,j,k,p) = c1 * (prim(i+1,j,k,p) - prim(i-1,j,k,p)) &
                                - c2 * (prim(i+2,j,k,p) - prim(i-2,j,k,p))
                prim_y(i,j,k,p) = c1 * (prim(i,j+1,k,p) - prim(i,j-1,k,p)) &
                                - c2 * (prim(i,j+2,k,p) - prim(i,j-2,k,p))
                prim_z(i,j,k,p) = c1 * (prim(i,j,k+1,p) - prim(i,j,k-1,p)) &
                                - c2 * (prim(i,j,k+2,p) - prim(i,j,k-2,p))
              end do
            end do
          end do
        end do
      end do
    end do
  end do
  call system_clock(end_clock)
  elapsed_time = real(end_clock - start_clock, dp) / real(clock_rate, dp)
  print *, 'Loop execution time (s) with loop tiling (setup 2): ', elapsed_time

  ! ********************* NO TILING *********************
  call system_clock(start_clock, clock_rate)
  do p = 1, nvar
    !$acc parallel loop gang vector collapse(3) default(present)
    do k = 1+ng, nz-ng
      do j = 1+ng, ny-ng
        do i = 1+ng, nx-ng
          prim_x(i,j,k,p) = c1 * (prim(i+1,j,k,p) - prim(i-1,j,k,p)) &
                          - c2 * (prim(i+2,j,k,p) - prim(i-2,j,k,p))
          prim_y(i,j,k,p) = c1 * (prim(i,j+1,k,p) - prim(i,j-1,k,p)) &
                          - c2 * (prim(i,j+2,k,p) - prim(i,j-2,k,p))
          prim_z(i,j,k,p) = c1 * (prim(i,j,k+1,p) - prim(i,j,k-1,p)) &
                          - c2 * (prim(i,j,k+2,p) - prim(i,j,k-2,p))
        end do
      end do
    end do
  end do
  call system_clock(end_clock)
  elapsed_time = real(end_clock - start_clock, dp) / real(clock_rate, dp)
  print *, 'Loop execution time (s) without loop tiling: ', elapsed_time

  !$acc end data

  deallocate(prim, prim_x, prim_y, prim_z)
end program test_loop_tiling
And here are the results in terms of wall-clock time:
Loop execution time (s) with loop tiling (setup 1): 4.0629999712109566E-003
Loop execution time (s) with loop tiling (setup 2): 2.8250000905245543E-003
Loop execution time (s) without loop tiling: 2.8329999186098576E-003
The following compilation settings were used:
# Makefile to compile tile_test.f90 using nvfortran with OpenACC.
# Data movement is handled by the explicit "!$acc data" region in the source;
# no managed-memory option is enabled here.

# Compiler
FC = nvfortran

# Flags
#   -acc          enable OpenACC directives
#   -fast         general optimization bundle
#   -Minfo=accel  report how each accelerated loop was scheduled
FFLAGS = -acc -fast -Minfo=accel

# Target executable name
TARGET = a.out

# Source file
SRC = tile_test.f90

# `all` and `clean` are commands, not files.
.PHONY: all clean

# Default rule
all: $(TARGET)

# Recipe lines must begin with a tab character.
$(TARGET): $(SRC)
	$(FC) $(FFLAGS) -o $@ $<

clean:
	rm -f $(TARGET) *.o *.mod
The code was run on a single A100 GPU.
Thanks in advance.