Loop tiling not giving expected results

Hi, I am trying to accelerate a nested DO loop using loop tiling method with openacc clause: tile(). However I am not able to obtain the expected speedup that the loop tiling should give. I also tried performing loop tiling manually without using the intrinsic tile() clause of OpenACC; did not help. I also experimented with several tile sizes, even that did not help. I am even experiencing a slowdown instead of speedup. What could be the possible reason?

my program is as follow :

PROGRAM test_loop_tiling

	implicit none

	double precision, allocatable :: prim(:,:,:,:), prim_x(:,:,:,:), prim_y(:,:,:,:), prim_z(:,:,:,:)
	integer, parameter :: nx=256, ny=256, nz=256, tileX = 32, tileY = 32, tileZ = 1
	integer :: i,j,k,ii,jj,kk,p
	integer :: start_clock, end_clock, clock_rate
	real*8  :: elapsed_time

	allocate(prim(nx,ny,nz,5), prim_x(nx,ny,nz,5), prim_y(nx,ny,nz,5), prim_z(nx,ny,nz,5))

	! Dummy initialization (optional)
	prim(:,:,:,1) = 1.0d0
	prim(:,:,:,2) = 2.0d0
	prim(:,:,:,3) = 3.0d0
	prim(:,:,:,4) = 4.0d0
	prim(:,:,:,5) = 5.0d0
	


	! Copy data to device before timing
	!$acc data copyin(prim, prim_x, prim_y, prim_z)
	
	! ********************* WITH TILING *********************

	call system_clock(start_clock, clock_rate)
	
	do p = 1,5
		!$acc parallel loop gang vector tile(tileX, tileY, tileZ) default(present)
		do k = 2, nz-1
			do j = 2, ny-1
				do i = 2, nx-1
					prim_x(i,j,k,p) = 2.d0/3.d0 * (prim(i+1,j,k,p) - prim(i-1,j,k,p)) - 1.d0/12.d0 * (prim(i+2,j,k,p) - prim(i-2,j,k,p))
					prim_y(i,j,k,p) = 2.d0/3.d0 * (prim(i,j+1,k,p) - prim(i,j-1,k,p)) - 1.d0/12.d0 * (prim(i,j+2,k,p) - prim(i,j-2,k,p))
					prim_z(i,j,k,p) = 2.d0/3.d0 * (prim(i,j,k+1,p) - prim(i,j,k-1,p)) - 1.d0/12.d0 * (prim(i,j,k+2,p) - prim(i,j,k-2,p))
				end do
			end do
		end do
	end do
	
	call system_clock(end_clock)
	
	elapsed_time = real(end_clock - start_clock) / real(clock_rate)
	print *, 'Loop execution time (s) with loop tiling (setup 1): ', elapsed_time
	
	! ********************* WITH TILING *********************

	call system_clock(start_clock, clock_rate)
	
	do p = 1,5
		!$acc parallel loop collapse(3) gang default(present)
		do kk = 2, nz-1, tileZ
			do jj = 2, ny-1, tileY
				do ii = 2, nx-1, tileX

					!$acc loop collapse(3) vector
					do k = kk, min(kk+tileZ-1, nz-1)
						do j = jj, min(jj+tileY-1, ny-1)
							do i = ii, min(ii+tileX-1, nx-1)
								prim_x(i,j,k,p) = 2.d0/3.d0 * (prim(i+1,j,k,p) - prim(i-1,j,k,p)) - 1.d0/12.d0 * (prim(i+2,j,k,p) - prim(i-2,j,k,p))
								prim_y(i,j,k,p) = 2.d0/3.d0 * (prim(i,j+1,k,p) - prim(i,j-1,k,p)) - 1.d0/12.d0 * (prim(i,j+2,k,p) - prim(i,j-2,k,p))
								prim_z(i,j,k,p) = 2.d0/3.d0 * (prim(i,j,k+1,p) - prim(i,j,k-1,p)) - 1.d0/12.d0 * (prim(i,j,k+2,p) - prim(i,j,k-2,p))
							end do
						end do
					end do

				end do
			end do
		end do
	end do
	
	call system_clock(end_clock)
	
	elapsed_time = real(end_clock - start_clock) / real(clock_rate)
	print *, 'Loop execution time (s) with loop tiling (setup 2): ', elapsed_time
	
	
	
	! ********************* NO TILING *********************
	call system_clock(start_clock, clock_rate)
	
	do p = 1,5
		!$acc parallel loop gang vector collapse(3) default(present)
		do k = 2, nz-1
			do j = 2, ny-1
				do i = 2, nx-1
					prim_x(i,j,k,p) = 2.d0/3.d0 * (prim(i+1,j,k,p) - prim(i-1,j,k,p)) - 1.d0/12.d0 * (prim(i+2,j,k,p) - prim(i-2,j,k,p))
					prim_y(i,j,k,p) = 2.d0/3.d0 * (prim(i,j+1,k,p) - prim(i,j-1,k,p)) - 1.d0/12.d0 * (prim(i,j+2,k,p) - prim(i,j-2,k,p))
					prim_z(i,j,k,p) = 2.d0/3.d0 * (prim(i,j,k+1,p) - prim(i,j,k-1,p)) - 1.d0/12.d0 * (prim(i,j,k+2,p) - prim(i,j,k-2,p))
				end do
			end do
		end do
	end do
	
	call system_clock(end_clock)
	
	elapsed_time = real(end_clock - start_clock) / real(clock_rate)
	print *, 'Loop execution time (s) without loop tiling: ', elapsed_time
	
	
	!$acc end data

END PROGRAM

and the results in terms of wall clock time:

Loop execution time (s) with loop tiling (setup 1):    4.0629999712109566E-003
Loop execution time (s) with loop tiling (setup 2):    2.8250000905245543E-003
Loop execution time (s) without loop tiling:    2.8329999186098576E-003

Following are the compilation settings used:

# Makefile to compile tile.test.f90 using nvfortran with OpenACC and automatic data management

# Compiler
FC = nvfortran

# Flags
FFLAGS = -acc -fast

# Target executable name
TARGET = a.out

# Source file
SRC = tile_test.f90

# Default rule
all: $(TARGET)

$(TARGET): $(SRC)
	$(FC) $(FFLAGS) -o $@ $<

clean:
	rm -f $(TARGET) *.o *.mod

The code was ran on a single A100 GPU.

Thanks in advance.