Issue with OpenACC nested loop collapse

I added OpenACC directives to a finite difference code that solves a 2D PDE by iterating with a nested loop. The dynamical variable is stored in a 1D array of size nx*ny. The iteration only touches the inner points, i.e. the nested loops run over i=1; i<nx-1; ++i and j=1; j<ny-1; ++j, so the boundary points are never updated.
I found that using the collapse clause gives incorrect results (without collapse my result is correct). However, if I define a flag array that is 1 at inner points and 0 on the boundary, write the nested loop over the whole range (boundary included), and inside the loop add an if statement so that we only compute where the flag is 1, then the result is correct with the collapse clause. What is the reason?
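
In schematic form, the two loop variants look like this (a minimal sketch only; the full example with all declarations is below):

// variant 1: loop over the interior points only (incorrect result with collapse)
#pragma acc parallel loop collapse(2)
for (int j=1; j<ny-1; j++)
  for (int i=1; i<nx-1; i++)
    psi_temp[i+nx*j] = 0.25*( psi[i+nx*j-1]   + psi[i+nx*j+1]
                             +psi[i+nx*(j-1)] + psi[i+nx*(j+1)]);

// variant 2: loop over the whole range, mask out the boundary (correct result)
#pragma acc parallel loop collapse(2)
for (int j=0; j<ny; j++)
  for (int i=0; i<nx; i++)
    if (cfunc[i+nx*j]==1)   // cfunc is 1 at interior points, 0 on the boundary
      psi_temp[i+nx*j] = 0.25*( psi[i+nx*j-1]   + psi[i+nx*j+1]
                               +psi[i+nx*(j-1)] + psi[i+nx*(j+1)]);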
The problem I am actually solving is complicated, so here is a simple example that demonstrates the issue: a Jacobi solver (file name main_lp_ref_2D.c):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <math.h>
#include <algorithm>
#ifdef _OPENMP
#include <omp.h>
#endif

int print_ref(int mx, int my, double *Aarr, char fname[100]) {
char outfilename[100];
double temp_val;
strcpy(outfilename, fname);
FILE *fp=fopen(outfilename, "w");

for  (int j=0;j<my; j++) {
  for  (int i=0;i<mx; i++) {

      temp_val=Aarr[mx*j+i];
      fprintf(fp, "%5d %5d  %20.10f\n", i, j, temp_val);
  }
  fprintf(fp, "\n");
}

fclose(fp);
return 0;
}

int main(int argc, char** argv){
  int niter=10000;
  int nx=300;
  int ny=400;
  // launch by  ./*.out niter nx ny
  if (argc >= 2)
  {
      niter = atoi( argv[1] );
  }
  if (argc == 3)
  {
      nx = atoi( argv[2] );
      ny = nx;
  }
  if (argc == 4)
  {
      nx = atoi( argv[2] );
      ny = atoi( argv[3] );
  }

  int it;
  int ntot=nx*ny;
  double *psi, *psi_temp, *psi_ref;
  clock_t start_t, end_t;
  float total_t=0;

  //allocate memory
  psi= (double *) malloc(ntot*sizeof(double));
  psi_ref= (double *) malloc(ntot*sizeof(double));
  psi_temp= (double *) malloc(ntot*sizeof(double));

  //IC
  memset(psi, 0, ntot*sizeof(double));
  memset(psi_ref, 0, ntot*sizeof(double));
  memset(psi_temp, 0, ntot*sizeof(double));

  //BC
    for (int i=0;i<nx;i++){
      psi[i]=1.0;
      psi_temp[i]=1.0;
    }

  start_t = clock();
#ifdef _OPENMP
  int nthreads;
  double start_omp = omp_get_wtime();
#endif
  //iterate
int nx1=nx-1;
int ny1=ny-1;
  #pragma omp parallel private(it) shared(niter, nx, nx1, ny1, psi, psi_temp, nthreads) 
{
#ifdef _OPENMP
  #pragma omp single
  nthreads= omp_get_num_threads();
#endif
  for (it=0;it<niter; it++){
 #pragma omp for 
  //  stepping (nx, ny, psi, psi_temp);
  for (int j=1; j<ny1; j++){
    for (int i=1; i<nx1; i++){
      psi_temp[i+nx*j] = 0.25*
      ( psi[i+nx*j-1]  +psi[i+nx*j+1]
       +psi[i+nx*(j-1)]+psi[i+nx*(j+1)]);

     }
  }
  //update
  #pragma omp single
    std::swap(psi,psi_temp);
  }

} // exit omp parallel region
#ifdef _OPENMP
  double end_omp = omp_get_wtime();
#endif
  end_t = clock();
#ifdef _OPENMP
    printf ( "Using %d OMP threads\n", nthreads);
  printf("Omp timing %f seconds\n", end_omp - start_omp);
#endif
  total_t = (double) (end_t - start_t) / CLOCKS_PER_SEC;
  printf("Total time taken by CPU: %f sec\n", total_t  );
  //Take cpu result as reference:
  std::swap(psi,psi_ref);
  char outname[]="test_serial.txt";
  printf("nx=%d, ny=%d, niter=%d\n", nx, ny, niter);
  print_ref(nx,ny, psi_ref, outname);

  //IC
  memset(psi, 0, ntot*sizeof(double));
  memset(psi_temp, 0, ntot*sizeof(double));
  //BC
    for (int i=0;i<nx;i++){
      psi[i]=1.0;
      psi_temp[i]=1.0;
    }

#ifdef MASK
int *cfunc;
cfunc=(int *) malloc(ntot*sizeof(int));
memset(cfunc, 0, ntot*sizeof(int));
  for (int j=1; j<ny1; j++){
    for (int i=1; i<nx1; i++){
       cfunc[i+nx*j]=1;
    }
  }
#endif

//start openacc 
#pragma acc init
  start_t = clock();
  //iterate
#ifdef MASK
#pragma acc data copy(psi[0:ntot]) copyin(psi_temp[0:ntot], cfunc[0:ntot])
#else
#pragma acc data copy(psi[0:ntot]) copyin(psi_temp[0:ntot])
#endif
{ //data region
  for (int it=0;it<niter; it++){

 #pragma acc parallel 
 { //enter acc parallel
 #pragma acc loop collapse(2) 
  //  stepping (nx, ny, psi, psi_temp);
#ifdef MASK
    for (int j=0; j<ny; j++){
      for (int i=0; i<nx; i++){
        if (cfunc[i+nx*j]==1) {
          psi_temp[i+nx*j] = 0.25*
          ( psi[i+nx*j-1]  +psi[i+nx*j+1]
           +psi[i+nx*(j-1)]+psi[i+nx*(j+1)]);
        }
      }
    }
#else
    for (int j=1; j<ny1; j++){
      for (int i=1; i<nx1; i++){
        psi_temp[i+nx*j] = 0.25*
        ( psi[i+nx*j-1]  +psi[i+nx*j+1]
         +psi[i+nx*(j-1)]+psi[i+nx*(j+1)]);
      }
    }
#endif
  //update
  #pragma acc loop
    for (int k=0; k<ntot; ++k) {
      psi[k]=psi_temp[k];
    }
 }//exit acc parallel
  } //exit iter

} // exit acc data region

  end_t = clock();
  total_t = (double) (end_t - start_t) / CLOCKS_PER_SEC;
  printf("Total time taken by GPU: %f sec\n", total_t  );
  char outname2[]="test_acc.txt";
  printf("nx=%d, ny=%d, niter=%d\n", nx, ny, niter);
  print_ref(nx,ny, psi, outname2);

  double err_norm=0;
  double norm=0;
  for (int k=0; k<ntot; ++k) {
      err_norm+=(psi[k]-psi_ref[k])*(psi[k]-psi_ref[k]);
      norm+=psi_ref[k]*psi_ref[k];
  };
  err_norm=sqrt(err_norm);
  norm=sqrt(norm);
  printf("Total error = %lf, rel err=%lf %\n", err_norm, 100.0*err_norm/norm);

  char outname3[]="test_diff.txt";
  for (int k=0; k<ntot; ++k) {
      psi[k]=fabs(psi[k]-psi_ref[k]);  // fabs: plain abs() would truncate to int
  };
  printf("nx=%d, ny=%d, niter=%d\n", nx, ny, niter);
  print_ref(nx,ny, psi, outname3);
#ifdef MASK
  free(cfunc);
#endif
  free(psi);
  free(psi_ref);
  free(psi_temp);
  return 0;
}

And my Makefile:

FC=nvc++
FCFLAG=-O0 -fast -acc=gpu -gpu=cc61 -Minfo=accel #-DMASK

TAR=main_lp_ref_2D.out
OBJ=main_lp_ref_2D.o

$(TAR):$(OBJ) Makefile
        $(FC) $(FCFLAG) -o $(TAR) $(OBJ)

$(OBJ): %.o: %.c Makefile
        $(FC) $(FCFLAG) -c $^ -o $@

clean:
        rm -rf *.o $(TAR) test*

In the Makefile, if the -DMASK flag is not used, the nested loop does not iterate over the boundary points, and I get

Total time taken by CPU: 0.714485 sec
nx=300, ny=400, niter=10000
Total time taken by GPU: 0.340217 sec
nx=300, ny=400, niter=10000
Total error = 1.566312, rel err=1.770348 %
nx=300, ny=400, niter=10000

If the -DMASK flag is used, the nested loop iterates over the boundary indices as well, an if statement inside the loop restricts the computation to the inner points, and I get

Total time taken by CPU: 0.711233 sec
nx=300, ny=400, niter=10000
Total time taken by GPU: 0.362877 sec
nx=300, ny=400, niter=10000
Total error = 0.015819, rel err=0.017880 %
nx=300, ny=400, niter=10000

The second run shows a much smaller error.
In the more complicated code I am working with, the error from not looping over the boundary indices is much larger and the result is completely wrong.

I redid the whole thing in Fortran and compiled with nvfortran.
The conclusion for Fortran is more mysterious.
First, without the collapse clause I get the correct result.
Second, with the collapse clause, I cannot reproduce the issue from C in the simple Jacobi solver example; things are fine even when I do not loop over the boundary indices.
Third, with the collapse clause in my more complicated code, the same problem as in C arises, but the trick of looping over all the indices with an if statement does not fix the issue as it did in C. I suspect the error is caused by the Fortran compiler's optimization, because different optimization levels -Ox give different results, yet even with -O0 I still cannot get a correct result (within an acceptable error compared with the serial code).
I would appreciate it if anyone knows more about this.
Thanks a lot.


Hi huccpp,

The problem here is that there’s no implicit barrier between loops within a parallel region. Hence some gangs start copying the arrays in the second loop before the first loop is complete.

Note that it still gets incorrect answers with the “MASK” version, just less so, since the timing is different and fewer gangs move on to the second loop ahead of time.

The solution is to move the copy loop into its own parallel region, or better yet, swap the pointers like you do in the serial version. Though if you swap, add a “present” clause on the parallel so the swapped pointers are picked up correctly in the kernel each time.
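
For reference, the first option looks roughly like this (a minimal sketch of the iteration loop only, keeping your data region and variable names); the diff below shows the pointer-swap variant:

  for (int it=0; it<niter; it++){
    // first compute construct: Jacobi update on the interior points
    #pragma acc parallel loop collapse(2) present(psi, psi_temp)
    for (int j=1; j<ny1; j++){
      for (int i=1; i<nx1; i++){
        psi_temp[i+nx*j] = 0.25*
        ( psi[i+nx*j-1]  +psi[i+nx*j+1]
         +psi[i+nx*(j-1)]+psi[i+nx*(j+1)]);
      }
    }
    // second compute construct: the kernel-launch boundary provides the
    // synchronization, so the copy only starts after the stencil is done
    #pragma acc parallel loop present(psi, psi_temp)
    for (int k=0; k<ntot; ++k) {
      psi[k] = psi_temp[k];
    }
  }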

% diff -u test.org.cpp test.cpp
--- test.org.cpp        2024-10-28 10:10:32.473492979 -0700
+++ test.cpp    2024-10-28 10:10:15.757299223 -0700
@@ -148,7 +148,7 @@
 { //data region
   for (int it=0;it<niter; it++){

- #pragma acc parallel
+ #pragma acc parallel present(psi,psi_temp)
  { //enter acc parallel
  #pragma acc loop collapse(2)
   //  stepping (nx, ny, psi, psi_temp);
@@ -171,12 +171,9 @@
       }
     }
 #endif
-  //update
-  #pragma acc loop
-    for (int k=0; k<ntot; ++k) {
-      psi[k]=psi_temp[k];
-    }
  }//exit acc parallel
+  //update
+    std::swap(psi,psi_temp);
   } //exit iter

 } // exit acc data region
% nvc++ -fast -acc -Minfo=accel test.cpp ; a.out
main:
    148, Generating copyin(psi_temp[:ntot]) [if not already present]
         Generating copy(psi[:ntot]) [if not already present]
    149, Generating present(psi_temp[:],psi[:])
    152, Generating implicit firstprivate(ny1,nx1)
         Generating NVIDIA GPU code
        166, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */
        167,   /* blockIdx.x threadIdx.x collapsed */
    167, Generating implicit firstprivate(nx)
Total time taken by CPU: 0.345164 sec
nx=300, ny=400, niter=10000
Total time taken by GPU: 0.095250 sec
nx=300, ny=400, niter=10000
Total error = 0.000000, rel err=0.000000 %
nx=300, ny=400, niter=10000

-Mat


Thanks a lot, Mat.
It works.