Issue with OpenACC nested loop collapse

I added OpenACC directives to a finite difference code that solves a 2D PDE by iterating with a nested loop. The dynamical variable is stored in a 1D array of size nx*ny. The iteration only touches the inner points, i.e. the nested loops run over i=1; i<nx-1; ++i and j=1; j<ny-1; ++j, so the boundary points are never updated.
I found that using the collapse clause gives incorrect results (without collapse my result is correct). However, if I define a flag array that is 1 at inner points and 0 on the boundary, write the nested loop over the whole range (boundary included), and inside the loop add an if statement so that we only compute where the flag is 1, then the result is correct with the collapse clause. What is the reason?
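
In schematic form, the two loop variants look like this (a minimal sketch only; the full example with all declarations is below):

// variant 1: loop over the interior points only (incorrect result with collapse)
#pragma acc parallel loop collapse(2)
for (int j=1; j<ny-1; j++)
  for (int i=1; i<nx-1; i++)
    psi_temp[i+nx*j] = 0.25*( psi[i+nx*j-1]   + psi[i+nx*j+1]
                             +psi[i+nx*(j-1)] + psi[i+nx*(j+1)]);

// variant 2: loop over the whole range, mask out the boundary (correct result)
#pragma acc parallel loop collapse(2)
for (int j=0; j<ny; j++)
  for (int i=0; i<nx; i++)
    if (cfunc[i+nx*j]==1)   // cfunc is 1 at interior points, 0 on the boundary
      psi_temp[i+nx*j] = 0.25*( psi[i+nx*j-1]   + psi[i+nx*j+1]
                               +psi[i+nx*(j-1)] + psi[i+nx*(j+1)]);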
The problem I am actually solving is complicated, so here is a simple example that demonstrates the issue: a Jacobi solver (file name main_lp_ref_2D.c):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <math.h>
#include <algorithm>
#ifdef _OPENMP
#include <omp.h>
#endif

int print_ref(int mx, int my, double *Aarr, char fname[100]) {
char outfilename[100];
double temp_val;
strcpy(outfilename, fname);
FILE *fp=fopen(outfilename, "w");

for  (int j=0;j<my; j++) {
  for  (int i=0;i<mx; i++) {

      temp_val=Aarr[mx*j+i];
      fprintf(fp, "%5d %5d  %20.10f\n", i, j, temp_val);
  }
  fprintf(fp, "\n");
}

fclose(fp);
return 0;
}

int main(int argc, char** argv){
  int niter=10000;
  int nx=300;
  int ny=400;
  // launch by  ./*.out niter nx ny
  if (argc >= 2)
  {
      niter = atoi( argv[1] );
  }
  if (argc == 3)
  {
      nx = atoi( argv[2] );
      ny = nx;
  }
  if (argc == 4)
  {
      nx = atoi( argv[2] );
      ny = atoi( argv[3] );
  }

  int it;
  int ntot=nx*ny;
  double *psi, *psi_temp, *psi_ref;
  clock_t start_t, end_t;
  float total_t=0;

  //allocate memory
  psi= (double *) malloc(ntot*sizeof(double));
  psi_ref= (double *) malloc(ntot*sizeof(double));
  psi_temp= (double *) malloc(ntot*sizeof(double));

  //IC
  memset(psi, 0, ntot*sizeof(double));
  memset(psi_ref, 0, ntot*sizeof(double));
  memset(psi_temp, 0, ntot*sizeof(double));

  //BC
    for (int i=0;i<nx;i++){
      psi[i]=1.0;
      psi_temp[i]=1.0;
    }

  start_t = clock();
#ifdef _OPENMP
  int nthreads;
  double start_omp = omp_get_wtime();
#endif
  //iterate
int nx1=nx-1;
int ny1=ny-1;
  #pragma omp parallel private(it) shared(niter, nx, nx1, ny1, psi, psi_temp, nthreads) 
{
#ifdef _OPENMP
  #pragma omp single
  nthreads= omp_get_num_threads();
#endif
  for (it=0;it<niter; it++){
 #pragma omp for 
  //  stepping (nx, ny, psi, psi_temp);
  for (int j=1; j<ny1; j++){
    for (int i=1; i<nx1; i++){
      psi_temp[i+nx*j] = 0.25*
      ( psi[i+nx*j-1]  +psi[i+nx*j+1]
       +psi[i+nx*(j-1)]+psi[i+nx*(j+1)]);

     }
  }
  //update
  #pragma omp single
    std::swap(psi,psi_temp);
  }

} // exit omp parallel region
#ifdef _OPENMP
  double end_omp = omp_get_wtime();
#endif
  end_t = clock();
#ifdef _OPENMP
    printf ( "Using %d OMP threads\n", nthreads);
  printf("Omp timing %f seconds\n", end_omp - start_omp);
#endif
  total_t = (double) (end_t - start_t) / CLOCKS_PER_SEC;
  printf("Total time taken by CPU: %f sec\n", total_t  );
  //Take cpu result as reference:
  std::swap(psi,psi_ref);
  char outname[]="test_serial.txt";
  printf("nx=%d, ny=%d, niter=%d\n", nx, ny, niter);
  print_ref(nx,ny, psi_ref, outname);

  //IC
  memset(psi, 0, ntot*sizeof(double));
  memset(psi_temp, 0, ntot*sizeof(double));
  //BC
    for (int i=0;i<nx;i++){
      psi[i]=1.0;
      psi_temp[i]=1.0;
    }

#ifdef MASK
int *cfunc;
cfunc=(int *) malloc(ntot*sizeof(int));
memset(cfunc, 0, ntot*sizeof(int));
  for (int j=1; j<ny1; j++){
    for (int i=1; i<nx1; i++){
       cfunc[i+nx*j]=1;
    }
  }
#endif

//start openacc 
#pragma acc init
  start_t = clock();
  //iterate
#ifdef MASK
#pragma acc data copy(psi[0:ntot]) copyin(psi_temp[0:ntot], cfunc[0:ntot])
#else
#pragma acc data copy(psi[0:ntot]) copyin(psi_temp[0:ntot])
#endif
{ //data region
  for (int it=0;it<niter; it++){

 #pragma acc parallel 
 { //enter acc parallel
 #pragma acc loop collapse(2) 
  //  stepping (nx, ny, psi, psi_temp);
#ifdef MASK
    for (int j=0; j<ny; j++){
      for (int i=0; i<nx; i++){
        if (cfunc[i+nx*j]==1) {
          psi_temp[i+nx*j] = 0.25*
          ( psi[i+nx*j-1]  +psi[i+nx*j+1]
           +psi[i+nx*(j-1)]+psi[i+nx*(j+1)]);
        }
      }
    }
#else
    for (int j=1; j<ny1; j++){
      for (int i=1; i<nx1; i++){
        psi_temp[i+nx*j] = 0.25*
        ( psi[i+nx*j-1]  +psi[i+nx*j+1]
         +psi[i+nx*(j-1)]+psi[i+nx*(j+1)]);
      }
    }
#endif
  //update
  #pragma acc loop
    for (int k=0; k<ntot; ++k) {
      psi[k]=psi_temp[k];
    }
 }//exit acc parallel
  } //exit iter

} // exit acc data region

  end_t = clock();
  total_t = (double) (end_t - start_t) / CLOCKS_PER_SEC;
  printf("Total time taken by GPU: %f sec\n", total_t  );
  char outname2[]="test_acc.txt";
  printf("nx=%d, ny=%d, niter=%d\n", nx, ny, niter);
  print_ref(nx,ny, psi, outname2);

  double err_norm=0;
  double norm=0;
  for (int k=0; k<ntot; ++k) {
      err_norm+=(psi[k]-psi_ref[k])*(psi[k]-psi_ref[k]);
      norm+=psi_ref[k]*psi_ref[k];
  };
  err_norm=sqrt(err_norm);
  norm=sqrt(norm);
  printf("Total error = %lf, rel err=%lf %\n", err_norm, 100.0*err_norm/norm);

  char outname3[]="test_diff.txt";
  for (int k=0; k<ntot; ++k) {
      psi[k]=fabs(psi[k]-psi_ref[k]);  // fabs: plain abs() would truncate to int
  };
  printf("nx=%d, ny=%d, niter=%d\n", nx, ny, niter);
  print_ref(nx,ny, psi, outname3);
#ifdef MASK
  free(cfunc);
#endif
  free(psi);
  free(psi_ref);
  free(psi_temp);
  return 0;
}

And my Makefile:

FC=nvc++
FCFLAG=-O0 -fast -acc=gpu -gpu=cc61 -Minfo=accel #-DMASK

TAR=main_lp_ref_2D.out
OBJ=main_lp_ref_2D.o

$(TAR):$(OBJ) Makefile
        $(FC) $(FCFLAG) -o $(TAR) $(OBJ)

$(OBJ): %.o: %.c Makefile
        $(FC) $(FCFLAG) -c $^ -o $@

clean:
        rm -rf *.o $(TAR) test*

In the Makefile, if the -DMASK flag is not used, the nested loop does not iterate over the boundary points, and I get

Total time taken by CPU: 0.714485 sec
nx=300, ny=400, niter=10000
Total time taken by GPU: 0.340217 sec
nx=300, ny=400, niter=10000
Total error = 1.566312, rel err=1.770348 %
nx=300, ny=400, niter=10000

If the -DMASK flag is used, the nested loop iterates over the boundary indices as well, an if statement inside the loop restricts the computation to the inner points, and I get

Total time taken by CPU: 0.711233 sec
nx=300, ny=400, niter=10000
Total time taken by GPU: 0.362877 sec
nx=300, ny=400, niter=10000
Total error = 0.015819, rel err=0.017880 %
nx=300, ny=400, niter=10000

The second run shows a much smaller error.
In the more complicated code I am working with, the error from not looping over the boundary indices is much larger and the result is completely wrong.

I redid the whole thing in Fortran and compiled with nvfortran.
The conclusion for Fortran is more mysterious.
First, without the collapse clause I get the correct result.
Second, with the collapse clause, I cannot reproduce the issue from C in the simple Jacobi solver example; things are fine even when I do not loop over the boundary indices.
Third, with the collapse clause in my more complicated code, the same problem as in C arises, but the trick of looping over all the indices with an if statement does not fix the issue as it did in C. I suspect the error is caused by the Fortran compiler's optimization, because different optimization levels -Ox give different results, yet even with -O0 I still cannot get a correct result (within an acceptable error compared with the serial code).
I would appreciate it if anyone knows more about this.
Thanks a lot.


Hi huccpp,

The problem here is that there’s no implicit barrier between loops within a parallel region. Hence some gangs start copying the arrays in the second loop before the first loop is complete.

Note that it still gets incorrect answers with the “MASK” version, just less so, since the timing is different and fewer gangs move on to the second loop ahead of time.

The solution is to move the copy loop into its own parallel region, or better yet, swap the pointers like you do in the serial version. Though if you swap, add a “present” clause on the parallel so the swapped pointers are picked up correctly in the kernel each time.
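
For reference, the first option looks roughly like this (a minimal sketch of the iteration loop only, keeping your data region and variable names); the diff below shows the pointer-swap variant:

  for (int it=0; it<niter; it++){
    // first compute construct: Jacobi update on the interior points
    #pragma acc parallel loop collapse(2) present(psi, psi_temp)
    for (int j=1; j<ny1; j++){
      for (int i=1; i<nx1; i++){
        psi_temp[i+nx*j] = 0.25*
        ( psi[i+nx*j-1]  +psi[i+nx*j+1]
         +psi[i+nx*(j-1)]+psi[i+nx*(j+1)]);
      }
    }
    // second compute construct: the kernel-launch boundary provides the
    // synchronization, so the copy only starts after the stencil is done
    #pragma acc parallel loop present(psi, psi_temp)
    for (int k=0; k<ntot; ++k) {
      psi[k] = psi_temp[k];
    }
  }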

% diff -u test.org.cpp test.cpp
--- test.org.cpp        2024-10-28 10:10:32.473492979 -0700
+++ test.cpp    2024-10-28 10:10:15.757299223 -0700
@@ -148,7 +148,7 @@
 { //data region
   for (int it=0;it<niter; it++){

- #pragma acc parallel
+ #pragma acc parallel present(psi,psi_temp)
  { //enter acc parallel
  #pragma acc loop collapse(2)
   //  stepping (nx, ny, psi, psi_temp);
@@ -171,12 +171,9 @@
       }
     }
 #endif
-  //update
-  #pragma acc loop
-    for (int k=0; k<ntot; ++k) {
-      psi[k]=psi_temp[k];
-    }
  }//exit acc parallel
+  //update
+    std::swap(psi,psi_temp);
   } //exit iter

 } // exit acc data region
% nvc++ -fast -acc -Minfo=accel test.cpp ; a.out
main:
    148, Generating copyin(psi_temp[:ntot]) [if not already present]
         Generating copy(psi[:ntot]) [if not already present]
    149, Generating present(psi_temp[:],psi[:])
    152, Generating implicit firstprivate(ny1,nx1)
         Generating NVIDIA GPU code
        166, #pragma acc loop gang, vector(128) collapse(2) /* blockIdx.x threadIdx.x */
        167,   /* blockIdx.x threadIdx.x collapsed */
    167, Generating implicit firstprivate(nx)
Total time taken by CPU: 0.345164 sec
nx=300, ny=400, niter=10000
Total time taken by GPU: 0.095250 sec
nx=300, ny=400, niter=10000
Total error = 0.000000, rel err=0.000000 %
nx=300, ny=400, niter=10000

-Mat


Thanks a lot, Mat.
It works.