I added OpenACC directives to a finite difference code that solves a 2D PDE by iterating with a nested loop. The dynamical variable is stored in a 1D array of size nx*ny. The iteration only updates the inner points, i.e. the nested loop runs over i=1; i<nx-1; ++i and j=1; j<ny-1; ++j, so the boundary points are never touched.
I found that using the collapse clause gives incorrect results (without collapse my result is correct). However, if I define a flag array that is 1 at the inner points and 0 on the boundary, write the loop over the whole range
including the boundary, and add an if statement inside the nested loop so that we only compute where the flag is 1, the result is correct with the collapse clause. What is the reason?
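In schematic form, these are the two loop variants I am comparing (just a sketch; the complete reproducer follows below):

/* Variant A: loop over the inner points only -- gives wrong results with collapse(2) */
#pragma acc loop collapse(2)
for (int j=1; j<ny-1; j++){
    for (int i=1; i<nx-1; i++){
        psi_temp[i+nx*j] = 0.25*
            ( psi[i+nx*j-1] +psi[i+nx*j+1]
             +psi[i+nx*(j-1)]+psi[i+nx*(j+1)]);
    }
}

/* Variant B: loop over the whole range with a 0/1 mask -- gives correct results with collapse(2) */
/* cfunc[] is 1 at inner points and 0 on the boundary */
#pragma acc loop collapse(2)
for (int j=0; j<ny; j++){
    for (int i=0; i<nx; i++){
        if (cfunc[i+nx*j]==1) {
            psi_temp[i+nx*j] = 0.25*
                ( psi[i+nx*j-1] +psi[i+nx*j+1]
                 +psi[i+nx*(j-1)]+psi[i+nx*(j+1)]);
        }
    }
}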
The actual problem I am solving is complicated, so below is a complete simple example
that demonstrates the issue: a Jacobi solver (file name main_lp_ref_2D.c):
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <math.h>
#include <algorithm>
#ifdef _OPENMP
#include <omp.h>
#endif
int print_ref(int mx, int my, double *Aarr, char fname[100]) {
char outfilename[100];
double temp_val;
strcpy(outfilename, fname);
FILE *fp=fopen(outfilename, "w");
for (int j=0;j<my; j++) {
for (int i=0;i<mx; i++) {
temp_val=Aarr[mx*j+i];
fprintf(fp, "%5d %5d %20.10f\n", i, j, temp_val);
}
fprintf(fp, "\n");
}
fclose(fp);
return 0;
}
int main(int argc, char** argv){
int niter=10000;
int nx=300;
int ny=400;
// launch by ./*.out niter nx ny
if (argc >= 2)
{
niter = atoi( argv[1] );
}
if (argc == 3)
{
nx = atoi( argv[2] );
ny = nx;
}
if (argc == 4)
{
nx = atoi( argv[2] );
ny = atoi( argv[3] );
}
int it;
int ntot=nx*ny;
double *psi, *psi_temp, *psi_ref;
clock_t start_t, end_t;
float total_t=0;
//allocate memory
psi= (double *) malloc(ntot*sizeof(double));
psi_ref= (double *) malloc(ntot*sizeof(double));
psi_temp= (double *) malloc(ntot*sizeof(double));
//IC
memset(psi, 0, ntot*sizeof(double));
memset(psi_ref, 0, ntot*sizeof(double));
memset(psi_temp, 0, ntot*sizeof(double));
//BC
for (int i=0;i<nx;i++){
psi[i]=1.0;
psi_temp[i]=1.0;
}
start_t = clock();
#ifdef _OPENMP
int nthreads;
double start_omp = omp_get_wtime();
#endif
//iterate
int nx1=nx-1;
int ny1=ny-1;
#pragma omp parallel private(it) shared(niter, nx, nx1, ny1, psi, psi_temp, nthreads)
{
#ifdef _OPENMP
#pragma omp single
nthreads= omp_get_num_threads();
#endif
for (it=0;it<niter; it++){
#pragma omp for
// stepping (nx, ny, psi, psi_temp);
for (int j=1; j<ny1; j++){
for (int i=1; i<nx1; i++){
psi_temp[i+nx*j] = 0.25*
( psi[i+nx*j-1] +psi[i+nx*j+1]
+psi[i+nx*(j-1)]+psi[i+nx*(j+1)]);
}
}
//update
#pragma omp single
std::swap(psi,psi_temp);
}
} // exit omp parallel region
#ifdef _OPENMP
double end_omp = omp_get_wtime();
#endif
end_t = clock();
#ifdef _OPENMP
printf ( "Using %d OMP threads\n", nthreads);
printf("Omp timing %f seconds\n", end_omp - start_omp);
#endif
total_t = (double) (end_t - start_t) / CLOCKS_PER_SEC;
printf("Total time taken by CPU: %f sec\n", total_t );
//Take cpu result as reference:
std::swap(psi,psi_ref);
char outname[]="test_serial.txt";
printf("nx=%d, ny=%d, niter=%d\n", nx, ny, niter);
print_ref(nx,ny, psi_ref, outname);
//IC
memset(psi, 0, ntot*sizeof(double));
memset(psi_temp, 0, ntot*sizeof(double));
//BC
for (int i=0;i<nx;i++){
psi[i]=1.0;
psi_temp[i]=1.0;
}
#ifdef MASK
int *cfunc;
cfunc=(int *) malloc(ntot*sizeof(int));
memset(cfunc, 0, ntot*sizeof(int));
for (int j=1; j<ny1; j++){
for (int i=1; i<nx1; i++){
cfunc[i+nx*j]=1;
}
}
#endif
//start openacc
#pragma acc init
start_t = clock();
//iterate
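// the data region below keeps psi on the device for all iterations (copy = in and out);
// psi_temp (and cfunc when MASK is defined) are only copied in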
#ifdef MASK
#pragma acc data copy(psi[0:ntot]) copyin(psi_temp[0:ntot], cfunc[0:ntot])
#else
#pragma acc data copy(psi[0:ntot]) copyin(psi_temp[0:ntot])
#endif
{ //data region
for (int it=0;it<niter; it++){
#pragma acc parallel
{ //enter acc parallel
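// both the stencil loop and the psi update loop below sit inside this single acc parallel region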
#pragma acc loop collapse(2)
// stepping (nx, ny, psi, psi_temp);
#ifdef MASK
for (int j=0; j<ny; j++){
for (int i=0; i<nx; i++){
if (cfunc[i+nx*j]==1) {
psi_temp[i+nx*j] = 0.25*
( psi[i+nx*j-1] +psi[i+nx*j+1]
+psi[i+nx*(j-1)]+psi[i+nx*(j+1)]);
}
}
}
#else
for (int j=1; j<ny1; j++){
for (int i=1; i<nx1; i++){
psi_temp[i+nx*j] = 0.25*
( psi[i+nx*j-1] +psi[i+nx*j+1]
+psi[i+nx*(j-1)]+psi[i+nx*(j+1)]);
}
}
#endif
//update
#pragma acc loop
for (int k=0; k<ntot; ++k) {
psi[k]=psi_temp[k];
}
}//exit acc parallel
} //exit iter
} // exit acc data region
end_t = clock();
total_t = (double) (end_t - start_t) / CLOCKS_PER_SEC;
printf("Total time taken by GPU: %f sec\n", total_t );
char outname2[]="test_acc.txt";
printf("nx=%d, ny=%d, niter=%d\n", nx, ny, niter);
print_ref(nx,ny, psi, outname2);
double err_norm=0;
double norm=0;
for (int k=0; k<ntot; ++k) {
err_norm+=(psi[k]-psi_ref[k])*(psi[k]-psi_ref[k]);
norm+=psi_ref[k]*psi_ref[k];
};
err_norm=sqrt(err_norm);
norm=sqrt(norm);
printf("Total error = %lf, rel err=%lf %\n", err_norm, 100.0*err_norm/norm);
char outname3[]="test_diff.txt";
for (int k=0; k<ntot; ++k) {
psi[k]=fabs(psi[k]-psi_ref[k]);
};
printf("nx=%d, ny=%d, niter=%d\n", nx, ny, niter);
print_ref(nx,ny, psi, outname3);
#ifdef MASK
free(cfunc);
#endif
free(psi);
free(psi_ref);
free(psi_temp);
return 0;
}
And my Makefile:
FC=nvc++
FCFLAG=-O0 -fast -acc=gpu -gpu=cc61 -Minfo=accel #-DMASK
TAR=main_lp_ref_2D.out
OBJ=main_lp_ref_2D.o
$(TAR):$(OBJ) Makefile
$(FC) $(FCFLAG) -o $(TAR) $(OBJ)
$(OBJ): %.o: %.c Makefile
$(FC) $(FCFLAG) -c $^ -o $@
clean:
rm -rf *.o $(TAR) test*
In the Makefile, if the flag -DMASK is not used, the nested loop does not
iterate over the boundary points, and I got:
Total time taken by CPU: 0.714485 sec
nx=300, ny=400, niter=10000
Total time taken by GPU: 0.340217 sec
nx=300, ny=400, niter=10000
Total error = 1.566312, rel err=1.770348 %
nx=300, ny=400, niter=10000
If the flag -DMASK is used, the nested loop iterates over the boundary indices as well, and inside the loop an if statement restricts the computation to the inner points. With this version I got:
Total time taken by CPU: 0.711233 sec
nx=300, ny=400, niter=10000
Total time taken by GPU: 0.362877 sec
nx=300, ny=400, niter=10000
Total error = 0.015819, rel err=0.017880 %
nx=300, ny=400, niter=10000
The second run shows a much smaller error.
In the more complicated code I am actually working with, the error from looping over the inner indices only is much larger and the result is completely wrong.
I redid the whole thing in Fortran and compiled with nvfortran.
The Fortran behaviour is even more mysterious.
First, without the collapse clause I get correct results.
Second, with the collapse clause, I cannot reproduce the C issue in the simple Jacobi solver example:
the results are fine even when I only loop over the inner indices.
Third, with the collapse clause in my more complicated code,
the same problem as in C appears, but the trick of looping over all indices
with an if statement does not fix it the way it did in C. I suspect
the error is related to compiler optimization, because different -Ox levels give different results, yet even -O0 does not produce a correct result (within an acceptable error compared with the serial code).
I would appreciate it if anyone knows more about this.
Thanks a lot.