-
Notifications
You must be signed in to change notification settings - Fork 1.7k
Bad memory unallocation & wrong results with OpenMP #3057
Description
For OpenBLAS version >= 0.3.6, verified through 0.3.13, when we call BLAS routines from within an OpenMP parallel section, we get BLAS : Bad memory unallocation! errors and the results are wrong. It works fine with 0.3.5.
This seems similar to #2760 "BLAS: Bad memory unallocation!" errors on Power, but we see this on Intel (Haswell). The tester is compiled with g++, clang++, or Intel icpc, all with OpenMP. I installed OpenBLAS using Spack, and it is compiled with either [email protected] or [email protected]:
> spack find openblas
==> 16 installed packages
-- linux-scientific7-haswell / [email protected] ------------------------
[email protected] [email protected] [email protected] [email protected]
[email protected] [email protected] [email protected] [email protected]
-- linux-scientific7-haswell / [email protected] ------------------------
[email protected] [email protected] [email protected]
A simple tester is below. Here is correct output with 0.3.5.
> ./test_gemm
batch_size 1000, n 50
init
test
test OpenMP
compare
max error 0.00e+00
delete
done
Here is erroneous output with 0.3.7. The exact output varies, typical of race conditions.
> ./test_gemm
batch_size 1000, n 50
init
test
test OpenMP
BLAS : Bad memory unallocation! : 128 0x7fffcb070000
BLAS : Bad memory unallocation! : 128 0x7fffc5070000
BLAS : Bad memory unallocation! : 128 0x7fffc7070000
BLAS : Bad memory unallocation! : 128 0x7fffe1070000
BLAS : Bad memory unallocation! : 128 0x7fffc9070000
compare
max error 4.02e+02
delete
done
This tester says "batch", but it isn't calling a batched routine, it is just looping over regular gemm. We see the same issue with all other BLAS routines tested (hemm, herk, her2k, trmm, trsm, etc.) and all 4 precisions (s, d, c, z).
Tester (test_gemm.cc)
#include <cblas.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <vector>
//------------------------------------------------------------------------------
void fill_rand( int m, int n, double* A, int ld )
{
for (int j = 0; j < n; ++j) {
for (int i = 0; i < m; ++i) {
A[ i + j*ld ] = rand() / double(RAND_MAX);
}
}
}
//------------------------------------------------------------------------------
inline double max_nan( double x, double y )
{
return (isnan(y) || (y) >= (x) ? (y) : (x));
}
//------------------------------------------------------------------------------
int main( int argc, char** argv )
{
int batch_size = 1000;
int n = 50;
if (argc > 1)
batch_size = atoi( argv[1] );
if (argc > 2)
n = atoi( argv[2] );
printf( "batch_size %d, n %d\n", batch_size, n );
int ld = n;
double alpha = 3.1416;
double beta = 2.7183;
printf( "init\n" );
std::vector<double*> A_array( batch_size ),
B_array( batch_size ),
C_array( batch_size ),
D_array( batch_size );
for (int i = 0; i < batch_size; ++i) {
A_array[ i ] = new double[ ld*n ];
B_array[ i ] = new double[ ld*n ];
C_array[ i ] = new double[ ld*n ];
D_array[ i ] = new double[ ld*n ];
fill_rand( n, n, A_array[ i ], ld );
fill_rand( n, n, B_array[ i ], ld );
fill_rand( n, n, C_array[ i ], ld );
std::copy( C_array[ i ], C_array[ i ] + ld*n, D_array[ i ] );
}
printf( "test\n" );
for (int i = 0; i < batch_size; ++i) {
cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, n, n, n,
alpha, A_array[ i ], ld, B_array[ i ], ld,
beta, C_array[ i ], ld );
}
printf( "test OpenMP\n" );
#pragma omp parallel for
for (int i = 0; i < batch_size; ++i) {
cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, n, n, n,
alpha, A_array[ i ], ld, B_array[ i ], ld,
beta, D_array[ i ], ld );
}
printf( "compare\n" );
double max_error = 0;
for (int i = 0; i < batch_size; ++i) {
// norm( D - C )
cblas_daxpy( ld*n, -1.0, C_array[ i ], 1, D_array[ i ], 1 );
double error = cblas_dnrm2( ld*n, D_array[ i ], 1 );
max_error = max_nan( error, max_error );
}
printf( "max error %.2e\n", max_error );
printf( "delete\n" );
for (int i = 0; i < batch_size; ++i) {
delete [] A_array[ i ];
delete [] B_array[ i ];
delete [] C_array[ i ];
}
printf( "done\n" );
return 0;
}
Makefile
CXXFLAGS = -fPIC -fopenmp -Wall
LDFLAGS = -fPIC -fopenmp
LIBS = -lopenblas
src = test_gemm.cc
obj = test_gemm.o
exe = test_gemm
$(exe): $(obj)
$(CXX) $(LDFLAGS) -o $@ $^ $(LIBS)
%.o: %.cc
$(CXX) $(CXXFLAGS) -c -o $@ $<
clean:
-rm -f $(exe) $(obj)