Skip to content

Bad memory unallocation & wrong results with OpenMP #3057

@mgates3

Description

@mgates3

For OpenBLAS version >= 0.3.6, verified through 0.3.13, when we call BLAS routines from within an OpenMP parallel section, we get BLAS : Bad memory unallocation! errors and the results are wrong. It works fine with 0.3.5.

This seems similar to #2760 "BLAS: Bad memory unallocation!" errors on Power, but we see this on Intel (Haswell). The tester is compiled with g++, clang++, or Intel icpc, all with OpenMP. I installed OpenBLAS using Spack, and it is compiled with either [email protected] or [email protected]:

> spack find openblas
==> 16 installed packages
-- linux-scientific7-haswell / [email protected] ------------------------
[email protected]  [email protected]  [email protected]  [email protected]
[email protected]  [email protected]  [email protected]  [email protected]

-- linux-scientific7-haswell / [email protected] ------------------------
[email protected]  [email protected]  [email protected]

A simple tester is below. Here is correct output with 0.3.5.

> ./test_gemm 
batch_size 1000, n 50
init
test
test OpenMP
compare
max error 0.00e+00
delete
done

Here is erroneous output with 0.3.7. The exact output varies, typical of race conditions.

> ./test_gemm 
batch_size 1000, n 50
init
test
test OpenMP
BLAS : Bad memory unallocation! :  128  0x7fffcb070000
BLAS : Bad memory unallocation! :  128  0x7fffc5070000
BLAS : Bad memory unallocation! :  128  0x7fffc7070000
BLAS : Bad memory unallocation! :  128  0x7fffe1070000
BLAS : Bad memory unallocation! :  128  0x7fffc9070000
compare
max error 4.02e+02
delete
done

This tester says "batch", but it isn't calling a batched routine, it is just looping over regular gemm. We see the same issue with all other BLAS routines tested (hemm, herk, her2k, trmm, trsm, etc.) and all 4 precisions (s, d, c, z).

Tester (test_gemm.cc)

#include <cblas.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

#include <vector>

//------------------------------------------------------------------------------
void fill_rand( int m, int n, double* A, int ld )
{
    for (int j = 0; j < n; ++j) {
        for (int i = 0; i < m; ++i) {
            A[ i + j*ld ] = rand() / double(RAND_MAX);
        }
    }
}

//------------------------------------------------------------------------------
inline double max_nan( double x, double y )
{
    return (isnan(y) || (y) >= (x) ? (y) : (x));
}

//------------------------------------------------------------------------------
int main( int argc, char** argv )
{
    int batch_size = 1000;
    int n = 50;
    if (argc > 1)
        batch_size = atoi( argv[1] );
    if (argc > 2)
        n = atoi( argv[2] );
    printf( "batch_size %d, n %d\n", batch_size, n );

    int ld = n;
    double alpha = 3.1416;
    double beta  = 2.7183;

    printf( "init\n" );
    std::vector<double*> A_array( batch_size ),
                         B_array( batch_size ),
                         C_array( batch_size ),
                         D_array( batch_size );
    for (int i = 0; i < batch_size; ++i) {
        A_array[ i ] = new double[ ld*n ];
        B_array[ i ] = new double[ ld*n ];
        C_array[ i ] = new double[ ld*n ];
        D_array[ i ] = new double[ ld*n ];
        fill_rand( n, n, A_array[ i ], ld );
        fill_rand( n, n, B_array[ i ], ld );
        fill_rand( n, n, C_array[ i ], ld );
        std::copy( C_array[ i ], C_array[ i ] + ld*n, D_array[ i ] );
    }

    printf( "test\n" );
    for (int i = 0; i < batch_size; ++i) {
        cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, n, n, n,
                     alpha, A_array[ i ], ld, B_array[ i ], ld,
                     beta,  C_array[ i ], ld );
    }

    printf( "test OpenMP\n" );
    #pragma omp parallel for
    for (int i = 0; i < batch_size; ++i) {
        cblas_dgemm( CblasColMajor, CblasNoTrans, CblasNoTrans, n, n, n,
                     alpha, A_array[ i ], ld, B_array[ i ], ld,
                     beta,  D_array[ i ], ld );
    }

    printf( "compare\n" );
    double max_error = 0;
    for (int i = 0; i < batch_size; ++i) {
        // norm( D - C )
        cblas_daxpy( ld*n, -1.0, C_array[ i ], 1, D_array[ i ], 1 );
        double error = cblas_dnrm2( ld*n, D_array[ i ], 1 );
        max_error = max_nan( error, max_error );
    }
    printf( "max error %.2e\n", max_error );

    printf( "delete\n" );
    for (int i = 0; i < batch_size; ++i) {
        delete [] A_array[ i ];
        delete [] B_array[ i ];
        delete [] C_array[ i ];
    }

    printf( "done\n" );
    return 0;
}

Makefile

CXXFLAGS = -fPIC -fopenmp -Wall
LDFLAGS  = -fPIC -fopenmp
LIBS     = -lopenblas

src = test_gemm.cc
obj = test_gemm.o
exe = test_gemm

$(exe): $(obj)
	$(CXX) $(LDFLAGS) -o $@ $^ $(LIBS)

%.o: %.cc
	$(CXX) $(CXXFLAGS) -c -o $@ $<

clean:
	-rm -f $(exe) $(obj)

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions