Skip to content

Pandas segfault in _dummy_from_array #973

@mrocklin

Description

@mrocklin

In a recent Travis CI test run I see the following:

dask/dataframe/tests/test_io.py::test_usecols PASSED
dask/dataframe/tests/test_io.py::test_dummy_from_array PASSED
dask/dataframe/tests/test_io.py::test_dummy_from_1darray /home/travis/build.sh: line 45:  2972 Segmentation fault      (core dumped) py.test dask --verbose

This likely means that something within Pandas is not responding well to running in threads.

cc @jreback @sinhrks

Relevant functions

def test_dummy_from_1darray():
    """ Check ``_dummy_from_array`` on 1-D input for each kind of ``columns``

    Covers no ``columns`` (Series), a scalar ``columns`` (named Series), a
    one-element list (single-column DataFrame) and a list whose length does
    not match the single column (error raised by pandas).
    """
    # No ``columns`` given: a Series carrying the array's dtype
    x = np.array([1., 2., 3.], dtype=np.float64)
    res = dd.io._dummy_from_array(x)
    assert isinstance(res, pd.Series)
    assert res.dtype == np.float64

    # Scalar ``columns``: still a Series, with the scalar used as its name
    x = np.array([1, 2, 3], dtype=np.object_)
    res = dd.io._dummy_from_array(x, columns='x')
    assert isinstance(res, pd.Series)
    assert res.name == 'x'
    assert res.dtype == np.object_

    # List-like ``columns``: a DataFrame with exactly that column
    x = np.array([1, 2, 3], dtype=np.object_)
    res = dd.io._dummy_from_array(x, columns=['x'])
    assert isinstance(res, pd.DataFrame)
    assert res['x'].dtype == np.object_
    tm.assert_index_equal(res.columns, pd.Index(['x']))

    # Two names for one column: the pattern must match the pandas message
    # exactly, so the words are separated by single spaces only
    msg = (r"Length mismatch: Expected axis has 1 elements, "
           r"new values have 2 elements")
    with tm.assertRaisesRegexp(ValueError, msg):
        dd.io._dummy_from_array(x, columns=['a', 'b'])

def _dummy_from_array(x, columns=None):
    """ Create empty pd.DataFrame or pd.Series which has correct dtype """

    if x.ndim > 2:
        raise ValueError('from_array does not input more than 2D array, got'
                         ' array with shape %r' % (x.shape,))

    if getattr(x.dtype, 'names', None) is not None:
        # record array has named columns
        cols = tuple(x.dtype.names)
        dtypes = [x.dtype.fields[n][0] for n in x.dtype.names]
    elif x.ndim == 1 and (np.isscalar(columns) or columns is None):
        # Series
        return pd.Series([], name=columns, dtype=x.dtype)
    else:
        cols = list(range(x.shape[1])) if x.ndim == 2 else [0]
        dtypes = [x.dtype] * len(cols)

    data = {}
    for c, dt in zip(cols, dtypes):
        data[c] = np.array([], dtype=dt)
    data = pd.DataFrame(data, columns=cols)

    if columns is not None:
        # if invalid, raise error from pandas
        data.columns = columns
    return data

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions