Skip to content

Commit f9cd802

Browse files
mcsoinijcrist
authored andcommitted
Merge dtype warning (dask#4917)
* add test covering the merge column dtype mismatch warning * for various merge types: checks that the resulting dataframe has either no nans or that a UserWarning has been thrown * Add warning for mismatches between column data types * fixes issue dask#4574 * Warning is thrown if the on-columns of left and right have different dtypes * flake8 fixes * fixes * use asciitable for warning string
1 parent c400691 commit f9cd802

File tree

2 files changed

+65
-1
lines changed

2 files changed

+65
-1
lines changed

dask/dataframe/multi.py

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,8 @@
6969
from .io import from_pandas
7070
from . import methods
7171
from .shuffle import shuffle, rearrange_by_divisions
72-
from .utils import strip_unknown_categories, is_dataframe_like, is_series_like
72+
from .utils import (strip_unknown_categories, is_dataframe_like,
73+
is_series_like, asciitable)
7374

7475

7576
def align_partitions(*dfs):
@@ -325,6 +326,30 @@ def single_partition_join(left, right, **kwargs):
325326
return new_dd_object(graph, name, meta, divisions)
326327

327328

329+
def warn_dtype_mismatch(left, right, left_on, right_on):
330+
""" Checks for merge column dtype mismatches and throws a warning (#4574)
331+
"""
332+
333+
if not isinstance(left_on, list):
334+
left_on = [left_on]
335+
if not isinstance(right_on, list):
336+
right_on = [right_on]
337+
338+
if (all(col in left.columns for col in left_on) and
339+
all(col in right.columns for col in right_on)):
340+
dtype_mism = [((lo, ro), left.dtypes[lo], right.dtypes[ro])
341+
for lo, ro in zip(left_on, right_on)
342+
if not left.dtypes[lo] is right.dtypes[ro]]
343+
344+
if dtype_mism:
345+
col_tb = asciitable(('Merge columns', 'left dtype', 'right dtype'),
346+
dtype_mism)
347+
348+
warnings.warn(('Merging dataframes with merge column data '
349+
'type mismatches: \n{}\nCast dtypes explicitly to '
350+
'avoid unexpected results.').format(col_tb))
351+
352+
328353
@wraps(pd.merge)
329354
def merge(left, right, how='inner', on=None, left_on=None, right_on=None,
330355
left_index=False, right_index=False, suffixes=('_x', '_y'),
@@ -413,6 +438,9 @@ def merge(left, right, how='inner', on=None, left_on=None, right_on=None,
413438
empty_index_dtype=meta.index.dtype)
414439
# Catch all hash join
415440
else:
441+
if left_on and right_on:
442+
warn_dtype_mismatch(left, right, left_on, right_on)
443+
416444
return hash_join(left, left.index if left_index else left_on,
417445
right, right.index if right_index else right_on,
418446
how, npartitions, suffixes, shuffle=shuffle,

dask/dataframe/tests/test_multi.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -345,6 +345,42 @@ def test_concat_different_dtypes(join):
345345
assert len(set(map(str, dtypes_list))) == 1 # all the same
346346

347347

348+
@pytest.mark.parametrize('how', ['inner', 'outer', 'left', 'right'])
349+
@pytest.mark.parametrize('on_index', [True, False])
350+
def test_merge_columns_dtypes(how, on_index):
351+
# tests results of merges with merge columns having different dtypes;
352+
# asserts that either the merge was successful or the corresponding warning is raised
353+
# addresses issue #4574
354+
355+
df1 = pd.DataFrame({"A": list(np.arange(5).astype(float)) * 2,
356+
"B": list(np.arange(5)) * 2})
357+
df2 = pd.DataFrame({"A": np.arange(5), "B": np.arange(5)})
358+
359+
a = dd.from_pandas(df1, 2) # merge column "A" is float
360+
b = dd.from_pandas(df2, 2) # merge column "A" is int
361+
362+
on = ["A"]
363+
left_index = right_index = on_index
364+
365+
if on_index:
366+
a = a.set_index("A")
367+
b = b.set_index("A")
368+
on = None
369+
370+
with pytest.warns(None) as record:
371+
result = dd.merge(a, b, on=on, how=how,
372+
left_index=left_index, right_index=right_index)
373+
374+
warned = any('merge column data type mismatches' in str(r) for r in record)
375+
376+
# result type depends on merge operation -> convert to pandas
377+
result = result if isinstance(result, pd.DataFrame) else result.compute()
378+
379+
has_nans = result.isna().values.any()
380+
381+
assert (has_nans and warned) or not has_nans
382+
383+
348384
@pytest.mark.parametrize('how', ['inner', 'outer', 'left', 'right'])
349385
@pytest.mark.parametrize('shuffle', ['disk', 'tasks'])
350386
def test_merge(how, shuffle):

0 commit comments

Comments
 (0)