Skip to content

Commit 30c9a3a

Browse files
authored
Skip empty partitions when doing groupby.value_counts (dask#7073)
1 parent ce45d1a commit 30c9a3a

File tree

2 files changed

+31
-1
lines changed

2 files changed

+31
-1
lines changed

dask/dataframe/groupby.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1931,7 +1931,7 @@ def agg(self, arg, split_every=None, split_out=1):
19311931
def value_counts(self, split_every=None, split_out=1):
19321932
return self._aca_agg(
19331933
token="value_counts",
1934-
func=M.value_counts,
1934+
func=_value_counts,
19351935
aggfunc=_value_counts_aggregate,
19361936
split_every=split_every,
19371937
split_out=split_out,
@@ -1956,6 +1956,13 @@ def _unique_aggregate(series_gb, name=None):
19561956
return ret
19571957

19581958

1959+
def _value_counts(x, **kwargs):
    """Apply ``value_counts`` to ``x``, short-circuiting when ``x`` is empty.

    When ``len(x)`` is zero, an empty integer-dtype ``pd.Series`` is returned
    and ``value_counts`` is never called. Otherwise ``value_counts`` is
    invoked on ``x`` with ``kwargs`` forwarded unchanged.
    """
    if len(x) == 0:
        # NOTE(review): presumably this is the chunk coming from an empty
        # partition -- confirm against the caller.
        return pd.Series(dtype=int)
    return M.value_counts(x, **kwargs)
1964+
1965+
19591966
def _value_counts_aggregate(series_gb):
19601967
to_concat = {k: v.sum(level=1) for k, v in series_gb}
19611968
names = list(series_gb.obj.index.names)

dask/dataframe/tests/test_groupby.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2475,3 +2475,26 @@ def agg(grp, **kwargs):
24752475
agg(pdf.groupby(groupby, observed=observed)),
24762476
agg(ddf.groupby(groupby, observed=observed)),
24772477
)
2478+
2479+
2480+
def test_empty_partitions_with_value_counts():
    """Check groupby ``value_counts`` on a dask frame against pandas.

    Regression test for https://github.com/dask/dask/issues/7065.
    """
    rows = [
        ["a1", "b1"],
        ["a1", None],
        ["a1", "b1"],
        [None, None],
        [None, None],
        [None, None],
        ["a3", "b3"],
        ["a3", "b3"],
        ["a5", "b5"],
    ]
    pdf = pd.DataFrame(data=rows, columns=["A", "B"])
    # Same data, split across three dask partitions.
    ddf = dd.from_pandas(pdf, npartitions=3)

    expected = pdf.groupby("A")["B"].value_counts()
    actual = ddf.groupby("A")["B"].value_counts()
    assert_eq(expected, actual)

0 commit comments

Comments
 (0)