Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions doc/indexing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,15 @@ usual xarray broadcasting and alignment rules for binary operations (e.g.,

arr2.where(arr2.y < 2)

By default ``where`` maintains the original size of the data. For cases
where the selected data size is much smaller than the original data,
use of the option ``drop=True`` clips coordinate
elements that are fully masked:

.. ipython:: python

arr2.where(arr2.y < 2, drop=True)

Multi-dimensional indexing
--------------------------

Expand Down
13 changes: 13 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,19 @@ What's New
import xarray as xr
np.random.seed(123456)

.. _whats-new.0.7.3:

v0.7.3 (unreleased)
-------------------

This release includes

Enhancements
~~~~~~~~~~~~
- DataArray and Dataset method :py:meth:`where` now supports a ``drop=True``
option that clips coordinate elements that are fully masked. By
`Phillip J. Wolfram <https://github.com/pwolfram>`_.

.. _whats-new.0.7.2:

v0.7.2 (13 March 2016)
Expand Down
44 changes: 41 additions & 3 deletions xarray/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -449,7 +449,7 @@ def resample(self, freq, dim, how='mean', skipna=None, closed=None,
result = result.rename({RESAMPLE_DIM: dim.name})
return result

def where(self, cond):
def where(self, cond, other=None, drop=False):
"""Return an object of the same shape with all entries where cond is
True and all other entries masked.

Expand All @@ -459,10 +459,15 @@ def where(self, cond):
Parameters
----------
cond : boolean DataArray or Dataset
other : unimplemented, optional
Unimplemented placeholder for compatibility with future numpy / pandas versions
drop : boolean, optional
Coordinate labels that only correspond to NA values should be dropped

Returns
-------
same type as caller
same type as caller; if drop=True, the result is the same type as the
caller with coordinate labels that are fully masked dropped from each dimension

Examples
--------
Expand All @@ -479,8 +484,41 @@ def where(self, cond):
Coordinates:
* y (y) int64 0 1 2 3 4
* x (x) int64 0 1 2 3 4
>>> a.where((a > 6) & (a < 18), drop=True)
<xarray.DataArray (x: 3, y: 5)>
array([[ nan, nan, 7., 8., 9.],
[ 10., 11., 12., 13., 14.],
[ 15., 16., 17., nan, nan]])
Coordinates:
* x (x) int64 1 2 3
* y (y) int64 0 1 2 3 4
"""
return self._where(cond)
if other is not None:
raise NotImplementedError("The optional argument 'other' has not yet been implemented")

if drop:
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Given pandas & numpy's where function takes another arg after cond (other), I think we should check for drop's type to ensure anyone who passes that gets caught early

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeed this is an arg that could be kwarg-only in python3. (but we can't do that here)

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch. Maybe we should add an other=None argument as a placeholder for now, simply raising NotImplementedError if other is not None. I'm pretty sure we'll want to add that argument eventually, anyways.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@shoyer and @MaximilianR, thanks for catching this potential issue. Please see the code I just pushed to address this issue. I believe this should be ready to be merged now. I've intentionally left the other option as not implemented for the documentation too, e.g., https://github.com/pydata/xarray/pull/815/files#diff-aec89f8189374cf98061efad7425f561R464

from .dataarray import DataArray
from .dataset import Dataset
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just a nit: I think it's a little cleaner to put imports at the method/function definition.

# get cond with the minimal size needed for the Dataset
if isinstance(cond, Dataset):
clipcond = cond.to_array().any('variable')
elif isinstance(cond, DataArray):
clipcond = cond
else:
raise TypeError("Cond argument is %r but must be a %r or %r" %
(cond, Dataset, DataArray))

# clip the data corresponding to coordinate dims that are not used
clip = dict(zip(clipcond.dims, [np.unique(adim)
for adim in np.nonzero(clipcond.values)]))
outcond = cond.isel(**clip)
outobj = self.sel(**outcond.indexes)
else:
outobj = self
outcond = cond

return outobj._where(outcond)


# this has no runtime function - these are listed so IDEs know these methods
# are defined and don't warn on these operations
Expand Down
2 changes: 1 addition & 1 deletion xarray/core/dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,7 @@ def to_dataset(self, dim=None, name=None):
into separate variables. If not provided, this array is converted
into a Dataset of one variable.
name : str, optional
Name to substitute for this array's name. Only valid is ``dim`` is
Name to substitute for this array's name. Only valid if ``dim`` is
not provided.

Returns
Expand Down
57 changes: 57 additions & 0 deletions xarray/test/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1949,6 +1949,63 @@ def test_where(self):
actual = ds.groupby('c').where(cond)
self.assertDatasetIdentical(expected, actual)

def test_where_drop(self):
# if drop=True
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

let's make this its own test -- test_where is long enough already :).

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed


# 1d
# data array case
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we add a test case with more interesting coordinates? i.e., something that would have broken your previous code where you used sel mistakenly instead of isel.

array = DataArray(range(5), coords=[range(5)], dims=['x'])
expected = DataArray(range(5)[2:], coords=[range(5)[2:]], dims=['x'])
actual = array.where(array > 1, drop=True)
self.assertDatasetIdentical(expected, actual)

# dataset case
ds = Dataset({'a': array})
expected = Dataset({'a': expected})

actual = ds.where(ds > 1, drop=True)
self.assertDatasetIdentical(expected, actual)

actual = ds.where(ds.a > 1, drop=True)
self.assertDatasetIdentical(expected, actual)

with self.assertRaisesRegexp(TypeError, 'must be a'):
ds.where(np.arange(5) > 1, drop=True)

# 1d with odd coordinates
array = DataArray(np.array([2, 7, 1, 8, 3]), coords=[np.array([3, 1, 4, 5, 9])], dims=['x'])
expected = DataArray(np.array([7, 8, 3]), coords=[np.array([1, 5, 9])], dims=['x'])
actual = array.where(array > 2, drop=True)
self.assertDatasetIdentical(expected, actual)

# 1d multiple variables
ds = Dataset({'a': (('x'), [0, 1, 2, 3]), 'b': (('x'), [4, 5, 6, 7])})
expected = Dataset({'a': (('x'), [np.nan, 1, 2, 3]), 'b': (('x'), [4, 5, 6, np.nan])})
actual = ds.where((ds > 0) & (ds < 7), drop=True)
self.assertDatasetIdentical(expected, actual)

# 2d
ds = Dataset({'a': (('x', 'y'), [[0, 1], [2, 3]])})
expected = Dataset({'a': (('x', 'y'), [[np.nan, 1], [2, 3]])})
actual = ds.where(ds > 0, drop=True)
self.assertDatasetIdentical(expected, actual)

# 2d with odd coordinates
ds = Dataset({'a': (('x', 'y'), [[0, 1], [2, 3]])},
coords={'x': [4, 3], 'y': [1, 2],
'z' : (['x','y'], [[np.e, np.pi], [np.pi*np.e, np.pi*3]])})
expected = Dataset({'a': (('x', 'y'), [[3]])},
coords={'x': [3], 'y': [2],
'z' : (['x','y'], [[np.pi*3]])})
actual = ds.where(ds > 2, drop=True)
self.assertDatasetIdentical(expected, actual)

# 2d multiple variables
ds = Dataset({'a': (('x', 'y'), [[0, 1], [2, 3]]), 'b': (('x','y'), [[4, 5], [6, 7]])})
expected = Dataset({'a': (('x', 'y'), [[np.nan, 1], [2, 3]]), 'b': (('x', 'y'), [[4, 5], [6,7]])})
actual = ds.where(ds > 0, drop=True)
self.assertDatasetIdentical(expected, actual)

def test_reduce(self):
data = create_test_data()

Expand Down