-
-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Add drop=True option for where on Dataset and DataArray #815
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -449,7 +449,7 @@ def resample(self, freq, dim, how='mean', skipna=None, closed=None, | |
| result = result.rename({RESAMPLE_DIM: dim.name}) | ||
| return result | ||
|
|
||
| def where(self, cond): | ||
| def where(self, cond, other=None, drop=False): | ||
| """Return an object of the same shape with all entries where cond is | ||
| True and all other entries masked. | ||
|
|
||
|
|
@@ -459,10 +459,15 @@ def where(self, cond): | |
| Parameters | ||
| ---------- | ||
| cond : boolean DataArray or Dataset | ||
| other : unimplemented, optional | ||
| Unimplemented placeholder for compatability with future numpy / pandas versions | ||
| drop : boolean, optional | ||
| Coordinate labels that only correspond to NA values should be dropped | ||
|
|
||
| Returns | ||
| ------- | ||
| same type as caller | ||
| same type as caller or if drop=True same type as caller with dimensions | ||
| reduced for dim element where mask is True | ||
|
|
||
| Examples | ||
| -------- | ||
|
|
@@ -479,8 +484,41 @@ def where(self, cond): | |
| Coordinates: | ||
| * y (y) int64 0 1 2 3 4 | ||
| * x (x) int64 0 1 2 3 4 | ||
| >>> a.where((a > 6) & (a < 18), drop=True) | ||
| <xarray.DataArray (x: 5, y: 5)> | ||
| array([[ nan, nan, 7., 8., 9.], | ||
| [ 10., 11., 12., 13., 14.], | ||
| [ 15., 16., 17., nan, nan], | ||
| Coordinates: | ||
| * x (x) int64 1 2 3 | ||
| * y (y) int64 0 1 2 3 4 | ||
| """ | ||
| return self._where(cond) | ||
| if other is not None: | ||
| raise NotImplementedError("The optional argument 'other' has not yet been implemented") | ||
|
|
||
| if drop: | ||
| from .dataarray import DataArray | ||
| from .dataset import Dataset | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just a nit: I think it's a little cleaner to put imports at the method/function definition. |
||
| # get cond with the minimal size needed for the Dataset | ||
| if isinstance(cond, Dataset): | ||
| clipcond = cond.to_array().any('variable') | ||
| elif isinstance(cond, DataArray): | ||
| clipcond = cond | ||
| else: | ||
| raise TypeError("Cond argument is %r but must be a %r or %r" % | ||
| (cond, Dataset, DataArray)) | ||
|
|
||
| # clip the data corresponding to coordinate dims that are not used | ||
| clip = dict(zip(clipcond.dims, [np.unique(adim) | ||
| for adim in np.nonzero(clipcond.values)])) | ||
| outcond = cond.isel(**clip) | ||
| outobj = self.sel(**outcond.indexes) | ||
| else: | ||
| outobj = self | ||
| outcond = cond | ||
|
|
||
| return outobj._where(outcond) | ||
|
|
||
|
|
||
| # this has no runtime function - these are listed so IDEs know these methods | ||
| # are defined and don't warn on these operations | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1949,6 +1949,63 @@ def test_where(self): | |
| actual = ds.groupby('c').where(cond) | ||
| self.assertDatasetIdentical(expected, actual) | ||
|
|
||
| def test_where_drop(self): | ||
| # if drop=True | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. let's make this its own test --
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Agreed |
||
|
|
||
| # 1d | ||
| # data array case | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can we add a test case with more interesting coordinates? i.e., something that would broken your previous code where you used |
||
| array = DataArray(range(5), coords=[range(5)], dims=['x']) | ||
| expected = DataArray(range(5)[2:], coords=[range(5)[2:]], dims=['x']) | ||
| actual = array.where(array > 1, drop=True) | ||
| self.assertDatasetIdentical(expected, actual) | ||
|
|
||
| # dataset case | ||
| ds = Dataset({'a': array}) | ||
| expected = Dataset({'a': expected}) | ||
|
|
||
| actual = ds.where(ds > 1, drop=True) | ||
| self.assertDatasetIdentical(expected, actual) | ||
|
|
||
| actual = ds.where(ds.a > 1, drop=True) | ||
| self.assertDatasetIdentical(expected, actual) | ||
|
|
||
| with self.assertRaisesRegexp(TypeError, 'must be a'): | ||
| ds.where(np.arange(5) > 1, drop=True) | ||
|
|
||
| # 1d with odd coordinates | ||
| array = DataArray(np.array([2, 7, 1, 8, 3]), coords=[np.array([3, 1, 4, 5, 9])], dims=['x']) | ||
| expected = DataArray(np.array([7, 8, 3]), coords=[np.array([1, 5, 9])], dims=['x']) | ||
| actual = array.where(array > 2, drop=True) | ||
| self.assertDatasetIdentical(expected, actual) | ||
|
|
||
| # 1d multiple variables | ||
| ds = Dataset({'a': (('x'), [0, 1, 2, 3]), 'b': (('x'), [4, 5, 6, 7])}) | ||
| expected = Dataset({'a': (('x'), [np.nan, 1, 2, 3]), 'b': (('x'), [4, 5, 6, np.nan])}) | ||
| actual = ds.where((ds > 0) & (ds < 7), drop=True) | ||
| self.assertDatasetIdentical(expected, actual) | ||
|
|
||
| # 2d | ||
| ds = Dataset({'a': (('x', 'y'), [[0, 1], [2, 3]])}) | ||
| expected = Dataset({'a': (('x', 'y'), [[np.nan, 1], [2, 3]])}) | ||
| actual = ds.where(ds > 0, drop=True) | ||
| self.assertDatasetIdentical(expected, actual) | ||
|
|
||
| # 2d with odd coordinates | ||
| ds = Dataset({'a': (('x', 'y'), [[0, 1], [2, 3]])}, | ||
| coords={'x': [4, 3], 'y': [1, 2], | ||
| 'z' : (['x','y'], [[np.e, np.pi], [np.pi*np.e, np.pi*3]])}) | ||
| expected = Dataset({'a': (('x', 'y'), [[3]])}, | ||
| coords={'x': [3], 'y': [2], | ||
| 'z' : (['x','y'], [[np.pi*3]])}) | ||
| actual = ds.where(ds > 2, drop=True) | ||
| self.assertDatasetIdentical(expected, actual) | ||
|
|
||
| # 2d multiple variables | ||
| ds = Dataset({'a': (('x', 'y'), [[0, 1], [2, 3]]), 'b': (('x','y'), [[4, 5], [6, 7]])}) | ||
| expected = Dataset({'a': (('x', 'y'), [[np.nan, 1], [2, 3]]), 'b': (('x', 'y'), [[4, 5], [6,7]])}) | ||
| actual = ds.where(ds > 0, drop=True) | ||
| self.assertDatasetIdentical(expected, actual) | ||
|
|
||
| def test_reduce(self): | ||
| data = create_test_data() | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Given pandas & numpy's
wherefunction takes another arg aftercond(other), I think we should check fordrop's type to ensure anyone who passes that gets caught earlyThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Indeed this is an arg that could be kwarg-only in python3. (but we can't do that here)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good catch. Maybe we should add an
other=Noneargument as a place holder for now, simply raisingNotImplementedErrorifother is not None. I'm pretty sure we'll want to add that argument eventually, anyways.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@shoyer and @MaximilianR, thanks for catching this potential issue. Please see the code I just pushed to address this issue. I believe this should be ready to be merged now. I've intentionally left the
otheroption as not implemented for the documentation too, e.g., https://github.com/pydata/xarray/pull/815/files#diff-aec89f8189374cf98061efad7425f561R464