-
-
Notifications
You must be signed in to change notification settings - Fork 1.8k
Add Dask Array._meta attribute #4543
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
f909281
0bf17c3
2bfec0d
c8b61af
f54856b
70d2a01
db5e021
1323ee7
04f70f8
95856d7
3e66595
2e0ecf0
1eddd26
30c98c0
7f0ebe8
f584872
c9fad68
ce7e5dd
646ca13
fd5406f
3dec891
2c05399
b703153
8c57904
ef4f970
2cb0e3b
49f0b8e
8d4de71
b8f63db
185765d
0bab189
0f63efb
3698661
1cf6d72
096e4c1
6f95928
7aad860
48add7d
af69515
fe96d2b
0acb35d
fb49b15
6808517
f5281ee
2a62263
0bb0dea
0281f06
757e7ba
6812e68
3885f97
87eed6b
3d2f396
abcddcb
cf59c47
ac44219
2135cb7
4e0ea2a
1abb56d
9a2d333
e3370f2
99e8a79
62f5a42
edc7650
30bc10d
3f659ea
d78540a
0cada3e
80cf6d2
37f9e9c
80e11c6
c22dea5
a52806a
d6912ec
39f43e7
1640271
02d9fb4
b76f1c2
2f2cf72
3c34b56
66a6d00
3a27b7a
c419a07
d2abc3c
342b8ac
b7bfbb0
834c806
113d6e4
90fdfb2
136e0de
426ed92
dc5b7e7
9c41a54
bbc5910
45a852f
1395b20
c84276a
509c4f2
1e778a8
c7ec24d
7149c78
6c4ea85
eb83eb6
9879a77
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -731,7 +731,7 @@ def store(sources, targets, lock=True, regions=None, compute=True, | |
| sources_dsk, | ||
| list(core.flatten([e.__dask_keys__() for e in sources])) | ||
| ) | ||
| sources2 = [Array(sources_dsk, e.name, e.chunks, e.dtype) for e in sources] | ||
| sources2 = [Array(sources_dsk, e.name, e.chunks, meta=e) for e in sources] | ||
|
|
||
| # Optimize all targets together | ||
| targets2 = [] | ||
|
|
@@ -774,7 +774,7 @@ def store(sources, targets, lock=True, regions=None, compute=True, | |
| ) | ||
|
|
||
| result = tuple( | ||
| Array(load_store_dsk, 'load-store-%s' % t, s.chunks, s.dtype) | ||
| Array(load_store_dsk, 'load-store-%s' % t, s.chunks, meta=s) | ||
| for s, t in zip(sources, toks) | ||
| ) | ||
|
|
||
|
|
@@ -857,28 +857,40 @@ class Array(DaskMethodsMixin): | |
| Shape of the entire array | ||
| chunks: iterable of tuples | ||
| block sizes along each dimension | ||
| dtype : str or dtype | ||
| Typecode or data-type for the new Dask Array | ||
| meta : empty ndarray | ||
| empty ndarray created with same NumPy backend, ndim and dtype as the | ||
| Dask Array being created (overrides dtype) | ||
|
|
||
| See Also | ||
| -------- | ||
| dask.array.from_array | ||
| """ | ||
| __slots__ = 'dask', '_name', '_cached_keys', '_chunks', 'dtype' | ||
| __slots__ = 'dask', '_name', '_cached_keys', '_chunks', '_meta' | ||
|
|
||
| def __new__(cls, dask, name, chunks, dtype, shape=None): | ||
| def __new__(cls, dask, name, chunks, dtype=None, meta=None, shape=None): | ||
| self = super(Array, cls).__new__(cls) | ||
| assert isinstance(dask, Mapping) | ||
| if not isinstance(dask, HighLevelGraph): | ||
| dask = HighLevelGraph.from_collections(name, dask, dependencies=()) | ||
| self.dask = dask | ||
| self.name = name | ||
| if dtype is None: | ||
| raise ValueError("You must specify the dtype of the array") | ||
| self.dtype = np.dtype(dtype) | ||
| if dtype is not None and meta is not None: | ||
| raise TypeError("You must not specify both meta and dtype") | ||
| if dtype is None and meta is None: | ||
| raise ValueError("You must specify the meta or dtype of the array") | ||
mrocklin marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| self._chunks = normalize_chunks(chunks, shape, dtype=self.dtype) | ||
| self._chunks = normalize_chunks(chunks, shape, dtype=dtype or meta.dtype) | ||
| if self._chunks is None: | ||
| raise ValueError(CHUNKS_NONE_ERROR_MESSAGE) | ||
|
|
||
| if dtype: | ||
| self._meta = np.empty((0,) * self.ndim, dtype=dtype) | ||
| else: | ||
| from .utils import meta_from_array | ||
| self._meta = meta_from_array(meta, meta.ndim) | ||
|
|
||
| for plugin in config.get('array_plugins', ()): | ||
| result = plugin(self) | ||
| if result is not None: | ||
|
|
@@ -944,8 +956,8 @@ def chunksize(self): | |
| return tuple(max(c) for c in self.chunks) | ||
|
|
||
| @property | ||
| def _meta(self): | ||
| return np.empty(shape=(), dtype=self.dtype) | ||
| def dtype(self): | ||
| return self._meta.dtype | ||
|
|
||
| def _get_chunks(self): | ||
| return self._chunks | ||
|
|
@@ -1217,7 +1229,7 @@ def __setitem__(self, key, value): | |
| if isinstance(value, Array) and value.ndim > 1: | ||
| raise ValueError('boolean index array should have 1 dimension') | ||
| y = where(key, value, self) | ||
| self.dtype = y.dtype | ||
| self._meta = y._meta | ||
| self.dask = y.dask | ||
| self.name = y.name | ||
| self._chunks = y.chunks | ||
|
|
@@ -1267,7 +1279,37 @@ def __getitem__(self, index): | |
| dsk, chunks = slice_array(out, self.name, self.chunks, index2) | ||
|
|
||
| graph = HighLevelGraph.from_collections(out, dsk, dependencies=[self]) | ||
| return Array(graph, out, chunks, dtype=self.dtype) | ||
|
|
||
| if isinstance(index2, tuple): | ||
| new_index = [] | ||
| for i in range(len(index2)): | ||
| if not isinstance(index2[i], tuple): | ||
| types = [Integral, list, np.ndarray] | ||
| cond = any([isinstance(index2[i], t) for t in types]) | ||
| new_index.append(slice(0, 0) if cond else index2[i]) | ||
| else: | ||
| new_index.append(tuple([Ellipsis if j is not None else | ||
| None for j in index2[i]])) | ||
| new_index = tuple(new_index) | ||
| meta = self._meta[new_index].astype(self.dtype) | ||
| else: | ||
| meta = self._meta[index2].astype(self.dtype) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'm going to try to defer to @jakirkham on this :)
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Sorry I'm not totally following what is happening in this block. Could you please provide a summary, @pentschev?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Sure, what happens here is very simple, basically creating a
This way, I convert all cases to 0-sized slices. For example:
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thanks for providing more info. I'm still confused unfortunately. Have a couple of follow-up questions. What is the difference between using Also, I thought
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
I think they're equivalent, I probably started using
It does add a singleton dimension. We would want one if the user requests so, as per @mrocklin's previous example here. Just to be clear here, I was attempting to cover all cases I could think of. Maybe not all of them would make sense in a realistic context, but are technically allowed, so we should cover them. |
||
|
|
||
| # Exception for object dtype and ndim == 1, which results in primitive types | ||
| if not (meta.dtype == object and meta.ndim == 1): | ||
|
|
||
| # If meta still has more dimensions than actual data | ||
| if meta.ndim > len(chunks): | ||
| meta = np.sum(meta, axis=tuple([i for i in range(meta.ndim - len(chunks))])) | ||
|
|
||
| # Ensure all dimensions are 0 | ||
| if not np.isscalar(meta): | ||
| meta = meta[tuple([slice(0, 0) for i in range(meta.ndim)])] | ||
| # If return array is 0-D, ensure _meta is 0-D | ||
| if len(chunks) == 0: | ||
| meta = meta.sum() | ||
|
|
||
| return Array(graph, out, chunks, meta=meta) | ||
|
|
||
| def _vindex(self, key): | ||
| if not isinstance(key, tuple): | ||
|
|
@@ -1336,7 +1378,7 @@ def _blocks(self, index): | |
| layer = {(name,) + key: tuple(new_keys[key].tolist()) for key in keys} | ||
|
|
||
| graph = HighLevelGraph.from_collections(name, layer, dependencies=[self]) | ||
| return Array(graph, name, chunks, self.dtype) | ||
| return Array(graph, name, chunks, meta=self) | ||
|
|
||
| @property | ||
| def blocks(self): | ||
|
|
@@ -1848,7 +1890,7 @@ def copy(self): | |
| if self.npartitions == 1: | ||
| return self.map_blocks(M.copy) | ||
| else: | ||
| return Array(self.dask, self.name, self.chunks, self.dtype) | ||
| return Array(self.dask, self.name, self.chunks, meta=self) | ||
|
|
||
| def __deepcopy__(self, memo): | ||
| c = self.copy() | ||
|
|
@@ -2292,7 +2334,12 @@ def from_array(x, chunks='auto', name=None, lock=False, asarray=True, fancy=True | |
| dtype=x.dtype) | ||
| dsk[original_name] = x | ||
|
|
||
| return Array(dsk, name, chunks, dtype=x.dtype) | ||
| # Workaround for TileDB, its indexing is 1-based, | ||
| # and doesn't seem to support 0-length slicing | ||
| if x.__class__.__module__.split('.')[0] == 'tiledb' and hasattr(x, '_ctx_'): | ||
| return Array(dsk, name, chunks, dtype=x.dtype) | ||
mrocklin marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| return Array(dsk, name, chunks, meta=x) | ||
|
|
||
|
|
||
| def from_zarr(url, component=None, storage_options=None, chunks=None,name=None, **kwargs): | ||
|
|
@@ -2908,6 +2955,11 @@ def concatenate(seq, axis=0, allow_unknown_chunksizes=False): | |
| for i, ind in enumerate(inds): | ||
| ind[axis] = -(i + 1) | ||
|
|
||
| from .utils import meta_from_array | ||
| metas = [getattr(s, '_meta', s) for s in seq] | ||
mrocklin marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| metas = [meta_from_array(m, getattr(m, 'ndim', 1)) for m in metas] | ||
| meta = np.concatenate(metas) | ||
|
|
||
| uc_args = list(concat(zip(seq, inds))) | ||
| _, seq = unify_chunks(*uc_args, warn=False) | ||
|
|
||
|
|
@@ -2922,8 +2974,6 @@ def concatenate(seq, axis=0, allow_unknown_chunksizes=False): | |
| if len(set(seq_dtypes)) > 1: | ||
| dt = reduce(np.promote_types, seq_dtypes) | ||
| seq = [x.astype(dt) for x in seq] | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Leaving a note here for reference, we should be able to use
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Agreed, I totally missed that. For the record, there's still quite some work to be done in that file, as well as a few others, as tracked on #4888. Thanks a lot for fixing this @jakirkham! :) |
||
| else: | ||
| dt = seq_dtypes[0] | ||
|
|
||
| names = [a.name for a in seq] | ||
|
|
||
|
|
@@ -2937,7 +2987,7 @@ def concatenate(seq, axis=0, allow_unknown_chunksizes=False): | |
| dsk = dict(zip(keys, values)) | ||
| graph = HighLevelGraph.from_collections(name, dsk, dependencies=seq) | ||
|
|
||
| return Array(graph, name, chunks, dtype=dt) | ||
| return Array(graph, name, chunks, meta=meta) | ||
|
|
||
|
|
||
| def load_store_chunk(x, out, index, lock, return_stored, load_stored): | ||
|
|
@@ -3318,7 +3368,7 @@ def handle_out(out, result): | |
| "out=%s, result=%s" % (str(out.shape), str(result.shape))) | ||
| out._chunks = result.chunks | ||
| out.dask = result.dask | ||
| out.dtype = result.dtype | ||
| out._meta = result._meta | ||
| out.name = result.name | ||
| elif out is not None: | ||
| msg = ("The out parameter is not fully supported." | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.