
Can't create index file for PubDAS's Valencia data #587

@ahmadtourei

Description

There is an issue with h5py reading the Febus data described below. @d-chambers, you should have access to a Google Drive directory containing two sample patches that reproduce the error.

Example

import dascore as dc

data_path = "test_data/"
sp = dc.spool(data_path).update()
pa = sp[0]

Error:

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
Cell In[4], line 1
----> 1 sp = dc.spool(data_path).update()

File ~/miniconda3/envs/dc_user/lib/python3.12/site-packages/dascore/clients/dirspool.py:108, in DirectorySpool.update(self, progress)
    104 @compose_docstring(doc=BaseSpool.update.__doc__)
    105 def update(self, progress: PROGRESS_LEVELS = "standard") -> Self:
    106     """{doc}."""
    107     out = self.__class__(
--> 108         base_path=self.indexer.update(progress=progress),
    109         preferred_format=self._preferred_format,
    110         select_kwargs=self._select_kwargs,
    111     )
    112     return out

File ~/miniconda3/envs/dc_user/lib/python3.12/site-packages/dascore/io/indexer.py:328, in DirectoryIndexer.update(self, paths, progress)
    326 timestamp = self._get_mtime(only_new=True)
    327 paths = self._get_paths(paths)
--> 328 df = dc.scan_to_df(
    329     path=paths,
    330     timestamp=timestamp,
    331     progress=progress,
    332     ext=self.ext,
    333 )
    334 # Put contents found into database.
    335 if not df.empty:
    336     # Some users were surprised the spool wasn't sorted. We still cant
    337     # guarantee all spools will be sorted but we can make sure most are
    338     # by sorting the contents before dumping to index.

File ~/miniconda3/envs/dc_user/lib/python3.12/site-packages/dascore/io/core.py:693, in scan_to_df(path, file_format, file_version, ext, timestamp, progress, exclude)
    691 if isinstance(path, DataFrameSpool):
    692     return path.get_contents()
--> 693 info = scan(
    694     path=path,
    695     file_format=file_format,
    696     file_version=file_version,
    697     ext=ext,
    698     timestamp=timestamp,
    699     progress=progress,
    700 )
    701 df = _model_list_to_df(info, exclude=exclude)
    702 return df

File ~/miniconda3/envs/dc_user/lib/python3.12/site-packages/dascore/io/core.py:884, in scan(path, file_format, file_version, ext, timestamp, progress)
    882 else:
    883     try:
--> 884         source = fiber_io.scan(resource, _pre_cast=True)
    885     # This happens if the file is corrupt see #346.
    886     except (OSError, InvalidFiberFileError, ValueError, TypeError):

File ~/miniconda3/envs/dc_user/lib/python3.12/site-packages/dascore/io/core.py:404, in _type_caster.<locals>._wrapper(_pre_cast, *args, **kwargs)
    400 # TODO look at replacing this with pydantic's type_guard thing.
    401 
    402 # this allows us to fast-track calls from generic functions
    403 if required_type is None or _pre_cast:
--> 404     return func(*args, **kwargs)
    405 bound = sig.bind(*args, **kwargs)
    406 new_kw = bound.arguments

File ~/miniconda3/envs/dc_user/lib/python3.12/site-packages/dascore/io/febus/core.py:78, in Febus2.scan(self, resource, **kwargs)
     72 file_version = _get_febus_version_str(resource)
     73 extras = {
     74     "path": resource.filename,
     75     "file_format": self.name,
     76     "file_version": str(file_version),
     77 }
---> 78 for attr, cm, _ in _yield_attrs_coords(resource):
     79     attr["coords"] = cm.to_summary_dict()
     80     attr.update(dict(extras))

File ~/miniconda3/envs/dc_user/lib/python3.12/site-packages/dascore/io/febus/utils.py:192, in _yield_attrs_coords(fi)
    190 for febus in febuses:
    191     attr = _get_febus_attrs(febus)
--> 192     cm = _get_febus_coord_manager(febus)
    193     yield attr, cm, febus

File ~/miniconda3/envs/dc_user/lib/python3.12/site-packages/dascore/io/febus/utils.py:180, in _get_febus_coord_manager(feb)
    177 def _get_febus_coord_manager(feb: _FebusSlice) -> CoordManager:
    178     """Get a coordinate manager for febus slice."""
    179     coords = dict(
--> 180         time=_get_time_coord(feb),
    181         distance=_get_distance_coord(feb),
    182     )
    183     cm = get_coord_manager(coords=coords, dims=("time", "distance"))
    184     return cm

File ~/miniconda3/envs/dc_user/lib/python3.12/site-packages/dascore/io/febus/utils.py:127, in _get_time_coord(feb)
    125 # Get spacing between time samples (in s) and the total time of each block.
    126 time_step = feb.zone.attrs["Spacing"][1] / 1_000  # value in ms, convert to s.
--> 127 excess_rows = _get_time_overlap_samples(feb, data_shape)
    128 total_time_rows = (data_shape[1] - excess_rows) * n_blocks
    129 # Get origin info, these are offsets from time to get to the first simple
    130 # of the block. These should always be non-positive.

File ~/miniconda3/envs/dc_user/lib/python3.12/site-packages/dascore/io/febus/utils.py:103, in _get_time_overlap_samples(feb, data_shape)
    101 """Determine the number of redundant samples in the time dimension."""
    102 time_step = feb.zone.attrs["Spacing"][1] / 1_000  # value in ms, convert to s.
--> 103 block_time = _maybe_unpack(1 / (feb.zone.attrs["BlockRate"] / 1_000))
    104 # Since the data have overlaps in each block's time dimension, we need to
    105 # trim the overlap off the time dimension to avoid having to merge blocks later.
    106 # However, sometimes the "BlockOverlap" is wrong, so we calculate it
    107 # manually here.
    108 expected_samples = int(np.round(block_time / time_step))

File h5py/_objects.pyx:54, in h5py._objects.with_phil.wrapper()

File h5py/_objects.pyx:55, in h5py._objects.with_phil.wrapper()

File ~/miniconda3/envs/dc_user/lib/python3.12/site-packages/h5py/_hl/attrs.py:55, in AttributeManager.__getitem__(self, name)
     51 @with_phil
     52 def __getitem__(self, name):
     53     """ Read the value of an attribute.
     54     """
---> 55     attr = h5a.open(self._id, self._e(name))
     56     shape = attr.shape
     58     # shape is None for empty dataspaces

File h5py/_objects.pyx:54, in h5py._objects.with_phil.wrapper()

File h5py/_objects.pyx:55, in h5py._objects.with_phil.wrapper()

File h5py/h5a.pyx:79, in h5py.h5a.open()

KeyError: "Unable to synchronously open attribute (can't locate attribute: 'BlockRate')"
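The KeyError means h5py cannot find a "BlockRate" attribute on the zone group, which the Febus reader accesses unconditionally in _get_time_overlap_samples. A way to confirm which sample file lacks the attribute is to scan each file individually and dump the attributes of the one that fails. This is a diagnostic sketch, not part of the report above; it assumes the files are HDF5 with a .h5 extension sitting directly in test_data/:

from pathlib import Path

import dascore as dc
import h5py

for path in sorted(Path("test_data/").glob("*.h5")):
    try:
        # scan_to_df is the same entry point the indexer uses (see traceback).
        dc.scan_to_df(path)
        print(f"OK:   {path}")
    except KeyError as exc:
        print(f"FAIL: {path} -> {exc}")
        # Print every group/dataset name with its attributes to see
        # whether "BlockRate" exists anywhere in the file.
        with h5py.File(path, "r") as f:
            f.visititems(lambda name, obj: print(name, dict(obj.attrs)))

Note that scan() already swallows OSError, InvalidFiberFileError, ValueError, and TypeError for corrupt files (see the except clause in the traceback), but KeyError is not in that list, so it propagates and aborts the whole index update.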

Expected behavior

Febus data should be scanned and read without error.
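
One possible shape for a fix (an assumption, not dascore's actual implementation) would be to fall back to deriving the block duration from the block's sample count when "BlockRate" is absent:

import numpy as np

# Hedged sketch of a fallback for dascore/io/febus/utils.py. The attribute
# names and unit conventions are taken from the traceback above; the
# fallback logic itself is a guess, not the library's behavior.
def _block_time_with_fallback(zone_attrs, n_time_samples):
    time_step = zone_attrs["Spacing"][1] / 1_000  # value in ms, convert to s
    block_rate = zone_attrs.get("BlockRate")      # h5py attrs support .get()
    if block_rate is not None:
        # Mirrors the existing computation (BlockRate appears to be in mHz).
        return float(np.squeeze(1 / (block_rate / 1_000)))
    # Assumed fallback: with no BlockRate, treat the block as non-overlapping,
    # so its duration is simply sample count * time step.
    return n_time_samples * time_step

Whether a missing BlockRate really implies non-overlapping blocks is a guess; an alternative would be to raise InvalidFiberFileError instead, so that scan's existing except clause skips the file rather than crashing the index update.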

Versions

  • OS: macOS 15.6
  • DASCore Version: 0.1.12
  • Python Version: 3.12


Labels

IO (work for reading/writing different formats), bug (something isn't working)
