
Commit 1ebc98f

support and testing for valencia febus files (#589)
1 parent 7ae947a commit 1ebc98f

10 files changed: +153 -32 lines changed

dascore/__init__.py

Lines changed: 6 additions & 0 deletions
@@ -1,5 +1,8 @@
 """DASCore - A library for fiber optic sensing."""
 from __future__ import annotations
+
+import warnings
+
 from rich import print  # noqa

 from dascore.core.patch import Patch
@@ -16,3 +19,6 @@

 # flag for disabling progress bar when debugging
 _debug = False
+
+# Ensure warnings are issued only once (per warning/line)
+warnings.filterwarnings("once", category=UserWarning, module=r"dascore\..*")
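
Note: the "once" action used above shows each distinct warning (message + category) only the first time it is raised, even when it comes from several different call sites. A minimal standalone sketch of that behavior (not DASCore code; the message is made up):

    import warnings

    warnings.filterwarnings("once", category=UserWarning)

    def from_reader():
        warnings.warn("extent mismatch", UserWarning)

    def from_writer():
        warnings.warn("extent mismatch", UserWarning)

    from_reader()  # printed
    from_writer()  # suppressed: same message/category already shown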

dascore/data_registry.txt

Lines changed: 1 addition & 0 deletions
@@ -32,3 +32,4 @@ small_channel_patch.sgy 31e551aadb361189c1c9325d504c883114ba9a7bb75fe4791e5089fa
 gdr_1.h5 aaf11a7333b720436d194e3c7f4fa66f38907bb0c9abfa1804c150e634642aa2 https://github.com/dasdae/test_data/raw/master/das/gdr_1.h5
 sintela_binary_v3_test_1.raw 4c0afae1ab60b73ddcb3c4f7ac55b976d4d18d1ffc9ea6cd2dfa6a881b697101 https://github.com/dasdae/test_data/raw/master/das/sintela_binary_v3_test_1.raw
 prodml_fbe_1.h5 1e7fa0e63bd701ae5f65e5c1adfb02957594b4e334a6933a38cae2481ecdeabb https://github.com/dasdae/test_data/raw/master/das/prodml_fbe_1.h5
+valencia_febus_example.h5 ecfd61dfe7356d327890cb56b6acf2a9ef92206bd3bdc8c7cae387ab707ffc8f https://github.com/dasdae/test_data/raw/master/das/valencia_febus_example.h5

dascore/io/febus/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -6,4 +6,4 @@
 More info about febus can be found here: https://www.febus-optics.com/en/
 """
 from __future__ import annotations
-from .core import Febus2
+from .core import Febus1, Febus2

dascore/io/febus/core.py

Lines changed: 1 addition & 1 deletion
@@ -98,7 +98,7 @@ def read(
 class Febus1(Febus2):
     """Support for Febus V 1.

-    This is here to support legacy febus (eg pubdas Valencia)
+    This is here to support legacy Febus (eg pubdas Valencia)
     """

     version = "1"

dascore/io/febus/utils.py

Lines changed: 83 additions & 19 deletions
@@ -2,7 +2,9 @@

 from __future__ import annotations

+import warnings
 from collections import namedtuple
+from functools import cache

 import numpy as np

@@ -13,6 +15,7 @@
     _maybe_unpack,
     broadcast_for_index,
     maybe_get_items,
+    tukey_fence,
     unbyte,
 )

@@ -24,6 +27,58 @@
 )


+@cache
+def _get_block_time(feb):
+    """Get the block time (time in seconds between each block)."""
+    # Some files have this set. We haven't yet seen any files where this
+    # value exists and is wrong, so we trust it (for now). This is probably
+    # much faster than reading the whole time vector.
+    br = feb.zone.attrs.get("BlockRate", 0) / 1_000
+    if br > 0:
+        return 1 / br
+    # Otherwise we have to try to use the time vector. Here be dragons.
+    time_shape = feb.source["time"].shape
+    # Not sure why, but time has the shape of [1, n] for some files and just
+    # n for others. The first might imply different times for different
+    # zones? We aren't set up to handle that, but we don't know if it can
+    # happen, so just assert here.
+    assert np.max(time_shape) == np.prod(
+        time_shape
+    ), "Non flat 2d time vector is not supported by DASCore Febus reader."
+    # Get the average time spacing in each block. These can vary a bit, so
+    # account for outliers.
+    time = np.squeeze(feb.source["time"][:])
+    d_time = time[1:] - time[:-1]
+    tmin, tmax = tukey_fence(d_time)
+    d_time = d_time[(d_time >= tmin) & (d_time <= tmax)]
+    # After removing outliers, the mean seems to work better than the median
+    # for the test files we have. There is still a concerning amount of
+    # variability.
+    return np.mean(d_time)
+
+
+@cache
+def _get_sample_spacing(feb: _FebusSlice, n_time_samps: int):
+    """
+    Determine the temporal sample spacing (in seconds).
+    """
+    # Note: This is a bit dicey, but we are trying to account for variability
+    # seen in real Febus files. In some files the zone Spacing attr indicates
+    # one sample rate, while zone.attrs['SamplingRate'] indicates another. It
+    # varies as to which one is actually right, so we try to figure that
+    # out here.
+    ts_1 = feb.zone.attrs["Spacing"][1] / 1_000  # value in ms, convert to s.
+    # In most cases sample_rate is either bogus or in Hz. It isn't even
+    # mentioned in some Febus documentation.
+    ts_2 = _maybe_unpack(1.0 / feb.zone.attrs["SamplingRate"])
+    # Get the block time. This doesn't account for overlap, so it won't be exact.
+    block_time = _get_block_time(feb)
+    # Get candidate times, return the closest to the block_time.
+    ts_array = np.array([ts_1, ts_2])
+    block_time_array = ts_array * n_time_samps
+    return ts_array[np.argmin(np.abs(block_time_array - block_time))]
+
+
 def _flatten_febus_info(fi) -> tuple[_FebusSlice, ...]:
     """
     Given a febus file, return a tuple of named tuples with key info.
@@ -97,16 +152,16 @@ def _get_febus_attrs(feb: _FebusSlice) -> dict:
     return out


-def _get_time_overlap_samples(feb, data_shape):
+def _get_time_overlap_samples(feb, n_time_samps, tstep=None):
     """Determine the number of redundant samples in the time dimension."""
-    time_step = feb.zone.attrs["Spacing"][1] / 1_000  # value in ms, convert to s.
-    block_time = _maybe_unpack(1 / (feb.zone.attrs["BlockRate"] / 1_000))
+    tstep = tstep if tstep is not None else _get_sample_spacing(feb, n_time_samps)
+    block_time = _get_block_time(feb)
     # Since the data have overlaps in each block's time dimension, we need to
     # trim the overlap off the time dimension to avoid having to merge blocks later.
     # However, sometimes the "BlockOverlap" is wrong, so we calculate it
-    # manually here.
-    expected_samples = int(np.round(block_time / time_step))
-    excess_rows = data_shape[1] - expected_samples
+    # manually here, rounding to the nearest even number.
+    expected_samples = int(np.round((block_time / tstep) / 2) * 2)
+    excess_rows = n_time_samps - expected_samples
     assert (
         excess_rows % 2 == 0
     ), "excess rows must be symmetric to distribute on both ends"
@@ -119,23 +174,32 @@ def _get_time_coord(feb):
     # In older versions the time shape is different; always grab the first element.
     first_slice = tuple(0 for _ in time.shape)
     t_0 = time[first_slice]
-    # Data dimensions are block_index, time, distance
-    data_shape = feb.zone[feb.data_name].shape
-    n_blocks = data_shape[0]
+    # Number of time blocks in the data cube.
+    shape = feb.zone[feb.data_name].shape
+    n_time_samps = shape[1]
+    n_blocks = shape[0]
     # Get spacing between time samples (in s) and the total time of each block.
-    time_step = feb.zone.attrs["Spacing"][1] / 1_000  # value in ms, convert to s.
-    excess_rows = _get_time_overlap_samples(feb, data_shape)
-    total_time_rows = (data_shape[1] - excess_rows) * n_blocks
+    time_step = _get_sample_spacing(feb, n_time_samps)
+    excess_rows = _get_time_overlap_samples(feb, n_time_samps, tstep=time_step)
+    total_time_rows = (n_time_samps - excess_rows) * n_blocks
     # Get origin info; these are offsets from time to get to the first sample
     # of the block. These should always be non-positive.
     time_offset = feb.zone.attrs["Origin"][1] / 1_000  # also convert to s
     assert time_offset <= 0, "time offset must be non positive"
     # Get the start/stop indices for the zone. We assume zones never sub-slice
-    # time (only distance) but assert that here.
+    # time (only distance). However, some files (eg Valencia) have an incorrect
+    # value set here, so we only warn.
     extent = feb.zone.attrs["Extent"]
-    assert (extent[3] - extent[2] + 1) == data_shape[1], "Cant handle sub time zones"
-    # Create time coord
-    # Need to account for removing overlap times.
+    if (extent[3] - extent[2] + 1) != n_time_samps:
+        msg = (
+            "It appears the Febus file extents specify a different range than "
+            "found in the data array. Double check this is correct."
+        )
+        warnings.warn(msg, UserWarning, stacklevel=2)
+    # Create time coord.
+    # Need to account for removing overlap times. Also, the time vector refers
+    # to the center of the block, so this finds the first non-overlapping
+    # sample.
     total_start = t_0 + time_offset + (excess_rows // 2) * time_step
     total_end = total_start + total_time_rows * time_step
     time_coord = get_coord(
@@ -224,7 +288,7 @@ def _get_time_filtered_data(data, t_start_end, time, total_slice, time_coord):
     times = t_start_end[in_time]
     # get start/stop indexes for complete blocks
     start = np.argmax(in_time)
-    stop = np.argmax(np.cumsum(in_time))
+    stop = np.argmax(np.cumsum(in_time)) + (1 if len(times) else 0)
     total_slice[0] = slice(start, stop)
     # load data from disk.
     data_2d = data[tuple(total_slice)].reshape(-1, data.shape[-1])
@@ -244,8 +308,8 @@ def _get_time_filtered_data(data, t_start_end, time, total_slice, time_coord):
     dist_coord, time_coord = cm.coord_map["distance"], cm.coord_map["time"]
     data = febus.zone[febus.data_name]
     data_shape = data.shape
-    skip_rows = _get_time_overlap_samples(febus, data_shape) // 2
-    # Need to handle case where excess_rows == 0
+    skip_rows = _get_time_overlap_samples(febus, data_shape[1]) // 2
+    # This handles the case where excess_rows == 0
     data_slice = slice(skip_rows, -skip_rows if skip_rows else None)
     total_slice = list(broadcast_for_index(3, 1, data_slice))
     total_time_rows = data_shape[1] - 2 * skip_rows
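
The candidate-selection idea in _get_sample_spacing is worth a worked example: whichever candidate spacing best reproduces the observed block duration is probably the correct one. A standalone sketch with made-up numbers (not read from a real Febus file):

    import numpy as np

    n_time_samps = 2000   # time samples per block
    block_time = 1.0      # seconds per block, e.g. derived from BlockRate
    ts_1 = 0.0005         # candidate from zone.attrs["Spacing"]
    ts_2 = 0.002          # candidate from 1 / zone.attrs["SamplingRate"]

    ts_array = np.array([ts_1, ts_2])
    predicted = ts_array * n_time_samps          # [1.0, 4.0] seconds
    best = ts_array[np.argmin(np.abs(predicted - block_time))]
    assert best == ts_1  # Spacing wins for these (fabricated) values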

dascore/utils/misc.py

Lines changed: 13 additions & 0 deletions
@@ -926,3 +926,16 @@ def get_2d_line_intersection(p1, p2, p3, p4):
     px = num_x / denom
     py = num_y / denom
     return np.array([px, py])
+
+
+def tukey_fence(data, fence_multiplier=1.5) -> np.ndarray:
+    """
+    Apply Tukey's fence to determine data range without outliers.
+    """
+    q1, q3 = np.nanpercentile(data, [25, 75])
+    dmin, dmax = np.nanmin(data), np.nanmax(data)
+    diff = q3 - q1  # Interquartile range (IQR)
+    q_lower = np.nanmax([q1 - diff * fence_multiplier, dmin])
+    q_upper = np.nanmin([q3 + diff * fence_multiplier, dmax])
+    lower_and_top = np.asarray([q_lower, q_upper])
+    return lower_and_top
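
A quick usage sketch of the new helper (the input values mirror the test added below; the rest is illustrative):

    import numpy as np
    from dascore.utils.misc import tukey_fence

    data = np.array([0.0, 10.0, 12.0, 15.0, 18.0, 20.0, 100.0])
    low, high = tukey_fence(data)
    # Q1=11, Q3=19, IQR=8 -> raw fences [-1, 31]; clamping to the data
    # range gives low == 0.0 and high == 31.0, so 100 falls outside.
    trimmed = data[(data >= low) & (data <= high)]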

dascore/viz/waterfall.py

Lines changed: 2 additions & 9 deletions
@@ -12,6 +12,7 @@
 from dascore.constants import PatchType
 from dascore.exceptions import ParameterError
 from dascore.units import get_quantity_str, maybe_convert_percent_to_fraction
+from dascore.utils.misc import tukey_fence
 from dascore.utils.patch import patch_function
 from dascore.utils.plotting import (
     _format_time_axis,
@@ -22,8 +23,6 @@
 )
 from dascore.utils.time import dtype_time_like, is_datetime64

-IQR_FENCE_MULTIPLIER = 1.5
-

 def _validate_scale_type(scale_type):
     """Validate that scale_type is either 'relative' or 'absolute'."""
@@ -76,13 +75,7 @@ def _get_scale(scale, scale_type, data):
         # This prevents a few extreme values from obscuring the majority of the
         # data at the cost of a slight performance penalty.
         case ([], "relative"):
-            q1, q3 = np.nanpercentile(data, [25, 75])
-            dmin, dmax = np.nanmin(data), np.nanmax(data)
-            diff = q3 - q1  # Interquartile range (IQR)
-            q_lower = np.nanmax([q1 - diff * IQR_FENCE_MULTIPLIER, dmin])
-            q_upper = np.nanmin([q3 + diff * IQR_FENCE_MULTIPLIER, dmax])
-            scale = np.asarray([q_lower, q_upper])
-            return scale
+            return tukey_fence(data)
         # Case 3: Sequence with relative scaling
         # Scale values represent fractions of the data range [0, 1]
         # and are mapped to [data_min, data_max]
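
The effect of the refactor: the default relative scale now comes from the shared tukey_fence helper, so a handful of extreme samples no longer stretch the color limits. A sketch of the behavior (synthetic data, illustration only):

    import numpy as np
    from dascore.utils.misc import tukey_fence

    data = np.random.randn(200, 200)
    data[0, 0] = 1e6  # one spike that would otherwise wash out the colormap
    vmin, vmax = tukey_fence(data)  # robust limits that ignore the spike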

tests/conftest.py

Lines changed: 4 additions & 0 deletions
@@ -4,6 +4,7 @@

 import os
 import shutil
+import warnings
 from contextlib import suppress
 from pathlib import Path

@@ -35,6 +36,9 @@
 SPOOL_FIXTURES = []
 PATCH_FIXTURES = []

+# By default DASCore only issues a warning once per line. This ensures
+# they get issued every time so tests around warning behavior aren't flaky.
+warnings.filterwarnings("default", category=UserWarning)

 # --- Pytest configuration

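
This works because warnings.filterwarnings prepends to the filter list, so the most recently added entry matches first and the test suite's "default" entry overrides the package-level "once" filter set in dascore/__init__.py. A quick standalone check:

    import warnings

    warnings.filterwarnings("once", category=UserWarning)     # package-level
    warnings.filterwarnings("default", category=UserWarning)  # test override
    assert warnings.filters[0][0] == "default"  # newest filter wins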

tests/test_io/test_common_io.py

Lines changed: 3 additions & 2 deletions
@@ -27,7 +27,7 @@
 from dascore.io.ap_sensing import APSensingV10
 from dascore.io.dasdae import DASDAEV1
 from dascore.io.dashdf5 import DASHDF5
-from dascore.io.febus import Febus2
+from dascore.io.febus import Febus1, Febus2
 from dascore.io.gdr import GDR_V1
 from dascore.io.h5simple import H5Simple
 from dascore.io.neubrex import NeubrexDASV1, NeubrexRFSV1
@@ -70,6 +70,7 @@
     H5Simple(): ("h5_simple_2.h5", "h5_simple_1.h5"),
     APSensingV10(): ("ap_sensing_1.hdf5",),
     Febus2(): ("febus_1.h5",),
+    Febus1(): ("valencia_febus_example.h5",),
     OptoDASV8(): ("opto_das_1.hdf5",),
     DASDAEV1(): ("example_dasdae_event_1.h5",),
     TDMSFormatterV4713(): ("sample_tdms_file_v4713.tdms",),
@@ -253,7 +254,7 @@ def test_all_other_files_arent_format(self, io_instance):
         for other_io, data_files in COMMON_IO_READ_TESTS.items():
             if isinstance(other_io, type(io_instance)):
                 continue
-            for key in data_files:
+            for key in iterate(data_files):
                 with skip_timeout():
                     path = fetch(key)
                     out = io_instance.get_format(path)
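
Context for the iterate change: if a registry value were a lone string rather than a tuple, looping over it directly would yield single characters. The dascore.utils.misc.iterate helper presumably normalizes that; a hypothetical sketch of the behavior relied on here:

    def iterate_sketch(obj):
        """Return obj as a tuple, treating a lone string as one item."""
        if isinstance(obj, str):
            return (obj,)
        return tuple(obj)

    assert iterate_sketch("febus_1.h5") == ("febus_1.h5",)
    assert iterate_sketch(("a.h5", "b.h5")) == ("a.h5", "b.h5")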

tests/test_utils/test_misc.py

Lines changed: 39 additions & 0 deletions
@@ -24,7 +24,9 @@
     maybe_get_items,
     maybe_mem_map,
     optional_import,
+    suppress_warnings,
     to_object_array,
+    tukey_fence,
     warn_or_raise,
 )

@@ -256,6 +258,43 @@ def test_3_point_2nd_derivative(self):
         assert np.allclose(out, expected)


+class TestTukeyFence:
+    """Tests for Tukey fence outlier detection."""
+
+    def test_constant_data(self):
+        """Constant data should return the same value for both bounds."""
+        data = np.array([5.0, 5.0, 5.0, 5.0])
+        result = tukey_fence(data)
+        assert np.allclose(result, [5.0, 5.0])
+
+    def test_simple_range(self):
+        """Simple range should calculate fences correctly."""
+        data = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
+        result = tukey_fence(data)
+        # Q1=2, Q3=4, IQR=2, fences: 2-1.5*2=-1, 4+1.5*2=7
+        # But clamped to data range [1, 5]
+        assert np.allclose(result, [1.0, 5.0])
+
+    def test_all_nan(self):
+        """All-NaN data should return NaN bounds."""
+        data = np.array([np.nan, np.nan, np.nan])
+        with suppress_warnings():
+            result = tukey_fence(data)
+        assert np.isnan(result[0])
+        assert np.isnan(result[1])
+
+    def test_data_with_outliers(self):
+        """Data with outliers should exclude them from range."""
+        # Core data: 10-20, with outliers at 0 and 100
+        data = np.array([0.0, 10.0, 12.0, 15.0, 18.0, 20.0, 100.0])
+        result = tukey_fence(data)
+        # Q1=11, Q3=19, IQR=8, fences: 11-1.5*8=-1, 19+1.5*8=31
+        # Clamped: [0, 31], but 100 is excluded
+        expected_lower = max(11 - 1.5 * 8, 0.0)
+        expected_upper = min(19 + 1.5 * 8, 100.0)
+        assert np.allclose(result, [expected_lower, expected_upper])
+
+
 class TestCachedMethod:
     """Ensure cached methods caches method calls (duh)."""