
Commit 1ebc98f

support and testing for valencia febus files (#589)
1 parent 7ae947a commit 1ebc98f

10 files changed: +153 -32 lines changed

dascore/__init__.py

Lines changed: 6 additions & 0 deletions
@@ -1,5 +1,8 @@
 """DASCore - A library for fiber optic sensing."""
 from __future__ import annotations
+
+import warnings
+
 from rich import print  # noqa

 from dascore.core.patch import Patch
@@ -16,3 +19,6 @@

 # flag for disabling progress bar when debugging
 _debug = False
+
+# Ensure warnings are issued only once (per warning/line)
+warnings.filterwarnings("once", category=UserWarning, module=r"dascore\..*")
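
Note: the "once" action used above shows each distinct warning (message + category) only the first time it is raised, even when it comes from several different call sites. A minimal standalone sketch of that behavior (not DASCore code; the message is made up):

    import warnings

    warnings.filterwarnings("once", category=UserWarning)

    def from_reader():
        warnings.warn("extent mismatch", UserWarning)

    def from_writer():
        warnings.warn("extent mismatch", UserWarning)

    from_reader()  # printed
    from_writer()  # suppressed: same message/category already shown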

dascore/data_registry.txt

Lines changed: 1 addition & 0 deletions
@@ -32,3 +32,4 @@ small_channel_patch.sgy 31e551aadb361189c1c9325d504c883114ba9a7bb75fe4791e5089fa
 gdr_1.h5 aaf11a7333b720436d194e3c7f4fa66f38907bb0c9abfa1804c150e634642aa2 https://github.com/dasdae/test_data/raw/master/das/gdr_1.h5
 sintela_binary_v3_test_1.raw 4c0afae1ab60b73ddcb3c4f7ac55b976d4d18d1ffc9ea6cd2dfa6a881b697101 https://github.com/dasdae/test_data/raw/master/das/sintela_binary_v3_test_1.raw
 prodml_fbe_1.h5 1e7fa0e63bd701ae5f65e5c1adfb02957594b4e334a6933a38cae2481ecdeabb https://github.com/dasdae/test_data/raw/master/das/prodml_fbe_1.h5
+valencia_febus_example.h5 ecfd61dfe7356d327890cb56b6acf2a9ef92206bd3bdc8c7cae387ab707ffc8f https://github.com/dasdae/test_data/raw/master/das/valencia_febus_example.h5

dascore/io/febus/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -6,4 +6,4 @@
 More info about febus can be found here: https://www.febus-optics.com/en/
 """
 from __future__ import annotations
-from .core import Febus2
+from .core import Febus1, Febus2

dascore/io/febus/core.py

Lines changed: 1 addition & 1 deletion
@@ -98,7 +98,7 @@ def read(
 class Febus1(Febus2):
     """Support for Febus V 1.

-    This is here to support legacy febus (eg pubdas Valencia)
+    This is here to support legacy Febus (eg pubdas Valencia)
     """

     version = "1"

dascore/io/febus/utils.py

Lines changed: 83 additions & 19 deletions
@@ -2,7 +2,9 @@

 from __future__ import annotations

+import warnings
 from collections import namedtuple
+from functools import cache

 import numpy as np

@@ -13,6 +15,7 @@
     _maybe_unpack,
     broadcast_for_index,
     maybe_get_items,
+    tukey_fence,
     unbyte,
 )

@@ -24,6 +27,58 @@
 )


+@cache
+def _get_block_time(feb):
+    """Get the block time (time in seconds between each block)."""
+    # Some files have this set. We haven't yet seen any files where this
+    # value exists and is wrong, so we trust it (for now). This is probably
+    # much faster than reading the whole time vector.
+    br = feb.zone.attrs.get("BlockRate", 0) / 1_000
+    if br > 0:
+        return 1 / br
+    # Otherwise we have to try to use the time vector. Here be dragons.
+    time_shape = feb.source["time"].shape
+    # Not sure why, but time has the shape of [1, n] for some files and just
+    # n for others. The first might imply different times for different
+    # zones? We aren't set up to handle that, but we don't know if it can
+    # happen, so just assert here.
+    assert np.max(time_shape) == np.prod(
+        time_shape
+    ), "Non flat 2d time vector is not supported by DASCore Febus reader."
+    # Get the average time spacing in each block. These can vary a bit, so
+    # account for outliers.
+    time = np.squeeze(feb.source["time"][:])
+    d_time = time[1:] - time[:-1]
+    tmin, tmax = tukey_fence(d_time)
+    d_time = d_time[(d_time >= tmin) & (d_time <= tmax)]
+    # After removing outliers, the mean seems to work better than the median
+    # for the test files we have. There is still a concerning amount of
+    # variability.
+    return np.mean(d_time)
+
+
+@cache
+def _get_sample_spacing(feb: _FebusSlice, n_time_samps: int):
+    """
+    Determine the temporal sample spacing (in seconds).
+    """
+    # Note: This is a bit dicey, but we are trying to account for variability
+    # seen in real Febus files. In some files the zone Spacing attr indicates
+    # one sample rate, while zone.attrs['SamplingRate'] indicates another. It
+    # varies as to which one is actually right, so we try to figure that
+    # out here.
+    ts_1 = feb.zone.attrs["Spacing"][1] / 1_000  # value in ms, convert to s.
+    # In most cases sample_rate is either bogus or in Hz. It isn't even
+    # mentioned in some Febus documentation.
+    ts_2 = _maybe_unpack(1.0 / feb.zone.attrs["SamplingRate"])
+    # Get the block time. This doesn't account for overlap, so it won't be exact.
+    block_time = _get_block_time(feb)
+    # Get candidate times, return the closest to the block_time.
+    ts_array = np.array([ts_1, ts_2])
+    block_time_array = ts_array * n_time_samps
+    return ts_array[np.argmin(np.abs(block_time_array - block_time))]
+
+
 def _flatten_febus_info(fi) -> tuple[_FebusSlice, ...]:
     """
     Given a febus file, return a tuple of named tuples with key info.
@@ -97,16 +152,16 @@ def _get_febus_attrs(feb: _FebusSlice) -> dict:
     return out


-def _get_time_overlap_samples(feb, data_shape):
+def _get_time_overlap_samples(feb, n_time_samps, tstep=None):
     """Determine the number of redundant samples in the time dimension."""
-    time_step = feb.zone.attrs["Spacing"][1] / 1_000  # value in ms, convert to s.
-    block_time = _maybe_unpack(1 / (feb.zone.attrs["BlockRate"] / 1_000))
+    tstep = tstep if tstep is not None else _get_sample_spacing(feb, n_time_samps)
+    block_time = _get_block_time(feb)
     # Since the data have overlaps in each block's time dimension, we need to
     # trim the overlap off the time dimension to avoid having to merge blocks later.
     # However, sometimes the "BlockOverlap" is wrong, so we calculate it
-    # manually here.
-    expected_samples = int(np.round(block_time / time_step))
-    excess_rows = data_shape[1] - expected_samples
+    # manually here, rounding to the nearest even number.
+    expected_samples = int(np.round((block_time / tstep) / 2) * 2)
+    excess_rows = n_time_samps - expected_samples
     assert (
         excess_rows % 2 == 0
     ), "excess rows must be symmetric to distribute on both ends"
@@ -119,23 +174,32 @@ def _get_time_coord(feb):
     # In older versions the time shape is different; always grab the first element.
     first_slice = tuple(0 for _ in time.shape)
     t_0 = time[first_slice]
-    # Data dimensions are block_index, time, distance
-    data_shape = feb.zone[feb.data_name].shape
-    n_blocks = data_shape[0]
+    # Number of time blocks in the data cube.
+    shape = feb.zone[feb.data_name].shape
+    n_time_samps = shape[1]
+    n_blocks = shape[0]
     # Get spacing between time samples (in s) and the total time of each block.
-    time_step = feb.zone.attrs["Spacing"][1] / 1_000  # value in ms, convert to s.
-    excess_rows = _get_time_overlap_samples(feb, data_shape)
-    total_time_rows = (data_shape[1] - excess_rows) * n_blocks
+    time_step = _get_sample_spacing(feb, n_time_samps)
+    excess_rows = _get_time_overlap_samples(feb, n_time_samps, tstep=time_step)
+    total_time_rows = (n_time_samps - excess_rows) * n_blocks
     # Get origin info; these are offsets from time to get to the first sample
     # of the block. These should always be non-positive.
     time_offset = feb.zone.attrs["Origin"][1] / 1_000  # also convert to s
     assert time_offset <= 0, "time offset must be non positive"
     # Get the start/stop indices for the zone. We assume zones never sub-slice
-    # time (only distance) but assert that here.
+    # time (only distance). However, some files (eg Valencia) have an incorrect
+    # value set here, so we only warn.
     extent = feb.zone.attrs["Extent"]
-    assert (extent[3] - extent[2] + 1) == data_shape[1], "Cant handle sub time zones"
-    # Create time coord
-    # Need to account for removing overlap times.
+    if (extent[3] - extent[2] + 1) != n_time_samps:
+        msg = (
+            "It appears the Febus file extents specify a different range than "
+            "found in the data array. Double check this is correct."
+        )
+        warnings.warn(msg, UserWarning, stacklevel=2)
+    # Create time coord.
+    # Need to account for removing overlap times. Also, the time vector refers
+    # to the center of the block, so this finds the first non-overlapping
+    # sample.
     total_start = t_0 + time_offset + (excess_rows // 2) * time_step
     total_end = total_start + total_time_rows * time_step
     time_coord = get_coord(
@@ -224,7 +288,7 @@ def _get_time_filtered_data(data, t_start_end, time, total_slice, time_coord):
     times = t_start_end[in_time]
     # get start/stop indexes for complete blocks
     start = np.argmax(in_time)
-    stop = np.argmax(np.cumsum(in_time))
+    stop = np.argmax(np.cumsum(in_time)) + (1 if len(times) else 0)
     total_slice[0] = slice(start, stop)
     # load data from disk.
     data_2d = data[tuple(total_slice)].reshape(-1, data.shape[-1])
@@ -244,8 +308,8 @@ def _get_time_filtered_data(data, t_start_end, time, total_slice, time_coord):
     dist_coord, time_coord = cm.coord_map["distance"], cm.coord_map["time"]
     data = febus.zone[febus.data_name]
     data_shape = data.shape
-    skip_rows = _get_time_overlap_samples(febus, data_shape) // 2
-    # Need to handle case where excess_rows == 0
+    skip_rows = _get_time_overlap_samples(febus, data_shape[1]) // 2
+    # This handles the case where excess_rows == 0
     data_slice = slice(skip_rows, -skip_rows if skip_rows else None)
     total_slice = list(broadcast_for_index(3, 1, data_slice))
     total_time_rows = data_shape[1] - 2 * skip_rows
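
The candidate-selection idea in _get_sample_spacing is worth a worked example: whichever candidate spacing best reproduces the observed block duration is probably the correct one. A standalone sketch with made-up numbers (not read from a real Febus file):

    import numpy as np

    n_time_samps = 2000   # time samples per block
    block_time = 1.0      # seconds per block, e.g. derived from BlockRate
    ts_1 = 0.0005         # candidate from zone.attrs["Spacing"]
    ts_2 = 0.002          # candidate from 1 / zone.attrs["SamplingRate"]

    ts_array = np.array([ts_1, ts_2])
    predicted = ts_array * n_time_samps          # [1.0, 4.0] seconds
    best = ts_array[np.argmin(np.abs(predicted - block_time))]
    assert best == ts_1  # Spacing wins for these (fabricated) values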

dascore/utils/misc.py

Lines changed: 13 additions & 0 deletions
@@ -926,3 +926,16 @@ def get_2d_line_intersection(p1, p2, p3, p4):
     px = num_x / denom
     py = num_y / denom
     return np.array([px, py])
+
+
+def tukey_fence(data, fence_multiplier=1.5) -> np.ndarray:
+    """
+    Apply Tukey's fence to determine data range without outliers.
+    """
+    q1, q3 = np.nanpercentile(data, [25, 75])
+    dmin, dmax = np.nanmin(data), np.nanmax(data)
+    diff = q3 - q1  # Interquartile range (IQR)
+    q_lower = np.nanmax([q1 - diff * fence_multiplier, dmin])
+    q_upper = np.nanmin([q3 + diff * fence_multiplier, dmax])
+    lower_and_top = np.asarray([q_lower, q_upper])
+    return lower_and_top
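
A quick usage sketch of the new helper (the input values mirror the test added below; the rest is illustrative):

    import numpy as np
    from dascore.utils.misc import tukey_fence

    data = np.array([0.0, 10.0, 12.0, 15.0, 18.0, 20.0, 100.0])
    low, high = tukey_fence(data)
    # Q1=11, Q3=19, IQR=8 -> raw fences [-1, 31]; clamping to the data
    # range gives low == 0.0 and high == 31.0, so 100 falls outside.
    trimmed = data[(data >= low) & (data <= high)]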

dascore/viz/waterfall.py

Lines changed: 2 additions & 9 deletions
@@ -12,6 +12,7 @@
 from dascore.constants import PatchType
 from dascore.exceptions import ParameterError
 from dascore.units import get_quantity_str, maybe_convert_percent_to_fraction
+from dascore.utils.misc import tukey_fence
 from dascore.utils.patch import patch_function
 from dascore.utils.plotting import (
     _format_time_axis,
@@ -22,8 +23,6 @@
 )
 from dascore.utils.time import dtype_time_like, is_datetime64

-IQR_FENCE_MULTIPLIER = 1.5
-

 def _validate_scale_type(scale_type):
     """Validate that scale_type is either 'relative' or 'absolute'."""
@@ -76,13 +75,7 @@ def _get_scale(scale, scale_type, data):
         # This prevents a few extreme values from obscuring the majority of the
         # data at the cost of a slight performance penalty.
         case ([], "relative"):
-            q1, q3 = np.nanpercentile(data, [25, 75])
-            dmin, dmax = np.nanmin(data), np.nanmax(data)
-            diff = q3 - q1  # Interquartile range (IQR)
-            q_lower = np.nanmax([q1 - diff * IQR_FENCE_MULTIPLIER, dmin])
-            q_upper = np.nanmin([q3 + diff * IQR_FENCE_MULTIPLIER, dmax])
-            scale = np.asarray([q_lower, q_upper])
-            return scale
+            return tukey_fence(data)
         # Case 3: Sequence with relative scaling
         # Scale values represent fractions of the data range [0, 1]
         # and are mapped to [data_min, data_max]
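
The effect of the refactor: the default relative scale now comes from the shared tukey_fence helper, so a handful of extreme samples no longer stretch the color limits. A sketch of the behavior (synthetic data, illustration only):

    import numpy as np
    from dascore.utils.misc import tukey_fence

    data = np.random.randn(200, 200)
    data[0, 0] = 1e6  # one spike that would otherwise wash out the colormap
    vmin, vmax = tukey_fence(data)  # robust limits that ignore the spike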

tests/conftest.py

Lines changed: 4 additions & 0 deletions
@@ -4,6 +4,7 @@

 import os
 import shutil
+import warnings
 from contextlib import suppress
 from pathlib import Path

@@ -35,6 +36,9 @@
 SPOOL_FIXTURES = []
 PATCH_FIXTURES = []

+# By default DASCore only issues a warning once per line. This ensures
+# they get issued every time so tests around warning behavior aren't flaky.
+warnings.filterwarnings("default", category=UserWarning)

 # --- Pytest configuration

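
This works because warnings.filterwarnings prepends to the filter list, so the most recently added entry matches first and the test suite's "default" entry overrides the package-level "once" filter set in dascore/__init__.py. A quick standalone check:

    import warnings

    warnings.filterwarnings("once", category=UserWarning)     # package-level
    warnings.filterwarnings("default", category=UserWarning)  # test override
    assert warnings.filters[0][0] == "default"  # newest filter wins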

tests/test_io/test_common_io.py

Lines changed: 3 additions & 2 deletions
@@ -27,7 +27,7 @@
 from dascore.io.ap_sensing import APSensingV10
 from dascore.io.dasdae import DASDAEV1
 from dascore.io.dashdf5 import DASHDF5
-from dascore.io.febus import Febus2
+from dascore.io.febus import Febus1, Febus2
 from dascore.io.gdr import GDR_V1
 from dascore.io.h5simple import H5Simple
 from dascore.io.neubrex import NeubrexDASV1, NeubrexRFSV1
@@ -70,6 +70,7 @@
     H5Simple(): ("h5_simple_2.h5", "h5_simple_1.h5"),
     APSensingV10(): ("ap_sensing_1.hdf5",),
     Febus2(): ("febus_1.h5",),
+    Febus1(): ("valencia_febus_example.h5",),
     OptoDASV8(): ("opto_das_1.hdf5",),
     DASDAEV1(): ("example_dasdae_event_1.h5",),
     TDMSFormatterV4713(): ("sample_tdms_file_v4713.tdms",),
@@ -253,7 +254,7 @@ def test_all_other_files_arent_format(self, io_instance):
         for other_io, data_files in COMMON_IO_READ_TESTS.items():
             if isinstance(other_io, type(io_instance)):
                 continue
-            for key in data_files:
+            for key in iterate(data_files):
                 with skip_timeout():
                     path = fetch(key)
                     out = io_instance.get_format(path)
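
Context for the iterate change: if a registry value were a lone string rather than a tuple, looping over it directly would yield single characters. The dascore.utils.misc.iterate helper presumably normalizes that; a hypothetical sketch of the behavior relied on here:

    def iterate_sketch(obj):
        """Return obj as a tuple, treating a lone string as one item."""
        if isinstance(obj, str):
            return (obj,)
        return tuple(obj)

    assert iterate_sketch("febus_1.h5") == ("febus_1.h5",)
    assert iterate_sketch(("a.h5", "b.h5")) == ("a.h5", "b.h5")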

tests/test_utils/test_misc.py

Lines changed: 39 additions & 0 deletions
@@ -24,7 +24,9 @@
     maybe_get_items,
     maybe_mem_map,
     optional_import,
+    suppress_warnings,
     to_object_array,
+    tukey_fence,
     warn_or_raise,
 )

@@ -256,6 +258,43 @@ def test_3_point_2nd_derivative(self):
         assert np.allclose(out, expected)


+class TestTukeyFence:
+    """Tests for Tukey fence outlier detection."""
+
+    def test_constant_data(self):
+        """Constant data should return the same value for both bounds."""
+        data = np.array([5.0, 5.0, 5.0, 5.0])
+        result = tukey_fence(data)
+        assert np.allclose(result, [5.0, 5.0])
+
+    def test_simple_range(self):
+        """Simple range should calculate fences correctly."""
+        data = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
+        result = tukey_fence(data)
+        # Q1=2, Q3=4, IQR=2, fences: 2-1.5*2=-1, 4+1.5*2=7
+        # But clamped to data range [1, 5]
+        assert np.allclose(result, [1.0, 5.0])
+
+    def test_all_nan(self):
+        """All-NaN data should return NaN bounds."""
+        data = np.array([np.nan, np.nan, np.nan])
+        with suppress_warnings():
+            result = tukey_fence(data)
+        assert np.isnan(result[0])
+        assert np.isnan(result[1])
+
+    def test_data_with_outliers(self):
+        """Data with outliers should exclude them from range."""
+        # Core data: 10-20, with outliers at 0 and 100
+        data = np.array([0.0, 10.0, 12.0, 15.0, 18.0, 20.0, 100.0])
+        result = tukey_fence(data)
+        # Q1=11, Q3=19, IQR=8, fences: 11-1.5*8=-1, 19+1.5*8=31
+        # Clamped: [0, 31], but 100 is excluded
+        expected_lower = max(11 - 1.5 * 8, 0.0)
+        expected_upper = min(19 + 1.5 * 8, 100.0)
+        assert np.allclose(result, [expected_lower, expected_upper])
+
+
 class TestCachedMethod:
     """Ensure cached methods caches method calls (duh)."""