Skip to content

Commit 2d91da6

Browse files
authored
Adding shape validation for add_matrix_to_collection (#4258)
* Adding shape validation for add_matrix_to_collection * Adding test for add_X_layer * Add context to the validation function * Schema validation optional default False * PR fixes * PR fixes vol 2 * Adding all versions in tests * Adding full docstrings * Add entry in HISTORY.md * Minor fixes for PR * Opening internal var dataframe in read mode when writing & text fix * PR fixes - timestamp, closed property * Add unit tests for previous versions * PR comments * PR comments * Add warning instead of log msg * Rebase with main * Final misc * Match argument to all tests
1 parent e21bff2 commit 2d91da6

4 files changed

Lines changed: 261 additions & 5 deletions

File tree

apis/python/HISTORY.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
1010

1111
### Changed
1212

13+
- \[[#4258](https://github.com/single-cell-data/TileDB-SOMA/pull/4258)\] Add optional schema validation to `tiledbsoma.io.add_X_layer`, which will generate an error if the provided matrix does not match the Experiment shape. Validation is enabled by default for any dataset created with SOMA 1.15 or later.
14+
1315
### Deprecated
1416

1517
### Removed

apis/python/remote_tests/test_02_analysis.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,8 +79,11 @@ def test_write_with_updates(conftest_context, conftest_namespace, conftest_defau
7979
assert sorted(list(exp.ms["RNA"].X.keys())) == ["data", "logcounts"]
8080

8181
# Add dimensional-reduction results
82+
# identify highly variable genes but don't subset the data
83+
# https://github.com/single-cell-data/TileDB-SOMA/pull/4258#discussion_r2417729246
8284
sc.pp.highly_variable_genes(adata, inplace=True)
83-
adata = adata[:, adata.var.highly_variable]
85+
86+
# Run PCA on the full dataset, scanpy handles HVG selection
8487
sc.pp.scale(adata)
8588
sc.tl.pca(adata, use_highly_variable=True, n_comps=5)
8689

apis/python/src/tiledbsoma/io/ingest.py

Lines changed: 143 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1654,6 +1654,90 @@ def update_matrix(
16541654
)
16551655

16561656

1657+
def _validate_matrix_to_collection(
    exp: Experiment,
    meas: Measurement,
    collection_name: str,
    matrix_name: str,
    matrix_data: Matrix | h5py.Dataset,
) -> None:
    """Validates a matrix against the Experiment shape, as defined by the axis dataframes.

    Skips validation for pre-1.15 schemas until this issue is resolved:
    https://github.com/single-cell-data/TileDB-SOMA/pull/4258#discussion_r2429676202

    Args:
        exp: The experiment object.
        meas: The already-opened measurement object.
        collection_name: Name of the collection ("X", "obsm", "varm", "obsp", or "varp").
        matrix_name: Name of the matrix being added.
        matrix_data: The matrix data to validate.

    Raises:
        SOMAError: If the matrix shape is incompatible with the Experiment shape.
    """
    # Pre-1.15 arrays have no upgraded domain, so the joinid shape is not a
    # reliable target; warn and skip rather than reject valid data.
    if not exp.obs.tiledbsoma_has_upgraded_domain:
        warnings.warn(
            f"Skipped validation for arrays with pre-1.15 schema {matrix_name}",
            stacklevel=2,
        )
        return

    obs_soma_joinid_domain = exp.obs._maybe_soma_joinid_shape

    # The measurement's var DataFrame may already be closed; reopen it
    # read-only at the measurement's timestamp in that case.
    if meas.var.closed:
        with DataFrame.open(
            meas.var.uri, "r", context=meas.context, tiledb_timestamp=meas.tiledb_timestamp
        ) as var_df:
            var_soma_joinid_domain = var_df._maybe_soma_joinid_shape
    else:
        var_soma_joinid_domain = meas.var._maybe_soma_joinid_shape

    target_obs_size = obs_soma_joinid_domain  # O: Observation (row) size
    target_var_size = var_soma_joinid_domain  # V: Variable (column) size
    target_shape = (target_obs_size, target_var_size)

    # Dimensionality must agree before any per-axis comparison makes sense.
    if len(matrix_data.shape) != len(target_shape):
        raise SOMAError(
            f"Matrix '{matrix_name}' has {len(matrix_data.shape)} dimensions, "
            f"but the existing matrix has {len(target_shape)}. Shapes must match."
        )

    if collection_name == "X":
        # X: (O, V) - Must match both dimensions exactly
        if matrix_data.shape != target_shape:
            raise SOMAError(
                f"Matrix 'X' shape {matrix_data.shape} is incompatible with target shape {target_shape}. "
                "Both dimensions must match exactly."
            )
    elif collection_name == "obsm":
        # obsm: (O, *) - first dimension must match the observation size
        if matrix_data.shape[0] != target_obs_size:
            raise SOMAError(
                f"Matrix '{matrix_name}' must match the observation size (O). "
                f"Found {matrix_data.shape[0]}, expected {target_obs_size}."
            )
    elif collection_name == "varm":
        # varm: (V, *) - first dimension must match the variable size
        if matrix_data.shape[0] != target_var_size:
            raise SOMAError(
                f"Matrix '{matrix_name}' must match the variable size (V). "
                f"Found {matrix_data.shape[0]}, expected {target_var_size}."
            )
    elif collection_name == "obsp":
        # obsp: (O, O) - pairwise over observations
        expected_shape = (target_obs_size, target_obs_size)
        if matrix_data.shape[0:2] != expected_shape:
            raise SOMAError(
                f"Matrix '{matrix_name}' must match the observation size. "
                f"Found {matrix_data.shape[0:2]}, expected {expected_shape}."
            )
    elif collection_name == "varp":
        # varp: (V, V) - pairwise over variables
        expected_shape = (target_var_size, target_var_size)
        if matrix_data.shape[0:2] != expected_shape:
            # Bug fix: this branch checks the variable size, not the
            # observation size; the message previously said "observation size (O)".
            raise SOMAError(
                f"Matrix '{matrix_name}' must match the variable size (V). "
                f"Found {matrix_data.shape[0:2]}, expected {expected_shape}."
            )
16571741
def add_X_layer(
16581742
exp: Experiment,
16591743
measurement_name: str,
@@ -1663,11 +1747,34 @@ def add_X_layer(
16631747
ingest_mode: IngestMode = "write",
16641748
use_relative_uri: bool | None = None,
16651749
context: SOMATileDBContext | None = None,
1750+
schema_validation: bool = True,
16661751
) -> None:
1667-
"""This is useful for adding X data, for example from
1752+
"""Add a new X layer to a measurement in the experiment.
1753+
1754+
This is useful for adding X data, for example from
16681755
`Scanpy <https://scanpy.readthedocs.io/>`_'s ``scanpy.pp.normalize_total``,
16691756
``scanpy.pp.log1p``, etc.
16701757
1758+
Args:
1759+
exp: The experiment to add the X layer to. Must be open for writing.
1760+
measurement_name: Name of the measurement within the experiment.
1761+
X_layer_name: Name for the new X layer (e.g., "normalized", "log1p").
1762+
X_layer_data: Matrix data to store in the X layer. Can be a sparse matrix
1763+
(e.g., scipy.csr_matrix) or an h5py.Dataset.
1764+
ingest_mode: Mode for ingestion. Defaults to "write".
1765+
use_relative_uri: Whether to use relative URIs when storing references.
1766+
If None, the default behavior is used.
1767+
context: TileDB context for the operation. If None, uses the default context.
1768+
schema_validation: Whether to validate the matrix schema before adding.
1769+
Defaults to True.
1770+
1771+
Returns:
1772+
None
1773+
1774+
Raises:
1775+
ValueError: If the experiment is not open for writing or if schema
1776+
validation fails.
1777+
16711778
Lifecycle:
16721779
Maturing.
16731780
"""
@@ -1682,6 +1789,7 @@ def add_X_layer(
16821789
ingest_mode=ingest_mode,
16831790
use_relative_uri=use_relative_uri,
16841791
context=context,
1792+
schema_validation=schema_validation,
16851793
)
16861794

16871795

@@ -1695,10 +1803,39 @@ def add_matrix_to_collection(
16951803
ingest_mode: IngestMode = "write",
16961804
use_relative_uri: bool | None = None,
16971805
context: SOMATileDBContext | None = None,
1806+
schema_validation: bool = True,
16981807
) -> None:
1699-
"""This is useful for adding X/obsp/varm/etc data, for example from
1808+
"""Add a matrix to a specified collection within a measurement.
1809+
1810+
This is useful for adding X/obsp/varm/etc data, for example from
17001811
Scanpy's ``scanpy.pp.normalize_total``, ``scanpy.pp.log1p``, etc.
17011812
1813+
Args:
1814+
exp: The experiment to add the matrix to. Must be open for writing.
1815+
measurement_name: Name of the measurement within the experiment.
1816+
collection_name: Name of the collection to add the matrix to
1817+
(e.g., "X", "obsp", "varm").
1818+
matrix_name: Name for the new matrix within the collection.
1819+
matrix_data: Matrix data to store. Can be a sparse matrix
1820+
(e.g., scipy.csr_matrix) or an h5py.Dataset.
1821+
ingest_mode: Mode for ingestion. Defaults to "write".
1822+
use_relative_uri: Whether to use relative URIs when storing references.
1823+
If None, the default behavior is used.
1824+
context: TileDB context for the operation. If None, uses the default context.
1825+
schema_validation: Whether to validate the matrix schema before adding.
1826+
Defaults to True.
1827+
1828+
Returns:
1829+
None
1830+
1831+
Raises:
1832+
ValueError: If schema validation fails or if the collection structure
1833+
is invalid.
1834+
1835+
Notes:
1836+
- If the collection doesn't exist, it will be created automatically.
1837+
- The matrix is stored as a SparseNDArray with identity axis mappings.
1838+
17021839
Lifecycle:
17031840
Maturing.
17041841
"""
@@ -1711,13 +1848,15 @@ def add_matrix_to_collection(
17111848
# tiledb://namespace/uuid. When the caller passes a creation URI (which
17121849
# they must) via exp.uri, we need to follow that.
17131850
extend_creation_uri = exp.uri.startswith("tiledb://")
1714-
17151851
with exp.ms[measurement_name] as meas:
17161852
if extend_creation_uri:
17171853
coll_uri = f"{exp.uri}/ms/{_util.sanitize_key(measurement_name)}/{_util.sanitize_key(collection_name)}"
17181854
else:
17191855
coll_uri = _util.uri_joinpath(meas.uri, _util.sanitize_key(collection_name))
17201856

1857+
if schema_validation:
1858+
_validate_matrix_to_collection(exp, meas, collection_name, matrix_name, matrix_data)
1859+
17211860
if collection_name in meas:
17221861
coll = cast("Collection[RawHandle]", meas[collection_name])
17231862
else:
@@ -1728,6 +1867,7 @@ def add_matrix_to_collection(
17281867
context=context,
17291868
)
17301869
_maybe_set(meas, collection_name, coll, use_relative_uri=use_relative_uri)
1870+
17311871
with coll:
17321872
matrix_uri = _util.uri_joinpath(coll_uri, _util.sanitize_key(matrix_name))
17331873

apis/python/tests/test_basic_anndata_io.py

Lines changed: 112 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import gc
55
import json
66
import random
7+
import shutil
78
from copy import deepcopy
89
from pathlib import Path
910

@@ -15,6 +16,7 @@
1516
import scipy
1617
import somacore
1718
from anndata import AnnData
19+
from packaging.version import Version
1820
from scipy.sparse import csr_matrix
1921

2022
import tiledbsoma
@@ -23,7 +25,7 @@
2325
from tiledbsoma._soma_object import SOMAObject
2426
from tiledbsoma.io._common import _TILEDBSOMA_TYPE, UnsDict, UnsMapping
2527

26-
from ._util import TESTDATA, assert_adata_equal, make_pd_df
28+
from ._util import ROOT_DATA_DIR, TESTDATA, assert_adata_equal, make_pd_df
2729

2830

2931
@pytest.fixture
@@ -472,6 +474,115 @@ def test_add_matrix_to_collection(conftest_pbmc_small, tmp_path):
472474
assert sorted(list(exp_r.ms["RNA"]["newthing"].keys())) == ["X_pcd"]
473475

474476

477+
@pytest.mark.parametrize(
    "matrix_type, offset",
    [
        ("obsm", (1, 0)),
        ("obsm", (-1, 0)),
        ("obsm", (0, 0)),  # Should succeed without raising errors
        ("varm", (1, 0)),
        ("varm", (-1, 0)),
        ("varm", (0, 0)),
        ("obsp", (1, 0)),
        ("obsp", (0, 1)),
        ("obsp", (-1, 0)),
        ("obsp", (0, 0)),
        ("varp", (1, 0)),
        ("varp", (0, 1)),
        ("varp", (-1, 0)),
        ("varp", (0, 0)),
    ],
)
# Bug fix: "1.14.5" appeared twice in this list, which made pytest run the
# same parameter combination twice.
@pytest.mark.parametrize("version", ["1.7.3", "1.12.3", "1.14.5", "1.15.0", "1.15.7", "1.16.1"])
@pytest.mark.parametrize(
    "name_and_expected_shape",
    [["pbmc3k_unprocessed", (2700, 13714)], ["pbmc3k_processed", (2638, 1838)]],
)
@pytest.mark.medium_runner
def test_matrix_shape_enforcement(soma_tiledb_context, version, name_and_expected_shape, tmp_path, matrix_type, offset):
    """Test shape validation for all matrix collection types (obsm/varm/obsp/varp)."""
    output_path = tmp_path.as_posix()
    name, expected_shape = name_and_expected_shape
    original_path = ROOT_DATA_DIR / "soma-experiment-versions-2025-04-04" / version / name
    uri = str(original_path)
    if not Path(uri).is_dir():
        raise RuntimeError(
            f"Missing '{uri}' directory. Try running `make data` from the TileDB-SOMA project root directory.",
        )

    # Work on a copy so the shared versioned fixture data is never modified.
    shutil.copytree(uri, output_path, dirs_exist_ok=True)
    n_obs = expected_shape[0]
    n_vars = expected_shape[1]

    # Determine the valid target shape for each matrix type; an offset of
    # (0, 0) keeps the shape valid, any other offset makes it invalid.
    shape_map = {
        "obsm": (n_obs, 5),
        "varm": (n_vars, 5),
        "obsp": (n_obs, n_obs),
        "varp": (n_vars, n_vars),
    }
    base_shape = shape_map[matrix_type]
    # Clamp to at least 1 so negative offsets still yield a legal array shape.
    bad_shape = (max(1, base_shape[0] + offset[0]), max(1, base_shape[1] + offset[1]))

    # Create the (possibly invalid) matrix: pairwise collections use sparse.
    bad_matrix = csr_matrix((bad_shape[0], bad_shape[1])) if matrix_type in ["obsp", "varp"] else np.ones(bad_shape)

    if Version(version) >= Version("1.15"):
        if offset != (0, 0):
            # Should raise error of invalid shape for observation/variable size
            with (
                _factory.open(output_path, "w") as exp,
                pytest.raises(tiledbsoma.SOMAError, match=r"Matrix .* must match the .* size"),
            ):
                tiledbsoma.io.add_matrix_to_collection(exp, "RNA", matrix_type, "bad", bad_matrix)
        else:
            # Should pass the validation without errors
            with _factory.open(output_path, "w") as exp:
                tiledbsoma.io.add_matrix_to_collection(exp, "RNA", matrix_type, "bad", bad_matrix)
    else:
        # pre-1.15 versions are not being checked for schema validation.
        if offset == (0, 0):
            with _factory.open(output_path, "w") as exp:
                tiledbsoma.io.add_matrix_to_collection(exp, "RNA", matrix_type, "bad", bad_matrix)
        else:
            # TODO: _maybe_soma_joinid_shape returns max-domain for pre-1.14 instead of None
            # https://github.com/single-cell-data/TileDB-SOMA/pull/4258#discussion_r2429676202
            pass
553+
@pytest.mark.parametrize(
    "offset",
    [
        (1, 0),
        (0, 1),
        (1, 1),
    ],
)
def test_matrix_X_layer_enforcement(conftest_pbmc_small, tmp_path, offset):
    """Test shape validation when adding an X layer with a mismatched shape."""
    output_path = tmp_path.as_posix()
    original = conftest_pbmc_small.copy()

    # Create SOMA experiment and verify ingestion did not mutate the fixture.
    tiledbsoma.io.from_anndata(output_path, conftest_pbmc_small, measurement_name="RNA")
    assert_adata_equal(original, conftest_pbmc_small)

    # The valid X shape is (n_obs, n_vars); every parametrized offset perturbs it.
    base_shape = (conftest_pbmc_small.n_obs, conftest_pbmc_small.n_vars)
    bad_shape = (max(1, base_shape[0] + offset[0]), max(1, base_shape[1] + offset[1]))

    # Create invalid matrix
    bad_matrix = np.ones(bad_shape)

    # Should raise error for invalid shape.
    # Bug fix: the match pattern previously ended in "target shape *." (a typo
    # for ".*"), which only matched by accident; ".*" matches the reported shape.
    with (
        _factory.open(output_path, "w") as exp,
        pytest.raises(tiledbsoma.SOMAError, match=r"Matrix .* shape .* is incompatible with target shape .*"),
    ):
        tiledbsoma.io.add_X_layer(exp, "RNA", "bad", bad_matrix, schema_validation=True)
475586
def test_export_anndata(conftest_pbmc_small, tmp_path):
476587
output_path = tmp_path.as_posix()
477588

0 commit comments

Comments
 (0)