Problem
There is a memory leak when writing SOMA sparse arrays (and possibly other array types). The Arrow table passed to write may be what is leaked. Test code below.
Note:
- Issue only occurs in tiledbsoma 1.13 or later
- Workaround is to pin tiledbsoma to <1.13, e.g.:
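  pip install "tiledbsoma<1.13"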
The leak is large enough to prevent copying any large-ish array. For example, reading and rewriting a Census X layer (with a new schema) OOMs quickly on a 128 GiB host.
Reported by @bkmartinjr.
[sc-53264]
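One way to check whether the leaked memory is the Arrow table itself is to watch pyarrow's memory pool alongside process RSS while the copy loop runs: if RSS climbs but the Arrow pool stays flat, the memory is being retained on the native side rather than by Python-visible Arrow objects. A minimal sampling sketch, assuming psutil is installed (sample_memory is a hypothetical helper, not part of tiledbsoma):

import os

import psutil
import pyarrow as pa


def sample_memory(label: str) -> None:
    # Process-wide resident set size, as reported by the OS.
    rss_gib = psutil.Process(os.getpid()).memory_info().rss / 1024**3
    # Bytes currently held by pyarrow's default memory pool.
    pool_gib = pa.total_allocated_bytes() / 1024**3
    print(f"{label}: rss={rss_gib:.2f} GiB arrow_pool={pool_gib:.2f} GiB")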
Repro
Test code (will create a test_array in the current working directory):
from __future__ import annotations

import gc
import sys

import tiledbsoma as soma


def copy(from_uri, to_uri, context):
    with soma.SparseNDArray.open(from_uri, mode="r", context=context) as X_from:
        with soma.SparseNDArray.open(to_uri, mode="w", context=context) as X_to:
            for i, tbl in enumerate(X_from.read().tables()):
                print(f"Read {len(tbl)}")
                X_to.write(tbl)
                # Dropping the table and forcing GC does not release the memory.
                del tbl
                gc.collect()
                if i == 10:  # OOMs w/o this break
                    break


def create(src_uri, dst_uri, context):
    with soma.open(src_uri, mode="r", context=context) as X:
        n_obs, n_var = X.shape
        value_type = X.schema.field("soma_data").type

        a = soma.SparseNDArray.create(
            dst_uri,
            type=value_type,
            shape=(n_obs, n_var),
            platform_config={
                "tiledb": {
                    "create": {
                        "capacity": 2**16,
                        "dims": {
                            "soma_dim_0": {
                                "tile": 64,
                                "filters": [{"_type": "ZstdFilter", "level": 9}],
                            },
                            "soma_dim_1": {
                                "tile": 2048,
                                "filters": [
                                    "ByteShuffleFilter",
                                    {"_type": "ZstdFilter", "level": 9},
                                ],
                            },
                        },
                        "attrs": {
                            "soma_data": {
                                "filters": [
                                    "ByteShuffleFilter",
                                    {"_type": "ZstdFilter", "level": 9},
                                ]
                            }
                        },
                        "cell_order": "row-major",
                        "tile_order": "row-major",
                        "allows_duplicates": True,
                    },
                }
            },
            context=context,
        )
        a.close()
        print(f"Array created at {dst_uri}")


def main():
    src_uri = "s3://cellxgene-census-public-us-west-2/cell-census/2024-07-01/soma/census_data/homo_sapiens/ms/RNA/X/raw/"
    dst_uri = "./test_array/"
    context = soma.SOMATileDBContext(
        tiledb_config={
            "vfs.s3.region": "us-west-2",
            "soma.init_buffer_bytes": 1 * 1024**3,  # 1 GiB SOMA read buffer
        }
    )
    create(src_uri, dst_uri, context=context)
    copy(src_uri, dst_uri, context=context)


if __name__ == "__main__":
    sys.exit(main())
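To make the growth visible per iteration, the copy loop above can be instrumented with the sample_memory helper sketched in the Problem section (again a sketch, not part of the repro as reported):

def copy_instrumented(from_uri, to_uri, context):
    with soma.SparseNDArray.open(from_uri, mode="r", context=context) as X_from:
        with soma.SparseNDArray.open(to_uri, mode="w", context=context) as X_to:
            for i, tbl in enumerate(X_from.read().tables()):
                X_to.write(tbl)
                del tbl
                gc.collect()
                # With the leak present, RSS should climb on every pass.
                sample_memory(f"iteration {i}")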