Skip to content

[c++/python] Memory leak in Python sparse-array write #2928

@johnkerl

Description

@johnkerl

Problem

There is a memory leak when writing SOMA sparse arrays (maybe other types). Perhaps the Arrow table being written is leaked. Test code below.

Note:

  • Issue only occurs in 1.13 or later
  • Workaround is to pin to <1.13

The leak is large enough that it prevents copying any large-ish array. For example, reading and rewriting a Census X layer (with a new schema) OOMs quickly on a 128GiB host.

Reported by @bkmartinjr.

[sc-53264]

Repro

Test code (will create a test_array in the current working directory):

from __future__ import annotations

import gc
import sys

import tiledbsoma as soma


def copy(from_uri, to_uri, context):
    """Copy the first tables of one sparse array into another.

    Opens ``from_uri`` for reading and ``to_uri`` for writing (both with
    ``context``), then writes each table yielded by ``read().tables()`` to
    the destination, printing its length first. Processing stops after the
    table at index 10, i.e. at most eleven tables are written.
    """
    last_index = 10  # the run OOMs without this cap
    with soma.SparseNDArray.open(from_uri, mode="r", context=context) as source:
        with soma.SparseNDArray.open(to_uri, mode="w", context=context) as sink:
            for index, batch in enumerate(source.read().tables()):
                print(f"Read {len(batch)}")
                sink.write(batch)

                # Drop our reference and force a collection so that any
                # remaining memory growth cannot be blamed on this loop.
                del batch
                gc.collect()

                if index >= last_index:
                    break


def create(src_uri, dst_uri, context):
    """Create an empty sparse array at ``dst_uri`` modelled on ``src_uri``.

    The shape and the ``soma_data`` field type are read from the array at
    ``src_uri``; the new array is created with those and with the explicit
    TileDB creation options below, then closed immediately.

    Args:
        src_uri: URI of the existing array to take shape and data type from.
        dst_uri: URI at which the new sparse array is created.
        context: SOMA context passed to both the open and the create call.
    """
    with soma.open(src_uri, mode="r", context=context) as X:
        n_obs, n_var = X.shape
        # Named ``data_type`` so the builtin ``type`` is not shadowed.
        data_type = X.schema.field("soma_data").type

    a = soma.SparseNDArray.create(
        dst_uri,
        type=data_type,
        shape=(n_obs, n_var),
        platform_config={
            "tiledb": {
                "create": {
                    "capacity": 2**16,
                    "dims": {
                        "soma_dim_0": {
                            "tile": 64,
                            "filters": [{"_type": "ZstdFilter", "level": 9}],
                        },
                        "soma_dim_1": {
                            "tile": 2048,
                            "filters": [
                                "ByteShuffleFilter",
                                {"_type": "ZstdFilter", "level": 9},
                            ],
                        },
                    },
                    "attrs": {
                        "soma_data": {
                            "filters": [
                                "ByteShuffleFilter",
                                {"_type": "ZstdFilter", "level": 9},
                            ]
                        }
                    },
                    "cell_order": "row-major",
                    "tile_order": "row-major",
                    "allows_duplicates": True,
                },
            }
        },
        context=context,
    )
    # Only the empty array is needed here; the copy step reopens it for writing.
    a.close()
    print(f"Array created at {dst_uri}")


def main():
    """Run the repro: create ``./test_array/`` and copy Census data into it.

    Builds a SOMA context for the ``us-west-2`` S3 region with a 1 GiB
    ``soma.init_buffer_bytes`` setting, creates the destination array from
    the source array's schema, then copies tables from source to destination.
    """
    tiledb_config = {
        "vfs.s3.region": "us-west-2",
        "soma.init_buffer_bytes": 1 * 1024**3,
    }
    context = soma.SOMATileDBContext(tiledb_config=tiledb_config)

    source = "s3://cellxgene-census-public-us-west-2/cell-census/2024-07-01/soma/census_data/homo_sapiens/ms/RNA/X/raw/"
    destination = "./test_array/"

    create(source, destination, context=context)
    copy(source, destination, context=context)


# Run the repro only when executed as a script. main() has no return
# statement, so sys.exit() receives None and the process exits with status 0.
if __name__ == "__main__":
    sys.exit(main())

Metadata

Metadata

Labels

bug: Something isn't working

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions