
Commit c7473a6

feat: add support for validation of polars dataframe and lazyframe (snakemake#3262)
Implement validation for polars DataFrame and LazyFrame. Refactor the setting of default values.

### QC

* [x] The PR contains a test case for the changes or the changes are already covered by an existing test case.
* [x] The documentation (`docs/`) is updated to reflect the changes or this is not necessary (e.g. if the change neither modifies the language nor the behavior or functionalities of Snakemake).

## Summary by CodeRabbit

- **New Features**
  - Enhanced sample data validation now supports both Pandas and Polars data frames for improved reliability and performance.
  - Introduced new methods for reading sample data using Polars, expanding data handling options.
  - Added support for executing Xonsh scripts within Snakemake workflows.
  - New rule added for running Python scripts with Conda environments.
  - New functionality for generating self-contained HTML reports, including default statistics and user-specified results.
  - New functions added for parsing input files and extracting checksums.
- **Bug Fixes**
  - Improved error handling for validation failures, providing more specific error messages.
- **Documentation**
  - Updated the sample metadata schema with new fields for replicate count and tissue origin, alongside a refined description for sample condition.
  - Clarified usage of conda environments and apptainer integration within Snakemake workflows.
  - Expanded guidance on generating, customizing, and sharing reports in Snakemake.
  - Added documentation for integrating Xonsh scripts into Snakemake rules.
  - Updated help text for the `--keep-storage-local-copies` argument to enhance clarity and usability.
1 parent e6023c8 commit c7473a6

File tree

5 files changed, +162 −27 lines


docs/snakefiles/configuration.rst

Lines changed: 2 additions & 2 deletions

```diff
@@ -112,8 +112,8 @@ Instead, for data provenance and reproducibility reasons, you are required to pa
 Validation
 ----------
 
-With Snakemake 5.1, it is possible to validate both types of configuration via `JSON schemas <https://json-schema.org>`_.
-The function ``snakemake.utils.validate`` takes a loaded configuration (a config dictionary or a Pandas data frame) and validates it with a given JSON schema.
+With Snakemake 5.1, it is possible to validate both types of configuration (standard and tabular) via `JSON schemas <https://json-schema.org>`_.
+The function ``snakemake.utils.validate`` takes a loaded configuration (a config dictionary, a Pandas DataFrame, Polars DataFrame or Polars LazyFrame) and validates it with a given JSON schema.
 Thereby, the schema can be provided in JSON or YAML format. Also, by using the defaults property it is possible to populate entries with default values. See `jsonschema FAQ on setting default values <https://python-jsonschema.readthedocs.io/en/latest/faq/>`_ for details.
 In case of the data frame, the schema should model the record that is expected in each row of the data frame.
 In the following example,
```
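The docs above describe per-record validation: each row becomes a dict record, nulls are dropped, required keys are checked, and schema defaults fill the gaps. A minimal plain-Python sketch of that idea (the `schema` dict and `validate_record` helper are hypothetical stand-ins, not the Snakemake API, which uses `jsonschema` and the data-frame libraries):

```python
# Hand-rolled stand-in for JSON-schema validation with defaults.
# `schema` and `validate_record` are illustrative only.
schema = {
    "required": ["sample"],
    "defaults": {"n": 0, "tissue": "blood"},
}


def validate_record(record, schema, set_default=True):
    # Check required keys, mirroring a JSON-schema "required" clause
    for key in schema["required"]:
        if key not in record:
            raise ValueError(f"missing required field: {key}")
    # Populate entries with default values, as the "default" property does
    if set_default:
        for key, value in schema["defaults"].items():
            record.setdefault(key, value)
    return record


rows = [
    {"sample": "A", "condition": "case", "n": 1},
    {"sample": "B", "condition": "control"},  # "n" was null and was dropped
]
validated = [validate_record(dict(r), schema) for r in rows]
```

The row with a missing `n` ends up with the schema default, while explicit values are left untouched.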

snakemake/utils.py

Lines changed: 103 additions & 20 deletions

```diff
@@ -108,43 +108,126 @@ def set_defaults(validator, properties, instance, schema):
         logger.warning("Note that schema file may not be validated correctly.")
     DefaultValidator = extend_with_default(Validator)
 
-    if not isinstance(data, dict):
+    def _validate_record(record):
+        if set_default:
+            DefaultValidator(schema, resolver=resolver).validate(record)
+            return record
+        else:
+            jsonschema.validate(record, schema, resolver=resolver)
+
+    def _validate_pandas(data):
         try:
             import pandas as pd
 
-            recordlist = []
             if isinstance(data, pd.DataFrame):
+                logger.debug("Validating pandas DataFrame")
+
+                recordlist = []
                 for i, record in enumerate(data.to_dict("records")):
-                    record = {k: v for k, v in record.items() if not pd.isnull(v)}
+                    # Exclude NULL values
+                    record = {k: v for k, v in record.items() if pd.notnull(v)}
                     try:
-                        if set_default:
-                            DefaultValidator(schema, resolver=resolver).validate(record)
-                            recordlist.append(record)
-                        else:
-                            jsonschema.validate(record, schema, resolver=resolver)
+                        recordlist.append(_validate_record(record))
                     except jsonschema.exceptions.ValidationError as e:
                         raise WorkflowError(
                             f"Error validating row {i} of data frame.", e
                         )
+
                 if set_default:
                     newdata = pd.DataFrame(recordlist, data.index)
-                    newcol = ~newdata.columns.isin(data.columns)
-                    n = len(data.columns)
-                    for col in newdata.loc[:, newcol].columns:
-                        data.insert(n, col, newdata.loc[:, col])
-                        n = n + 1
-                return
+                    # Add missing columns
+                    newcol = newdata.columns[~newdata.columns.isin(data.columns)]
+                    data[newcol] = None
+                    # Fill in None values with values from newdata
+                    data.update(newdata)
+
+            else:
+                return False
         except ImportError:
-            pass
-        raise WorkflowError("Unsupported data type for validation.")
-    else:
+            return False
+        return True
+
+    def _validate_polars(data):
         try:
-            if set_default:
-                DefaultValidator(schema, resolver=resolver).validate(data)
+            import polars as pl
+
+            if isinstance(data, pl.DataFrame):
+                logger.debug("Validating polars DataFrame")
+
+                recordlist = []
+                for i, record in enumerate(data.iter_rows(named=True)):
+                    # Exclude NULL values
+                    record = {
+                        k: v
+                        for k, v in record.items()
+                        if pl.Series(k, [v]).is_not_null().all()
+                    }
+                    try:
+                        recordlist.append(_validate_record(record))
+                    except jsonschema.exceptions.ValidationError as e:
+                        raise WorkflowError(
+                            f"Error validating row {i} of data frame.", e
+                        )
+
+                if set_default:
+                    newdata = pl.DataFrame(recordlist)
+                    # Add missing columns
+                    newcol = [col for col in newdata.columns if col not in data.columns]
+                    [
+                        data.insert_column(
+                            len(data.columns),
+                            pl.lit(None, newdata[col].dtype).alias(col),
+                        )
+                        for col in newcol
+                    ]
+                    # Fill in None values with values from newdata
+                    for i in range(data.shape[0]):
+                        for j in range(data.shape[1]):
+                            if data[i, j] is None:
+                                data[i, j] = newdata[i, j]
+
+            elif isinstance(data, pl.LazyFrame):
+                # If a LazyFrame is being used, probably it is a large dataframe (so check only first 1000 records)
+                logger.debug("Validating first 1000 rows of polars LazyFrame")
+
+                recordlist = []
+                for i, record in enumerate(
+                    data.head(1000).collect().iter_rows(named=True)
+                ):
+                    # Exclude NULL values
+                    record = {
+                        k: v
+                        for k, v in record.items()
+                        if pl.Series(k, [v]).is_not_null().all()
+                    }
+                    try:
+                        recordlist.append(_validate_record(record))
+                    except jsonschema.exceptions.ValidationError as e:
+                        raise WorkflowError(
+                            f"Error validating row {i} of data frame.", e
+                        )
+
+                if set_default:
+                    logger.warning("LazyFrame does not support setting default values.")
+
             else:
-                jsonschema.validate(data, schema, resolver=resolver)
+                return False
+        except ImportError:
+            return False
+        return True
+
+    if isinstance(data, dict):
+        logger.debug("Validating dictionary")
+        try:
+            _validate_record(data)
         except jsonschema.exceptions.ValidationError as e:
            raise WorkflowError("Error validating config file.", e)
+        logger.debug("Dictionary validated!")
+    else:
+        if _validate_pandas(data):
+            logger.debug("Pandas dataframe validated!")
+        elif _validate_polars(data):
+            logger.debug("Polars dataframe validated!")
 
 
 def simplify_path(path):
```
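The pandas branch of `_validate_pandas` fills defaults by first adding missing columns as `None` and then updating null cells from the validated records. A rough pure-Python sketch of that merge, with lists of dicts standing in for the data frames (all names here are illustrative, not the Snakemake code):

```python
# `data` plays the original frame (rows as dicts); `newdata` plays the
# validated records with schema defaults applied. Missing columns are
# added as None, then None cells are filled from newdata, mirroring
# `data[newcol] = None` followed by `data.update(newdata)`.
data = [
    {"sample": "A", "condition": "case", "n": 1},
    {"sample": "B", "condition": "control", "n": None},
]
newdata = [
    {"sample": "A", "condition": "case", "n": 1, "tissue": "blood"},
    {"sample": "B", "condition": "control", "n": 0, "tissue": "blood"},
]

# Add missing columns as None
columns = list(data[0])
newcols = [c for c in newdata[0] if c not in columns]
for row in data:
    for c in newcols:
        row[c] = None

# Fill None cells from the validated records (in place, like DataFrame.update)
for row, newrow in zip(data, newdata):
    for c, v in row.items():
        if v is None:
            row[c] = newrow[c]
```

Updating in place keeps existing values (e.g. `n == 1` for sample A) and only fills the gaps, which matches the intent of the pandas `data.update(newdata)` call above.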

tests/test_validate/Snakefile

Lines changed: 45 additions & 1 deletion

```diff
@@ -1,16 +1,60 @@
 shell.executable("bash")
 
 import pandas as pd
+import polars as pl
 from snakemake.utils import validate
 
 
 configfile: "config.yaml"
 
 
-validate(config, "config.schema.yaml")
+# Dict
+df = pd.read_table(config["samples"])
+samples = df.iloc[0].to_dict()
+validate(samples, "samples.schema.yaml")
+assert samples["tissue"] == "blood"
+assert samples["n"] == 1
+samples = {k: v for k, v in df.iloc[1].to_dict().items() if pd.notnull(v)}
+validate(samples, "samples.schema.yaml")
+assert samples["tissue"] == "blood"
+assert samples["n"] == 0
+
+# Pandas DataFrame without index
+samples = pd.read_table(config["samples"])
+validate(samples, "samples.schema.yaml")
+assert samples.iloc[0]["tissue"] == "blood"
+assert samples.iloc[0]["n"] == 1
+assert samples.iloc[1]["n"] == 0
 
+# Polars DataFrame
+samples = pl.read_csv(
+    config["samples"],
+    separator="\t",
+    schema={"sample": pl.String, "condition": pl.String, "n": pl.UInt8},
+    null_values="NA",
+)
+validate(samples, "samples.schema.yaml")
+assert samples[0, "tissue"] == "blood"
+assert samples[0, "n"] == 1
+assert samples[1, "n"] == 0
+
+# Polars LazyFrame
+samples = pl.scan_csv(
+    config["samples"],
+    separator="\t",
+    schema={"sample": pl.String, "condition": pl.String, "n": pl.UInt8},
+    null_values="NA",
+)
+validate(samples, "samples.schema.yaml", set_default=False)
+assert samples.collect()[0, "n"] == 1
+
+# Pandas DataFrame with index
+validate(config, "config.schema.yaml")
 samples = pd.read_table(config["samples"]).set_index("sample", drop=False)
 validate(samples, "samples.schema.yaml")
+assert samples.iloc[0]["tissue"] == "blood"
+assert samples.iloc[0]["n"] == 1
+assert samples.iloc[1]["n"] == 0
 
 
 rule all:
```
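The LazyFrame branch in `snakemake/utils.py` validates only the first 1000 rows (`data.head(1000).collect()`) instead of materializing the whole frame, which is why the test above passes `set_default=False` and only asserts on collected values. A stdlib-only sketch of that bounded-sampling idea (the `lazy_rows` generator is a hypothetical stand-in for a lazy scan):

```python
from itertools import islice

# Stand-in for a lazy scan: rows are produced on demand, never all at once.
def lazy_rows(total):
    for i in range(total):
        yield {"sample": f"s{i}", "n": i}

# Check only a bounded prefix, mirroring head(1000).collect():
# the remaining rows are never generated, so memory stays bounded.
sample = list(islice(lazy_rows(1_000_000), 1000))
```

This trades completeness for speed: rows beyond the prefix are assumed to follow the same schema and are not validated.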

tests/test_validate/samples.schema.yaml

Lines changed: 9 additions & 1 deletion

```diff
@@ -6,7 +6,15 @@ properties:
     description: sample name/identifier
   condition:
     type: string
-    description: sample condition that will be compared during differential expression analysis (e.g. a treatment, a tissue time, a disease)
+    description: sample condition
+  n:
+    type: integer
+    default: 0
+    description: replicate count
+  tissue:
+    type: string
+    default: blood
+    description: sample tissue of origin
 
 required:
   - sample
```

tests/test_validate/samples.tsv

Lines changed: 3 additions & 3 deletions

```diff
@@ -1,3 +1,3 @@
-sample	condition
-A	tumor
-B	blood
+sample	condition	n
+A	case	1
+B	control	NA
```
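The updated TSV uses `NA` for a missing replicate count, which the Polars readers in the test map to null via `null_values="NA"`. A stdlib-only sketch of the same NA-to-null convention (the inline `tsv` string mirrors the file contents above):

```python
import csv
import io

# Same content as the updated samples.tsv
tsv = "sample\tcondition\tn\nA\tcase\t1\nB\tcontrol\tNA\n"

rows = []
for row in csv.DictReader(io.StringIO(tsv), delimiter="\t"):
    # Treat "NA" as null; parse "n" as an integer otherwise
    rows.append(
        {k: (None if v == "NA" else (int(v) if k == "n" else v))
         for k, v in row.items()}
    )
```

Leaving `n` null for sample B is what lets the schema's `default: 0` kick in during validation.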
