
Commit a7f3e0f

Merge f8b4026 into fa07442
2 parents fa07442 + f8b4026 commit a7f3e0f

4 files changed: +238, -128 lines

benchmarks/benchmarks/stats.py

Lines changed: 38 additions & 0 deletions

@@ -0,0 +1,38 @@
+# Copyright Iris contributors
+#
+# This file is part of Iris and is released under the BSD license.
+# See LICENSE in the root of the repository for full licensing details.
+"""Stats benchmark tests."""
+
+import iris
+from iris.analysis.stats import pearsonr
+import iris.tests
+
+
+class PearsonR:
+    def setup(self):
+        cube_temp = iris.load_cube(
+            iris.tests.get_data_path(
+                ("NetCDF", "global", "xyt", "SMALL_total_column_co2.nc")
+            )
+        )
+
+        # Make data non-lazy.
+        cube_temp.data
+
+        self.cube_a = cube_temp[:6]
+        self.cube_b = cube_temp[20:26]
+        self.cube_b.replace_coord(self.cube_a.coord("time"))
+        for name in ["latitude", "longitude"]:
+            self.cube_b.coord(name).guess_bounds()
+        self.weights = iris.analysis.cartography.area_weights(self.cube_b)
+
+    def time_real(self):
+        pearsonr(self.cube_a, self.cube_b, weights=self.weights)
+
+    def time_lazy(self):
+        for cube in self.cube_a, self.cube_b:
+            cube.data = cube.lazy_data()
+
+        result = pearsonr(self.cube_a, self.cube_b, weights=self.weights)
+        result.data
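This class follows the airspeed velocity (asv) convention used by the Iris benchmark suite: setup() runs before each timed time_* method, so only the pearsonr call itself is measured. A minimal sketch of driving the benchmark by hand, outside the asv harness; the import path is an assumption based on the file location, and the Iris test data must be installed:

# Hypothetical smoke run of the new benchmark (not part of the commit).
from benchmarks.stats import PearsonR  # assumed path: benchmarks/benchmarks/stats.py

bench = PearsonR()
bench.setup()      # load the sample cube, realise its data, build area weights
bench.time_real()  # the statement asv times for realised (in-memory) data
bench.time_lazy()  # the statement asv times for lazy (dask-backed) data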

docs/src/whatsnew/latest.rst

Lines changed: 3 additions & 0 deletions

@@ -104,6 +104,9 @@ This document explains the changes made to Iris for this release
    lazy data from file. This will also speed up coordinate comparison.
    (:pull:`5610`)
 
+#. `@rcomer`_ and `@trexfeathers`_ (reviewer) modified
+   :func:`~iris.analysis.stats.pearsonr` so it preserves lazy data in all cases
+   and also runs a little faster. (:pull:`5638`)
 
 🔥 Deprecations
 ===============

lib/iris/analysis/stats.py

Lines changed: 78 additions & 76 deletions
@@ -4,13 +4,16 @@
 # See LICENSE in the root of the repository for full licensing details.
 """Statistical operations between cubes."""
 
+import dask.array as da
 import numpy as np
-import numpy.ma as ma
 
 import iris
-from iris.util import broadcast_to_shape
+from iris.common import SERVICES, Resolve
+from iris.common.lenient import _lenient_client
+from iris.util import _mask_array
 
 
+@_lenient_client(services=SERVICES)
 def pearsonr(
     cube_a,
     cube_b,
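The new dask.array import supports the dispatch pattern used further down in this diff: bind either numpy or dask.array to a single alias (al in the new code) and write the statistics once, so one code path serves both real and lazy data. A standalone sketch of the idea; the function and variable names are illustrative, not part of the commit:

import dask.array as da
import numpy as np

def centre(array, is_lazy):
    # Bind the array module once; the arithmetic below is then identical
    # for in-memory (numpy) and lazy (dask-backed) masked arrays.
    al = da if is_lazy else np
    return array - al.ma.average(array, axis=0, keepdims=True)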
@@ -26,7 +29,8 @@ def pearsonr(
     cube_a, cube_b : cubes
         Cubes between which the correlation will be calculated. The cubes
         should either be the same shape and have the same dimension coordinates
-        or one cube should be broadcastable to the other.
+        or one cube should be broadcastable to the other. Broadcasting rules
+        are the same as those for cube arithmetic (see :ref:`cube maths`).
     corr_coords : str or list of str
         The cube coordinate name(s) over which to calculate correlations. If no
         names are provided then correlation will be calculated over all common
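The added docstring text means a smaller cube is now broadcast against a larger one just as in cube arithmetic. A hedged usage sketch; the file name and coordinate names are illustrative:

import iris
from iris.analysis.stats import pearsonr

cube_3d = iris.load_cube("air_temp_xyt.nc")  # time, latitude, longitude
cube_2d = cube_3d[0]                         # latitude, longitude only

# cube_2d is broadcast across the time dimension, as in cube maths;
# correlation is computed over the shared horizontal coordinates.
corr = pearsonr(cube_3d, cube_2d, corr_coords=["latitude", "longitude"])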
@@ -62,13 +66,13 @@
 
     Notes
     -----
+    If either of the input cubes has lazy data, the result will have lazy data.
+
     Reference:
         https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
 
-    This operation is non-lazy.
-
     """
-    # Assign larger cube to cube_1
+    # Assign larger cube to cube_1 for simplicity.
     if cube_b.ndim > cube_a.ndim:
         cube_1 = cube_b
         cube_2 = cube_a
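The new Notes entry is directly observable: feed pearsonr lazy cubes and the result stays lazy until its data is requested. A small sketch, assuming NetCDF files that Iris loads lazily; the paths are illustrative:

import iris
from iris.analysis.stats import pearsonr

cube_a = iris.load_cube("sample_a.nc")  # Iris loads NetCDF data lazily
cube_b = iris.load_cube("sample_b.nc")

result = pearsonr(cube_a, cube_b)
print(result.has_lazy_data())  # True; nothing is computed until result.data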
@@ -78,90 +82,88 @@
 
     smaller_shape = cube_2.shape
 
-    dim_coords_1 = [coord.name() for coord in cube_1.dim_coords]
-    dim_coords_2 = [coord.name() for coord in cube_2.dim_coords]
-    common_dim_coords = list(set(dim_coords_1) & set(dim_coords_2))
+    # Get the broadcast, auto-transposed safe versions of the cube operands.
+    resolver = Resolve(cube_1, cube_2)
+    lhs_cube_resolved = resolver.lhs_cube_resolved
+    rhs_cube_resolved = resolver.rhs_cube_resolved
+
+    if lhs_cube_resolved.has_lazy_data() or rhs_cube_resolved.has_lazy_data():
+        al = da
+        array_lhs = lhs_cube_resolved.lazy_data()
+        array_rhs = rhs_cube_resolved.lazy_data()
+    else:
+        al = np
+        array_lhs = lhs_cube_resolved.data
+        array_rhs = rhs_cube_resolved.data
+
     # If no coords passed then set to all common dimcoords of cubes.
     if corr_coords is None:
-        corr_coords = common_dim_coords
-
-    def _ones_like(cube):
-        # Return a copy of cube with the same mask, but all data values set to 1.
-        # The operation is non-lazy.
-        # For safety we also discard any cell-measures and ancillary-variables, to
-        # avoid cube arithmetic possibly objecting to them, or inadvertently retaining
-        # them in the result where they might be inappropriate.
-        ones_cube = cube.copy()
-        ones_cube.data = np.ones_like(cube.data)
-        ones_cube.rename("unknown")
-        ones_cube.units = 1
-        for cm in ones_cube.cell_measures():
-            ones_cube.remove_cell_measure(cm)
-        for av in ones_cube.ancillary_variables():
-            ones_cube.remove_ancillary_variable(av)
-        return ones_cube
+        dim_coords_1 = {coord.name() for coord in lhs_cube_resolved.dim_coords}
+        dim_coords_2 = {coord.name() for coord in rhs_cube_resolved.dim_coords}
+        corr_coords = list(dim_coords_1.intersection(dim_coords_2))
+
+    # Interpret coords as array dimensions.
+    corr_dims = set()
+    if isinstance(corr_coords, str):
+        corr_coords = [corr_coords]
+    for coord in corr_coords:
+        corr_dims.update(lhs_cube_resolved.coord_dims(coord))
+
+    corr_dims = tuple(corr_dims)
 
     # Match up data masks if required.
     if common_mask:
-        # Create a cube of 1's with a common mask.
-        if ma.is_masked(cube_2.data):
-            mask_cube = _ones_like(cube_2)
-        else:
-            mask_cube = 1.0
-        if ma.is_masked(cube_1.data):
-            # Take a slice to avoid unnecessary broadcasting of cube_2.
-            slice_coords = [
-                dim_coords_1[i]
-                for i in range(cube_1.ndim)
-                if dim_coords_1[i] not in common_dim_coords
-                and np.array_equal(
-                    cube_1.data.mask.any(axis=i), cube_1.data.mask.all(axis=i)
-                )
-            ]
-            cube_1_slice = next(cube_1.slices_over(slice_coords))
-            mask_cube = _ones_like(cube_1_slice) * mask_cube
-        # Apply common mask to data.
-        if isinstance(mask_cube, iris.cube.Cube):
-            cube_1 = cube_1 * mask_cube
-            cube_2 = mask_cube * cube_2
-            dim_coords_2 = [coord.name() for coord in cube_2.dim_coords]
-
-    # Broadcast weights to shape of cubes if necessary.
-    if weights is None or cube_1.shape == smaller_shape:
-        weights_1 = weights
-        weights_2 = weights
+        mask_lhs = al.ma.getmaskarray(array_lhs)
+        if al is np:
+            # Reduce all invariant dimensions of mask_lhs to length 1. This avoids
+            # unnecessary broadcasting of array_rhs.
+            index = tuple(
+                slice(0, 1)
+                if np.array_equal(mask_lhs.any(axis=dim), mask_lhs.all(axis=dim))
+                else slice(None)
+                for dim in range(mask_lhs.ndim)
+            )
+            mask_lhs = mask_lhs[index]
+
+        array_rhs = _mask_array(array_rhs, mask_lhs)
+        array_lhs = _mask_array(array_lhs, al.ma.getmaskarray(array_rhs))
+
+    # Broadcast weights to shape of arrays if necessary.
+    if weights is None:
+        weights_lhs = weights_rhs = None
     else:
         if weights.shape != smaller_shape:
-            raise ValueError(
-                "weights array should have dimensions {}".format(smaller_shape)
-            )
+            msg = f"weights array should have dimensions {smaller_shape}"
+            raise ValueError(msg)
 
-        dims_1_common = [
-            i for i in range(cube_1.ndim) if dim_coords_1[i] in common_dim_coords
-        ]
-        weights_1 = broadcast_to_shape(weights, cube_1.shape, dims_1_common)
-        if cube_2.shape != smaller_shape:
-            dims_2_common = [
-                i for i in range(cube_2.ndim) if dim_coords_2[i] in common_dim_coords
-            ]
-            weights_2 = broadcast_to_shape(weights, cube_2.shape, dims_2_common)
-        else:
-            weights_2 = weights
+        wt_resolver = Resolve(cube_1, cube_2.copy(weights))
+        weights = wt_resolver.rhs_cube_resolved.data
+        weights_rhs = np.broadcast_to(weights, array_rhs.shape)
+        weights_lhs = np.broadcast_to(weights, array_lhs.shape)
 
     # Calculate correlations.
-    s1 = cube_1 - cube_1.collapsed(corr_coords, iris.analysis.MEAN, weights=weights_1)
-    s2 = cube_2 - cube_2.collapsed(corr_coords, iris.analysis.MEAN, weights=weights_2)
-
-    covar = (s1 * s2).collapsed(
-        corr_coords, iris.analysis.SUM, weights=weights_1, mdtol=mdtol
+    s_lhs = array_lhs - al.ma.average(
+        array_lhs, axis=corr_dims, weights=weights_lhs, keepdims=True
+    )
+    s_rhs = array_rhs - al.ma.average(
+        array_rhs, axis=corr_dims, weights=weights_rhs, keepdims=True
     )
-    var_1 = (s1**2).collapsed(corr_coords, iris.analysis.SUM, weights=weights_1)
-    var_2 = (s2**2).collapsed(corr_coords, iris.analysis.SUM, weights=weights_2)
 
-    denom = iris.analysis.maths.apply_ufunc(
-        np.sqrt, var_1 * var_2, new_unit=covar.units
+    s_prod = resolver.cube(s_lhs * s_rhs)
+
+    # Use cube collapsed method as it takes care of coordinate collapsing and missing
+    # data tolerance.
+    covar = s_prod.collapsed(
+        corr_coords, iris.analysis.SUM, weights=weights_lhs, mdtol=mdtol
     )
+
+    var_lhs = iris.analysis._sum(s_lhs**2, axis=corr_dims, weights=weights_lhs)
+    var_rhs = iris.analysis._sum(s_rhs**2, axis=corr_dims, weights=weights_rhs)
+
+    denom = np.sqrt(var_lhs * var_rhs)
+
     corr_cube = covar / denom
     corr_cube.rename("Pearson's r")
+    corr_cube.units = 1
 
     return corr_cube
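At array level this is the standard weighted Pearson formula: centre each operand with its weighted mean, then divide the weighted covariance by the product of the weighted standard deviations. A self-contained NumPy sketch of the same arithmetic, without the masking, laziness, or cube metadata handled above, useful for sanity-checking a result:

import numpy as np

def weighted_pearsonr(x, y, weights, axis):
    # Centre with the weighted mean (cf. al.ma.average(..., keepdims=True)).
    sx = x - np.average(x, axis=axis, weights=weights, keepdims=True)
    sy = y - np.average(y, axis=axis, weights=weights, keepdims=True)
    # Weighted covariance and variances (cf. the SUM collapses above).
    covar = np.sum(weights * sx * sy, axis=axis)
    var_x = np.sum(weights * sx**2, axis=axis)
    var_y = np.sum(weights * sy**2, axis=axis)
    return covar / np.sqrt(var_x * var_y)

# With uniform weights this reduces to plain Pearson r:
rng = np.random.default_rng(0)
x, y = rng.random((2, 100))
print(weighted_pearsonr(x, y, np.ones_like(x), axis=0))  # matches np.corrcoef(x, y)[0, 1]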
