Skip to content

Commit 2d96b56

Browse files
committed
FIX & TST
- Cast cdf to the dtype of quantile to avoid surprises.
- Convert scalar Python objects to pure Python objects, as in the unweighted case.
- Extend the weighted case to test_linear_interpolation.
- Extend the weighted case to test_percentile_out.
1 parent fc0b5b7 commit 2d96b56

File tree

2 files changed

+50
-24
lines changed

2 files changed

+50
-24
lines changed

numpy/lib/_function_base_impl.py

Lines changed: 27 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4934,8 +4934,9 @@ def _quantile(
49344934
gamma,
49354935
out=out)
49364936
else:
4937-
# Weighted case, we need to sort anyway.
4938-
# This implements method="inverted_cdf".
4937+
# Weighted case
4938+
# This implements method="inverted_cdf", the only supported weighted
4939+
# method, which needs to sort anyway.
49394940
weights = np.asanyarray(weights)
49404941
if axis != 0:
49414942
weights = np.moveaxis(weights, axis, destination=0)
@@ -4954,17 +4955,30 @@ def _quantile(
49544955
else:
49554956
# weights is 1d
49564957
weights = weights.reshape(-1)[index_array, ...]
4957-
# weights = np.take_along_axis(weights, index_array, axis=axis)
4958-
# We use the weights to calculate the empirical distribution function.
4959-
cdf = weights.cumsum(axis=0, dtype=float)
4960-
cdf /= cdf[-1, ...] # normalization
4961-
# Only inverted_cdf is supported. Search index i such that
4962-
# sum(weights[j], j=0..i-1) < quantile <= sum(weights[j], j=0..i)
4958+
4959+
# We use the weights to calculate the empirical cumulative
4960+
# distribution function cdf
4961+
cdf = weights.cumsum(axis=0, dtype=np.float64)
4962+
cdf /= cdf[-1, ...] # normalization to 1
4963+
# Search index i such that
4964+
# sum(weights[j], j=0..i-1) < quantile <= sum(weights[j], j=0..i)
4965+
# is then equivalent to
4966+
# cdf[i-1] < quantile <= cdf[i]
4967+
# Unfortunately, searchsorted only accepts 1-d arrays as first
4968+
# argument, so we will need to iterate over dimensions.
4969+
4970+
# Without the following cast, searchsorted can return surprising
4971+
# results, e.g.
4972+
# np.searchsorted(np.array([0.2, 0.4, 0.6, 0.8, 1.]),
4973+
# np.array(0.4, dtype=np.float32), side="left")
4974+
# returns 2 instead of 1 because 0.4 is not binary representable.
4975+
if quantiles.dtype.kind == "f":
4976+
cdf = cdf.astype(quantiles.dtype)
49634977

49644978
def find_cdf_1d(arr, cdf):
49654979
indices = np.searchsorted(cdf, quantiles, side="left")
49664980
# We might have reached the maximum with i = len(arr), e.g. for
4967-
# quantiles == 1.
4981+
# quantiles = 1, and need to cut it to len(arr) - 1.
49684982
indices = minimum(indices, values_count - 1)
49694983
result = take(arr, indices, axis=0)
49704984
return result
@@ -4984,6 +4998,10 @@ def find_cdf_1d(arr, cdf):
49844998
if out is not None:
49854999
np.copyto(out, result)
49865000

5001+
# Make result the same as in unweighted inverted_cdf.
5002+
if result.shape == () and result.dtype == np.dtype("O"):
5003+
result = result.item()
5004+
49875005
if np.any(slices_having_nans):
49885006
if result.ndim == 0 and out is None:
49895007
# can't write to a scalar, but indexing will be correct

numpy/lib/tests/test_function_base.py

Lines changed: 23 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3113,21 +3113,23 @@ def test_linear_nan_1D(self, dtype):
31133113
[(np.quantile, 0.4),
31143114
(np.percentile, 40.0)])
31153115
@pytest.mark.parametrize(["input_dtype", "expected_dtype"], H_F_TYPE_CODES)
3116-
@pytest.mark.parametrize(["method", "expected"],
3117-
[("inverted_cdf", 20),
3118-
("averaged_inverted_cdf", 27.5),
3119-
("closest_observation", 20),
3120-
("interpolated_inverted_cdf", 20),
3121-
("hazen", 27.5),
3122-
("weibull", 26),
3123-
("linear", 29),
3124-
("median_unbiased", 27),
3125-
("normal_unbiased", 27.125),
3116+
@pytest.mark.parametrize(["method", "weighted", "expected"],
3117+
[("inverted_cdf", False, 20),
3118+
("inverted_cdf", True, 20),
3119+
("averaged_inverted_cdf", False, 27.5),
3120+
("closest_observation", False, 20),
3121+
("interpolated_inverted_cdf", False, 20),
3122+
("hazen", False, 27.5),
3123+
("weibull", False, 26),
3124+
("linear", False, 29),
3125+
("median_unbiased", False, 27),
3126+
("normal_unbiased", False, 27.125),
31263127
])
31273128
def test_linear_interpolation(self,
31283129
function,
31293130
quantile,
31303131
method,
3132+
weighted,
31313133
expected,
31323134
input_dtype,
31333135
expected_dtype):
@@ -3136,6 +3138,7 @@ def test_linear_interpolation(self,
31363138
expected_dtype = np.promote_types(expected_dtype, np.float64)
31373139

31383140
arr = np.asarray([15.0, 20.0, 35.0, 40.0, 50.0], dtype=input_dtype)
3141+
weights = np.ones_like(arr) if weighted else None
31393142
if input_dtype is np.longdouble:
31403143
if function is np.quantile:
31413144
# 0.4 is not exactly representable and it matters
@@ -3146,12 +3149,12 @@ def test_linear_interpolation(self,
31463149
else:
31473150
test_function = np.testing.assert_array_almost_equal_nulp
31483151

3149-
actual = function(arr, quantile, method=method)
3152+
actual = function(arr, quantile, method=method, weights=weights)
31503153

31513154
test_function(actual, expected_dtype.type(expected))
31523155

31533156
if method in ["inverted_cdf", "closest_observation"]:
3154-
if input_dtype == "O":
3157+
if input_dtype == "O" :
31553158
np.testing.assert_equal(np.asarray(actual).dtype, np.float64)
31563159
else:
31573160
np.testing.assert_equal(np.asarray(actual).dtype,
@@ -3308,14 +3311,16 @@ def test_percentile_out(self, percentile, with_weights):
33083311
y = np.zeros((3,), dtype=out_dtype)
33093312
p = (1, 2, 3)
33103313
weights = np.ones_like(x) if with_weights else None
3311-
percentile(x, p, out=y, weights=weights)
3314+
r = percentile(x, p, out=y, weights=weights)
3315+
assert r is y
33123316
assert_equal(percentile(x, p, weights=weights), y)
33133317

33143318
x = np.array([[1, 2, 3],
33153319
[4, 5, 6]])
33163320
y = np.zeros((3, 3), dtype=out_dtype)
33173321
weights = np.ones_like(x) if with_weights else None
3318-
percentile(x, p, axis=0, out=y, weights=weights)
3322+
r = percentile(x, p, axis=0, out=y, weights=weights)
3323+
assert r is y
33193324
assert_equal(percentile(x, p, weights=weights, axis=0), y)
33203325

33213326
y = np.zeros((3, 2), dtype=out_dtype)
@@ -3324,7 +3329,10 @@ def test_percentile_out(self, percentile, with_weights):
33243329

33253330
x = np.arange(12).reshape(3, 4)
33263331
# q.dim > 1, float
3327-
r0 = np.array([[2., 3., 4., 5.], [4., 5., 6., 7.]])
3332+
if with_weights:
3333+
r0 = np.array([[0, 1, 2, 3], [4, 5, 6, 7]])
3334+
else:
3335+
r0 = np.array([[2., 3., 4., 5.], [4., 5., 6., 7.]])
33283336
out = np.empty((2, 4), dtype=out_dtype)
33293337
weights = np.ones_like(x) if with_weights else None
33303338
assert_equal(

0 commit comments

Comments (0)