Fixing tests on GPU (deepinv#569)

jscanvic · web-flow · commit 3569ab359388 · 2025-07-31T15:57:45.000+02:00
* create pr
* fix test_dataloader_formats
* fix test_trainer_physics_generator_params
* fix test_trainer_multidatasets
* fix test_trainer_identity
* black
* fix test_DEQ
* fix test_inpainting_generators
* fix test_metrics
* fix test_wavelet_decomposition
* black
* fix test_noise_model
* black
* fix None-related bug
* making rng states buffers again
* specify dtype as well
* revert changes made in test_wavelet_decomposition
* Revert "revert changes made in test_wavelet_decomposition"

  This reverts commit 9bc2a3b.
diff --git a/deepinv/loss/metric/distortion.py b/deepinv/loss/metric/distortion.py
@@ -680,16 +680,28 @@ def metric(self, x_net: Tensor = None, x: Tensor = None, *args, **kwargs) -> Ten
         coeffs_deg_y = self._haar_wavelet_decompose(deg_y, n_scales)
         if is_color_image:
             coefficients_ref_i = torch.abs(
-                self._convolve2d(ref_i, torch.ones((2, 2)) / 4.0)
+                self._convolve2d(
+                    ref_i,
+                    torch.ones((2, 2), device=ref_i.device, dtype=ref_i.dtype) / 4.0,
+                )
             )
             coefficients_deg_i = torch.abs(
-                self._convolve2d(deg_i, torch.ones((2, 2)) / 4.0)
+                self._convolve2d(
+                    deg_i,
+                    torch.ones((2, 2), device=deg_i.device, dtype=deg_i.dtype) / 4.0,
+                )
             )
             coefficients_ref_q = torch.abs(
-                self._convolve2d(ref_q, torch.ones((2, 2)) / 4.0)
+                self._convolve2d(
+                    ref_q,
+                    torch.ones((2, 2), device=ref_q.device, dtype=ref_q.dtype) / 4.0,
+                )
             )
             coefficients_deg_q = torch.abs(
-                self._convolve2d(deg_q, torch.ones((2, 2)) / 4.0)
+                self._convolve2d(
+                    deg_q,
+                    torch.ones((2, 2), device=deg_q.device, dtype=deg_q.dtype) / 4.0,
+                )
             )
 
         B, _, H, W = ref_y.shape
diff --git a/deepinv/physics/generator/base.py b/deepinv/physics/generator/base.py
@@ -73,6 +73,15 @@ def __init__(
                 device
             ), f"The random generator is not on the same device as the Physics Generator. Got random generator on {rng.device} and the Physics Generator named {self.__class__.__name__} on {self.device}."
             self.rng = rng
+
+        # NOTE: There is no use in moving RNG states from one device to another
+        # as Generator.set_state only supports inputs living on the CPU. Yet,
+        # by registering the initial random state as a buffer, it might be
+        # moved to another device. This might hinder performance as the tensor
+        # will need to be moved back to the CPU if it needs to be used later.
+        # We could fix that by letting it be a regular class attribute instead
+        # of a buffer but it would prevent it from being included in the
+        # state dicts which is undesirable.
         self.register_buffer("initial_random_state", self.rng.get_state().to(device))
 
         # Set attributes
@@ -114,7 +123,8 @@ def reset_rng(self):
         r"""
         Reset the random number generator to its initial state.
         """
-        self.rng.set_state(self.initial_random_state)
+        # NOTE: Generator.set_state expects a tensor living on the CPU.
+        self.rng.set_state(self.initial_random_state.cpu())
 
     def __add__(self, other):
         r"""
diff --git a/deepinv/physics/noise.py b/deepinv/physics/noise.py
@@ -22,6 +22,14 @@ def __init__(self, noise_model: Callable = None, rng: torch.Generator = None):
         self.noise_model = noise_model
         self.rng = rng
         if rng is not None:
+            # NOTE: There is no use in moving RNG states from one device to another
+            # as Generator.set_state only supports inputs living on the CPU. Yet,
+            # by registering the initial random state as a buffer, it might be
+            # moved to another device. This might hinder performance as the tensor
+            # will need to be moved back to the CPU if it needs to be used later.
+            # We could fix that by letting it be a regular class attribute instead
+            # of a buffer but it would prevent it from being included in the
+            # state dicts which is undesirable.
             self.register_buffer("initial_random_state", rng.get_state())
 
     def forward(self, input: torch.Tensor, seed: int = None) -> torch.Tensor:
@@ -71,7 +79,8 @@ def reset_rng(self):
         Reset the random number generator to its initial state.
         """
         if self.rng is not None:
-            self.rng.set_state(self.initial_random_state)
+            # NOTE: Generator.set_state expects a tensor living on the CPU.
+            self.rng.set_state(self.initial_random_state.cpu())
         else:
             warnings.warn(
                 "Cannot reset state for random number generator because it was not initialized. This is ignored."
@@ -511,10 +520,23 @@ def forward(self, x, gain=None, seed: int = None, **kwargs):
         self.to(x.device)
         gain = self.gain[(...,) + (None,) * (x.dim() - 1)]
 
-        y = torch.poisson(
-            torch.clip(x / gain, min=0.0) if self.clip_positive else x / gain,
-            generator=self.rng,
-        )
+        if self.clip_positive:
+            z = torch.clip(x / gain, min=0.0)
+        else:
+            # NOTE: PyTorch operations are generally run asynchronously on CUDA
+            # devices and the underlying CUDA kernel under
+            # torch.poisson typically raises a CUDA-level assertion error
+            # when its input has negative entries. Those errors can't be
+            # recovered from using Python's exception system due to their
+            # asynchronous nature. For this reason we add a manual check if the
+            # RNG is on a CUDA device.
+            if self.rng is not None and self.rng.device.type == "cuda":
+                assert gain > 0, "Gain must be positive"
+                assert torch.all(x >= 0), "Input tensor must be non-negative"
+
+            z = x / gain
+
+        y = torch.poisson(z, generator=self.rng)
         if self.normalize:
             y = y * gain
         return y
@@ -618,6 +640,17 @@ def forward(self, x, gain=None, sigma=None, seed: int = None, **kwargs):
         if self.clip_positive:
             y = torch.poisson(torch.clip(x / gain, min=0.0), generator=self.rng) * gain
         else:
+            # NOTE: PyTorch operations are generally run asynchronously on CUDA
+            # devices and the underlying CUDA kernel under
+            # torch.poisson typically raises a CUDA-level assertion error
+            # when its input has negative entries. Those errors can't be
+            # recovered from using Python's exception system due to their
+            # asynchronous nature. For this reason we add a manual check if the
+            # RNG is on a CUDA device.
+            if self.rng is not None and self.rng.device.type == "cuda":
+                assert gain > 0, "Gain must be positive"
+                assert torch.all(x >= 0), "Input tensor must be non-negative"
+
             y = torch.poisson(x / gain, generator=self.rng) * gain
 
         y = y + self.randn_like(x) * sigma
diff --git a/deepinv/tests/test_generators.py b/deepinv/tests/test_generators.py
@@ -372,6 +372,7 @@ def choose_inpainting_generator(name, img_size, split_ratio, pixelwise, device,
         return dinv.physics.generator.MultiplicativeSplittingMaskGenerator(
             img_size=img_size,
             split_generator=mri_gen,
+            device=device,
         )
     else:
         raise Exception("The generator chosen doesn't exist")
diff --git a/deepinv/tests/test_loss.py b/deepinv/tests/test_loss.py
@@ -100,7 +100,7 @@ def model(x):
     assert torch.allclose(regfnel2, reg_fne_target, rtol=1e-3)
 
 
-def choose_loss(loss_name, rng=None):
+def choose_loss(loss_name, rng=None, device="cpu"):
     loss = []
     if loss_name == "mcei":
         loss.append(dinv.loss.MCLoss())
@@ -115,7 +115,7 @@ def choose_loss(loss_name, rng=None):
             "installed with `pip install kornia`",
         )
         loss.append(dinv.loss.MCLoss())
-        loss.append(dinv.loss.EILoss(dinv.transform.Homography()))
+        loss.append(dinv.loss.EILoss(dinv.transform.Homography(device=device)))
     elif loss_name == "splittv":
         loss.append(dinv.loss.SplittingLoss(split_ratio=0.25))
         loss.append(dinv.loss.TVLoss())
@@ -293,7 +293,7 @@ def test_losses(
     non_blocking_plots, loss_name, tmp_path, dataset, physics, imsize, device, rng
 ):
     # choose training losses
-    loss = choose_loss(loss_name, rng)
+    loss = choose_loss(loss_name, rng, device=device)
 
     save_dir = tmp_path / "dataset"
     # choose backbone denoiser
@@ -434,6 +434,7 @@ def test_measplit(device, loss_name, rng):
             dinv.physics.generator.BernoulliSplittingMaskGenerator(
                 imsize, 0.5, device=device, rng=rng
             ),
+            device=device,
         )
         loss = dinv.loss.mri.WeightedSplittingLoss(
             mask_generator=gen, physics_generator=physics.gen
@@ -444,6 +445,7 @@ def test_measplit(device, loss_name, rng):
             dinv.physics.generator.BernoulliSplittingMaskGenerator(
                 imsize, 0.5, device=device, rng=rng
             ),
+            device=device,
         )
         loss = dinv.loss.mri.RobustSplittingLoss(
             mask_generator=gen,
diff --git a/deepinv/tests/test_models.py b/deepinv/tests/test_models.py
@@ -493,14 +493,22 @@ def test_wavelet_decomposition(channels, dimension, batch_size, device):
     # 1 decomposition
     out = model.dwt(x)
     x_hat = model.iwt(out)
-    assert x_hat.shape == x.shape and torch.allclose(x, x_hat, rtol=1e-5, atol=1e-5)
+
+    # For some reason the precision is more than 100x lower on GPU.
+    tol = 1e-3 if torch.device(device).type == "cuda" else 1e-5
+
+    # NOTE: Tensors are broadcasted in torch.allclose so
+    # they might pass the test even if they have different shapes. For this
+    # reason we also check the shapes.
+    assert x_hat.shape == x.shape
+    assert torch.allclose(x, x_hat, rtol=tol, atol=tol)
 
     # 2 decomposition
     cA1, cD1 = model.dwt(x)
     cA2, cD2 = model.dwt(cA1)
 
     x_hat = model.iwt((cA2, cD2, cD1))
-    assert torch.allclose(x, x_hat, rtol=1e-5, atol=1e-5)
+    assert torch.allclose(x, x_hat, rtol=tol, atol=tol)
 
 
 def test_drunet_inputs(imsize_1_channel, device):
diff --git a/deepinv/tests/test_trainer.py b/deepinv/tests/test_trainer.py
@@ -110,7 +110,7 @@ def step(self, batch_size=1, seed=None, **kwargs):
                 "f": torch.rand((batch_size,), generator=self.rng, device=device).item()
             }
 
-    return DummyPhysicsGenerator(rng=rng)
+    return DummyPhysicsGenerator(rng=rng, device=device)
 
 
 @pytest.mark.parametrize(
@@ -262,7 +262,7 @@ def test_trainer_physics_generator_params(
 ):
     N = 10
     rng1 = rng
-    rng2 = torch.Generator().manual_seed(0)
+    rng2 = torch.Generator(device).manual_seed(0)
 
     class DummyPhysics(Physics):
         # Dummy physics which sums images, and multiplies by a parameter f
@@ -377,6 +377,7 @@ def forward(self, y=0.0, physics=None, **kwargs):
             return self.dummy_param * y
 
     dummy_model = DummyModel()
+    dummy_model.to(device)
     optimizer = torch.optim.Adam(dummy_model.parameters(), lr=1e-2, weight_decay=0.0)
 
     trainer = Trainer(
@@ -439,6 +440,7 @@ def forward(self, y=0.0, physics=None, **kwargs):
             return self.dummy_param * torch.ones_like(y)
 
     dummy_model = DummyModel()
+    dummy_model.to(device)
     optimizer = torch.optim.Adam(dummy_model.parameters(), lr=1e-2, weight_decay=0.0)
 
     trainer = Trainer(
@@ -574,9 +576,11 @@ def __len__(self):
 
         def __getitem__(self, i):
             params = generator.step(1)
+            # NOTE: The test relies on changing params in place.
             params["mask"] = params["mask"].squeeze(0)
-            x = torch.ones(imsize)
-            y = x * params["mask"]
+            mask = params["mask"]
+            x = torch.ones(imsize, device=mask.device, dtype=mask.dtype)
+            y = x * mask
             if ground_truth:
                 if measurements:
                     if generate_params:
@@ -627,8 +631,8 @@ def __getitem__(self, i):
 
     # fmt: off
     def assert_x_none(x): assert x is None
-    def assert_x_full(x): assert x.mean() == 1.
-    def assert_physics_unchanged(physics): assert physics.mask.mean() == 1. # params not loaded
+    def assert_x_full(x): assert math.isclose(x.mean(), 1.0, abs_tol=1e-7)
+    def assert_physics_unchanged(physics): assert math.isclose(physics.mask.mean(), 1.0, abs_tol=1e-7) # params not loaded
     def assert_physics_offline(physics): assert physics.mask.mean() < .2
     def assert_physics_online(physics): assert physics.mask.mean() > .8
     def assert_y_offline(y): assert y.mean() < .2
diff --git a/deepinv/tests/test_unfolded.py b/deepinv/tests/test_unfolded.py
@@ -124,6 +124,7 @@ def test_DEQ(unfolded_algo, imsize, dummy_dataset, device):
                 anderson_acceleration_backward=and_acc,
                 jacobian_free=jac_free,
             )
+            model.to(device)
 
             for idx, (name, param) in enumerate(model.named_parameters()):
                 assert param.requires_grad

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -372,6 +372,7 @@ def choose_inpainting_generator(name, img_size, split_ratio, pixelwise, device,` |
| `372` | `372` | `return dinv.physics.generator.MultiplicativeSplittingMaskGenerator(` |
| `373` | `373` | `img_size=img_size,` |
| `374` | `374` | `split_generator=mri_gen,` |
| | `375` | `+ device=device,` |
| `375` | `376` | `)` |
| `376` | `377` | `else:` |
| `377` | `378` | `raise Exception("The generator chosen doesn't exist")` |
| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -124,6 +124,7 @@ def test_DEQ(unfolded_algo, imsize, dummy_dataset, device):` |
| `124` | `124` | `anderson_acceleration_backward=and_acc,` |
| `125` | `125` | `jacobian_free=jac_free,` |
| `126` | `126` | `)` |
| | `127` | `+ model.to(device)` |
| `127` | `128` | |
| `128` | `129` | `for idx, (name, param) in enumerate(model.named_parameters()):` |
| `129` | `130` | `assert param.requires_grad` |