
Commit cecf2c0

Merge branch 'main' into WSMR/wait_for_state
2 parents: b1ba2b0 + 33c5cb2

5 files changed: +172 −13 lines

distributed/scheduler.py

Lines changed: 6 additions & 0 deletions
@@ -1065,8 +1065,13 @@ class TaskState:
     #: Cached hash of :attr:`~TaskState.client_key`
     _hash: int
 
+    # Support for weakrefs to a class with __slots__
+    __weakref__: Any = None
     __slots__ = tuple(__annotations__)  # type: ignore
 
+    # Instances not part of slots since class variable
+    _instances: ClassVar[weakref.WeakSet[TaskState]] = weakref.WeakSet()
+
     def __init__(self, key: str, run_spec: object):
         self.key = key
         self._hash = hash(key)
@@ -1101,6 +1106,7 @@ def __init__(self, key: str, run_spec: object):
         self.metadata = {}
         self.annotations = {}
         self.erred_on = set()
+        TaskState._instances.add(self)
 
     def __hash__(self) -> int:
         return self._hash
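
Note (not part of the diff): the `__weakref__` annotation is what makes the `WeakSet` tracking possible on a class that uses `__slots__` — a slotted class has no `__weakref__` slot by default, so it cannot be weakly referenced until one is declared. A minimal, self-contained sketch of the pattern, with illustrative names and assuming CPython's immediate refcount-based cleanup:

    import weakref
    from typing import Any, ClassVar

    class Plain:
        __slots__ = ("key",)  # no __weakref__ slot: weak references are rejected

    class Tracked:
        key: str
        # Declaring __weakref__ as a slot is what allows weak references
        __weakref__: Any
        __slots__ = tuple(__annotations__)

        # Class variable; defined after __slots__ so it is not turned into a slot
        _instances: ClassVar[weakref.WeakSet] = weakref.WeakSet()

        def __init__(self, key: str):
            self.key = key
            Tracked._instances.add(self)

    try:
        weakref.ref(Plain())
    except TypeError as e:
        print(e)  # cannot create weak reference to 'Plain' object

    t = Tracked("x")
    assert len(Tracked._instances) == 1
    del t  # the WeakSet entry disappears with the last strong reference
    assert len(Tracked._instances) == 0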

distributed/tests/test_active_memory_manager.py

Lines changed: 104 additions & 2 deletions
@@ -9,13 +9,20 @@
 
 import pytest
 
-from distributed import Nanny, wait
+from distributed import Event, Nanny, Scheduler, Worker, wait
 from distributed.active_memory_manager import (
     ActiveMemoryManagerExtension,
     ActiveMemoryManagerPolicy,
+    RetireWorker,
 )
 from distributed.core import Status
-from distributed.utils_test import captured_logger, gen_cluster, inc, slowinc
+from distributed.utils_test import (
+    assert_story,
+    captured_logger,
+    gen_cluster,
+    inc,
+    slowinc,
+)
 
 NO_AMM_START = {"distributed.scheduler.active-memory-manager.start": False}
 
@@ -903,6 +910,101 @@ async def test_RetireWorker_all_recipients_are_paused(c, s, a, b):
     assert await c.submit(inc, 1) == 2
 
 
+@gen_cluster(
+    client=True,
+    config={
+        "distributed.scheduler.active-memory-manager.start": True,  # to avoid a one-off AMM instance
+        "distributed.scheduler.active-memory-manager.policies": [],
+    },
+    timeout=15,
+)
+async def test_RetireWorker_new_keys_arrive_after_all_keys_moved_away(
+    c, s: Scheduler, a: Worker, b: Worker
+):
+    """
+    If all keys have been moved off a worker, but then new keys arrive (due to task completion or `gather_dep`)
+    before the worker has actually closed, make sure we still retire it (instead of hanging forever).
+
+    This test is timing-sensitive. If it runs too slowly, it *should* `pytest.skip` itself.
+
+    See https://github.com/dask/distributed/issues/6223 for motivation.
+    """
+    ws_a = s.workers[a.address]
+    ws_b = s.workers[b.address]
+    event = Event()
+
+    # Put 200 keys on the worker, so `_track_retire_worker` will sleep for 0.5s
+    xs = c.map(lambda x: x, range(200), workers=[a.address])
+    await wait(xs)
+
+    # Put an extra task on the worker, which we will allow to complete once the `xs`
+    # have been replicated.
+    extra = c.submit(
+        lambda: event.wait("2s"),
+        workers=[a.address],
+        allow_other_workers=True,
+        key="extra",
+    )
+
+    while (
+        extra.key not in a.state.tasks or a.state.tasks[extra.key].state != "executing"
+    ):
+        await asyncio.sleep(0.01)
+
+    t = asyncio.create_task(c.retire_workers([a.address]))
+
+    # Wait for all `xs` to be replicated.
+    while len(ws_b.has_what) != len(xs):
+        await asyncio.sleep(0)
+
+    # `_track_retire_worker` _should_ now be sleeping for 0.5s, because there were >=200 keys on A.
+    # In this test, everything from the beginning of the transfers needs to happen within 0.5s.
+
+    # Simulate the policy running again. Because the default 2s AMM interval is longer
+    # than the 0.5s wait, what we're about to trigger is unlikely, but it's still possible
+    # for the times to line up (especially with a custom AMM interval).
+    amm: ActiveMemoryManagerExtension = s.extensions["amm"]
+    assert len(amm.policies) == 1
+    policy = next(iter(amm.policies))
+    assert isinstance(policy, RetireWorker)
+
+    amm.run_once()
+
+    # The policy has removed itself, because all `xs` have been replicated.
+    assert not amm.policies
+    assert policy.done(), {ts.key: ts.who_has for ts in ws_a.has_what}
+
+    # But what if a new key arrives now, while `_track_retire_worker` is still (maybe)
+    # sleeping? Let `extra` complete and wait for it to hit the scheduler.
+    await event.set()
+    await wait(extra)
+
+    if a.address not in s.workers:
+        # It took more than 0.5s to get here, and the scheduler closed our worker. Dang.
+        pytest.skip(
+            "Timing didn't work out: `_track_retire_worker` finished before `extra` completed."
+        )
+
+    # `retire_workers` doesn't hang
+    await t
+    assert a.address not in s.workers
+    assert not amm.policies
+
+    # `extra` was not transferred from `a` to `b`. Instead, it was recomputed on `b`.
+    story = b.state.story(extra.key)
+    assert_story(
+        story,
+        [
+            (extra.key, "compute-task", "released"),
+            (extra.key, "released", "waiting", "waiting", {"extra": "ready"}),
+            (extra.key, "waiting", "ready", "ready", {"extra": "executing"}),
+        ],
+    )
+
+    # `extra` completes successfully, and its result is fetched from the other worker.
+    await extra.result()
+
+
 # FIXME can't drop runtime of this test below 10s; see distributed#5585
 @pytest.mark.slow
 @gen_cluster(
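
Side note (an illustration, not part of the change): the test above holds the `extra` task open with a `distributed.Event`, which lets the test decide exactly when a running task is allowed to finish. A stripped-down sketch of that gating pattern — the cluster setup, event name, and key are made up for the example:

    from distributed import Client, Event

    def gated(x):
        # Runs on a worker; blocks until the driver sets the named event
        # (or the timeout expires).
        Event("gate").wait(timeout="5s")
        return x + 1

    if __name__ == "__main__":
        client = Client(processes=False)
        fut = client.submit(gated, 1, key="extra")
        # ... a test would rearrange cluster state here while "extra" is executing ...
        Event("gate", client=client).set()  # unblock the task
        assert fut.result() == 2
        client.close()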

distributed/tests/test_worker_state_machine.py

Lines changed: 42 additions & 1 deletion
@@ -1,12 +1,16 @@
 from __future__ import annotations
 
 import asyncio
+import gc
 from collections.abc import Iterator
 
 import pytest
+from tlz import first
 
-from distributed import Worker, wait
+import distributed.profile as profile
+from distributed import Nanny, Worker, wait
 from distributed.protocol.serialize import Serialize
+from distributed.scheduler import TaskState as SchedulerTaskState
 from distributed.utils import recursive_to_dict
 from distributed.utils_test import (
     BlockedGetData,
@@ -37,6 +41,15 @@
 )
 
 
+def test_TaskState_tracking(cleanup):
+    gc.collect()
+    x = TaskState("x")
+    assert len(TaskState._instances) == 1
+    assert first(TaskState._instances) == x
+    del x
+    assert len(TaskState._instances) == 0
+
+
 def test_TaskState_get_nbytes():
     assert TaskState("x", nbytes=123).get_nbytes() == 123
     # Default to distributed.scheduler.default-data-size
@@ -670,6 +683,34 @@ async def test_missing_to_waiting(c, s, w1, w2, w3):
     await f1
 
 
+@gen_cluster(client=True, Worker=Nanny)
+async def test_task_state_instance_are_garbage_collected(c, s, a, b):
+    futs = c.map(inc, range(10))
+    red = c.submit(sum, futs)
+    f1 = c.submit(inc, red, pure=False)
+    f2 = c.submit(inc, red, pure=False)
+
+    async def check(dask_worker):
+        while dask_worker.tasks:
+            await asyncio.sleep(0.01)
+        with profile.lock:
+            gc.collect()
+        assert not TaskState._instances
+
+    await c.gather([f2, f1])
+    del futs, red, f1, f2
+    await c.run(check)
+
+    async def check(dask_scheduler):
+        while dask_scheduler.tasks:
+            await asyncio.sleep(0.01)
+        with profile.lock:
+            gc.collect()
+        assert not SchedulerTaskState._instances
+
+    await c.run_on_scheduler(check)
+
+
 @gen_cluster(client=True, nthreads=[("", 1)] * 3)
 async def test_fetch_to_missing_on_refresh_who_has(c, s, w1, w2, w3):
     """

distributed/utils_test.py

Lines changed: 4 additions & 3 deletions
@@ -58,6 +58,7 @@
 from distributed.node import ServerNode
 from distributed.proctitle import enable_proctitle_on_children
 from distributed.protocol import deserialize
+from distributed.scheduler import TaskState as SchedulerTaskState
 from distributed.security import Security
 from distributed.utils import (
     DequeHandler,
@@ -72,6 +73,7 @@
 )
 from distributed.worker import WORKER_ANY_RUNNING, Worker
 from distributed.worker_state_machine import InvalidTransition
+from distributed.worker_state_machine import TaskState as WorkerTaskState
 
 try:
     import ssl
@@ -1839,9 +1841,8 @@ def check_instances():
     Scheduler._instances.clear()
     SpecCluster._instances.clear()
     Worker._initialized_clients.clear()
-    # assert all(n.status == "closed" for n in Nanny._instances), {
-    #     n: n.status for n in Nanny._instances
-    # }
+    SchedulerTaskState._instances.clear()
+    WorkerTaskState._instances.clear()
     Nanny._instances.clear()
     _global_clients.clear()
    Comm._instances.clear()
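
For context (an illustration, not the project's actual fixture): clearing the new `_instances` sets in `check_instances` keeps one test's leftover `TaskState` objects from tripping assertions in the next test. A leak check along the same lines could be written as a pytest fixture; it relies only on the `_instances` WeakSets introduced in this commit, everything else is hypothetical:

    import gc

    import pytest

    from distributed.scheduler import TaskState as SchedulerTaskState
    from distributed.worker_state_machine import TaskState as WorkerTaskState

    @pytest.fixture
    def no_leaked_task_states():
        # Start from a clean slate so earlier tests cannot cause false positives.
        SchedulerTaskState._instances.clear()
        WorkerTaskState._instances.clear()
        yield
        # Collect cycles first; otherwise objects kept alive only by cycles linger.
        gc.collect()
        assert not SchedulerTaskState._instances
        assert not WorkerTaskState._instances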

distributed/worker_state_machine.py

Lines changed: 16 additions & 7 deletions
@@ -8,6 +8,7 @@
 import operator
 import random
 import sys
+import weakref
 from collections import defaultdict, deque
 from collections.abc import (
     Callable,
@@ -262,20 +263,23 @@ class TaskState:
     #: True if the task is in memory or erred; False otherwise
     done: bool = False
 
+    _instances: ClassVar[weakref.WeakSet[TaskState]] = weakref.WeakSet()
+
     # Support for weakrefs to a class with __slots__
     __weakref__: Any = field(init=False)
 
+    def __post_init__(self):
+        TaskState._instances.add(self)
+
     def __repr__(self) -> str:
         return f"<TaskState {self.key!r} {self.state}>"
 
     def __eq__(self, other: object) -> bool:
-        if not isinstance(other, TaskState) or other.key != self.key:
-            return False
-        # When a task transitions to forgotten and exits Worker.tasks, it should be
-        # immediately dereferenced. If the same task is recreated later on on the
-        # worker, we should not have to deal with its previous incarnation lingering.
-        assert other is self
-        return True
+        # A task may be forgotten, and a new TaskState object with the same key may be
+        # created in its place later on. In the worker state there should never be two
+        # TaskState objects with the same key, but we can't assert that here, because
+        # this comparison is also used by WeakSets for instance tracking purposes.
+        return other is self
 
     def __hash__(self) -> int:
         return hash(self.key)
@@ -3002,6 +3006,11 @@ def validate_state(self) -> None:
         if self.transition_counter_max:
             assert self.transition_counter < self.transition_counter_max
 
+        # Test that there aren't multiple TaskState objects with the same key in data_needed
+        assert len({ts.key for ts in self.data_needed}) == len(self.data_needed)
+        for tss in self.data_needed_per_worker.values():
+            assert len({ts.key for ts in tss}) == len(tss)
+
 
 class BaseWorker(abc.ABC):
     """Wrapper around the :class:`WorkerState` that implements instructions handling.
