
dask distributed, fails to start worker #2446

@redsum

Description


I get a timeout error when trying to start a LocalCluster with 4 processes or more:

    from dask.distributed import Client, LocalCluster

    cluster = LocalCluster(processes=True, n_workers=4, death_timeout=None)  # , silence_logs=logging.DEBUG)
    client = Client(cluster)

I'm using dask 1.25.1 with Python 2.7 on macOS.
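For reference, here is the same reproducer as a standalone script. The if __name__ == '__main__': guard is my addition (it is the pattern the dask documentation recommends when starting process-based workers from a script); everything else mirrors the snippet above.

    from dask.distributed import Client, LocalCluster

    if __name__ == '__main__':
        # Guarding the entry point rules out problems caused by the script
        # being re-imported when process-based workers are started.
        cluster = LocalCluster(processes=True, n_workers=4, death_timeout=None)
        client = Client(cluster)
        print(client)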

This also happens during the tests. I modified the dask distributed test test_procs, found in distributed/deploy/tests/test_local.py, like this:

def test_procs():
    with LocalCluster(4, scheduler_port=0, processes=True, threads_per_worker=3,
                      diagnostics_port=None, silence_logs=False) as c:
        assert len(c.workers) == 4  # changed from 2 to match the 4 workers requested above
        assert all(isinstance(w, Nanny) for w in c.workers)
        with Client(c.scheduler.address) as e:
            assert all(v == 3 for v in e.ncores().values())

            c.start_worker()
            assert all(isinstance(w, Nanny) for w in c.workers)
        repr(c)

I set n_workers to 4 instead of 2, and I get this error:

test_local.py:68: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
../local.py:141: in __init__
    self.start(ip=ip, n_workers=n_workers)
../local.py:174: in start
    self.sync(self._start, **kwargs)
../local.py:167: in sync
    return sync(self.loop, func, *args, **kwargs)
../../utils.py:277: in sync
    six.reraise(*error[0])
../../utils.py:262: in f
    result[0] = yield future
../../../../../anaconda/lib/python2.7/site-packages/tornado/gen.py:1133: in run
    value = future.result()
../../../../../anaconda/lib/python2.7/site-packages/tornado/concurrent.py:269: in result
    raise_exc_info(self._exc_info)
../../../../../anaconda/lib/python2.7/site-packages/tornado/gen.py:1141: in run
    yielded = self.gen.throw(*exc_info)
../local.py:194: in _start
    yield [self._start_worker(**self.worker_kwargs) for i in range(n_workers)]
../../../../../anaconda/lib/python2.7/site-packages/tornado/gen.py:1133: in run
    value = future.result()
../../../../../anaconda/lib/python2.7/site-packages/tornado/concurrent.py:269: in result
    raise_exc_info(self._exc_info)
../../../../../anaconda/lib/python2.7/site-packages/tornado/gen.py:883: in callback
    result_list.append(f.result())
../../../../../anaconda/lib/python2.7/site-packages/tornado/concurrent.py:269: in result
    raise_exc_info(self._exc_info)
../../../../../anaconda/lib/python2.7/site-packages/tornado/gen.py:1147: in run
    yielded = self.gen.send(value)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

self = LocalCluster('tcp://127.0.0.1:55398', workers=3, ncores=9)
death_timeout = 60, kwargs = {'ncores': 3, 'quiet': True, 'services': {}}
W = <class 'distributed.nanny.Nanny'>, w = <Nanny: None, threads: 3>

    @gen.coroutine
    def _start_worker(self, death_timeout=60, **kwargs):
        if self.status and self.status.startswith('clos'):
            warnings.warn("Tried to start a worker while status=='%s'" % self.status)
            return
    
        if self.processes:
            W = Nanny
            kwargs['quiet'] = True
        else:
            W = Worker
    
        w = W(self.scheduler.address, loop=self.loop,
              death_timeout=death_timeout,
              silence_logs=self.silence_logs, **kwargs)
        yield w._start()
    
        self.workers.append(w)
    
        while w.status != 'closed' and w.worker_address not in self.scheduler.workers:
            yield gen.sleep(0.01)
    
        if w.status == 'closed' and self.scheduler.status == 'running':
            self.workers.remove(w)
>           raise gen.TimeoutError("Worker failed to start")
E           TimeoutError: Worker failed to start

../local.py:224: TimeoutError
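Reading the _start_worker code shown in the traceback, the error is raised because the worker's status goes to 'closed' while the scheduler is still running, i.e. the nanny process dies before it ever registers with the scheduler, rather than just registering slowly. As a sanity check (my own suggestion, not part of the original report), the same cluster can be started with in-process workers instead of nanny processes; if this starts cleanly, the failure is specific to processes=True.

    from dask.distributed import Client, LocalCluster

    if __name__ == '__main__':
        # Same worker count, but threads in the local process instead of
        # separate nanny/worker processes.
        cluster = LocalCluster(processes=False, n_workers=4)
        client = Client(cluster)
        print(client.ncores())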
