Skip to content

Commit 0c0f32d

Browse files
authored
feat(worker-manager): Timeout leads to stuck process (#5293)
Force provisioner loop to exit in case of timeouts
1 parent 62629c5 commit 0c0f32d

File tree

5 files changed

+46
-4
lines changed

5 files changed

+46
-4
lines changed

changelog/issue-5003.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
audience: general
2+
level: patch
3+
reference: issue 5003
4+
---
5+
6+
Allow the provisioner to exit instead of getting stuck in a delayed loop.

services/worker-manager/src/provisioner.js

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
const Iterate = require('taskcluster-lib-iterate');
22
const { paginatedIterator } = require('taskcluster-lib-postgres');
33
const { WorkerPool, Worker } = require('./data');
4+
const { ApiError } = require('./providers/provider');
45

56
/**
67
* Run all provisioning logic
@@ -64,8 +65,10 @@ class Provisioner {
6465
*/
6566
async provision() {
6667
if (this.provisioningLoopAlive) {
67-
this.monitor.notice('loop-interference', {});
68-
return;
68+
this.monitor.alert('loop-interference', {});
69+
// should be treated as terminal error
70+
// to let the pod restart and avoid getting stuck in a loop
71+
throw new ApiError('provision loop interference');
6972
}
7073
try {
7174
this.provisioningLoopAlive = true;

services/worker-manager/test/fakes/aws.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ const ec2Method = (context, method) => {
4747
* A fake region-specific EC2 client. This has the SDK methods (all with a `.promise()`
4848
* trailer). It also has properties:
4949
*
50-
* - runInstancesCalls: an array of the launch config passsed to each runInstances call
50+
* - runInstancesCalls: an array of the launch config passed to each runInstances call
5151
* - terminatedInstances: instance IDs for which terminateInstances has been called
5252
* - instanceStatuses: values returned from describeInstanceStatus, in the form {instanceId: state}
5353
*/

services/worker-manager/test/fakes/azure.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ class FakeAzure extends FakeCloud {
3636
}
3737

3838
_reset() {
39-
// managers indexed by resoruceType
39+
// managers indexed by resourceType
4040
this._managers = {
4141
vm: new VMResourceManager(this, 'vm', 'azure-vm.yml'),
4242
// (no schema for disks as provider does not create them directly)

services/worker-manager/test/provisioner_test.js

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ const testing = require('taskcluster-lib-testing');
44
const taskcluster = require('taskcluster-client');
55
const { LEVELS } = require('taskcluster-lib-monitor');
66
const { WorkerPool, Worker } = require('../src/data');
7+
const { ApiError } = require('../src/providers/provider');
78

89
helper.secrets.mockSuite(testing.suiteName(), [], function(mock, skipping) {
910
helper.withDb(mock, skipping);
@@ -358,6 +359,38 @@ helper.secrets.mockSuite(testing.suiteName(), [], function(mock, skipping) {
358359
Severity: LEVELS.info,
359360
});
360361
});
362+
363+
test("provision loop is not running in parallel", async function() {
364+
await WorkerPool.fromApi({
365+
workerPoolId: 'pp/ww',
366+
providerId: 'testing1',
367+
previousProviderIds: ['NO-SUCH'],
368+
description: '',
369+
created: new Date(),
370+
lastModified: new Date(),
371+
config: {},
372+
373+
emailOnError: false,
374+
providerData: {},
375+
}).create(helper.db);
376+
const provisioner = await helper.load('provisioner');
377+
378+
await assert.rejects(async () => {
379+
await Promise.all([
380+
provisioner.provision(),
381+
provisioner.provision(),
382+
]);
383+
}, new ApiError('provision loop interference'));
384+
385+
assert.deepEqual(
386+
monitor.manager.messages.find(msg => msg.Type === 'loop-interference'), {
387+
Fields: {},
388+
Logger: 'taskcluster.test.provisioner',
389+
Severity: 1,
390+
Type: 'loop-interference',
391+
});
392+
monitor.manager.reset();
393+
});
361394
});
362395

363396
suite('deprovisioning loop', function() {

0 commit comments

Comments
 (0)