Skip to content

Commit 13c81a8

Browse files
nvazquezyadvr
authored andcommitted
server: Prevent corner case for infinite PrepareForMaintenance (apache#3095)
A corner case was found on 4.11.2 for apache#2493 leading to an infinite loop in state PrepareForMaintenance To prevent such cases, in which failed migrations are detected but still running on the host, this feature adds a new cluster setting host.maintenance.retries which is the number of retries before marking the host as ErrorInMaintenance if migration errors persist. How Has This Been Tested? - 2 KVM hosts, pick one which has running VMs as H - Block migrations ports on H to simulate failures on migrations: iptables -I OUTPUT -j REJECT -m state --state NEW -m tcp -p tcp --dport 49152:49215 -m comment --comment 'test block migrations' iptables -I OUTPUT -j REJECT -m state --state NEW -m tcp -p tcp --dport 16509 -m comment --comment 'test block migrations - Put host H in Maintenance - Observe that host is indefinitely in PrepareForMaintenance state (after this fix it goes into ErrorInMaintenance after retrying host.maintenance.retries times)
1 parent 68b4b84 commit 13c81a8

File tree

4 files changed

+79
-1
lines changed

4 files changed

+79
-1
lines changed

engine/components-api/src/com/cloud/resource/ResourceManager.java

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,12 +38,20 @@
3838
import com.cloud.hypervisor.Hypervisor.HypervisorType;
3939
import com.cloud.resource.ResourceState.Event;
4040
import com.cloud.utils.fsm.NoTransitionException;
41+
import org.apache.cloudstack.framework.config.ConfigKey;
42+
import org.apache.cloudstack.framework.config.Configurable;
4143

4244
/**
4345
* ResourceManager manages how physical resources are organized within the
4446
* CloudStack. It also manages the life cycle of the physical resources.
4547
*/
46-
public interface ResourceManager extends ResourceService {
48+
public interface ResourceManager extends ResourceService, Configurable {
49+
50+
ConfigKey<Integer> HostMaintenanceRetries = new ConfigKey<>("Advanced", Integer.class,
51+
"host.maintenance.retries","20",
52+
"Number of retries when preparing a host into Maintenance Mode is faulty before failing",
53+
true, ConfigKey.Scope.Cluster);
54+
4755
/**
4856
* Register a listener for different types of resource life cycle events.
4957
* There can only be one type of listener per type of host.

server/src/com/cloud/resource/ResourceManagerImpl.java

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,13 @@
2626
import java.util.List;
2727
import java.util.Map;
2828
import java.util.Random;
29+
import java.util.concurrent.ConcurrentHashMap;
2930

3031
import javax.inject.Inject;
3132
import javax.naming.ConfigurationException;
3233

3334
import com.cloud.vm.dao.UserVmDetailsDao;
35+
import org.apache.cloudstack.framework.config.ConfigKey;
3436
import org.apache.commons.lang.ObjectUtils;
3537
import org.apache.log4j.Logger;
3638
import org.springframework.stereotype.Component;
@@ -271,6 +273,8 @@ public void setDiscoverers(final List<? extends Discoverer> discoverers) {
271273

272274
private SearchBuilder<HostGpuGroupsVO> _gpuAvailability;
273275

276+
private Map<Long,Integer> retryHostMaintenance = new ConcurrentHashMap<>();
277+
274278
private void insertListener(final Integer event, final ResourceListener listener) {
275279
List<ResourceListener> lst = _lifeCycleListeners.get(event);
276280
if (lst == null) {
@@ -1224,6 +1228,7 @@ private boolean doMaintain(final long hostId) {
12241228

12251229
ActionEventUtils.onStartedActionEvent(CallContext.current().getCallingUserId(), CallContext.current().getCallingAccountId(), EventTypes.EVENT_MAINTENANCE_PREPARE, "starting maintenance for host " + hostId, true, 0);
12261230
_agentMgr.pullAgentToMaintenance(hostId);
1231+
setHostMaintenanceRetries(host);
12271232

12281233
/* TODO: move below to listener */
12291234
if (host.getType() == Host.Type.Routing) {
@@ -1251,6 +1256,16 @@ private boolean doMaintain(final long hostId) {
12511256
return true;
12521257
}
12531258

1259+
/**
1260+
* Set retries for transiting the host into Maintenance
1261+
*/
1262+
protected void setHostMaintenanceRetries(HostVO host) {
1263+
Integer retries = HostMaintenanceRetries.valueIn(host.getClusterId());
1264+
retryHostMaintenance.put(host.getId(), retries);
1265+
s_logger.debug(String.format("Setting the host %s (%s) retries for Maintenance mode: %s",
1266+
host.getId(), host.getName(), retries));
1267+
}
1268+
12541269
@Override
12551270
public boolean maintain(final long hostId) throws AgentUnavailableException {
12561271
final Boolean result = propagateResourceEvent(hostId, ResourceState.Event.AdminAskMaintenace);
@@ -1350,7 +1365,23 @@ protected boolean isHostInMaintenance(HostVO host, List<VMInstanceVO> runningVms
13501365
return CollectionUtils.isEmpty(failedMigrations) ?
13511366
setHostIntoMaintenance(host) :
13521367
setHostIntoErrorInMaintenance(host, failedMigrations);
1368+
} else if (retryHostMaintenance.containsKey(host.getId())) {
1369+
Integer retriesLeft = retryHostMaintenance.get(host.getId());
1370+
if (retriesLeft != null) {
1371+
if (retriesLeft <= 0) {
1372+
retryHostMaintenance.remove(host.getId());
1373+
s_logger.debug(String.format("No retries left while preparing KVM host %s (%s) for Maintenance, " +
1374+
"please investigate this connection.",
1375+
host.getId(), host.getName()));
1376+
return setHostIntoErrorInMaintenance(host, failedMigrations);
1377+
}
1378+
retriesLeft--;
1379+
retryHostMaintenance.put(host.getId(), retriesLeft);
1380+
s_logger.debug(String.format("Retries left preparing KVM host %s (%s) for Maintenance: %s",
1381+
host.getId(), host.getName(), retriesLeft));
1382+
}
13531383
}
1384+
13541385
return false;
13551386
}
13561387

@@ -2316,6 +2347,7 @@ private boolean doCancelMaintenance(final long hostId) {
23162347
try {
23172348
resourceStateTransitTo(host, ResourceState.Event.AdminCancelMaintenance, _nodeId);
23182349
_agentMgr.pullAgentOutMaintenance(hostId);
2350+
retryHostMaintenance.remove(hostId);
23192351

23202352
// for kvm, need to log into kvm host, restart cloudstack-agent
23212353
if ((host.getHypervisorType() == HypervisorType.KVM && !vms_migrating) || host.getHypervisorType() == HypervisorType.LXC) {
@@ -2924,4 +2956,13 @@ public boolean start() {
29242956
return super.start();
29252957
}
29262958

2959+
@Override
2960+
public String getConfigComponentName() {
2961+
return ResourceManagerImpl.class.getSimpleName();
2962+
}
2963+
2964+
@Override
2965+
public ConfigKey<?>[] getConfigKeys() {
2966+
return new ConfigKey<?>[] {HostMaintenanceRetries};
2967+
}
29272968
}

server/test/com/cloud/resource/MockResourceManagerImpl.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@
5656
import com.cloud.resource.ResourceState.Event;
5757
import com.cloud.utils.component.ManagerBase;
5858
import com.cloud.utils.fsm.NoTransitionException;
59+
import org.apache.cloudstack.framework.config.ConfigKey;
5960

6061
public class MockResourceManagerImpl extends ManagerBase implements ResourceManager {
6162

@@ -625,4 +626,14 @@ public boolean isHostGpuEnabled(final long hostId) {
625626
// TODO Auto-generated method stub
626627
return false;
627628
}
629+
630+
@Override
631+
public String getConfigComponentName() {
632+
return null;
633+
}
634+
635+
@Override
636+
public ConfigKey<?>[] getConfigKeys() {
637+
return new ConfigKey[0];
638+
}
628639
}

server/test/com/cloud/resource/ResourceManagerImplTest.java

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,7 @@ public void setup() throws Exception {
118118
when(host.getId()).thenReturn(hostId);
119119
when(host.getResourceState()).thenReturn(ResourceState.Enabled);
120120
when(host.getHypervisorType()).thenReturn(Hypervisor.HypervisorType.VMware);
121+
when(host.getClusterId()).thenReturn(1L);
121122
when(hostDao.findById(hostId)).thenReturn(host);
122123
when(vm1.getId()).thenReturn(vm1Id);
123124
when(vm2.getId()).thenReturn(vm2Id);
@@ -188,4 +189,21 @@ public void testConfigureVncAccessForKVMHostFailedMigrations() {
188189
verify(userVmDetailsDao).addDetail(eq(vm2Id), eq("kvm.vnc.port"), eq(String.valueOf(vm2VncPort)), anyBoolean());
189190
verify(agentManager).pullAgentToMaintenance(hostId);
190191
}
192+
193+
@Test
194+
public void testCheckAndMaintainErrorInMaintenanceRetries() throws NoTransitionException {
195+
resourceManager.setHostMaintenanceRetries(host);
196+
197+
List<VMInstanceVO> failedMigrations = Arrays.asList(vm1, vm2);
198+
when(vmInstanceDao.listByHostId(host.getId())).thenReturn(failedMigrations);
199+
when(vmInstanceDao.listNonMigratingVmsByHostEqualsLastHost(host.getId())).thenReturn(failedMigrations);
200+
201+
Integer retries = ResourceManager.HostMaintenanceRetries.valueIn(host.getClusterId());
202+
for (int i = 0; i <= retries; i++) {
203+
resourceManager.checkAndMaintain(host.getId());
204+
}
205+
206+
verify(resourceManager, times(retries + 1)).isHostInMaintenance(host, failedMigrations, new ArrayList<>(), failedMigrations);
207+
verify(resourceManager).setHostIntoErrorInMaintenance(host, failedMigrations);
208+
}
191209
}

0 commit comments

Comments
 (0)