
Commit 69e8ebc

CKS: retry if unable to drain node or unable to upgrade k8s node (#8402)
* CKS: retry if unable to drain node or unable to upgrade k8s node

I tried CKS upgrade 16 times; 11 of 16 upgrades succeeded.

2 of 16 upgrades failed due to
```
error: unable to drain node "testcluster-of7974-node-18c8c33c2c3" due to error:[error when evicting pods/"cloud-controller-manager-5b8fc87665-5nwlh" -n "kube-system": Post "https://10.0.66.18:6443/api/v1/namespaces/kube-system/pods/cloud-controller-manager-5b8fc87665-5nwlh/eviction": unexpected EOF, error when evicting pods/"coredns-5d78c9869d-h5nkz" -n "kube-system": Post "https://10.0.66.18:6443/api/v1/namespaces/kube-system/pods/coredns-5d78c9869d-h5nkz/eviction": unexpected EOF], continuing command...
```

3 of 16 upgrades failed due to
```
Error from server: error when retrieving current configuration of:
Resource: "rbac.authorization.k8s.io/v1, Resource=roles", GroupVersionKind: "rbac.authorization.k8s.io/v1, Kind=Role"
Name: "kubernetes-dashboard", Namespace: "kubernetes-dashboard"
from server for: "/mnt/k8sdisk//dashboard.yaml": etcdserver: leader changed
```

* CKS: remove tests of creating/deleting HA clusters, as they are covered by the upgrade test

* Update PR 8402 as suggested

* test: remove the CKS cluster if it fails to create or verify
1 parent b2e2993 commit 69e8ebc
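Before the per-file diffs, here is a minimal, self-contained sketch of the countdown retry pattern the commit message describes: keep reattempting the drain (or node upgrade) step while retries remain, and only fail the cluster upgrade once the final attempt has failed. The class name, the `runWithRetries` helper, and the toy operation are illustrative stand-ins, not the actual CloudStack code; the real loop is in KubernetesClusterUpgradeWorker.java below.

```java
import java.util.concurrent.Callable;

/**
 * Minimal sketch of the countdown retry loop this commit adds around
 * "kubectl drain" and the node install script. Names here are illustrative;
 * the real code lives in KubernetesClusterUpgradeWorker.upgradeKubernetesClusterNodes().
 */
public class RetrySketch {

    // "retries" counts additional attempts, mirroring the worker's
    // "for (int retry = retries; retry >= 0; retry--)" loop, so the
    // operation runs at most retries + 1 times in total.
    static void runWithRetries(Callable<Boolean> operation, int retries, String errorMessage) {
        for (int retry = retries; retry >= 0; retry--) {
            try {
                if (Boolean.TRUE.equals(operation.call())) {
                    return; // success: stop retrying
                }
                if (retry == 0) {
                    // last attempt failed: give up, as the worker does via logTransitStateDetachIsoAndThrow(...)
                    throw new RuntimeException(errorMessage);
                }
                System.err.printf("%s, retries left: %d%n", errorMessage, retry);
            } catch (RuntimeException e) {
                throw e;
            } catch (Exception e) {
                if (retry == 0) {
                    throw new RuntimeException(errorMessage, e);
                }
                System.err.printf("%s due to %s, retries left: %d%n", errorMessage, e, retry);
            }
        }
    }

    public static void main(String[] args) {
        // Toy operation that fails twice and then succeeds, like a drain that
        // recovers after transient "unexpected EOF" errors.
        int[] attempts = {0};
        runWithRetries(() -> ++attempts[0] >= 3, 3, "unable to drain node");
        System.out.println("succeeded on attempt " + attempts[0]);
    }
}
```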


4 files changed: +59, -57 lines


plugins/integrations/kubernetes-service/src/main/java/com/cloud/kubernetes/cluster/KubernetesClusterManagerImpl.java

Lines changed: 1 addition & 0 deletions
```diff
@@ -1647,6 +1647,7 @@ public ConfigKey<?>[] getConfigKeys() {
             KubernetesClusterStartTimeout,
             KubernetesClusterScaleTimeout,
             KubernetesClusterUpgradeTimeout,
+            KubernetesClusterUpgradeRetries,
             KubernetesClusterExperimentalFeaturesEnabled,
             KubernetesMaxClusterSize
         };
```

plugins/integrations/kubernetes-service/src/main/java/com/cloud/kubernetes/cluster/KubernetesClusterService.java

Lines changed: 6 additions & 0 deletions
```diff
@@ -65,6 +65,12 @@ public interface KubernetesClusterService extends PluggableService, Configurable
             "Timeout interval (in seconds) in which upgrade operation for a Kubernetes cluster should be completed. Not strictly obeyed while upgrade is in progress on a node",
             true,
             KubernetesServiceEnabled.key());
+    static final ConfigKey<Integer> KubernetesClusterUpgradeRetries = new ConfigKey<Integer>("Advanced", Integer.class,
+            "cloud.kubernetes.cluster.upgrade.retries",
+            "3",
+            "The number of retries if fail to upgrade kubernetes cluster due to some reasons (e.g. drain node, etcdserver leader changed)",
+            true,
+            KubernetesServiceEnabled.key());
     static final ConfigKey<Boolean> KubernetesClusterExperimentalFeaturesEnabled = new ConfigKey<Boolean>("Advanced", Boolean.class,
             "cloud.kubernetes.cluster.experimental.features.enabled",
             "false",
```

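In plain terms, the new global setting `cloud.kubernetes.cluster.upgrade.retries` (default "3") bounds how many extra attempts are made when a drain or node-upgrade step fails; the upgrade worker in the next file reads it as `KubernetesClusterService.KubernetesClusterUpgradeRetries.value()`, exactly as its diff shows.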
plugins/integrations/kubernetes-service/src/main/java/com/cloud/kubernetes/cluster/actionworkers/KubernetesClusterUpgradeWorker.java

Lines changed: 42 additions & 19 deletions
```diff
@@ -77,39 +77,62 @@ private Pair<Boolean, String> runInstallScriptOnVM(final UserVm vm, final int in
     }

     private void upgradeKubernetesClusterNodes() {
-        Pair<Boolean, String> result = null;
         for (int i = 0; i < clusterVMs.size(); ++i) {
             UserVm vm = clusterVMs.get(i);
             String hostName = vm.getHostName();
             if (StringUtils.isNotEmpty(hostName)) {
                 hostName = hostName.toLowerCase();
             }
-            result = null;
+            Pair<Boolean, String> result;
             if (LOGGER.isInfoEnabled()) {
                 LOGGER.info(String.format("Upgrading node on VM %s in Kubernetes cluster %s with Kubernetes version(%s) ID: %s",
                     vm.getDisplayName(), kubernetesCluster.getName(), upgradeVersion.getSemanticVersion(), upgradeVersion.getUuid()));
             }
-            try {
-                result = SshHelper.sshExecute(publicIpAddress, sshPort, getControlNodeLoginUser(), sshKeyFile, null,
-                        String.format("sudo /opt/bin/kubectl drain %s --ignore-daemonsets --delete-emptydir-data", hostName),
-                        10000, 10000, 60000);
-            } catch (Exception e) {
-                logTransitStateDetachIsoAndThrow(Level.ERROR, String.format("Failed to upgrade Kubernetes cluster : %s, unable to drain Kubernetes node on VM : %s", kubernetesCluster.getName(), vm.getDisplayName()), kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, e);
-            }
-            if (!result.first()) {
-                logTransitStateDetachIsoAndThrow(Level.ERROR, String.format("Failed to upgrade Kubernetes cluster : %s, unable to drain Kubernetes node on VM : %s", kubernetesCluster.getName(), vm.getDisplayName()), kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, null);
+            String errorMessage = String.format("Failed to upgrade Kubernetes cluster : %s, unable to drain Kubernetes node on VM : %s", kubernetesCluster.getName(), vm.getDisplayName());
+            for (int retry = KubernetesClusterService.KubernetesClusterUpgradeRetries.value(); retry >= 0; retry--) {
+                try {
+                    result = SshHelper.sshExecute(publicIpAddress, sshPort, getControlNodeLoginUser(), sshKeyFile, null,
+                            String.format("sudo /opt/bin/kubectl drain %s --ignore-daemonsets --delete-emptydir-data", hostName),
+                            10000, 10000, 60000);
+                    if (result.first()) {
+                        break;
+                    }
+                    if (retry > 0) {
+                        LOGGER.error(String.format("%s, retries left: %s", errorMessage, retry));
+                    } else {
+                        logTransitStateDetachIsoAndThrow(Level.ERROR, errorMessage, kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, null);
+                    }
+                } catch (Exception e) {
+                    if (retry > 0) {
+                        LOGGER.error(String.format("%s due to %s, retries left: %s", errorMessage, e, retry));
+                    } else {
+                        logTransitStateDetachIsoAndThrow(Level.ERROR, errorMessage, kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, e);
+                    }
+                }
             }
             if (System.currentTimeMillis() > upgradeTimeoutTime) {
                 logTransitStateDetachIsoAndThrow(Level.ERROR, String.format("Failed to upgrade Kubernetes cluster : %s, upgrade action timed out", kubernetesCluster.getName()), kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, null);
             }
-            try {
-                deployProvider();
-                result = runInstallScriptOnVM(vm, i);
-            } catch (Exception e) {
-                logTransitStateDetachIsoAndThrow(Level.ERROR, String.format("Failed to upgrade Kubernetes cluster : %s, unable to upgrade Kubernetes node on VM : %s", kubernetesCluster.getName(), vm.getDisplayName()), kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, e);
-            }
-            if (!result.first()) {
-                logTransitStateDetachIsoAndThrow(Level.ERROR, String.format("Failed to upgrade Kubernetes cluster : %s, unable to upgrade Kubernetes node on VM : %s", kubernetesCluster.getName(), vm.getDisplayName()), kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, null);
+            errorMessage = String.format("Failed to upgrade Kubernetes cluster : %s, unable to upgrade Kubernetes node on VM : %s", kubernetesCluster.getName(), vm.getDisplayName());
+            for (int retry = KubernetesClusterService.KubernetesClusterUpgradeRetries.value(); retry >= 0; retry--) {
+                try {
+                    deployProvider();
+                    result = runInstallScriptOnVM(vm, i);
+                    if (result.first()) {
+                        break;
+                    }
+                    if (retry > 0) {
+                        LOGGER.error(String.format("%s, retries left: %s", errorMessage, retry));
+                    } else {
+                        logTransitStateDetachIsoAndThrow(Level.ERROR, errorMessage, kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, null);
+                    }
+                } catch (Exception e) {
+                    if (retry > 0) {
+                        LOGGER.error(String.format("%s due to %s, retries left: %s", errorMessage, e, retry));
+                    } else {
+                        logTransitStateDetachIsoAndThrow(Level.ERROR, errorMessage, kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, e);
+                    }
+                }
             }
             if (System.currentTimeMillis() > upgradeTimeoutTime) {
                 logTransitStateDetachIsoAndThrow(Level.ERROR, String.format("Failed to upgrade Kubernetes cluster : %s, upgrade action timed out", kubernetesCluster.getName()), kubernetesCluster, clusterVMs, KubernetesCluster.Event.OperationFailed, null);
```

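In summary, the per-node flow after this change is: drain the node over SSH with `kubectl drain`, retried up to the configured number of extra attempts (each failed attempt logs "retries left: N", and only a failure on the last attempt calls logTransitStateDetachIsoAndThrow); check the upgrade timeout; run deployProvider() and runInstallScriptOnVM() under the same retry loop; then check the timeout again before moving on to the next node.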
test/integration/smoke/test_kubernetes_clusters.py

Lines changed: 10 additions & 38 deletions
```diff
@@ -278,13 +278,15 @@ def deleteKubernetesSupportedVersion(cls, version_id):
         cls.apiclient.deleteKubernetesSupportedVersion(deleteKubernetesSupportedVersionCmd)

     @classmethod
-    def listKubernetesCluster(cls, cluster_id = None):
+    def listKubernetesCluster(cls, cluster_id = None, cluster_name = None):
         listKubernetesClustersCmd = listKubernetesClusters.listKubernetesClustersCmd()
         listKubernetesClustersCmd.listall = True
         if cluster_id != None:
             listKubernetesClustersCmd.id = cluster_id
+        if cluster_name != None:
+            listKubernetesClustersCmd.name = cluster_name
         clusterResponse = cls.apiclient.listKubernetesClusters(listKubernetesClustersCmd)
-        if cluster_id != None and clusterResponse != None:
+        if (cluster_id != None or cluster_name != None) and clusterResponse != None:
             return clusterResponse[0]
         return clusterResponse

@@ -523,24 +525,6 @@ def test_06_delete_kubernetes_cluster(self):

         return

-    @attr(tags=["advanced", "smoke"], required_hardware="true")
-    @skipTestIf("hypervisorNotSupported")
-    def test_07_deploy_kubernetes_ha_cluster(self):
-        """Test to deploy a new HA Kubernetes cluster
-
-        # Validate the following:
-        # 1. createKubernetesCluster should return valid info for new cluster
-        # 2. The Cloud Database contains the valid information
-        """
-        if self.setup_failed == True:
-            self.fail("Setup incomplete")
-        if self.default_network:
-            self.skipTest("HA cluster on shared network requires external ip address, skipping it")
-        global k8s_cluster
-        k8s_cluster = self.getValidKubernetesCluster(1, 3)
-        self.debug("HA Kubernetes cluster with ID: %s successfully deployed" % k8s_cluster.id)
-        return
-
     @attr(tags=["advanced", "smoke"], required_hardware="true")
     @skipTestIf("hypervisorNotSupported")
     def test_08_upgrade_kubernetes_ha_cluster(self):
@@ -568,24 +552,6 @@ def test_08_upgrade_kubernetes_ha_cluster(self):
         self.debug("Kubernetes cluster with ID: %s successfully upgraded" % k8s_cluster.id)
         return

-    @attr(tags=["advanced", "smoke"], required_hardware="true")
-    @skipTestIf("hypervisorNotSupported")
-    def test_09_delete_kubernetes_ha_cluster(self):
-        """Test to delete a HA Kubernetes cluster
-
-        # Validate the following:
-        # 1. deleteKubernetesCluster should delete an existing HA Kubernetes cluster
-        """
-        if self.setup_failed == True:
-            self.fail("Setup incomplete")
-        if self.default_network:
-            self.skipTest("HA cluster on shared network requires external ip address, skipping it")
-        global k8s_cluster
-        k8s_cluster = self.getValidKubernetesCluster(1, 3)
-
-        self.debug("Deleting Kubernetes cluster with ID: %s" % k8s_cluster.id)
-        return
-
     @attr(tags=["advanced", "smoke"], required_hardware="true")
     @skipTestIf("hypervisorNotSupported")
     def test_10_vpc_tier_kubernetes_cluster(self):
@@ -739,8 +705,14 @@ def createNewKubernetesCluster(self, version, size, control_nodes) :
             cluster = self.createKubernetesCluster(name, version.id, size, control_nodes)
             self.verifyKubernetesCluster(cluster, name, version.id, size, control_nodes)
         except Exception as ex:
+            cluster = self.listKubernetesCluster(cluster_name = name)
+            if cluster != None:
+                self.deleteKubernetesClusterAndVerify(cluster.id, False, True)
             self.fail("Kubernetes cluster deployment failed: %s" % ex)
         except AssertionError as err:
+            cluster = self.listKubernetesCluster(cluster_name = name)
+            if cluster != None:
+                self.deleteKubernetesClusterAndVerify(cluster.id, False, True)
             self.fail("Kubernetes cluster deployment failed during cluster verification: %s" % err)
         return cluster
```
