TKC isn't ready after restore in this specific scenario
root@420fa06d4deee5dc16c726f4dd1324aa [ ~ ]# k get tkc -n test-gc-e2e-demo-ns
NAMESPACE NAME CONTROL PLANE WORKER TKR NAME AGE READY TKR COMPATIBLE UPDATES AVAILABLE
test-gc-e2e-demo-ns nginx 3 3 v1.25.7---vmware.3-fips.1-tkg.1 145m False True [v1.26.5+vmware.2-fips.1-tkg.1]
root@420fa06d4deee5dc16c726f4dd1324aa [ ~ ]# k get tkc -n test-gc-e2e-demo-ns -o yaml
...
conditions:
- lastTransitionTime: "YYYY-MM-DDTHH:MM:SSZ"
message: cluster has fewer than 2 control plane nodes; removing an etcd member
is not supported
reason: RemediationFailed @ /nginx-cwpc6-lqbzv
severity: Error
status: "False"
type: Ready
A new API server is initialized on the primary TKC control plane node, and it generates a new token.
Other control plane nodes are still using the token restored from the backup, which is no longer valid.
To resolve the issue, please follow the steps mentioned below:
Log in to the Supervisor CPVM and check which TKC control plane nodes failed to join.
# check the control plane vms in tkc
root@420fa06d4deee5dc16c726f4dd1324aa [ ~ ]# k get vm -n test-gc-e2e-demo-ns -o wide | grep <tkc name>
nginx-cwpc6-dd7xc PoweredOn best-effort-xsmall vmi-4cd3401664489187e 192.168.128.11 137m <- this is the control plane node
nginx-cwpc6-j4cds PoweredOn best-effort-xsmall vmi-4cd3401664489187e 192.168.128.16 128m <- this is the control plane node
nginx-cwpc6-lqbzv PoweredOn best-effort-xsmall vmi-4cd3401664489187e 192.168.128.15 131m <- this is the control plane node
nginx-nodepool-1-vrg4g-57d94b6b99xwhc44-24ldw PoweredOn best-effort-xsmall vmi-4cd3401664489187e 192.168.128.20 13m
nginx-nodepool-1-vrg4g-57d94b6b99xwhc44-64fvp PoweredOn best-effort-xsmall vmi-4cd3401664489187e 192.168.128.21 11m
nginx-nodepool-1-vrg4g-57d94b6b99xwhc44-gfkzm PoweredOn best-effort-xsmall vmi-4cd3401664489187e 192.168.128.22 11m
# log in to the TKC and check which control plane VM is ready
root@420fa06d4deee5dc16c726f4dd1324aa [ ~ ]# kubectl -n <namespace> get secret <tkc name>-kubeconfig -o jsonpath='{.data.value}' | \
> base64 -d > tkc-config   # tkc-config is the file the kubeconfig is saved to
root@420fa06d4deee5dc16c726f4dd1324aa [ ~ ]# KUBECONFIG=tkc-config k get no -A
NAME STATUS ROLES AGE VERSION
nginx-cwpc6-dd7xc Ready control-plane 15m v1.25.7+vmware.3-fips.1 <- this is the ready control plane node
nginx-nodepool-1-vrg4g-57d94b6b99xwhc44-24ldw Ready <none> 7m26s v1.25.7+vmware.3-fips.1
nginx-nodepool-1-vrg4g-57d94b6b99xwhc44-64fvp Ready <none> 5m47s v1.25.7+vmware.3-fips.1
nginx-nodepool-1-vrg4g-57d94b6b99xwhc44-gfkzm Ready <none> 5m34s v1.25.7+vmware.3-fips.1
In the above example, nginx-cwpc6-j4cds and nginx-cwpc6-lqbzv are the control plane nodes that failed to join. Manually delete these machines from the TKC.
# manually delete the machines of the control plane nodes that failed to join
root@420fa06d4deee5dc16c726f4dd1324aa [ ~ ]# k delete machine nginx-cwpc6-lqbzv -n test-gc-e2e-demo-ns
machine.cluster.x-k8s.io "nginx-cwpc6-lqbzv" deleted
root@420fa06d4deee5dc16c726f4dd1324aa [ ~ ]# k delete machine nginx-cwpc6-j4cds -n test-gc-e2e-demo-ns
machine.cluster.x-k8s.io "nginx-cwpc6-j4cds" deleted
# wait for the TKC to reconcile; it should become ready after a few minutes
root@420fa06d4deee5dc16c726f4dd1324aa [ ~ ]# k get tkc -A
NAMESPACE NAME CONTROL PLANE WORKER TKR NAME AGE READY TKR COMPATIBLE UPDATES AVAILABLE
test-gc-e2e-demo-ns nginx 3 3 v1.25.7---vmware.3-fips.1-tkg.1 5h5m True True [v1.26.5+vmware.2-fips.1-tkg.1]
root@420fa06d4deee5dc16c726f4dd1324aa [ ~ ]# k get vm -n test-gc-e2e-demo-ns -o wide
NAME POWER-STATE CLASS IMAGE PRIMARY-IP4 AGE
nginx-cwpc6-dd7xc PoweredOn best-effort-xsmall vmi-4cd3401664489187e 192.168.128.11 5h5m
nginx-cwpc6-vcw5l PoweredOn best-effort-xsmall vmi-4cd3401664489187e 192.168.128.29 5m5s
nginx-cwpc6-vnw4v PoweredOn best-effort-xsmall vmi-4cd3401664489187e 192.168.128.28 8m16s
nginx-nodepool-1-vrg4g-57d94b6b99xwhc44-24ldw PoweredOn best-effort-xsmall vmi-4cd3401664489187e 192.168.128.20 3h1m
nginx-nodepool-1-vrg4g-57d94b6b99xwhc44-64fvp PoweredOn best-effort-xsmall vmi-4cd3401664489187e 192.168.128.21 179m
nginx-nodepool-1-vrg4g-57d94b6b99xwhc44-gfkzm PoweredOn best-effort-xsmall vmi-4cd3401664489187e 192.168.128.22 179m