TKGm 2.5.X
The etcd cluster has lost quorum, leaving only a single healthy control-plane node, and requires a state reset and rebuild.
Back up and forcefully reset the degraded etcd cluster to a single-node configuration using the last healthy control-plane node, thereby restoring etcd quorum and API server functionality.
Subsequently, resuming Cluster API reconciliation triggers the automated provisioning and bootstrapping of replacement nodes, restoring high availability of the etcd cluster. Follow the steps below.
kubectl config get-contexts
kubectl config use-context <mgmt-cluster-context>
kubectl get nodes
kubectl get nodes -o wide
kubectl get pods -n kube-system | grep etcd
Confirm the etcd cluster is unhealthy:
kubectl exec -n kube-system <etcd-pod> -- etcdctl endpoint health --endpoints=https://127.0.0.1:2379 --cacert=/etc/kubernetes/pki/etcd/ca.crt --cert=/etc/kubernetes/pki/etcd/server.crt --key=/etc/kubernetes/pki/etcd/server.key
If the above command succeeds (the endpoint is reported as healthy), DO NOT proceed.
kubectl patch cluster <cluster-name> -n <namespace> \
  --type merge -p '{"spec":{"paused":true}}'
kubectl get cluster <cluster-name> -n <namespace> -o yaml | grep paused
mkdir -p /root/etcd-backup
cp -r /etc/kubernetes/manifests /root/etcd-backup/kubernetes-manifests
cp -r /var/lib/etcd /root/etcd-backup/etcd-data
ls -l /root/etcd-backup
du -sh /root/etcd-backup/*
scp -r /root/etcd-backup user@<remote-host>:/backup/
kubectl exec -it -n kube-system <etcd-pod> -- sh
export ETCDCTL_API=3
etcdctl --endpoints=https://127.0.0.1:2379 --cacert=/etc/kubernetes/pki/etcd/ca.crt --cert=/etc/kubernetes/pki/etcd/server.crt --key=/etc/kubernetes/pki/etcd/server.key snapshot save /tmp/etcd-snapshot.db
etcdctl snapshot status /tmp/etcd-snapshot.db -w table
exit
kubectl cp kube-system/<etcd-pod>:/tmp/etcd-snapshot.db ./etcd-snapshot.db
ls -lh etcd-snapshot.db
systemctl stop kubelet
crictl ps | grep -E "etcd|apiserver"
crictl stop $(crictl ps -q --name kube-apiserver) 2>/dev/null
crictl stop $(crictl ps -q --name etcd) 2>/dev/null
rm -rf /var/lib/etcd
mkdir -p /var/lib/etcd
ls -l /etc/kubernetes/pki/etcd/
kubeadm init phase certs all --config /etc/kubernetes/kubeadm-config.yaml
vi /etc/kubernetes/manifests/etcd.yaml
--name=<node-name>
--initial-cluster=<node-name>=https://<node-ip>:2380
--initial-cluster-state=new
--initial-advertise-peer-urls=https://<node-ip>:2380
--listen-peer-urls=https://<node-ip>:2380
systemctl start kubelet
systemctl status kubelet
crictl ps | grep etcd
crictl logs $(crictl ps -q --name etcd)
export ETCDCTL_API=3
etcdctl member list --endpoints=https://127.0.0.1:2379 --cacert=/etc/kubernetes/pki/etcd/ca.crt --cert=/etc/kubernetes/pki/etcd/server.crt --key=/etc/kubernetes/pki/etcd/server.key
etcdctl endpoint health
export KUBECONFIG=/etc/kubernetes/admin.conf
kubectl get nodes
kubectl get pods -n kube-system | grep vip
kubectl get validatingwebhookconfiguration
kubectl get mutatingwebhookconfiguration
kubectl delete validatingwebhookconfiguration <webhook-name>
kubectl delete mutatingwebhookconfiguration <webhook-name>
kubectl delete validatingwebhookconfiguration --all
kubectl delete mutatingwebhookconfiguration --all
kubectl get machine -n <namespace> -w
kubectl get nodes -w
kubectl delete machine <name> -n <namespace> --force --grace-period=0
kubectl get pods -n kube-system | grep etcd
kubectl exec -n kube-system <etcd-pod> -- etcdctl member list --endpoints=https://127.0.0.1:2379 --cacert=/etc/kubernetes/pki/etcd/ca.crt --cert=/etc/kubernetes/pki/etcd/server.crt --key=/etc/kubernetes/pki/etcd/server.key
kubectl get nodes
kubectl get pods -A
systemctl stop kubelet
rm -rf /var/lib/etcd
cp -r /root/etcd-backup/etcd-data /var/lib/etcd
cp -r /root/etcd-backup/kubernetes-manifests/* /etc/kubernetes/manifests/
systemctl start kubelet
Note:
etcdctl member add