Symptoms:
This KB is valid for
Operation Overview
1. Preparation
# Switch to the Management Cluster context
kubectl config use-context ${mc_context_name}
# Set the target cluster name
kubectl get cluster -A
cluster=w2-cc
namespace=default
# Check the cluster type
kubectl -n ${namespace} get cluster ${cluster} -ojson | jq .spec.topology.class
#> "tkg-vsphere-default-v1.x.x" # --> ClusterClass, use Step 3-A later
#> null # --> Legacy Cluster, use Step 3-B later
2. Update Deployment - AutoScaler pod
# Check - Autoscaler version is v1.26 or higher
kubectl -n ${namespace} get deployment ${cluster}-cluster-autoscaler -oyaml | grep image:
#> image: projects.registry.vmware.com/tkg/cluster-autoscaler:v1.28.0_vmware.1
# Check the current parameter
kubectl -n ${namespace} get deployment ${cluster}-cluster-autoscaler -oyaml | grep -A15 args:
#> - args:
#> ....
#> - --scale-down-delay-after-add=10m
#> - --scale-down-delay-after-delete=10s
#> - --scale-down-delay-after-failure=3m
#> - --scale-down-unneeded-time=10m
#> - --max-node-provision-time=15m
#> - --max-nodes-total=0
# Backup
kubectl -n ${namespace} get deployment ${cluster}-cluster-autoscaler -oyaml > ${cluster}-cluster-autoscaler-$(date +%Y-%m%d-%H%M).yaml
# Add "--enforce-node-group-min-size" flag
kubectl -n ${namespace} edit deployment ${cluster}-cluster-autoscaler
#> - args:
#> ....
#> - --max-nodes-total=0
#> - --enforce-node-group-min-size=true # <----------- NEW
# Autoscaler pod will be recreated automatically
kubectl -n ${namespace} get pods | grep -E 'autoscaler|NAME'
#> NAME READY STATUS RESTARTS AGE
#> w2-cc-cluster-autoscaler-f4fb9fc96-tn8wc 1/1 Running 0 12s
# Check the new parameter
kubectl -n ${namespace} get deployment ${cluster}-cluster-autoscaler -oyaml | grep -A15 args:
3-A. (ClusterClass) - Review cluster-api-autoscaler-node-group-min-size
# Check target node-pool
tanzu cluster node-pool list $cluster
#> NAME NAMESPACE PHASE REPLICAS READY UPDATED UNAVAILABLE
#> md-0 default Running 1 1 1 0
# Check current autoscaler configuration
kubectl -n ${namespace} get cluster ${cluster} -ojsonpath="{.spec.topology.workers.machineDeployments}" | jq '.[]'
#> {
#> "class": "tkg-worker",
#> "metadata": {
#> "annotations": {
#> "cluster.x-k8s.io/cluster-api-autoscaler-node-group-max-size": "4",
#> "cluster.x-k8s.io/cluster-api-autoscaler-node-group-min-size": "3",
#> "run.tanzu.vmware.com/resolve-os-image": "image-type=ova,os-name=photon"
#> }
#> },
#> "name": "md-0",
#> "replicas": 1
#> }
# Backup
kubectl -n ${namespace} get cluster ${cluster} -oyaml > ${cluster}-$(date +%Y-%m%d-%H%M).yaml
# Edit the Cluster object (ClusterClass-based cluster)
# - Review the annotations: "cluster-api-autoscaler-node-group-max-size" must be greater than "cluster-api-autoscaler-node-group-min-size"
# - Delete "replicas: X"
kubectl -n ${namespace} edit cluster ${cluster}
#> machineDeployments:
#> - class: tkg-worker
#> metadata:
#> annotations:
#> cluster.x-k8s.io/cluster-api-autoscaler-node-group-max-size: "4"
#> cluster.x-k8s.io/cluster-api-autoscaler-node-group-min-size: "3" # CHECK
#> run.tanzu.vmware.com/resolve-os-image: image-type=ova,os-name=photon
#> name: md-0
#> replicas: 1 # <--------------- DELETE
# Check the current autoscaler configuration
kubectl -n ${namespace} get cluster ${cluster} -ojsonpath="{.spec.topology.workers.machineDeployments}" | jq '.[]'
#> {
#> "class": "tkg-worker",
#> "metadata": {
#> "annotations": {
#> "cluster.x-k8s.io/cluster-api-autoscaler-node-group-max-size": "4",
#> "cluster.x-k8s.io/cluster-api-autoscaler-node-group-min-size": "3",
#> "run.tanzu.vmware.com/resolve-os-image": "image-type=ova,os-name=photon"
#> }
#> },
#> "name": "md-0"
#> }
3-B. (Legacy Cluster) - Review cluster-api-autoscaler-node-group-min-size
# Set machineDeployment name
tanzu cluster node-pool list $cluster
nodepool=md-0
md=${cluster}-${nodepool}
# Check the current autoscaler configuration
kubectl -n ${namespace} get md ${md} -oyaml | grep "cluster-api-autoscaler" | grep size:
#> cluster.x-k8s.io/cluster-api-autoscaler-node-group-max-size: "4"
#> cluster.x-k8s.io/cluster-api-autoscaler-node-group-min-size: "3"
# Optional: Update "cluster-api-autoscaler-node-group-min-size" to meet your requirements
# Backup
kubectl -n ${namespace} get md ${md} -oyaml > ${md}-$(date +%Y-%m%d-%H%M).yaml
# Review "cluster-api-autoscaler-node-group-min-size"
kubectl -n ${namespace} edit md ${md}
# Check the new autoscaler configuration
kubectl -n ${namespace} get md ${md} -oyaml | grep "cluster-api-autoscaler" | grep size:
4. Check
Autoscaler starts to increase the number of worker nodes (1 --> 3)
# Check that the target cluster STATUS changes from "updating" to "running"
tanzu cluster list
# Check - The PHASE of all machines should be "Running"
kubectl get ma -A
Document Links
Troubleshooting
Check Autoscaler pod log.
kubectl -n ${namespace} logs ${cluster}-cluster-autoscaler-xxxxx
Monitor the machine's status to know the current behavior.
kubectl get ma -A -w
# If you find a node stuck in the "Deleting" phase, delete it forcibly
namespace=default
machine=w2-cc-md-0-npvz4-7n4l8-lmhms
# Action - Delete the stuck node forcibly
kubectl -n ${namespace} patch ma ${machine} -p '{"metadata": {"finalizers": null}}' --type=merge
If scale-out or scale-in event for the worker nodes is not triggered, check whether the cluster is paused.
# Check (no output --> paused: false)
kubectl -n ${namespace} get cluster ${cluster} -ojsonpath='{.spec.paused}' | jq .
# Action: Unpause - The cluster must be unpaused for node scaling to work
kubectl -n ${namespace} patch cluster ${cluster} --type merge -p '{"spec":{"paused": false}}'
# Action: Pause - If the cluster is updated frequently, consider pausing it temporarily
kubectl -n ${namespace} patch cluster ${cluster} --type merge -p '{"spec":{"paused": true}}'