Symptoms:
- The handover of upgrade control that is required to upgrade the final node in the Manager cluster fails.
- The upgrade is stuck in an incomplete state.
- GET /upgrade-coordinator/api/v1/upgrade/status-summary reports the following status for "component_type": "MP", including "node_count_at_target_version": 2:
...
{
"can_rollback": true,
"can_skip": false,
"component_type": "MP",
"current_version_node_summary": {
"results": [
{
"component_version": "Done",
"node_count": 2,
"type": "MP",
"upgrade_unit_subtype": "ACTION"
},
{
"component_version": "Pending",
"node_count": 30,
"type": "MP",
"upgrade_unit_subtype": "ACTION"
},
{
"component_version": "3.2.2.0.0.20737190",
"node_count": 1,
"type": "MP",
"upgrade_unit_subtype": "RESOURCE"
}
]
},
"details": "",
"node_count_at_target_version": 2,
"percent_complete": 11,
"pre_upgrade_status": {
"end_time": 1686849901485,
"error_count": 0,
"failure_count": 1,
"start_time": 1686849823612,
"status": "COMPLETED",
"warning_count": 1
},
"status": "FAILED",
"target_component_version": "4.1.0.0.0.21332677"
}
],
"overall_upgrade_status": "PAUSED"
}
- GET /upgrade-coordinator/api/v1/upgrade/upgrade-units/aggregate-info contains the following errors for the "Node OS Upgrade" group (id MPNodesGroup):
"errors": [
"Unexpected error while upgrading upgrade unit: Failed to handover upgrade control to another node. Please ensure that the other nodes are functioning, and retry the upgrade."
],
"group": {
"display_name": "Node OS Upgrade",
"id": "MPNodesGroup"
},
- In /var/log/upgrade-coordinator/upgrade-coordinator.log, you see the following errors reported:
2023-06-15T18:10:30.994Z INFO task-executor-0-workitem-MP-########-####-####-####-##########4b NsxTrustManager 6867 SYSTEM [nsx@6876 comp="nsx-manager" level="INFO" subcomp="upgrade-coordinator"] checkServerTrusted: CN=<customer-domain.com>,OU=<internal-org>,O=<org-name>,C=<Country> for authType=ECDHE_RSA failed: PKIX path building failed: java.security.cert.CertPathBuilderException: Unable to find certificate chain.
2023-06-15T18:10:30.994Z WARN task-executor-0-workitem-MP-########-####-####-####-##########4b HandoverUpgradeUtils 6867 SYSTEM [nsx@6876 comp="nsx-manager" level="WARNING" subcomp="upgrade-coordinator"] UC is not responding
org.springframework.web.client.ResourceAccessException: I/O error on GET request for "https://10.252.192.135:443/api/v1/upgrade/plugin/uc-startup-status": PKIX path building failed: java.security.cert.CertPathBuilderException: Unable to find certificate chain.; nested exception is javax.net.ssl.SSLHandshakeException: PKIX path building failed: java.security.cert.CertPathBuilderException: Unable to find certificate chain.
at org.springframework.web.client.RestTemplate.doExecute(RestTemplate.java:785) ~[spring-web-5.3.20.jar:5.3.20]
at org.springframework.web.client.RestTemplate.execute(RestTemplate.java:711) ~[spring-web-5.3.20.jar:5.3.20]
at org.springframework.web.client.RestTemplate.exchange(RestTemplate.java:602) ~[spring-web-5.3.20.jar:5.3.20]
at com.vmware.nsx.management.common.rest.RestRequestImpl.createEntityAndExchange(RestRequestImpl.java:54) ~[librest-util.jar:?]
at com.vmware.nsx.management.common.rest.RestRequestImpl.doGet(RestRequestImpl.java:73) ~[librest-util.jar:?]
at com.vmware.nsx.management.upgrade.rpcframework.UcRestClient.sendGetRequest(UcRestClient.java:102) ~[libuc-core.jar:?]
at com.vmware.nsx.management.upgrade.utils.HandoverUpgradeUtils.invokeHandover(HandoverUpgradeUtils.java:95) ~[libnsx-upgrade-plugins.jar:?]
at com.vmware.nsx.management.upgrade.utils.HandoverUpgradeUtils.performHandover(HandoverUpgradeUtils.java:72) ~[libnsx-upgrade-plugins.jar:?]
at com.vmware.nsx.management.upgrade.plugin.mp.service.MPRollingUpgradeServiceImpl.upgradeMPNode(MPRollingUpgradeServiceImpl.java:744) ~[libnsx-upgrade-plugins.jar:?]
at com.vmware.nsx.management.upgrade.plugin.mp.service.MPRollingUpgradeServiceImpl.upgrade(MPRollingUpgradeServiceImpl.java:246) ~[libnsx-upgrade-plugins.jar:?]
at com.vmware.nsx.management.upgrade.plugin.mp.MpUpgradePlugin.upgrade(MpUpgradePlugin.java:275) ~[libnsx-upgrade-plugins.jar:?]
at com.vmware.nsx.management.upgrade.executionengine.SingleWorkItem.executeWorkItem(SingleWorkItem.java:115) ~[libuc-core.jar:?]
at com.vmware.nsx.management.upgrade.executionengine.SingleWorkItem.run(SingleWorkItem.java:90) ~[libuc-core.jar:?]
at com.vmware.nsx.management.common.executor.TaskExecutorImpl$TaskWrapper$1.run(TaskExecutorImpl.java:238) ~[libmp_common.jar:?]
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) ~[?:1.8.0_342]
at java.util.concurrent.FutureTask.run(FutureTask.java:266) ~[?:1.8.0_342]
at com.vmware.nsx.management.common.executor.TaskExecutorImpl$TaskWrapper.run(TaskExecutorImpl.java:271) ~[libmp_common.jar:?]
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) ~[?:1.8.0_342]
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) ~[?:1.8.0_342]
at java.lang.Thread.run(Thread.java:750) ~[?:1.8.0_342]
2023-06-15T18:10:30.995Z ERROR task-executor-0-workitem-MP-########-####-####-####-##########4b WorkItem 6867 SYSTEM [nsx@6876 comp="nsx-manager" errorCode="MP30062" level="ERROR" subcomp="upgrade-coordinator"] Error encountered while upgrading upgrade unit <Hostname/FQDN>
com.vmware.nsx.management.upgrade.plugin.mp.exceptions.MPUpgradeException: null
at com.vmware.nsx.management.upgrade.utils.HandoverUpgradeUtils.performHandover(HandoverUpgradeUtils.java:79) ~[libnsx-upgrade-plugins.jar:?]
at com.vmware.nsx.management.upgrade.plugin.mp.service.MPRollingUpgradeServiceImpl.upgradeMPNode(MPRollingUpgradeServiceImpl.java:744) ~[libnsx-upgrade-plugins.jar:?]
at com.vmware.nsx.management.upgrade.plugin.mp.service.MPRollingUpgradeServiceImpl.upgrade(MPRollingUpgradeServiceImpl.java:246) ~[libnsx-upgrade-plugins.jar:?]
at com.vmware.nsx.management.upgrade.plugin.mp.MpUpgradePlugin.upgrade(MpUpgradePlugin.java:275) ~[libnsx-upgrade-plugins.jar:?]
at com.vmware.nsx.management.upgrade.executionengine.SingleWorkItem.executeWorkItem(SingleWorkItem.java:115) ~[libuc-core.jar:?]
at com.vmware.nsx.management.upgrade.executionengine.SingleWorkItem.run(SingleWorkItem.java:90) ~[libuc-core.jar:?]
at com.vmware.nsx.management.common.executor.TaskExecutorImpl$TaskWrapper$1.run(TaskExecutorImpl.java:238) ~[libmp_common.jar:?]
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) ~[?:1.8.0_342]
at java.util.concurrent.FutureTask.run(FutureTask.java:266) ~[?:1.8.0_342]
at com.vmware.nsx.management.common.executor.TaskExecutorImpl$TaskWrapper.run(TaskExecutorImpl.java:271) ~[libmp_common.jar:?]
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) ~[?:1.8.0_342]
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) ~[?:1.8.0_342]
at java.lang.Thread.run(Thread.java:750) ~[?:1.8.0_342]
2023-06-15T18:10:31.011Z INFO task-executor-0-workitem-MP-########-####-####-####-##########4b ExecutionMonitorServiceImpl 6867 SYSTEM [nsx@6876 comp="nsx-manager" level="INFO" subcomp="upgrade-coordinator"] Execution monitor service invoked to react to failure of node ########-####-####-####-##########4b [Failed to handover upgrade control to another node. Please ensure that the other nodes are functioning, and retry the upgrade.]