The customer may observe several "Memory Usage Warning" events generated by the VMware SD-WAN Edge, followed by "Edge down", "HA Failover Identified", "High Availability Going Active", and eventually "High Availability Ready" (if HA is enabled on the site).
All supported VMware VeloCloud SD-WAN Edge versions
This happens because SD-WAN Edge memory usage exceeded defined thresholds and triggered a defensive restart, causing an HA switchover. If the above pattern exhibits a clear periodicity, consider whether a memory leak is occurring. The customer can also identify a memory leak from the Memory Utilization graph in the SD-WAN Orchestrator: the general trend of the curve is monotonically increasing, while the Flow Count shows no significant change over days.
In most cases, the output of the "vc_mem_mon.sh" command is a very helpful input for identifying which module is leaking memory.
edge:VCE610:~# vc_mem_mon.sh
Top 10 Memory Hoggers
---------------------
14136720 memb.mod_vc_ht_t.tot_bytes
10797036 memb.mod_misc_t.tot_bytes
6171264 memb.mod_nat_ht_t.tot_bytes
6073984 memb.mod_rt_cache_ht_t.tot_bytes
3085632 memb.mod_vc_uflow_ht_t.tot_bytes
3085632 memb.mod_fc_ht_t.tot_bytes
2946944 memb.mod_app_data_t.tot_bytes
1822144 memb.mod_flow_tracker_t.tot_bytes
1802560 memb.mod_misc_eventdisp_t.tot_bytes
1638440 memb.mod_vc_mutex_t.tot_bytes
Top 10 Objects by volume
------------------------
7884 memb.mod_port_range_t.obj_cnt
3076 memb.mod_misc_t.obj_cnt
2585 memb.mod_rxt_root_t.obj_cnt
2585 memb.mod_rxt_ops_t.obj_cnt
2585 memb.mod_radix_node_head_t.obj_cnt
2585 memb.mod_radix_mask_head_t.obj_cnt
1815 memb.mod_json_t.obj_cnt
1325 memb.mod_appmap_ip_port_entry_t.obj_cnt
693 memb.mod_misc_dhcp_t.obj_cnt
693 memb.mod_dhcp_os_desc_t.obj_cnt
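Because a leak shows up as a counter that keeps growing between captures, comparing two outputs taken hours apart can quickly narrow down the leaking module. Below is a minimal helper sketch (hypothetical; compare_mem_captures.py is not part of the Edge image) that parses two saved vc_mem_mon.sh captures and prints the counters that increased, largest delta first:

#!/usr/bin/python3
# compare_mem_captures.py - hypothetical helper, not shipped on the Edge.
# Usage: python3 compare_mem_captures.py <older_capture> <newer_capture>
import sys

def parse(path):
    counters = {}
    with open(path) as f:
        for line in f:
            parts = line.split()
            # Data lines look like: "14136720 memb.mod_vc_ht_t.tot_bytes"
            if len(parts) == 2 and parts[0].isdigit():
                counters[parts[1]] = int(parts[0])
    return counters

old, new = parse(sys.argv[1]), parse(sys.argv[2])
growth = {k: new[k] - old.get(k, 0) for k in new if new[k] > old.get(k, 0)}
for name, delta in sorted(growth.items(), key=lambda kv: kv[1], reverse=True):
    print("%12d  %s" % (delta, name))

A tot_bytes counter that climbs steadily across captures while the flow count stays flat is the usual suspect.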
However, this output is not included in the diagnostic bundle, and it is most meaningful when captured while memory usage is near its peak. Hence, a Python script is useful to record and save the output to a file periodically. Consider the script below:
#!/usr/bin/python3
import subprocess
import threading
import time
import datetime
import shlex
import logging
import os
import signal
from logging.handlers import RotatingFileHandler
from subprocess import Popen, PIPE
from threading import Timer

# Ignore SIGTTOU so the script keeps running when backgrounded with nohup.
signal.signal(signal.SIGTTOU, signal.SIG_IGN)

PERFORMANCE_MONITOR_DIR = "/velocloud/debug/memory_leak_check/"
# 3600 files at one capture every 6 hours is roughly 900 days of history.
MAX_DEBUG_FILES = 3600

# Hand-rolled command wrapper; the Python 2.7 'commands' module it
# replaced had no timeout support.
class Command(object):
    def __init__(self, cmd):
        self.cmd = cmd
        self.process = None
        self.out = None

    def run_parallel(self, capture_file=None, timeout_sec=180):
        # Run the command with stdout redirected to capture_file and
        # kill it if it exceeds timeout_sec.
        redirect_file = None
        if capture_file is not None:
            redirect_file = open(capture_file, "wb")
            logger.info("capture file is %s" % (capture_file))
        proc = Popen(shlex.split(self.cmd), stdout=redirect_file)
        timer = Timer(timeout_sec, proc.kill)
        try:
            timer.start()
            stdout, stderr = proc.communicate()
        finally:
            logger.info("Debug recorded at %s" % (capture_file))
            timer.cancel()

    def run_command(self, capture=False):
        if not capture:
            self.process = subprocess.Popen(self.cmd, shell=True)
            self.process.communicate()
            return
        # Capture the output of the shell command.
        self.process = subprocess.Popen(self.cmd, stdout=subprocess.PIPE,
                                        stderr=subprocess.PIPE, stdin=subprocess.PIPE)
        out, err = self.process.communicate()
        if len(out) > 0:
            self.out = out.splitlines()
        else:
            self.out = None

    # Default timeout is 2 minutes.
    def run(self, capture_file=None, timeout=120):
        #thread = threading.Thread(target=self.run_parallel, args=(capture_file, timeout,))
        thread = threading.Thread(target=self.run_command, args=())
        thread.start()

def get_debug_info():
    # Save one vc_mem_mon.sh snapshot to a timestamped file.
    defect_time = datetime.datetime.now().strftime("%d_%m_%Y_%H_%M_%S")
    capture_file_name = PERFORMANCE_MONITOR_DIR + "mem_%s" % (defect_time)
    cmd = "vc_mem_mon.sh > %s" % (capture_file_name)
    logger.info("Capture %s" % (cmd))
    Command(cmd).run(timeout=60, capture_file=capture_file_name)

def cleanup_top_monitor_files(file_list):
    for file in file_list:
        if not file:
            continue
        os.remove(PERFORMANCE_MONITOR_DIR + file)

def main():
    global logger
    logger = logging.getLogger('my_logger')
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter(fmt='%(asctime)s %(levelname)-8s %(message)s',
                                  datefmt='%Y-%m-%d %H:%M:%S')
    handler = RotatingFileHandler('/var/log/memory_leak_check.log',
                                  maxBytes=8000, backupCount=7)
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    if not os.path.exists(PERFORMANCE_MONITOR_DIR):
        os.mkdir(PERFORMANCE_MONITOR_DIR)
    # Change get_cntr_cmd to the required command if capturing on a
    # different trigger (currently unused; see the commented block below).
    get_cntr_cmd = 'debug.py -v --link | grep GE4 -B 42 | grep Loss'
    last_time = datetime.datetime.now()
    while True:
        # Optional packet-loss trigger from an earlier variant of the script:
        #packet_loss = get_counters_value(get_cntr_cmd)
        #if not packet_loss:
        #    logger.info("no packet loss")
        #else:
        get_debug_info()
        time.sleep(21600)  # 6 hours between captures
        cur_time = datetime.datetime.now()
        if cur_time - last_time > datetime.timedelta(minutes=59):
            # Keep roughly the newest MAX_DEBUG_FILES captures; list the rest.
            cmd = "ls -t %s | tail -n +%d" % (PERFORMANCE_MONITOR_DIR, MAX_DEBUG_FILES)
            ret_out = subprocess.run(cmd, shell=True, check=True,
                                     stdout=subprocess.PIPE).stdout.decode('utf-8').strip()
            last_time = cur_time
            logger.info("Checking for files to be cleared")
            cleanup_top_monitor_files(ret_out.split("\n"))

if __name__ == "__main__":
    main()
Run the script in the background:
nohup python3 mem_leak_monitor_py3.9.py &
The script first creates a new folder, /velocloud/debug/memory_leak_check/. Then, every 6 hours, it saves the output of the vc_mem_mon.sh command into a separate file in that folder, named with the capture timestamp (for example, mem_05_11_2023_14_30_00). The memory leak monitor script can also be downloaded here:
https://ent.box.com/s/e3x16hla8fxiqkdqszkdt47mjqz8nj41
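To confirm the monitor is running and capturing data, check the process, the capture directory, and the log file (example commands; the paths are the ones defined in the script above):

ps aux | grep mem_leak_monitor
ls -lt /velocloud/debug/memory_leak_check/
tail /var/log/memory_leak_check.log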
Sometimes engineering may request an SD-WAN Edge core file when memory utilization is high. The customer can manually generate a core file with the command below:
kill -SIGSEGV `pidof edged`
This command kills the edged process and will cause an HA switchover (where applicable), so run it only during a maintenance window.