VMware VeloCloud SD-WAN General troubleshooting guide for memory leak
search cancel

VMware VeloCloud SD-WAN General troubleshooting guide for memory leak

book

Article ID: 383584

calendar_today

Updated On:

Products

VMware VeloCloud SD-WAN

Issue/Introduction

Customers may observe several "Memory Usage Warning" events generated by the VMware SD-WAN Edge, followed by "Edge down", "HA Failover Identified", "High Availability Going Active", and eventually "High Availability Ready" events (if HA is enabled on the site).

Environment

All supported VMware VeloCloud SD-WAN edge versions

Cause

This is because the SD-WAN Edge memory usage exceeded defined thresholds and triggered a defensive restart, causing an HA switchover. If the above pattern exhibits a clear periodicity, it is necessary to consider whether a memory leak is occurring. Customers can also identify a memory leak from the Memory Utilization graph in the SD-WAN Orchestrator: the general trend of the curve is monotonically increasing, while the Flow Count shows no significant changes over days.

Resolution

In most cases, the output of the "vc_mem_mon.sh" command is a very helpful input for identifying which process is causing the memory leak.

edge:VCE610:~# vc_mem_mon.sh
Top 10 Memory Hoggers
---------------------
14136720     memb.mod_vc_ht_t.tot_bytes
10797036     memb.mod_misc_t.tot_bytes
6171264      memb.mod_nat_ht_t.tot_bytes
6073984      memb.mod_rt_cache_ht_t.tot_bytes
3085632      memb.mod_vc_uflow_ht_t.tot_bytes
3085632      memb.mod_fc_ht_t.tot_bytes
2946944      memb.mod_app_data_t.tot_bytes
1822144      memb.mod_flow_tracker_t.tot_bytes
1802560      memb.mod_misc_eventdisp_t.tot_bytes
1638440      memb.mod_vc_mutex_t.tot_bytes

Top 10 Objects by volume
------------------------
7884         memb.mod_port_range_t.obj_cnt
3076         memb.mod_misc_t.obj_cnt
2585         memb.mod_rxt_root_t.obj_cnt
2585         memb.mod_rxt_ops_t.obj_cnt
2585         memb.mod_radix_node_head_t.obj_cnt
2585         memb.mod_radix_mask_head_t.obj_cnt
1815         memb.mod_json_t.obj_cnt
1325         memb.mod_appmap_ip_port_entry_t.obj_cnt
693          memb.mod_misc_dhcp_t.obj_cnt
693          memb.mod_dhcp_os_desc_t.obj_cnt


However, this output is not included in the diagnostic bundle, and it only has troubleshooting significance when memory usage reaches its peak. Hence, a Python script is useful to record and save the output to a file periodically. Consider the Python script below:

#!/usr/bin/python

import subprocess
import threading
import time
import datetime
import shlex
import logging
import os
import signal
from logging.handlers import RotatingFileHandler

from subprocess import Popen, PIPE
from threading import Timer

# Ignore SIGTTOU so a backgrounded (nohup'd) run is not stopped if it
# writes to the controlling terminal — NOTE(review): presumably why this
# is here; confirm against the deployment shell settings.
signal.signal(signal.SIGTTOU, signal.SIG_IGN)

# Directory where periodic vc_mem_mon.sh snapshots are written.
PERFORMANCE_MONITOR_DIR = "/velocloud/debug/memory_leak_check/"
# Retention threshold: snapshot files beyond this count are pruned (oldest first).
MAX_DEBUG_FILES = 3600

# copied this as commands in python2.7 doesn't have support for timeout


class Command(object):
    """Small wrapper around subprocess for running a shell command line,
    optionally capturing its output or enforcing a kill timeout.

    Attributes:
        cmd: the shell command line to execute.
        process: Popen handle of the most recent run (None before any run).
        out: captured stdout split into lines, or None when nothing captured.
    """

    def __init__(self, cmd):
        self.cmd = cmd
        self.process = None
        self.out = None

    def run_parallel(self, capture_file=None, timeout_sec=180):
        """Run self.cmd without a shell (shlex-split), redirecting stdout to
        capture_file, and kill the process after timeout_sec seconds.

        Note: because no shell is used, shell syntax (redirection, pipes)
        embedded in self.cmd will NOT work here.
        """
        redirect_file = None
        if capture_file is not None:
            redirect_file = open(capture_file, "wb")
        logger.info("capture file is %s" % (capture_file))
        try:
            proc = Popen(shlex.split(self.cmd), stdout=redirect_file)
            # Watchdog: kill the child if it runs past the timeout.
            timer = Timer(timeout_sec, proc.kill)
            try:
                timer.start()
                stdout, stderr = proc.communicate()
            finally:
                logger.info("Debug recorded at %s" % (capture_file))
                timer.cancel()
        finally:
            # Fix: the original opened the capture file and never closed it,
            # leaking one file descriptor per invocation.
            if redirect_file is not None:
                redirect_file.close()

    def run_command(self, capture=False):
        """Run self.cmd through the shell; when capture is True, store stdout
        lines in self.out (None when the command produced no output)."""
        if not capture:
            self.process = subprocess.Popen(self.cmd, shell=True)
            self.process.communicate()
            return
        # Capturing the output of a shell command line.
        # Fix: shell=True is required here too — the original passed the
        # command string without a shell, so any command containing
        # arguments raised FileNotFoundError.
        self.process = subprocess.Popen(self.cmd, shell=True,
                                        stdout=subprocess.PIPE,
                                        stderr=subprocess.PIPE,
                                        stdin=subprocess.PIPE)
        out, err = self.process.communicate()
        if out:
            self.out = out.splitlines()
        else:
            self.out = None

    # set default timeout to 2 minutes
    def run(self, capture_file=None, timeout=120):
        """Fire-and-forget: run the command on a daemon-less worker thread.

        capture_file and timeout are currently unused by this path because
        callers embed shell redirection in self.cmd and run_command has no
        timeout support; they are kept for interface compatibility with
        run_parallel.
        """
        thread = threading.Thread(target=self.run_command, args=())
        thread.start()

def get_debug_info():
    """Snapshot the output of vc_mem_mon.sh into a timestamped file
    under PERFORMANCE_MONITOR_DIR (local time, dd_mm_YYYY_HH_MM_SS)."""
    stamp = datetime.datetime.now().strftime("%d_%m_%Y_%H_%M_%S")
    capture_file_name = "%s/mem_%s" % (PERFORMANCE_MONITOR_DIR, stamp)
    # Redirection is embedded in the command line, so the shell-based
    # Command.run path writes the file for us.
    cmd = "vc_mem_mon.sh > %s" % (capture_file_name)
    logger.info("Capture %s" % (cmd))
    Command(cmd).run(timeout=60, capture_file=capture_file_name)
    
    
    
def cleanup_top_monitor_files(file_list, base_dir=None):
    """Delete stale snapshot files.

    Args:
        file_list: iterable of file names (not paths); empty strings and
            None entries are skipped, which tolerates the output of
            ``str.split("\n")`` on an empty ``ls`` result.
        base_dir: directory containing the files; defaults to
            PERFORMANCE_MONITOR_DIR (backward-compatible — existing callers
            pass only file_list).
    """
    if base_dir is None:
        base_dir = PERFORMANCE_MONITOR_DIR
    # 'name' instead of 'file' — avoid shadowing the builtin; truthiness
    # test replaces the '== None' / == '' comparisons.
    for name in file_list:
        if not name:
            continue
        os.remove(os.path.join(base_dir, name))
       
def main():
    """Entry point: configure rotating-file logging, then sample
    vc_mem_mon.sh every 6 hours, pruning old captures after each sample
    once at least 59 minutes have passed since the last prune."""
    global logger
    logger = logging.getLogger('my_logger')
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter(fmt='%(asctime)s %(levelname)-8s %(message)s',
                                  datefmt='%Y-%m-%d %H:%M:%S')
    handler = RotatingFileHandler('/var/log/memory_leak_check.log',
                                  maxBytes=8000, backupCount=7)
    handler.setFormatter(formatter)
    logger.addHandler(handler)

    # Fix: os.makedirs instead of os.mkdir — mkdir raises if the parent
    # directory (/velocloud/debug) does not exist yet.
    if not os.path.exists(PERFORMANCE_MONITOR_DIR):
        os.makedirs(PERFORMANCE_MONITOR_DIR)

    last_time = datetime.datetime.now()
    while True:
        get_debug_info()
        time.sleep(21600)  # sample every 6 hours
        cur_time = datetime.datetime.now()
        if cur_time - last_time > datetime.timedelta(minutes=59):
            # List files oldest-last; tail from line MAX_DEBUG_FILES keeps
            # the newest MAX_DEBUG_FILES - 1 captures and yields the rest
            # for deletion.
            cmd = "ls -t %s/ | tail -n +%d" % (PERFORMANCE_MONITOR_DIR,
                                               MAX_DEBUG_FILES)
            ret_out = subprocess.run(cmd, shell=True, check=True,
                                     stdout=subprocess.PIPE
                                     ).stdout.decode('utf-8').strip()
            last_time = cur_time
            logger.info("Checking for files to be cleared")
            # split("") of an empty result yields [''], which the cleanup
            # helper skips.
            cleanup_top_monitor_files(ret_out.split("\n"))


if __name__ == "__main__":
    main()

Run the script in the background:

nohup python mem_leak_monitor_py3.9.py &

The script first creates a new folder, /velocloud/debug/memory_leak_check/. Then, every 6 hours, it saves the output of the vc_mem_mon.sh command into a separate file in that folder, named with a local-time timestamp. The memory leak monitor script can also be downloaded here:

https://ent.box.com/s/e3x16hla8fxiqkdqszkdt47mjqz8nj41

 

Sometimes engineering may request SD-WAN edge core file when memory utilization is high, customer can manually generate a core file with below command:

kill -SIGSEGV `pidof edged`

This command kills edged and will cause an HA switchover (where applicable); use it only during a maintenance window.

Additional Information