How to manually collect logs from a TKG cluster
search cancel

How to manually collect logs from a TKG cluster


Article ID: 327469


Updated On:




Instructions to collect logs in scenarios where log collection is not feasible using crashd and additional information related to the cluster nodes is needed.


VMware Tanzu Kubernetes Grid 1.x


The steps in this KB can be used to collect logs manually
  • Process, CPU, Memory, Network and Disk statistics
  • Operating system logs
  • All pods and container logs
  • Useful in scenarios where crashd is not an available option
  • These steps have been tested against clusters with Ubuntu and Photon nodes


Generate script to gather node statistics

# Write the node-statistics gathering script to the current directory.
# The heredoc delimiter is quoted ('EOF') so nothing inside is expanded
# locally — $(date ...) must run on the node, at collection time.
cat << 'EOF' > "$PWD/node-stats.sh"

set -x

# Process/OS
sudo ulimit -a &> ulimit.out
sudo sysctl -a &> sysctl.out
sudo cat /proc/loadavg &> loadavg.out
sudo cat /proc/sys/kernel/threads-max &> threads-max.out
sudo ps -elf &> pself.out
sudo ps -elfT &> psthreads.out
sudo cat /proc/sys/fs/file-nr &> file-nr.out
sudo cat /proc/sys/fs/file-max &> file-max.out
sudo lsof -n &> lsof-n.out
sudo uname -a &> uname.out
sudo vmstat 1 5 &> vmstat.out
sudo sar -A &> sar-A.out
sudo pidstat -p ALL -T ALL -I -l -r  -t  -u &> pidstat.out
sudo cat /proc/vmstat &> proc-vmstat.out

# Memory (top 20 consumers by %mem, plus kernel slab/VM counters)
sudo ps -eo pid,ppid,cmd,vsz,fuser,%mem,%cpu --sort=-%mem | head -n 20 &> psmem.out
sudo cat /proc/meminfo &> mem-info.out
sudo vmstat -m &> vmstat-m.out
sudo cat /proc/slabinfo &> slabinfo.out
sudo slabtop -s c -o &> slabtop.out
sudo free -m &> free-m.out

# CPU (top 20 consumers by %cpu, plus per-CPU counters)
sudo ps -eo pid,ppid,cmd,vsz,fuser,%mem,%cpu --sort=-%cpu | head -n 20 &> pscpu.out
sudo cat /proc/cpuinfo &> cpuinfo.out
sudo cat /proc/softirqs &> softirqs.out
sudo cat /proc/interrupts &> interrupts.out
sudo lscpu &> lscpu.out
sudo uptime &> uptime.out
sudo mpstat 1 5 -P ALL &> mpstat-all.out

# Network (sockets, routing, iptables, conntrack)
sudo netstat -lantupWe &> netstat-lantupWe.out
sudo netstat -i &> netstat-i.out
sudo netstat -s &> netstat-s.out
sudo ss -noemitaup &> ss-noemitaup.out
sudo cat /proc/net/dev &> proc-net-dev.out
sudo cat /proc/net/sockstat &> sockstat.out
sudo cat /proc/net/sockstat6 &> sockstat6.out
sudo ip addr &> ip-addr.out
sudo route -n &> route-n.out
sudo ip rule list &> ip-rule-list.out
sudo ip route show table all &> ip-show-table.out
sudo iptables -L -v -n --line-numbers &> iptables-all.out
sudo conntrack -L &> conntrack-L.out
sudo conntrack -S &> conntrack-S.out

# Disk/IO
sudo df -ih --total &> df-ih.out
sudo df -ah --total &> df-ah.out
sudo arp -an &> arp-an.out
sudo iostat 1 5 &> iostat.out

# Kubernetes and containerd
sudo journalctl -xeu containerd &> containerd.out
sudo journalctl -xeu kubelet &> kubelet.out
sudo crictl info &> crictlinfo.out

# Bundle Artifacts — timestamped tarball of every .out file produced above
sudo tar czvf /var/log/node-stats-$(date +%F-%H-%M-%Z).tar.gz *.out
EOF

Generate script to gather node logs
# Write the /var/log collection script to the current directory.
# Quoted 'EOF' keeps $1 literal — it is the node IP, supplied as the
# script's first argument when it is run on the remote node.
cat << 'EOF' > "$PWD/node-logs.sh"
set -x
# Archive /var/log (skipping rotated journal files) into the capv home dir,
# named after the node so bundles from different nodes don't collide.
sudo tar czvf /home/capv/$1-varlog.tar.gz --exclude='/var/log/journal/*/*@*' /var/log/
sudo chown capv:users /home/capv/$1-varlog.tar.gz
EOF

Kickoff Log Collection
# Make both generated scripts executable.
chmod +x "$PWD/node-stats.sh" "$PWD/node-logs.sh"

# Collect the external IP of every node in the cluster (space-separated, one line).
kubectl get nodes -o jsonpath='{.items[*].status.addresses[?(@.type=="ExternalIP")].address}' > "$HOME/nodes"

# Update the key path according to your environment
export SSH_KEY_PATH=$HOME/.ssh/id_rsa

# Word-splitting here is intentional: the jsonpath output is a single
# space-separated line of IPs.
for i in $(cat "$HOME/nodes"); do
    echo "Collecting data on node $i"
    # node-stats.sh needs no arguments; feed it to a remote bash over stdin.
    ssh -i "$SSH_KEY_PATH" -o "UserKnownHostsFile=/dev/null" -o "StrictHostKeyChecking=no" -q capv@$i 'bash -s' < "$PWD/node-stats.sh"
    echo "Collecting logs on node $i"
    # node-logs.sh expects the node IP as $1 — pass it after 'bash -s --'.
    ssh -i "$SSH_KEY_PATH" -o "UserKnownHostsFile=/dev/null" -o "StrictHostKeyChecking=no" -q capv@$i "bash -s -- $i" < "$PWD/node-logs.sh"
    echo "Transferring files from node $i to local path $HOME"
    scp -i "$SSH_KEY_PATH" -o "UserKnownHostsFile=/dev/null" -o "StrictHostKeyChecking=no" capv@$i:/home/capv/$i-varlog.tar.gz "$HOME/."
    echo "Removing collected artifacts from node $i"
    ssh -i "$SSH_KEY_PATH" -o "UserKnownHostsFile=/dev/null" -o "StrictHostKeyChecking=no" -q capv@$i sudo rm -rf /var/log/node-stats-*.tar.gz /home/capv/$i-varlog.tar.gz /home/capv/*.out
done