From 45c42ac2f26d6f8ef484b32290791c659d9488f4 Mon Sep 17 00:00:00 2001
From: Michal Berger
Date: Tue, 1 Dec 2020 12:57:47 +0100
Subject: [PATCH] scripts: Use core dump collector

This is done in order to track core dumps in a more efficient manner.
Until now, some cores could be missed if the binary was executed
outside of the cwd of the autotest (i.e. outside of the spdk repo) but
was still part of the critical path of the actual test (e.g. fio in
vhost-initiator tests).

Also, since core_pattern was set to plain "core", the impact on the
underlying storage wasn't controlled either - if a core was 20G in
size, that is what we would get. This could easily exhaust storage in
case error-prone patchsets were submitted on the CI side.

The collector will try to mitigate all the above by doing the
following:

- collecting all the cores, regardless of their cwd
- limiting the size of the core to 2G
- compressing the cores (gzip)

Also, a limit of 2 collectors executing at once is set - if more
processes crash at approximately the same time, the extra crashes are
only noted in the kernel log instead.
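For context, this is the shape of the pipe-style core_pattern being
installed and how the kernel ends up invoking the collector. The paths
and numbers below are made-up examples; the real values come from
$rootdir, $output_dir and the crashing process at runtime:

    # Written to /proc/sys/kernel/core_pattern by autotest.sh:
    |/home/ci/spdk/scripts/core-collector.sh %P %s %t /home/ci/output/coredumps

    # On a crash, the kernel expands %P (PID of the dumping process),
    # %s (signal number) and %t (time of dump, epoch) and runs e.g.:
    /home/ci/spdk/scripts/core-collector.sh 4077 11 1606823867 /home/ci/output/coredumps
    # ...with the raw core image streamed to the collector's stdin.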
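A quick manual smoke test of the whole path could look as follows - a
sketch only, assuming the pattern above is already in place and
$output_dir matches what autotest.sh used:

    # Crash a throwaway process and check what the collector captured:
    ulimit -c unlimited
    sleep 100 &
    kill -SEGV $!
    sleep 1 # give the collector a moment to finish writing
    ls "$output_dir/coredumps"
    # Expected output, roughly (the PID will differ):
    #   sleep_12345.core.bin  sleep_12345.core.bt.txt  sleep_12345.core.gz
    #   sleep_12345.core.json  sleep_12345.core.stderr.txt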
Signed-off-by: Michal Berger
Change-Id: I5956a9030c463ae85a21bfe95f28af5568c5c285
Reviewed-on: https://review.spdk.io/gerrit/c/spdk/spdk/+/5369
Tested-by: SPDK CI Jenkins
Community-CI: Mellanox Build Bot
Reviewed-by: Karol Latecki
Reviewed-by: Tomasz Zawadzki
Reviewed-by: Jim Harris
Reviewed-by: Shuhei Matsumoto
---
 autotest.sh                    |  4 +-
 scripts/core-collector.sh      | 89 ++++++++++++++++++++++++++++++++++
 test/common/autotest_common.sh | 48 +++++++++++-------
 3 files changed, 122 insertions(+), 19 deletions(-)
 create mode 100755 scripts/core-collector.sh

diff --git a/autotest.sh b/autotest.sh
index d98c846db9..75bab1ddfa 100755
--- a/autotest.sh
+++ b/autotest.sh
@@ -30,8 +30,10 @@ fi
 
 if [ $(uname -s) = Linux ]; then
 	old_core_pattern=$(< /proc/sys/kernel/core_pattern)
+	mkdir -p "$output_dir/coredumps"
 	# set core_pattern to a known value to avoid ABRT, systemd-coredump, etc.
-	echo "core" > /proc/sys/kernel/core_pattern
+	echo "|$rootdir/scripts/core-collector.sh %P %s %t $output_dir/coredumps" > /proc/sys/kernel/core_pattern
+	echo 2 > /proc/sys/kernel/core_pipe_limit
 
 	# Make sure that the hugepage state for our VM is fresh so we don't fail
 	# hugepage allocation. Allow time for this action to complete.
diff --git a/scripts/core-collector.sh b/scripts/core-collector.sh
new file mode 100755
index 0000000000..55125b12ef
--- /dev/null
+++ b/scripts/core-collector.sh
@@ -0,0 +1,89 @@
+#!/usr/bin/env bash
+# We don't want to tell the kernel to include %e or %E since these
+# can include whitespace or other funny characters, and working
+# with those on the cmdline would be a nightmare. Use procfs for
+# the remaining pieces we want to gather:
+# |$rootdir/scripts/core-collector.sh %P %s %t $output_dir
+
+get_rlimit() {
+	local limit
+
+	while read -ra limit; do
+		[[ ${limit[1]} == core ]] && echo "${limit[4]}" # soft
+	done < "/proc/$core_pid/limits"
+}
+
+core_meta() {
+	jq . <<- CORE
+		{
+		  "$exe_comm": {
+		    "ts": "$core_time",
+		    "size": "$core_size bytes",
+		    "PID": $core_pid,
+		    "signal": "$core_sig ($core_sig_name)",
+		    "path": "$exe_path",
+		    "statm": "$statm"
+		  }
+		}
+	CORE
+}
+
+bt() { hash gdb && gdb -batch -ex "thread apply all bt full" "$1" "$2" 2>&1; }
+
+stderr() {
+	exec 2> "$core.stderr.txt"
+	set -x
+}
+
+args+=(core_pid)
+args+=(core_sig)
+args+=(core_ts)
+args+=(output_dir)
+
+read -r "${args[@]}" <<< "$*"
+
+exe_path=$(readlink -f "/proc/$core_pid/exe")
+exe_comm=$(< "/proc/$core_pid/comm")
+statm=$(< "/proc/$core_pid/statm")
+core_time=$(date -d@"$core_ts")
+core_sig_name=$(kill -l "$core_sig")
+
+core=$output_dir/${exe_path##*/}_$core_pid.core
+stderr
+
+# RLIMIT_CORE is not enforced when the core is piped to us. To make
+# sure we won't attempt to overload the underlying storage, copy
+# only a reasonable number of bytes (systemd defaults to 2G so
+# let's follow that). But first, check the limits of the terminating
+# process to see if we need to make any adjustments.
+max_core=$((1024 * 1024 * 1024 * 2))
+
+rlimit=$(get_rlimit)
+if [[ $rlimit == unlimited ]] || ((rlimit > max_core)); then
+	rlimit=$max_core
+fi
+
+# Nothing to do
+((rlimit == 0)) && exit 0
+
+# Clear the path for the new set of artifacts
+rm -f "$core"{,.{bin,bt.txt,gz,json}}
+
+# Slurp the core
+head -c "$rlimit" <&0 > "$core"
+core_size=$(wc -c < "$core")
+
+# Compress it
+gzip -c "$core" > "$core.gz"
+
+# Save the binary
+cp "$exe_path" "$core.bin"
+
+# Save the backtrace
+bt "$exe_path" "$core" > "$core.bt.txt"
+
+# Save the metadata of the core
+core_meta > "$core.json"
+
+# Nuke the original core
+rm "$core"
diff --git a/test/common/autotest_common.sh b/test/common/autotest_common.sh
index ac55f2e3c0..700cb17eff 100755
--- a/test/common/autotest_common.sh
+++ b/test/common/autotest_common.sh
@@ -600,24 +600,36 @@ function gdb_attach() {
 }
 
 function process_core() {
-	ret=0
-	while IFS= read -r -d '' core; do
-		exe=$(eu-readelf -n "$core" | grep psargs | sed "s/.*psargs: \([^ \'\" ]*\).*/\1/")
-		if [[ ! -f "$exe" ]]; then
-			exe=$(eu-readelf -n "$core" | grep -oP -m1 "$exe.+")
-		fi
-		echo "exe for $core is $exe"
-		if [[ -n "$exe" ]]; then
-			if hash gdb &> /dev/null; then
-				gdb -batch -ex "thread apply all bt full" $exe $core
-			fi
-			cp $exe $output_dir
-		fi
-		mv $core $output_dir
-		chmod a+r $output_dir/$core
-		ret=1
-	done < <(find . -type f \( -name 'core.[0-9]*' -o -name 'core' -o -name '*.core' \) -print0)
-	return $ret
+	# Note that this has always been racy, as we can't really sync with the
+	# kernel to see if there's any core queued up for writing. We could check
+	# if the collector is running and wait for it explicitly, but it doesn't
+	# seem to be worth the effort. So assume that if we are being called via
+	# a trap, i.e. when some error has occurred, we wait up to 5s for any
+	# potential cores. If we are called just for cleanup at the very end,
+	# don't wait, since all the tests ended successfully, hence having any
+	# critical cores lying around is unlikely.
+	local es=$?
+	((es != 0)) && sleep 5s
+
+	local coredumps core
+
+	shopt -s nullglob
+	coredumps=("$output_dir/coredumps/"*.bt.txt)
+	shopt -u nullglob
+
+	((${#coredumps[@]} > 0)) || return 0
+	chmod -R a+r "$output_dir/coredumps"
+
+	for core in "${coredumps[@]}"; do
+		cat <<- BT
+			##### CORE BT ${core##*/} #####
+
+			$(<"$core")
+
+			--
+		BT
+	done
+	return 1
 }
 
 function process_shm() {
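A note on get_rlimit() from the collector: it leans on the fixed layout
of /proc/<pid>/limits. For reference, the line it parses looks like
this (the values vary per process):

    $ grep 'core file' /proc/$$/limits
    Max core file size        0                    unlimited            bytes
    # After read -ra splits on whitespace: ${limit[1]} == "core" identifies
    # the row, ${limit[4]} holds the soft limit, ${limit[5]} the hard one.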
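core_meta() pipes the heredoc through jq for validation and
pretty-printing; a filled-in document would look along these lines
(every value below is invented for illustration):

    {
      "bdevperf": {
        "ts": "Tue Dec  1 11:57:47 UTC 2020",
        "size": "2147483648 bytes",
        "PID": 4077,
        "signal": "11 (SEGV)",
        "path": "/home/ci/spdk/build/examples/bdevperf",
        "statm": "64788 6992 4520 536 0 14542 0"
      }
    }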
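For whoever picks the artifacts up from CI, the compressed core can be
inspected against the binary snapshot taken at crash time (the
filenames are examples following the collector's naming scheme):

    gzip -d bdevperf_4077.core.gz                  # recreates bdevperf_4077.core
    gdb bdevperf_4077.core.bin bdevperf_4077.core  # full interactive session
    less bdevperf_4077.core.bt.txt                 # or just read the saved backtrace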