diff --git a/autotest.sh b/autotest.sh
index d98c846db9..75bab1ddfa 100755
--- a/autotest.sh
+++ b/autotest.sh
@@ -30,8 +30,10 @@ fi
 if [ $(uname -s) = Linux ]; then
 	old_core_pattern=$(< /proc/sys/kernel/core_pattern)
+	mkdir -p "$output_dir/coredumps"
 	# set core_pattern to a known value to avoid ABRT, systemd-coredump, etc.
-	echo "core" > /proc/sys/kernel/core_pattern
+	echo "|$rootdir/scripts/core-collector.sh %P %s %t $output_dir/coredumps" > /proc/sys/kernel/core_pattern
+	echo 2 > /proc/sys/kernel/core_pipe_limit
 
 	# Make sure that the hugepage state for our VM is fresh so we don't fail
 	# hugepage allocation. Allow time for this action to complete.
diff --git a/scripts/core-collector.sh b/scripts/core-collector.sh
new file mode 100755
index 0000000000..55125b12ef
--- /dev/null
+++ b/scripts/core-collector.sh
@@ -0,0 +1,89 @@
+#!/usr/bin/env bash
+# We don't want to tell the kernel to include %e or %E since these
+# can include whitespace or other funny characters, and working
+# with those on the cmdline would be a nightmare. Use procfs for
+# the remaining pieces we want to gather:
+# |$rootdir/scripts/core-collector.sh %P %s %t $output_dir
+
+get_rlimit() {
+	local limit
+
+	while read -ra limit; do
+		[[ ${limit[1]} == core ]] && echo "${limit[4]}" # soft
+	done < "/proc/$core_pid/limits"
+}
+
+core_meta() {
+	jq . <<- CORE
+	{
+	  "$exe_comm": {
+	    "ts": "$core_time",
+	    "size": "$core_size bytes",
+	    "PID": $core_pid,
+	    "signal": "$core_sig ($core_sig_name)",
+	    "path": "$exe_path",
+	    "statm": "$statm"
+	  }
+	}
+	CORE
+}
+
+bt() { hash gdb && gdb -batch -ex "thread apply all bt full" "$1" "$2" 2>&1; }
+
+stderr() {
+	exec 2> "$core.stderr.txt"
+	set -x
+}
+
+args+=(core_pid)
+args+=(core_sig)
+args+=(core_ts)
+args+=(output_dir)
+
+read -r "${args[@]}" <<< "$*"
+
+exe_path=$(readlink -f "/proc/$core_pid/exe")
+exe_comm=$(< "/proc/$core_pid/comm")
+statm=$(< "/proc/$core_pid/statm")
+core_time=$(date -d@"$core_ts")
+core_sig_name=$(kill -l "$core_sig")
+
+core=$output_dir/${exe_path##*/}_$core_pid.core
+stderr
+
+# RLIMIT_CORE is not enforced when the core is piped to us. To make
+# sure we won't attempt to overload the underlying storage, copy
+# only a reasonable number of bytes (systemd defaults to 2G, so
+# let's follow that). But first, check the limits of the terminating
+# process to see if we need to make any adjustments.
+max_core=$((1024 * 1024 * 1024 * 2))
+
+rlimit=$(get_rlimit)
+if [[ $rlimit == unlimited ]] || ((rlimit > max_core)); then
+	rlimit=$max_core
+fi
+
+# Nothing to do
+((rlimit == 0)) && exit 0
+
+# Clear the landing zone
+rm -f "$core"{,.{bin,bt.txt,gz,json}}
+
+# Slurp the core
+head -c "$rlimit" <&0 > "$core"
+core_size=$(wc -c < "$core")
+
+# Compress it
+gzip -c "$core" > "$core.gz"
+
+# Save the binary
+cp "$exe_path" "$core.bin"
+
+# Save the backtrace
+bt "$exe_path" "$core" > "$core.bt.txt"
+
+# Save the metadata of the core
+core_meta > "$core.json"
+
+# Nuke the original core
+rm "$core"
diff --git a/test/common/autotest_common.sh b/test/common/autotest_common.sh
index ac55f2e3c0..700cb17eff 100755
--- a/test/common/autotest_common.sh
+++ b/test/common/autotest_common.sh
@@ -600,24 +600,36 @@ function gdb_attach() {
 }
 
 function process_core() {
-	ret=0
-	while IFS= read -r -d '' core; do
-		exe=$(eu-readelf -n "$core" | grep psargs | sed "s/.*psargs: \([^ \'\" ]*\).*/\1/")
-		if [[ ! -f "$exe" ]]; then
-			exe=$(eu-readelf -n "$core" | grep -oP -m1 "$exe.+")
-		fi
-		echo "exe for $core is $exe"
-		if [[ -n "$exe" ]]; then
-			if hash gdb &> /dev/null; then
-				gdb -batch -ex "thread apply all bt full" $exe $core
-			fi
-			cp $exe $output_dir
-		fi
-		mv $core $output_dir
-		chmod a+r $output_dir/$core
-		ret=1
-	done < <(find . -type f \( -name 'core.[0-9]*' -o -name 'core' -o -name '*.core' \) -print0)
-	return $ret
+	# Note that this has always been racy, as we can't really sync with the
+	# kernel to see if there's any core queued up for writing. We could check
+	# whether the collector is running and wait for it explicitly, but it
+	# doesn't seem worth the effort. So assume that if we are being called
+	# via trap, i.e. when some error has occurred, we should wait up to 5s
+	# for any potential cores. If we are called just for cleanup at the very
+	# end, don't wait, since all the tests ended successfully, hence having
+	# any critical cores lying around is unlikely.
+	local es=$?
+	((es != 0)) && sleep 5s
+
+	local coredumps core
+
+	shopt -s nullglob
+	coredumps=("$output_dir/coredumps/"*.bt.txt)
+	shopt -u nullglob
+
+	((${#coredumps[@]} > 0)) || return 0
+	chmod -R a+r "$output_dir/coredumps"
+
+	for core in "${coredumps[@]}"; do
+		cat <<- BT
+			##### CORE BT ${core##*/} #####
+
+			$(<"$core")
+
+			--
+		BT
+	done
+	return 1
 }
 
 function process_shm() {