8021da8ba6
When running "setup.sh config" (or setup.sh without specifying a mode - which defaults to config), do not unload the contigmem driver and then reload it. Over time, memory can get fragmented and contigmem will fail, crashing the system. But still check if the requested HUGEMEM matches what was previously specified. If a different amount was requested, unload contigmem as before and then reload with the new amount. This patch brings FreeBSD behavior in line with Linux. On Linux, we do not release all of the hugepages and then reallocate them. Signed-off-by: Jim Harris <james.r.harris@intel.com> Change-Id: I18cb6fd4bdc1e1da0382423455ebce58c13bee7a Reviewed-on: https://review.gerrithub.io/412238 Reviewed-by: Daniel Verkamp <daniel.verkamp@intel.com> Reviewed-by: Ben Walker <benjamin.walker@intel.com> Tested-by: SPDK Automated Test System <sys_sgsw@intel.com>
541 lines
15 KiB
Bash
Executable File
541 lines
15 KiB
Bash
Executable File
#!/usr/bin/env bash

# Abort on the first unhandled command failure.
set -e

# rootdir is the parent of the directory that holds this script; the rest of
# the script reads "$rootdir/scripts/common.sh" and
# "$rootdir/include/spdk/pci_ids.h" relative to it.
# Both expansions are quoted so a checkout path containing whitespace is not
# word-split before it reaches dirname/readlink.
rootdir=$(readlink -f "$(dirname "$0")")/..
source "$rootdir/scripts/common.sh"
|
|
|
|
# Print the help text and terminate the script.
#   $1 - path of the running script; only its basename is printed.
#   $2 - optional message printed, followed by an empty line, before the help.
# Sets the global "options" to the list of modes offered on the current OS
# ("status" is only listed on Linux).
# Always exits with status 0, including when $2 carries an error message.
function usage()
{
	if [ "$(uname)" = Linux ]; then
		options="[config|reset|status|help]"
	else
		options="[config|reset|help]"
	fi

	if [[ -n $2 ]]; then
		echo "$2"
		echo ""
	fi
	echo "Helper script for allocating hugepages and binding NVMe, I/OAT and Virtio devices to"
	echo "a generic VFIO kernel driver. If VFIO is not available on the system, this script will"
	echo "fall back to UIO. NVMe and Virtio devices with active mountpoints will be ignored."
	echo "All hugepage operations use default hugepage size on the system (hugepagesz)."
	# $1 is quoted so a script path containing whitespace is handed to
	# basename as a single argument instead of "name suffix".
	echo "Usage: $(basename "$1") $options"
	echo
	echo "$options - as following:"
	echo "config Default mode. Allocate hugepages and bind PCI devices."
	echo "reset Rebind PCI devices back to their original drivers."
	echo " Also cleanup any leftover spdk files/resources."
	echo " Hugepage memory size will remain unchanged."
	if [ "$(uname)" = Linux ]; then
		echo "status Print status of all SPDK-compatible devices on the system."
	fi
	echo "help Print this help message."
	echo
	echo "The following environment variables can be specified."
	echo "HUGEMEM Size of hugepage memory to allocate (in MB). 2048 by default."
	echo " For NUMA systems, the hugepages will be evenly distributed"
	echo " between CPU nodes"
	echo "NRHUGE Number of hugepages to allocate. This variable overwrites HUGEMEM."
	echo "HUGENODE Specific NUMA node to allocate hugepages on. To allocate"
	echo " hugepages on multiple nodes run this script multiple times -"
	echo " once for each node."
	echo "PCI_WHITELIST Whitespace separated list of PCI devices (NVMe, I/OAT, Virtio) to bind."
	echo " Each device must be specified as a full PCI address."
	echo " E.g. PCI_WHITELIST=\"0000:01:00.0 0000:02:00.0\""
	echo " To blacklist all PCI devices use a non-valid address."
	echo " E.g. PCI_WHITELIST=\"none\""
	echo " If empty or unset, all PCI devices will be bound."
	echo "TARGET_USER User that will own hugepage mountpoint directory and vfio groups."
	echo " By default the current user will be used."
	exit 0
}
|
|
|
|
# Report whether kernel driver $1 is present.
# lsmod does not list drivers built into a monolithic kernel, so the lsmod
# lookup is backed by a /sys/module check. Distinct codes are returned for
# module vs. built-in in case callers want to tell them apart later.
#   $1 - driver name; dashes are also tried as underscores, since lsmod
#        reports module names with underscores (e.g. vfio_pci).
# Returns:
#   1 - lsmod lists a module with exactly this name
#   2 - not listed by lsmod, but a /sys/module/<name> directory exists
#   0 - neither was found (or no name was given)
# Callers under "set -e" must capture the status explicitly, because the
# "found" results are non-zero.
function check_for_driver {
	local module=${1//-/_}

	if [[ -z $1 ]]; then
		return 0
	fi

	# Compare the whole first column instead of grepping for a substring, so
	# that e.g. "nvme" is not reported as loaded just because "nvme_core" is.
	if lsmod | awk -v name="$module" '$1 == name { found = 1 } END { exit !found }'; then
		return 1
	fi

	if [[ -d /sys/module/$1 || -d /sys/module/$module ]]; then
		return 2
	fi

	return 0
}
|
|
|
|
# Decide whether PCI device $1 may be (re)bound according to PCI_WHITELIST.
#   $1 - PCI address, compared verbatim against each whitelist entry.
# Reads the global array PCI_WHITELIST.
# NOTE: the return convention is inverted relative to the usual shell idiom:
#   1 - the device may be bound (whitelist empty, or the address is listed)
#   0 - the device is not whitelisted and must be skipped
# so "if pci_can_bind ...; then" takes the *skip* branch.
function pci_can_bind() {
	local allowed

	if [[ ${#PCI_WHITELIST[@]} == 0 ]]; then
		# No whitelist specified, bind all devices.
		return 1
	fi

	# The expansion is quoted so entries are compared literally and are not
	# glob-expanded against the current directory.
	for allowed in "${PCI_WHITELIST[@]}"; do
		if [ "$allowed" == "$1" ]; then
			return 1
		fi
	done

	return 0
}
|
|
|
|
# Move PCI device $1 onto kernel driver $2 through sysfs.
#   $1 - PCI address of the device
#   $2 - name of the driver to bind to
# Writes the globals bdf, driver_name, old_driver_name, ven_dev_id and
# iommu_group. Returns early when the device already uses the requested driver.
# The remove_id/new_id/bind writes are best-effort (failures are ignored);
# a failing unbind is not ignored.
function linux_bind_driver() {
	bdf="$1"
	driver_name="$2"
	old_driver_name="no driver"
	# "vendor:device" from lspci, with the colon turned into a space.
	ven_dev_id=$(lspci -n -s $bdf | cut -d' ' -f3 | sed 's/:/ /')

	local dev_sysfs="/sys/bus/pci/devices/$bdf"
	local drv_sysfs="/sys/bus/pci/drivers/$driver_name"

	if [ -e "$dev_sysfs/driver" ]; then
		old_driver_name=$(basename $(readlink $dev_sysfs/driver))

		# Nothing to do when the requested driver is already in place.
		[ "$driver_name" != "$old_driver_name" ] || return 0

		echo "$ven_dev_id" > "$dev_sysfs/driver/remove_id" 2> /dev/null || true
		echo "$bdf" > "$dev_sysfs/driver/unbind"
	fi

	echo "$bdf ($ven_dev_id): $old_driver_name -> $driver_name"

	echo "$ven_dev_id" > "$drv_sysfs/new_id" 2> /dev/null || true
	echo "$bdf" > "$drv_sysfs/bind" 2> /dev/null || true

	# Hand the device's vfio group node to TARGET_USER when both exist.
	iommu_group=$(basename $(readlink -f $dev_sysfs/iommu_group))
	if [ -e "/dev/vfio/$iommu_group" ] && [ -n "$TARGET_USER" ]; then
		chown "$TARGET_USER" "/dev/vfio/$iommu_group"
	fi
}
|
|
|
|
# Detach PCI device $1 from whatever driver currently owns it.
#   $1 - PCI address of the device
# Writes the globals bdf, ven_dev_id and old_driver_name.
# Does nothing when the device has no driver; the remove_id write is
# best-effort, the unbind write is not.
function linux_unbind_driver() {
	bdf="$1"
	ven_dev_id=$(lspci -n -s $bdf | cut -d' ' -f3 | sed 's/:/ /')

	local driver_link="/sys/bus/pci/devices/$bdf/driver"

	# No driver attached: already in the desired state.
	[ -e "$driver_link" ] || return 0

	old_driver_name=$(basename $(readlink $driver_link))

	echo "$ven_dev_id" > "$driver_link/remove_id" 2> /dev/null || true
	echo "$bdf" > "$driver_link/unbind"
	echo "$bdf ($ven_dev_id): $old_driver_name -> no driver"
}
|
|
|
|
# Print the third field (the mount point) of every "mount" output line that
# contains " type hugetlbfs ", one per line.
function linux_hugetlbfs_mounts() {
	mount | awk '/ type hugetlbfs / { print $3 }'
}
|
|
|
|
# Look up the NVMe block device that sits on PCI address $1.
#   $1 - PCI address to look for
#   $2 - name of the caller's variable that receives the device name
#        (e.g. nvme0n1); it is left untouched when nothing matches.
# The PCI address of a device is taken from the basename of the
# /sys/block/<dev>/device/device link, falling back to
# /sys/block/<dev>/device when the former cannot be read.
# The result is stored with "printf -v" rather than eval, and the shell's
# errexit setting is left as the caller had it.
function get_nvme_name_from_bdf {
	local _nvme_devs _nvme_dev _nvme_link

	# grep exits non-zero when no NVMe block device exists; that is not an
	# error, so the status is discarded.
	_nvme_devs=$(lsblk -d --output NAME | grep "^nvme") || true

	for _nvme_dev in $_nvme_devs; do
		_nvme_link=$(readlink "/sys/block/$_nvme_dev/device/device") || true
		if [ -z "$_nvme_link" ]; then
			# Skip a device whose sysfs link cannot be read at all.
			_nvme_link=$(readlink "/sys/block/$_nvme_dev/device") || continue
		fi
		if [ "$(basename "$_nvme_link")" = "$1" ]; then
			printf -v "$2" '%s' "$_nvme_dev"
			return 0
		fi
	done

	return 0
}
|
|
|
|
# Collect the block devices whose /sys/block/<dev> link target contains PCI
# address $1.
#   $1 - PCI address to look for (matched as a fixed string, so the dots in
#        the address do not act as regex wildcards)
#   $2 - name of the caller's variable that receives the result: every
#        matching device name, each preceded by a single space (empty string
#        when nothing matches).
# The result is stored with "printf -v" rather than eval.
function get_virtio_names_from_bdf {
	local _blk_devs _blk_dev _virtio_names=''

	_blk_devs=$(lsblk --nodeps --output NAME)

	for _blk_dev in $_blk_devs; do
		if readlink "/sys/block/$_blk_dev" | grep -qF -- "$1"; then
			_virtio_names="$_virtio_names $_blk_dev"
		fi
	done

	printf -v "$2" '%s' "$_virtio_names"
}
|
|
|
|
# Bind NVMe, I/OAT and Virtio PCI devices to a userspace-capable driver.
# The driver is vfio-pci, or uio_pci_generic when /sys/kernel/iommu_groups
# lists nothing. Devices rejected by pci_can_bind are skipped, as are NVMe
# and Virtio devices whose block devices show active mountpoints.
# Sets the global driver_name (read later by configure_linux) and uses the
# globals bdf and blkname as loop state.
# Device id lists are read straight from pci_ids.h through command
# substitution, so no temporary file is left behind if a bind step aborts the
# script under "set -e".
function configure_linux_pci {
	local pci_ids_header="$rootdir/include/spdk/pci_ids.h"
	local dev_id mountpoints blknames

	driver_name=vfio-pci
	if [ -z "$(ls /sys/kernel/iommu_groups)" ]; then
		# No IOMMU. Use uio.
		driver_name=uio_pci_generic
	fi

	# NVMe
	modprobe $driver_name || true
	for bdf in $(iter_pci_class_code 01 08 02); do
		blkname=''
		get_nvme_name_from_bdf "$bdf" blkname
		# pci_can_bind returns 0 when the device is NOT whitelisted.
		if pci_can_bind "$bdf"; then
			echo "Skipping un-whitelisted NVMe controller $blkname ($bdf)"
			continue
		fi
		if [ "$blkname" != "" ]; then
			mountpoints=$(lsblk /dev/$blkname --output MOUNTPOINT -n | wc -w)
		else
			mountpoints="0"
		fi
		if [ "$mountpoints" = "0" ]; then
			linux_bind_driver "$bdf" "$driver_name"
		else
			echo "Active mountpoints on /dev/$blkname, so not binding PCI dev $bdf"
		fi
	done

	# IOAT: collect all the device_id info of ioat devices.
	for dev_id in $(grep "PCI_DEVICE_ID_INTEL_IOAT" "$pci_ids_header" \
		| awk -F"x" '{print $2}'); do
		for bdf in $(iter_pci_dev_id 8086 $dev_id); do
			if pci_can_bind "$bdf"; then
				echo "Skipping un-whitelisted I/OAT device at $bdf"
				continue
			fi
			linux_bind_driver "$bdf" "$driver_name"
		done
	done

	# virtio: collect all the device_id info of virtio devices.
	for dev_id in $(grep "PCI_DEVICE_ID_VIRTIO" "$pci_ids_header" \
		| awk -F"x" '{print $2}'); do
		for bdf in $(iter_pci_dev_id 1af4 $dev_id); do
			if pci_can_bind "$bdf"; then
				echo "Skipping un-whitelisted Virtio device at $bdf"
				continue
			fi
			blknames=''
			get_virtio_names_from_bdf "$bdf" blknames
			for blkname in $blknames; do
				if mount | grep -q "/dev/$blkname"; then
					echo "Active mountpoints on /dev/$blkname, so not binding PCI dev $bdf"
					# Leave this device alone and move on to the next bdf.
					continue 2
				fi
			done

			linux_bind_driver "$bdf" "$driver_name"
		done
	done

	echo "1" > "/sys/bus/pci/rescan"
}
|
|
|
|
# "config" mode on Linux: bind the PCI devices, make sure a hugetlbfs mount
# exists, request NRHUGE hugepages and, when vfio-pci is in use, fix up
# ownership of the hugetlbfs mounts and report the memlock limit.
# Reads NRHUGE, HUGENODE, HUGEPGSZ, TARGET_USER and the driver_name set by
# configure_linux_pci. Exits the script with status 1 when fewer hugepages
# than requested could be allocated.
function configure_linux {
	local hugedir

	configure_linux_pci

	hugetlbfs_mounts=$(linux_hugetlbfs_mounts)
	if [ -z "$hugetlbfs_mounts" ]; then
		hugetlbfs_mounts=/mnt/huge
		echo "Mounting hugetlbfs at $hugetlbfs_mounts"
		mkdir -p "$hugetlbfs_mounts"
		mount -t hugetlbfs nodev "$hugetlbfs_mounts"
	fi

	# System-wide counter by default, per-node counter when HUGENODE is set.
	hugepages_target="/proc/sys/vm/nr_hugepages"
	if [ -n "$HUGENODE" ]; then
		hugepages_target="/sys/devices/system/node/node${HUGENODE}/hugepages/hugepages-${HUGEPGSZ}kB/nr_hugepages"
	fi

	# Request the pages, then read back how many were actually granted.
	echo "$NRHUGE" > "$hugepages_target"
	allocated_hugepages=$(cat $hugepages_target)
	if [ "$allocated_hugepages" -lt "$NRHUGE" ]; then
		echo ""
		echo "## ERROR: requested $NRHUGE hugepages but only $allocated_hugepages could be allocated."
		echo "## Memory might be heavily fragmented. Please try flushing the system cache, or reboot the machine."
		exit 1
	fi

	# Everything below only applies when vfio-pci is the bound driver.
	[ "$driver_name" = "vfio-pci" ] || return 0

	if [ -n "$TARGET_USER" ]; then
		for hugedir in $hugetlbfs_mounts; do
			chown "$TARGET_USER" "$hugedir"
			chmod g+w "$hugedir"
		done
	fi

	MEMLOCK_AMNT=$(ulimit -l)
	[ "$MEMLOCK_AMNT" != "unlimited" ] || return 0

	MEMLOCK_MB=$(( $MEMLOCK_AMNT / 1024 ))
	echo ""
	echo "Current user memlock limit: ${MEMLOCK_MB} MB"
	echo ""
	echo "This is the maximum amount of memory you will be"
	echo "able to use with DPDK and VFIO if run as current user."
	echo -n "To change this, please adjust limits.conf memlock "
	echo "limit for current user."

	if [ $MEMLOCK_AMNT -lt 65536 ] ; then
		echo ""
		echo "## WARNING: memlock limit is less than 64MB"
		echo -n "## DPDK with VFIO may not be able to initialize "
		echo "if run as current user."
	fi
}
|
|
|
|
# Give NVMe, I/OAT and Virtio PCI devices back to their kernel drivers.
# NVMe devices go to "nvme" and I/OAT devices to "ioatdma" when
# check_for_driver reports the driver as present; otherwise they are only
# unbound. Virtio devices are always rebound to virtio-pci. Devices rejected
# by pci_can_bind are skipped.
# Uses the globals bdf and blkname as loop state.
# Device id lists are read through command substitution, so no temporary
# file is left behind if a bind step aborts the script under "set -e".
function reset_linux_pci {
	local pci_ids_header="$rootdir/include/spdk/pci_ids.h"
	local dev_id driver_loaded

	# NVMe
	# check_for_driver signals "present" with a non-zero status, so capture
	# it without tripping errexit.
	driver_loaded=0
	check_for_driver nvme || driver_loaded=$?
	for bdf in $(iter_pci_class_code 01 08 02); do
		# pci_can_bind returns 0 when the device is NOT whitelisted.
		if pci_can_bind "$bdf"; then
			# Resolve the block device name here; otherwise the message
			# would print an empty or stale blkname.
			blkname=''
			get_nvme_name_from_bdf "$bdf" blkname
			echo "Skipping un-whitelisted NVMe controller $blkname ($bdf)"
			continue
		fi
		if [ $driver_loaded -ne 0 ]; then
			linux_bind_driver "$bdf" nvme
		else
			linux_unbind_driver "$bdf"
		fi
	done

	# IOAT: collect all the device_id info of ioat devices.
	driver_loaded=0
	check_for_driver ioatdma || driver_loaded=$?
	for dev_id in $(grep "PCI_DEVICE_ID_INTEL_IOAT" "$pci_ids_header" \
		| awk -F"x" '{print $2}'); do
		for bdf in $(iter_pci_dev_id 8086 $dev_id); do
			if pci_can_bind "$bdf"; then
				echo "Skipping un-whitelisted I/OAT device at $bdf"
				continue
			fi
			if [ $driver_loaded -ne 0 ]; then
				linux_bind_driver "$bdf" ioatdma
			else
				linux_unbind_driver "$bdf"
			fi
		done
	done

	# virtio: collect all the device_id info of virtio devices.
	# TODO: check if virtio-pci is loaded first and just unbind if it is not loaded
	# Requires some more investigation - for example, some kernels do not seem to have
	# virtio-pci but just virtio_scsi instead. Also need to make sure we get the
	# underscore vs. dash right in the virtio_scsi name.
	modprobe virtio-pci || true
	for dev_id in $(grep "PCI_DEVICE_ID_VIRTIO" "$pci_ids_header" \
		| awk -F"x" '{print $2}'); do
		for bdf in $(iter_pci_dev_id 1af4 $dev_id); do
			if pci_can_bind "$bdf"; then
				echo "Skipping un-whitelisted Virtio device at $bdf"
				continue
			fi
			linux_bind_driver "$bdf" virtio-pci
		done
	done

	echo "1" > "/sys/bus/pci/rescan"
}
|
|
|
|
# "reset" mode on Linux: rebind the PCI devices, then delete files matching
# spdk*map_* from every hugetlbfs mount and /run/.spdk* files.
function reset_linux {
	local hugedir

	reset_linux_pci

	for hugedir in $(linux_hugetlbfs_mounts); do
		rm -f "$hugedir"/spdk*map_*
	done

	rm -f /run/.spdk*
}
|
|
|
|
# "status" mode on Linux: print hugepage counters (per NUMA node when the
# per-node sysfs entries exist, otherwise the system-wide values from
# /proc/meminfo), followed by tables of NVMe, I/OAT and Virtio PCI devices
# with their NUMA node and current driver.
# Reads the globals HUGEPGSZ and rootdir.
function status_linux {
	local pci_ids_header="$rootdir/include/spdk/pci_ids.h"
	local numa_nodes path free_pages all_pages node huge_size
	local bdf driver name dev_id blknames

	echo "Hugepages"
	printf "%-6s %10s %8s / %6s\n" "node" "hugesize" "free" "total"

	numa_nodes=0
	shopt -s nullglob
	# node[0-9]* rather than node? so nodes numbered 10 and above are
	# listed as well; the regex below already accepts multi-digit numbers.
	for path in /sys/devices/system/node/node[0-9]*/hugepages/hugepages-*/; do
		numa_nodes=$((numa_nodes + 1))
		free_pages=$(cat $path/free_hugepages)
		all_pages=$(cat $path/nr_hugepages)

		[[ $path =~ (node[0-9]+)/hugepages/hugepages-([0-9]+kB) ]]

		node=${BASH_REMATCH[1]}
		huge_size=${BASH_REMATCH[2]}

		# Arguments are quoted so an empty value cannot shift the columns.
		printf "%-6s %10s %8s / %6s\n" "$node" "$huge_size" "$free_pages" "$all_pages"
	done
	shopt -u nullglob

	# fall back to system-wide hugepages
	if [ "$numa_nodes" = "0" ]; then
		free_pages=$(grep HugePages_Free /proc/meminfo | awk '{ print $2 }')
		all_pages=$(grep HugePages_Total /proc/meminfo | awk '{ print $2 }')
		node="-"
		huge_size="$HUGEPGSZ"

		printf "%-6s %10s %8s / %6s\n" "$node" "$huge_size" "$free_pages" "$all_pages"
	fi

	echo "NVMe devices"

	echo -e "BDF\t\tNuma Node\tDriver name\t\tDevice name"
	for bdf in $(iter_pci_class_code 01 08 02); do
		driver=$(grep DRIVER /sys/bus/pci/devices/$bdf/uevent | awk -F"=" '{print $2}')
		node=$(cat /sys/bus/pci/devices/$bdf/numa_node)
		if [ "$driver" = "nvme" ] && [ -d /sys/bus/pci/devices/$bdf/nvme ]; then
			name="\t"$(ls /sys/bus/pci/devices/$bdf/nvme)
		else
			name="-"
		fi
		echo -e "$bdf\t$node\t\t$driver\t\t$name"
	done

	echo "I/OAT DMA"

	echo -e "BDF\t\tNuma Node\tDriver Name"
	# collect all the device_id info of ioat devices.
	for dev_id in $(grep "PCI_DEVICE_ID_INTEL_IOAT" "$pci_ids_header" \
		| awk -F"x" '{print $2}'); do
		for bdf in $(iter_pci_dev_id 8086 $dev_id); do
			driver=$(grep DRIVER /sys/bus/pci/devices/$bdf/uevent | awk -F"=" '{print $2}')
			node=$(cat /sys/bus/pci/devices/$bdf/numa_node)
			echo -e "$bdf\t$node\t\t$driver"
		done
	done

	echo "virtio"

	echo -e "BDF\t\tNuma Node\tDriver Name\t\tDevice Name"
	# collect all the device_id info of virtio devices.
	for dev_id in $(grep "PCI_DEVICE_ID_VIRTIO" "$pci_ids_header" \
		| awk -F"x" '{print $2}'); do
		for bdf in $(iter_pci_dev_id 1af4 $dev_id); do
			driver=$(grep DRIVER /sys/bus/pci/devices/$bdf/uevent | awk -F"=" '{print $2}')
			node=$(cat /sys/bus/pci/devices/$bdf/numa_node)
			blknames=''
			get_virtio_names_from_bdf "$bdf" blknames
			echo -e "$bdf\t$node\t\t$driver\t\t$blknames"
		done
	done
}
|
|
|
|
# Hand NVMe and I/OAT PCI devices to the FreeBSD nic_uio driver.
# Builds a grep pattern that matches NVMe devices (class=0x010802) plus every
# Intel I/OAT device id found in pci_ids.h, turns the matching "pciconf -l"
# lines into a comma-separated list made of colon-separated fields 2-4 of
# each line, stores the list in the hw.nic_uio.bdfs kernel environment
# variable and reloads nic_uio.ko.
# No temporary file is used, so nothing is left behind when kldload fails and
# aborts the script under "set -e".
function configure_freebsd_pci {
	local grep_str dev_id bdfs

	# NVMe
	grep_str="class=0x010802"

	# IOAT
	for dev_id in $(grep "PCI_DEVICE_ID_INTEL_IOAT" "$rootdir/include/spdk/pci_ids.h" \
		| awk -F"x" '{print $2}'); do
		grep_str="${grep_str}\|chip=0x${dev_id}8086"
	done

	bdfs=$(pciconf -l | grep "${grep_str}" \
		| awk -F: '{ if (count > 0) printf ","; printf "%s:%s:%s", $2, $3, $4; count++ }')

	# The module may not be loaded yet; a failing unload is fine.
	kldunload nic_uio.ko || true
	kenv hw.nic_uio.bdfs="$bdfs"
	kldload nic_uio.ko
}
|
|
|
|
# "config" mode on FreeBSD: bind the PCI devices, then make sure contigmem is
# loaded with HUGEMEM / 256 buffers of 256 MiB each.
# An already loaded contigmem is kept as it is when its
# hw.contigmem.num_buffers setting equals the requested buffer count; it is
# unloaded and reloaded with the new count only when the two differ.
function configure_freebsd {
	configure_freebsd_pci

	local want_buffers=$((HUGEMEM / 256))

	# Loaded, but sized for a different HUGEMEM: drop it so it is reloaded below.
	if kldstat -q -m contigmem \
		&& [ $(kenv hw.contigmem.num_buffers) -ne "$want_buffers" ]; then
		kldunload contigmem.ko
	fi

	if ! kldstat -q -m contigmem; then
		kenv hw.contigmem.num_buffers=$want_buffers
		kenv hw.contigmem.buffer_size=$((256 * 1024 * 1024))
		kldload contigmem.ko
	fi
}
|
|
|
|
# "reset" mode on FreeBSD: unload contigmem.ko, then nic_uio.ko.
# A failing unload of either module is ignored.
function reset_freebsd {
	local kmod

	for kmod in contigmem.ko nic_uio.ko; do
		kldunload "$kmod" || true
	done
}
|
|
|
|
# Entry point: pick the mode, apply defaults to the environment knobs and
# dispatch to the handler for the current OS.

# Mode defaults to "config" when no (or an empty) argument is given.
mode=${1:-config}

: ${HUGEMEM:=2048}
: ${PCI_WHITELIST:=""}

# NVME_WHITELIST entries are appended to PCI_WHITELIST.
if [ -n "$NVME_WHITELIST" ]; then
	PCI_WHITELIST="$PCI_WHITELIST $NVME_WHITELIST"
fi

# A non-empty SKIP_PCI replaces the whitelist with the single entry "none".
if [ -n "$SKIP_PCI" ]; then
	PCI_WHITELIST="none"
fi

# Split the whitespace separated list into an array; the expansion is left
# unquoted on purpose so that word splitting happens here.
declare -a PCI_WHITELIST=(${PCI_WHITELIST})

# TARGET_USER falls back to SUDO_USER, then to the output of logname.
if [ -z "$TARGET_USER" ]; then
	TARGET_USER="$SUDO_USER"
	if [ -z "$TARGET_USER" ]; then
		TARGET_USER=$(logname 2>/dev/null) || true
	fi
fi

if [ "$(uname)" = Linux ]; then
	# Hugepagesize from /proc/meminfo, reduced to its digits.
	HUGEPGSZ=$(( $(grep Hugepagesize /proc/meminfo | cut -d : -f 2 | tr -dc '0-9') ))
	HUGEPGSZ_MB=$(( $HUGEPGSZ / 1024 ))
	# Only derived from HUGEMEM (rounding up) when NRHUGE is unset.
	: ${NRHUGE=$(( (HUGEMEM + HUGEPGSZ_MB - 1) / HUGEPGSZ_MB ))}

	case "$mode" in
		config) configure_linux ;;
		reset) reset_linux ;;
		status) status_linux ;;
		help) usage "$0" ;;
		# $0 is quoted so a script path containing whitespace cannot push
		# the message out of usage's second argument.
		*) usage "$0" "Invalid argument '$mode'" ;;
	esac
else
	case "$mode" in
		config) configure_freebsd ;;
		reset) reset_freebsd ;;
		help) usage "$0" ;;
		*) usage "$0" "Invalid argument '$mode'" ;;
	esac
fi
|