freebsd-dev/sys/dev/ena/ena_sysctl.c
Marcin Wojtas eb4c4f4a2e ena: merge ena-com v2.5.0 upgrade
Merge commit '2530eb1fa01bf28fbcfcdda58bd41e055dcb2e4a'

Adjust the driver to the upgraded ena-com part twofold:

First update is related to the driver's NUMA awareness.

Allocate I/O queue memory in NUMA domain local to the CPU bound to the
given queue, improving data access time. Since this can result in
performance hit for unaware users, this is done only when RSS
option is enabled, for other cases the driver relies on kernel to
allocate memory by itself.

Information about first CPU bound is saved in adapter structure, so
the binding persists after bringing the interface down and up again.

If there are more buckets than interface queues, the driver will try to
bind different interfaces to different CPUs using round-robin algorithm
(but it will not bind queues to CPUs which do not have any RSS buckets
associated with them). This is done to better utilize hardware
resources by spreading the load.

Add (read-only) per-queue sysctls in order to provide the following
information:
- queueN.domain: NUMA domain associated with the queue
- queueN.cpu:    CPU affinity of the queue

The second change is for the CSUM_OFFLOAD constant, as ENA platform
file has removed its definition. To align to that change, it has been
added to the ena_datapath.h file.

Submitted by: Artur Rojek <ar@semihalf.com>
Submitted by: Dawid Gorecki <dgr@semihalf.com>
Obtained from: Semihalf
MFC after: 2 weeks
Sponsored by: Amazon, Inc.
2022-01-23 20:27:13 +01:00

929 lines
28 KiB
C

/*-
* SPDX-License-Identifier: BSD-2-Clause
*
* Copyright (c) 2015-2021 Amazon.com, Inc. or its affiliates.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
#include <sys/param.h>
__FBSDID("$FreeBSD$");
#include "opt_rss.h"
#include "ena_sysctl.h"
#include "ena_rss.h"
/* Registration helpers, one per sysctl group; called by ena_sysctl_add_nodes(). */
static void ena_sysctl_add_wd(struct ena_adapter *);
static void ena_sysctl_add_stats(struct ena_adapter *);
static void ena_sysctl_add_eni_metrics(struct ena_adapter *);
static void ena_sysctl_add_tuneables(struct ena_adapter *);
/* Kernel option RSS prevents manipulation of key hash and indirection table. */
#ifndef RSS
static void ena_sysctl_add_rss(struct ena_adapter *);
#endif
/* Handlers backing the SYSCTL_ADD_PROC() entries registered in this file. */
static int ena_sysctl_buf_ring_size(SYSCTL_HANDLER_ARGS);
static int ena_sysctl_rx_queue_size(SYSCTL_HANDLER_ARGS);
static int ena_sysctl_io_queues_nb(SYSCTL_HANDLER_ARGS);
static int ena_sysctl_eni_metrics_interval(SYSCTL_HANDLER_ARGS);
#ifndef RSS
static int ena_sysctl_rss_key(SYSCTL_HANDLER_ARGS);
static int ena_sysctl_rss_indir_table(SYSCTL_HANDLER_ARGS);
#endif
/* Upper bound, in seconds, for the ENI metrics sample interval (one hour). */
#define ENI_METRICS_MAX_SAMPLE_INTERVAL 3600
/* Text form of the RSS key: two hex characters per key byte plus the NUL. */
#define ENA_HASH_KEY_MSG_SIZE (ENA_HASH_KEY_SIZE * 2 + 1)
/* Root of the driver-wide (not per-device) knobs: hw.ena.* */
static SYSCTL_NODE(_hw, OID_AUTO, ena, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
"ENA driver parameters");
/*
 * Logging level controlling the verbosity of the driver output.
 * Writable at runtime and settable as a loader tunable.
 */
int ena_log_level = ENA_INFO;
SYSCTL_INT(_hw_ena, OID_AUTO, log_level, CTLFLAG_RWTUN,
&ena_log_level, 0, "Logging level indicating verbosity of the logs");
SYSCTL_CONST_STRING(_hw_ena, OID_AUTO, driver_version, CTLFLAG_RD,
DRV_MODULE_VERSION, "ENA driver version");
/*
 * Use 9k mbufs for the Rx buffers. Defaults to 0 (use page size mbufs instead).
 * Using 9k mbufs in low memory conditions might cause allocations to take a
 * long time and lead to OS instability, as the allocator needs to find
 * contiguous pages.
 * However, page size mbufs have slightly lower throughput than 9k mbufs, so
 * if network performance is the priority, 9k mbufs can be used.
 */
int ena_enable_9k_mbufs = 0;
SYSCTL_INT(_hw_ena, OID_AUTO, enable_9k_mbufs, CTLFLAG_RDTUN,
&ena_enable_9k_mbufs, 0, "Use 9 kB mbufs for Rx descriptors");
/*
 * Force the driver to use the large LLQ (Low Latency Queue) header. Defaults
 * to false. This option may be important for platforms which often handle
 * packets whose total Tx header size is greater than 96B, as it may reduce
 * the latency.
 * It also reduces the maximum Tx queue size by half, so it may cause more Tx
 * packet drops.
 */
bool ena_force_large_llq_header = false;
SYSCTL_BOOL(_hw_ena, OID_AUTO, force_large_llq_header, CTLFLAG_RDTUN,
&ena_force_large_llq_header, 0,
"Increases maximum supported header size in LLQ mode to 224 bytes, while reducing the maximum Tx queue size by half.\n");
/* Backing storage for the read-only `rss.indir_table_size` sysctl. */
int ena_rss_table_size = ENA_RX_RSS_TABLE_SIZE;
/*
 * Register all of the driver's per-device sysctl groups for `adapter`:
 * watchdog knobs, statistics, ENI metrics and tuneables. The RSS group is
 * registered only when the kernel is built without the RSS option.
 */
void
ena_sysctl_add_nodes(struct ena_adapter *adapter)
{
ena_sysctl_add_wd(adapter);
ena_sysctl_add_stats(adapter);
ena_sysctl_add_eni_metrics(adapter);
ena_sysctl_add_tuneables(adapter);
#ifndef RSS
ena_sysctl_add_rss(adapter);
#endif
}
/*
 * Publish the watchdog configuration fields of the adapter directly under
 * the device's sysctl tree. Every entry is read-write and tunable.
 */
static void
ena_sysctl_add_wd(struct ena_adapter *adapter)
{
	device_t pdev = adapter->pdev;
	struct sysctl_ctx_list *sctx = device_get_sysctl_ctx(pdev);
	struct sysctl_oid *root = device_get_sysctl_tree(pdev);
	struct sysctl_oid_list *root_list = SYSCTL_CHILDREN(root);

	/* Watchdog on/off switch. */
	SYSCTL_ADD_INT(sctx, root_list, OID_AUTO, "wd_active",
	    CTLFLAG_RWTUN, &adapter->wd_active, 0,
	    "Watchdog is active");

	/* Timeouts used by the watchdog service. */
	SYSCTL_ADD_QUAD(sctx, root_list, OID_AUTO, "keep_alive_timeout",
	    CTLFLAG_RWTUN, &adapter->keep_alive_timeout,
	    "Timeout for Keep Alive messages");
	SYSCTL_ADD_QUAD(sctx, root_list, OID_AUTO, "missing_tx_timeout",
	    CTLFLAG_RWTUN, &adapter->missing_tx_timeout,
	    "Timeout for TX completion");

	/* Limits applied to the missing Tx completion check. */
	SYSCTL_ADD_U32(sctx, root_list, OID_AUTO, "missing_tx_max_queues",
	    CTLFLAG_RWTUN, &adapter->missing_tx_max_queues, 0,
	    "Number of TX queues to check per run");
	SYSCTL_ADD_U32(sctx, root_list, OID_AUTO, "missing_tx_threshold",
	    CTLFLAG_RWTUN, &adapter->missing_tx_threshold, 0,
	    "Max number of timeouted packets");
}
/*
 * Register the read-only statistics of the adapter:
 *  - device-level counters directly under the device node,
 *  - one `queueN` node per currently configured IO queue, holding `tx_ring`
 *    and `rx_ring` counter sub-nodes (plus `cpu`/`domain` when the kernel is
 *    built with the RSS option),
 *  - `hw_stats` counters,
 *  - `admin_stats` values taken from the ena_com admin queue.
 */
static void
ena_sysctl_add_stats(struct ena_adapter *adapter)
{
device_t dev;
struct ena_ring *tx_ring;
struct ena_ring *rx_ring;
struct ena_hw_stats *hw_stats;
struct ena_stats_dev *dev_stats;
struct ena_stats_tx *tx_stats;
struct ena_stats_rx *rx_stats;
struct ena_com_stats_admin *admin_stats;
struct sysctl_ctx_list *ctx;
struct sysctl_oid *tree;
struct sysctl_oid_list *child;
struct sysctl_oid *queue_node, *tx_node, *rx_node, *hw_node;
struct sysctl_oid *admin_node;
struct sysctl_oid_list *queue_list, *tx_list, *rx_list, *hw_list;
struct sysctl_oid_list *admin_list;
/* Large enough for "queue" followed by any decimal int. */
#define QUEUE_NAME_LEN 32
char namebuf[QUEUE_NAME_LEN];
int i;
dev = adapter->pdev;
ctx = device_get_sysctl_ctx(dev);
tree = device_get_sysctl_tree(dev);
child = SYSCTL_CHILDREN(tree);
/* Start at the first Tx/Rx ring; both are advanced together in the loop. */
tx_ring = adapter->tx_ring;
rx_ring = adapter->rx_ring;
hw_stats = &adapter->hw_stats;
dev_stats = &adapter->dev_stats;
admin_stats = &adapter->ena_dev->admin_queue.stats;
/* Device-level counters */
SYSCTL_ADD_COUNTER_U64(ctx, child, OID_AUTO, "wd_expired",
CTLFLAG_RD, &dev_stats->wd_expired,
"Watchdog expiry count");
SYSCTL_ADD_COUNTER_U64(ctx, child, OID_AUTO, "interface_up",
CTLFLAG_RD, &dev_stats->interface_up,
"Network interface up count");
SYSCTL_ADD_COUNTER_U64(ctx, child, OID_AUTO, "interface_down",
CTLFLAG_RD, &dev_stats->interface_down,
"Network interface down count");
SYSCTL_ADD_COUNTER_U64(ctx, child, OID_AUTO, "admin_q_pause",
CTLFLAG_RD, &dev_stats->admin_q_pause,
"Admin queue pauses");
/* Per-queue nodes: queue0 .. queue<num_io_queues - 1> */
for (i = 0; i < adapter->num_io_queues; ++i, ++tx_ring, ++rx_ring) {
snprintf(namebuf, QUEUE_NAME_LEN, "queue%d", i);
queue_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO,
namebuf, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Queue Name");
queue_list = SYSCTL_CHILDREN(queue_node);
/*
 * Remember the node so that ena_sysctl_update_queue_node_nb() can
 * unregister/register it when the number of IO queues changes.
 */
adapter->que[i].oid = queue_node;
#ifdef RSS
/* Common stats */
SYSCTL_ADD_INT(ctx, queue_list, OID_AUTO, "cpu",
CTLFLAG_RD, &adapter->que[i].cpu, 0, "CPU affinity");
SYSCTL_ADD_INT(ctx, queue_list, OID_AUTO, "domain",
CTLFLAG_RD, &adapter->que[i].domain, 0, "NUMA domain");
#endif
/* TX specific stats */
tx_node = SYSCTL_ADD_NODE(ctx, queue_list, OID_AUTO,
"tx_ring", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "TX ring");
tx_list = SYSCTL_CHILDREN(tx_node);
tx_stats = &tx_ring->tx_stats;
SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
"count", CTLFLAG_RD,
&tx_stats->cnt, "Packets sent");
SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
"bytes", CTLFLAG_RD,
&tx_stats->bytes, "Bytes sent");
SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
"prepare_ctx_err", CTLFLAG_RD,
&tx_stats->prepare_ctx_err,
"TX buffer preparation failures");
SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
"dma_mapping_err", CTLFLAG_RD,
&tx_stats->dma_mapping_err, "DMA mapping failures");
SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
"doorbells", CTLFLAG_RD,
&tx_stats->doorbells, "Queue doorbells");
SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
"missing_tx_comp", CTLFLAG_RD,
&tx_stats->missing_tx_comp, "TX completions missed");
SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
"bad_req_id", CTLFLAG_RD,
&tx_stats->bad_req_id, "Bad request id count");
SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
"mbuf_collapses", CTLFLAG_RD,
&tx_stats->collapse,
"Mbuf collapse count");
SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
"mbuf_collapse_err", CTLFLAG_RD,
&tx_stats->collapse_err,
"Mbuf collapse failures");
SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
"queue_wakeups", CTLFLAG_RD,
&tx_stats->queue_wakeup, "Queue wakeups");
SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
"queue_stops", CTLFLAG_RD,
&tx_stats->queue_stop, "Queue stops");
SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
"llq_buffer_copy", CTLFLAG_RD,
&tx_stats->llq_buffer_copy,
"Header copies for llq transaction");
SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
"unmask_interrupt_num", CTLFLAG_RD,
&tx_stats->unmask_interrupt_num,
"Unmasked interrupt count");
/* RX specific stats */
rx_node = SYSCTL_ADD_NODE(ctx, queue_list, OID_AUTO,
"rx_ring", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "RX ring");
rx_list = SYSCTL_CHILDREN(rx_node);
rx_stats = &rx_ring->rx_stats;
SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO,
"count", CTLFLAG_RD,
&rx_stats->cnt, "Packets received");
SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO,
"bytes", CTLFLAG_RD,
&rx_stats->bytes, "Bytes received");
SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO,
"refil_partial", CTLFLAG_RD,
&rx_stats->refil_partial, "Partial refilled mbufs");
SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO,
"csum_bad", CTLFLAG_RD,
&rx_stats->csum_bad, "Bad RX checksum");
SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO,
"mbuf_alloc_fail", CTLFLAG_RD,
&rx_stats->mbuf_alloc_fail, "Failed mbuf allocs");
SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO,
"mjum_alloc_fail", CTLFLAG_RD,
&rx_stats->mjum_alloc_fail, "Failed jumbo mbuf allocs");
SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO,
"dma_mapping_err", CTLFLAG_RD,
&rx_stats->dma_mapping_err, "DMA mapping errors");
SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO,
"bad_desc_num", CTLFLAG_RD,
&rx_stats->bad_desc_num, "Bad descriptor count");
SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO,
"bad_req_id", CTLFLAG_RD,
&rx_stats->bad_req_id, "Bad request id count");
SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO,
"empty_rx_ring", CTLFLAG_RD,
&rx_stats->empty_rx_ring, "RX descriptors depletion count");
SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO,
"csum_good", CTLFLAG_RD,
&rx_stats->csum_good, "Valid RX checksum calculations");
}
/* Stats read from device */
hw_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "hw_stats",
CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Statistics from hardware");
hw_list = SYSCTL_CHILDREN(hw_node);
SYSCTL_ADD_COUNTER_U64(ctx, hw_list, OID_AUTO, "rx_packets", CTLFLAG_RD,
&hw_stats->rx_packets, "Packets received");
SYSCTL_ADD_COUNTER_U64(ctx, hw_list, OID_AUTO, "tx_packets", CTLFLAG_RD,
&hw_stats->tx_packets, "Packets transmitted");
SYSCTL_ADD_COUNTER_U64(ctx, hw_list, OID_AUTO, "rx_bytes", CTLFLAG_RD,
&hw_stats->rx_bytes, "Bytes received");
SYSCTL_ADD_COUNTER_U64(ctx, hw_list, OID_AUTO, "tx_bytes", CTLFLAG_RD,
&hw_stats->tx_bytes, "Bytes transmitted");
SYSCTL_ADD_COUNTER_U64(ctx, hw_list, OID_AUTO, "rx_drops", CTLFLAG_RD,
&hw_stats->rx_drops, "Receive packet drops");
SYSCTL_ADD_COUNTER_U64(ctx, hw_list, OID_AUTO, "tx_drops", CTLFLAG_RD,
&hw_stats->tx_drops, "Transmit packet drops");
/* ENA Admin queue stats */
admin_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "admin_stats",
CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "ENA Admin Queue statistics");
admin_list = SYSCTL_CHILDREN(admin_node);
SYSCTL_ADD_U64(ctx, admin_list, OID_AUTO, "aborted_cmd", CTLFLAG_RD,
&admin_stats->aborted_cmd, 0, "Aborted commands");
/*
 * NOTE(review): the node name below is spelled "sumbitted_cmd". It is
 * user-visible, so renaming it would change the sysctl interface; confirm
 * before correcting.
 */
SYSCTL_ADD_U64(ctx, admin_list, OID_AUTO, "sumbitted_cmd", CTLFLAG_RD,
&admin_stats->submitted_cmd, 0, "Submitted commands");
SYSCTL_ADD_U64(ctx, admin_list, OID_AUTO, "completed_cmd", CTLFLAG_RD,
&admin_stats->completed_cmd, 0, "Completed commands");
SYSCTL_ADD_U64(ctx, admin_list, OID_AUTO, "out_of_space", CTLFLAG_RD,
&admin_stats->out_of_space, 0, "Queue out of space");
SYSCTL_ADD_U64(ctx, admin_list, OID_AUTO, "no_completion", CTLFLAG_RD,
&admin_stats->no_completion, 0, "Commands not completed");
}
/*
 * Register the `eni_metrics` node: read-only views of the ENI allowance
 * counters cached in the adapter, plus the writable `sample_interval` knob
 * handled by ena_sysctl_eni_metrics_interval().
 */
static void
ena_sysctl_add_eni_metrics(struct ena_adapter *adapter)
{
	device_t dev;
	struct ena_admin_eni_stats *eni_metrics;

	struct sysctl_ctx_list *ctx;
	struct sysctl_oid *tree;
	struct sysctl_oid_list *child;

	struct sysctl_oid *eni_node;
	struct sysctl_oid_list *eni_list;

	dev = adapter->pdev;

	ctx = device_get_sysctl_ctx(dev);
	tree = device_get_sysctl_tree(dev);
	child = SYSCTL_CHILDREN(tree);

	eni_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "eni_metrics",
	    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "ENA's ENI metrics");
	eni_list = SYSCTL_CHILDREN(eni_node);

	eni_metrics = &adapter->eni_metrics;

	SYSCTL_ADD_U64(ctx, eni_list, OID_AUTO, "bw_in_allowance_exceeded",
	    CTLFLAG_RD, &eni_metrics->bw_in_allowance_exceeded, 0,
	    "Inbound BW allowance exceeded");
	SYSCTL_ADD_U64(ctx, eni_list, OID_AUTO, "bw_out_allowance_exceeded",
	    CTLFLAG_RD, &eni_metrics->bw_out_allowance_exceeded, 0,
	    "Outbound BW allowance exceeded");
	SYSCTL_ADD_U64(ctx, eni_list, OID_AUTO, "pps_allowance_exceeded",
	    CTLFLAG_RD, &eni_metrics->pps_allowance_exceeded, 0,
	    "PPS allowance exceeded");
	SYSCTL_ADD_U64(ctx, eni_list, OID_AUTO, "conntrack_allowance_exceeded",
	    CTLFLAG_RD, &eni_metrics->conntrack_allowance_exceeded, 0,
	    "Connection tracking allowance exceeded");
	SYSCTL_ADD_U64(ctx, eni_list, OID_AUTO, "linklocal_allowance_exceeded",
	    CTLFLAG_RD, &eni_metrics->linklocal_allowance_exceeded, 0,
	    "Linklocal packet rate allowance exceeded");

	/*
	 * Tuneable, which determines how often ENI metrics will be read.
	 * 0 means it's turned off. Maximum allowed value is limited by:
	 * ENI_METRICS_MAX_SAMPLE_INTERVAL.
	 */
	SYSCTL_ADD_PROC(ctx, eni_list, OID_AUTO, "sample_interval",
	    CTLTYPE_U16 | CTLFLAG_RW | CTLFLAG_MPSAFE, adapter, 0,
	    ena_sysctl_eni_metrics_interval, "SU",
	    "Interval in seconds for updating ENI metrics. 0 turns off the update.");
}
/*
 * Register the runtime-adjustable sizing knobs of the adapter. Each entry is
 * backed by a handler procedure which receives the adapter as its argument.
 */
static void
ena_sysctl_add_tuneables(struct ena_adapter *adapter)
{
	device_t pdev = adapter->pdev;
	struct sysctl_ctx_list *sctx = device_get_sysctl_ctx(pdev);
	struct sysctl_oid *root = device_get_sysctl_tree(pdev);
	struct sysctl_oid_list *root_list = SYSCTL_CHILDREN(root);

	/* Number of buffers in the Tx buf-ring (drbr). */
	SYSCTL_ADD_PROC(sctx, root_list, OID_AUTO, "buf_ring_size",
	    CTLTYPE_U32 | CTLFLAG_RW | CTLFLAG_MPSAFE, adapter, 0,
	    ena_sysctl_buf_ring_size, "I",
	    "Size of the Tx buffer ring (drbr).");

	/* Number of descriptors in each Rx ring. */
	SYSCTL_ADD_PROC(sctx, root_list, OID_AUTO, "rx_queue_size",
	    CTLTYPE_U32 | CTLFLAG_RW | CTLFLAG_MPSAFE, adapter, 0,
	    ena_sysctl_rx_queue_size, "I",
	    "Size of the Rx ring. The size should be a power of 2.");

	/* Number of IO queues in use. */
	SYSCTL_ADD_PROC(sctx, root_list, OID_AUTO, "io_queues_nb",
	    CTLTYPE_U32 | CTLFLAG_RW | CTLFLAG_MPSAFE, adapter, 0,
	    ena_sysctl_io_queues_nb, "I", "Number of IO queues.");
}
/* Kernel option RSS prevents manipulation of key hash and indirection table. */
#ifndef RSS
/*
 * Register the `rss` node with the hash key and indirection table knobs, and
 * a read-only entry exposing the indirection table size.
 */
static void
ena_sysctl_add_rss(struct ena_adapter *adapter)
{
	device_t pdev = adapter->pdev;
	struct sysctl_ctx_list *sctx = device_get_sysctl_ctx(pdev);
	struct sysctl_oid *root = device_get_sysctl_tree(pdev);
	struct sysctl_oid_list *root_list = SYSCTL_CHILDREN(root);
	struct sysctl_oid *rss_node;
	struct sysctl_oid_list *rss_list;

	/* Parent node for all RSS options. */
	rss_node = SYSCTL_ADD_NODE(sctx, root_list, OID_AUTO, "rss",
	    CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "Receive Side Scaling options.");
	rss_list = SYSCTL_CHILDREN(rss_node);

	/* Hash key, presented as a hex string. */
	SYSCTL_ADD_PROC(sctx, rss_list, OID_AUTO, "key",
	    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, adapter, 0,
	    ena_sysctl_rss_key, "A", "RSS key.");

	/* Indirection table, presented as a string. */
	SYSCTL_ADD_PROC(sctx, rss_list, OID_AUTO, "indir_table",
	    CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, adapter, 0,
	    ena_sysctl_rss_indir_table, "A", "RSS indirection table.");

	/* Indirection table size, read-only. */
	SYSCTL_ADD_INT(sctx, rss_list, OID_AUTO, "indir_table_size",
	    CTLFLAG_RD | CTLFLAG_MPSAFE, &ena_rss_table_size, 0,
	    "RSS indirection table size.");
}
#endif /* RSS */
/*
 * ena_sysctl_update_queue_node_nb - Register/unregister sysctl queue nodes.
 *
 * Whether the nodes are registered or unregistered depends on a delta between
 * the `old` and `new` parameters, representing the number of queues: nodes
 * with an index in [MIN(old, new), MAX(old, new)) are unregistered when the
 * count shrinks and registered when it grows. The upper bound is additionally
 * clamped to adapter->max_num_io_queues.
 *
 * This function is used to hide sysctl attributes for queue nodes which aren't
 * currently used by the HW (e.g. after a call to `ena_sysctl_io_queues_nb`).
 *
 * NOTE:
 * All unregistered nodes must be registered again at detach, i.e. by a call to
 * this function.
 */
void
ena_sysctl_update_queue_node_nb(struct ena_adapter *adapter, int old, int new)
{
	struct sysctl_oid *oid;
	int first, last, i;

	first = MIN(old, new);
	last = MIN(MAX(old, new), adapter->max_num_io_queues);

	for (i = first; i < last; ++i) {
		oid = adapter->que[i].oid;

		/* The sysctl write lock is taken per node, not for the whole run. */
		sysctl_wlock();
		if (old > new)
			sysctl_unregister_oid(oid);
		else
			sysctl_register_oid(oid);
		sysctl_wunlock();
	}
}
/*
 * Sysctl handler for `buf_ring_size`.
 *
 * Reports the current Tx buffer ring size and, on write, applies a new one
 * through ena_update_buf_ring_size(). The new size must be a non-zero power
 * of 2.
 *
 * Returns 0 on success, EINVAL when the device is not running or the value
 * is rejected, or the error reported by the sysctl helpers / the update.
 */
static int
ena_sysctl_buf_ring_size(SYSCTL_HANDLER_ARGS)
{
	struct ena_adapter *adapter = arg1;
	uint32_t val;
	int error;

	ENA_LOCK_LOCK();
	if (unlikely(!ENA_FLAG_ISSET(ENA_FLAG_DEVICE_RUNNING, adapter))) {
		error = EINVAL;
		goto unlock;
	}

	val = 0;
	error = sysctl_wire_old_buffer(req, sizeof(val));
	if (error == 0) {
		val = adapter->buf_ring_size;
		error = sysctl_handle_32(oidp, &val, 0, req);
	}
	/* Nothing more to do on failure or for a read-only request. */
	if (error != 0 || req->newptr == NULL)
		goto unlock;

	/* powerof2() accepts 0, so zero has to be rejected explicitly. */
	if (!powerof2(val) || val == 0) {
		ena_log(adapter->pdev, ERR,
		    "Requested new Tx buffer ring size (%u) is not a power of 2\n",
		    val);
		error = EINVAL;
		goto unlock;
	}

	if (val != adapter->buf_ring_size) {
		/* Sizes are unsigned; print them as such. */
		ena_log(adapter->pdev, INFO,
		    "Requested new Tx buffer ring size: %u. Old size: %u\n",
		    val, adapter->buf_ring_size);
		error = ena_update_buf_ring_size(adapter, val);
	} else {
		ena_log(adapter->pdev, ERR,
		    "New Tx buffer ring size is the same as already used: %u\n",
		    adapter->buf_ring_size);
	}

unlock:
	ENA_LOCK_UNLOCK();

	return (error);
}
static int
ena_sysctl_rx_queue_size(SYSCTL_HANDLER_ARGS)
{
struct ena_adapter *adapter = arg1;
uint32_t val;
int error;
ENA_LOCK_LOCK();
if (unlikely(!ENA_FLAG_ISSET(ENA_FLAG_DEVICE_RUNNING, adapter))) {
error = EINVAL;
goto unlock;
}
val = 0;
error = sysctl_wire_old_buffer(req, sizeof(val));
if (error == 0) {
val = adapter->requested_rx_ring_size;
error = sysctl_handle_32(oidp, &val, 0, req);
}
if (error != 0 || req->newptr == NULL)
goto unlock;
if (val < ENA_MIN_RING_SIZE || val > adapter->max_rx_ring_size) {
ena_log(adapter->pdev, ERR,
"Requested new Rx queue size (%u) is out of range: [%u, %u]\n",
val, ENA_MIN_RING_SIZE, adapter->max_rx_ring_size);
error = EINVAL;
goto unlock;
}
/* Check if the parameter is power of 2 */
if (!powerof2(val)) {
ena_log(adapter->pdev, ERR,
"Requested new Rx queue size (%u) is not a power of 2\n",
val);
error = EINVAL;
goto unlock;
}
if (val != adapter->requested_rx_ring_size) {
ena_log(adapter->pdev, INFO,
"Requested new Rx queue size: %u. Old size: %u\n",
val, adapter->requested_rx_ring_size);
error = ena_update_queue_size(adapter,
adapter->requested_tx_ring_size, val);
} else {
ena_log(adapter->pdev, ERR,
"New Rx queue size is the same as already used: %u\n",
adapter->requested_rx_ring_size);
}
unlock:
ENA_LOCK_UNLOCK();
return (error);
}
/*
 * Change number of effectively used IO queues adapter->num_io_queues
 *
 * Sysctl handler for `io_queues_nb`. Reports the current number of IO queues
 * and, on write, applies a new count through ena_update_io_queue_nb() and
 * then registers/unregisters the matching `queueN` sysctl nodes.
 *
 * Returns 0 on success, EINVAL when the device is not running or the value
 * is zero or above the available IO MSI-X vectors, or the error reported by
 * the sysctl helpers / the update. The ENA lock is released on every path.
 */
static int
ena_sysctl_io_queues_nb(SYSCTL_HANDLER_ARGS)
{
	struct ena_adapter *adapter = arg1;
	uint32_t old_num_queues, tmp = 0;
	int error;

	ENA_LOCK_LOCK();
	if (unlikely(!ENA_FLAG_ISSET(ENA_FLAG_DEVICE_RUNNING, adapter))) {
		error = EINVAL;
		goto unlock;
	}

	error = sysctl_wire_old_buffer(req, sizeof(tmp));
	if (error == 0) {
		tmp = adapter->num_io_queues;
		error = sysctl_handle_int(oidp, &tmp, 0, req);
	}
	/* Nothing more to do on failure or for a read-only request. */
	if (error != 0 || req->newptr == NULL)
		goto unlock;

	if (tmp == 0) {
		ena_log(adapter->pdev, ERR,
		    "Requested number of IO queues is zero\n");
		error = EINVAL;
		goto unlock;
	}

	/*
	 * The adapter::max_num_io_queues is the HW capability. The system
	 * resources availability may potentially be a tighter limit. Therefore
	 * the relation `adapter::max_num_io_queues >= adapter::msix_vecs`
	 * always holds true, while the `adapter::msix_vecs` is variable across
	 * device reset (`ena_destroy_device()` + `ena_restore_device()`).
	 */
	if (tmp > (adapter->msix_vecs - ENA_ADMIN_MSIX_VEC)) {
		ena_log(adapter->pdev, ERR,
		    "Requested number of IO queues is higher than maximum "
		    "allowed (%u)\n", adapter->msix_vecs - ENA_ADMIN_MSIX_VEC);
		error = EINVAL;
		goto unlock;
	}

	if (tmp == adapter->num_io_queues) {
		ena_log(adapter->pdev, ERR,
		    "Requested number of IO queues is equal to current value "
		    "(%u)\n", adapter->num_io_queues);
	} else {
		ena_log(adapter->pdev, INFO,
		    "Requested new number of IO queues: %u, current value: "
		    "%u\n", tmp, adapter->num_io_queues);

		old_num_queues = adapter->num_io_queues;
		error = ena_update_io_queue_nb(adapter, tmp);
		/*
		 * Leave through the common exit so the ENA lock is dropped;
		 * the sysctl queue nodes are left untouched on failure.
		 */
		if (error != 0)
			goto unlock;

		ena_sysctl_update_queue_node_nb(adapter, old_num_queues, tmp);
	}

unlock:
	ENA_LOCK_UNLOCK();

	return (error);
}
/*
 * Sysctl handler for `eni_metrics.sample_interval`.
 *
 * Reports the current ENI metrics sample interval (seconds) and, on write,
 * stores a new one. The value may not exceed ENI_METRICS_MAX_SAMPLE_INTERVAL;
 * writing 0 turns the update off and clears the cached metrics.
 *
 * Returns 0 on success, EINVAL when the device is not running or the value
 * is out of range, or the error reported by the sysctl helpers.
 */
static int
ena_sysctl_eni_metrics_interval(SYSCTL_HANDLER_ARGS)
{
	struct ena_adapter *adapter = arg1;
	uint16_t interval = 0;
	int error;

	ENA_LOCK_LOCK();
	if (unlikely(!ENA_FLAG_ISSET(ENA_FLAG_DEVICE_RUNNING, adapter))) {
		error = EINVAL;
		goto unlock;
	}

	error = sysctl_wire_old_buffer(req, sizeof(interval));
	if (error == 0) {
		interval = adapter->eni_metrics_sample_interval;
		error = sysctl_handle_16(oidp, &interval, 0, req);
	}
	/* Nothing more to do on failure or for a read-only request. */
	if (error != 0 || req->newptr == NULL)
		goto unlock;

	if (interval > ENI_METRICS_MAX_SAMPLE_INTERVAL) {
		ena_log(adapter->pdev, ERR,
		    "ENI metrics update interval is out of range - maximum allowed value: %d seconds\n",
		    ENI_METRICS_MAX_SAMPLE_INTERVAL);
		error = EINVAL;
		goto unlock;
	}

	if (interval == 0) {
		ena_log(adapter->pdev, INFO,
		    "ENI metrics update is now turned off\n");
		/* Drop stale values so they are not reported while disabled. */
		bzero(&adapter->eni_metrics, sizeof(adapter->eni_metrics));
	} else {
		ena_log(adapter->pdev, INFO,
		    "ENI metrics update interval is set to: %"PRIu16" seconds\n",
		    interval);
	}

	adapter->eni_metrics_sample_interval = interval;

unlock:
	ENA_LOCK_UNLOCK();

	/* Propagate failures to the caller instead of reporting success. */
	return (error);
}
#ifndef RSS
/*
* Change the Receive Side Scaling hash key.
*/
static int
ena_sysctl_rss_key(SYSCTL_HANDLER_ARGS)
{
struct ena_adapter *adapter = arg1;
struct ena_com_dev *ena_dev = adapter->ena_dev;
enum ena_admin_hash_functions ena_func;
char msg[ENA_HASH_KEY_MSG_SIZE];
char elem[3] = { 0 };
char *endp;
u8 rss_key[ENA_HASH_KEY_SIZE];
int error, i;
ENA_LOCK_LOCK();
if (unlikely(!ENA_FLAG_ISSET(ENA_FLAG_DEVICE_RUNNING, adapter))) {
error = EINVAL;
goto unlock;
}
if (unlikely(!ENA_FLAG_ISSET(ENA_FLAG_RSS_ACTIVE, adapter))) {
error = ENOTSUP;
goto unlock;
}
error = sysctl_wire_old_buffer(req, sizeof(msg));
if (error != 0)
goto unlock;
error = ena_com_get_hash_function(adapter->ena_dev, &ena_func);
if (error != 0) {
device_printf(adapter->pdev, "Cannot get hash function\n");
goto unlock;
}
if (ena_func != ENA_ADMIN_TOEPLITZ) {
error = EINVAL;
device_printf(adapter->pdev, "Unsupported hash algorithm\n");
goto unlock;
}
error = ena_rss_get_hash_key(ena_dev, rss_key);
if (error != 0) {
device_printf(adapter->pdev, "Cannot get hash key\n");
goto unlock;
}
for (i = 0; i < ENA_HASH_KEY_SIZE; ++i)
snprintf(&msg[i * 2], 3, "%02x", rss_key[i]);
error = sysctl_handle_string(oidp, msg, sizeof(msg), req);
if (error != 0 || req->newptr == NULL)
goto unlock;
if (strlen(msg) != sizeof(msg) - 1) {
error = EINVAL;
device_printf(adapter->pdev, "Invalid key size\n");
goto unlock;
}
for (i = 0; i < ENA_HASH_KEY_SIZE; ++i) {
strncpy(elem, &msg[i * 2], 2);
rss_key[i] = strtol(elem, &endp, 16);
/* Both hex nibbles in the string must be valid to continue. */
if (endp == elem || *endp != '\0' || rss_key[i] < 0) {
error = EINVAL;
device_printf(adapter->pdev,
"Invalid key hex value: '%c'\n", *endp);
goto unlock;
}
}
error = ena_rss_set_hash(ena_dev, rss_key);
if (error != 0)
device_printf(adapter->pdev, "Cannot fill hash key\n");
unlock:
ENA_LOCK_UNLOCK();
return (error);
}
/*
 * Change the Receive Side Scaling indirection table.
 *
 * The sysctl entry string consists of one or more `x:y` keypairs, where
 * x stands for the table index and y for its new value.
 * Table indices that don't need to be updated can be omitted from the string
 * and will retain their existing values. If an index is entered more than once,
 * the last value is used.
 *
 * Example:
 * To update two selected indices in the RSS indirection table, e.g. setting
 * index 0 to queue 5 and then index 5 to queue 0, the below command should be
 * used:
 * sysctl dev.ena.0.rss.indir_table="0:5 5:0"
 *
 * Returns 0 on success, EINVAL when the device is not running or the string
 * is malformed (including negative numbers), ERANGE when an index or value
 * is too large, ENOTSUP when RSS is not active or no indirection data is
 * attached to the adapter, or the error reported by the sysctl / RSS helpers.
 */
static int
ena_sysctl_rss_indir_table(SYSCTL_HANDLER_ARGS)
{
	struct ena_adapter *adapter = arg1;
	struct ena_indir *indir;
	char *msg, *buf, *endp;
	/* Parsed as long so negative and oversized inputs stay detectable. */
	long idx, value;
	int num_queues, error;

	ENA_LOCK_LOCK();
	if (unlikely(!ENA_FLAG_ISSET(ENA_FLAG_DEVICE_RUNNING, adapter))) {
		error = EINVAL;
		goto unlock;
	}

	if (unlikely(!ENA_FLAG_ISSET(ENA_FLAG_RSS_ACTIVE, adapter))) {
		error = ENOTSUP;
		goto unlock;
	}

	/* Validate the pointer before touching any of its members. */
	indir = adapter->rss_indir;
	if (unlikely(indir == NULL)) {
		error = ENOTSUP;
		goto unlock;
	}
	msg = indir->sysctl_buf;

	error = sysctl_handle_string(oidp, msg, sizeof(indir->sysctl_buf), req);
	/* Nothing more to do on failure or for a read-only request. */
	if (error != 0 || req->newptr == NULL)
		goto unlock;

	num_queues = adapter->num_io_queues;

	/*
	 * This sysctl expects msg to be a list of `x:y` record pairs,
	 * where x is the indirection table index and y is its value.
	 */
	for (buf = msg; *buf != '\0'; buf = endp) {
		idx = strtol(buf, &endp, 10);
		if (endp == buf || idx < 0) {
			device_printf(adapter->pdev, "Invalid index: %s\n",
			    buf);
			error = EINVAL;
			break;
		}

		if (idx >= ENA_RX_RSS_TABLE_SIZE) {
			device_printf(adapter->pdev, "Index %ld out of range\n",
			    idx);
			error = ERANGE;
			break;
		}

		buf = endp;

		if (*buf++ != ':') {
			device_printf(adapter->pdev, "Missing ':' separator\n");
			error = EINVAL;
			break;
		}

		value = strtol(buf, &endp, 10);
		if (endp == buf || value < 0) {
			device_printf(adapter->pdev, "Invalid value: %s\n",
			    buf);
			error = EINVAL;
			break;
		}

		if (value >= num_queues) {
			device_printf(adapter->pdev, "Value %ld out of range\n",
			    value);
			error = ERANGE;
			break;
		}

		indir->table[idx] = value;
	}

	if (error != 0) /* Reload indirection table with last good data. */
		ena_rss_indir_get(adapter, indir->table);

	/* At this point msg has been clobbered by sysctl_handle_string. */
	ena_rss_copy_indir_buf(msg, indir->table);

	if (error == 0)
		error = ena_rss_indir_set(adapter, indir->table);

unlock:
	ENA_LOCK_UNLOCK();

	return (error);
}
#endif /* RSS */