MFC r263317, r263412, and r263451.
r263317: cxgbe(4): significant rx rework. - More flexible cluster size selection, including the ability to fall back to a safe cluster size (PAGE_SIZE from zone_jumbop by default) in case an allocation of a larger size fails. - A single get_fl_payload() function that assembles the payload into an mbuf chain for any kind of freelist. This replaces two variants: one for freelists with buffer packing enabled and another for those without. - Buffer packing with any sized cluster. It was limited to 4K clusters only before this change. - Enable buffer packing for TOE rx queues as well. - Statistics and tunables to go with all these changes. The driver's man page will be updated separately. r263412: cxgbe(4): if_iqdrops statistic should include tunnel congestion drops. r263451: cxgbe(4): man page updates.
This commit is contained in:
parent
c4599dcf44
commit
ebd722c9b4
@ -1,4 +1,4 @@
|
||||
.\" Copyright (c) 2011-2013, Chelsio Inc
|
||||
.\" Copyright (c) 2011-2014, Chelsio Inc
|
||||
.\" All rights reserved.
|
||||
.\"
|
||||
.\" Redistribution and use in source and binary forms, with or without
|
||||
@ -31,7 +31,7 @@
|
||||
.\"
|
||||
.\" $FreeBSD$
|
||||
.\"
|
||||
.Dd December 18, 2013
|
||||
.Dd March 20, 2014
|
||||
.Dt CXGBE 4
|
||||
.Os
|
||||
.Sh NAME
|
||||
@ -141,11 +141,11 @@ prompt before booting the kernel or stored in
|
||||
.Xr loader.conf 5 .
|
||||
.Bl -tag -width indent
|
||||
.It Va hw.cxgbe.ntxq10g
|
||||
The number of tx queues to use for a 10Gb port.
|
||||
The number of tx queues to use for a 10Gb or 40Gb port.
|
||||
The default is 16 or the number
|
||||
of CPU cores in the system, whichever is less.
|
||||
.It Va hw.cxgbe.nrxq10g
|
||||
The number of rx queues to use for a 10Gb port.
|
||||
The number of rx queues to use for a 10Gb or 40Gb port.
|
||||
The default is 8 or the number
|
||||
of CPU cores in the system, whichever is less.
|
||||
.It Va hw.cxgbe.ntxq1g
|
||||
@ -157,11 +157,11 @@ The number of rx queues to use for a 1Gb port.
|
||||
The default is 2 or the number
|
||||
of CPU cores in the system, whichever is less.
|
||||
.It Va hw.cxgbe.nofldtxq10g
|
||||
The number of TOE tx queues to use for a 10Gb port.
|
||||
The number of TOE tx queues to use for a 10Gb or 40Gb port.
|
||||
The default is 8 or the
|
||||
number of CPU cores in the system, whichever is less.
|
||||
.It Va hw.cxgbe.nofldrxq10g
|
||||
The number of TOE rx queues to use for a 10Gb port.
|
||||
The number of TOE rx queues to use for a 10Gb or 40Gb port.
|
||||
The default is 2 or the
|
||||
number of CPU cores in the system, whichever is less.
|
||||
.It Va hw.cxgbe.nofldtxq1g
|
||||
@ -177,8 +177,7 @@ The timer index value to use to delay interrupts.
|
||||
The holdoff timer list has the values 1, 5, 10, 50, 100, and 200
|
||||
by default (all values are in microseconds) and the index selects a
|
||||
value from this list.
|
||||
The default value is 1 for both 10Gb and 1Gb ports, which means the
|
||||
timer value is 5us.
|
||||
The default value is 1 which means the timer value is 5us.
|
||||
Different interfaces can be assigned different values at any time via the
|
||||
dev.cxgbe.X.holdoff_tmr_idx or dev.cxl.X.holdoff_tmr_idx sysctl.
|
||||
.It Va hw.cxgbe.holdoff_pktc_idx_10G
|
||||
@ -186,9 +185,8 @@ dev.cxgbe.X.holdoff_tmr_idx or dev.cxl.X.holdoff_tmr_idx sysctl.
|
||||
The packet-count index value to use to delay interrupts.
|
||||
The packet-count list has the values 1, 8, 16, and 32 by default
|
||||
and the index selects a value from this list.
|
||||
The default value is -1 for both 10Gb and 1Gb ports, which means packet
|
||||
counting is disabled and interrupts are generated based solely on the
|
||||
holdoff timer value.
|
||||
The default value is -1 which means packet counting is disabled and interrupts
|
||||
are generated based solely on the holdoff timer value.
|
||||
Different interfaces can be assigned different values via the
|
||||
dev.cxgbe.X.holdoff_pktc_idx or dev.cxl.X.holdoff_pktc_idx sysctl.
|
||||
This sysctl works only when the interface has never been marked up (as done by
|
||||
@ -228,6 +226,43 @@ already on the card.
|
||||
long as it is compatible with the driver and is a different version than
|
||||
the one already on the card.
|
||||
The default is 1.
|
||||
.It Va hw.cxgbe.fl_pktshift
|
||||
The number of bytes of padding inserted before the beginning of an Ethernet
|
||||
frame in the receive buffer.
|
||||
The default value of 2 ensures that the Ethernet payload (usually the IP header)
|
||||
is at a 4 byte aligned address.
|
||||
0-7 are all valid values.
|
||||
.It Va hw.cxgbe.fl_pad
|
||||
A non-zero value ensures that writes from the hardware to a receive buffer are
|
||||
padded up to the specified boundary.
|
||||
The default is -1 which lets the driver pick a pad boundary.
|
||||
0 disables trailer padding completely.
|
||||
.It Va hw.cxgbe.cong_drop
|
||||
Controls the hardware response to congestion.
|
||||
-1 disables congestion feedback and is not recommended.
|
||||
0 instructs the hardware to backpressure its pipeline on congestion.
|
||||
This usually results in the port emitting pause frames.
|
||||
1 instructs the hardware to drop frames destined for congested queues.
|
||||
.It Va hw.cxgbe.buffer_packing
|
||||
Allow the hardware to deliver multiple frames in the same receive buffer
|
||||
opportunistically.
|
||||
The default is -1 which lets the driver decide.
|
||||
0 or 1 explicitly disable or enable this feature.
|
||||
.It Va hw.cxgbe.allow_mbufs_in_cluster
|
||||
1 allows the driver to lay down one or more mbufs within the receive buffer
|
||||
opportunistically. This is the default.
|
||||
0 prohibits the driver from doing so.
|
||||
.It Va hw.cxgbe.largest_rx_cluster
|
||||
.It Va hw.cxgbe.safest_rx_cluster
|
||||
Sizes of rx clusters. Each of these must be set to one of the sizes available
|
||||
(usually 2048, 4096, 9216, and 16384) and largest_rx_cluster must be greater
|
||||
than or equal to safest_rx_cluster.
|
||||
The defaults are 16384 and 4096 respectively.
|
||||
The driver will never attempt to allocate a receive buffer larger than
|
||||
largest_rx_cluster and will fall back to allocating buffers of
|
||||
safest_rx_cluster size if an allocation larger than safest_rx_cluster fails.
|
||||
Note that largest_rx_cluster merely establishes a ceiling -- the driver is
|
||||
allowed to allocate buffers of smaller sizes.
|
||||
.It Va hw.cxgbe.config_file
|
||||
Select a pre-packaged device configuration file.
|
||||
A configuration file contains a recipe for partitioning and configuring the
|
||||
@ -235,7 +270,7 @@ hardware resources on the card.
|
||||
This tunable is for specialized applications only and should not be used in
|
||||
normal operation.
|
||||
The configuration profile currently in use is available in the dev.t4nex.X.cf
|
||||
and dev.t4nex.X.cfcsum sysctls.
|
||||
and dev.t4nex.X.cfcsum (dev.t5nex for T5 cards) sysctls.
|
||||
.It Va hw.cxgbe.linkcaps_allowed
|
||||
.It Va hw.cxgbe.niccaps_allowed
|
||||
.It Va hw.cxgbe.toecaps_allowed
|
||||
@ -249,7 +284,7 @@ capability.
|
||||
This tunable is for specialized applications only and should not be used in
|
||||
normal operation.
|
||||
The capabilities for which hardware resources have been reserved are listed in
|
||||
dev.t4nex.X.*caps sysctls.
|
||||
dev.t4nex.X.*caps or dev.t5nex.X.*caps sysctls.
|
||||
.El
|
||||
.Sh SUPPORT
|
||||
For general information and support,
|
||||
|
@ -128,10 +128,11 @@ enum {
|
||||
|
||||
RX_FL_ESIZE = EQ_ESIZE, /* 8 64bit addresses */
|
||||
#if MJUMPAGESIZE != MCLBYTES
|
||||
FL_BUF_SIZES_MAX = 5, /* cluster, jumbop, jumbo9k, jumbo16k, extra */
|
||||
SW_ZONE_SIZES = 4, /* cluster, jumbop, jumbo9k, jumbo16k */
|
||||
#else
|
||||
FL_BUF_SIZES_MAX = 4, /* cluster, jumbo9k, jumbo16k, extra */
|
||||
SW_ZONE_SIZES = 3, /* cluster, jumbo9k, jumbo16k */
|
||||
#endif
|
||||
CL_METADATA_SIZE = CACHE_LINE_SIZE,
|
||||
|
||||
CTRL_EQ_QSIZE = 128,
|
||||
|
||||
@ -235,15 +236,28 @@ struct port_info {
|
||||
uint8_t hw_addr[ETHER_ADDR_LEN]; /* factory MAC address, won't change */
|
||||
};
|
||||
|
||||
struct fl_sdesc {
|
||||
bus_dmamap_t map;
|
||||
caddr_t cl;
|
||||
uint8_t tag_idx; /* the fl->tag entry this map comes from */
|
||||
/* Where the cluster came from, how it has been carved up. */
|
||||
struct cluster_layout {
|
||||
int8_t zidx;
|
||||
int8_t hwidx;
|
||||
uint16_t region1; /* mbufs laid out within this region */
|
||||
/* region2 is the DMA region */
|
||||
uint16_t region3; /* cluster_metadata within this region */
|
||||
};
|
||||
|
||||
struct cluster_metadata {
|
||||
u_int refcount;
|
||||
#ifdef INVARIANTS
|
||||
__be64 ba_hwtag;
|
||||
struct fl_sdesc *sd; /* For debug only. Could easily be stale */
|
||||
#endif
|
||||
};
|
||||
|
||||
struct fl_sdesc {
|
||||
caddr_t cl;
|
||||
uint8_t nmbuf;
|
||||
struct cluster_layout cll;
|
||||
};
|
||||
|
||||
struct tx_desc {
|
||||
__be64 flit[8];
|
||||
};
|
||||
@ -362,17 +376,19 @@ struct sge_eq {
|
||||
uint32_t unstalled; /* recovered from stall */
|
||||
};
|
||||
|
||||
struct fl_buf_info {
|
||||
u_int size;
|
||||
int type;
|
||||
int hwtag:4; /* tag in low 4 bits of the pa. */
|
||||
uma_zone_t zone;
|
||||
struct sw_zone_info {
|
||||
uma_zone_t zone; /* zone that this cluster comes from */
|
||||
int size; /* size of cluster: 2K, 4K, 9K, 16K, etc. */
|
||||
int type; /* EXT_xxx type of the cluster */
|
||||
int8_t head_hwidx;
|
||||
int8_t tail_hwidx;
|
||||
};
|
||||
|
||||
struct hw_buf_info {
|
||||
int8_t zidx; /* backpointer to zone; -ve means unused */
|
||||
int8_t next; /* next hwidx for this zone; -1 means no more */
|
||||
int size;
|
||||
};
|
||||
#define FL_BUF_SIZES(sc) (sc->sge.fl_buf_sizes)
|
||||
#define FL_BUF_SIZE(sc, x) (sc->sge.fl_buf_info[x].size)
|
||||
#define FL_BUF_TYPE(sc, x) (sc->sge.fl_buf_info[x].type)
|
||||
#define FL_BUF_HWTAG(sc, x) (sc->sge.fl_buf_info[x].hwtag)
|
||||
#define FL_BUF_ZONE(sc, x) (sc->sge.fl_buf_info[x].zone)
|
||||
|
||||
enum {
|
||||
FL_STARVING = (1 << 0), /* on the adapter's list of starving fl's */
|
||||
@ -386,9 +402,8 @@ enum {
|
||||
struct sge_fl {
|
||||
bus_dma_tag_t desc_tag;
|
||||
bus_dmamap_t desc_map;
|
||||
bus_dma_tag_t tag[FL_BUF_SIZES_MAX]; /* only first FL_BUF_SIZES(sc) are
|
||||
valid */
|
||||
uint8_t tag_idx;
|
||||
struct cluster_layout cll_def; /* default refill zone, layout */
|
||||
struct cluster_layout cll_alt; /* alternate refill zone, layout */
|
||||
struct mtx fl_lock;
|
||||
char lockname[16];
|
||||
int flags;
|
||||
@ -405,9 +420,17 @@ struct sge_fl {
|
||||
uint32_t needed; /* # of buffers needed to fill up fl. */
|
||||
uint32_t lowat; /* # of buffers <= this means fl needs help */
|
||||
uint32_t pending; /* # of bufs allocated since last doorbell */
|
||||
u_int dmamap_failed;
|
||||
struct mbuf *mstash[8];
|
||||
TAILQ_ENTRY(sge_fl) link; /* All starving freelists */
|
||||
|
||||
struct mbuf *m0;
|
||||
struct mbuf **pnext;
|
||||
u_int remaining;
|
||||
|
||||
uint64_t mbuf_allocated;/* # of mbuf allocated from zone_mbuf */
|
||||
uint64_t mbuf_inlined; /* # of mbuf created within clusters */
|
||||
uint64_t cl_allocated; /* # of clusters allocated */
|
||||
uint64_t cl_recycled; /* # of clusters recycled */
|
||||
uint64_t cl_fast_recycled; /* # of clusters recycled (fast) */
|
||||
};
|
||||
|
||||
/* txq: SGE egress queue + what's needed for Ethernet NIC */
|
||||
@ -541,8 +564,11 @@ struct sge {
|
||||
struct sge_iq **iqmap; /* iq->cntxt_id to iq mapping */
|
||||
struct sge_eq **eqmap; /* eq->cntxt_id to eq mapping */
|
||||
|
||||
u_int fl_buf_sizes __aligned(CACHE_LINE_SIZE);
|
||||
struct fl_buf_info fl_buf_info[FL_BUF_SIZES_MAX];
|
||||
int pack_boundary;
|
||||
int8_t safe_hwidx1; /* may not have room for metadata */
|
||||
int8_t safe_hwidx2; /* with room for metadata and maybe more */
|
||||
struct sw_zone_info sw_zone_info[SW_ZONE_SIZES];
|
||||
struct hw_buf_info hw_buf_info[SGE_FLBUF_SIZES];
|
||||
};
|
||||
|
||||
struct rss_header;
|
||||
|
@ -87,6 +87,7 @@ enum {
|
||||
SGE_NTIMERS = 6, /* # of interrupt holdoff timer values */
|
||||
SGE_NCOUNTERS = 4, /* # of interrupt packet counter values */
|
||||
SGE_MAX_IQ_SIZE = 65520,
|
||||
SGE_FLBUF_SIZES = 16,
|
||||
};
|
||||
|
||||
struct sge_qstat { /* data written to SGE queue status entries */
|
||||
|
@ -494,6 +494,8 @@ CTASSERT(offsetof(struct sge_ofld_rxq, fl) == offsetof(struct sge_rxq, fl));
|
||||
CTASSERT(nitems(((struct adapter *)0)->cpl_handler) == NUM_CPL_CMDS);
|
||||
CTASSERT(nitems(((struct adapter *)0)->fw_msg_handler) == NUM_FW6_TYPES);
|
||||
|
||||
CTASSERT(sizeof(struct cluster_metadata) <= CL_METADATA_SIZE);
|
||||
|
||||
static int
|
||||
t4_probe(device_t dev)
|
||||
{
|
||||
@ -4039,6 +4041,7 @@ static void
|
||||
cxgbe_tick(void *arg)
|
||||
{
|
||||
struct port_info *pi = arg;
|
||||
struct adapter *sc = pi->adapter;
|
||||
struct ifnet *ifp = pi->ifp;
|
||||
struct sge_txq *txq;
|
||||
int i, drops;
|
||||
@ -4050,7 +4053,7 @@ cxgbe_tick(void *arg)
|
||||
return; /* without scheduling another callout */
|
||||
}
|
||||
|
||||
t4_get_port_stats(pi->adapter, pi->tx_chan, s);
|
||||
t4_get_port_stats(sc, pi->tx_chan, s);
|
||||
|
||||
ifp->if_opackets = s->tx_frames - s->tx_pause;
|
||||
ifp->if_ipackets = s->rx_frames - s->rx_pause;
|
||||
@ -4061,6 +4064,19 @@ cxgbe_tick(void *arg)
|
||||
ifp->if_iqdrops = s->rx_ovflow0 + s->rx_ovflow1 + s->rx_ovflow2 +
|
||||
s->rx_ovflow3 + s->rx_trunc0 + s->rx_trunc1 + s->rx_trunc2 +
|
||||
s->rx_trunc3;
|
||||
for (i = 0; i < 4; i++) {
|
||||
if (pi->rx_chan_map & (1 << i)) {
|
||||
uint32_t v;
|
||||
|
||||
/*
|
||||
* XXX: indirect reads from the same ADDR/DATA pair can
|
||||
* race with each other.
|
||||
*/
|
||||
t4_read_indirect(sc, A_TP_MIB_INDEX, A_TP_MIB_DATA, &v,
|
||||
1, A_TP_MIB_TNL_CNG_DROP_0 + i);
|
||||
ifp->if_iqdrops += v;
|
||||
}
|
||||
}
|
||||
|
||||
drops = s->tx_drop;
|
||||
for_each_txq(pi, i, txq)
|
||||
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user