From e3207e1973cb2ec1ca7e5d922f58678c687439a1 Mon Sep 17 00:00:00 2001 From: Navdeep Parhar Date: Sat, 6 Dec 2014 00:13:56 +0000 Subject: [PATCH] cxgbe(4): Allow for different pad and pack boundaries for different adapters. Set the pack boundary for T5 cards to be the same as the PCIe max payload size. The chip likes it this way. In this revision the driver allocates rx buffers that align on both boundaries. This is not a strict requirement and a followup commit will switch the driver to a more relaxed allocation strategy. MFC after: 2 weeks --- sys/dev/cxgbe/adapter.h | 3 +- sys/dev/cxgbe/t4_sge.c | 287 +++++++++++++++++++++++----------------- 2 files changed, 164 insertions(+), 126 deletions(-) diff --git a/sys/dev/cxgbe/adapter.h b/sys/dev/cxgbe/adapter.h index 8cebdaa7d8ae..17b3d593d707 100644 --- a/sys/dev/cxgbe/adapter.h +++ b/sys/dev/cxgbe/adapter.h @@ -148,7 +148,7 @@ enum { #else SW_ZONE_SIZES = 3, /* cluster, jumbo9k, jumbo16k */ #endif - CL_METADATA_SIZE = CACHE_LINE_SIZE, + CL_METADATA_SIZE = 256, /* same as MSIZE for now */ SGE_MAX_WR_NDESC = SGE_MAX_WR_LEN / EQ_ESIZE, /* max WR size in desc */ TX_SGL_SEGS = 36, @@ -695,6 +695,7 @@ struct sge { struct sge_iq **iqmap; /* iq->cntxt_id to iq mapping */ struct sge_eq **eqmap; /* eq->cntxt_id to eq mapping */ + int pad_boundary; int pack_boundary; int8_t safe_hwidx1; /* may not have room for metadata */ int8_t safe_hwidx2; /* with room for metadata and maybe more */ diff --git a/sys/dev/cxgbe/t4_sge.c b/sys/dev/cxgbe/t4_sge.c index 58940fe4959c..bfdb1eaf9b92 100644 --- a/sys/dev/cxgbe/t4_sge.c +++ b/sys/dev/cxgbe/t4_sge.c @@ -120,19 +120,10 @@ TUNABLE_INT("hw.cxgbe.buffer_packing", &buffer_packing); /* * Start next frame in a packed buffer at this boundary. * -1: driver should figure out a good value. - * T4: - * --- - * if fl_pad != 0 - * value specified here will be overridden by fl_pad. - * else - * power of 2 from 32 to 4096 (both inclusive) is a valid value here. 
- * T5: - * --- - * 16, or a power of 2 from 64 to 4096 (both inclusive) is a valid value. + * T4: driver will ignore this and use the same value as fl_pad above. + * T5: 16, or a power of 2 from 64 to 4096 (both inclusive) is a valid value. */ static int fl_pack = -1; -static int t4_fl_pack; -static int t5_fl_pack; TUNABLE_INT("hw.cxgbe.fl_pack", &fl_pack); /* @@ -175,8 +166,7 @@ static int service_iq(struct sge_iq *, int); static struct mbuf *get_fl_payload(struct adapter *, struct sge_fl *, uint32_t); static int t4_eth_rx(struct sge_iq *, const struct rss_header *, struct mbuf *); static inline void init_iq(struct sge_iq *, struct adapter *, int, int, int); -static inline void init_fl(struct adapter *, struct sge_fl *, int, int, int, - char *); +static inline void init_fl(struct adapter *, struct sge_fl *, int, int, char *); static inline void init_eq(struct sge_eq *, int, int, uint8_t, uint16_t, char *); static int alloc_ring(struct adapter *, size_t, bus_dma_tag_t *, bus_dmamap_t *, @@ -264,15 +254,6 @@ static counter_u64_t extfree_rels; void t4_sge_modload(void) { - int pad; - - /* set pad to a reasonable powerof2 between 16 and 4096 (inclusive) */ -#if defined(__i386__) || defined(__amd64__) - pad = max(cpu_clflush_line_size, 16); -#else - pad = max(CACHE_LINE_SIZE, 16); -#endif - pad = min(pad, 4096); if (fl_pktshift < 0 || fl_pktshift > 7) { printf("Invalid hw.cxgbe.fl_pktshift value (%d)," @@ -280,35 +261,6 @@ t4_sge_modload(void) fl_pktshift = 2; } - if (fl_pad != 0 && - (fl_pad < 32 || fl_pad > 4096 || !powerof2(fl_pad))) { - - if (fl_pad != -1) { - printf("Invalid hw.cxgbe.fl_pad value (%d)," - " using %d instead.\n", fl_pad, max(pad, 32)); - } - fl_pad = max(pad, 32); - } - - /* - * T4 has the same pad and pack boundary. If a pad boundary is set, - * pack boundary must be set to the same value. Otherwise take the - * specified value or auto-calculate something reasonable. 
- */ - if (fl_pad) - t4_fl_pack = fl_pad; - else if (fl_pack < 32 || fl_pack > 4096 || !powerof2(fl_pack)) - t4_fl_pack = max(pad, 32); - else - t4_fl_pack = fl_pack; - - /* T5's pack boundary is independent of the pad boundary. */ - if (fl_pack < 16 || fl_pack == 32 || fl_pack > 4096 || - !powerof2(fl_pack)) - t5_fl_pack = max(pad, CACHE_LINE_SIZE); - else - t5_fl_pack = fl_pack; - if (spg_len != 64 && spg_len != 128) { int len; @@ -366,6 +318,71 @@ t4_init_sge_cpl_handlers(struct adapter *sc) t4_register_fw_msg_handler(sc, FW6_TYPE_CMD_RPL, t4_handle_fw_rpl); } +static inline void +setup_pad_and_pack_boundaries(struct adapter *sc) +{ + uint32_t v, m; + int pad, pack; + + pad = fl_pad; + if (fl_pad < 32 || fl_pad > 4096 || !powerof2(fl_pad)) { + /* + * If there is any chance that we might use buffer packing and + * the chip is a T4, then pick 64 as the pad/pack boundary. Set + * it to 32 in all other cases. + */ + pad = is_t4(sc) && buffer_packing ? 64 : 32; + + /* + * For fl_pad = 0 we'll still write a reasonable value to the + * register but all the freelists will opt out of padding. + * We'll complain here only if the user tried to set it to a + * value greater than 0 that was invalid. + */ + if (fl_pad > 0) { + device_printf(sc->dev, "Invalid hw.cxgbe.fl_pad value" + " (%d), using %d instead.\n", fl_pad, pad); + } + } + m = V_INGPADBOUNDARY(M_INGPADBOUNDARY); + v = V_INGPADBOUNDARY(ilog2(pad) - 5); + t4_set_reg_field(sc, A_SGE_CONTROL, m, v); + + if (is_t4(sc)) { + if (fl_pack != -1 && fl_pack != pad) { + /* Complain but carry on. 
*/ + device_printf(sc->dev, "hw.cxgbe.fl_pack (%d) ignored," + " using %d instead.\n", fl_pack, pad); + } + return; + } + + pack = fl_pack; + if (fl_pack < 16 || fl_pack == 32 || fl_pack > 4096 || + !powerof2(fl_pack)) { + pack = max(sc->params.pci.mps, CACHE_LINE_SIZE); + MPASS(powerof2(pack)); + if (pack < 16) + pack = 16; + if (pack == 32) + pack = 64; + if (pack > 4096) + pack = 4096; + if (fl_pack != -1) { + device_printf(sc->dev, "Invalid hw.cxgbe.fl_pack value" + " (%d), using %d instead.\n", fl_pack, pack); + } + } + m = V_INGPACKBOUNDARY(M_INGPACKBOUNDARY); + if (pack == 16) + v = V_INGPACKBOUNDARY(0); + else + v = V_INGPACKBOUNDARY(ilog2(pack) - 5); + + MPASS(!is_t4(sc)); /* T4 doesn't have SGE_CONTROL2 */ + t4_set_reg_field(sc, A_SGE_CONTROL2, m, v); +} + /* * adap->params.vpd.cclk must be set up before this is called. */ @@ -398,24 +415,9 @@ t4_tweak_chip_settings(struct adapter *sc) m = V_PKTSHIFT(M_PKTSHIFT) | F_RXPKTCPLMODE | F_EGRSTATUSPAGESIZE; v = V_PKTSHIFT(fl_pktshift) | F_RXPKTCPLMODE | V_EGRSTATUSPAGESIZE(spg_len == 128); - if (is_t4(sc) && (fl_pad || buffer_packing)) { - /* t4_fl_pack has the correct value even when fl_pad = 0 */ - m |= V_INGPADBOUNDARY(M_INGPADBOUNDARY); - v |= V_INGPADBOUNDARY(ilog2(t4_fl_pack) - 5); - } else if (is_t5(sc) && fl_pad) { - m |= V_INGPADBOUNDARY(M_INGPADBOUNDARY); - v |= V_INGPADBOUNDARY(ilog2(fl_pad) - 5); - } t4_set_reg_field(sc, A_SGE_CONTROL, m, v); - if (is_t5(sc) && buffer_packing) { - m = V_INGPACKBOUNDARY(M_INGPACKBOUNDARY); - if (t5_fl_pack == 16) - v = V_INGPACKBOUNDARY(0); - else - v = V_INGPACKBOUNDARY(ilog2(t5_fl_pack) - 5); - t4_set_reg_field(sc, A_SGE_CONTROL2, m, v); - } + setup_pad_and_pack_boundaries(sc); v = V_HOSTPAGESIZEPF0(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF1(PAGE_SHIFT - 10) | @@ -486,15 +488,25 @@ t4_tweak_chip_settings(struct adapter *sc) } /* - * SGE wants the buffer to be at least 64B and then a multiple of the pad - * boundary or 16, whichever is greater. 
+ * SGE wants the buffer to be at least 64B and then a multiple of 16. If + * padding and packing are enabled, the buffer's start and end need to be + * correctly aligned as well. We'll just make sure that the size is a multiple + * of the alignment; it is up to other parts of the driver to align the start. */ static inline int -hwsz_ok(int hwsz) +hwsz_ok(struct adapter *sc, int hwsz) { - int mask = max(fl_pad, 16) - 1; + int align = 16; + + if (fl_pad) { + MPASS(sc->sge.pad_boundary > align); + align = sc->sge.pad_boundary; + } + if (buffer_packing && sc->sge.pack_boundary > align) + align = sc->sge.pack_boundary; + align--; /* now a mask */ + return (hwsz >= 64 && (hwsz & align) == 0); - return (hwsz >= 64 && (hwsz & mask) == 0); } /* @@ -521,33 +533,22 @@ t4_read_chip_settings(struct adapter *sc) m = V_PKTSHIFT(M_PKTSHIFT) | F_RXPKTCPLMODE | F_EGRSTATUSPAGESIZE; v = V_PKTSHIFT(fl_pktshift) | F_RXPKTCPLMODE | V_EGRSTATUSPAGESIZE(spg_len == 128); - if (is_t4(sc) && (fl_pad || buffer_packing)) { - m |= V_INGPADBOUNDARY(M_INGPADBOUNDARY); - v |= V_INGPADBOUNDARY(ilog2(t4_fl_pack) - 5); - } else if (is_t5(sc) && fl_pad) { - m |= V_INGPADBOUNDARY(M_INGPADBOUNDARY); - v |= V_INGPADBOUNDARY(ilog2(fl_pad) - 5); - } r = t4_read_reg(sc, A_SGE_CONTROL); if ((r & m) != v) { device_printf(sc->dev, "invalid SGE_CONTROL(0x%x)\n", r); rc = EINVAL; } + s->pad_boundary = 1 << (G_INGPADBOUNDARY(r) + 5); - if (is_t5(sc) && buffer_packing) { - m = V_INGPACKBOUNDARY(M_INGPACKBOUNDARY); - if (t5_fl_pack == 16) - v = V_INGPACKBOUNDARY(0); - else - v = V_INGPACKBOUNDARY(ilog2(t5_fl_pack) - 5); + if (is_t4(sc)) + s->pack_boundary = s->pad_boundary; + else { r = t4_read_reg(sc, A_SGE_CONTROL2); - if ((r & m) != v) { - device_printf(sc->dev, - "invalid SGE_CONTROL2(0x%x)\n", r); - rc = EINVAL; - } + if (G_INGPACKBOUNDARY(r) == 0) + s->pack_boundary = 16; + else + s->pack_boundary = 1 << (G_INGPACKBOUNDARY(r) + 5); } - s->pack_boundary = is_t4(sc) ? 
t4_fl_pack : t5_fl_pack; v = V_HOSTPAGESIZEPF0(PAGE_SHIFT - 10) | V_HOSTPAGESIZEPF1(PAGE_SHIFT - 10) | @@ -568,13 +569,22 @@ t4_read_chip_settings(struct adapter *sc) for (i = 0; i < nitems(s->hw_buf_info); i++, hwb++) { r = t4_read_reg(sc, A_SGE_FL_BUFFER_SIZE0 + (4 * i)); hwb->size = r; - hwb->zidx = hwsz_ok(r) ? -1 : -2; + hwb->zidx = hwsz_ok(sc, r) ? -1 : -2; hwb->next = -1; } /* * Create a sorted list in decreasing order of hw buffer sizes (and so * increasing order of spare area) for each software zone. + * + * If padding is enabled then the start and end of the buffer must align + * to the pad boundary; if packing is enabled then they must align with + * the pack boundary as well. Allocations from the cluster zones are + * aligned to min(size, 4K), so the buffer starts at that alignment and + * ends at hwb->size alignment. If mbuf inlining is allowed the + * starting alignment will be reduced to MSIZE and the driver will + * exercise appropriate caution when deciding on the best buffer layout + * to use. 
*/ n = 0; /* no usable buffer size to begin with */ swz = &s->sw_zone_info[0]; @@ -586,6 +596,15 @@ t4_read_chip_settings(struct adapter *sc) swz->zone = m_getzone(swz->size); swz->type = m_gettype(swz->size); + if (swz->size < PAGE_SIZE) { + MPASS(powerof2(swz->size)); + if (fl_pad && (swz->size % sc->sge.pad_boundary != 0)) + continue; + if (buffer_packing && + (swz->size % sc->sge.pack_boundary != 0)) + continue; + } + if (swz->size == safest_rx_cluster) safe_swz = swz; @@ -593,6 +612,12 @@ t4_read_chip_settings(struct adapter *sc) for (j = 0; j < SGE_FLBUF_SIZES; j++, hwb++) { if (hwb->zidx != -1 || hwb->size > swz->size) continue; +#ifdef INVARIANTS + if (fl_pad) + MPASS(hwb->size % sc->sge.pad_boundary == 0); + if (buffer_packing) + MPASS(hwb->size % sc->sge.pack_boundary == 0); +#endif hwb->zidx = i; if (head == -1) head = tail = j; @@ -640,14 +665,17 @@ t4_read_chip_settings(struct adapter *sc) int spare; hwb = &s->hw_buf_info[i]; +#ifdef INVARIANTS + if (fl_pad) + MPASS(hwb->size % sc->sge.pad_boundary == 0); + if (buffer_packing) + MPASS(hwb->size % sc->sge.pack_boundary == 0); +#endif spare = safe_swz->size - hwb->size; - if (spare < CL_METADATA_SIZE) - continue; - if (s->safe_hwidx2 == -1 || - spare == CL_METADATA_SIZE + MSIZE) + if (spare >= CL_METADATA_SIZE) { s->safe_hwidx2 = i; - if (spare >= CL_METADATA_SIZE + MSIZE) break; + } } } @@ -745,17 +773,6 @@ t4_create_dma_tag(struct adapter *sc) return (rc); } -static inline int -enable_buffer_packing(struct adapter *sc) -{ - - if (sc->flags & BUF_PACKING_OK && - ((is_t5(sc) && buffer_packing) || /* 1 or -1 both ok for T5 */ - (is_t4(sc) && buffer_packing == 1))) - return (1); - return (0); -} - void t4_sge_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx, struct sysctl_oid_list *children) @@ -769,7 +786,7 @@ t4_sge_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx, NULL, fl_pktshift, "payload DMA offset in rx buffer (bytes)"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pad", CTLFLAG_RD, - 
NULL, fl_pad, "payload pad boundary (bytes)"); + NULL, sc->sge.pad_boundary, "payload pad boundary (bytes)"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "spg_len", CTLFLAG_RD, NULL, spg_len, "status page size (bytes)"); @@ -777,10 +794,6 @@ t4_sge_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx, SYSCTL_ADD_INT(ctx, children, OID_AUTO, "cong_drop", CTLFLAG_RD, NULL, cong_drop, "congestion drop setting"); - SYSCTL_ADD_INT(ctx, children, OID_AUTO, "buffer_packing", CTLFLAG_RD, - NULL, enable_buffer_packing(sc), - "pack multiple frames in one fl buffer"); - SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pack", CTLFLAG_RD, NULL, sc->sge.pack_boundary, "payload pack boundary (bytes)"); } @@ -958,7 +971,6 @@ mtu_to_max_payload(struct adapter *sc, int mtu, const int toe) #ifdef TCP_OFFLOAD } #endif - payload = roundup2(payload, fl_pad); return (payload); } @@ -983,7 +995,7 @@ t4_setup_port_queues(struct port_info *pi) struct ifnet *ifp = pi->ifp; struct sysctl_oid *oid = device_get_sysctl_tree(pi->dev); struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid); - int maxp, pack, mtu = ifp->if_mtu; + int maxp, mtu = ifp->if_mtu; /* Interrupt vector to start from (when using multiple vectors) */ intr_idx = first_vector(pi); @@ -994,7 +1006,6 @@ t4_setup_port_queues(struct port_info *pi) * b) allocate queue iff it will take direct interrupts. 
*/ maxp = mtu_to_max_payload(sc, mtu, 0); - pack = enable_buffer_packing(sc); if (pi->flags & INTR_RXQ) { oid = SYSCTL_ADD_NODE(&pi->ctx, children, OID_AUTO, "rxq", CTLFLAG_RD, NULL, "rx queues"); @@ -1005,7 +1016,7 @@ t4_setup_port_queues(struct port_info *pi) snprintf(name, sizeof(name), "%s rxq%d-fl", device_get_nameunit(pi->dev), i); - init_fl(sc, &rxq->fl, pi->qsize_rxq / 8, maxp, pack, name); + init_fl(sc, &rxq->fl, pi->qsize_rxq / 8, maxp, name); if (pi->flags & INTR_RXQ) { rxq->iq.flags |= IQ_INTR; @@ -1029,7 +1040,7 @@ t4_setup_port_queues(struct port_info *pi) snprintf(name, sizeof(name), "%s ofld_rxq%d-fl", device_get_nameunit(pi->dev), i); - init_fl(sc, &ofld_rxq->fl, pi->qsize_rxq / 8, maxp, pack, name); + init_fl(sc, &ofld_rxq->fl, pi->qsize_rxq / 8, maxp, name); if (pi->flags & INTR_OFLD_RXQ) { ofld_rxq->iq.flags |= IQ_INTR; @@ -1572,8 +1583,14 @@ get_scatter_segment(struct adapter *sc, struct sge_fl *fl, int total, int flags) caddr_t payload; len = min(total, hwb->size - fl->rx_offset); - padded_len = roundup2(len, fl->buf_boundary); payload = sd->cl + cll->region1 + fl->rx_offset; + if (fl->flags & FL_BUF_PACKING) { + padded_len = roundup2(len, fl->buf_boundary); + MPASS(fl->rx_offset + padded_len <= hwb->size); + } else { + padded_len = hwb->size; + MPASS(fl->rx_offset == 0); /* not packing */ + } if (sc->sc_do_rxcopy && len < RX_COPY_THRESHOLD) { @@ -2121,14 +2138,15 @@ init_iq(struct sge_iq *iq, struct adapter *sc, int tmr_idx, int pktc_idx, } static inline void -init_fl(struct adapter *sc, struct sge_fl *fl, int qsize, int maxp, int pack, - char *name) +init_fl(struct adapter *sc, struct sge_fl *fl, int qsize, int maxp, char *name) { fl->qsize = qsize; fl->sidx = qsize - spg_len / EQ_ESIZE; strlcpy(fl->lockname, name, sizeof(fl->lockname)); - if (pack) + if (sc->flags & BUF_PACKING_OK && + ((!is_t4(sc) && buffer_packing) || /* T5+: enabled unless 0 */ + (is_t4(sc) && buffer_packing == 1)))/* T4: disabled unless 1 */ fl->flags |= FL_BUF_PACKING; 
find_best_refill_source(sc, fl, maxp); find_safe_refill_source(sc, fl); @@ -2277,11 +2295,13 @@ alloc_iq_fl(struct port_info *pi, struct sge_iq *iq, struct sge_fl *fl, if (fl->flags & FL_BUF_PACKING) { fl->lowat = roundup2(sc->sge.fl_starve_threshold2, 8); - fl->buf_boundary = max(fl_pad, sc->sge.pack_boundary); + fl->buf_boundary = sc->sge.pack_boundary; } else { fl->lowat = roundup2(sc->sge.fl_starve_threshold, 8); - fl->buf_boundary = fl_pad; + fl->buf_boundary = 16; } + if (fl_pad && fl->buf_boundary < sc->sge.pad_boundary) + fl->buf_boundary = sc->sge.pad_boundary; c.iqns_to_fl0congen |= htobe32(V_FW_IQ_CMD_FL0HOSTFCMODE(X_HOSTFCMODE_NONE) | @@ -2452,6 +2472,10 @@ add_fl_sysctls(struct sysctl_ctx_list *ctx, struct sysctl_oid *oid, SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id", CTLTYPE_INT | CTLFLAG_RD, &fl->cntxt_id, 0, sysctl_uint16, "I", "SGE context id of the freelist"); + SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "padding", CTLFLAG_RD, NULL, + fl_pad ? 1 : 0, "padding enabled"); + SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "packing", CTLFLAG_RD, NULL, + fl->flags & FL_BUF_PACKING ? 1 : 0, "packing enabled"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD, &fl->cidx, 0, "consumer index"); if (fl->flags & FL_BUF_PACKING) { @@ -4367,6 +4391,17 @@ find_best_refill_source(struct adapter *sc, struct sge_fl *fl, int maxp) if (allow_mbufs_in_cluster == 0 || hwb->size < maxp) break; + + /* + * Do not inline mbufs if doing so would violate the pad/pack + * boundary alignment requirement. 
+ */ + if (fl_pad && (MSIZE % sc->sge.pad_boundary) != 0) + continue; + if (fl->flags & FL_BUF_PACKING && + (MSIZE % sc->sge.pack_boundary) != 0) + continue; + if (spare < CL_METADATA_SIZE + MSIZE) continue; n = (spare - CL_METADATA_SIZE) / MSIZE; @@ -4449,7 +4484,9 @@ find_safe_refill_source(struct adapter *sc, struct sge_fl *fl) spare = swz->size - hwb->size; fl->cll_alt.hwidx = hwidx; fl->cll_alt.zidx = hwb->zidx; - if (allow_mbufs_in_cluster) + if (allow_mbufs_in_cluster && + (fl_pad == 0 || (MSIZE % sc->sge.pad_boundary) == 0) && + (!(fl->flags & FL_BUF_PACKING) || (MSIZE % sc->sge.pack_boundary) == 0)) fl->cll_alt.region1 = ((spare - CL_METADATA_SIZE) / MSIZE) * MSIZE; else fl->cll_alt.region1 = 0;