From 8090c9f504c0c19831713ab2392d0993a5fc5b36 Mon Sep 17 00:00:00 2001
From: Kip Macy
Date: Mon, 17 Dec 2007 08:17:51 +0000
Subject: [PATCH] Make TCP offload work on HEAD (modulo negative interaction
 between sbcompress and t3_push_frames).

- Import the latest changes to cxgb_main.c and cxgb_sge.c from the toestack
  p4 branch
- make a driver-local copy of tcp_subr.c and tcp_usrreq.c and override
  tcp_usrreqs so TOE can also function on kernels with an unmodified TCP
  stack
- add cxgb back to the build
---
 sys/conf/NOTES                         |    2 +-
 sys/dev/cxgb/common/cxgb_t3_cpl.h      |   12 +
 sys/dev/cxgb/cxgb_adapter.h            |   83 +-
 sys/dev/cxgb/cxgb_config.h             |    1 -
 sys/dev/cxgb/cxgb_l2t.c                |   26 +-
 sys/dev/cxgb/cxgb_l2t.h                |    2 +-
 sys/dev/cxgb/cxgb_main.c               |  305 ++++--
 sys/dev/cxgb/cxgb_offload.c            |   72 +-
 sys/dev/cxgb/cxgb_offload.h            |    2 +-
 sys/dev/cxgb/cxgb_osdep.h              |  134 ++-
 sys/dev/cxgb/cxgb_sge.c                | 1039 +++++++++++-------
 sys/dev/cxgb/sys/cxgb_support.c        |    6 +-
 sys/dev/cxgb/sys/mvec.h                |    3 +
 sys/dev/cxgb/t3cdev.h                  |    2 +-
 sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c     |   15 +-
 sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c |   15 +-
 sys/dev/cxgb/ulp/tom/cxgb_listen.c     |    2 +-
 sys/dev/cxgb/ulp/tom/cxgb_tcp.h        |   44 +
 sys/dev/cxgb/ulp/tom/cxgb_tcp_subr.c   |  694 ++++++++++++
 sys/dev/cxgb/ulp/tom/cxgb_tcp_usrreq.c | 1362 ++++++++++++++++++++++++
 sys/dev/cxgb/ulp/tom/cxgb_tom.c        |   82 +-
 sys/modules/cxgb/Makefile              |    2 +-
 sys/modules/cxgb/cxgb/Makefile         |    6 +-
 sys/modules/cxgb/tom/Makefile          |    8 +-
 24 files changed, 3314 insertions(+), 605 deletions(-)
 create mode 100644 sys/dev/cxgb/ulp/tom/cxgb_tcp.h
 create mode 100644 sys/dev/cxgb/ulp/tom/cxgb_tcp_subr.c
 create mode 100644 sys/dev/cxgb/ulp/tom/cxgb_tcp_usrreq.c

diff --git a/sys/conf/NOTES b/sys/conf/NOTES
index 4c88ca3e9423..56c288557dde 100644
--- a/sys/conf/NOTES
+++ b/sys/conf/NOTES
@@ -1878,7 +1878,7 @@ device xe
 device bce		# Broadcom BCM5706/BCM5708 Gigabit Ethernet
 device bfe		# Broadcom BCM440x 10/100 Ethernet
 device bge		# Broadcom BCM570xx Gigabit Ethernet
-#device cxgb		# Chelsio T3 10 Gigabit Ethernet
+device cxgb		# Chelsio T3 10 Gigabit Ethernet
 device dc		# DEC/Intel 21143 and various workalikes
 device fxp		# Intel EtherExpress PRO/100B (82557, 82558)
 hint.fxp.0.prefer_iomap="0"
diff --git a/sys/dev/cxgb/common/cxgb_t3_cpl.h b/sys/dev/cxgb/common/cxgb_t3_cpl.h
index e1b40303a6f7..672823ce9b56 100644
--- a/sys/dev/cxgb/common/cxgb_t3_cpl.h
+++ b/sys/dev/cxgb/common/cxgb_t3_cpl.h
@@ -1131,6 +1131,18 @@ struct cpl_tx_pkt_lso {
 	__be32 lso_info;
 };
 
+struct cpl_tx_pkt_batch_entry {
+	__be32 cntrl;
+	__be32 len;
+	__be64 addr;
+};
+
+struct cpl_tx_pkt_batch {
+	WR_HDR;
+	struct cpl_tx_pkt_batch_entry pkt_entry[7];
+};
+
+
 /* cpl_tx_pkt*.cntrl fields */
 #define S_TXPKT_VLAN 0
 #define M_TXPKT_VLAN 0xFFFF
diff --git a/sys/dev/cxgb/cxgb_adapter.h b/sys/dev/cxgb/cxgb_adapter.h
index 23db259bc2b7..542668ef2ea2 100644
--- a/sys/dev/cxgb/cxgb_adapter.h
+++ b/sys/dev/cxgb/cxgb_adapter.h
@@ -31,7 +31,6 @@ POSSIBILITY OF SUCH DAMAGE.
 
 ***************************************************************************/
 
-
 #ifndef _CXGB_ADAPTER_H_
 #define _CXGB_ADAPTER_H_
 
@@ -42,6 +41,7 @@ POSSIBILITY OF SUCH DAMAGE.
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
@@ -49,6 +49,7 @@ POSSIBILITY OF SUCH DAMAGE.
 #include 
 #include 
+
 #include 
 #include 
 #include 
@@ -56,8 +57,8 @@
 #ifdef CONFIG_DEFINED
 #include 
 #include 
-#include 
 #include 
+#include 
 #else
 #include 
 #include 
 #endif
@@ -128,10 +129,12 @@ struct port_info {
 	struct task timer_reclaim_task;
 	struct cdev *port_cdev;
 
-#define PORT_NAME_LEN 32
+#define PORT_LOCK_NAME_LEN 32
 #define TASKQ_NAME_LEN 32
-	char lockbuf[PORT_NAME_LEN];
+#define PORT_NAME_LEN 32
+	char lockbuf[PORT_LOCK_NAME_LEN];
 	char taskqbuf[TASKQ_NAME_LEN];
+	char namebuf[PORT_NAME_LEN];
 };
 
 enum { /* adapter flags */
@@ -143,19 +146,14 @@ enum { /* adapter flags */
 	TPS_UPTODATE = (1 << 5),
 };
 
-
 #define FL_Q_SIZE 4096
-#define JUMBO_Q_SIZE 512
+#define JUMBO_Q_SIZE 1024
 #define RSPQ_Q_SIZE 1024
 #define TX_ETH_Q_SIZE 1024
 
-
-
-/*
- * Types of Tx queues in each queue set.  Order here matters, do not change.
- * XXX TOE is not implemented yet, so the extra queues are just placeholders.
- */
-enum { TXQ_ETH, TXQ_OFLD, TXQ_CTRL };
+enum { TXQ_ETH = 0,
+       TXQ_OFLD = 1,
+       TXQ_CTRL = 2, };
 
 
 /* careful, the following are set on priv_flags and must not collide with
@@ -275,7 +273,22 @@ struct sge_txq {
 	bus_dmamap_t desc_map;
 	bus_dma_tag_t entry_tag;
 	struct mbuf_head sendq;
+	/*
+	 * cleanq should really be a buf_ring to avoid extra
+	 * mbuf touches
+	 */
+	struct mbuf_head cleanq;
+	struct buf_ring txq_mr;
+	struct mbuf *immpkt;
+	uint32_t txq_drops;
+	uint32_t txq_skipped;
+	uint32_t txq_coalesced;
+	uint32_t txq_enqueued;
+	unsigned long txq_frees;
 	struct mtx lock;
+	struct sg_ent txq_sgl[TX_MAX_SEGS / 2 + 1];
+	bus_dma_segment_t txq_segs[TX_MAX_SEGS];
+	struct mbuf *txq_m_vec[TX_WR_COUNT_MAX];
 #define TXQ_NAME_LEN 32
 	char lockbuf[TXQ_NAME_LEN];
 };
@@ -294,6 +307,10 @@ enum {
 
 #define SGE_PSTAT_MAX (SGE_PSTATS_LRO_X_STREAMS+1)
 
+#define QS_EXITING 0x1
+#define QS_RUNNING 0x2
+#define QS_BOUND 0x4
+
 struct sge_qset {
 	struct sge_rspq rspq;
 	struct sge_fl fl[SGE_RXQ_PER_SET];
@@ -303,6 +320,12 @@ struct sge_qset {
 	uint64_t port_stats[SGE_PSTAT_MAX];
 	struct port_info *port;
 	int idx; /* qset # */
+	int qs_cpuid;
+	int qs_flags;
+	struct cv qs_cv;
+	struct mtx qs_mtx;
+#define QS_NAME_LEN 32
+	char namebuf[QS_NAME_LEN];
 };
 
 struct sge {
@@ -344,7 +367,15 @@ struct adapter {
 	void *msix_intr_tag[SGE_QSETS];
 	uint8_t rxpkt_map[8]; /* maps RX_PKT interface values to port ids */
 	uint8_t rrss_map[SGE_QSETS]; /* reverse RSS map table */
+	uint16_t rspq_map[RSS_TABLE_SIZE]; /* maps 7-bit cookie to qidx */
+	union {
+		uint8_t fill[SGE_QSETS];
+		uint64_t coalesce;
+	} u;
+#define tunq_fill u.fill
+#define tunq_coalesce u.coalesce
+
 	struct filter_info *filters;
 
 	/* Tasks */
@@ -474,7 +505,7 @@ t3_get_next_mcaddr(struct t3_rx_mode *rm)
 	uint8_t *macaddr = NULL;
 
 	if (rm->idx == 0)
-		macaddr = rm->port->hw_addr;
+		macaddr = (uint8_t *)rm->port->hw_addr;
 
 	rm->idx++;
 	return (macaddr);
@@ -515,18 +546,21 @@ void t3_sge_stop(adapter_t *);
 void t3b_intr(void *data);
 void t3_intr_msi(void *data);
 void t3_intr_msix(void *data);
-int t3_encap(struct port_info *, struct mbuf **, int *free);
+int t3_encap(struct sge_qset *, struct mbuf **, int);
 
 int t3_sge_init_adapter(adapter_t *);
 int t3_sge_init_port(struct port_info *);
 void t3_sge_deinit_sw(adapter_t *);
+void t3_free_tx_desc(struct sge_txq *q, int n);
+void t3_free_tx_desc_all(struct sge_txq *q);
 
 void t3_rx_eth_lro(adapter_t *adap, struct sge_rspq *rq, struct mbuf *m,
    int ethpad, uint32_t rss_hash, uint32_t rss_csum, int lro);
 void t3_rx_eth(struct adapter *adap, struct sge_rspq *rq, struct mbuf *m,
    int ethpad);
 void t3_lro_flush(adapter_t *adap, struct sge_qset *qs, struct lro_state *state);
 
-void t3_add_sysctls(adapter_t *sc);
+void
t3_add_attach_sysctls(adapter_t *sc); +void t3_add_configured_sysctls(adapter_t *sc); int t3_get_desc(const struct sge_qset *qs, unsigned int qnum, unsigned int idx, unsigned char *data); void t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p); @@ -535,7 +569,7 @@ void t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p); */ #define desc_reclaimable(q) ((int)((q)->processed - (q)->cleaned - TX_MAX_DESC)) -#define container_of(p, stype, field) ((stype *)(((uint8_t *)(p)) - offsetof(stype, field))) +#define container_of(p, stype, field) ((stype *)(((uint8_t *)(p)) - offsetof(stype, field))) static __inline struct sge_qset * fl_to_qset(struct sge_fl *q, int qidx) @@ -569,5 +603,20 @@ static inline int offload_running(adapter_t *adapter) return isset(&adapter->open_device_map, OFFLOAD_DEVMAP_BIT); } +#ifdef IFNET_MULTIQUEUE +int cxgb_pcpu_enqueue_packet(struct ifnet *ifp, struct mbuf *m); +int cxgb_pcpu_start(struct ifnet *ifp, struct mbuf *m); +int32_t cxgb_pcpu_get_cookie(struct ifnet *ifp, struct in6_addr *lip, uint16_t lport, + struct in6_addr *rip, uint16_t rport, int ipv6); +void cxgb_pcpu_shutdown_threads(struct adapter *sc); +void cxgb_pcpu_startup_threads(struct adapter *sc); +#endif + +int process_responses(adapter_t *adap, struct sge_qset *qs, int budget); +int cxgb_tx_common(struct ifnet *ifp, struct sge_qset *qs, uint32_t txmax); +void t3_free_qset(adapter_t *sc, struct sge_qset *q); +int cxgb_dequeue_packet(struct ifnet *, struct sge_txq *, struct mbuf **); +void cxgb_start(struct ifnet *ifp); +void refill_fl_service(adapter_t *adap, struct sge_fl *fl); #endif diff --git a/sys/dev/cxgb/cxgb_config.h b/sys/dev/cxgb/cxgb_config.h index a12753f86b4c..6b072c3cb6e3 100644 --- a/sys/dev/cxgb/cxgb_config.h +++ b/sys/dev/cxgb/cxgb_config.h @@ -34,7 +34,6 @@ POSSIBILITY OF SUCH DAMAGE. #ifndef CONFIG_DEFINED #define CONFIG_CHELSIO_T3_CORE -#define DISABLE_MBUF_IOVEC #endif #endif diff --git a/sys/dev/cxgb/cxgb_l2t.c b/sys/dev/cxgb/cxgb_l2t.c index 0bb0695f8964..f3e02f206201 100644 --- a/sys/dev/cxgb/cxgb_l2t.c +++ b/sys/dev/cxgb/cxgb_l2t.c @@ -115,7 +115,7 @@ neigh_replace(struct l2t_entry *e, struct rtentry *rt) */ static int setup_l2e_send_pending(struct t3cdev *dev, struct mbuf *m, - struct l2t_entry *e) + struct l2t_entry *e) { struct cpl_l2t_write_req *req; @@ -183,7 +183,7 @@ t3_l2t_send_slow(struct t3cdev *dev, struct mbuf *m, struct l2t_entry *e) again: switch (e->state) { case L2T_STATE_STALE: /* entry is stale, kick off revalidation */ - arpresolve(rt->rt_ifp, rt, NULL, (struct sockaddr *)&sin, e->dmac); + arpresolve2(rt->rt_ifp, rt, (struct sockaddr *)&sin, e->dmac); mtx_lock(&e->lock); if (e->state == L2T_STATE_STALE) e->state = L2T_STATE_VALID; @@ -208,8 +208,8 @@ t3_l2t_send_slow(struct t3cdev *dev, struct mbuf *m, struct l2t_entry *e) * A better way would be to use a work request to retry L2T * entries when there's no memory. 
*/ - printf("doing arpresolve on 0x%x \n", e->addr); - if (arpresolve(rt->rt_ifp, rt, NULL, (struct sockaddr *)&sin, e->dmac) == 0) { + printf("doing arpresolve2 on 0x%x \n", e->addr); + if (arpresolve2(rt->rt_ifp, rt, (struct sockaddr *)&sin, e->dmac) == 0) { printf("mac=%x:%x:%x:%x:%x:%x\n", e->dmac[0], e->dmac[1], e->dmac[2], e->dmac[3], e->dmac[4], e->dmac[5]); @@ -223,7 +223,7 @@ t3_l2t_send_slow(struct t3cdev *dev, struct mbuf *m, struct l2t_entry *e) m_freem(m); mtx_unlock(&e->lock); } else - printf("arpresolve returned non-zero\n"); + printf("arpresolve2 returned non-zero\n"); } return 0; } @@ -245,7 +245,7 @@ t3_l2t_send_event(struct t3cdev *dev, struct l2t_entry *e) again: switch (e->state) { case L2T_STATE_STALE: /* entry is stale, kick off revalidation */ - arpresolve(rt->rt_ifp, rt, m0, (struct sockaddr *)&sin, e->dmac); + arpresolve2(rt->rt_ifp, rt, (struct sockaddr *)&sin, e->dmac); mtx_lock(&e->lock); if (e->state == L2T_STATE_STALE) { e->state = L2T_STATE_VALID; @@ -262,8 +262,6 @@ t3_l2t_send_event(struct t3cdev *dev, struct l2t_entry *e) } mtx_unlock(&e->lock); - if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL) - return; /* * Only the first packet added to the arpq should kick off * resolution. However, because the alloc_skb below can fail, @@ -272,7 +270,7 @@ t3_l2t_send_event(struct t3cdev *dev, struct l2t_entry *e) * A better way would be to use a work request to retry L2T * entries when there's no memory. */ - arpresolve(rt->rt_ifp, rt, m0, (struct sockaddr *)&sin, e->dmac); + arpresolve2(rt->rt_ifp, rt, (struct sockaddr *)&sin, e->dmac); } return; @@ -459,7 +457,8 @@ handle_failed_resolution(struct t3cdev *dev, struct mbuf *arpq) } void -t3_l2t_update(struct t3cdev *dev, struct rtentry *neigh, struct sockaddr *sa) +t3_l2t_update(struct t3cdev *dev, struct rtentry *neigh, + uint8_t *enaddr, struct sockaddr *sa) { struct l2t_entry *e; struct mbuf *arpq = NULL; @@ -468,8 +467,6 @@ t3_l2t_update(struct t3cdev *dev, struct rtentry *neigh, struct sockaddr *sa) int ifidx = neigh->rt_ifp->if_index; int hash = arp_hash(addr, ifidx, d); struct llinfo_arp *la; - u_char edst[ETHER_ADDR_LEN]; - printf("t3_l2t_update called with arp info\n"); @@ -485,10 +482,11 @@ t3_l2t_update(struct t3cdev *dev, struct rtentry *neigh, struct sockaddr *sa) found: printf("found 0x%08x\n", addr); - arpresolve(neigh->rt_ifp, neigh, NULL, sa, edst); rw_runlock(&d->lock); - memcpy(e->dmac, edst, ETHER_ADDR_LEN); + memcpy(e->dmac, enaddr, ETHER_ADDR_LEN); + printf("mac=%x:%x:%x:%x:%x:%x\n", + e->dmac[0], e->dmac[1], e->dmac[2], e->dmac[3], e->dmac[4], e->dmac[5]); if (atomic_load_acq_int(&e->refcnt)) { if (neigh != e->neigh) diff --git a/sys/dev/cxgb/cxgb_l2t.h b/sys/dev/cxgb/cxgb_l2t.h index 9b4effdb6667..a5d469beba37 100644 --- a/sys/dev/cxgb/cxgb_l2t.h +++ b/sys/dev/cxgb/cxgb_l2t.h @@ -118,7 +118,7 @@ static __inline void set_arp_failure_handler(struct mbuf *m, #define L2DATA(dev) ((dev)->l2opt) void t3_l2e_free(struct l2t_data *d, struct l2t_entry *e); -void t3_l2t_update(struct t3cdev *dev, struct rtentry *rt, struct sockaddr *sa); +void t3_l2t_update(struct t3cdev *dev, struct rtentry *rt, uint8_t *enaddr, struct sockaddr *sa); struct l2t_entry *t3_l2t_get(struct t3cdev *dev, struct rtentry *neigh, struct ifnet *ifp, struct sockaddr *sa); int t3_l2t_send_slow(struct t3cdev *dev, struct mbuf *m, diff --git a/sys/dev/cxgb/cxgb_main.c b/sys/dev/cxgb/cxgb_main.c index ef77dd558237..92e5f2f82a13 100644 --- a/sys/dev/cxgb/cxgb_main.c +++ b/sys/dev/cxgb/cxgb_main.c @@ -44,14 +44,15 @@ 
__FBSDID("$FreeBSD$"); #include #include #include -#include #include #include #include #include #include +#include #include #include +#include #include #include @@ -73,23 +74,18 @@ __FBSDID("$FreeBSD$"); #include #include - -#include -#include -#include - #ifdef CONFIG_DEFINED #include -#include #else #include -#include #endif #ifdef PRIV_SUPPORTED #include #endif +#include + static int cxgb_setup_msix(adapter_t *, int); static void cxgb_teardown_msix(adapter_t *); static void cxgb_init(void *); @@ -97,8 +93,6 @@ static void cxgb_init_locked(struct port_info *); static void cxgb_stop_locked(struct port_info *); static void cxgb_set_rxmode(struct port_info *); static int cxgb_ioctl(struct ifnet *, unsigned long, caddr_t); -static void cxgb_start(struct ifnet *); -static void cxgb_start_proc(void *, int ncount); static int cxgb_media_change(struct ifnet *); static void cxgb_media_status(struct ifnet *, struct ifmediareq *); static int setup_sge_qsets(adapter_t *); @@ -109,6 +103,10 @@ static void cxgb_down_locked(struct adapter *sc); static void cxgb_tick(void *); static void setup_rss(adapter_t *sc); +#ifndef IFNET_MULTIQUEUE +static void cxgb_start_proc(void *, int ncount); +#endif + /* Attachment glue for the PCI controller end of the device. Each port of * the device is attached separately, as defined later. */ @@ -122,11 +120,7 @@ static void cxgb_get_regs(adapter_t *sc, struct ifconf_regs *regs, uint8_t *buf) static int cxgb_get_regs_len(void); static int offload_open(struct port_info *pi); static void touch_bars(device_t dev); - -#ifdef notyet static int offload_close(struct t3cdev *tdev); -#endif - static device_method_t cxgb_controller_methods[] = { DEVMETHOD(device_probe, cxgb_controller_probe), @@ -188,7 +182,6 @@ DRIVER_MODULE(cxgb, cxgbc, cxgb_port_driver, cxgb_port_devclass, 0, 0); #define SGE_MSIX_COUNT (SGE_QSETS + 1) -extern int collapse_mbufs; /* * The driver uses the best interrupt scheme available on a platform in the * order MSI-X, MSI, legacy pin interrupts. This parameter determines which @@ -218,11 +211,15 @@ SYSCTL_UINT(_hw_cxgb, OID_AUTO, ofld_disable, CTLFLAG_RDTUN, &ofld_disable, 0, * The driver uses an auto-queue algorithm by default. * To disable it and force a single queue-set per port, use singleq = 1. */ -static int singleq = 1; +static int singleq = 0; TUNABLE_INT("hw.cxgb.singleq", &singleq); SYSCTL_UINT(_hw_cxgb, OID_AUTO, singleq, CTLFLAG_RDTUN, &singleq, 0, "use a single queue-set per port"); +#ifndef IFNET_MULTIQUEUE +int cxgb_txq_buf_ring_size = 0; +#endif + enum { MAX_TXQ_ENTRIES = 16384, MAX_CTRL_TXQ_ENTRIES = 1024, @@ -281,10 +278,24 @@ struct cxgb_ident { {0, 0, 0, NULL} }; - static int set_eeprom(struct port_info *pi, const uint8_t *data, int len, int offset); -static inline char +static __inline void +check_pkt_coalesce(struct sge_qset *qs) +{ + struct adapter *sc; + struct sge_txq *txq; + + txq = &qs->txq[TXQ_ETH]; + sc = qs->port->adapter; + + if (sc->tunq_fill[qs->idx] && (txq->in_use < (txq->size - (txq->size>>2)))) + sc->tunq_fill[qs->idx] = 0; + else if (!sc->tunq_fill[qs->idx] && (txq->in_use > (txq->size - (txq->size>>2)))) + sc->tunq_fill[qs->idx] = 1; +} + +static __inline char t3rev2char(struct adapter *adapter) { char rev = 'z'; @@ -582,6 +593,7 @@ cxgb_controller_attach(device_t dev) pi->tx_chan = i >= ai->nports0; pi->txpkt_intf = pi->tx_chan ? 
2 * (i - ai->nports0) + 1 : 2 * i; sc->rxpkt_map[pi->txpkt_intf] = i; + sc->port[i].tx_chan = i >= ai->nports0; sc->portdev[i] = child; device_set_softc(child, pi); } @@ -611,7 +623,7 @@ cxgb_controller_attach(device_t dev) G_FW_VERSION_MAJOR(vers), G_FW_VERSION_MINOR(vers), G_FW_VERSION_MICRO(vers)); - t3_add_sysctls(sc); + t3_add_attach_sysctls(sc); out: if (error) cxgb_free(sc); @@ -636,10 +648,14 @@ cxgb_free(struct adapter *sc) { int i; + +#ifdef IFNET_MULTIQUEUE + cxgb_pcpu_shutdown_threads(sc); +#endif ADAPTER_LOCK(sc); - /* - * drops the lock - */ +/* + * drops the lock + */ cxgb_down_locked(sc); #ifdef MSI_SUPPORTED @@ -664,7 +680,7 @@ cxgb_free(struct adapter *sc) * Wait for last callout */ - tsleep(&sc, 0, "cxgb unload", 3*hz); + DELAY(hz*100); for (i = 0; i < (sc)->params.nports; ++i) { if (sc->portdev[i] != NULL) @@ -674,15 +690,17 @@ cxgb_free(struct adapter *sc) bus_generic_detach(sc->dev); if (sc->tq != NULL) taskqueue_free(sc->tq); -#ifdef notyet if (is_offload(sc)) { cxgb_adapter_unofld(sc); if (isset(&sc->open_device_map, OFFLOAD_DEVMAP_BIT)) offload_close(&sc->tdev); - } -#endif - + else + printf("cxgb_free: DEVMAP_BIT not set\n"); + } else + printf("not offloading set\n"); +#ifndef IFNET_MULTIQUEUE t3_free_sge_resources(sc); +#endif free(sc->filters, M_DEVBUF); t3_sge_free(sc); @@ -696,8 +714,6 @@ cxgb_free(struct adapter *sc) MTX_DESTROY(&sc->sge.reg_lock); MTX_DESTROY(&sc->elmer_lock); ADAPTER_LOCK_DEINIT(sc); - - return; } /** @@ -803,7 +819,7 @@ cxgb_setup_msix(adapter_t *sc, int msix_count) printf("setting up interrupt for port=%d\n", qs->port->port_id); if (bus_setup_intr(sc->dev, sc->msix_irq_res[k], - INTR_MPSAFE|INTR_TYPE_NET, + INTR_MPSAFE|INTR_TYPE_NET, #ifdef INTR_FILTERS NULL, #endif @@ -812,10 +828,17 @@ cxgb_setup_msix(adapter_t *sc, int msix_count) "interrupt for message %d\n", rid); return (EINVAL); } +#ifdef IFNET_MULTIQUEUE + if (singleq == 0) { + int vector = rman_get_start(sc->msix_irq_res[k]); + if (bootverbose) + device_printf(sc->dev, "binding vector=%d to cpu=%d\n", vector, k % mp_ncpus); + intr_bind(vector, k % mp_ncpus); + } +#endif } } - return (0); } @@ -892,6 +915,12 @@ cxgb_port_attach(device_t dev) ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_ioctl = cxgb_ioctl; ifp->if_start = cxgb_start; + +#ifdef IFNET_MULTIQUEUE + ifp->if_flags |= IFF_MULTIQ; + ifp->if_mq_start = cxgb_pcpu_start; +#endif + ifp->if_timer = 0; /* Disable ifnet watchdog */ ifp->if_watchdog = NULL; @@ -965,7 +994,7 @@ cxgb_port_attach(device_t dev) p->tq = taskqueue_create_fast(p->taskqbuf, M_NOWAIT, taskqueue_thread_enqueue, &p->tq); #endif - +#ifndef IFNET_MULTIQUEUE if (p->tq == NULL) { device_printf(dev, "failed to allocate port task queue\n"); return (ENOMEM); @@ -974,7 +1003,7 @@ cxgb_port_attach(device_t dev) device_get_nameunit(dev)); TASK_INIT(&p->start_task, 0, cxgb_start_proc, ifp); - +#endif t3_sge_init_port(p); return (0); @@ -999,6 +1028,9 @@ cxgb_port_detach(device_t dev) } ether_ifdetach(p->ifp); + printf("waiting for callout to stop ..."); + DELAY(1000000); + printf("done\n"); /* * the lock may be acquired in ifdetach */ @@ -1247,9 +1279,7 @@ offload_tx(struct t3cdev *tdev, struct mbuf *m) { int ret; - critical_enter(); ret = t3_offload_tx(tdev, m); - critical_exit(); return (ret); } @@ -1264,6 +1294,8 @@ write_smt_entry(struct adapter *adapter, int idx) return (ENOMEM); req = mtod(m, struct cpl_smt_write_req *); + m->m_pkthdr.len = m->m_len = sizeof(struct cpl_smt_write_req); + req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD)); 
 OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SMT_WRITE_REQ, idx));
 	req->mtu_idx = NMTUS - 1;  /* should be 0 but there's a T3 bug */
@@ -1325,6 +1357,10 @@ bind_qsets(adapter_t *sc)
 {
 	int i, j;
 
+#ifdef IFNET_MULTIQUEUE
+	cxgb_pcpu_startup_threads(sc);
+#endif
+
 	for (i = 0; i < (sc)->params.nports; ++i) {
 		const struct port_info *pi = adap2pinfo(sc, i);
 
@@ -1473,6 +1509,7 @@ cxgb_up(struct adapter *sc)
 			goto out;
 
 		setup_rss(sc);
+		t3_add_configured_sysctls(sc);
 		sc->flags |= FULL_INIT_DONE;
 	}
 
@@ -1545,6 +1582,8 @@ cxgb_down_locked(struct adapter *sc)
 		cxgb_teardown_msix(sc);
 	ADAPTER_UNLOCK(sc);
 
+	callout_stop(&sc->cxgb_tick_ch);
+	callout_stop(&sc->sge_timer_ch);
 	callout_drain(&sc->cxgb_tick_ch);
 	callout_drain(&sc->sge_timer_ch);
 
@@ -1553,26 +1592,28 @@
 		for (i = 0; i < sc->params.nports; i++)
 			taskqueue_drain(sc->tq, &sc->port[i].timer_reclaim_task);
 	}
-#ifdef notyet
-
-	if (sc->port[i].tq != NULL)
-#endif
-
 }
 
 static int
 offload_open(struct port_info *pi)
 {
 	struct adapter *adapter = pi->adapter;
-	struct t3cdev *tdev = TOEDEV(pi->ifp);
+	struct t3cdev *tdev = &adapter->tdev;
+#ifdef notyet
+	T3CDEV(pi->ifp);
+#endif
 	int adap_up = adapter->open_device_map & PORT_MASK;
 	int err = 0;
 
+	printf("device_map=0x%x\n", adapter->open_device_map);
 	if (atomic_cmpset_int(&adapter->open_device_map,
-		(adapter->open_device_map & ~OFFLOAD_DEVMAP_BIT),
-		(adapter->open_device_map | OFFLOAD_DEVMAP_BIT)) == 0)
+		(adapter->open_device_map & ~(1<<OFFLOAD_DEVMAP_BIT)),
+		(adapter->open_device_map | (1<<OFFLOAD_DEVMAP_BIT))) == 0)
 		return (0);
 
+	if (!isset(&adapter->open_device_map, OFFLOAD_DEVMAP_BIT))
+		printf("offload_open: DEVMAP_BIT did not get set 0x%x\n",
+		    adapter->open_device_map);
 	ADAPTER_LOCK(pi->adapter);
 	if (!adap_up)
 		err = cxgb_up(adapter);
@@ -1581,7 +1622,7 @@ offload_open(struct port_info *pi)
 		return (err);
 
 	t3_tp_set_offload_mode(adapter, 1);
-	tdev->lldev = adapter->port[0].ifp;
+	tdev->lldev = pi->ifp;
 	err = cxgb_offload_activate(adapter);
 	if (err)
 		goto out;
@@ -1605,15 +1646,18 @@ offload_open(struct port_info *pi)
 	}
 	return (err);
 }
-#ifdef notyet
+
 static int
-offload_close(struct t3cev *tdev)
+offload_close(struct t3cdev *tdev)
 {
 	struct adapter *adapter = tdev2adap(tdev);
 
-	if (!isset(&adapter->open_device_map, OFFLOAD_DEVMAP_BIT))
+	if (!isset(&adapter->open_device_map, OFFLOAD_DEVMAP_BIT)) {
+		printf("offload_close: DEVMAP_BIT not set\n");
 		return (0);
-
+	}
+
 	/* Call back all registered clients */
 	cxgb_remove_clients(tdev);
 	tdev->lldev = NULL;
@@ -1621,13 +1665,15 @@ offload_close(struct t3cdev *tdev)
 	t3_tp_set_offload_mode(adapter, 0);
 	clrbit(&adapter->open_device_map, OFFLOAD_DEVMAP_BIT);
 
+	ADAPTER_LOCK(adapter);
 	if (!adapter->open_device_map)
-		cxgb_down(adapter);
-
+		cxgb_down_locked(adapter);
+	else
+		ADAPTER_UNLOCK(adapter);
 	cxgb_offload_deactivate(adapter);
 	return (0);
 }
-#endif
+
 
 static void
 cxgb_init(void *arg)
@@ -1667,6 +1713,8 @@ cxgb_init_locked(struct port_info *p)
 		if (err)
 			log(LOG_WARNING,
 			    "Could not initialize offload capabilities\n");
+		else
+			printf("offload opened\n");
 	}
 	cxgb_link_start(p);
 	t3_link_changed(sc, p->port_id);
@@ -1675,8 +1723,7 @@ cxgb_init_locked(struct port_info *p)
 	device_printf(sc->dev, "enabling interrupts on port=%d\n", p->port_id);
 	t3_port_intr_enable(sc, p->port_id);
 
-	callout_reset(&sc->cxgb_tick_ch, sc->params.stats_update_period * hz,
-	    cxgb_tick, sc);
+	callout_reset(&sc->cxgb_tick_ch, hz, cxgb_tick, sc);
 
 	ifp->if_drv_flags |= IFF_DRV_RUNNING;
 	ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
@@ -1703,7 +1750,6 @@ cxgb_stop_locked(struct port_info *p)
 	ADAPTER_LOCK_ASSERT_NOTOWNED(p->adapter);
 
 	ifp = p->ifp;
-
t3_port_intr_disable(p->adapter, p->port_id); ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); p->phy.ops->power_down(&p->phy, 1); @@ -1712,7 +1758,6 @@ cxgb_stop_locked(struct port_info *p) ADAPTER_LOCK(p->adapter); clrbit(&p->adapter->open_device_map, p->port_id); - if (p->adapter->open_device_map == 0) { cxgb_down_locked(p->adapter); } else @@ -1786,8 +1831,7 @@ cxgb_ioctl(struct ifnet *ifp, unsigned long command, caddr_t data) if (ifp->if_drv_flags & IFF_DRV_RUNNING) { adapter_t *sc = p->adapter; - callout_reset(&sc->cxgb_tick_ch, - sc->params.stats_update_period * hz, + callout_reset(&sc->cxgb_tick_ch, hz, cxgb_tick, sc); } PORT_UNLOCK(p); @@ -1838,77 +1882,92 @@ cxgb_ioctl(struct ifnet *ifp, unsigned long command, caddr_t data) return (error); } +int +cxgb_tx_common(struct ifnet *ifp, struct sge_qset *qs, uint32_t txmax) +{ + struct sge_txq *txq; + int err, in_use_init, count; + struct mbuf **m_vec; + + txq = &qs->txq[TXQ_ETH]; + m_vec = txq->txq_m_vec; + in_use_init = txq->in_use; + err = 0; + while ((txq->in_use - in_use_init < txmax) && + (txq->size > txq->in_use + TX_MAX_DESC)) { + check_pkt_coalesce(qs); + count = cxgb_dequeue_packet(ifp, txq, m_vec); + if (count == 0) + break; + ETHER_BPF_MTAP(ifp, m_vec[0]); + + if ((err = t3_encap(qs, m_vec, count)) != 0) + break; + txq->txq_enqueued += count; + } +#ifndef IFNET_MULTIQUEUE + if (__predict_false(err)) { + if (err == ENOMEM) { + ifp->if_drv_flags |= IFF_DRV_OACTIVE; + IFQ_LOCK(&ifp->if_snd); + IFQ_DRV_PREPEND(&ifp->if_snd, m_vec[0]); + IFQ_UNLOCK(&ifp->if_snd); + } + } + if (err == 0 && m_vec[0] == NULL) { + err = ENOBUFS; + } + else if ((err == 0) && (txq->size <= txq->in_use + TX_MAX_DESC) && + (ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) { + ifp->if_drv_flags |= IFF_DRV_OACTIVE; + err = ENOSPC; + } +#else + if ((err == 0) && (txq->size <= txq->in_use + TX_MAX_DESC)) { + err = ENOSPC; + setbit(&qs->txq_stopped, TXQ_ETH); + } + if (err == ENOMEM) { + int i; + /* + * Sub-optimal :-/ + */ + for (i = 0; i < count; i++) + m_freem(m_vec[i]); + } +#endif + return (err); +} + +#ifndef IFNET_MULTIQUEUE static int cxgb_start_tx(struct ifnet *ifp, uint32_t txmax) { struct sge_qset *qs; struct sge_txq *txq; struct port_info *p = ifp->if_softc; - struct mbuf *m = NULL; - int err, in_use_init, free; - + int err; + if (!p->link_config.link_ok) return (ENXIO); - if (IFQ_DRV_IS_EMPTY(&ifp->if_snd)) + if (IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { return (ENOBUFS); - + } + qs = &p->adapter->sge.qs[p->first_qset]; txq = &qs->txq[TXQ_ETH]; err = 0; if (txq->flags & TXQ_TRANSMITTING) return (EINPROGRESS); - + mtx_lock(&txq->lock); txq->flags |= TXQ_TRANSMITTING; - in_use_init = txq->in_use; - while ((txq->in_use - in_use_init < txmax) && - (txq->size > txq->in_use + TX_MAX_DESC)) { - free = 0; - IFQ_DRV_DEQUEUE(&ifp->if_snd, m); - if (m == NULL) - break; - /* - * Convert chain to M_IOVEC - */ - KASSERT((m->m_flags & M_IOVEC) == 0, ("IOVEC set too early")); -#ifdef notyet - m0 = m; - if (collapse_mbufs && m->m_pkthdr.len > MCLBYTES && - m_collapse(m, TX_MAX_SEGS, &m0) == EFBIG) { - if ((m0 = m_defrag(m, M_NOWAIT)) != NULL) { - m = m0; - m_collapse(m, TX_MAX_SEGS, &m0); - } else - break; - } - m = m0; -#endif - if ((err = t3_encap(p, &m, &free)) != 0) - break; - BPF_MTAP(ifp, m); - if (free) - m_freem(m); - } + cxgb_tx_common(ifp, qs, txmax); txq->flags &= ~TXQ_TRANSMITTING; mtx_unlock(&txq->lock); - if (__predict_false(err)) { - if (err == ENOMEM) { - ifp->if_drv_flags |= IFF_DRV_OACTIVE; - IFQ_LOCK(&ifp->if_snd); - IFQ_DRV_PREPEND(&ifp->if_snd, 
m); - IFQ_UNLOCK(&ifp->if_snd); - } - } - if (err == 0 && m == NULL) - err = ENOBUFS; - else if ((err == 0) && (txq->size <= txq->in_use + TX_MAX_DESC) && - (ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) { - ifp->if_drv_flags |= IFF_DRV_OACTIVE; - err = ENOSPC; - } return (err); } @@ -1932,7 +1991,15 @@ cxgb_start_proc(void *arg, int ncount) } while (error == 0); } -static void +int +cxgb_dequeue_packet(struct ifnet *ifp, struct sge_txq *unused, struct mbuf **m_vec) +{ + + IFQ_DRV_DEQUEUE(&ifp->if_snd, m_vec[0]); + return (m_vec[0] ? 1 : 0); +} + +void cxgb_start(struct ifnet *ifp) { struct port_info *pi = ifp->if_softc; @@ -1952,7 +2019,7 @@ cxgb_start(struct ifnet *ifp) if (err == 0) taskqueue_enqueue(pi->tq, &pi->start_task); } - +#endif static int cxgb_media_change(struct ifnet *ifp) @@ -2078,12 +2145,26 @@ static void cxgb_tick(void *arg) { adapter_t *sc = (adapter_t *)arg; + int i, running = 0; + + for_each_port(sc, i) { + + struct port_info *p = &sc->port[i]; + struct ifnet *ifp = p->ifp; + PORT_LOCK(p); + if ((ifp->if_drv_flags & IFF_DRV_RUNNING)) + running = 1; + PORT_UNLOCK(p); + } + + if (running == 0) + return; + taskqueue_enqueue(sc->tq, &sc->tick_task); if (sc->open_device_map != 0) - callout_reset(&sc->cxgb_tick_ch, sc->params.stats_update_period * hz, - cxgb_tick, sc); + callout_reset(&sc->cxgb_tick_ch, hz, cxgb_tick, sc); } static void @@ -2478,7 +2559,7 @@ cxgb_extension_ioctl(struct cdev *dev, unsigned long cmd, caddr_t data, * Read 256 bytes at a time as len can be large and we don't * want to use huge intermediate buffers. */ - useraddr = (uint8_t *)(t + 1); /* advance to start of buffer */ + useraddr = (uint8_t *)t->buf; while (t->len) { unsigned int chunk = min(t->len, sizeof(buf)); diff --git a/sys/dev/cxgb/cxgb_offload.c b/sys/dev/cxgb/cxgb_offload.c index d0b9b324f2c6..3ce1a11b77ef 100644 --- a/sys/dev/cxgb/cxgb_offload.c +++ b/sys/dev/cxgb/cxgb_offload.c @@ -108,9 +108,12 @@ cxgb_register_client(struct cxgb_client *client) printf("client->add set\n"); TAILQ_FOREACH(tdev, &ofld_dev_list, entry) { - if (offload_activated(tdev)) + if (offload_activated(tdev)) { + printf("calling add=%p on %p\n", + client->add, tdev); + client->add(tdev); - else + } else printf("%p not activated\n", tdev); } @@ -477,7 +480,8 @@ rx_offload_blackhole(struct t3cdev *dev, struct mbuf **m, int n) } static void -dummy_neigh_update(struct t3cdev *dev, struct rtentry *neigh, struct sockaddr *sa) +dummy_neigh_update(struct t3cdev *dev, struct rtentry *neigh, uint8_t *enaddr, + struct sockaddr *sa) { } @@ -895,17 +899,32 @@ do_term(struct t3cdev *dev, struct mbuf *m) } static void -cxgb_route_event(void *unused, int event, struct rtentry *rt0, +cxgb_arp_update_event(void *unused, struct rtentry *rt0, + uint8_t *enaddr, struct sockaddr *sa) +{ + + if (TOEDEV(rt0->rt_ifp) == NULL) + return; + + RT_ADDREF(rt0); + RT_UNLOCK(rt0); + cxgb_neigh_update(rt0, enaddr, sa); + RT_LOCK(rt0); + RT_REMREF(rt0); +} + + +static void +cxgb_redirect_event(void *unused, int event, struct rtentry *rt0, struct rtentry *rt1, struct sockaddr *sa) { - struct toedev *tdev0, *tdev1 = NULL; + struct toedev *tdev0, *tdev1; /* * ignore events on non-offloaded interfaces */ tdev0 = TOEDEV(rt0->rt_ifp); - if (rt1) - tdev1 = TOEDEV(rt1->rt_ifp); + tdev1 = TOEDEV(rt1->rt_ifp); if (tdev0 == NULL && tdev1 == NULL) return; /* @@ -914,34 +933,16 @@ cxgb_route_event(void *unused, int event, struct rtentry *rt0, */ RT_ADDREF(rt0); RT_UNLOCK(rt0); - if (rt1) { - RT_ADDREF(rt1); - RT_UNLOCK(rt1); - } - - switch (event) { - case 
RTEVENT_ARP_UPDATE: { - cxgb_neigh_update(rt0, sa); - break; - } - case RTEVENT_REDIRECT_UPDATE: { - cxgb_redirect(rt0, rt1, sa); - cxgb_neigh_update(rt1, sa); + RT_ADDREF(rt1); + RT_UNLOCK(rt1); - break; - } - case RTEVENT_PMTU_UPDATE: - default: - break; - } + cxgb_redirect(rt0, rt1, sa); + cxgb_neigh_update(rt1, NULL, sa); RT_LOCK(rt0); RT_REMREF(rt0); - if (rt1) { - RT_LOCK(rt1); - RT_REMREF(rt1); - } - + RT_LOCK(rt1); + RT_REMREF(rt1); } /* @@ -1048,14 +1049,14 @@ cxgb_ofld_recv(struct t3cdev *dev, struct mbuf **m, int n) } void -cxgb_neigh_update(struct rtentry *rt, struct sockaddr *sa) +cxgb_neigh_update(struct rtentry *rt, uint8_t *enaddr, struct sockaddr *sa) { if (is_offloading(rt->rt_ifp)) { struct t3cdev *tdev = T3CDEV(rt->rt_ifp); PANIC_IF(!tdev); - t3_l2t_update(tdev, rt, sa); + t3_l2t_update(tdev, rt, enaddr, sa); } } @@ -1425,7 +1426,10 @@ cxgb_offload_init(void) t3_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_hwtid_rpl); t3_register_cpl_handler(CPL_ISCSI_HDR, do_hwtid_rpl); - EVENTHANDLER_REGISTER(route_event, cxgb_route_event, NULL, EVENTHANDLER_PRI_ANY); + EVENTHANDLER_REGISTER(route_arp_update_event, cxgb_arp_update_event, + NULL, EVENTHANDLER_PRI_ANY); + EVENTHANDLER_REGISTER(route_redirect_event, cxgb_redirect_event, + NULL, EVENTHANDLER_PRI_ANY); #if 0 if (offload_proc_init()) diff --git a/sys/dev/cxgb/cxgb_offload.h b/sys/dev/cxgb/cxgb_offload.h index 59afe6b0fbe9..8c84d07586f0 100644 --- a/sys/dev/cxgb/cxgb_offload.h +++ b/sys/dev/cxgb/cxgb_offload.h @@ -253,7 +253,7 @@ static inline struct toe_tid_entry *lookup_atid(const struct tid_info *t, void *cxgb_alloc_mem(unsigned long size); void cxgb_free_mem(void *addr); -void cxgb_neigh_update(struct rtentry *rt, struct sockaddr *sa); +void cxgb_neigh_update(struct rtentry *rt, uint8_t *enaddr, struct sockaddr *sa); void cxgb_redirect(struct rtentry *old, struct rtentry *new, struct sockaddr *sa); int process_rx(struct t3cdev *dev, struct mbuf **m, int n); int attach_t3cdev(struct t3cdev *dev); diff --git a/sys/dev/cxgb/cxgb_osdep.h b/sys/dev/cxgb/cxgb_osdep.h index cf5a8b62f2a2..7f757790dc2e 100644 --- a/sys/dev/cxgb/cxgb_osdep.h +++ b/sys/dev/cxgb/cxgb_osdep.h @@ -36,6 +36,9 @@ POSSIBILITY OF SUCH DAMAGE. #include #include +#include +#include + #include #ifdef CONFIG_DEFINED @@ -52,18 +55,17 @@ POSSIBILITY OF SUCH DAMAGE. 
typedef struct adapter adapter_t; struct sge_rspq; + struct t3_mbuf_hdr { struct mbuf *mh_head; struct mbuf *mh_tail; }; - #define PANIC_IF(exp) do { \ if (exp) \ panic("BUG: %s", #exp); \ } while (0) - #define m_get_priority(m) ((uintptr_t)(m)->m_pkthdr.rcvif) #define m_set_priority(m, pri) ((m)->m_pkthdr.rcvif = (struct ifnet *)((uintptr_t)pri)) #define m_set_sgl(m, sgl) ((m)->m_pkthdr.header = (sgl)) @@ -113,6 +115,7 @@ struct t3_mbuf_hdr { #define CXGB_TX_CLEANUP_THRESHOLD 32 + #ifdef DEBUG_PRINT #define DPRINTF printf #else @@ -121,19 +124,25 @@ struct t3_mbuf_hdr { #define TX_MAX_SIZE (1 << 16) /* 64KB */ #define TX_MAX_SEGS 36 /* maximum supported by card */ + #define TX_MAX_DESC 4 /* max descriptors per packet */ + #define TX_START_MIN_DESC (TX_MAX_DESC << 2) -#if 0 -#define TX_START_MAX_DESC (TX_ETH_Q_SIZE >> 2) /* maximum number of descriptors */ -#endif + #define TX_START_MAX_DESC (TX_MAX_DESC << 3) /* maximum number of descriptors * call to start used per */ #define TX_CLEAN_MAX_DESC (TX_MAX_DESC << 4) /* maximum tx descriptors * to clean per iteration */ +#define TX_WR_SIZE_MAX 11*1024 /* the maximum total size of packets aggregated into a single + * TX WR + */ +#define TX_WR_COUNT_MAX 7 /* the maximum total number of packets that can be + * aggregated into a single TX WR + */ #if defined(__i386__) || defined(__amd64__) @@ -142,7 +151,7 @@ struct t3_mbuf_hdr { #define wmb() __asm volatile("sfence" ::: "memory") #define smp_mb() mb() -#define L1_CACHE_BYTES 64 +#define L1_CACHE_BYTES 128 static __inline void prefetch(void *x) { @@ -167,6 +176,107 @@ extern void kdb_backtrace(void); #define prefetch(x) #define L1_CACHE_BYTES 32 #endif + +struct buf_ring { + caddr_t *br_ring; + volatile uint32_t br_cons; + volatile uint32_t br_prod; + int br_size; + struct mtx br_lock; +}; + +struct buf_ring *buf_ring_alloc(int count, int flags); +void buf_ring_free(struct buf_ring *); + +static __inline int +buf_ring_count(struct buf_ring *mr) +{ + int size = mr->br_size; + int mask = size - 1; + + return ((size + mr->br_prod - mr->br_cons) & mask); +} + +static __inline int +buf_ring_empty(struct buf_ring *mr) +{ + return (mr->br_cons == mr->br_prod); +} + +/* + * The producer and consumer are independently locked + * this relies on the consumer providing his own serialization + * + */ +static __inline void * +buf_ring_dequeue(struct buf_ring *mr) +{ + int prod, cons, mask; + caddr_t *ring, m; + + ring = (caddr_t *)mr->br_ring; + mask = mr->br_size - 1; + cons = mr->br_cons; + prod = mr->br_prod; + m = NULL; + if (cons != prod) { + m = ring[cons]; + mr->br_cons = (cons + 1) & mask; + mb(); + } + return (m); +} + + +static __inline int +__buf_ring_enqueue(struct buf_ring *mr, void *m) +{ + + int prod, cons, mask, err; + + cons = mr->br_cons; + prod = mr->br_prod; + mask = mr->br_size - 1; + if (((prod + 1) & mask) != cons) { + mr->br_ring[prod] = m; + mb(); + mr->br_prod = (prod + 1) & mask; + err = 0; + } else + err = ENOBUFS; + + return (err); +} + +static __inline int +buf_ring_enqueue(struct buf_ring *mr, void *m) +{ + int err; + + mtx_lock(&mr->br_lock); + err = __buf_ring_enqueue(mr, m); + mtx_unlock(&mr->br_lock); + + return (err); +} + +static __inline void * +buf_ring_peek(struct buf_ring *mr) +{ + int prod, cons, mask; + caddr_t *ring, m; + + ring = (caddr_t *)mr->br_ring; + mask = mr->br_size - 1; + cons = mr->br_cons; + prod = mr->br_prod; + m = NULL; + if (cons != prod) + m = ring[cons]; + + return (m); +} + #define DBG_RX (1 << 0) static const int debug_flags = DBG_RX; @@ -189,15 
+299,12 @@ static const int debug_flags = DBG_RX; #define t3_os_sleep(x) DELAY((x) * 1000) -#define test_and_clear_bit(bit, p) atomic_cmpset_int((p), ((*(p)) | bit), ((*(p)) & ~bit)) - +#define test_and_clear_bit(bit, p) atomic_cmpset_int((p), ((*(p)) | (1< #include #include -#include #include #include #include #include #include +#include #include #include @@ -59,8 +59,7 @@ __FBSDID("$FreeBSD$"); #include #include -#include -#include +#include #ifdef CONFIG_DEFINED #include @@ -70,14 +69,15 @@ __FBSDID("$FreeBSD$"); #include #endif -uint32_t collapse_free = 0; -uint32_t mb_free_vec_free = 0; int txq_fills = 0; -int collapse_mbufs = 0; static int bogus_imm = 0; #ifndef DISABLE_MBUF_IOVEC static int recycle_enable = 1; #endif +extern int cxgb_txq_buf_ring_size; +int cxgb_cached_allocations; +int cxgb_cached; +int cxgb_ext_freed; #define USE_GTS 0 @@ -134,15 +134,17 @@ struct rsp_desc { /* response queue descriptor */ #define RSPQ_SOP_EOP G_RSPD_SOP_EOP(F_RSPD_SOP|F_RSPD_EOP) struct tx_sw_desc { /* SW state per Tx descriptor */ - struct mbuf *m; + struct mbuf_iovec mi; bus_dmamap_t map; int flags; }; struct rx_sw_desc { /* SW state per Rx descriptor */ - void *cl; - bus_dmamap_t map; - int flags; + caddr_t rxsd_cl; + uint32_t *rxsd_ref; + caddr_t data; + bus_dmamap_t map; + int flags; }; struct txq_state { @@ -186,11 +188,9 @@ static uint8_t flit_desc_map[] = { static int lro_default = 0; int cxgb_debug = 0; -static void t3_free_qset(adapter_t *sc, struct sge_qset *q); static void sge_timer_cb(void *arg); static void sge_timer_reclaim(void *arg, int ncount); static void sge_txq_reclaim_handler(void *arg, int ncount); -static int free_tx_desc(struct sge_txq *q, int n, struct mbuf **m_vec); /** * reclaim_completed_tx - reclaims completed Tx descriptors @@ -202,19 +202,17 @@ static int free_tx_desc(struct sge_txq *q, int n, struct mbuf **m_vec); * queue's lock held. 
*/ static __inline int -reclaim_completed_tx(struct sge_txq *q, int nbufs, struct mbuf **mvec) +reclaim_completed_tx(struct sge_txq *q) { - int reclaimed, reclaim = desc_reclaimable(q); - int n = 0; + int reclaim = desc_reclaimable(q); mtx_assert(&q->lock, MA_OWNED); if (reclaim > 0) { - n = free_tx_desc(q, min(reclaim, nbufs), mvec); - reclaimed = min(reclaim, nbufs); - q->cleaned += reclaimed; - q->in_use -= reclaimed; + t3_free_tx_desc(q, reclaim); + q->cleaned += reclaim; + q->in_use -= reclaim; } - return (n); + return (reclaim); } /** @@ -298,38 +296,14 @@ sgl_len(unsigned int n) static __inline int get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct t3_mbuf_hdr *mh) { - struct mbuf *m; - int len; - uint32_t flags = ntohl(resp->flags); - uint8_t sopeop = G_RSPD_SOP_EOP(flags); - - /* - * would be a firmware bug - */ - if (sopeop == RSPQ_NSOP_NEOP || sopeop == RSPQ_SOP) return (0); - m = m_gethdr(M_NOWAIT, MT_DATA); - len = G_RSPD_LEN(ntohl(resp->len_cq)); + m = m_gethdr(M_DONTWAIT, MT_DATA); + len = IMMED_PKT_SIZE; if (m) { - MH_ALIGN(m, IMMED_PKT_SIZE); memcpy(m->m_data, resp->imm_data, IMMED_PKT_SIZE); - m->m_len = len; - - switch (sopeop) { - case RSPQ_SOP_EOP: - mh->mh_head = mh->mh_tail = m; - m->m_pkthdr.len = len; - m->m_flags |= M_PKTHDR; - break; - case RSPQ_EOP: - m->m_flags &= ~M_PKTHDR; - mh->mh_head->m_pkthdr.len += len; - mh->mh_tail->m_next = m; - mh->mh_tail = m; - break; - } + m->m_pkthdr.len = m->m_len = len; } return (m != NULL); } @@ -338,35 +312,11 @@ get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct t3_mbuf_hdr *m static int get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct mbuf *m, void *cl, uint32_t flags) { - int len, error; - uint8_t sopeop = G_RSPD_SOP_EOP(flags); - - /* - * would be a firmware bug - */ - len = G_RSPD_LEN(ntohl(resp->len_cq)); - if (sopeop == RSPQ_NSOP_NEOP || sopeop == RSPQ_SOP) { - if (cxgb_debug) - device_printf(sc->dev, "unexpected value sopeop=%d flags=0x%x len=%din get_imm_packet\n", sopeop, flags, len); - bogus_imm++; - return (EINVAL); - } - error = 0; - switch (sopeop) { - case RSPQ_SOP_EOP: - m->m_len = m->m_pkthdr.len = len; - memcpy(mtod(m, uint8_t *), resp->imm_data, len); - break; - case RSPQ_EOP: - memcpy(cl, resp->imm_data, len); - m_iovappend(m, cl, MSIZE, len, 0); - break; - default: - bogus_imm++; - error = EINVAL; - } - return (error); + m->m_len = m->m_pkthdr.len = IMMED_PKT_SIZE; + memcpy(mtod(m, uint8_t *), resp->imm_data, IMMED_PKT_SIZE); + return (0); + } #endif @@ -413,11 +363,15 @@ t3_sge_prep(adapter_t *adap, struct sge_params *p) q->polling = adap->params.rev > 0; - if (adap->params.nports > 2) + if (adap->params.nports > 2) { q->coalesce_nsecs = 50000; - else + } else { +#ifdef INVARIANTS + q->coalesce_nsecs = 20000; +#else q->coalesce_nsecs = 5000; - +#endif + } q->rspq_size = RSPQ_Q_SIZE; q->fl_size = FL_Q_SIZE; q->jumbo_size = JUMBO_Q_SIZE; @@ -509,6 +463,7 @@ t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p) qs->rspq.polling = 0 /* p->polling */; } +#if !defined(__i386__) && !defined(__amd64__) static void refill_fl_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error) { @@ -519,7 +474,7 @@ refill_fl_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error) cb_arg->nseg = nseg; } - +#endif /** * refill_fl - refill an SGE free-buffer list * @sc: the controller softc @@ -535,7 +490,7 @@ refill_fl(adapter_t *sc, struct sge_fl *q, int n) struct rx_sw_desc *sd = &q->sdesc[q->pidx]; struct rx_desc *d = &q->desc[q->pidx]; struct 
refill_fl_cb_arg cb_arg; - void *cl; + caddr_t cl; int err; cb_arg.error = 0; @@ -543,10 +498,11 @@ refill_fl(adapter_t *sc, struct sge_fl *q, int n) /* * We only allocate a cluster, mbuf allocation happens after rx */ - if ((cl = m_cljget(NULL, M_DONTWAIT, q->buf_size)) == NULL) { + if ((cl = cxgb_cache_get(q->zone)) == NULL) { log(LOG_WARNING, "Failed to allocate cluster\n"); goto done; } + if ((sd->flags & RX_SW_DESC_MAP_CREATED) == 0) { if ((err = bus_dmamap_create(q->entry_tag, 0, &sd->map))) { log(LOG_WARNING, "bus_dmamap_create failed %d\n", err); @@ -555,7 +511,9 @@ refill_fl(adapter_t *sc, struct sge_fl *q, int n) } sd->flags |= RX_SW_DESC_MAP_CREATED; } - err = bus_dmamap_load(q->entry_tag, sd->map, cl, q->buf_size, +#if !defined(__i386__) && !defined(__amd64__) + err = bus_dmamap_load(q->entry_tag, sd->map, + cl + sizeof(struct m_hdr) + sizeof(struct pkthdr) + sizeof(struct m_ext_) + sizeof(uint32_t), q->buf_size, refill_fl_cb, &cb_arg, 0); if (err != 0 || cb_arg.error) { @@ -565,9 +523,14 @@ refill_fl(adapter_t *sc, struct sge_fl *q, int n) */ return; } - +#else + cb_arg.seg.ds_addr = pmap_kextract((vm_offset_t)(cl + sizeof(struct m_hdr) + + sizeof(struct pkthdr) + sizeof(struct m_ext_) + sizeof(uint32_t))); +#endif sd->flags |= RX_SW_DESC_INUSE; - sd->cl = cl; + sd->rxsd_cl = cl; + sd->rxsd_ref = (uint32_t *)(cl + sizeof(struct m_hdr) + sizeof(struct pkthdr) + sizeof(struct m_ext_)); + sd->data = cl + sizeof(struct m_hdr) + sizeof(struct pkthdr) + sizeof(struct m_ext_) + sizeof(uint32_t); d->addr_lo = htobe32(cb_arg.seg.ds_addr & 0xffffffff); d->addr_hi = htobe32(((uint64_t)cb_arg.seg.ds_addr >>32) & 0xffffffff); d->len_gen = htobe32(V_FLD_GEN1(q->gen)); @@ -609,9 +572,9 @@ free_rx_bufs(adapter_t *sc, struct sge_fl *q) if (d->flags & RX_SW_DESC_INUSE) { bus_dmamap_unload(q->entry_tag, d->map); bus_dmamap_destroy(q->entry_tag, d->map); - uma_zfree(q->zone, d->cl); + uma_zfree(q->zone, d->rxsd_cl); } - d->cl = NULL; + d->rxsd_cl = NULL; if (++cidx == q->size) cidx = 0; } @@ -623,6 +586,19 @@ __refill_fl(adapter_t *adap, struct sge_fl *fl) refill_fl(adap, fl, min(16U, fl->size - fl->credits)); } +static __inline void +__refill_fl_lt(adapter_t *adap, struct sge_fl *fl, int max) +{ + if ((fl->size - fl->credits) < max) + refill_fl(adap, fl, min(max, fl->size - fl->credits)); +} + +void +refill_fl_service(adapter_t *adap, struct sge_fl *fl) +{ + __refill_fl_lt(adap, fl, 512); +} + #ifndef DISABLE_MBUF_IOVEC /** * recycle_rx_buf - recycle a receive buffer @@ -753,12 +729,13 @@ static void sge_timer_cb(void *arg) { adapter_t *sc = arg; - struct port_info *p; +#ifndef IFNET_MULTIQUEUE + struct port_info *pi; struct sge_qset *qs; struct sge_txq *txq; int i, j; int reclaim_eth, reclaim_ofl, refill_rx; - + for (i = 0; i < sc->params.nports; i++) for (j = 0; j < sc->port[i].nqsets; j++) { qs = &sc->sge.qs[i + j]; @@ -768,11 +745,12 @@ sge_timer_cb(void *arg) refill_rx = ((qs->fl[0].credits < qs->fl[0].size) || (qs->fl[1].credits < qs->fl[1].size)); if (reclaim_eth || reclaim_ofl || refill_rx) { - p = &sc->port[i]; - taskqueue_enqueue(p->tq, &p->timer_reclaim_task); + pi = &sc->port[i]; + taskqueue_enqueue(pi->tq, &pi->timer_reclaim_task); break; } } +#endif if (sc->params.nports > 2) { int i; @@ -799,13 +777,15 @@ t3_sge_init_adapter(adapter_t *sc) callout_init(&sc->sge_timer_ch, CALLOUT_MPSAFE); callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc); TASK_INIT(&sc->slow_intr_task, 0, sge_slow_intr_handler, sc); + mi_init(); + cxgb_cache_init(); return (0); } int 
-t3_sge_init_port(struct port_info *p) +t3_sge_init_port(struct port_info *pi) { - TASK_INIT(&p->timer_reclaim_task, 0, sge_timer_reclaim, p); + TASK_INIT(&pi->timer_reclaim_task, 0, sge_timer_reclaim, pi); return (0); } @@ -820,6 +800,8 @@ t3_sge_deinit_sw(adapter_t *sc) for (i = 0; i < sc->params.nports; i++) if (sc->port[i].tq != NULL) taskqueue_drain(sc->port[i].tq, &sc->port[i].timer_reclaim_task); + + mi_deinit(); } /** @@ -843,29 +825,22 @@ refill_rspq(adapter_t *sc, const struct sge_rspq *q, u_int credits) static __inline void sge_txq_reclaim_(struct sge_txq *txq) { - int reclaimable, i, n; - struct mbuf *m_vec[TX_CLEAN_MAX_DESC]; - struct port_info *p; + int reclaimable, n; + struct port_info *pi; - p = txq->port; + pi = txq->port; reclaim_more: n = 0; reclaimable = desc_reclaimable(txq); if (reclaimable > 0 && mtx_trylock(&txq->lock)) { - n = reclaim_completed_tx(txq, TX_CLEAN_MAX_DESC, m_vec); + n = reclaim_completed_tx(txq); mtx_unlock(&txq->lock); } - if (n == 0) - return; - - for (i = 0; i < n; i++) { - m_freem(m_vec[i]); - } - if (p && p->ifp->if_drv_flags & IFF_DRV_OACTIVE && + if (pi && pi->ifp->if_drv_flags & IFF_DRV_OACTIVE && txq->size - txq->in_use >= TX_START_MAX_DESC) { txq_fills++; - p->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; - taskqueue_enqueue(p->tq, &p->start_task); + pi->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; + taskqueue_enqueue(pi->tq, &pi->start_task); } if (n) @@ -883,13 +858,16 @@ sge_txq_reclaim_handler(void *arg, int ncount) static void sge_timer_reclaim(void *arg, int ncount) { - struct port_info *p = arg; - int i, nqsets = p->nqsets; - adapter_t *sc = p->adapter; + struct port_info *pi = arg; + int i, nqsets = pi->nqsets; + adapter_t *sc = pi->adapter; struct sge_qset *qs; struct sge_txq *txq; struct mtx *lock; +#ifdef IFNET_MULTIQUEUE + panic("%s should not be called with multiqueue support\n", __FUNCTION__); +#endif for (i = 0; i < nqsets; i++) { qs = &sc->sge.qs[i]; txq = &qs->txq[TXQ_ETH]; @@ -942,6 +920,10 @@ init_qset_cntxt(struct sge_qset *qs, u_int id) qs->txq[TXQ_OFLD].cntxt_id = FW_OFLD_SGEEC_START + id; qs->txq[TXQ_CTRL].cntxt_id = FW_CTRL_SGEEC_START + id; qs->txq[TXQ_CTRL].token = FW_CTRL_TID_START + id; + + mbufq_init(&qs->txq[TXQ_ETH].sendq); + mbufq_init(&qs->txq[TXQ_OFLD].sendq); + mbufq_init(&qs->txq[TXQ_CTRL].sendq); } @@ -985,7 +967,7 @@ calc_tx_descs(const struct mbuf *m, int nsegs) flits = sgl_len(nsegs) + 2; #ifdef TSO_SUPPORTED - if (m->m_pkthdr.csum_flags & (CSUM_TSO)) + if (m->m_pkthdr.csum_flags & CSUM_TSO) flits++; #endif return flits_to_desc(flits); @@ -993,28 +975,27 @@ calc_tx_descs(const struct mbuf *m, int nsegs) static unsigned int busdma_map_mbufs(struct mbuf **m, struct sge_txq *txq, - struct tx_sw_desc *stx, bus_dma_segment_t *segs, int *nsegs) + struct tx_sw_desc *txsd, bus_dma_segment_t *segs, int *nsegs) { struct mbuf *m0; - int err, pktlen; + int err, pktlen, pass = 0; +retry: + err = 0; m0 = *m; pktlen = m0->m_pkthdr.len; - - err = bus_dmamap_load_mbuf_sg(txq->entry_tag, stx->map, m0, segs, nsegs, 0); -#ifdef DEBUG - if (err) { - int n = 0; - struct mbuf *mtmp = m0; - while(mtmp) { - n++; - mtmp = mtmp->m_next; - } - printf("map_mbufs: bus_dmamap_load_mbuf_sg failed with %d - pkthdr.len==%d nmbufs=%d\n", - err, m0->m_pkthdr.len, n); - } +#if defined(__i386__) || defined(__amd64__) + if (busdma_map_sg_collapse(m, segs, nsegs) == 0) { + goto done; + } else #endif - if (err == EFBIG) { + err = bus_dmamap_load_mbuf_sg(txq->entry_tag, txsd->map, m0, segs, nsegs, 0); + + if (err == 0) { + goto done; + } + if (err == 
EFBIG && pass == 0) { + pass = 1; /* Too many segments, try to defrag */ m0 = m_defrag(m0, M_DONTWAIT); if (m0 == NULL) { @@ -1023,23 +1004,21 @@ busdma_map_mbufs(struct mbuf **m, struct sge_txq *txq, return (ENOBUFS); } *m = m0; - err = bus_dmamap_load_mbuf_sg(txq->entry_tag, stx->map, m0, segs, nsegs, 0); - } - - if (err == ENOMEM) { + goto retry; + } else if (err == ENOMEM) { return (err); - } - - if (err) { + } if (err) { if (cxgb_debug) printf("map failure err=%d pktlen=%d\n", err, pktlen); m_freem(m0); *m = NULL; return (err); } - - bus_dmamap_sync(txq->entry_tag, stx->map, BUS_DMASYNC_PREWRITE); - stx->flags |= TX_SW_DESC_MAPPED; +done: +#if !defined(__i386__) && !defined(__amd64__) + bus_dmamap_sync(txq->entry_tag, txsd->map, BUS_DMASYNC_PREWRITE); +#endif + txsd->flags |= TX_SW_DESC_MAPPED; return (0); } @@ -1059,12 +1038,18 @@ make_sgl(struct sg_ent *sgp, bus_dma_segment_t *segs, int nsegs) { int i, idx; - for (idx = 0, i = 0; i < nsegs; i++, idx ^= 1) { + for (idx = 0, i = 0; i < nsegs; i++) { + /* + * firmware doesn't like empty segments + */ + if (segs[i].ds_len == 0) + continue; if (i && idx == 0) ++sgp; - + sgp->len[idx] = htobe32(segs[i].ds_len); sgp->addr[idx] = htobe64(segs[i].ds_addr); + idx ^= 1; } if (idx) @@ -1112,6 +1097,20 @@ wr_gen2(struct tx_desc *d, unsigned int gen) #endif } +#if 0 +static int print_wr = 0; +static __inline void +do_print_wr(struct tx_desc *d, int flits) +{ + int i = 0; + + if (print_wr) + while (flits--) { + printf("flit[%d]: 0x%016lx\n", i, d->flit[i]); + i++; + } +} +#endif /** @@ -1131,7 +1130,6 @@ wr_gen2(struct tx_desc *d, unsigned int gen) * and we just need to write the WR header. Otherwise we distribute the * SGL across the number of descriptors it spans. */ - static void write_wr_hdr_sgl(unsigned int ndesc, struct tx_desc *txd, struct txq_state *txqs, const struct sge_txq *txq, const struct sg_ent *sgl, unsigned int flits, @@ -1149,6 +1147,7 @@ write_wr_hdr_sgl(unsigned int ndesc, struct tx_desc *txd, struct txq_state *txqs V_WR_GEN(txqs->gen)) | wr_lo; /* XXX gen? 
*/ wr_gen2(txd, txqs->gen); + } else { unsigned int ogen = txqs->gen; const uint64_t *fp = (const uint64_t *)sgl; @@ -1183,7 +1182,7 @@ write_wr_hdr_sgl(unsigned int ndesc, struct tx_desc *txd, struct txq_state *txqs * is freed all clusters will be freed * with it */ - txsd->m = NULL; + txsd->mi.mi_base = NULL; wrp = (struct work_request_hdr *)txd; wrp->wr_hi = htonl(V_WR_DATATYPE(1) | V_WR_SGLSFLT(1)) | wr_hi; @@ -1200,80 +1199,151 @@ write_wr_hdr_sgl(unsigned int ndesc, struct tx_desc *txd, struct txq_state *txqs } } - /* sizeof(*eh) + sizeof(*vhdr) + sizeof(*ip) + sizeof(*tcp) */ #define TCPPKTHDRSIZE (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + 20 + 20) +#ifdef VLAN_SUPPORTED +#define GET_VTAG(cntrl, m) \ +do { \ + if ((m)->m_flags & M_VLANTAG) \ + cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN((m)->m_pkthdr.ether_vtag); \ +} while (0) + +#define GET_VTAG_MI(cntrl, mi) \ +do { \ + if ((mi)->mi_flags & M_VLANTAG) \ + cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN((mi)->mi_ether_vtag); \ +} while (0) +#else +#define GET_VTAG(cntrl, m) +#define GET_VTAG_MI(cntrl, m) +#endif + int -t3_encap(struct port_info *p, struct mbuf **m, int *free) +t3_encap(struct sge_qset *qs, struct mbuf **m, int count) { adapter_t *sc; struct mbuf *m0; - struct sge_qset *qs; struct sge_txq *txq; - struct tx_sw_desc *stx; struct txq_state txqs; + struct port_info *pi; unsigned int ndesc, flits, cntrl, mlen; int err, nsegs, tso_info = 0; struct work_request_hdr *wrp; struct tx_sw_desc *txsd; - struct sg_ent *sgp, sgl[TX_MAX_SEGS / 2 + 1]; - bus_dma_segment_t segs[TX_MAX_SEGS]; + struct sg_ent *sgp, *sgl; + bus_dma_segment_t *segs; uint32_t wr_hi, wr_lo, sgl_flits; struct tx_desc *txd; - struct cpl_tx_pkt *cpl; - - m0 = *m; - sc = p->adapter; - - DPRINTF("t3_encap port_id=%d qsidx=%d ", p->port_id, p->first_qset); - - /* port_id=1 qsid=1 txpkt_intf=2 tx_chan=0 */ - - qs = &sc->sge.qs[p->first_qset]; + struct mbuf_vec *mv; + struct mbuf_iovec *mi; + + DPRINTF("t3_encap cpu=%d ", curcpu); + pi = qs->port; + sc = pi->adapter; txq = &qs->txq[TXQ_ETH]; - stx = &txq->sdesc[txq->pidx]; + txsd = &txq->sdesc[txq->pidx]; txd = &txq->desc[txq->pidx]; - cpl = (struct cpl_tx_pkt *)txd; - mlen = m0->m_pkthdr.len; - cpl->len = htonl(mlen | 0x80000000); - - DPRINTF("mlen=%d txpkt_intf=%d tx_chan=%d\n", mlen, p->txpkt_intf, p->tx_chan); - /* - * XXX handle checksum, TSO, and VLAN here - * - */ - cntrl = V_TXPKT_INTF(p->txpkt_intf); + sgl = txq->txq_sgl; + segs = txq->txq_segs; + m0 = *m; + DPRINTF("t3_encap port_id=%d qsidx=%d ", pi->port_id, pi->first_qset); + DPRINTF("mlen=%d txpkt_intf=%d tx_chan=%d\n", m[0]->m_pkthdr.len, pi->txpkt_intf, pi->tx_chan); - /* - * XXX need to add VLAN support for 6.x - */ + cntrl = V_TXPKT_INTF(pi->txpkt_intf); +/* + * XXX need to add VLAN support for 6.x + */ #ifdef VLAN_SUPPORTED - if (m0->m_flags & M_VLANTAG) - cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag); if (m0->m_pkthdr.csum_flags & (CSUM_TSO)) tso_info = V_LSO_MSS(m0->m_pkthdr.tso_segsz); -#endif - if (tso_info) { - int eth_type; - struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *) cpl; +#endif + + if (count > 1) { + if ((err = busdma_map_sg_vec(m, &m0, segs, count))) + return (err); + nsegs = count; + } else if ((err = busdma_map_sg_collapse(&m0, segs, &nsegs))) { + if (cxgb_debug) + printf("failed ... 
err=%d\n", err); + return (err); + } + KASSERT(m0->m_pkthdr.len, ("empty packet nsegs=%d count=%d", nsegs, count)); + + if (m0->m_type == MT_DATA) + DPRINTF("mbuf type=%d tags:%d head=%p", m0->m_type, !SLIST_EMPTY(&m0->m_pkthdr.tags), + SLIST_FIRST(&m0->m_pkthdr.tags)); + + mi_collapse_mbuf(&txsd->mi, m0); + mi = &txsd->mi; + + if (count > 1) { + struct cpl_tx_pkt_batch *cpl_batch = (struct cpl_tx_pkt_batch *)txd; + int i, fidx; + struct mbuf_iovec *batchmi; + + mv = mtomv(m0); + batchmi = mv->mv_vec; + + wrp = (struct work_request_hdr *)txd; + + flits = count*2 + 1; + txq_prod(txq, 1, &txqs); + + for (fidx = 1, i = 0; i < count; i++, batchmi++, fidx += 2) { + struct cpl_tx_pkt_batch_entry *cbe = &cpl_batch->pkt_entry[i]; + + cntrl = V_TXPKT_INTF(pi->txpkt_intf); + GET_VTAG_MI(cntrl, batchmi); + cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT); + cbe->cntrl = htonl(cntrl); + cbe->len = htonl(batchmi->mi_len | 0x80000000); + cbe->addr = htobe64(segs[i].ds_addr); + txd->flit[fidx] |= htobe64(1 << 24); + } + + wrp->wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) | + V_WR_SGLSFLT(flits)) | htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl); + wmb(); + wrp->wr_lo = htonl(V_WR_LEN(flits) | + V_WR_GEN(txqs.gen)) | htonl(V_WR_TID(txq->token)); + /* XXX gen? */ + wr_gen2(txd, txqs.gen); + check_ring_tx_db(sc, txq); + + return (0); + } else if (tso_info) { + int undersized, eth_type; + struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *)txd; struct ip *ip; struct tcphdr *tcp; - char *pkthdr, tmp[TCPPKTHDRSIZE]; /* is this too large for the stack? */ + char *pkthdr, tmp[TCPPKTHDRSIZE]; + struct mbuf_vec *mv; + struct mbuf_iovec *tmpmi; + + mv = mtomv(m0); + tmpmi = mv->mv_vec; txd->flit[2] = 0; + GET_VTAG_MI(cntrl, mi); cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO); hdr->cntrl = htonl(cntrl); - - if (__predict_false(m0->m_len < TCPPKTHDRSIZE)) { - pkthdr = &tmp[0]; - m_copydata(m0, 0, TCPPKTHDRSIZE, pkthdr); - } else { - pkthdr = mtod(m0, char *); - } + mlen = m0->m_pkthdr.len; + hdr->len = htonl(mlen | 0x80000000); + + DPRINTF("tso buf len=%d\n", mlen); + undersized = (((tmpmi->mi_len < TCPPKTHDRSIZE) && + (m0->m_flags & M_VLANTAG)) || + (tmpmi->mi_len < TCPPKTHDRSIZE - ETHER_VLAN_ENCAP_LEN)); + if (__predict_false(undersized)) { + pkthdr = tmp; + dump_mi(mi); + panic("discontig packet - fixxorz"); + } else + pkthdr = m0->m_data; if (__predict_false(m0->m_flags & M_VLANTAG)) { eth_type = CPL_ETH_II_VLAN; @@ -1292,19 +1362,33 @@ t3_encap(struct port_info *p, struct mbuf **m, int *free) hdr->lso_info = htonl(tso_info); flits = 3; } else { + struct cpl_tx_pkt *cpl = (struct cpl_tx_pkt *)txd; + + GET_VTAG(cntrl, m0); cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT); cpl->cntrl = htonl(cntrl); - + mlen = m0->m_pkthdr.len; + cpl->len = htonl(mlen | 0x80000000); + if (mlen <= WR_LEN - sizeof(*cpl)) { txq_prod(txq, 1, &txqs); - txq->sdesc[txqs.pidx].m = NULL; - if (m0->m_len == m0->m_pkthdr.len) - memcpy(&txd->flit[2], mtod(m0, uint8_t *), mlen); - else + DPRINTF("mlen==%d max=%ld\n", mlen, (WR_LEN - sizeof(*cpl))); + if (mi->mi_type != MT_IOVEC && + mi->mi_type != MT_CLIOVEC) + memcpy(&txd->flit[2], mi->mi_data, mlen); + else { + /* + * XXX mbuf_iovec + */ +#if 0 m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[2]); +#endif + printf("bailing on m_copydata\n"); + } + m_freem_iovec(&txsd->mi); + txsd->mi.mi_base = NULL; - *free = 1; flits = (mlen + 7) / 8 + 2; cpl->wr.wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) | V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | @@ -1315,17 +1399,23 @@ t3_encap(struct port_info *p, struct mbuf **m, int *free) 
wr_gen2(txd, txqs.gen); check_ring_tx_db(sc, txq); + DPRINTF("pio buf\n"); return (0); } + DPRINTF("regular buf\n"); flits = 2; } - wrp = (struct work_request_hdr *)txd; - - if ((err = busdma_map_mbufs(m, txq, stx, segs, &nsegs)) != 0) { + +#ifdef nomore + /* + * XXX need to move into one of the helper routines above + * + */ + if ((err = busdma_map_mbufs(m, txq, txsd, segs, &nsegs)) != 0) return (err); - } m0 = *m; +#endif ndesc = calc_tx_descs(m0, nsegs); sgp = (ndesc == 1) ? (struct sg_ent *)&txd->flit[flits] : sgl; @@ -1335,15 +1425,16 @@ t3_encap(struct port_info *p, struct mbuf **m, int *free) DPRINTF("make_sgl success nsegs==%d ndesc==%d\n", nsegs, ndesc); txq_prod(txq, ndesc, &txqs); - txsd = &txq->sdesc[txqs.pidx]; wr_hi = htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl); wr_lo = htonl(V_WR_TID(txq->token)); - txsd->m = m0; - m_set_priority(m0, txqs.pidx); - write_wr_hdr_sgl(ndesc, txd, &txqs, txq, sgl, flits, sgl_flits, wr_hi, wr_lo); - check_ring_tx_db(p->adapter, txq); + check_ring_tx_db(pi->adapter, txq); + if ((m0->m_type == MT_DATA) && ((m0->m_flags & (M_EXT|M_NOFREE)) == M_EXT)) { + m0->m_flags &= ~M_EXT ; + m_free(m0); + } + return (0); } @@ -1367,6 +1458,11 @@ write_imm(struct tx_desc *d, struct mbuf *m, struct work_request_hdr *from = mtod(m, struct work_request_hdr *); struct work_request_hdr *to = (struct work_request_hdr *)d; + if (len > WR_LEN) + panic("len too big %d\n", len); + if (len < sizeof(*from)) + panic("len too small %d", len); + memcpy(&to[1], &from[1], len - sizeof(*from)); to->wr_hi = from->wr_hi | htonl(F_WR_SOP | F_WR_EOP | V_WR_BCNTLFLT(len & 7)); @@ -1374,7 +1470,14 @@ write_imm(struct tx_desc *d, struct mbuf *m, to->wr_lo = from->wr_lo | htonl(V_WR_GEN(gen) | V_WR_LEN((len + 7) / 8)); wr_gen2(d, gen); - m_freem(m); + + /* + * This check is a hack we should really fix the logic so + * that this can't happen + */ + if (m->m_type != MT_DONTFREE) + m_freem(m); + } /** @@ -1413,6 +1516,8 @@ addq_exit: mbufq_tail(&q->sendq, m); struct sge_qset *qs = txq_to_qset(q, qid); + printf("stopping q\n"); + setbit(&qs->txq_stopped, qid); smp_mb(); @@ -1472,7 +1577,7 @@ ctrl_xmit(adapter_t *adap, struct sge_txq *q, struct mbuf *m) m_freem(m); return 0; } - + wrp->wr_hi |= htonl(F_WR_SOP | F_WR_EOP); wrp->wr_lo = htonl(V_WR_TID(q->token)); @@ -1483,13 +1588,14 @@ again: reclaim_completed_tx_imm(q); if (__predict_false(ret)) { if (ret == 1) { mtx_unlock(&q->lock); - return (-1); + log(LOG_ERR, "no desc available\n"); + + return (ENOSPC); } goto again; } - write_imm(&q->desc[q->pidx], m, m->m_len, q->gen); - + q->in_use++; if (++q->pidx >= q->size) { q->pidx = 0; @@ -1517,6 +1623,8 @@ restart_ctrlq(void *data, int npending) struct sge_txq *q = &qs->txq[TXQ_CTRL]; adapter_t *adap = qs->port->adapter; + log(LOG_WARNING, "Restart_ctrlq in_use=%d\n", q->in_use); + mtx_lock(&q->lock); again: reclaim_completed_tx_imm(q); @@ -1555,6 +1663,7 @@ t3_mgmt_tx(struct adapter *adap, struct mbuf *m) return ctrl_xmit(adap, &adap->sge.qs[0].txq[TXQ_CTRL], m); } + /** * free_qset - free the resources of an SGE queue set * @sc: the controller owning the queue set @@ -1564,11 +1673,18 @@ t3_mgmt_tx(struct adapter *adap, struct mbuf *m) * as HW contexts, packet buffers, and descriptor rings. Traffic to the * queue set must be quiesced prior to calling this. 
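+ * (With this change a queue set also owns per-queue software state: + * the Tx mbuf rings and the software descriptors' mbuf iovecs; + * t3_free_qset() below releases those via t3_free_tx_desc_all() and + * free() in addition to the hardware descriptor rings.)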
*/ -static void +void t3_free_qset(adapter_t *sc, struct sge_qset *q) { int i; - + + t3_free_tx_desc_all(&q->txq[TXQ_ETH]); + + for (i = 0; i < SGE_TXQ_PER_SET; i++) + if (q->txq[i].txq_mr.br_ring != NULL) { + free(q->txq[i].txq_mr.br_ring, M_DEVBUF); + mtx_destroy(&q->txq[i].txq_mr.br_lock); + } for (i = 0; i < SGE_RXQ_PER_SET; ++i) { if (q->fl[i].desc) { mtx_lock(&sc->sge.reg_lock); @@ -1629,10 +1745,13 @@ void t3_free_sge_resources(adapter_t *sc) { int i, nqsets; - + +#ifdef IFNET_MULTIQUEUE + panic("%s should not be called when IFNET_MULTIQUEUE is defined", __FUNCTION__); +#endif for (nqsets = i = 0; i < (sc)->params.nports; i++) nqsets += sc->port[i].nqsets; - + for (i = 0; i < nqsets; ++i) t3_free_qset(sc, &sc->sge.qs[i]); } @@ -1686,52 +1805,76 @@ t3_sge_stop(adapter_t *sc) /** - * free_tx_desc - reclaims Tx descriptors and their buffers + * t3_free_tx_desc - reclaims Tx descriptors and their buffers * @adapter: the adapter * @q: the Tx queue to reclaim descriptors from - * @n: the number of descriptors to reclaim + * @reclaimable: the number of descriptors to reclaim * * Reclaims Tx descriptors from an SGE Tx queue and frees the associated * Tx buffers. Called with the Tx queue lock held. */ -int -free_tx_desc(struct sge_txq *q, int n, struct mbuf **m_vec) +void +t3_free_tx_desc(struct sge_txq *q, int reclaimable) { - struct tx_sw_desc *d; - unsigned int cidx = q->cidx; - int nbufs = 0; + struct tx_sw_desc *txsd; + unsigned int cidx; #ifdef T3_TRACE T3_TRACE2(sc->tb[q->cntxt_id & 7], - "reclaiming %u Tx descriptors at cidx %u", n, cidx); + "reclaiming %u Tx descriptors at cidx %u", reclaimable, cidx); #endif - d = &q->sdesc[cidx]; - - while (n-- > 0) { - DPRINTF("cidx=%d d=%p\n", cidx, d); - if (d->m) { - if (d->flags & TX_SW_DESC_MAPPED) { - bus_dmamap_unload(q->entry_tag, d->map); - bus_dmamap_destroy(q->entry_tag, d->map); - d->flags &= ~TX_SW_DESC_MAPPED; + cidx = q->cidx; + txsd = &q->sdesc[cidx]; + DPRINTF("reclaiming %d WR\n", reclaimable); + while (reclaimable--) { + DPRINTF("cidx=%d d=%p\n", cidx, txsd); + if (txsd->mi.mi_base != NULL) { + if (txsd->flags & TX_SW_DESC_MAPPED) { + bus_dmamap_unload(q->entry_tag, txsd->map); + txsd->flags &= ~TX_SW_DESC_MAPPED; } - if (m_get_priority(d->m) == cidx) { - m_vec[nbufs] = d->m; - d->m = NULL; - nbufs++; - } else { - printf("pri=%d cidx=%d\n", (int)m_get_priority(d->m), cidx); - } - } - ++d; + m_freem_iovec(&txsd->mi); + txsd->mi.mi_base = NULL; + +#if defined(DIAGNOSTIC) && 0 + if (m_get_priority(txsd->m[0]) != cidx) + printf("pri=%d cidx=%d\n", (int)m_get_priority(txsd->m[0]), cidx); +#endif + + } else + q->txq_skipped++; + + ++txsd; if (++cidx == q->size) { cidx = 0; - d = q->sdesc; + txsd = q->sdesc; } } q->cidx = cidx; - return (nbufs); +} + +void +t3_free_tx_desc_all(struct sge_txq *q) +{ + int i; + struct tx_sw_desc *txsd; + + for (i = 0; i < q->size; i++) { + txsd = &q->sdesc[i]; + if (txsd->mi.mi_base != NULL) { + if (txsd->flags & TX_SW_DESC_MAPPED) { + bus_dmamap_unload(q->entry_tag, txsd->map); + txsd->flags &= ~TX_SW_DESC_MAPPED; + } + m_freem_iovec(&txsd->mi); + bzero(&txsd->mi, sizeof(txsd->mi)); + } + } } /** @@ -1782,31 +1925,31 @@ write_ofld_wr(adapter_t *adap, struct mbuf *m, struct tx_desc *d = &q->desc[pidx]; struct txq_state txqs; - if (immediate(m)) { - q->sdesc[pidx].m = NULL; + if (immediate(m) && segs == NULL) { write_imm(d, m, m->m_len,
gen); return; } /* Only TX_DATA builds SGLs */ - from = mtod(m, struct work_request_hdr *); - memcpy(&d->flit[1], &from[1], - (uint8_t *)m->m_pkthdr.header - mtod(m, uint8_t *) - sizeof(*from)); + memcpy(&d->flit[1], &from[1], m->m_len - sizeof(*from)); - flits = ((uint8_t *)m->m_pkthdr.header - mtod(m, uint8_t *)) / 8; + flits = m->m_len / 8; sgp = (ndesc == 1) ? (struct sg_ent *)&d->flit[flits] : sgl; make_sgl(sgp, segs, nsegs); sgl_flits = sgl_len(nsegs); - txqs.gen = q->gen; - txqs.pidx = q->pidx; - txqs.compl = (q->unacked & 8) << (S_WR_COMPL - 3); + txqs.gen = gen; + txqs.pidx = pidx; + txqs.compl = 0; + write_wr_hdr_sgl(ndesc, d, &txqs, q, sgl, flits, sgl_flits, from->wr_hi, from->wr_lo); } + + /** * calc_tx_descs_ofld - calculate # of Tx descriptors for an offload packet * @m: the packet @@ -1845,25 +1988,27 @@ ofld_xmit(adapter_t *adap, struct sge_txq *q, struct mbuf *m) int ret, nsegs; unsigned int ndesc; unsigned int pidx, gen; - struct mbuf *m_vec[TX_CLEAN_MAX_DESC]; - bus_dma_segment_t segs[TX_MAX_SEGS]; - int i, cleaned; - struct tx_sw_desc *stx = &q->sdesc[q->pidx]; + bus_dma_segment_t segs[TX_MAX_SEGS], *vsegs; + struct tx_sw_desc *stx; - mtx_lock(&q->lock); - if ((ret = busdma_map_mbufs(&m, q, stx, segs, &nsegs)) != 0) { - mtx_unlock(&q->lock); - return (ret); - } + nsegs = m_get_sgllen(m); + vsegs = m_get_sgl(m); ndesc = calc_tx_descs_ofld(m, nsegs); -again: cleaned = reclaim_completed_tx(q, TX_CLEAN_MAX_DESC, m_vec); + busdma_map_sgl(vsegs, segs, nsegs); + stx = &q->sdesc[q->pidx]; + KASSERT(stx->mi.mi_base == NULL, ("mi_base set")); + + mtx_lock(&q->lock); +again: reclaim_completed_tx(q); ret = check_desc_avail(adap, q, m, ndesc, TXQ_OFLD); if (__predict_false(ret)) { if (ret == 1) { + printf("no ofld desc avail\n"); + m_set_priority(m, ndesc); /* save for restart */ mtx_unlock(&q->lock); - return EINTR; + return (EINTR); } goto again; } @@ -1886,10 +2031,7 @@ again: cleaned = reclaim_completed_tx(q, TX_CLEAN_MAX_DESC, m_vec); write_ofld_wr(adap, m, q, pidx, gen, ndesc, segs, nsegs); check_ring_tx_db(adap, q); - - for (i = 0; i < cleaned; i++) { - m_freem(m_vec[i]); - } + return (0); } @@ -1902,18 +2044,16 @@ again: cleaned = reclaim_completed_tx(q, TX_CLEAN_MAX_DESC, m_vec); static void restart_offloadq(void *data, int npending) { - struct mbuf *m; struct sge_qset *qs = data; struct sge_txq *q = &qs->txq[TXQ_OFLD]; adapter_t *adap = qs->port->adapter; - struct mbuf *m_vec[TX_CLEAN_MAX_DESC]; bus_dma_segment_t segs[TX_MAX_SEGS]; - int nsegs, i, cleaned; struct tx_sw_desc *stx = &q->sdesc[q->pidx]; + int nsegs, cleaned; mtx_lock(&q->lock); -again: cleaned = reclaim_completed_tx(q, TX_CLEAN_MAX_DESC, m_vec); +again: cleaned = reclaim_completed_tx(q); while ((m = mbufq_peek(&q->sendq)) != NULL) { unsigned int gen, pidx; @@ -1953,10 +2093,12 @@ again: cleaned = reclaim_completed_tx(q, TX_CLEAN_MAX_DESC, m_vec); #endif t3_write_reg(adap, A_SG_KDOORBELL, F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id)); +#if 0 for (i = 0; i < cleaned; i++) { - m_freem(m_vec[i]); + m_freem_vec(m_vec[i]); } +#endif } /** @@ -2000,7 +2142,7 @@ t3_offload_tx(struct t3cdev *tdev, struct mbuf *m) adapter_t *adap = tdev2adap(tdev); struct sge_qset *qs = &adap->sge.qs[queue_set(m)]; - if (__predict_false(is_ctrl_pkt(m))) + if (__predict_false(is_ctrl_pkt(m))) return ctrl_xmit(adap, &qs->txq[TXQ_CTRL], m); return ofld_xmit(adap, &qs->txq[TXQ_OFLD], m); @@ -2031,9 +2173,9 @@ rx_offload(struct t3cdev *tdev, struct sge_rspq *rq, struct mbuf *m, struct mbuf *rx_gather[], unsigned int gather_idx) { + 
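+	/* + * Offload CPLs are not handed up one at a time: they are staged + * in rx_gather[] and passed to cxgb_ofld_recv() in bundles of + * RX_BUNDLE_SIZE to amortize the per-call overhead. + */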
rq->offload_pkts++; m->m_pkthdr.header = mtod(m, void *); - rx_gather[gather_idx++] = m; if (gather_idx == RX_BUNDLE_SIZE) { cxgb_ofld_recv(tdev, rx_gather, RX_BUNDLE_SIZE); @@ -2048,16 +2190,24 @@ restart_tx(struct sge_qset *qs) { struct adapter *sc = qs->port->adapter; + if (isset(&qs->txq_stopped, TXQ_OFLD) && should_restart_tx(&qs->txq[TXQ_OFLD]) && test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) { qs->txq[TXQ_OFLD].restarts++; + printf("restarting TXQ_OFLD\n"); taskqueue_enqueue(sc->tq, &qs->txq[TXQ_OFLD].qresume_task); } + printf("stopped=0x%x restart=%d processed=%d cleaned=%d in_use=%d\n", + qs->txq_stopped, should_restart_tx(&qs->txq[TXQ_CTRL]), + qs->txq[TXQ_CTRL].processed, qs->txq[TXQ_CTRL].cleaned, + qs->txq[TXQ_CTRL].in_use); + if (isset(&qs->txq_stopped, TXQ_CTRL) && should_restart_tx(&qs->txq[TXQ_CTRL]) && test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) { qs->txq[TXQ_CTRL].restarts++; + printf("restarting TXQ_CTRL\n"); taskqueue_enqueue(sc->tq, &qs->txq[TXQ_CTRL].qresume_task); } } @@ -2084,6 +2234,17 @@ t3_sge_alloc_qset(adapter_t *sc, u_int id, int nports, int irq_vec_idx, struct sge_qset *q = &sc->sge.qs[id]; int i, ret = 0; + for (i = 0; i < SGE_TXQ_PER_SET; i++) { + if ((q->txq[i].txq_mr.br_ring = malloc(cxgb_txq_buf_ring_size*sizeof(struct mbuf *), + M_DEVBUF, M_WAITOK|M_ZERO)) == NULL) { + device_printf(sc->dev, "failed to allocate mbuf ring\n"); + goto err; + } + q->txq[i].txq_mr.br_prod = q->txq[i].txq_mr.br_cons = 0; + q->txq[i].txq_mr.br_size = cxgb_txq_buf_ring_size; + mtx_init(&q->txq[i].txq_mr.br_lock, "txq mbuf ring", NULL, MTX_DEF); + } + init_qset_cntxt(q, id); if ((ret = alloc_ring(sc, p->fl_size, sizeof(struct rx_desc), @@ -2155,13 +2316,18 @@ t3_sge_alloc_qset(adapter_t *sc, u_int id, int nports, int irq_vec_idx, q->txq[TXQ_ETH].stop_thres = nports * flits_to_desc(sgl_len(TX_MAX_SEGS + 1) + 3); - q->fl[0].buf_size = MCLBYTES; + q->fl[0].buf_size = (MCLBYTES - sizeof(uint32_t) - sizeof(struct m_hdr) - sizeof(struct pkthdr) - sizeof(struct m_ext_)); q->fl[0].zone = zone_clust; q->fl[0].type = EXT_CLUSTER; - q->fl[1].buf_size = MJUMPAGESIZE; - q->fl[1].zone = zone_jumbop; - q->fl[1].type = EXT_JUMBOP; - +#if __FreeBSD_version > 800000 + q->fl[1].buf_size = MJUM16BYTES - sizeof(uint32_t) - sizeof(struct m_hdr) - sizeof(struct pkthdr) - sizeof(struct m_ext_); + q->fl[1].zone = zone_jumbo16; + q->fl[1].type = EXT_JUMBO16; +#else + q->fl[1].buf_size = MJUMPAGESIZE - sizeof(uint32_t) - sizeof(struct m_hdr) - sizeof(struct pkthdr) - sizeof(struct m_ext_); + q->fl[1].zone = zone_jumbop; + q->fl[1].type = EXT_JUMBOP; +#endif q->lro.enabled = lro_default; mtx_lock(&sc->sge.reg_lock); @@ -2269,11 +2435,15 @@ t3_rx_eth(struct adapter *adap, struct sge_rspq *rq, struct mbuf *m, int ethpad) m->m_pkthdr.rcvif = ifp; m->m_pkthdr.header = mtod(m, uint8_t *) + sizeof(*cpl) + ethpad; +#ifndef DISABLE_MBUF_IOVEC m_explode(m); +#endif /* * adjust after conversion to mbuf chain */ - m_adj(m, sizeof(*cpl) + ethpad); + m->m_pkthdr.len -= (sizeof(*cpl) + ethpad); + m->m_len -= (sizeof(*cpl) + ethpad); + m->m_data += (sizeof(*cpl) + ethpad); (*ifp->if_input)(ifp, m); } @@ -2307,17 +2477,24 @@ get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs, uint32_t len = G_RSPD_LEN(len_cq); uint32_t flags = ntohl(r->flags); uint8_t sopeop = G_RSPD_SOP_EOP(flags); + uint32_t *ref; int ret = 0; - prefetch(sd->cl); + prefetch(sd->rxsd_cl); fl->credits--; bus_dmamap_sync(fl->entry_tag, sd->map, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(fl->entry_tag, sd->map); - m_cljset(m, 
sd->cl, fl->type); + ref = sd->rxsd_ref; + m_cljset(m, sd->rxsd_cl, fl->type, sd->rxsd_ref); + *ref = 1; m->m_len = len; - + /* + * bump past the refcnt address + */ + m->m_data = sd->data; + switch(sopeop) { case RSPQ_SOP_EOP: DBG(DBG_RX, ("get_packet: SOP-EOP m %p\n", m)); @@ -2363,9 +2540,48 @@ get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs, } #else +static void +ext_free_handler(void *cl, void * arg) +{ + uintptr_t type = (uintptr_t)arg; + uma_zone_t zone; + struct mbuf *m; + + m = cl; + zone = m_getzonefromtype(type); + m->m_ext.ext_type = (int)type; + cxgb_ext_freed++; + cxgb_cache_put(zone, cl); +} + +static void +init_cluster_mbuf(caddr_t cl, int flags, int type, uma_zone_t zone) +{ + struct mbuf *m; + int header_size; + + header_size = sizeof(struct m_hdr) + sizeof(struct pkthdr) + sizeof(struct m_ext_) + sizeof(uint32_t); + + bzero(cl, header_size); + m = (struct mbuf *)cl; + + SLIST_INIT(&m->m_pkthdr.tags); + m->m_type = MT_DATA; + m->m_flags = flags | M_NOFREE | M_EXT; + m->m_data = cl + header_size; + m->m_ext.ext_buf = cl; + m->m_ext.ref_cnt = (uint32_t *)(cl + header_size - sizeof(uint32_t)); + m->m_ext.ext_size = m_getsizefromtype(type); + m->m_ext.ext_free = ext_free_handler; + m->m_ext.ext_args = (void *)(uintptr_t)type; + m->m_ext.ext_type = EXT_EXTREF; + *(m->m_ext.ref_cnt) = 1; + DPRINTF("data=%p ref_cnt=%p\n", m->m_data, m->m_ext.ref_cnt); +} + static int get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs, - struct mbuf *m, struct rsp_desc *r) + struct mbuf **m, struct rsp_desc *r) { unsigned int len_cq = ntohl(r->len_cq); @@ -2376,45 +2592,61 @@ get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs, uint8_t sopeop = G_RSPD_SOP_EOP(flags); void *cl; int ret = 0; - - prefetch(sd->cl); - + struct mbuf *m0; +#if 0 + if ((sd + 1 )->rxsd_cl) + prefetch((sd + 1)->rxsd_cl); + if ((sd + 2)->rxsd_cl) + prefetch((sd + 2)->rxsd_cl); +#endif + DPRINTF("rx cpu=%d\n", curcpu); fl->credits--; bus_dmamap_sync(fl->entry_tag, sd->map, BUS_DMASYNC_POSTREAD); if (recycle_enable && len <= SGE_RX_COPY_THRES && sopeop == RSPQ_SOP_EOP) { - cl = mtod(m, void *); - memcpy(cl, sd->cl, len); + if ((m0 = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL) + goto skip_recycle; + cl = mtod(m0, void *); + memcpy(cl, sd->data, len); recycle_rx_buf(adap, fl, fl->cidx); + *m = m0; } else { - cl = sd->cl; + skip_recycle: bus_dmamap_unload(fl->entry_tag, sd->map); + cl = sd->rxsd_cl; + *m = m0 = (struct mbuf *)cl; } + switch(sopeop) { case RSPQ_SOP_EOP: DBG(DBG_RX, ("get_packet: SOP-EOP m %p\n", m)); - if (cl == sd->cl) - m_cljset(m, cl, fl->type); - m->m_len = m->m_pkthdr.len = len; + if (cl == sd->rxsd_cl) + init_cluster_mbuf(cl, M_PKTHDR, fl->type, fl->zone); + m0->m_len = m0->m_pkthdr.len = len; ret = 1; goto done; break; case RSPQ_NSOP_NEOP: DBG(DBG_RX, ("get_packet: NO_SOP-NO_EOP m %p\n", m)); + panic("chaining unsupported"); ret = 0; break; case RSPQ_SOP: DBG(DBG_RX, ("get_packet: SOP m %p\n", m)); - m_iovinit(m); + panic("chaining unsupported"); + m_iovinit(m0); ret = 0; break; case RSPQ_EOP: DBG(DBG_RX, ("get_packet: EOP m %p\n", m)); + panic("chaining unsupported"); ret = 1; break; } - m_iovappend(m, cl, fl->buf_size, len, 0); - + panic("append not supported"); +#if 0 + m_iovappend(m0, cl, fl->buf_size, len, sizeof(uint32_t), sd->rxsd_ref); +#endif done: if (++fl->cidx == fl->size) fl->cidx = 0; @@ -2443,9 +2675,11 @@ handle_rsp_cntrl_info(struct sge_qset *qs, uint32_t flags) credits = G_RSPD_TXQ0_CR(flags); if (credits) { 
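+		/* + * TXQ0 credits report Ethernet Tx descriptors consumed by + * the hardware; accumulate them so the reclaim path knows + * how much of the ring can be freed. + */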
qs->txq[TXQ_ETH].processed += credits; +#ifndef IFNET_MULTIQUEUE if (desc_reclaimable(&qs->txq[TXQ_ETH]) > TX_START_MAX_DESC) taskqueue_enqueue(qs->port->adapter->tq, &qs->port->timer_reclaim_task); +#endif } credits = G_RSPD_TXQ2_CR(flags); @@ -2459,6 +2693,7 @@ handle_rsp_cntrl_info(struct sge_qset *qs, uint32_t flags) credits = G_RSPD_TXQ1_CR(flags); if (credits) qs->txq[TXQ_OFLD].processed += credits; + } static void @@ -2483,7 +2718,7 @@ check_ring_db(adapter_t *adap, struct sge_qset *qs, * on this queue. If the system is under memory shortage use a fairly * long delay to help recovery. */ -static int +int process_responses(adapter_t *adap, struct sge_qset *qs, int budget) { struct sge_rspq *rspq = &qs->rspq; @@ -2506,7 +2741,7 @@ process_responses(adapter_t *adap, struct sge_qset *qs, int budget) int eth, eop = 0, ethpad = 0; uint32_t flags = ntohl(r->flags); uint32_t rss_csum = *(const uint32_t *)r; - uint32_t rss_hash = r->rss_hdr.rss_hash_val; + uint32_t rss_hash = be32toh(r->rss_hdr.rss_hash_val); eth = (r->rss_hdr.opcode == CPL_RX_PKT); @@ -2517,8 +2752,7 @@ process_responses(adapter_t *adap, struct sge_qset *qs, int budget) } else if (flags & F_RSPD_IMM_DATA_VALID) { #ifdef DISABLE_MBUF_IOVEC - if (cxgb_debug) - printf("IMM DATA VALID opcode=0x%x rspq->cidx=%d\n", r->rss_hdr.opcode, rspq->cidx); + DPRINTF("IMM DATA VALID opcode=0x%x rspq->cidx=%d\n", r->rss_hdr.opcode, rspq->cidx); if(get_imm_packet(adap, r, &rspq->rspq_mh) == 0) { rspq->next_holdoff = NOMEM_INTR_DELAY; @@ -2529,10 +2763,11 @@ process_responses(adapter_t *adap, struct sge_qset *qs, int budget) } #else struct mbuf *m = NULL; - + + DPRINTF("IMM DATA VALID opcode=0x%x rspq->cidx=%d\n", r->rss_hdr.opcode, rspq->cidx); if (rspq->rspq_mbuf == NULL) rspq->rspq_mbuf = m_gethdr(M_DONTWAIT, MT_DATA); - else + else m = m_gethdr(M_DONTWAIT, MT_DATA); /* @@ -2543,82 +2778,79 @@ process_responses(adapter_t *adap, struct sge_qset *qs, int budget) budget_left--; break; } - if (get_imm_packet(adap, r, rspq->rspq_mbuf, m, flags)) - goto skip; + get_imm_packet(adap, r, rspq->rspq_mbuf, m, flags); + eop = 1; -#endif rspq->imm_data++; +#endif } else if (r->len_cq) { int drop_thresh = eth ? 
SGE_RX_DROP_THRES : 0; #ifdef DISABLE_MBUF_IOVEC struct mbuf *m; - m = m_gethdr(M_NOWAIT, MT_DATA); + m = m_gethdr(M_DONTWAIT, MT_DATA); if (m == NULL) { log(LOG_WARNING, "failed to get mbuf for packet\n"); break; + } else { + m->m_next = m->m_nextpkt = NULL; } - + eop = get_packet(adap, drop_thresh, qs, &rspq->rspq_mh, r, m); #else - if (rspq->rspq_mbuf == NULL) - rspq->rspq_mbuf = m_gethdr(M_DONTWAIT, MT_DATA); - if (rspq->rspq_mbuf == NULL) { - log(LOG_WARNING, "failed to get mbuf for packet\n"); - break; - } - eop = get_packet(adap, drop_thresh, qs, rspq->rspq_mbuf, r); + eop = get_packet(adap, drop_thresh, qs, &rspq->rspq_mbuf, r); +#ifdef IFNET_MULTIQUEUE + rspq->rspq_mbuf->m_pkthdr.rss_hash = rss_hash; +#endif #endif ethpad = 2; } else { DPRINTF("pure response\n"); rspq->pure_rsps++; } - if (flags & RSPD_CTRL_MASK) { sleeping |= flags & RSPD_GTS_MASK; handle_rsp_cntrl_info(qs, flags); } -#ifndef DISABLE_MBUF_IOVEC - skip: -#endif + r++; if (__predict_false(++rspq->cidx == rspq->size)) { rspq->cidx = 0; rspq->gen ^= 1; r = rspq->desc; } - prefetch(r); if (++rspq->credits >= (rspq->size / 4)) { refill_rspq(adap, rspq, rspq->credits); rspq->credits = 0; } - - if (eop) { - prefetch(mtod(rspq->rspq_mh.mh_head, uint8_t *)); - prefetch(mtod(rspq->rspq_mh.mh_head, uint8_t *) + L1_CACHE_BYTES); + DPRINTF("eth=%d eop=%d flags=0x%x\n", eth, eop, flags); - if (eth) { - t3_rx_eth_lro(adap, rspq, rspq->rspq_mh.mh_head, ethpad, - rss_hash, rss_csum, lro); - - rspq->rspq_mh.mh_head = NULL; - } else { - rspq->rspq_mh.mh_head->m_pkthdr.csum_data = rss_csum; - /* - * XXX size mismatch - */ - m_set_priority(rspq->rspq_mh.mh_head, rss_hash); + if (!eth && eop) { + rspq->rspq_mh.mh_head->m_pkthdr.csum_data = rss_csum; + /* + * XXX size mismatch + */ + m_set_priority(rspq->rspq_mh.mh_head, rss_hash); - ngathered = rx_offload(&adap->tdev, rspq, - rspq->rspq_mh.mh_head, offload_mbufs, ngathered); - } - __refill_fl(adap, &qs->fl[0]); - __refill_fl(adap, &qs->fl[1]); + ngathered = rx_offload(&adap->tdev, rspq, + rspq->rspq_mh.mh_head, offload_mbufs, ngathered); + rspq->rspq_mh.mh_head = NULL; + DPRINTF("received offload packet\n"); + + } else if (eth && eop) { + prefetch(mtod(rspq->rspq_mh.mh_head, uint8_t *)); + prefetch(mtod(rspq->rspq_mh.mh_head, uint8_t *) + L1_CACHE_BYTES); + + t3_rx_eth_lro(adap, rspq, rspq->rspq_mh.mh_head, ethpad, + rss_hash, rss_csum, lro); + DPRINTF("received tunnel packet\n"); + rspq->rspq_mh.mh_head = NULL; } + __refill_fl_lt(adap, &qs->fl[0], 32); + __refill_fl_lt(adap, &qs->fl[1], 32); --budget_left; } @@ -2629,9 +2861,14 @@ process_responses(adapter_t *adap, struct sge_qset *qs, int budget) check_ring_db(adap, qs, sleeping); smp_mb(); /* commit Tx queue processed updates */ - if (__predict_false(qs->txq_stopped != 0)) + if (__predict_false(qs->txq_stopped != 0)) { + printf("restarting tx on %p\n", qs); + restart_tx(qs); - + } + + __refill_fl_lt(adap, &qs->fl[0], 512); + __refill_fl_lt(adap, &qs->fl[1], 512); budget -= budget_left; return (budget); } @@ -2718,10 +2955,11 @@ t3_intr_msix(void *data) adapter_t *adap = qs->port->adapter; struct sge_rspq *rspq = &qs->rspq; - mtx_lock(&rspq->lock); - if (process_responses_gts(adap, rspq) == 0) - rspq->unhandled_irqs++; - mtx_unlock(&rspq->lock); + if (mtx_trylock(&rspq->lock)) { + if (process_responses_gts(adap, rspq) == 0) + rspq->unhandled_irqs++; + mtx_unlock(&rspq->lock); + } } /* @@ -2765,7 +3003,10 @@ t3_set_coalesce_nsecs(SYSCTL_HANDLER_ARGS) struct sge_qset *qs; int i, j, err, nqsets = 0; struct mtx *lock; - + + if 
((sc->flags & FULL_INIT_DONE) == 0) + return (ENXIO); + coalesce_nsecs = qsp->coalesce_nsecs; err = sysctl_handle_int(oidp, &coalesce_nsecs, arg2, req); @@ -2801,11 +3042,11 @@ t3_set_coalesce_nsecs(SYSCTL_HANDLER_ARGS) void -t3_add_sysctls(adapter_t *sc) +t3_add_attach_sysctls(adapter_t *sc) { struct sysctl_ctx_list *ctx; struct sysctl_oid_list *children; - + ctx = device_get_sysctl_ctx(sc->dev); children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev)); @@ -2821,28 +3062,13 @@ t3_add_sysctls(adapter_t *sc) 0, t3_lro_enable, "I", "enable large receive offload"); - SYSCTL_ADD_PROC(ctx, children, OID_AUTO, - "intr_coal", - CTLTYPE_INT|CTLFLAG_RW, sc, - 0, t3_set_coalesce_nsecs, - "I", "interrupt coalescing timer (ns)"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "enable_debug", CTLFLAG_RW, &cxgb_debug, 0, "enable verbose debugging output"); - - SYSCTL_ADD_INT(ctx, children, OID_AUTO, - "collapse_free", - CTLFLAG_RD, &collapse_free, - 0, "frees during collapse"); - SYSCTL_ADD_INT(ctx, children, OID_AUTO, - "mb_free_vec_free", - CTLFLAG_RD, &mb_free_vec_free, - 0, "frees during mb_free_vec"); - SYSCTL_ADD_INT(ctx, children, OID_AUTO, - "collapse_mbufs", - CTLFLAG_RW, &collapse_mbufs, - 0, "collapse mbuf chains into iovecs"); + SYSCTL_ADD_ULONG(ctx, children, OID_AUTO, "tunq_coalesce", + CTLFLAG_RD, &sc->tunq_coalesce, + "#tunneled packets freed"); SYSCTL_ADD_INT(ctx, children, OID_AUTO, "txq_overrun", CTLFLAG_RD, &txq_fills, @@ -2851,8 +3077,103 @@ t3_add_sysctls(adapter_t *sc) "bogus_imm", CTLFLAG_RD, &bogus_imm, 0, "#times a bogus immediate response was seen"); + SYSCTL_ADD_INT(ctx, children, OID_AUTO, + "cache_alloc", + CTLFLAG_RD, &cxgb_cached_allocations, + 0, "#times a cluster was allocated from cache"); + SYSCTL_ADD_INT(ctx, children, OID_AUTO, + "cached", + CTLFLAG_RD, &cxgb_cached, + 0, "#times a cluster was cached"); + SYSCTL_ADD_INT(ctx, children, OID_AUTO, + "ext_freed", + CTLFLAG_RD, &cxgb_ext_freed, + 0, "#times a cluster was freed through ext_free"); + } +void +t3_add_configured_sysctls(adapter_t *sc) +{ + struct sysctl_ctx_list *ctx; + struct sysctl_oid_list *children; + int i, j; + + ctx = device_get_sysctl_ctx(sc->dev); + children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev)); + + SYSCTL_ADD_PROC(ctx, children, OID_AUTO, + "intr_coal", + CTLTYPE_INT|CTLFLAG_RW, sc, + 0, t3_set_coalesce_nsecs, + "I", "interrupt coalescing timer (ns)"); + + for (i = 0; i < sc->params.nports; i++) { + struct port_info *pi = &sc->port[i]; + struct sysctl_oid *poid; + struct sysctl_oid_list *poidlist; + + snprintf(pi->namebuf, PORT_NAME_LEN, "port%d", i); + poid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, + pi->namebuf, CTLFLAG_RD, NULL, "port statistics"); + poidlist = SYSCTL_CHILDREN(poid); + SYSCTL_ADD_INT(ctx, poidlist, OID_AUTO, + "nqsets", CTLFLAG_RD, &pi->nqsets, + 0, "#queue sets"); + + for (j = 0; j < pi->nqsets; j++) { + struct sge_qset *qs = &sc->sge.qs[pi->first_qset + j]; + struct sysctl_oid *qspoid; + struct sysctl_oid_list *qspoidlist; + struct sge_txq *txq = &qs->txq[TXQ_ETH]; + + snprintf(qs->namebuf, QS_NAME_LEN, "qs%d", j); + + qspoid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO, + qs->namebuf, CTLFLAG_RD, NULL, "qset statistics"); + qspoidlist = SYSCTL_CHILDREN(qspoid); + + SYSCTL_ADD_INT(ctx, qspoidlist, OID_AUTO, "dropped", + CTLFLAG_RD, &qs->txq[TXQ_ETH].txq_drops, + 0, "#tunneled packets dropped"); + SYSCTL_ADD_INT(ctx, qspoidlist, OID_AUTO, "sendqlen", + CTLFLAG_RD, &qs->txq[TXQ_ETH].sendq.qlen, + 0, "#tunneled packets waiting to be sent"); + SYSCTL_ADD_UINT(ctx, 
qspoidlist, OID_AUTO, "queue_pidx", + CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_prod, + 0, "#tunneled packets queue producer index"); + SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "queue_cidx", + CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_cons, + 0, "#tunneled packets queue consumer index"); + SYSCTL_ADD_INT(ctx, qspoidlist, OID_AUTO, "processed", + CTLFLAG_RD, &qs->txq[TXQ_ETH].processed, + 0, "#tunneled packets processed by the card"); + SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "cleaned", + CTLFLAG_RD, &txq->cleaned, + 0, "#tunneled packets cleaned"); + SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "in_use", + CTLFLAG_RD, &txq->in_use, + 0, "#tunneled packet slots in use"); + SYSCTL_ADD_ULONG(ctx, qspoidlist, OID_AUTO, "frees", + CTLFLAG_RD, &txq->txq_frees, + "#tunneled packets freed"); + SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "skipped", + CTLFLAG_RD, &txq->txq_skipped, + 0, "#tunneled packet descriptors skipped"); + SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "coalesced", + CTLFLAG_RD, &txq->txq_coalesced, + 0, "#tunneled packets coalesced"); + SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "enqueued", + CTLFLAG_RD, &txq->txq_enqueued, + 0, "#tunneled packets enqueued to hardware"); + SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "stopped_flags", + CTLFLAG_RD, &qs->txq_stopped, + 0, "tx queues stopped"); + + } + } +} + /** * t3_get_desc - dump an SGE descriptor for debugging purposes * @qs: the queue set diff --git a/sys/dev/cxgb/sys/cxgb_support.c b/sys/dev/cxgb/sys/cxgb_support.c index 7a2855634014..176206c10a8a 100644 --- a/sys/dev/cxgb/sys/cxgb_support.c +++ b/sys/dev/cxgb/sys/cxgb_support.c @@ -126,11 +126,11 @@ cxgb_cache_pcpu_init(struct cxgb_cache_pcpu *ccp) if ((err = buf_stack_init(&ccp->ccp_cluster_free, (FL_Q_SIZE >> 1)))) return (err); - if (jumbo_phys_contig) +#if __FreeBSD_version > 800000 ccp->ccp_jumbo_zone = zone_jumbo16; - else +#else ccp->ccp_jumbo_zone = zone_jumbop; - +#endif return (0); } diff --git a/sys/dev/cxgb/sys/mvec.h b/sys/dev/cxgb/sys/mvec.h index 2ef7ecdad720..04b64496dbc0 100644 --- a/sys/dev/cxgb/sys/mvec.h +++ b/sys/dev/cxgb/sys/mvec.h @@ -63,6 +63,9 @@ struct m_ext_ { int ext_type; /* type of external storage */ }; +#define MT_IOVEC 9 +#define MT_CLIOVEC 10 + #define EXT_IOVEC 8 #define EXT_CLIOVEC 9 #define EXT_JMPIOVEC 10 diff --git a/sys/dev/cxgb/t3cdev.h b/sys/dev/cxgb/t3cdev.h index 8223f98aed99..67db5523fd78 100644 --- a/sys/dev/cxgb/t3cdev.h +++ b/sys/dev/cxgb/t3cdev.h @@ -50,7 +50,7 @@ struct t3cdev { int (*send)(struct t3cdev *dev, struct mbuf *m); int (*recv)(struct t3cdev *dev, struct mbuf **m, int n); int (*ctl)(struct t3cdev *dev, unsigned int req, void *data); - void (*arp_update)(struct t3cdev *dev, struct rtentry *neigh, struct sockaddr *sa); + void (*arp_update)(struct t3cdev *dev, struct rtentry *neigh, uint8_t *enaddr, struct sockaddr *sa); void *priv; /* driver private data */ void *l2opt; /* optional layer 2 data */ void *l3opt; /* optional layer 3 data */ diff --git a/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c b/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c index 0c796b54f6c9..4b17f8ee1456 100644 --- a/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c +++ b/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c @@ -60,7 +60,7 @@ __FBSDID("$FreeBSD$"); #include #include #include -#include +#include #include #include #include @@ -82,6 +82,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include @@ -559,7 +560,7 @@ cxgb_toe_disconnect(struct tcpcb *tp) } static int -cxgb_toe_abort(struct tcpcb *tp) +cxgb_toe_reset(struct tcpcb *tp) { struct 
toepcb *toep = tp->t_toe; @@ -620,7 +621,7 @@ cxgb_toe_detach(struct tcpcb *tp) static struct toe_usrreqs cxgb_toe_usrreqs = { .tu_disconnect = cxgb_toe_disconnect, - .tu_abort = cxgb_toe_abort, + .tu_reset = cxgb_toe_reset, .tu_send = cxgb_toe_send, .tu_rcvd = cxgb_toe_rcvd, .tu_detach = cxgb_toe_detach, @@ -1145,7 +1146,7 @@ fail_act_open(struct toepcb *toep, int errno) t3_release_offload_resources(toep); if (tp) { INP_LOCK_ASSERT(tp->t_inpcb); - tcp_drop(tp, errno); + cxgb_tcp_drop(tp, errno); } #ifdef notyet @@ -1957,7 +1958,7 @@ process_close_con_rpl(struct socket *so, struct mbuf *m) wakeup(&so->so_timeo); } else if ((so->so_options & SO_LINGER) && so->so_linger == 0 && (toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) { - tp = tcp_drop(tp, 0); + tp = cxgb_tcp_drop(tp, 0); } break; @@ -2483,7 +2484,7 @@ handle_syncache_event(int event, void *arg) struct toepcb *toep = arg; switch (event) { - case SC_ENTRY_PRESENT: + case TOE_SC_ENTRY_PRESENT: /* * entry already exists - free toepcb * and l2t @@ -2491,7 +2492,7 @@ handle_syncache_event(int event, void *arg) printf("syncache entry present\n"); toepcb_release(toep); break; - case SC_DROP: + case TOE_SC_DROP: /* * The syncache has given up on this entry * either it timed out, or it was evicted diff --git a/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c b/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c index 8cb42e1d7abf..e411ab4d6620 100644 --- a/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c +++ b/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c @@ -62,7 +62,7 @@ __FBSDID("$FreeBSD$"); #include #include #include -#include +#include #include #include @@ -84,6 +84,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include static int (*pru_sosend)(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, @@ -99,9 +100,6 @@ static int vm_fault_hold_user_pages(vm_offset_t addr, int len, vm_page_t *mp, int *count, int flags); #endif static void vm_fault_unhold_pages(vm_page_t *m, int count); - - - #define TMP_IOV_MAX 16 void @@ -112,6 +110,15 @@ t3_init_socket_ops(void) prp = pffindtype(AF_INET, SOCK_STREAM); pru_sosend = prp->pr_usrreqs->pru_sosend; pru_soreceive = prp->pr_usrreqs->pru_soreceive; + tcp_usrreqs.pru_connect = cxgb_tcp_usrreqs.pru_connect; + tcp_usrreqs.pru_abort = cxgb_tcp_usrreqs.pru_abort; + tcp_usrreqs.pru_listen = cxgb_tcp_usrreqs.pru_listen; + tcp_usrreqs.pru_send = cxgb_tcp_usrreqs.pru_send; + tcp_usrreqs.pru_abort = cxgb_tcp_usrreqs.pru_abort; + tcp_usrreqs.pru_disconnect = cxgb_tcp_usrreqs.pru_disconnect; + tcp_usrreqs.pru_close = cxgb_tcp_usrreqs.pru_close; + tcp_usrreqs.pru_shutdown = cxgb_tcp_usrreqs.pru_shutdown; + tcp_usrreqs.pru_rcvd = cxgb_tcp_usrreqs.pru_rcvd; } diff --git a/sys/dev/cxgb/ulp/tom/cxgb_listen.c b/sys/dev/cxgb/ulp/tom/cxgb_listen.c index e7857906dc86..a88b26e9453a 100644 --- a/sys/dev/cxgb/ulp/tom/cxgb_listen.c +++ b/sys/dev/cxgb/ulp/tom/cxgb_listen.c @@ -57,7 +57,7 @@ __FBSDID("$FreeBSD$"); #include #include -#include +#include #include #include diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tcp.h b/sys/dev/cxgb/ulp/tom/cxgb_tcp.h new file mode 100644 index 000000000000..feb29164ed4f --- /dev/null +++ b/sys/dev/cxgb/ulp/tom/cxgb_tcp.h @@ -0,0 +1,44 @@ + +/*- + * Copyright (c) 2007, Chelsio Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Neither the name of the Chelsio Corporation nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ +#ifndef CXGB_TCP_H_ +#define CXGB_TCP_H_ + +struct tcpcb *cxgb_tcp_drop(struct tcpcb *tp, int errno); +void cxgb_tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip); +struct tcpcb *cxgb_tcp_close(struct tcpcb *tp); + +extern struct pr_usrreqs cxgb_tcp_usrreqs; +#ifdef INET6 +extern struct pr_usrreqs cxgb_tcp6_usrreqs; +#endif + +#include +SYSCTL_DECL(_net_inet_tcp_cxgb); +#endif /* CXGB_TCP_H_ */ diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tcp_subr.c b/sys/dev/cxgb/ulp/tom/cxgb_tcp_subr.c new file mode 100644 index 000000000000..2eca09959841 --- /dev/null +++ b/sys/dev/cxgb/ulp/tom/cxgb_tcp_subr.c @@ -0,0 +1,694 @@ +/*- + * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 + */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_compat.h" +#include "opt_inet.h" +#include "opt_inet6.h" +#include "opt_ipsec.h" +#include "opt_mac.h" +#include "opt_tcpdebug.h" + +#include +#include +#include +#include +#include +#include +#include +#ifdef INET6 +#include +#endif +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include +#include +#include +#ifdef INET6 +#include +#endif +#include +#ifdef INET6 +#include +#endif +#include +#include +#ifdef INET6 +#include +#include +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef INET6 +#include +#endif +#include +#ifdef TCPDEBUG +#include +#endif +#include + +#ifdef IPSEC +#include +#include +#ifdef INET6 +#include +#endif +#include +#endif /*IPSEC*/ + +#include +#include + +#include + +#include + + +SYSCTL_NODE(_net_inet_tcp, 0, cxgb, CTLFLAG_RW, 0, "chelsio TOE"); + +static int tcp_log_debug = 0; +SYSCTL_INT(_net_inet_tcp_cxgb, OID_AUTO, log_debug, CTLFLAG_RW, + &tcp_log_debug, 0, "Log errors caused by incoming TCP segments"); + +static int tcp_tcbhashsize = 0; +SYSCTL_INT(_net_inet_tcp_cxgb, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN, + &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); + +static int do_tcpdrain = 1; +SYSCTL_INT(_net_inet_tcp_cxgb, OID_AUTO, do_tcpdrain, CTLFLAG_RW, + &do_tcpdrain, 0, + "Enable tcp_drain routine for extra help when low on mbufs"); + +SYSCTL_INT(_net_inet_tcp_cxgb, OID_AUTO, pcbcount, CTLFLAG_RD, + &tcbinfo.ipi_count, 0, "Number of active PCBs"); + +static int icmp_may_rst = 1; +SYSCTL_INT(_net_inet_tcp_cxgb, OID_AUTO, icmp_may_rst, CTLFLAG_RW, + &icmp_may_rst, 0, + "Certain ICMP unreachable messages may abort connections in SYN_SENT"); + +static int tcp_isn_reseed_interval = 0; +SYSCTL_INT(_net_inet_tcp_cxgb, OID_AUTO, isn_reseed_interval, CTLFLAG_RW, + &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret"); + +/* + * TCP bandwidth limiting sysctls. Note that the default lower bound of + * 1024 exists only for debugging. A good production default would be + * something like 6100. 
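+ * + * The "stab" knob below is expressed in tenths of a maximal packet: + * the default of 20 pads the computed bandwidth-delay product with + * two full-sized packets of slop.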
+ */ +SYSCTL_NODE(_net_inet_tcp, OID_AUTO, inflight, CTLFLAG_RW, 0, + "TCP inflight data limiting"); + +static int tcp_inflight_enable = 1; +SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, enable, CTLFLAG_RW, + &tcp_inflight_enable, 0, "Enable automatic TCP inflight data limiting"); + +static int tcp_inflight_debug = 0; +SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, debug, CTLFLAG_RW, + &tcp_inflight_debug, 0, "Debug TCP inflight calculations"); + +static int tcp_inflight_rttthresh; +SYSCTL_PROC(_net_inet_tcp_inflight, OID_AUTO, rttthresh, CTLTYPE_INT|CTLFLAG_RW, + &tcp_inflight_rttthresh, 0, sysctl_msec_to_ticks, "I", + "RTT threshold below which inflight will deactivate itself"); + +static int tcp_inflight_min = 6144; +SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, min, CTLFLAG_RW, + &tcp_inflight_min, 0, "Lower-bound for TCP inflight window"); + +static int tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT; +SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, max, CTLFLAG_RW, + &tcp_inflight_max, 0, "Upper-bound for TCP inflight window"); + +static int tcp_inflight_stab = 20; +SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, stab, CTLFLAG_RW, + &tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets"); + +uma_zone_t sack_hole_zone; + +static struct inpcb *tcp_notify(struct inpcb *, int); +static struct inpcb *cxgb_tcp_drop_syn_sent(struct inpcb *inp, int errno); + +/* + * Target size of TCP PCB hash tables. Must be a power of two. + * + * Note that this can be overridden by the kernel environment + * variable net.inet.tcp.tcbhashsize + */ +#ifndef TCBHASHSIZE +#define TCBHASHSIZE 512 +#endif + +/* + * XXX + * Callouts should be moved into struct tcp directly. They are currently + * separate because the tcpcb structure is exported to userland for sysctl + * parsing purposes, which do not know about callouts. + */ +struct tcpcb_mem { + struct tcpcb tcb; + struct tcp_timer tt; +}; + +MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers"); + +/* + * Drop a TCP connection, reporting + * the specified error. If connection is synchronized, + * then send a RST to peer. + */ +struct tcpcb * +cxgb_tcp_drop(struct tcpcb *tp, int errno) +{ + struct socket *so = tp->t_inpcb->inp_socket; + + INP_INFO_WLOCK_ASSERT(&tcbinfo); + INP_LOCK_ASSERT(tp->t_inpcb); + + if (TCPS_HAVERCVDSYN(tp->t_state)) { + tp->t_state = TCPS_CLOSED; + (void) tcp_gen_reset(tp); + tcpstat.tcps_drops++; + } else + tcpstat.tcps_conndrops++; + if (errno == ETIMEDOUT && tp->t_softerror) + errno = tp->t_softerror; + so->so_error = errno; + return (cxgb_tcp_close(tp)); +} + +/* + * Attempt to close a TCP control block, marking it as dropped, and freeing + * the socket if we hold the only reference. 
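+ * If the inpcb holds the last socket reference (INP_SOCKREF), the + * socket is released here with sofree() and NULL is returned so the + * caller knows the tcpcb is gone; otherwise the tcpcb is returned.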
*/ +struct tcpcb * +cxgb_tcp_close(struct tcpcb *tp) +{ + struct inpcb *inp = tp->t_inpcb; + struct socket *so; + + INP_INFO_WLOCK_ASSERT(&tcbinfo); + INP_LOCK_ASSERT(inp); + + if (tp->t_state == TCPS_LISTEN) + tcp_gen_listen_close(tp); + in_pcbdrop(inp); + tcpstat.tcps_closed++; + KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL")); + so = inp->inp_socket; + soisdisconnected(so); + if (inp->inp_vflag & INP_SOCKREF) { + KASSERT(so->so_state & SS_PROTOREF, + ("tcp_close: !SS_PROTOREF")); + inp->inp_vflag &= ~INP_SOCKREF; + INP_UNLOCK(inp); + ACCEPT_LOCK(); + SOCK_LOCK(so); + so->so_state &= ~SS_PROTOREF; + sofree(so); + return (NULL); + } + return (tp); +} + +/* + * Notify a tcp user of an asynchronous error; + * store error as soft error, but wake up user + * (for now, won't do anything until can select for soft error). + * + * Do not wake up user since there currently is no mechanism for + * reporting soft errors (yet - a kqueue filter may be added). + */ +static struct inpcb * +tcp_notify(struct inpcb *inp, int error) +{ + struct tcpcb *tp; + + INP_INFO_WLOCK_ASSERT(&tcbinfo); + INP_LOCK_ASSERT(inp); + + if ((inp->inp_vflag & INP_TIMEWAIT) || + (inp->inp_vflag & INP_DROPPED)) + return (inp); + + tp = intotcpcb(inp); + KASSERT(tp != NULL, ("tcp_notify: tp == NULL")); + + /* + * Ignore some errors if we are hooked up. + * If connection hasn't completed, has retransmitted several times, + * and receives a second error, give up now. This is better + * than waiting a long time to establish a connection that + * can never complete. + */ + if (tp->t_state == TCPS_ESTABLISHED && + (error == EHOSTUNREACH || error == ENETUNREACH || + error == EHOSTDOWN)) { + return (inp); + } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && + tp->t_softerror) { + tp = cxgb_tcp_drop(tp, error); + if (tp != NULL) + return (inp); + else + return (NULL); + } else { + tp->t_softerror = error; + return (inp); + } +#if 0 + wakeup( &so->so_timeo); + sorwakeup(so); + sowwakeup(so); +#endif +} + +void +cxgb_tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip) +{ + struct ip *ip = vip; + struct tcphdr *th; + struct in_addr faddr; + struct inpcb *inp; + struct tcpcb *tp; + struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; + struct icmp *icp; + struct in_conninfo inc; + tcp_seq icmp_tcp_seq; + int mtu; + + faddr = ((struct sockaddr_in *)sa)->sin_addr; + if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) + return; + + if (cmd == PRC_MSGSIZE) + notify = tcp_mtudisc; + else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB || + cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip) + notify = cxgb_tcp_drop_syn_sent; + /* + * Redirects don't need to be handled up here. + */ + else if (PRC_IS_REDIRECT(cmd)) + return; + /* + * Source quench is deprecated. + */ + else if (cmd == PRC_QUENCH) + return; + /* + * Hostdead is ugly because it goes linearly through all PCBs. + * XXX: We never get this from ICMP, otherwise it makes an + * excellent DoS attack on machines with many connections.
*/ + else if (cmd == PRC_HOSTDEAD) + ip = NULL; + else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) + return; + if (ip != NULL) { + icp = (struct icmp *)((caddr_t)ip + - offsetof(struct icmp, icmp_ip)); + th = (struct tcphdr *)((caddr_t)ip + + (ip->ip_hl << 2)); + INP_INFO_WLOCK(&tcbinfo); + inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport, + ip->ip_src, th->th_sport, 0, NULL); + if (inp != NULL) { + INP_LOCK(inp); + if (!(inp->inp_vflag & INP_TIMEWAIT) && + !(inp->inp_vflag & INP_DROPPED) && + !(inp->inp_socket == NULL)) { + icmp_tcp_seq = htonl(th->th_seq); + tp = intotcpcb(inp); + if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) && + SEQ_LT(icmp_tcp_seq, tp->snd_max)) { + if (cmd == PRC_MSGSIZE) { + /* + * MTU discovery: + * If we got a needfrag set the MTU + * in the route to the suggested new + * value (if given) and then notify. + */ + bzero(&inc, sizeof(inc)); + inc.inc_flags = 0; /* IPv4 */ + inc.inc_faddr = faddr; + + mtu = ntohs(icp->icmp_nextmtu); + /* + * If no alternative MTU was + * proposed, try the next smaller + * one. ip->ip_len has already + * been swapped in icmp_input(). + */ + if (!mtu) + mtu = ip_next_mtu(ip->ip_len, + 1); + if (mtu < max(296, (tcp_minmss) + + sizeof(struct tcpiphdr))) + mtu = 0; + if (!mtu) + mtu = tcp_mssdflt + + sizeof(struct tcpiphdr); + /* + * Only cache the MTU if it + * is smaller than the interface + * or route MTU. tcp_mtudisc() + * will do the right thing by itself. + */ + if (mtu <= tcp_maxmtu(&inc, NULL)) + tcp_hc_updatemtu(&inc, mtu); + } + + inp = (*notify)(inp, inetctlerrmap[cmd]); + } + } + if (inp != NULL) + INP_UNLOCK(inp); + } else { + inc.inc_fport = th->th_dport; + inc.inc_lport = th->th_sport; + inc.inc_faddr = faddr; + inc.inc_laddr = ip->ip_src; +#ifdef INET6 + inc.inc_isipv6 = 0; +#endif + syncache_unreach(&inc, th); + } + INP_INFO_WUNLOCK(&tcbinfo); + } else + in_pcbnotifyall(&tcbinfo, faddr, inetctlerrmap[cmd], notify); +} + +#ifdef INET6 +void +tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d) +{ + struct tcphdr th; + struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; + struct ip6_hdr *ip6; + struct mbuf *m; + struct ip6ctlparam *ip6cp = NULL; + const struct sockaddr_in6 *sa6_src = NULL; + int off; + struct tcp_portonly { + u_int16_t th_sport; + u_int16_t th_dport; + } *thp; + + if (sa->sa_family != AF_INET6 || + sa->sa_len != sizeof(struct sockaddr_in6)) + return; + + if (cmd == PRC_MSGSIZE) + notify = tcp_mtudisc; + else if (!PRC_IS_REDIRECT(cmd) && + ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0)) + return; + /* Source quench is deprecated. */ + else if (cmd == PRC_QUENCH) + return; + + /* if the parameter is from icmp6, decode it. */ + if (d != NULL) { + ip6cp = (struct ip6ctlparam *)d; + m = ip6cp->ip6c_m; + ip6 = ip6cp->ip6c_ip6; + off = ip6cp->ip6c_off; + sa6_src = ip6cp->ip6c_src; + } else { + m = NULL; + ip6 = NULL; + off = 0; /* fool gcc */ + sa6_src = &sa6_any; + } + + if (ip6 != NULL) { + struct in_conninfo inc; + /* + * XXX: We assume that when IPV6 is non NULL, + * M and OFF are valid.
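+	 * Only the TCP port numbers are needed from the segment, + * hence the tcp_portonly-sized m_copydata() below.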
+ */ + + /* check if we can safely examine src and dst ports */ + if (m->m_pkthdr.len < off + sizeof(*thp)) + return; + + bzero(&th, sizeof(th)); + m_copydata(m, off, sizeof(*thp), (caddr_t)&th); + + in6_pcbnotify(&tcbinfo, sa, th.th_dport, + (struct sockaddr *)ip6cp->ip6c_src, + th.th_sport, cmd, NULL, notify); + + inc.inc_fport = th.th_dport; + inc.inc_lport = th.th_sport; + inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr; + inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr; + inc.inc_isipv6 = 1; + INP_INFO_WLOCK(&tcbinfo); + syncache_unreach(&inc, &th); + INP_INFO_WUNLOCK(&tcbinfo); + } else + in6_pcbnotify(&tcbinfo, sa, 0, (const struct sockaddr *)sa6_src, + 0, cmd, NULL, notify); +} +#endif /* INET6 */ + + +/* + * Following is where TCP initial sequence number generation occurs. + * + * There are two places where we must use initial sequence numbers: + * 1. In SYN-ACK packets. + * 2. In SYN packets. + * + * All ISNs for SYN-ACK packets are generated by the syncache. See + * tcp_syncache.c for details. + * + * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling + * depends on this property. In addition, these ISNs should be + * unguessable so as to prevent connection hijacking. To satisfy + * the requirements of this situation, the algorithm outlined in + * RFC 1948 is used, with only small modifications. + * + * Implementation details: + * + * Time is based off the system timer, and is corrected so that it + * increases by one megabyte per second. This allows for proper + * recycling on high speed LANs while still leaving over an hour + * before rollover. + * + * As reading the *exact* system time is too expensive to be done + * whenever setting up a TCP connection, we increment the time + * offset in two ways. First, a small random positive increment + * is added to isn_offset for each connection that is set up. + * Second, the function tcp_isn_tick fires once per clock tick + * and increments isn_offset as necessary so that sequence numbers + * are incremented at approximately ISN_BYTES_PER_SECOND. The + * random positive increments serve only to ensure that the same + * exact sequence number is never sent out twice (as could otherwise + * happen when a port is recycled in less than the system tick + * interval.) + * + * net.inet.tcp.isn_reseed_interval controls the number of seconds + * between seeding of isn_secret. This is normally set to zero, + * as reseeding should not be necessary. + * + * Locking of the global variables isn_secret, isn_last_reseed, isn_offset, + * isn_offset_old, and isn_ctx is performed using the TCP pcbinfo lock. In + * general, this means holding an exclusive (write) lock. + */ + +#define ISN_BYTES_PER_SECOND 1048576 +#define ISN_STATIC_INCREMENT 4096 +#define ISN_RANDOM_INCREMENT (4096 - 1) + + +/* + * When a specific ICMP unreachable message is received and the + * connection state is SYN-SENT, drop the connection. This behavior + * is controlled by the icmp_may_rst sysctl. + */ +static struct inpcb * +cxgb_tcp_drop_syn_sent(struct inpcb *inp, int errno) +{ + struct tcpcb *tp; + + INP_INFO_WLOCK_ASSERT(&tcbinfo); + INP_LOCK_ASSERT(inp); + + if ((inp->inp_vflag & INP_TIMEWAIT) || + (inp->inp_vflag & INP_DROPPED)) + return (inp); + + tp = intotcpcb(inp); + if (tp->t_state != TCPS_SYN_SENT) + return (inp); + + tp = cxgb_tcp_drop(tp, errno); + if (tp != NULL) + return (inp); + else + return (NULL); +} + +static int +cxgb_sysctl_drop(SYSCTL_HANDLER_ARGS) +{ + /* addrs[0] is a foreign socket, addrs[1] is a local one. 
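+	 * The handler is write-only and takes exactly two struct + * sockaddr_storage values; a hedged usage sketch, mirroring how + * tcpdrop(8) drives the stock net.inet.tcp.drop node: fill addrs[0] + * with the foreign and addrs[1] with the local endpoint, then call + * sysctlbyname("net.inet.tcp.cxgb.drop", NULL, NULL, addrs, + * sizeof(addrs)).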
*/ + struct sockaddr_storage addrs[2]; + struct inpcb *inp; + struct tcpcb *tp; + struct tcptw *tw; + struct sockaddr_in *fin, *lin; +#ifdef INET6 + struct sockaddr_in6 *fin6, *lin6; + struct in6_addr f6, l6; +#endif + int error; + + inp = NULL; + fin = lin = NULL; +#ifdef INET6 + fin6 = lin6 = NULL; +#endif + error = 0; + + if (req->oldptr != NULL || req->oldlen != 0) + return (EINVAL); + if (req->newptr == NULL) + return (EPERM); + if (req->newlen < sizeof(addrs)) + return (ENOMEM); + error = SYSCTL_IN(req, &addrs, sizeof(addrs)); + if (error) + return (error); + + switch (addrs[0].ss_family) { +#ifdef INET6 + case AF_INET6: + fin6 = (struct sockaddr_in6 *)&addrs[0]; + lin6 = (struct sockaddr_in6 *)&addrs[1]; + if (fin6->sin6_len != sizeof(struct sockaddr_in6) || + lin6->sin6_len != sizeof(struct sockaddr_in6)) + return (EINVAL); + if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) { + if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr)) + return (EINVAL); + in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]); + in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]); + fin = (struct sockaddr_in *)&addrs[0]; + lin = (struct sockaddr_in *)&addrs[1]; + break; + } + error = sa6_embedscope(fin6, ip6_use_defzone); + if (error) + return (error); + error = sa6_embedscope(lin6, ip6_use_defzone); + if (error) + return (error); + break; +#endif + case AF_INET: + fin = (struct sockaddr_in *)&addrs[0]; + lin = (struct sockaddr_in *)&addrs[1]; + if (fin->sin_len != sizeof(struct sockaddr_in) || + lin->sin_len != sizeof(struct sockaddr_in)) + return (EINVAL); + break; + default: + return (EINVAL); + } + INP_INFO_WLOCK(&tcbinfo); + switch (addrs[0].ss_family) { +#ifdef INET6 + case AF_INET6: + inp = in6_pcblookup_hash(&tcbinfo, &f6, fin6->sin6_port, + &l6, lin6->sin6_port, 0, NULL); + break; +#endif + case AF_INET: + inp = in_pcblookup_hash(&tcbinfo, fin->sin_addr, fin->sin_port, + lin->sin_addr, lin->sin_port, 0, NULL); + break; + } + if (inp != NULL) { + INP_LOCK(inp); + if (inp->inp_vflag & INP_TIMEWAIT) { + /* + * XXXRW: There currently exists a state where an + * inpcb is present, but its timewait state has been + * discarded. For now, don't allow dropping of this + * type of inpcb. + */ + tw = intotw(inp); + if (tw != NULL) + tcp_twclose(tw, 0); + else + INP_UNLOCK(inp); + } else if (!(inp->inp_vflag & INP_DROPPED) && + !(inp->inp_socket->so_options & SO_ACCEPTCONN)) { + tp = intotcpcb(inp); + tp = cxgb_tcp_drop(tp, ECONNABORTED); + if (tp != NULL) + INP_UNLOCK(inp); + } else + INP_UNLOCK(inp); + } else + error = ESRCH; + INP_INFO_WUNLOCK(&tcbinfo); + return (error); +} + +SYSCTL_PROC(_net_inet_tcp_cxgb, TCPCTL_DROP, drop, + CTLTYPE_STRUCT|CTLFLAG_WR|CTLFLAG_SKIP, NULL, + 0, cxgb_sysctl_drop, "", "Drop TCP connection"); + diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tcp_usrreq.c b/sys/dev/cxgb/ulp/tom/cxgb_tcp_usrreq.c new file mode 100644 index 000000000000..bd940b2cb17c --- /dev/null +++ b/sys/dev/cxgb/ulp/tom/cxgb_tcp_usrreq.c @@ -0,0 +1,1362 @@ +/*- + * Copyright (c) 1982, 1986, 1988, 1993 + * The Regents of the University of California. + * Copyright (c) 2006-2007 Robert N. M. Watson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * From: @(#)tcp_usrreq.c 8.2 (Berkeley) 1/3/94 + */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_ddb.h" +#include "opt_inet.h" +#include "opt_inet6.h" +#include "opt_tcpdebug.h" + +#include +#include +#include +#include +#include +#include +#ifdef INET6 +#include +#endif /* INET6 */ +#include +#include +#include +#include +#include + +#ifdef DDB +#include +#endif + +#include +#include + +#include +#include +#ifdef INET6 +#include +#endif +#include +#ifdef INET6 +#include +#endif +#include +#include +#ifdef INET6 +#include +#include +#endif +#include +#include +#include +#include +#include +#include +#ifdef TCPDEBUG +#include +#endif +#include +#include + + +/* + * TCP protocol interface to socket abstraction. + */ +static int tcp_attach(struct socket *); +static int tcp_connect(struct tcpcb *, struct sockaddr *, + struct thread *td); +#ifdef INET6 +static int tcp6_connect(struct tcpcb *, struct sockaddr *, + struct thread *td); +#endif /* INET6 */ +static void tcp_disconnect(struct tcpcb *); +static void tcp_usrclosed(struct tcpcb *); + +#ifdef TCPDEBUG +#define TCPDEBUG0 int ostate = 0 +#define TCPDEBUG1() ostate = tp ? tp->t_state : 0 +#define TCPDEBUG2(req) if (tp && (so->so_options & SO_DEBUG)) \ + tcp_trace(TA_USER, ostate, tp, 0, 0, req) +#else +#define TCPDEBUG0 +#define TCPDEBUG1() +#define TCPDEBUG2(req) +#endif + +/* + * TCP attaches to socket via pru_attach(), reserving space, + * and an internet control block. + */ +static int +tcp_usr_attach(struct socket *so, int proto, struct thread *td) +{ + struct inpcb *inp; + struct tcpcb *tp = NULL; + int error; + TCPDEBUG0; + + inp = sotoinpcb(so); + KASSERT(inp == NULL, ("tcp_usr_attach: inp != NULL")); + TCPDEBUG1(); + + error = tcp_attach(so); + if (error) + goto out; + + if ((so->so_options & SO_LINGER) && so->so_linger == 0) + so->so_linger = TCP_LINGERTIME; + + inp = sotoinpcb(so); + tp = intotcpcb(inp); +out: + TCPDEBUG2(PRU_ATTACH); + return error; +} + +/* + * tcp_detach is called when the socket layer loses its final reference + * to the socket, be it a file descriptor reference, a reference from TCP, + * etc. At this point, there is only one case in which we will keep around + * inpcb state: time wait. + * + * This function can probably be re-absorbed back into tcp_usr_detach() now + * that there is a single detach path. 
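+ * + * The caller must hold the tcbinfo write lock and the inpcb lock; + * both are asserted on entry.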
+ */
+static void
+tcp_detach(struct socket *so, struct inpcb *inp)
+{
+        struct tcpcb *tp;
+#ifdef INET6
+        int isipv6 = INP_CHECK_SOCKAF(so, AF_INET6) != 0;
+#endif
+
+        INP_INFO_WLOCK_ASSERT(&tcbinfo);
+        INP_LOCK_ASSERT(inp);
+
+        KASSERT(so->so_pcb == inp, ("tcp_detach: so_pcb != inp"));
+        KASSERT(inp->inp_socket == so, ("tcp_detach: inp_socket != so"));
+
+        tp = intotcpcb(inp);
+
+        if (inp->inp_vflag & INP_TIMEWAIT) {
+                /*
+                 * There are two cases to handle: one in which the time wait
+                 * state is being discarded (INP_DROPPED), and one in which
+                 * this connection will remain in timewait.  In the former,
+                 * it is time to discard all state (except tcptw, which has
+                 * already been discarded by the timewait close code, which
+                 * should be further up the call stack somewhere).  In the
+                 * latter case, we detach from the socket, but leave the pcb
+                 * present until timewait ends.
+                 *
+                 * XXXRW: Would it be cleaner to free the tcptw here?
+                 */
+                if (inp->inp_vflag & INP_DROPPED) {
+                        KASSERT(tp == NULL, ("tcp_detach: INP_TIMEWAIT && "
+                            "INP_DROPPED && tp != NULL"));
+#ifdef INET6
+                        if (isipv6) {
+                                in6_pcbdetach(inp);
+                                in6_pcbfree(inp);
+                        } else {
+#endif
+                                in_pcbdetach(inp);
+                                in_pcbfree(inp);
+#ifdef INET6
+                        }
+#endif
+                } else {
+#ifdef INET6
+                        if (isipv6)
+                                in6_pcbdetach(inp);
+                        else
+#endif
+                                in_pcbdetach(inp);
+                        INP_UNLOCK(inp);
+                }
+        } else {
+                /*
+                 * If the connection is not in timewait, we consider two
+                 * conditions: one in which no further processing is
+                 * necessary (dropped || embryonic), and one in which TCP is
+                 * not yet done, but no longer requires the socket, so the
+                 * pcb will persist for the time being.
+                 *
+                 * XXXRW: Does the second case still occur?
+                 */
+                if (inp->inp_vflag & INP_DROPPED ||
+                    tp->t_state < TCPS_SYN_SENT) {
+                        tcp_discardcb(tp);
+#ifdef INET6
+                        if (isipv6) {
+                                in6_pcbdetach(inp);
+                                in6_pcbfree(inp);
+                        } else {
+#endif
+                                in_pcbdetach(inp);
+                                in_pcbfree(inp);
+#ifdef INET6
+                        }
+#endif
+                } else {
+#ifdef INET6
+                        if (isipv6)
+                                in6_pcbdetach(inp);
+                        else
+#endif
+                                in_pcbdetach(inp);
+                }
+        }
+}
+
+/*
+ * pru_detach() detaches the TCP protocol from the socket.
+ * If the protocol state is non-embryonic, then can't
+ * do this directly: have to initiate a pru_disconnect(),
+ * which may finish later; embryonic TCB's can just
+ * be discarded here.
+ */
+static void
+tcp_usr_detach(struct socket *so)
+{
+        struct inpcb *inp;
+
+        inp = sotoinpcb(so);
+        KASSERT(inp != NULL, ("tcp_usr_detach: inp == NULL"));
+        INP_INFO_WLOCK(&tcbinfo);
+        INP_LOCK(inp);
+        KASSERT(inp->inp_socket != NULL,
+            ("tcp_usr_detach: inp_socket == NULL"));
+        tcp_detach(so, inp);
+        INP_INFO_WUNLOCK(&tcbinfo);
+}
+
+/*
+ * Give the socket an address.
+ */
+static int
+tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+        int error = 0;
+        struct inpcb *inp;
+        struct tcpcb *tp = NULL;
+        struct sockaddr_in *sinp;
+
+        sinp = (struct sockaddr_in *)nam;
+        if (nam->sa_len != sizeof (*sinp))
+                return (EINVAL);
+        /*
+         * Must check for multicast addresses and disallow binding
+         * to them.
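+         * (A bind to a multicast group can never be made to work for
+         * TCP, so it is cheaper to reject it here with EAFNOSUPPORT
+         * than to let in_pcbbind() discover the problem later.)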
+ */ + if (sinp->sin_family == AF_INET && + IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) + return (EAFNOSUPPORT); + + TCPDEBUG0; + INP_INFO_WLOCK(&tcbinfo); + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("tcp_usr_bind: inp == NULL")); + INP_LOCK(inp); + if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { + error = EINVAL; + goto out; + } + tp = intotcpcb(inp); + TCPDEBUG1(); + error = in_pcbbind(inp, nam, td->td_ucred); +out: + TCPDEBUG2(PRU_BIND); + INP_UNLOCK(inp); + INP_INFO_WUNLOCK(&tcbinfo); + + return (error); +} + +#ifdef INET6 +static int +tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td) +{ + int error = 0; + struct inpcb *inp; + struct tcpcb *tp = NULL; + struct sockaddr_in6 *sin6p; + + sin6p = (struct sockaddr_in6 *)nam; + if (nam->sa_len != sizeof (*sin6p)) + return (EINVAL); + /* + * Must check for multicast addresses and disallow binding + * to them. + */ + if (sin6p->sin6_family == AF_INET6 && + IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) + return (EAFNOSUPPORT); + + TCPDEBUG0; + INP_INFO_WLOCK(&tcbinfo); + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("tcp6_usr_bind: inp == NULL")); + INP_LOCK(inp); + if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { + error = EINVAL; + goto out; + } + tp = intotcpcb(inp); + TCPDEBUG1(); + inp->inp_vflag &= ~INP_IPV4; + inp->inp_vflag |= INP_IPV6; + if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) { + if (IN6_IS_ADDR_UNSPECIFIED(&sin6p->sin6_addr)) + inp->inp_vflag |= INP_IPV4; + else if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { + struct sockaddr_in sin; + + in6_sin6_2_sin(&sin, sin6p); + inp->inp_vflag |= INP_IPV4; + inp->inp_vflag &= ~INP_IPV6; + error = in_pcbbind(inp, (struct sockaddr *)&sin, + td->td_ucred); + goto out; + } + } + error = in6_pcbbind(inp, nam, td->td_ucred); +out: + TCPDEBUG2(PRU_BIND); + INP_UNLOCK(inp); + INP_INFO_WUNLOCK(&tcbinfo); + return (error); +} +#endif /* INET6 */ + +/* + * Prepare to accept connections. 
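+ *
+ * Offload note: once the socket has entered LISTEN below, the
+ * tcp_gen_listen_open() hook gives the TOE a chance to set up a
+ * matching hardware listener for offloaded connections.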
+ */ +static int +tcp_usr_listen(struct socket *so, int backlog, struct thread *td) +{ + int error = 0; + struct inpcb *inp; + struct tcpcb *tp = NULL; + + TCPDEBUG0; + INP_INFO_WLOCK(&tcbinfo); + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("tcp_usr_listen: inp == NULL")); + INP_LOCK(inp); + if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { + error = EINVAL; + goto out; + } + tp = intotcpcb(inp); + TCPDEBUG1(); + SOCK_LOCK(so); + error = solisten_proto_check(so); + if (error == 0 && inp->inp_lport == 0) + error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); + if (error == 0) { + tp->t_state = TCPS_LISTEN; + solisten_proto(so, backlog); + tcp_gen_listen_open(tp); + } + SOCK_UNLOCK(so); + +out: + TCPDEBUG2(PRU_LISTEN); + INP_UNLOCK(inp); + INP_INFO_WUNLOCK(&tcbinfo); + return (error); +} + +#ifdef INET6 +static int +tcp6_usr_listen(struct socket *so, int backlog, struct thread *td) +{ + int error = 0; + struct inpcb *inp; + struct tcpcb *tp = NULL; + + TCPDEBUG0; + INP_INFO_WLOCK(&tcbinfo); + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("tcp6_usr_listen: inp == NULL")); + INP_LOCK(inp); + if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { + error = EINVAL; + goto out; + } + tp = intotcpcb(inp); + TCPDEBUG1(); + SOCK_LOCK(so); + error = solisten_proto_check(so); + if (error == 0 && inp->inp_lport == 0) { + inp->inp_vflag &= ~INP_IPV4; + if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) + inp->inp_vflag |= INP_IPV4; + error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); + } + if (error == 0) { + tp->t_state = TCPS_LISTEN; + solisten_proto(so, backlog); + } + SOCK_UNLOCK(so); + +out: + TCPDEBUG2(PRU_LISTEN); + INP_UNLOCK(inp); + INP_INFO_WUNLOCK(&tcbinfo); + return (error); +} +#endif /* INET6 */ + +/* + * Initiate connection to peer. + * Create a template for use in transmissions on this connection. + * Enter SYN_SENT state, and mark socket as connecting. + * Start keep-alive timer, and seed output sequence space. + * Send initial segment on connection. + */ +static int +tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) +{ + int error = 0; + struct inpcb *inp; + struct tcpcb *tp = NULL; + struct sockaddr_in *sinp; + + sinp = (struct sockaddr_in *)nam; + if (nam->sa_len != sizeof (*sinp)) + return (EINVAL); + /* + * Must disallow TCP ``connections'' to multicast addresses. + */ + if (sinp->sin_family == AF_INET + && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) + return (EAFNOSUPPORT); + if (jailed(td->td_ucred)) + prison_remote_ip(td->td_ucred, 0, &sinp->sin_addr.s_addr); + + TCPDEBUG0; + INP_INFO_WLOCK(&tcbinfo); + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("tcp_usr_connect: inp == NULL")); + INP_LOCK(inp); + if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { + error = EINVAL; + goto out; + } + tp = intotcpcb(inp); + TCPDEBUG1(); + if ((error = tcp_connect(tp, nam, td)) != 0) + goto out; + printf("calling tcp_gen_connect\n"); + + error = tcp_gen_connect(so, nam); +out: + TCPDEBUG2(PRU_CONNECT); + INP_UNLOCK(inp); + INP_INFO_WUNLOCK(&tcbinfo); + return (error); +} + +#ifdef INET6 +static int +tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td) +{ + int error = 0; + struct inpcb *inp; + struct tcpcb *tp = NULL; + struct sockaddr_in6 *sin6p; + + TCPDEBUG0; + + sin6p = (struct sockaddr_in6 *)nam; + if (nam->sa_len != sizeof (*sin6p)) + return (EINVAL); + /* + * Must disallow TCP ``connections'' to multicast addresses. 
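+         * A v4-mapped destination is legal, however: it is handled
+         * below by downgrading the pcb to IPv4 and taking the
+         * tcp_connect() path rather than tcp6_connect().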
+ */ + if (sin6p->sin6_family == AF_INET6 + && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr)) + return (EAFNOSUPPORT); + + INP_INFO_WLOCK(&tcbinfo); + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("tcp6_usr_connect: inp == NULL")); + INP_LOCK(inp); + if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { + error = EINVAL; + goto out; + } + tp = intotcpcb(inp); + TCPDEBUG1(); + if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) { + struct sockaddr_in sin; + + if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) { + error = EINVAL; + goto out; + } + + in6_sin6_2_sin(&sin, sin6p); + inp->inp_vflag |= INP_IPV4; + inp->inp_vflag &= ~INP_IPV6; + if ((error = tcp_connect(tp, (struct sockaddr *)&sin, td)) != 0) + goto out; + error = tcp_gen_connect(so, nam); + goto out; + } + inp->inp_vflag &= ~INP_IPV4; + inp->inp_vflag |= INP_IPV6; + inp->inp_inc.inc_isipv6 = 1; + if ((error = tcp6_connect(tp, nam, td)) != 0) + goto out; + error = tcp_gen_connect(so, nam); + +out: + TCPDEBUG2(PRU_CONNECT); + INP_UNLOCK(inp); + INP_INFO_WUNLOCK(&tcbinfo); + return (error); +} +#endif /* INET6 */ + +/* + * Initiate disconnect from peer. + * If connection never passed embryonic stage, just drop; + * else if don't need to let data drain, then can just drop anyways, + * else have to begin TCP shutdown process: mark socket disconnecting, + * drain unread data, state switch to reflect user close, and + * send segment (e.g. FIN) to peer. Socket will be really disconnected + * when peer sends FIN and acks ours. + * + * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB. + */ +static int +tcp_usr_disconnect(struct socket *so) +{ + struct inpcb *inp; + struct tcpcb *tp = NULL; + int error = 0; + + TCPDEBUG0; + INP_INFO_WLOCK(&tcbinfo); + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("tcp_usr_disconnect: inp == NULL")); + INP_LOCK(inp); + if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { + error = ECONNRESET; + goto out; + } + tp = intotcpcb(inp); + TCPDEBUG1(); + tcp_disconnect(tp); +out: + TCPDEBUG2(PRU_DISCONNECT); + INP_UNLOCK(inp); + INP_INFO_WUNLOCK(&tcbinfo); + return (error); +} + +/* + * Accept a connection. Essentially all the work is + * done at higher levels; just return the address + * of the peer, storing through addr. + */ +static int +tcp_usr_accept(struct socket *so, struct sockaddr **nam) +{ + int error = 0; + struct inpcb *inp = NULL; + struct tcpcb *tp = NULL; + struct in_addr addr; + in_port_t port = 0; + TCPDEBUG0; + + if (so->so_state & SS_ISDISCONNECTED) + return (ECONNABORTED); + + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("tcp_usr_accept: inp == NULL")); + INP_LOCK(inp); + if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { + error = ECONNABORTED; + goto out; + } + tp = intotcpcb(inp); + TCPDEBUG1(); + + /* + * We inline in_getpeeraddr and COMMON_END here, so that we can + * copy the data of interest and defer the malloc until after we + * release the lock. 
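+         * (The deferral matters because in_sockaddr() allocates the
+         * returned sockaddr with M_WAITOK, and a sleeping allocation
+         * may not be made while the inpcb lock is held.)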
+ */ + port = inp->inp_fport; + addr = inp->inp_faddr; + +out: + TCPDEBUG2(PRU_ACCEPT); + INP_UNLOCK(inp); + if (error == 0) + *nam = in_sockaddr(port, &addr); + return error; +} + +#ifdef INET6 +static int +tcp6_usr_accept(struct socket *so, struct sockaddr **nam) +{ + struct inpcb *inp = NULL; + int error = 0; + struct tcpcb *tp = NULL; + struct in_addr addr; + struct in6_addr addr6; + in_port_t port = 0; + int v4 = 0; + TCPDEBUG0; + + if (so->so_state & SS_ISDISCONNECTED) + return (ECONNABORTED); + + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("tcp6_usr_accept: inp == NULL")); + INP_LOCK(inp); + if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { + error = ECONNABORTED; + goto out; + } + tp = intotcpcb(inp); + TCPDEBUG1(); + + /* + * We inline in6_mapped_peeraddr and COMMON_END here, so that we can + * copy the data of interest and defer the malloc until after we + * release the lock. + */ + if (inp->inp_vflag & INP_IPV4) { + v4 = 1; + port = inp->inp_fport; + addr = inp->inp_faddr; + } else { + port = inp->inp_fport; + addr6 = inp->in6p_faddr; + } + +out: + TCPDEBUG2(PRU_ACCEPT); + INP_UNLOCK(inp); + if (error == 0) { + if (v4) + *nam = in6_v4mapsin6_sockaddr(port, &addr); + else + *nam = in6_sockaddr(port, &addr6); + } + return error; +} +#endif /* INET6 */ + +/* + * Mark the connection as being incapable of further output. + */ +static int +tcp_usr_shutdown(struct socket *so) +{ + int error = 0; + struct inpcb *inp; + struct tcpcb *tp = NULL; + + TCPDEBUG0; + INP_INFO_WLOCK(&tcbinfo); + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("inp == NULL")); + INP_LOCK(inp); + if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { + error = ECONNRESET; + goto out; + } + tp = intotcpcb(inp); + TCPDEBUG1(); + socantsendmore(so); + tcp_usrclosed(tp); + error = tcp_gen_disconnect(tp); + +out: + TCPDEBUG2(PRU_SHUTDOWN); + INP_UNLOCK(inp); + INP_INFO_WUNLOCK(&tcbinfo); + + return (error); +} + +/* + * After a receive, possibly send window update to peer. + */ +static int +tcp_usr_rcvd(struct socket *so, int flags) +{ + struct inpcb *inp; + struct tcpcb *tp = NULL; + int error = 0; + + TCPDEBUG0; + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("tcp_usr_rcvd: inp == NULL")); + INP_LOCK(inp); + if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { + error = ECONNRESET; + goto out; + } + tp = intotcpcb(inp); + TCPDEBUG1(); + tcp_gen_rcvd(tp); + +out: + TCPDEBUG2(PRU_RCVD); + INP_UNLOCK(inp); + return (error); +} + +/* + * Do a send by putting data in output queue and updating urgent + * marker if URG set. Possibly send more data. Unlike the other + * pru_*() routines, the mbuf chains are our responsibility. We + * must either enqueue them or free them. The other pru_* routines + * generally are caller-frees. + */ +static int +tcp_usr_send(struct socket *so, int flags, struct mbuf *m, + struct sockaddr *nam, struct mbuf *control, struct thread *td) +{ + int error = 0; + struct inpcb *inp; + struct tcpcb *tp = NULL; + int headlocked = 0; +#ifdef INET6 + int isipv6; +#endif + TCPDEBUG0; + + /* + * We require the pcbinfo lock in two cases: + * + * (1) An implied connect is taking place, which can result in + * binding IPs and ports and hence modification of the pcb hash + * chains. + * + * (2) PRUS_EOF is set, resulting in explicit close on the send. 
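+         * Everywhere else the send path runs under the inpcb lock
+         * alone; "headlocked" below tracks whether the pcbinfo lock
+         * is still held so that it can be dropped before the data is
+         * handed to tcp_gen_send().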
+ */ + if ((nam != NULL) || (flags & PRUS_EOF)) { + INP_INFO_WLOCK(&tcbinfo); + headlocked = 1; + } + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("tcp_usr_send: inp == NULL")); + INP_LOCK(inp); + if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { + if (control) + m_freem(control); + if (m) + m_freem(m); + error = ECONNRESET; + goto out; + } +#ifdef INET6 + isipv6 = nam && nam->sa_family == AF_INET6; +#endif /* INET6 */ + tp = intotcpcb(inp); + TCPDEBUG1(); + if (control) { + /* TCP doesn't do control messages (rights, creds, etc) */ + if (control->m_len) { + m_freem(control); + if (m) + m_freem(m); + error = EINVAL; + goto out; + } + m_freem(control); /* empty control, just free it */ + } + if (!(flags & PRUS_OOB)) { + sbappendstream(&so->so_snd, m); + if (nam && tp->t_state < TCPS_SYN_SENT) { + /* + * Do implied connect if not yet connected, + * initialize window to default value, and + * initialize maxseg/maxopd using peer's cached + * MSS. + */ + INP_INFO_WLOCK_ASSERT(&tcbinfo); +#ifdef INET6 + if (isipv6) + error = tcp6_connect(tp, nam, td); + else +#endif /* INET6 */ + error = tcp_connect(tp, nam, td); + if (error) + goto out; + tp->snd_wnd = TTCP_CLIENT_SND_WND; + tcp_mss(tp, -1); + } + if (flags & PRUS_EOF) { + /* + * Close the send side of the connection after + * the data is sent. + */ + INP_INFO_WLOCK_ASSERT(&tcbinfo); + socantsendmore(so); + tcp_usrclosed(tp); + } + if (headlocked) { + INP_INFO_WUNLOCK(&tcbinfo); + headlocked = 0; + } + if (tp != NULL) { + if (flags & PRUS_MORETOCOME) + tp->t_flags |= TF_MORETOCOME; + error = tcp_gen_send(tp); + if (flags & PRUS_MORETOCOME) + tp->t_flags &= ~TF_MORETOCOME; + } + } else { + /* + * XXXRW: PRUS_EOF not implemented with PRUS_OOB? + */ + SOCKBUF_LOCK(&so->so_snd); + if (sbspace(&so->so_snd) < -512) { + SOCKBUF_UNLOCK(&so->so_snd); + m_freem(m); + error = ENOBUFS; + goto out; + } + /* + * According to RFC961 (Assigned Protocols), + * the urgent pointer points to the last octet + * of urgent data. We continue, however, + * to consider it to indicate the first octet + * of data past the urgent section. + * Otherwise, snd_up should be one lower. + */ + sbappendstream_locked(&so->so_snd, m); + SOCKBUF_UNLOCK(&so->so_snd); + if (nam && tp->t_state < TCPS_SYN_SENT) { + /* + * Do implied connect if not yet connected, + * initialize window to default value, and + * initialize maxseg/maxopd using peer's cached + * MSS. + */ + INP_INFO_WLOCK_ASSERT(&tcbinfo); +#ifdef INET6 + if (isipv6) + error = tcp6_connect(tp, nam, td); + else +#endif /* INET6 */ + error = tcp_connect(tp, nam, td); + if (error) + goto out; + tp->snd_wnd = TTCP_CLIENT_SND_WND; + tcp_mss(tp, -1); + INP_INFO_WUNLOCK(&tcbinfo); + headlocked = 0; + } else if (nam) { + INP_INFO_WUNLOCK(&tcbinfo); + headlocked = 0; + } + tp->snd_up = tp->snd_una + so->so_snd.sb_cc; + tp->t_flags |= TF_FORCEDATA; + error = tcp_gen_send(tp); + tp->t_flags &= ~TF_FORCEDATA; + } +out: + TCPDEBUG2((flags & PRUS_OOB) ? PRU_SENDOOB : + ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND)); + INP_UNLOCK(inp); + if (headlocked) + INP_INFO_WUNLOCK(&tcbinfo); + return (error); +} + +/* + * Abort the TCP. Drop the connection abruptly. 
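+ *
+ * If the inpcb is not yet INP_DROPPED once the drop completes, a
+ * protocol reference (SS_PROTOREF/INP_SOCKREF) is taken below so the
+ * pcb keeps the socket alive until TCP is finished with it.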
+ */ +static void +tcp_usr_abort(struct socket *so) +{ + struct inpcb *inp; + struct tcpcb *tp = NULL; + TCPDEBUG0; + + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("tcp_usr_abort: inp == NULL")); + + INP_INFO_WLOCK(&tcbinfo); + INP_LOCK(inp); + KASSERT(inp->inp_socket != NULL, + ("tcp_usr_abort: inp_socket == NULL")); + + /* + * If we still have full TCP state, and we're not dropped, drop. + */ + if (!(inp->inp_vflag & INP_TIMEWAIT) && + !(inp->inp_vflag & INP_DROPPED)) { + tp = intotcpcb(inp); + TCPDEBUG1(); + cxgb_tcp_drop(tp, ECONNABORTED); + TCPDEBUG2(PRU_ABORT); + } + if (!(inp->inp_vflag & INP_DROPPED)) { + SOCK_LOCK(so); + so->so_state |= SS_PROTOREF; + SOCK_UNLOCK(so); + inp->inp_vflag |= INP_SOCKREF; + } + INP_UNLOCK(inp); + INP_INFO_WUNLOCK(&tcbinfo); +} + +/* + * TCP socket is closed. Start friendly disconnect. + */ +static void +tcp_usr_close(struct socket *so) +{ + struct inpcb *inp; + struct tcpcb *tp = NULL; + TCPDEBUG0; + + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("tcp_usr_close: inp == NULL")); + + INP_INFO_WLOCK(&tcbinfo); + INP_LOCK(inp); + KASSERT(inp->inp_socket != NULL, + ("tcp_usr_close: inp_socket == NULL")); + + /* + * If we still have full TCP state, and we're not dropped, initiate + * a disconnect. + */ + if (!(inp->inp_vflag & INP_TIMEWAIT) && + !(inp->inp_vflag & INP_DROPPED)) { + tp = intotcpcb(inp); + TCPDEBUG1(); + tcp_disconnect(tp); + TCPDEBUG2(PRU_CLOSE); + } + if (!(inp->inp_vflag & INP_DROPPED)) { + SOCK_LOCK(so); + so->so_state |= SS_PROTOREF; + SOCK_UNLOCK(so); + inp->inp_vflag |= INP_SOCKREF; + } + INP_UNLOCK(inp); + INP_INFO_WUNLOCK(&tcbinfo); +} + +/* + * Receive out-of-band data. + */ +static int +tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags) +{ + int error = 0; + struct inpcb *inp; + struct tcpcb *tp = NULL; + + TCPDEBUG0; + inp = sotoinpcb(so); + KASSERT(inp != NULL, ("tcp_usr_rcvoob: inp == NULL")); + INP_LOCK(inp); + if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) { + error = ECONNRESET; + goto out; + } + tp = intotcpcb(inp); + TCPDEBUG1(); + if ((so->so_oobmark == 0 && + (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) || + so->so_options & SO_OOBINLINE || + tp->t_oobflags & TCPOOB_HADDATA) { + error = EINVAL; + goto out; + } + if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) { + error = EWOULDBLOCK; + goto out; + } + m->m_len = 1; + *mtod(m, caddr_t) = tp->t_iobc; + if ((flags & MSG_PEEK) == 0) + tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA); + +out: + TCPDEBUG2(PRU_RCVOOB); + INP_UNLOCK(inp); + return (error); +} + +struct pr_usrreqs cxgb_tcp_usrreqs = { + .pru_abort = tcp_usr_abort, + .pru_accept = tcp_usr_accept, + .pru_attach = tcp_usr_attach, + .pru_bind = tcp_usr_bind, + .pru_connect = tcp_usr_connect, + .pru_control = in_control, + .pru_detach = tcp_usr_detach, + .pru_disconnect = tcp_usr_disconnect, + .pru_listen = tcp_usr_listen, + .pru_peeraddr = in_getpeeraddr, + .pru_rcvd = tcp_usr_rcvd, + .pru_rcvoob = tcp_usr_rcvoob, + .pru_send = tcp_usr_send, + .pru_shutdown = tcp_usr_shutdown, + .pru_sockaddr = in_getsockaddr, + .pru_sosetlabel = in_pcbsosetlabel, + .pru_close = tcp_usr_close, +}; + +#ifdef INET6 +struct pr_usrreqs cxgb_tcp6_usrreqs = { + .pru_abort = tcp_usr_abort, + .pru_accept = tcp6_usr_accept, + .pru_attach = tcp_usr_attach, + .pru_bind = tcp6_usr_bind, + .pru_connect = tcp6_usr_connect, + .pru_control = in6_control, + .pru_detach = tcp_usr_detach, + .pru_disconnect = tcp_usr_disconnect, + .pru_listen = tcp6_usr_listen, + .pru_peeraddr = in6_mapped_peeraddr, + .pru_rcvd = tcp_usr_rcvd, + .pru_rcvoob 
= tcp_usr_rcvoob, + .pru_send = tcp_usr_send, + .pru_shutdown = tcp_usr_shutdown, + .pru_sockaddr = in6_mapped_sockaddr, + .pru_sosetlabel = in_pcbsosetlabel, + .pru_close = tcp_usr_close, +}; +#endif /* INET6 */ + +/* + * Common subroutine to open a TCP connection to remote host specified + * by struct sockaddr_in in mbuf *nam. Call in_pcbbind to assign a local + * port number if needed. Call in_pcbconnect_setup to do the routing and + * to choose a local host address (interface). If there is an existing + * incarnation of the same connection in TIME-WAIT state and if the remote + * host was sending CC options and if the connection duration was < MSL, then + * truncate the previous TIME-WAIT state and proceed. + * Initialize connection parameters and enter SYN-SENT state. + */ +static int +tcp_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) +{ + struct inpcb *inp = tp->t_inpcb, *oinp; + struct socket *so = inp->inp_socket; + struct in_addr laddr; + u_short lport; + int error; + + INP_INFO_WLOCK_ASSERT(&tcbinfo); + INP_LOCK_ASSERT(inp); + + if (inp->inp_lport == 0) { + error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); + if (error) + return error; + } + + /* + * Cannot simply call in_pcbconnect, because there might be an + * earlier incarnation of this same connection still in + * TIME_WAIT state, creating an ADDRINUSE error. + */ + laddr = inp->inp_laddr; + lport = inp->inp_lport; + error = in_pcbconnect_setup(inp, nam, &laddr.s_addr, &lport, + &inp->inp_faddr.s_addr, &inp->inp_fport, &oinp, td->td_ucred); + if (error && oinp == NULL) + return error; + if (oinp) + return EADDRINUSE; + inp->inp_laddr = laddr; + in_pcbrehash(inp); + + /* + * Compute window scaling to request: + * Scale to fit into sweet spot. See tcp_syncache.c. + * XXX: This should move to tcp_output(). + */ + while (tp->request_r_scale < TCP_MAX_WINSHIFT && + (TCP_MAXWIN << tp->request_r_scale) < sb_max) + tp->request_r_scale++; + + soisconnecting(so); + tcpstat.tcps_connattempt++; + tp->t_state = TCPS_SYN_SENT; + tcp_timer_activate(tp, TT_KEEP, tcp_keepinit); + tp->iss = tcp_new_isn(tp); + tp->t_bw_rtseq = tp->iss; + tcp_sendseqinit(tp); + + return 0; +} + +#ifdef INET6 +static int +tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td) +{ + struct inpcb *inp = tp->t_inpcb, *oinp; + struct socket *so = inp->inp_socket; + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam; + struct in6_addr *addr6; + int error; + + INP_INFO_WLOCK_ASSERT(&tcbinfo); + INP_LOCK_ASSERT(inp); + + if (inp->inp_lport == 0) { + error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); + if (error) + return error; + } + + /* + * Cannot simply call in_pcbconnect, because there might be an + * earlier incarnation of this same connection still in + * TIME_WAIT state, creating an ADDRINUSE error. + * in6_pcbladdr() also handles scope zone IDs. + */ + error = in6_pcbladdr(inp, nam, &addr6); + if (error) + return error; + oinp = in6_pcblookup_hash(inp->inp_pcbinfo, + &sin6->sin6_addr, sin6->sin6_port, + IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) + ? 
addr6
+            : &inp->in6p_laddr,
+            inp->inp_lport, 0, NULL);
+        if (oinp)
+                return EADDRINUSE;
+        if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
+                inp->in6p_laddr = *addr6;
+        inp->in6p_faddr = sin6->sin6_addr;
+        inp->inp_fport = sin6->sin6_port;
+        /* update flowinfo - draft-itojun-ipv6-flowlabel-api-00 */
+        inp->in6p_flowinfo &= ~IPV6_FLOWLABEL_MASK;
+        if (inp->in6p_flags & IN6P_AUTOFLOWLABEL)
+                inp->in6p_flowinfo |=
+                    (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
+        in_pcbrehash(inp);
+
+        /* Compute window scaling to request. */
+        while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
+            (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat)
+                tp->request_r_scale++;
+
+        soisconnecting(so);
+        tcpstat.tcps_connattempt++;
+        tp->t_state = TCPS_SYN_SENT;
+        tcp_timer_activate(tp, TT_KEEP, tcp_keepinit);
+        tp->iss = tcp_new_isn(tp);
+        tp->t_bw_rtseq = tp->iss;
+        tcp_sendseqinit(tp);
+
+        return 0;
+}
+#endif /* INET6 */
+
+/*
+ * tcp_sendspace and tcp_recvspace are the default send and receive window
+ * sizes, respectively.  These are obsolescent (this information should
+ * be set by the route).
+ */
+u_long  tcp_sendspace = 1024*32;
+SYSCTL_ULONG(_net_inet_tcp_cxgb, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW,
+    &tcp_sendspace, 0, "Maximum outgoing TCP datagram size");
+u_long  tcp_recvspace = 1024*64;
+SYSCTL_ULONG(_net_inet_tcp_cxgb, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
+    &tcp_recvspace, 0, "Maximum incoming TCP datagram size");
+
+/*
+ * Attach TCP protocol to socket, allocating
+ * internet protocol control block, tcp control block,
+ * buffer space, and entering LISTEN state if to accept connections.
+ */
+static int
+tcp_attach(struct socket *so)
+{
+        struct tcpcb *tp;
+        struct inpcb *inp;
+        int error;
+#ifdef INET6
+        int isipv6 = INP_CHECK_SOCKAF(so, AF_INET6) != 0;
+#endif
+
+        if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
+                error = soreserve(so, tcp_sendspace, tcp_recvspace);
+                if (error)
+                        return (error);
+        }
+        so->so_rcv.sb_flags |= SB_AUTOSIZE;
+        so->so_snd.sb_flags |= SB_AUTOSIZE;
+        INP_INFO_WLOCK(&tcbinfo);
+        error = in_pcballoc(so, &tcbinfo);
+        if (error) {
+                INP_INFO_WUNLOCK(&tcbinfo);
+                return (error);
+        }
+        inp = sotoinpcb(so);
+#ifdef INET6
+        if (isipv6) {
+                inp->inp_vflag |= INP_IPV6;
+                inp->in6p_hops = -1;    /* use kernel default */
+        }
+        else
+#endif
+                inp->inp_vflag |= INP_IPV4;
+        tp = tcp_newtcpcb(inp);
+        if (tp == NULL) {
+#ifdef INET6
+                if (isipv6) {
+                        in6_pcbdetach(inp);
+                        in6_pcbfree(inp);
+                } else {
+#endif
+                        in_pcbdetach(inp);
+                        in_pcbfree(inp);
+#ifdef INET6
+                }
+#endif
+                INP_INFO_WUNLOCK(&tcbinfo);
+                return (ENOBUFS);
+        }
+        tp->t_state = TCPS_CLOSED;
+        INP_UNLOCK(inp);
+        INP_INFO_WUNLOCK(&tcbinfo);
+        return (0);
+}
+
+/*
+ * Initiate (or continue) disconnect.
+ * If embryonic state, just send reset (once).
+ * If in ``let data drain'' option and linger null, just drop.
+ * Otherwise (hard), mark socket disconnecting and drop
+ * current input data; switch states based on user close, and
+ * send segment to peer (with FIN).
+ */
+static void
+tcp_disconnect(struct tcpcb *tp)
+{
+        struct inpcb *inp = tp->t_inpcb;
+        struct socket *so = inp->inp_socket;
+
+        INP_INFO_WLOCK_ASSERT(&tcbinfo);
+        INP_LOCK_ASSERT(inp);
+
+        /*
+         * Neither tcp_close() nor tcp_drop() should return NULL, as the
+         * socket is still open.
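+         * Three cases follow: an embryonic connection is closed
+         * outright; SO_LINGER with a zero linger time drops the
+         * connection; otherwise the user close is recorded by
+         * tcp_usrclosed() and, unless the connection has already been
+         * dropped, tcp_gen_disconnect() lets the offload path send
+         * the FIN.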
+         */
+        if (tp->t_state < TCPS_ESTABLISHED) {
+                tp = cxgb_tcp_close(tp);
+                KASSERT(tp != NULL,
+                    ("tcp_disconnect: tcp_close() returned NULL"));
+        } else if ((so->so_options & SO_LINGER) && so->so_linger == 0) {
+                tp = cxgb_tcp_drop(tp, 0);
+                KASSERT(tp != NULL,
+                    ("tcp_disconnect: tcp_drop() returned NULL"));
+        } else {
+                soisdisconnecting(so);
+                sbflush(&so->so_rcv);
+                tcp_usrclosed(tp);
+                if (!(inp->inp_vflag & INP_DROPPED))
+                        tcp_gen_disconnect(tp);
+        }
+}
+
+/*
+ * User issued close, and wish to trail through shutdown states:
+ * if never received SYN, just forget it.  If got a SYN from peer,
+ * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
+ * If already got a FIN from peer, then almost done; go to LAST_ACK
+ * state.  In all other cases, have already sent FIN to peer (e.g.
+ * after PRU_SHUTDOWN), and just have to play tedious game waiting
+ * for peer to send FIN or not respond to keep-alives, etc.
+ * We can let the user exit from the close as soon as the FIN is acked.
+ */
+static void
+tcp_usrclosed(struct tcpcb *tp)
+{
+
+        INP_INFO_WLOCK_ASSERT(&tcbinfo);
+        INP_LOCK_ASSERT(tp->t_inpcb);
+
+        switch (tp->t_state) {
+        case TCPS_LISTEN:
+                tcp_gen_listen_close(tp);
+                /* FALLTHROUGH */
+        case TCPS_CLOSED:
+                tp->t_state = TCPS_CLOSED;
+                tp = cxgb_tcp_close(tp);
+                /*
+                 * tcp_close() should never return NULL here as the socket is
+                 * still open.
+                 */
+                KASSERT(tp != NULL,
+                    ("tcp_usrclosed: tcp_close() returned NULL"));
+                break;
+
+        case TCPS_SYN_SENT:
+        case TCPS_SYN_RECEIVED:
+                tp->t_flags |= TF_NEEDFIN;
+                break;
+
+        case TCPS_ESTABLISHED:
+                tp->t_state = TCPS_FIN_WAIT_1;
+                break;
+
+        case TCPS_CLOSE_WAIT:
+                tp->t_state = TCPS_LAST_ACK;
+                break;
+        }
+        if (tp->t_state >= TCPS_FIN_WAIT_2) {
+                soisdisconnected(tp->t_inpcb->inp_socket);
+                /* Prevent the connection hanging in FIN_WAIT_2 forever. */
+                if (tp->t_state == TCPS_FIN_WAIT_2) {
+                        int timeout;
+
+                        timeout = (tcp_fast_finwait2_recycle) ?
+ tcp_finwait2_timeout : tcp_maxidle; + tcp_timer_activate(tp, TT_2MSL, timeout); + } + } +} diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tom.c b/sys/dev/cxgb/ulp/tom/cxgb_tom.c index 2dc6150681a1..b5b87b7b0b35 100644 --- a/sys/dev/cxgb/ulp/tom/cxgb_tom.c +++ b/sys/dev/cxgb/ulp/tom/cxgb_tom.c @@ -60,7 +60,7 @@ __FBSDID("$FreeBSD$"); #include #include #include -#include +#include #include #include @@ -77,6 +77,8 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include + static int activated = 1; TUNABLE_INT("hw.t3toe.activated", &activated); @@ -177,6 +179,8 @@ toepcb_release(struct toepcb *toep) static void t3cdev_add(struct tom_data *t) { + printf("t3cdev_add\n"); + mtx_lock(&cxgb_list_lock); TAILQ_INSERT_TAIL(&cxgb_list, t, entry); mtx_unlock(&cxgb_list_lock); @@ -187,7 +191,8 @@ t3cdev_add(struct tom_data *t) * initialize its cpl_handlers * and register it as a T3C client */ -static void t3c_tom_add(struct t3cdev *cdev) +static void +t3c_tom_add(struct t3cdev *cdev) { int i; unsigned int wr_len; @@ -195,9 +200,12 @@ static void t3c_tom_add(struct t3cdev *cdev) struct toedev *tdev; struct adap_ports *port_info; + printf("%s called\n", __FUNCTION__); + + t = malloc(sizeof(*t), M_CXGB, M_NOWAIT|M_ZERO); - if (!t) + if (t == NULL) return; if (cdev->ctl(cdev, GET_WR_LEN, &wr_len) < 0) @@ -226,11 +234,15 @@ static void t3c_tom_add(struct t3cdev *cdev) } TOM_DATA(tdev) = t; + printf("nports=%d\n", port_info->nports); for (i = 0; i < port_info->nports; i++) { struct ifnet *ifp = port_info->lldevs[i]; TOEDEV(ifp) = tdev; + + printf("enabling toe on %p\n", ifp); - ifp->if_capabilities |= IFCAP_TOE; + ifp->if_capabilities |= IFCAP_TOE4; + ifp->if_capenable |= IFCAP_TOE4; } t->ports = port_info; @@ -242,8 +254,10 @@ static void t3c_tom_add(struct t3cdev *cdev) return; out_free_all: + printf("out_free_all fail\n"); free(port_info, M_CXGB); out_free_tom: + printf("out_free_tom fail\n"); free(t, M_CXGB); return; } @@ -293,8 +307,8 @@ can_offload(struct toedev *dev, struct socket *so) atomic_load_acq_int(&t->tids_in_use) + t->atids_in_use < tomd->conf.max_conn); } - -static int tom_ctl(struct toedev *dev, unsigned int req, void *data) +static int +tom_ctl(struct toedev *dev, unsigned int req, void *data) { struct tom_data *t = TOM_DATA(dev); struct t3cdev *cdev = t->cdev; @@ -377,32 +391,33 @@ t3_toe_attach(struct toedev *dev, const struct offload_id *entry) } static void -cxgb_toe_listen(void *unused, int event, struct tcpcb *tp) +cxgb_toe_listen_start(void *unused, struct tcpcb *tp) { struct socket *so = tp->t_inpcb->inp_socket; struct tom_data *p; - - switch (event) { - case OFLD_LISTEN_OPEN: - case OFLD_LISTEN_CLOSE: - mtx_lock(&cxgb_list_lock); - TAILQ_FOREACH(p, &cxgb_list, entry) { - if (event == OFLD_LISTEN_OPEN) - t3_listen_start(&p->tdev, so, p->cdev); - else if (tp->t_state == TCPS_LISTEN) { - printf("stopping listen on port=%d\n", - ntohs(tp->t_inpcb->inp_lport)); - - t3_listen_stop(&p->tdev, so, p->cdev); - } - - } - mtx_unlock(&cxgb_list_lock); - break; - default: - log(LOG_ERR, "unrecognized listen event %d\n", event); - break; + + mtx_lock(&cxgb_list_lock); + TAILQ_FOREACH(p, &cxgb_list, entry) { + t3_listen_start(&p->tdev, so, p->cdev); } + mtx_unlock(&cxgb_list_lock); +} + +static void +cxgb_toe_listen_stop(void *unused, struct tcpcb *tp) +{ + struct socket *so = tp->t_inpcb->inp_socket; + struct tom_data *p; + + mtx_lock(&cxgb_list_lock); + TAILQ_FOREACH(p, &cxgb_list, entry) { + if (tp->t_state == TCPS_LISTEN) { + printf("stopping listen on port=%d\n", + 
ntohs(tp->t_inpcb->inp_lport)); + t3_listen_stop(&p->tdev, so, p->cdev); + } + } + mtx_unlock(&cxgb_list_lock); } static void @@ -416,7 +431,7 @@ cxgb_register_listeners(void) tp = intotcpcb(inp); if (tp->t_state == TCPS_LISTEN) - cxgb_toe_listen(NULL, OFLD_LISTEN_OPEN, tp); + cxgb_toe_listen_start(NULL, tp); } INP_INFO_RUNLOCK(&tcbinfo); } @@ -450,12 +465,19 @@ t3_tom_init(void) "Unable to register Chelsio T3 TCP offload module.\n"); return -1; } + INP_INFO_WLOCK(&tcbinfo); + + INP_INFO_WUNLOCK(&tcbinfo); mtx_init(&cxgb_list_lock, "cxgb tom list", NULL, MTX_DEF); - listen_tag = EVENTHANDLER_REGISTER(ofld_listen, cxgb_toe_listen, NULL, EVENTHANDLER_PRI_ANY); + listen_tag = EVENTHANDLER_REGISTER(tcp_offload_listen_start, + cxgb_toe_listen_start, NULL, EVENTHANDLER_PRI_ANY); + listen_tag = EVENTHANDLER_REGISTER(tcp_offload_listen_stop, + cxgb_toe_listen_stop, NULL, EVENTHANDLER_PRI_ANY); TAILQ_INIT(&cxgb_list); /* Register to offloading devices */ + printf("setting add to %p\n", t3c_tom_add); t3c_tom_client.add = t3c_tom_add; cxgb_register_client(&t3c_tom_client); cxgb_register_listeners(); diff --git a/sys/modules/cxgb/Makefile b/sys/modules/cxgb/Makefile index 120cc9bcaaab..ef633e7492cc 100644 --- a/sys/modules/cxgb/Makefile +++ b/sys/modules/cxgb/Makefile @@ -1,7 +1,7 @@ # $FreeBSD$ SUBDIR= cxgb SUBDIR+= toecore -#SUBDIR+= tom +SUBDIR+= tom #SUBDIR+= iw_cxgb .include diff --git a/sys/modules/cxgb/cxgb/Makefile b/sys/modules/cxgb/cxgb/Makefile index b8455f1c3cb5..1f41ac2a06e9 100644 --- a/sys/modules/cxgb/cxgb/Makefile +++ b/sys/modules/cxgb/cxgb/Makefile @@ -8,11 +8,11 @@ SRCS= cxgb_mc5.c cxgb_vsc8211.c cxgb_ael1002.c cxgb_mv88e1xxx.c SRCS+= cxgb_xgmac.c cxgb_vsc7323.c cxgb_t3_hw.c cxgb_main.c SRCS+= cxgb_sge.c cxgb_lro.c cxgb_offload.c cxgb_l2t.c SRCS+= device_if.h bus_if.h pci_if.h opt_zero.h opt_sched.h -SRCS+= uipc_mvec.c -#SRCS+= cxgb_multiq.c cxgb_support.c +SRCS+= uipc_mvec.c cxgb_support.c +#SRCS+= cxgb_multiq.c CFLAGS+= -DCONFIG_CHELSIO_T3_CORE -g -DCONFIG_DEFINED -DDEFAULT_JUMBO -I${CXGB} -DSMP -CFLAGS+= -DDISABLE_MBUF_IOVEC +#CFLAGS+= -DDISABLE_MBUF_IOVEC #CFLAGS+= -DIFNET_MULTIQUEUE #CFLAGS+= -DINVARIANT_SUPPORT -DINVARIANTS #CFLAGS+= -DWITNESS diff --git a/sys/modules/cxgb/tom/Makefile b/sys/modules/cxgb/tom/Makefile index ece891ce79fb..ba02b9196bb9 100644 --- a/sys/modules/cxgb/tom/Makefile +++ b/sys/modules/cxgb/tom/Makefile @@ -4,5 +4,9 @@ TOM = ${.CURDIR}/../../../dev/cxgb/ulp/tom KMOD= tom SRCS= cxgb_tom.c cxgb_cpl_io.c cxgb_listen.c cxgb_tom_sysctl.c cxgb_cpl_socket.c -SRCS+= device_if.h bus_if.h pci_if.h -.include \ No newline at end of file +SRCS+= cxgb_tcp_subr.c cxgb_tcp_usrreq.c +SRCS+= opt_compat.h opt_inet.h opt_inet6.h opt_ipsec.h opt_mac.h opt_tcpdebug.h opt_ddb.h +SRCS+= device_if.h bus_if.h pci_if.h + +#CFLAGS+= -DDEBUG_PRINT -DDEBUG +.include