From 989ae30975b56b9d5e1171ddec7b11bc650bfbe0 Mon Sep 17 00:00:00 2001 From: np Date: Wed, 11 Jan 2017 23:48:17 +0000 Subject: [PATCH] cxgbe/tom: Add VIMAGE support to the TOE driver. Active Open: - Save the socket's vnet at the time of the active open (t4_connect) and switch to it when processing the reply (do_act_open_rpl or do_act_establish). Passive Open: - Save the listening socket's vnet in the driver's listen_ctx and switch to it when processing incoming SYNs for the socket. - Reject SYNs that arrive on an ifnet that's not in the same vnet as the listening socket. CLIP (Compressed Local IPv6) table: - Add only those IPv6 addresses to the CLIP that are in a vnet associated with one of the card's ifnets. Misc: - Set vnet from the toepcb when processing TCP state transitions. - The kernel sets the vnet when calling the driver's output routine so t4_push_frames runs in proper vnet context already. One exception is when incoming credits trigger tx within the driver's ithread. Set the vnet explicitly in do_fw4_ack for that case. MFC after: 3 days Sponsored by: Chelsio Communications --- sys/dev/cxgbe/tom/t4_connect.c | 5 ++ sys/dev/cxgbe/tom/t4_cpl_io.c | 19 ++++-- sys/dev/cxgbe/tom/t4_ddp.c | 3 +- sys/dev/cxgbe/tom/t4_listen.c | 20 +++++- sys/dev/cxgbe/tom/t4_tom.c | 120 +++++++++++++++++++-------------- sys/dev/cxgbe/tom/t4_tom.h | 2 + 6 files changed, 113 insertions(+), 56 deletions(-) diff --git a/sys/dev/cxgbe/tom/t4_connect.c b/sys/dev/cxgbe/tom/t4_connect.c index f2e4ad456ef4..bd78c7fced61 100644 --- a/sys/dev/cxgbe/tom/t4_connect.c +++ b/sys/dev/cxgbe/tom/t4_connect.c @@ -126,6 +126,7 @@ do_act_establish(struct sge_iq *iq, const struct rss_header *rss, CTR3(KTR_CXGBE, "%s: atid %u, tid %u", __func__, atid, tid); free_atid(sc, atid); + CURVNET_SET(toep->vnet); INP_WLOCK(inp); toep->tid = tid; insert_tid(sc, tid, toep, inp->inp_vflag & INP_IPV6 ? 2 : 1); @@ -141,6 +142,7 @@ do_act_establish(struct sge_iq *iq, const struct rss_header *rss, make_established(toep, cpl->snd_isn, cpl->rcv_isn, cpl->tcp_opt); done: INP_WUNLOCK(inp); + CURVNET_RESTORE(); return (0); } @@ -178,6 +180,7 @@ act_open_failure_cleanup(struct adapter *sc, u_int atid, u_int status) free_atid(sc, atid); toep->tid = -1; + CURVNET_SET(toep->vnet); if (status != EAGAIN) INP_INFO_RLOCK(&V_tcbinfo); INP_WLOCK(inp); @@ -185,6 +188,7 @@ act_open_failure_cleanup(struct adapter *sc, u_int atid, u_int status) final_cpl_received(toep); /* unlocks inp */ if (status != EAGAIN) INP_INFO_RUNLOCK(&V_tcbinfo); + CURVNET_RESTORE(); } /* @@ -360,6 +364,7 @@ t4_connect(struct toedev *tod, struct socket *so, struct rtentry *rt, if (wr == NULL) DONT_OFFLOAD_ACTIVE_OPEN(ENOMEM); + toep->vnet = so->so_vnet; if (sc->tt.ddp && (so->so_options & SO_NO_DDP) == 0) set_tcpddp_ulp_mode(toep); else diff --git a/sys/dev/cxgbe/tom/t4_cpl_io.c b/sys/dev/cxgbe/tom/t4_cpl_io.c index c22f8bb17271..25ad1e355348 100644 --- a/sys/dev/cxgbe/tom/t4_cpl_io.c +++ b/sys/dev/cxgbe/tom/t4_cpl_io.c @@ -306,7 +306,6 @@ make_established(struct toepcb *toep, uint32_t snd_isn, uint32_t rcv_isn, uint16_t tcpopt = be16toh(opt); struct flowc_tx_params ftxp; - CURVNET_SET(so->so_vnet); INP_WLOCK_ASSERT(inp); KASSERT(tp->t_state == TCPS_SYN_SENT || tp->t_state == TCPS_SYN_RECEIVED, @@ -357,7 +356,6 @@ make_established(struct toepcb *toep, uint32_t snd_isn, uint32_t rcv_isn, send_flowc_wr(toep, &ftxp); soisconnected(so); - CURVNET_RESTORE(); } static int @@ -1146,6 +1144,7 @@ do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); + CURVNET_SET(toep->vnet); INP_INFO_RLOCK(&V_tcbinfo); INP_WLOCK(inp); tp = intotcpcb(inp); @@ -1191,6 +1190,7 @@ do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) tcp_twstart(tp); INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ INP_INFO_RUNLOCK(&V_tcbinfo); + CURVNET_RESTORE(); INP_WLOCK(inp); final_cpl_received(toep); @@ -1203,6 +1203,7 @@ do_peer_close(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) done: INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); + CURVNET_RESTORE(); return (0); } @@ -1229,6 +1230,7 @@ do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss, KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); + CURVNET_SET(toep->vnet); INP_INFO_RLOCK(&V_tcbinfo); INP_WLOCK(inp); tp = intotcpcb(inp); @@ -1248,6 +1250,7 @@ do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss, release: INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ INP_INFO_RUNLOCK(&V_tcbinfo); + CURVNET_RESTORE(); INP_WLOCK(inp); final_cpl_received(toep); /* no more CPLs expected */ @@ -1272,6 +1275,7 @@ do_close_con_rpl(struct sge_iq *iq, const struct rss_header *rss, done: INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); + CURVNET_RESTORE(); return (0); } @@ -1345,6 +1349,7 @@ do_abort_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) } inp = toep->inp; + CURVNET_SET(toep->vnet); INP_INFO_RLOCK(&V_tcbinfo); /* for tcp_close */ INP_WLOCK(inp); @@ -1380,6 +1385,7 @@ do_abort_req(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) final_cpl_received(toep); done: INP_INFO_RUNLOCK(&V_tcbinfo); + CURVNET_RESTORE(); send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST); return (0); } @@ -1501,18 +1507,21 @@ do_rx_data(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) DDP_UNLOCK(toep); INP_WUNLOCK(inp); + CURVNET_SET(toep->vnet); INP_INFO_RLOCK(&V_tcbinfo); INP_WLOCK(inp); tp = tcp_drop(tp, ECONNRESET); if (tp) INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); + CURVNET_RESTORE(); return (0); } /* receive buffer autosize */ - CURVNET_SET(so->so_vnet); + MPASS(toep->vnet == so->so_vnet); + CURVNET_SET(toep->vnet); if (sb->sb_flags & SB_AUTOSIZE && V_tcp_do_autorcvbuf && sb->sb_hiwat < V_tcp_autorcvbuf_max && @@ -1713,10 +1722,12 @@ do_fw4_ack(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m) tid); #endif toep->flags &= ~TPF_TX_SUSPENDED; + CURVNET_SET(toep->vnet); if (toep->ulp_mode == ULP_MODE_ISCSI) t4_push_pdus(sc, toep, plen); else t4_push_frames(sc, toep, plen); + CURVNET_RESTORE(); } else if (plen > 0) { struct sockbuf *sb = &so->so_snd; int sbu; @@ -2143,7 +2154,7 @@ t4_aiotx_task(void *context, int pending) struct socket *so = inp->inp_socket; struct kaiocb *job; - CURVNET_SET(so->so_vnet); + CURVNET_SET(toep->vnet); SOCKBUF_LOCK(&so->so_snd); while (!TAILQ_EMPTY(&toep->aiotx_jobq) && sowriteable(so)) { job = TAILQ_FIRST(&toep->aiotx_jobq); diff --git a/sys/dev/cxgbe/tom/t4_ddp.c b/sys/dev/cxgbe/tom/t4_ddp.c index 1a4cb7104360..762eb2eeeffa 100644 --- a/sys/dev/cxgbe/tom/t4_ddp.c +++ b/sys/dev/cxgbe/tom/t4_ddp.c @@ -546,7 +546,8 @@ handle_ddp_data(struct toepcb *toep, __be32 ddp_report, __be32 rcv_nxt, int len) #endif /* receive buffer autosize */ - CURVNET_SET(so->so_vnet); + MPASS(toep->vnet == so->so_vnet); + CURVNET_SET(toep->vnet); SOCKBUF_LOCK(sb); if (sb->sb_flags & SB_AUTOSIZE && V_tcp_do_autorcvbuf && diff --git a/sys/dev/cxgbe/tom/t4_listen.c b/sys/dev/cxgbe/tom/t4_listen.c index bdb93ac9a427..836c9cb6fcf4 100644 --- a/sys/dev/cxgbe/tom/t4_listen.c +++ b/sys/dev/cxgbe/tom/t4_listen.c @@ -222,6 +222,7 @@ alloc_lctx(struct adapter *sc, struct inpcb *inp, struct vi_info *vi) TAILQ_INIT(&lctx->synq); lctx->inp = inp; + lctx->vnet = inp->inp_socket->so_vnet; in_pcbref(inp); return (lctx); @@ -1200,6 +1201,8 @@ do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss, pi = sc->port[G_SYN_INTF(be16toh(cpl->l2info))]; + CURVNET_SET(lctx->vnet); + /* * Use the MAC index to lookup the associated VI. If this SYN * didn't match a perfect MAC filter, punt. @@ -1274,6 +1277,13 @@ do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss, ntids = 1; } + /* + * Don't offload if the ifnet that the SYN came in on is not in the same + * vnet as the listening socket. + */ + if (lctx->vnet != ifp->if_vnet) + REJECT_PASS_ACCEPT(); + e = get_l2te_for_nexthop(pi, ifp, &inc); if (e == NULL) REJECT_PASS_ACCEPT(); @@ -1313,7 +1323,6 @@ do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss, REJECT_PASS_ACCEPT(); } so = inp->inp_socket; - CURVNET_SET(so->so_vnet); mtu_idx = find_best_mtu_idx(sc, &inc, be16toh(cpl->tcpopt.mss)); rscale = cpl->tcpopt.wsf && V_tcp_do_rfc1323 ? select_rcv_wscale() : 0; @@ -1360,7 +1369,6 @@ do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss, */ toe_syncache_add(&inc, &to, &th, inp, tod, synqe); INP_UNLOCK_ASSERT(inp); /* ok to assert, we have a ref on the inp */ - CURVNET_RESTORE(); /* * If we replied during syncache_add (synqe->wr has been consumed), @@ -1415,10 +1423,12 @@ do_pass_accept_req(struct sge_iq *iq, const struct rss_header *rss, return (__LINE__); } INP_WUNLOCK(inp); + CURVNET_RESTORE(); release_synqe(synqe); /* extra hold */ return (0); reject: + CURVNET_RESTORE(); CTR4(KTR_CXGBE, "%s: stid %u, tid %u, REJECT (%d)", __func__, stid, tid, reject_reason); @@ -1490,6 +1500,7 @@ do_pass_establish(struct sge_iq *iq, const struct rss_header *rss, KASSERT(synqe->flags & TPF_SYNQE, ("%s: tid %u (ctx %p) not a synqe", __func__, tid, synqe)); + CURVNET_SET(lctx->vnet); INP_INFO_RLOCK(&V_tcbinfo); /* for syncache_expand */ INP_WLOCK(inp); @@ -1507,6 +1518,7 @@ do_pass_establish(struct sge_iq *iq, const struct rss_header *rss, INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); + CURVNET_RESTORE(); return (0); } @@ -1532,6 +1544,7 @@ do_pass_establish(struct sge_iq *iq, const struct rss_header *rss, send_reset_synqe(TOEDEV(ifp), synqe); INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); + CURVNET_RESTORE(); return (0); } toep->tid = tid; @@ -1568,6 +1581,8 @@ do_pass_establish(struct sge_iq *iq, const struct rss_header *rss, /* New connection inpcb is already locked by syncache_expand(). */ new_inp = sotoinpcb(so); INP_WLOCK_ASSERT(new_inp); + MPASS(so->so_vnet == lctx->vnet); + toep->vnet = lctx->vnet; /* * This is for the unlikely case where the syncache entry that we added @@ -1591,6 +1606,7 @@ do_pass_establish(struct sge_iq *iq, const struct rss_header *rss, if (inp != NULL) INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); + CURVNET_RESTORE(); release_synqe(synqe); return (0); diff --git a/sys/dev/cxgbe/tom/t4_tom.c b/sys/dev/cxgbe/tom/t4_tom.c index 5445e265f7dc..1448aad009f8 100644 --- a/sys/dev/cxgbe/tom/t4_tom.c +++ b/sys/dev/cxgbe/tom/t4_tom.c @@ -799,74 +799,96 @@ update_clip_table(struct adapter *sc, struct tom_data *td) struct in6_addr *lip, tlip; struct clip_head stale; struct clip_entry *ce, *ce_temp; - int rc, gen = atomic_load_acq_int(&in6_ifaddr_gen); + struct vi_info *vi; + int rc, gen, i, j; + uintptr_t last_vnet; ASSERT_SYNCHRONIZED_OP(sc); IN6_IFADDR_RLOCK(&in6_ifa_tracker); mtx_lock(&td->clip_table_lock); + gen = atomic_load_acq_int(&in6_ifaddr_gen); if (gen == td->clip_gen) goto done; TAILQ_INIT(&stale); TAILQ_CONCAT(&stale, &td->clip_table, link); - TAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) { - lip = &ia->ia_addr.sin6_addr; - - KASSERT(!IN6_IS_ADDR_MULTICAST(lip), - ("%s: mcast address in in6_ifaddr list", __func__)); - - if (IN6_IS_ADDR_LOOPBACK(lip)) + /* + * last_vnet optimizes the common cases where all if_vnet = NULL (no + * VIMAGE) or all if_vnet = vnet0. + */ + last_vnet = (uintptr_t)(-1); + for_each_port(sc, i) + for_each_vi(sc->port[i], j, vi) { + if (last_vnet == (uintptr_t)vi->ifp->if_vnet) continue; - if (IN6_IS_SCOPE_EMBED(lip)) { - /* Remove the embedded scope */ - tlip = *lip; - lip = &tlip; - in6_clearscope(lip); - } - /* - * XXX: how to weed out the link local address for the loopback - * interface? It's fe80::1 usually (always?). - */ - /* - * If it's in the main list then we already know it's not stale. - */ - TAILQ_FOREACH(ce, &td->clip_table, link) { - if (IN6_ARE_ADDR_EQUAL(&ce->lip, lip)) - goto next; - } + /* XXX: races with if_vmove */ + CURVNET_SET(vi->ifp->if_vnet); + TAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) { + lip = &ia->ia_addr.sin6_addr; - /* - * If it's in the stale list we should move it to the main list. - */ - TAILQ_FOREACH(ce, &stale, link) { - if (IN6_ARE_ADDR_EQUAL(&ce->lip, lip)) { - TAILQ_REMOVE(&stale, ce, link); - TAILQ_INSERT_TAIL(&td->clip_table, ce, link); - goto next; + KASSERT(!IN6_IS_ADDR_MULTICAST(lip), + ("%s: mcast address in in6_ifaddr list", __func__)); + + if (IN6_IS_ADDR_LOOPBACK(lip)) + continue; + if (IN6_IS_SCOPE_EMBED(lip)) { + /* Remove the embedded scope */ + tlip = *lip; + lip = &tlip; + in6_clearscope(lip); } - } + /* + * XXX: how to weed out the link local address for the + * loopback interface? It's fe80::1 usually (always?). + */ - /* A new IP6 address; add it to the CLIP table */ - ce = malloc(sizeof(*ce), M_CXGBE, M_NOWAIT); - memcpy(&ce->lip, lip, sizeof(ce->lip)); - ce->refcount = 0; - rc = add_lip(sc, lip); - if (rc == 0) - TAILQ_INSERT_TAIL(&td->clip_table, ce, link); - else { - char ip[INET6_ADDRSTRLEN]; + /* + * If it's in the main list then we already know it's + * not stale. + */ + TAILQ_FOREACH(ce, &td->clip_table, link) { + if (IN6_ARE_ADDR_EQUAL(&ce->lip, lip)) + goto next; + } - inet_ntop(AF_INET6, &ce->lip, &ip[0], sizeof(ip)); - log(LOG_ERR, "%s: could not add %s (%d)\n", - __func__, ip, rc); - free(ce, M_CXGBE); - } + /* + * If it's in the stale list we should move it to the + * main list. + */ + TAILQ_FOREACH(ce, &stale, link) { + if (IN6_ARE_ADDR_EQUAL(&ce->lip, lip)) { + TAILQ_REMOVE(&stale, ce, link); + TAILQ_INSERT_TAIL(&td->clip_table, ce, + link); + goto next; + } + } + + /* A new IP6 address; add it to the CLIP table */ + ce = malloc(sizeof(*ce), M_CXGBE, M_NOWAIT); + memcpy(&ce->lip, lip, sizeof(ce->lip)); + ce->refcount = 0; + rc = add_lip(sc, lip); + if (rc == 0) + TAILQ_INSERT_TAIL(&td->clip_table, ce, link); + else { + char ip[INET6_ADDRSTRLEN]; + + inet_ntop(AF_INET6, &ce->lip, &ip[0], + sizeof(ip)); + log(LOG_ERR, "%s: could not add %s (%d)\n", + __func__, ip, rc); + free(ce, M_CXGBE); + } next: - continue; + continue; + } + CURVNET_RESTORE(); + last_vnet = (uintptr_t)vi->ifp->if_vnet; } /* diff --git a/sys/dev/cxgbe/tom/t4_tom.h b/sys/dev/cxgbe/tom/t4_tom.h index c0a5288e9ad8..29d819252b4d 100644 --- a/sys/dev/cxgbe/tom/t4_tom.h +++ b/sys/dev/cxgbe/tom/t4_tom.h @@ -141,6 +141,7 @@ struct toepcb { int refcount; struct tom_data *td; struct inpcb *inp; /* backpointer to host stack's PCB */ + struct vnet *vnet; struct vi_info *vi; /* virtual interface */ struct sge_wrq *ofld_txq; struct sge_ofld_rxq *ofld_rxq; @@ -232,6 +233,7 @@ struct listen_ctx { struct stid_region stid_region; int flags; struct inpcb *inp; /* listening socket's inp */ + struct vnet *vnet; struct sge_wrq *ctrlq; struct sge_ofld_rxq *ofld_rxq; struct clip_entry *ce;