convert inpcbinfo hash and info rwlocks to epoch + mutex

- Convert the inpcbinfo info and hash locks to epoch for readers and a
  mutex for writers
- Garbage collect code that handled INP_INFO_TRY_RLOCK failures, as
  INP_INFO_RLOCK can no longer fail

When running 64 netperf instances sending minimal-sized packets on a 2x8x2
machine, this change reduces unhalted core cycle samples in rwlock
rlock/runlock in udp_send from 51% to 3%.

Overall packet throughput remains limited by CPU affinity and NIC driver
design choices.

On the receiver, unhalted core cycle samples in in_pcblookup_hash went from
13% to 1.6%.

Tested by: LLNW and pho@

Reviewed by: jtl
Sponsored by: Limelight Networks
Differential Revision: https://reviews.freebsd.org/D15686
Matt Macy 2018-06-19 01:54:00 +00:00
parent 9ee9d38672
commit 9e58ff6ff9
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=335356
6 changed files with 79 additions and 205 deletions
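
Before the per-file diffs, a minimal sketch (not code from the commit;
reader_walk() is an illustrative name) of the pattern the change establishes:
readers enter the network epoch where they previously took a read lock,
writers serialize on what is now a mutex, and CK_LIST traversal stays safe
because unlinked inpcbs are only freed after an epoch grace period.

static void
reader_walk(struct inpcbinfo *pcbinfo)
{
	struct inpcb *inp;

	INP_INFO_RLOCK(pcbinfo);	/* expands to NET_EPOCH_ENTER() */
	CK_LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
		/* Fields marked (i) still need the per-inpcb lock. */
	}
	INP_INFO_RUNLOCK(pcbinfo);	/* expands to NET_EPOCH_EXIT() */
}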

diff --git a/sys/kern/subr_witness.c b/sys/kern/subr_witness.c

@@ -561,14 +561,14 @@ static struct witness_order_list_entry order_lists[] = {
 	/*
 	 * UDP/IP
 	 */
-	{ "udp", &lock_class_rw },
+	{ "udp", &lock_class_mtx_sleep },
 	{ "udpinp", &lock_class_rw },
 	{ "so_snd", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * TCP/IP
 	 */
-	{ "tcp", &lock_class_rw },
+	{ "tcp", &lock_class_mtx_sleep },
 	{ "tcpinp", &lock_class_rw },
 	{ "so_snd", &lock_class_mtx_sleep },
 	{ NULL, NULL },

diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h

@@ -51,6 +51,8 @@
 #include <sys/lock.h>
 #include <sys/rwlock.h>
 #include <net/vnet.h>
+#include <net/if.h>
+#include <net/if_var.h>
 #include <vm/uma.h>
 #endif
 #include <sys/ck.h>
@@ -157,6 +159,7 @@ struct in_conninfo {
  * Key:
  * (b) - Protected by the hpts lock.
  * (c) - Constant after initialization
+ * (e) - Protected by the net_epoch_preempt epoch
  * (g) - Protected by the pcbgroup lock
  * (i) - Protected by the inpcb lock
  * (p) - Protected by the pcbinfo lock for the inpcb
@@ -231,7 +234,7 @@ struct inpcbpolicy;
 struct m_snd_tag;
 struct inpcb {
 	/* Cache line #1 (amd64) */
-	CK_LIST_ENTRY(inpcb) inp_hash;	/* (h/i) hash list */
+	CK_LIST_ENTRY(inpcb) inp_hash;	/* [w](h/i) [r](e/i) hash list */
 	CK_LIST_ENTRY(inpcb) inp_pcbgrouphash;	/* (g/i) hash list */
 	struct rwlock inp_lock;
 	/* Cache line #2 (amd64) */
@@ -324,8 +327,8 @@ struct inpcb {
 		struct route_in6 inp_route6;
 	};
 	CK_LIST_ENTRY(inpcb) inp_list;	/* (p/l) list for all PCBs for proto */
-					/* (p[w]) for list iteration */
-					/* (p[r]/l) for addition/removal */
+					/* (e[r]) for list iteration */
+					/* (p[w]/l) for addition/removal */
 	struct epoch_context inp_epoch_ctx;
 };
 #endif /* _KERNEL */
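
The inp_epoch_ctx member is what lets hash and list readers run without a
lock: writers unlink an inpcb while holding the mutex, then hand the final
free to the epoch subsystem. A hedged sketch, assuming the deferred-free
path (which lives in in_pcb.c, outside this diff) goes through epoch_call();
both function names here are illustrative.

static void
retire_inpcb_deferred(epoch_context_t ctx)
{
	struct inpcb *inp = __containerof(ctx, struct inpcb, inp_epoch_ctx);

	uma_zfree(inp->inp_pcbinfo->ipi_zone, inp);	/* the actual free */
}

static void
retire_inpcb(struct inpcbinfo *pcbinfo, struct inpcb *inp)
{
	INP_INFO_WLOCK(pcbinfo);	/* writers still exclude each other */
	CK_LIST_REMOVE(inp, inp_list);	/* epoch readers may still see inp */
	INP_INFO_WUNLOCK(pcbinfo);
	/* Run the free only after current epoch readers have drained. */
	epoch_call(net_epoch_preempt, &inp->inp_epoch_ctx,
	    retire_inpcb_deferred);
}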
@@ -436,22 +439,23 @@ struct in_pcblist {
  * Locking key:
  *
  * (c) Constant or nearly constant after initialisation
+ * (e) Protected by the net_epoch_preempt epoch
  * (g) Locked by ipi_lock
  * (l) Locked by ipi_list_lock
- * (h) Read using either ipi_hash_lock or inpcb lock; write requires both
+ * (h) Read using either net_epoch_preempt or inpcb lock; write requires both ipi_hash_lock and inpcb lock
  * (p) Protected by one or more pcbgroup locks
  * (x) Synchronisation properties poorly defined
  */
 struct inpcbinfo {
 	/*
-	 * Global lock protecting full inpcb list traversal
+	 * Global lock protecting inpcb list modification
 	 */
-	struct rwlock ipi_lock;
+	struct mtx ipi_lock;
 	/*
 	 * Global list of inpcbs on the protocol.
 	 */
-	struct inpcbhead *ipi_listhead;		/* (g/l) */
+	struct inpcbhead *ipi_listhead;		/* [r](e) [w](g/l) */
 	u_int ipi_count;			/* (l) */
 	/*
@@ -482,9 +486,9 @@ struct inpcbinfo {
 	u_int ipi_hashfields;			/* (c) */
 	/*
-	 * Global lock protecting non-pcbgroup hash lookup tables.
+	 * Global lock protecting modification of non-pcbgroup hash lookup tables.
 	 */
-	struct rwlock ipi_hash_lock;
+	struct mtx ipi_hash_lock;
 	/*
 	 * Global hash of inpcbs, hashed by local and foreign addresses and
@@ -626,20 +630,18 @@ int inp_so_options(const struct inpcb *inp);
 #endif /* _KERNEL */
 #define INP_INFO_LOCK_INIT(ipi, d) \
-	rw_init_flags(&(ipi)->ipi_lock, (d), RW_RECURSE)
-#define INP_INFO_LOCK_DESTROY(ipi)	rw_destroy(&(ipi)->ipi_lock)
-#define INP_INFO_RLOCK(ipi)	rw_rlock(&(ipi)->ipi_lock)
-#define INP_INFO_WLOCK(ipi)	rw_wlock(&(ipi)->ipi_lock)
-#define INP_INFO_TRY_RLOCK(ipi)	rw_try_rlock(&(ipi)->ipi_lock)
-#define INP_INFO_TRY_WLOCK(ipi)	rw_try_wlock(&(ipi)->ipi_lock)
-#define INP_INFO_TRY_UPGRADE(ipi)	rw_try_upgrade(&(ipi)->ipi_lock)
-#define INP_INFO_WLOCKED(ipi)	rw_wowned(&(ipi)->ipi_lock)
-#define INP_INFO_RUNLOCK(ipi)	rw_runlock(&(ipi)->ipi_lock)
-#define INP_INFO_WUNLOCK(ipi)	rw_wunlock(&(ipi)->ipi_lock)
-#define INP_INFO_LOCK_ASSERT(ipi)	rw_assert(&(ipi)->ipi_lock, RA_LOCKED)
-#define INP_INFO_RLOCK_ASSERT(ipi)	rw_assert(&(ipi)->ipi_lock, RA_RLOCKED)
-#define INP_INFO_WLOCK_ASSERT(ipi)	rw_assert(&(ipi)->ipi_lock, RA_WLOCKED)
-#define INP_INFO_UNLOCK_ASSERT(ipi)	rw_assert(&(ipi)->ipi_lock, RA_UNLOCKED)
+	mtx_init(&(ipi)->ipi_lock, (d), NULL, MTX_DEF | MTX_RECURSE)
+#define INP_INFO_LOCK_DESTROY(ipi)	mtx_destroy(&(ipi)->ipi_lock)
+#define INP_INFO_RLOCK(ipi)	NET_EPOCH_ENTER()
+#define INP_INFO_WLOCK(ipi)	mtx_lock(&(ipi)->ipi_lock)
+#define INP_INFO_TRY_WLOCK(ipi)	mtx_trylock(&(ipi)->ipi_lock)
+#define INP_INFO_WLOCKED(ipi)	mtx_owned(&(ipi)->ipi_lock)
+#define INP_INFO_RUNLOCK(ipi)	NET_EPOCH_EXIT()
+#define INP_INFO_WUNLOCK(ipi)	mtx_unlock(&(ipi)->ipi_lock)
+#define INP_INFO_LOCK_ASSERT(ipi)	MPASS(in_epoch() || mtx_owned(&(ipi)->ipi_lock))
+#define INP_INFO_RLOCK_ASSERT(ipi)	MPASS(in_epoch())
+#define INP_INFO_WLOCK_ASSERT(ipi)	mtx_assert(&(ipi)->ipi_lock, MA_OWNED)
+#define INP_INFO_UNLOCK_ASSERT(ipi)	MPASS(!in_epoch() && !mtx_owned(&(ipi)->ipi_lock))
 #define INP_LIST_LOCK_INIT(ipi, d) \
 	rw_init_flags(&(ipi)->ipi_list_lock, (d), 0)
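
Note how the assertions weaken: a pure reader holds no lock at all, so
INP_INFO_RLOCK_ASSERT() can only verify in_epoch(), and
INP_INFO_LOCK_ASSERT() must accept either state. A hedged usage sketch
(check_pcb_live() is an illustrative name) of code reachable from both the
epoch-protected read path and the mutex-protected write path:

static bool
check_pcb_live(struct inpcbinfo *pcbinfo, struct inpcb *inp)
{
	INP_INFO_LOCK_ASSERT(pcbinfo);	/* in_epoch() || mtx_owned() */
	INP_LOCK_ASSERT(inp);

	return ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0);
}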
@@ -660,17 +662,14 @@ int inp_so_options(const struct inpcb *inp);
 #define INP_LIST_UNLOCK_ASSERT(ipi) \
 	rw_assert(&(ipi)->ipi_list_lock, RA_UNLOCKED)
-#define INP_HASH_LOCK_INIT(ipi, d) \
-	rw_init_flags(&(ipi)->ipi_hash_lock, (d), 0)
-#define INP_HASH_LOCK_DESTROY(ipi)	rw_destroy(&(ipi)->ipi_hash_lock)
-#define INP_HASH_RLOCK(ipi)		rw_rlock(&(ipi)->ipi_hash_lock)
-#define INP_HASH_WLOCK(ipi)		rw_wlock(&(ipi)->ipi_hash_lock)
-#define INP_HASH_RUNLOCK(ipi)		rw_runlock(&(ipi)->ipi_hash_lock)
-#define INP_HASH_WUNLOCK(ipi)		rw_wunlock(&(ipi)->ipi_hash_lock)
-#define INP_HASH_LOCK_ASSERT(ipi)	rw_assert(&(ipi)->ipi_hash_lock, \
-					    RA_LOCKED)
-#define INP_HASH_WLOCK_ASSERT(ipi)	rw_assert(&(ipi)->ipi_hash_lock, \
-					    RA_WLOCKED)
+#define INP_HASH_LOCK_INIT(ipi, d)	mtx_init(&(ipi)->ipi_hash_lock, (d), NULL, MTX_DEF)
+#define INP_HASH_LOCK_DESTROY(ipi)	mtx_destroy(&(ipi)->ipi_hash_lock)
+#define INP_HASH_RLOCK(ipi)		NET_EPOCH_ENTER()
+#define INP_HASH_WLOCK(ipi)		mtx_lock(&(ipi)->ipi_hash_lock)
+#define INP_HASH_RUNLOCK(ipi)		NET_EPOCH_EXIT()
+#define INP_HASH_WUNLOCK(ipi)		mtx_unlock(&(ipi)->ipi_hash_lock)
+#define INP_HASH_LOCK_ASSERT(ipi)	MPASS(in_epoch() || mtx_owned(&(ipi)->ipi_hash_lock))
+#define INP_HASH_WLOCK_ASSERT(ipi)	mtx_assert(&(ipi)->ipi_hash_lock, MA_OWNED)
 #define INP_GROUP_LOCK_INIT(ipg, d)	mtx_init(&(ipg)->ipg_lock, (d), NULL, \
 					    MTX_DEF | MTX_DUPOK)
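
The INP_HASH_RLOCK() change above is where the receive-side win in the
commit message comes from: in_pcblookup_hash() (not part of this diff) walks
hash chains under the epoch instead of a shared rwlock. A hedged sketch of
that pattern; lookup_example() and its matching test are illustrative, not
the real lookup logic:

static struct inpcb *
lookup_example(struct inpcbinfo *pcbinfo, struct inpcbhead *chain,
    u_short fport, u_short lport)
{
	struct inpcb *inp;

	INP_HASH_RLOCK(pcbinfo);	/* NET_EPOCH_ENTER(); cannot fail */
	CK_LIST_FOREACH(inp, chain, inp_hash) {
		if (inp->inp_fport == fport && inp->inp_lport == lport) {
			INP_RLOCK(inp);	/* pin the pcb before leaving the epoch */
			break;
		}
	}
	INP_HASH_RUNLOCK(pcbinfo);
	return (inp);
}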

diff --git a/sys/netinet/tcp_hpts.c b/sys/netinet/tcp_hpts.c

@@ -184,9 +184,6 @@ TUNABLE_INT("net.inet.tcp.hpts_logging_sz", &tcp_hpts_logging_size);
 static struct tcp_hptsi tcp_pace;
-static int
-	tcp_hptsi_lock_inpinfo(struct inpcb *inp,
-	    struct tcpcb **tp);
 static void tcp_wakehpts(struct tcp_hpts_entry *p);
 static void tcp_wakeinput(struct tcp_hpts_entry *p);
 static void tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv);
@@ -498,59 +495,6 @@ SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, log, CTLTYPE_STRING | CTLFLAG_RD | CTL
     0, 0, sysctl_tcp_hpts_log, "A", "tcp hptsi log");
-/*
- * Try to get the INP_INFO lock.
- *
- * This function always succeeds in getting the lock. It will clear
- * *tpp and return (1) if something critical changed while the inpcb
- * was unlocked. Otherwise, it will leave *tpp unchanged and return (0).
- *
- * This function relies on the fact that the hpts always holds a
- * reference on the inpcb while the segment is on the hptsi wheel and
- * in the input queue.
- *
- */
-static int
-tcp_hptsi_lock_inpinfo(struct inpcb *inp, struct tcpcb **tpp)
-{
-	struct tcp_function_block *tfb;
-	struct tcpcb *tp;
-	void *ptr;
-
-	/* Try the easy way. */
-	if (INP_INFO_TRY_RLOCK(&V_tcbinfo))
-		return (0);
-
-	/*
-	 * OK, let's try the hard way. We'll save the function pointer block
-	 * to make sure that doesn't change while we aren't holding the
-	 * lock.
-	 */
-	tp = *tpp;
-	tfb = tp->t_fb;
-	ptr = tp->t_fb_ptr;
-	INP_WUNLOCK(inp);
-	INP_INFO_RLOCK(&V_tcbinfo);
-	INP_WLOCK(inp);
-	/* If the session went away, return an error. */
-	if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
-	    (inp->inp_flags2 & INP_FREED)) {
-		*tpp = NULL;
-		return (1);
-	}
-	/*
-	 * If the function block or stack-specific data block changed,
-	 * report an error.
-	 */
-	tp = intotcpcb(inp);
-	if ((tp->t_fb != tfb) && (tp->t_fb_ptr != ptr)) {
-		*tpp = NULL;
-		return (1);
-	}
-	return (0);
-}
-
 static void
 tcp_wakehpts(struct tcp_hpts_entry *hpts)
 {
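
The removed helper existed only because rw_try_rlock() could fail while a
writer held ipi_lock, forcing callers to drop the inpcb lock, block, and
then revalidate the connection. Entering an epoch section cannot fail or
block on a writer, so every call site collapses to a plain
INP_INFO_RLOCK(). A hedged sketch of the difference
(input_locking_example() is an illustrative name):

static void
input_locking_example(struct inpcb *inp)
{
	INP_WLOCK_ASSERT(inp);
	/*
	 * Before: if INP_INFO_TRY_RLOCK() failed, the caller had to
	 * INP_WUNLOCK(), take INP_INFO_RLOCK() blocking, INP_WLOCK()
	 * again, and recheck INP_DROPPED/INP_FREED and the stack pointers.
	 * After: the epoch entry always succeeds with the inpcb lock held.
	 */
	INP_INFO_RLOCK(&V_tcbinfo);
	/* ... process queued segments ... */
	INP_INFO_RUNLOCK(&V_tcbinfo);
}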
@@ -1290,10 +1234,7 @@ tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv)
 			    (m->m_pkthdr.pace_lock == TI_RLOCKED ||
 			    tp->t_state != TCPS_ESTABLISHED)) {
 				ti_locked = TI_RLOCKED;
-				if (tcp_hptsi_lock_inpinfo(inp, &tp)) {
-					CURVNET_RESTORE();
-					goto out;
-				}
+				INP_INFO_RLOCK(&V_tcbinfo);
 				m = tp->t_in_pkt;
 			}
 			if (in_newts_every_tcb) {
@@ -1360,7 +1301,6 @@ tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv)
 				 */
 				if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
 				    (inp->inp_flags2 & INP_FREED)) {
-out_free:
 					while (m) {
 						m_freem(m);
 						m = n;
@@ -1376,8 +1316,7 @@ tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv)
 				if (ti_locked == TI_UNLOCKED &&
 				    (tp->t_state != TCPS_ESTABLISHED)) {
 					ti_locked = TI_RLOCKED;
-					if (tcp_hptsi_lock_inpinfo(inp, &tp))
-						goto out_free;
+					INP_INFO_RLOCK(&V_tcbinfo);
 				}
 			} /** end while(m) */
 		} /** end if ((m != NULL) && (m == tp->t_in_pkt)) */

diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c

@@ -960,25 +960,10 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
 	 *
 	 * XXXRW: It may be time to rethink timewait locking.
 	 */
-relocked:
 	if (inp->inp_flags & INP_TIMEWAIT) {
 		if (ti_locked == TI_UNLOCKED) {
-			if (INP_INFO_TRY_RLOCK(&V_tcbinfo) == 0) {
-				in_pcbref(inp);
-				INP_WUNLOCK(inp);
-				INP_INFO_RLOCK(&V_tcbinfo);
-				ti_locked = TI_RLOCKED;
-				INP_WLOCK(inp);
-				if (in_pcbrele_wlocked(inp)) {
-					inp = NULL;
-					goto findpcb;
-				} else if (inp->inp_flags & INP_DROPPED) {
-					INP_WUNLOCK(inp);
-					inp = NULL;
-					goto findpcb;
-				}
-			} else
-				ti_locked = TI_RLOCKED;
+			INP_INFO_RLOCK(&V_tcbinfo);
+			ti_locked = TI_RLOCKED;
 		}
 		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
@@ -1026,23 +1011,8 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
 		    (tp->t_state == TCPS_LISTEN && (thflags & TH_SYN) &&
 		    !IS_FASTOPEN(tp->t_flags)))) {
 			if (ti_locked == TI_UNLOCKED) {
-				if (INP_INFO_TRY_RLOCK(&V_tcbinfo) == 0) {
-					in_pcbref(inp);
-					INP_WUNLOCK(inp);
-					INP_INFO_RLOCK(&V_tcbinfo);
-					ti_locked = TI_RLOCKED;
-					INP_WLOCK(inp);
-					if (in_pcbrele_wlocked(inp)) {
-						inp = NULL;
-						goto findpcb;
-					} else if (inp->inp_flags & INP_DROPPED) {
-						INP_WUNLOCK(inp);
-						inp = NULL;
-						goto findpcb;
-					}
-					goto relocked;
-				} else
-					ti_locked = TI_RLOCKED;
+				INP_INFO_RLOCK(&V_tcbinfo);
+				ti_locked = TI_RLOCKED;
 			}
 			INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 		}

diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c

@@ -6837,34 +6837,8 @@ rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
 	 * Initial input (ACK to SYN-ACK etc)lets go ahead and get
 	 * it processed
 	 */
-	if (ti_locked != TI_RLOCKED && INP_INFO_TRY_RLOCK(&V_tcbinfo))
-		ti_locked = TI_RLOCKED;
-	if (ti_locked != TI_RLOCKED) {
-		inp = tp->t_inpcb;
-		tfb = tp->t_fb;
-		in_pcbref(inp);
-		INP_WUNLOCK(inp);
-		INP_INFO_RLOCK(&V_tcbinfo);
-		ti_locked = TI_RLOCKED;
-		INP_WLOCK(inp);
-		if (in_pcbrele_wlocked(inp))
-			inp = NULL;
-		if (inp == NULL || (inp->inp_flags2 & INP_FREED) ||
-		    (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED))) {
-			/* The TCPCB went away. Free the packet. */
-			INP_INFO_RUNLOCK(&V_tcbinfo);
-			if (inp)
-				INP_WUNLOCK(inp);
-			m_freem(m);
-			return;
-		}
-		/* If the stack changed, call the correct stack. */
-		if (tp->t_fb != tfb) {
-			tp->t_fb->tfb_tcp_do_segment(m, th, so, tp,
-			    drop_hdrlen, tlen, iptos, ti_locked);
-			return;
-		}
-	}
+	INP_INFO_RLOCK(&V_tcbinfo);
+	ti_locked = TI_RLOCKED;
 	tcp_get_usecs(&tv);
 	rack_hpts_do_segment(m, th, so, tp, drop_hdrlen,
 	    tlen, iptos, ti_locked, 0, &tv);

diff --git a/sys/netinet/tcp_timewait.c b/sys/netinet/tcp_timewait.c

@@ -707,54 +707,46 @@ tcp_tw_2msl_scan(int reuse)
 		in_pcbref(inp);
 		TW_RUNLOCK(V_tw_lock);
-		if (INP_INFO_TRY_RLOCK(&V_tcbinfo)) {
-			INP_WLOCK(inp);
-			tw = intotw(inp);
-			if (in_pcbrele_wlocked(inp)) {
-				if (__predict_true(tw == NULL)) {
-					INP_INFO_RUNLOCK(&V_tcbinfo);
-					continue;
-				} else {
-					/* This should not happen as in TIMEWAIT
-					 * state the inp should not be destroyed
-					 * before its tcptw. If INVARIANTS is
-					 * defined panic.
-					 */
-#ifdef INVARIANTS
-					panic("%s: Panic before an infinite "
-					    "loop: INP_TIMEWAIT && (INP_FREED "
-					    "|| inp last reference) && tw != "
-					    "NULL", __func__);
-#else
-					log(LOG_ERR, "%s: Avoid an infinite "
-					    "loop: INP_TIMEWAIT && (INP_FREED "
-					    "|| inp last reference) && tw != "
-					    "NULL", __func__);
-#endif
-					INP_INFO_RUNLOCK(&V_tcbinfo);
-					break;
-				}
-			}
-			if (tw == NULL) {
-				/* tcp_twclose() has already been called */
-				INP_WUNLOCK(inp);
-				INP_INFO_RUNLOCK(&V_tcbinfo);
-				continue;
-			}
-			tcp_twclose(tw, reuse);
-			INP_INFO_RUNLOCK(&V_tcbinfo);
-			if (reuse)
-				return tw;
-		} else {
-			/* INP_INFO lock is busy, continue later. */
-			INP_WLOCK(inp);
-			if (!in_pcbrele_wlocked(inp))
-				INP_WUNLOCK(inp);
-			break;
-		}
+		INP_INFO_RLOCK(&V_tcbinfo);
+		INP_WLOCK(inp);
+		tw = intotw(inp);
+		if (in_pcbrele_wlocked(inp)) {
+			if (__predict_true(tw == NULL)) {
+				INP_INFO_RUNLOCK(&V_tcbinfo);
+				continue;
+			} else {
+				/* This should not happen as in TIMEWAIT
+				 * state the inp should not be destroyed
+				 * before its tcptw. If INVARIANTS is
+				 * defined panic.
+				 */
+#ifdef INVARIANTS
+				panic("%s: Panic before an infinite "
+				    "loop: INP_TIMEWAIT && (INP_FREED "
+				    "|| inp last reference) && tw != "
+				    "NULL", __func__);
+#else
+				log(LOG_ERR, "%s: Avoid an infinite "
+				    "loop: INP_TIMEWAIT && (INP_FREED "
+				    "|| inp last reference) && tw != "
+				    "NULL", __func__);
+#endif
+				INP_INFO_RUNLOCK(&V_tcbinfo);
+				break;
+			}
+		}
+		if (tw == NULL) {
+			/* tcp_twclose() has already been called */
+			INP_WUNLOCK(inp);
+			INP_INFO_RUNLOCK(&V_tcbinfo);
+			continue;
+		}
+		tcp_twclose(tw, reuse);
+		INP_INFO_RUNLOCK(&V_tcbinfo);
+		if (reuse)
+			return tw;
 	}
 	return NULL;