Merge last of a series of rwlock conversion changes to UDP, which

completes the move to a fully parallel UDP transmit path by using
global read, rather than write, locking of inpcbinfo in further
semi-connected cases:

- Add macros to allow try-locking of inpcb and inpcbinfo.
- Always acquire an incpcb read lock in udp_output(), which stablizes the
  local inpcb address and port bindings in order to determine what further
  locking is required:
  - If the inpcb is currently not bound (at all) and are implicitly
    connecting, we require inpcbinfo and inpcb write locks, so drop the
    read lock and re-acquire.
  - If the inpcb is bound for at least one of the port or address, but an
    explicit source or destination is requested, trylock the inpcbinfo
    lock, and if that fails, drop the inpcb lock, lock the global lock,
    and relock the inpcb lock.
  - Otherwise, no further locking is required (common case).
- Update comments.

In practice, this means that the vast majority of consumers of UDP sockets
will not acquire any exclusive locks at the socket or UDP levels of the
network stack.  This leads to a marked performance improvement in several
important workloads, including BIND, nsd, and memcached over UDP, as well
as significant improvements in pps microbenchmarks.

The plan is to MFC all of the rwlock changes to RELENG_7 once they have
settled for a weeks in the tree.

Tested by:	ps, kris (older revision), bde
MFC after:	3 weeks
This commit is contained in:
Robert Watson 2008-07-15 15:38:47 +00:00
parent b27227029b
commit 43cc0bc1df
2 changed files with 91 additions and 36 deletions

View File

@ -315,6 +315,8 @@ struct inpcbinfo {
#define INP_LOCK_DESTROY(inp) rw_destroy(&(inp)->inp_lock)
#define INP_RLOCK(inp) rw_rlock(&(inp)->inp_lock)
#define INP_WLOCK(inp) rw_wlock(&(inp)->inp_lock)
#define INP_TRY_RLOCK(inp) rw_try_rlock(&(inp)->inp_lock)
#define INP_TRY_WLOCK(inp) rw_try_wlock(&(inp)->inp_lock)
#define INP_RUNLOCK(inp) rw_runlock(&(inp)->inp_lock)
#define INP_WUNLOCK(inp) rw_wunlock(&(inp)->inp_lock)
#define INP_LOCK_ASSERT(inp) rw_assert(&(inp)->inp_lock, RA_LOCKED)
@ -356,6 +358,8 @@ inp_unlock_assert(struct inpcb *inp __unused)
#define INP_INFO_LOCK_DESTROY(ipi) rw_destroy(&(ipi)->ipi_lock)
#define INP_INFO_RLOCK(ipi) rw_rlock(&(ipi)->ipi_lock)
#define INP_INFO_WLOCK(ipi) rw_wlock(&(ipi)->ipi_lock)
#define INP_INFO_TRY_RLOCK(ipi) rw_try_rlock(&(ipi)->ipi_lock)
#define INP_INFO_TRY_WLOCK(ipi) rw_try_wlock(&(ipi)->ipi_lock)
#define INP_INFO_RUNLOCK(ipi) rw_runlock(&(ipi)->ipi_lock)
#define INP_INFO_WUNLOCK(ipi) rw_wunlock(&(ipi)->ipi_lock)
#define INP_INFO_LOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_LOCKED)

View File

@ -846,14 +846,42 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
return (error);
}
if (src.sin_family == AF_INET || addr != NULL) {
/*
* Depending on whether or not the application has bound or connected
* the application, we may have to do varying levels of work. The
* optimal case is for a connected UDP socket, as a global lock isn't
* required.
*
* In order to decide which we need, we require stability of the
* inpcb binding, which we ensure by acquiring a read lock on the
* inpcb. This doesn't strictly follow the lock order, so we play
* the trylock and retry game; note that we may end up with more
* conservative locks than required the second time around, so later
* assertions have to accept that. Further analysis of the number of
* misses under contention is required.
*/
sin = (struct sockaddr_in *)addr;
INP_RLOCK(inp);
if (sin != NULL &&
(inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0)) {
INP_RUNLOCK(inp);
INP_INFO_WLOCK(&udbinfo);
INP_WLOCK(inp);
unlock_udbinfo = 2;
} else if ((sin != NULL && (
(sin->sin_addr.s_addr == INADDR_ANY) ||
(sin->sin_addr.s_addr == INADDR_BROADCAST) ||
(inp->inp_laddr.s_addr == INADDR_ANY) ||
(inp->inp_lport == 0))) ||
(src.sin_family == AF_INET)) {
if (!INP_INFO_TRY_RLOCK(&udbinfo)) {
INP_RUNLOCK(inp);
INP_INFO_RLOCK(&udbinfo);
INP_RLOCK(inp);
}
unlock_udbinfo = 1;
} else {
} else
unlock_udbinfo = 0;
INP_RLOCK(inp);
}
/*
* If the IP_SENDSRCADDR control message was specified, override the
@ -880,12 +908,11 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
* If a UDP socket has been connected, then a local address/port will
* have been selected and bound.
*
* If a UDP socket has not been connect to, then an explicit
* If a UDP socket has not been connected to, then an explicit
* destination address must be used, in which case a local
* address/port may not have been selected and bound.
*/
if (addr) {
INP_INFO_LOCK_ASSERT(&udbinfo);
if (sin != NULL) {
INP_LOCK_ASSERT(inp);
if (inp->inp_faddr.s_addr != INADDR_ANY) {
error = EISCONN;
@ -896,39 +923,58 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
* Jail may rewrite the destination address, so let it do
* that before we use it.
*/
sin = (struct sockaddr_in *)addr;
if (jailed(td->td_ucred))
prison_remote_ip(td->td_ucred, 0,
&sin->sin_addr.s_addr);
/*
* If a local address or port hasn't yet been selected, do
* that now. Once a port is selected, we commit the binding
* back to the socket; we also commit the binding of the
* address if in jail.
* If a local address or port hasn't yet been selected, or if
* the destination address needs to be rewritten due to using
* a special INADDR_ constant, invoke in_pcbconnect_setup()
* to do the heavy lifting. Once a port is selected, we
* commit the binding back to the socket; we also commit the
* binding of the address if in jail.
*
* If we already have a valid binding and we're not
* requesting a destination address rewrite, use a fast path.
*/
error = in_pcbconnect_setup(inp, addr, &laddr.s_addr, &lport,
&faddr.s_addr, &fport, NULL, td->td_ucred);
if (error)
goto release;
/* Commit the local port if newly assigned. */
if (inp->inp_laddr.s_addr == INADDR_ANY &&
inp->inp_lport == 0) {
INP_INFO_WLOCK_ASSERT(&udbinfo);
INP_WLOCK_ASSERT(inp);
/*
* Remember addr if jailed, to prevent rebinding.
*/
if (jailed(td->td_ucred))
inp->inp_laddr = laddr;
inp->inp_lport = lport;
if (in_pcbinshash(inp) != 0) {
inp->inp_lport = 0;
error = EAGAIN;
if (inp->inp_laddr.s_addr == INADDR_ANY ||
inp->inp_lport == 0 ||
sin->sin_addr.s_addr == INADDR_ANY ||
sin->sin_addr.s_addr == INADDR_BROADCAST) {
INP_INFO_LOCK_ASSERT(&udbinfo);
error = in_pcbconnect_setup(inp, addr, &laddr.s_addr,
&lport, &faddr.s_addr, &fport, NULL,
td->td_ucred);
if (error)
goto release;
/*
* XXXRW: Why not commit the port if the address is
* !INADDR_ANY?
*/
/* Commit the local port if newly assigned. */
if (inp->inp_laddr.s_addr == INADDR_ANY &&
inp->inp_lport == 0) {
INP_INFO_WLOCK_ASSERT(&udbinfo);
INP_WLOCK_ASSERT(inp);
/*
* Remember addr if jailed, to prevent
* rebinding.
*/
if (jailed(td->td_ucred))
inp->inp_laddr = laddr;
inp->inp_lport = lport;
if (in_pcbinshash(inp) != 0) {
inp->inp_lport = 0;
error = EAGAIN;
goto release;
}
inp->inp_flags |= INP_ANONPORT;
}
inp->inp_flags |= INP_ANONPORT;
} else {
faddr = sin->sin_addr;
fport = sin->sin_port;
}
} else {
INP_LOCK_ASSERT(inp);
@ -1006,20 +1052,25 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
((struct ip *)ui)->ip_tos = inp->inp_ip_tos; /* XXX */
udpstat.udps_opackets++;
if (unlock_udbinfo)
if (unlock_udbinfo == 2)
INP_INFO_WUNLOCK(&udbinfo);
else if (unlock_udbinfo == 1)
INP_INFO_RUNLOCK(&udbinfo);
error = ip_output(m, inp->inp_options, NULL, ipflags,
inp->inp_moptions, inp);
if (unlock_udbinfo)
if (unlock_udbinfo == 2)
INP_WUNLOCK(inp);
else
INP_RUNLOCK(inp);
return (error);
release:
if (unlock_udbinfo) {
INP_INFO_WUNLOCK(&udbinfo);
if (unlock_udbinfo == 2) {
INP_WUNLOCK(inp);
INP_INFO_WUNLOCK(&udbinfo);
} else if (unlock_udbinfo == 1) {
INP_RUNLOCK(inp);
INP_INFO_RUNLOCK(&udbinfo);
} else
INP_RUNLOCK(inp);
m_freem(m);