Now with epoch synchronized PCB lookup tables we can greatly simplify

locking in udp_output() and udp6_output().

First, we select if we need read or write lock in PCB itself, we take
the lock and enter network epoch.  Then, we proceed for the rest of
the function.  In case if we need to modify PCB hash, we would take
write lock on it for a short piece of code.

We could exit the epoch before allocating an mbuf, but with this
patch we are keeping it all the way into ip_output()/ip6_output().
Today this creates an epoch recursion, since ip_output() enters epoch
itself.  However, once all protocols are reviewed, ip_output() and
ip6_output() would require epoch instead of entering it.

Note: I'm not 100% sure that in udp6_output() the epoch is required.
We don't do PCB hash lookup for a bound socket.  And all branches of
in6_select_src() don't require epoch, at least they lack assertions.
Today inet6 address list is protected by rmlock, although it is CKLIST.
AFAIU, the future plan is to protect it by network epoch.  That would
require epoch in in6_select_src().  Anyway, in future ip6_output()
would require epoch, udp6_output() would need to enter it.
This commit is contained in:
Gleb Smirnoff 2019-11-07 21:01:36 +00:00
parent 5a1264335d
commit 2435e507de
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=354479
2 changed files with 50 additions and 193 deletions

View File

@ -1118,9 +1118,6 @@ udp_ctloutput(struct socket *so, struct sockopt *sopt)
}
#ifdef INET
#define UH_WLOCKED 2
#define UH_RLOCKED 1
#define UH_UNLOCKED 0
static int
udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
struct mbuf *control, struct thread *td)
@ -1136,19 +1133,12 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
int error = 0;
int ipflags;
u_short fport, lport;
int unlock_udbinfo, unlock_inp;
u_char tos;
uint8_t pr;
uint16_t cscov = 0;
uint32_t flowid = 0;
uint8_t flowtype = M_HASHTYPE_NONE;
/*
* udp_output() may need to temporarily bind or connect the current
* inpcb. As such, we don't know up front whether we will need the
* pcbinfo lock or not. Do any work to decide what is needed up
* front before acquiring any locks.
*/
if (len + sizeof(struct udpiphdr) > IP_MAXPACKET) {
if (control)
m_freem(control);
@ -1158,28 +1148,22 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
src.sin_family = 0;
sin = (struct sockaddr_in *)addr;
retry:
/*
* udp_output() may need to temporarily bind or connect the current
* inpcb. As such, we don't know up front whether we will need the
* pcbinfo lock or not. Do any work to decide what is needed up
* front before acquiring any locks.
*
* We will need network epoch in either case, to safely lookup into
* pcb hash.
*/
if (sin == NULL ||
(inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0)) {
(inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0))
INP_WLOCK(inp);
/*
* In case we lost a race and another thread bound addr/port
* on the inp we cannot keep the wlock (which still would be
* fine) as further down, based on these values we make
* decisions for the pcbinfo lock. If the locks are not in
* synch the assertions on unlock will fire, hence we go for
* one retry loop.
*/
if (sin != NULL && (inp->inp_laddr.s_addr != INADDR_ANY ||
inp->inp_lport != 0)) {
INP_WUNLOCK(inp);
goto retry;
}
unlock_inp = UH_WLOCKED;
} else {
else
INP_RLOCK(inp);
unlock_inp = UH_RLOCKED;
}
NET_EPOCH_ENTER(et);
tos = inp->inp_ip_tos;
if (control != NULL) {
/*
@ -1187,13 +1171,9 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
* stored in a single mbuf.
*/
if (control->m_next) {
if (unlock_inp == UH_WLOCKED)
INP_WUNLOCK(inp);
else
INP_RUNLOCK(inp);
m_freem(control);
m_freem(m);
return (EINVAL);
error = EINVAL;
goto release;
}
for (; control->m_len > 0;
control->m_data += CMSG_ALIGN(cm->cmsg_len),
@ -1264,56 +1244,11 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
}
m_freem(control);
}
if (error) {
if (unlock_inp == UH_WLOCKED)
INP_WUNLOCK(inp);
else
INP_RUNLOCK(inp);
m_freem(m);
return (error);
}
if (error)
goto release;
/*
* In the old days, depending on whether or not the application had
* bound or connected the socket, we had to do varying levels of work.
* The optimal case was for a connected UDP socket, as a global lock
* wasn't required at all.
* In order to decide which we need, we required stability of the
* inpcb binding, which we ensured by acquiring a read lock on the
* inpcb. This didn't strictly follow the lock order, so we played
* the trylock and retry game.
* With the re-introduction of the route-cache in some cases, we started
* to acquire an early inp wlock and a possible race during re-lock
* went away. With the introduction of epoch(9) some read locking
* became epoch(9) and the lock-order issues also went away.
* Due to route-cache we may now hold more conservative locks than
* otherwise required and have split up the 2nd case in case 2 and 3
* in order to keep the udpinfo lock level in sync with the inp one
* for the IP_SENDSRCADDR case below.
*/
pr = inp->inp_socket->so_proto->pr_protocol;
pcbinfo = udp_get_inpcbinfo(pr);
if (sin != NULL &&
(inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0)) {
INP_HASH_WLOCK(pcbinfo);
unlock_udbinfo = UH_WLOCKED;
} else if (sin != NULL &&
(sin->sin_addr.s_addr == INADDR_ANY ||
sin->sin_addr.s_addr == INADDR_BROADCAST ||
inp->inp_laddr.s_addr == INADDR_ANY ||
inp->inp_lport == 0)) {
INP_HASH_RLOCK_ET(pcbinfo, et);
unlock_udbinfo = UH_RLOCKED;
} else if (src.sin_family == AF_INET) {
if (unlock_inp == UH_WLOCKED) {
INP_HASH_WLOCK(pcbinfo);
unlock_udbinfo = UH_WLOCKED;
} else {
INP_HASH_RLOCK_ET(pcbinfo, et);
unlock_udbinfo = UH_RLOCKED;
}
} else
unlock_udbinfo = UH_UNLOCKED;
/*
* If the IP_SENDSRCADDR control message was specified, override the
@ -1389,7 +1324,6 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
if (inp->inp_laddr.s_addr == INADDR_ANY &&
inp->inp_lport == 0) {
INP_WLOCK_ASSERT(inp);
INP_HASH_WLOCK_ASSERT(pcbinfo);
/*
* Remember addr if jailed, to prevent
* rebinding.
@ -1397,7 +1331,10 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
if (prison_flag(td->td_ucred, PR_IP4))
inp->inp_laddr = laddr;
inp->inp_lport = lport;
if (in_pcbinshash(inp) != 0) {
INP_HASH_WLOCK(pcbinfo);
error = in_pcbinshash(inp);
INP_HASH_WUNLOCK(pcbinfo);
if (error != 0) {
inp->inp_lport = 0;
error = EAGAIN;
goto release;
@ -1562,48 +1499,20 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
ipflags |= IP_NODEFAULTFLOWID;
#endif /* RSS */
if (unlock_udbinfo == UH_WLOCKED)
INP_HASH_WUNLOCK(pcbinfo);
else if (unlock_udbinfo == UH_RLOCKED)
INP_HASH_RUNLOCK_ET(pcbinfo, et);
if (pr == IPPROTO_UDPLITE)
UDPLITE_PROBE(send, NULL, inp, &ui->ui_i, inp, &ui->ui_u);
else
UDP_PROBE(send, NULL, inp, &ui->ui_i, inp, &ui->ui_u);
error = ip_output(m, inp->inp_options,
(unlock_inp == UH_WLOCKED ? &inp->inp_route : NULL), ipflags,
INP_WLOCKED(inp) ? &inp->inp_route : NULL, ipflags,
inp->inp_moptions, inp);
if (unlock_inp == UH_WLOCKED)
INP_WUNLOCK(inp);
else
INP_RUNLOCK(inp);
INP_UNLOCK(inp);
NET_EPOCH_EXIT(et);
return (error);
release:
if (unlock_udbinfo == UH_WLOCKED) {
KASSERT(unlock_inp == UH_WLOCKED,
("%s: excl udbinfo lock %#03x, shared inp lock %#03x, "
"sin %p daddr %#010x inp %p laddr %#010x lport %#06x "
"src fam %#04x",
__func__, unlock_udbinfo, unlock_inp, sin,
(sin != NULL) ? sin->sin_addr.s_addr : 0xfefefefe, inp,
inp->inp_laddr.s_addr, inp->inp_lport, src.sin_family));
INP_HASH_WUNLOCK(pcbinfo);
INP_WUNLOCK(inp);
} else if (unlock_udbinfo == UH_RLOCKED) {
KASSERT(unlock_inp == UH_RLOCKED,
("%s: shared udbinfo lock %#03x, excl inp lock %#03x, "
"sin %p daddr %#010x inp %p laddr %#010x lport %#06x "
"src fam %#04x",
__func__, unlock_udbinfo, unlock_inp, sin,
(sin != NULL) ? sin->sin_addr.s_addr : 0xfefefefe, inp,
inp->inp_laddr.s_addr, inp->inp_lport, src.sin_family));
INP_HASH_RUNLOCK_ET(pcbinfo, et);
INP_RUNLOCK(inp);
} else if (unlock_inp == UH_WLOCKED)
INP_WUNLOCK(inp);
else
INP_RUNLOCK(inp);
INP_UNLOCK(inp);
NET_EPOCH_EXIT(et);
m_freem(m);
return (error);
}

View File

@ -678,14 +678,10 @@ udp6_getcred(SYSCTL_HANDLER_ARGS)
SYSCTL_PROC(_net_inet6_udp6, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW, 0,
0, udp6_getcred, "S,xucred", "Get the xucred of a UDP6 connection");
#define UH_WLOCKED 2
#define UH_RLOCKED 1
#define UH_UNLOCKED 0
static int
udp6_output(struct socket *so, int flags_arg, struct mbuf *m,
struct sockaddr *addr6, struct mbuf *control, struct thread *td)
{
struct inpcbinfo *pcbinfo;
struct inpcb *inp;
struct ip6_hdr *ip6;
struct udphdr *udp6;
@ -697,7 +693,7 @@ udp6_output(struct socket *so, int flags_arg, struct mbuf *m,
u_int32_t ulen, plen;
uint16_t cscov;
u_short fport;
uint8_t nxt, unlock_inp, unlock_udbinfo;
uint8_t nxt;
/* addr6 has been validated in udp6_send(). */
sin6 = (struct sockaddr_in6 *)addr6;
@ -740,30 +736,17 @@ udp6_output(struct socket *so, int flags_arg, struct mbuf *m,
* - on connected sockets (sin6 is NULL) for route cache updates,
* - when we are not bound to an address and source port (it is
* in6_pcbsetport() which will require the write lock).
*
* We check the inp fields before actually locking the inp, so
* here exists a race, and we may WLOCK the inp and end with already
* bound one by other thread. This is fine.
*/
retry:
if (sin6 == NULL || (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) &&
inp->inp_lport == 0)) {
inp->inp_lport == 0))
INP_WLOCK(inp);
/*
* In case we lost a race and another thread bound addr/port
* on the inp we cannot keep the wlock (which still would be
* fine) as further down, based on these values we make
* decisions for the pcbinfo lock. If the locks are not in
* synch the assertions on unlock will fire, hence we go for
* one retry loop.
*/
if (sin6 != NULL &&
(!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) ||
inp->inp_lport != 0)) {
INP_WUNLOCK(inp);
goto retry;
}
unlock_inp = UH_WLOCKED;
} else {
else
INP_RLOCK(inp);
unlock_inp = UH_RLOCKED;
}
nxt = (inp->inp_socket->so_proto->pr_protocol == IPPROTO_UDP) ?
IPPROTO_UDP : IPPROTO_UDPLITE;
@ -787,10 +770,7 @@ udp6_output(struct socket *so, int flags_arg, struct mbuf *m,
* potential race in which the factors causing us to
* select the UDPv4 output routine are invalidated?
*/
if (unlock_inp == UH_WLOCKED)
INP_WUNLOCK(inp);
else
INP_RUNLOCK(inp);
INP_UNLOCK(inp);
if (sin6)
in6_sin6_2_sin_in_sock((struct sockaddr *)sin6);
pru = inetsw[ip_protox[nxt]].pr_usrreqs;
@ -805,21 +785,17 @@ udp6_output(struct socket *so, int flags_arg, struct mbuf *m,
* Given this is either an IPv6-only socket or no INET is
* supported we will fail the send if the given destination
* address is a v4mapped address.
*
* XXXGL: do we leak m and control?
*/
if (unlock_inp == UH_WLOCKED)
INP_WUNLOCK(inp);
else
INP_RUNLOCK(inp);
INP_UNLOCK(inp);
return (EINVAL);
}
if (control) {
if ((error = ip6_setpktopts(control, &opt,
inp->in6p_outputopts, td->td_ucred, nxt)) != 0) {
if (unlock_inp == UH_WLOCKED)
INP_WUNLOCK(inp);
else
INP_RUNLOCK(inp);
INP_UNLOCK(inp);
ip6_clearpktopts(&opt, -1);
if (control)
m_freem(control);
@ -830,20 +806,7 @@ udp6_output(struct socket *so, int flags_arg, struct mbuf *m,
} else
optp = inp->in6p_outputopts;
pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
if (sin6 != NULL &&
IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && inp->inp_lport == 0) {
INP_HASH_WLOCK(pcbinfo);
unlock_udbinfo = UH_WLOCKED;
} else if (sin6 != NULL &&
(IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) ||
IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) ||
inp->inp_lport == 0)) {
INP_HASH_RLOCK_ET(pcbinfo, et);
unlock_udbinfo = UH_RLOCKED;
} else
unlock_udbinfo = UH_UNLOCKED;
NET_EPOCH_ENTER(et);
if (sin6) {
/*
@ -879,9 +842,14 @@ udp6_output(struct socket *so, int flags_arg, struct mbuf *m,
laddr = &in6a;
if (inp->inp_lport == 0) {
struct inpcbinfo *pcbinfo;
INP_WLOCK_ASSERT(inp);
pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
INP_HASH_WLOCK(pcbinfo);
error = in6_pcbsetport(laddr, inp, td->td_ucred);
INP_HASH_WUNLOCK(pcbinfo);
if (error != 0) {
/* Undo an address bind that may have occurred. */
inp->in6p_laddr = in6addr_any;
@ -1005,21 +973,15 @@ udp6_output(struct socket *so, int flags_arg, struct mbuf *m,
#endif
UDPSTAT_INC(udps_opackets);
if (unlock_udbinfo == UH_WLOCKED)
INP_HASH_WUNLOCK(pcbinfo);
else if (unlock_udbinfo == UH_RLOCKED)
INP_HASH_RUNLOCK_ET(pcbinfo, et);
if (nxt == IPPROTO_UDPLITE)
UDPLITE_PROBE(send, NULL, inp, ip6, inp, udp6);
else
UDP_PROBE(send, NULL, inp, ip6, inp, udp6);
error = ip6_output(m, optp,
(unlock_inp == UH_WLOCKED) ? &inp->inp_route6 : NULL, flags,
INP_WLOCKED(inp) ? &inp->inp_route6 : NULL, flags,
inp->in6p_moptions, NULL, inp);
if (unlock_inp == UH_WLOCKED)
INP_WUNLOCK(inp);
else
INP_RUNLOCK(inp);
INP_UNLOCK(inp);
NET_EPOCH_EXIT(et);
if (control) {
ip6_clearpktopts(&opt, -1);
@ -1028,22 +990,8 @@ udp6_output(struct socket *so, int flags_arg, struct mbuf *m,
return (error);
release:
if (unlock_udbinfo == UH_WLOCKED) {
KASSERT(unlock_inp == UH_WLOCKED, ("%s: excl udbinfo lock, "
"non-excl inp lock: pcbinfo %p %#x inp %p %#x",
__func__, pcbinfo, unlock_udbinfo, inp, unlock_inp));
INP_HASH_WUNLOCK(pcbinfo);
INP_WUNLOCK(inp);
} else if (unlock_udbinfo == UH_RLOCKED) {
KASSERT(unlock_inp == UH_RLOCKED, ("%s: non-excl udbinfo lock, "
"excl inp lock: pcbinfo %p %#x inp %p %#x",
__func__, pcbinfo, unlock_udbinfo, inp, unlock_inp));
INP_HASH_RUNLOCK_ET(pcbinfo, et);
INP_RUNLOCK(inp);
} else if (unlock_inp == UH_WLOCKED)
INP_WUNLOCK(inp);
else
INP_RUNLOCK(inp);
INP_UNLOCK(inp);
NET_EPOCH_EXIT(et);
if (control) {
ip6_clearpktopts(&opt, -1);
m_freem(control);