Over the past couple of years, there have been a number of reports relating
the use of divert sockets to dead locks. A number of LORs have been reported between divert and a number of other network subsystems including: IPSEC, Pfil, multicast, ipfw and others. Other dead locks could occur because of recursive entry into the IP stack. This change should take care of most if not all of these issues. A summary of the changes follow: - We disallow multicast operations on divert sockets. It really doesn't make semantic sense to allow this, since typically you would set multicast parameters on multicast end points. NOTE: As a part of this change, we actually dis-allow multicast options on any socket that IS a divert socket OR IS NOT a SOCK_RAW or SOCK_DGRAM family - We check to see if there are any socket options that have been specified on the socket, and if there was (which is very un-common and also probably doesnt make sense to support) we duplicate the mbuf carrying the options. - We then drop the INP/INFO locks over the call to ip_output(). It should be noted that since we no longer support multicast operations on divert sockets and we have duplicated any socket options, we no longer need the reference to the pcb to be coherent. - Finally, we replaced the call to ip_input() to use netisr queuing. This should remove the recursive entry into the IP stack from divert. By dropping the locks over the call to ip_output() we eliminate all the lock ordering issues above. By switching over to netisr on the inbound path, we can no longer recursively enter the ip_input() code via divert. I have tested this change by using the following command: ipfwpcap -r 8000 - | tcpdump -r - -nn -v This should exercise the input and re-injection (outbound) path, which is very similar to the work load performed by natd(8). Additionally, I have run some ospf daemons which have a heavy reliance on raw sockets and multicast. Approved by: re@ (kensmith) MFC after: 1 month LOR: 163 LOR: 181 LOR: 202 LOR: 203 Discussed with: julian, andre et al (on freebsd-net) In collaboration with: bms [1], rwatson [2] [1] bms helped out with the multicast decisions [2] rwatson submitted the original netisr patches and came up with some of the original ideas on how to combat this issue.
This commit is contained in:
parent
50d4dc714d
commit
e58c2855d8
@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$");
|
||||
#include <sys/kernel.h>
|
||||
#include <sys/malloc.h>
|
||||
#include <sys/mbuf.h>
|
||||
#include <sys/protosw.h>
|
||||
#include <sys/socket.h>
|
||||
#include <sys/socketvar.h>
|
||||
#include <sys/sysctl.h>
|
||||
@ -856,6 +857,16 @@ inp_getmoptions(struct inpcb *inp, struct sockopt *sopt)
|
||||
|
||||
INP_LOCK(inp);
|
||||
imo = inp->inp_moptions;
|
||||
/*
|
||||
* If socket is neither of type SOCK_RAW or SOCK_DGRAM,
|
||||
* or is a divert socket, reject it.
|
||||
*/
|
||||
if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT ||
|
||||
(inp->inp_socket->so_proto->pr_type != SOCK_RAW &&
|
||||
inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) {
|
||||
INP_UNLOCK(inp);
|
||||
return (EOPNOTSUPP);
|
||||
}
|
||||
|
||||
error = 0;
|
||||
switch (sopt->sopt_name) {
|
||||
@ -1675,6 +1686,16 @@ inp_setmoptions(struct inpcb *inp, struct sockopt *sopt)
|
||||
|
||||
error = 0;
|
||||
|
||||
/*
|
||||
* If socket is neither of type SOCK_RAW or SOCK_DGRAM,
|
||||
* or is a divert socket, reject it.
|
||||
* XXX Unlocked read of inp_socket believed OK.
|
||||
*/
|
||||
if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT ||
|
||||
(inp->inp_socket->so_proto->pr_type != SOCK_RAW &&
|
||||
inp->inp_socket->so_proto->pr_type != SOCK_DGRAM))
|
||||
return (EOPNOTSUPP);
|
||||
|
||||
switch (sopt->sopt_name) {
|
||||
case IP_MULTICAST_VIF: {
|
||||
int vifi;
|
||||
|
@ -61,6 +61,7 @@
|
||||
#include <vm/uma.h>
|
||||
|
||||
#include <net/if.h>
|
||||
#include <net/netisr.h>
|
||||
#include <net/route.h>
|
||||
|
||||
#include <netinet/in.h>
|
||||
@ -305,6 +306,7 @@ div_output(struct socket *so, struct mbuf *m, struct sockaddr_in *sin,
|
||||
struct m_tag *mtag;
|
||||
struct divert_tag *dt;
|
||||
int error = 0;
|
||||
struct mbuf *options;
|
||||
|
||||
/*
|
||||
* An mbuf may hasn't come from userland, but we pretend
|
||||
@ -361,6 +363,8 @@ div_output(struct socket *so, struct mbuf *m, struct sockaddr_in *sin,
|
||||
if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options) ||
|
||||
((u_short)ntohs(ip->ip_len) > m->m_pkthdr.len)) {
|
||||
error = EINVAL;
|
||||
INP_UNLOCK(inp);
|
||||
INP_INFO_WUNLOCK(&divcbinfo);
|
||||
m_freem(m);
|
||||
} else {
|
||||
/* Convert fields to host order for ip_output() */
|
||||
@ -373,15 +377,46 @@ div_output(struct socket *so, struct mbuf *m, struct sockaddr_in *sin,
|
||||
#ifdef MAC
|
||||
mac_create_mbuf_from_inpcb(inp, m);
|
||||
#endif
|
||||
error = ip_output(m,
|
||||
inp->inp_options, NULL,
|
||||
((so->so_options & SO_DONTROUTE) ?
|
||||
IP_ROUTETOIF : 0) |
|
||||
IP_ALLOWBROADCAST | IP_RAWOUTPUT,
|
||||
inp->inp_moptions, NULL);
|
||||
/*
|
||||
* Get ready to inject the packet into ip_output().
|
||||
* Just in case socket options were specified on the
|
||||
* divert socket, we duplicate them. This is done
|
||||
* to avoid having to hold the PCB locks over the call
|
||||
* to ip_output(), as doing this results in a number of
|
||||
* lock ordering complexities.
|
||||
*
|
||||
* Note that we set the multicast options argument for
|
||||
* ip_output() to NULL since it should be invariant that
|
||||
* they are not present.
|
||||
*/
|
||||
KASSERT(inp->inp_moptions == NULL,
|
||||
("multicast options set on a divert socket"));
|
||||
options = NULL;
|
||||
/*
|
||||
* XXXCSJP: It is unclear to me whether or not it makes
|
||||
* sense for divert sockets to have options. However,
|
||||
* for now we will duplicate them with the INP locks
|
||||
* held so we can use them in ip_output() without
|
||||
* requring a reference to the pcb.
|
||||
*/
|
||||
if (inp->inp_options != NULL) {
|
||||
options = m_dup(inp->inp_options, M_DONTWAIT);
|
||||
if (options == NULL)
|
||||
error = ENOBUFS;
|
||||
}
|
||||
INP_UNLOCK(inp);
|
||||
INP_INFO_WUNLOCK(&divcbinfo);
|
||||
if (error == ENOBUFS) {
|
||||
m_freem(m);
|
||||
return (error);
|
||||
}
|
||||
error = ip_output(m, options, NULL,
|
||||
((so->so_options & SO_DONTROUTE) ?
|
||||
IP_ROUTETOIF : 0) | IP_ALLOWBROADCAST |
|
||||
IP_RAWOUTPUT, NULL, NULL);
|
||||
if (options != NULL)
|
||||
m_freem(options);
|
||||
}
|
||||
INP_UNLOCK(inp);
|
||||
INP_INFO_WUNLOCK(&divcbinfo);
|
||||
} else {
|
||||
dt->info |= IP_FW_DIVERT_LOOPBACK_FLAG;
|
||||
if (m->m_pkthdr.rcvif == NULL) {
|
||||
@ -406,8 +441,8 @@ div_output(struct socket *so, struct mbuf *m, struct sockaddr_in *sin,
|
||||
mac_create_mbuf_from_socket(so, m);
|
||||
SOCK_UNLOCK(so);
|
||||
#endif
|
||||
/* Send packet to input processing */
|
||||
ip_input(m);
|
||||
/* Send packet to input processing via netisr */
|
||||
netisr_queue(NETISR_IP, m);
|
||||
}
|
||||
|
||||
return error;
|
||||
|
Loading…
x
Reference in New Issue
Block a user