AF_UNIX: make unix socket locking finer grained

This change moves to using a reference count across lock drop / reacquire
to guarantee liveness.

Currently sends on unix sockets contend heavily on read locking the list lock.
unix1_processes in will-it-scale peaks at 6 processes and then declines.

With this change I get a substantial improvement in number of operations per second
with 96 processes:

x before
+ after
    N           Min           Max        Median           Avg        Stddev
x  11       1688420       1696389       1693578     1692766.3     2971.1702
+  10      63417955      71030114      70662504      69576423     2374684.6
Difference at 95.0% confidence
        6.78837e+07 +/- 1.49463e+06
        4010.22% +/- 88.4246%
        (Student's t, pooled s = 1.63437e+06)

And even for 2 processes shows a ~18% improvement.
"Small" iron changes (1, 2, and 4 processes):

x before1
+ after1.2
+------------------------------------------------------------------------+
|                                                                  +     |
|                                                           x      +     |
|                                                           x      +     |
|                                                           x      +     |
|                                                           x     ++     |
|                                                          xx     ++     |
|x                                                       x xx     ++     |
|                                  |__________________A_____M_____AM____||
+------------------------------------------------------------------------+
    N           Min           Max        Median           Avg        Stddev
x  10       1131648       1197750     1197138.5     1190369.3     20651.839
+  10       1203840       1205056       1204919     1204827.9     353.27404
Difference at 95.0% confidence
        14458.6 +/- 13723
        1.21463% +/- 1.16683%
        (Student's t, pooled s = 14605.2)

x before2
+ after2.2
+------------------------------------------------------------------------+
|                                                                       +|
|                                                                       +|
|                                                                       +|
|                                                                       +|
|                                                                       +|
|                                                                       +|
|           x                                                           +|
|           x                                                           +|
|         x xx                                                          +|
|x        xxxx                                                          +|
|      |___AM_|                                                         A|
+------------------------------------------------------------------------+
    N           Min           Max        Median           Avg        Stddev
x  10       1972843       2045866     2038186.5     2030443.8     21367.694
+  10       2400853       2402196     2401043.5     2401172.7     385.40024
Difference at 95.0% confidence
        370729 +/- 14198.9
        18.2585% +/- 0.826943%
        (Student's t, pooled s = 15111.7)

x before4
+ after4.2
    N           Min           Max        Median           Avg        Stddev
x  10       3986994       3991728     3990137.5     3989985.2     1300.0164
+  10       4799990       4806664     4806116.5       4805194     1990.6625
Difference at 95.0% confidence
        815209 +/- 1579.64
        20.4314% +/- 0.0421713%
        (Student's t, pooled s = 1681.19)

Tested by: pho
Reported by:	mjg
Approved by:	sbruno
Sponsored by:	Limelight Networks
Differential Revision:	https://reviews.freebsd.org/D15430
This commit is contained in:
Matt Macy 2018-05-17 17:59:35 +00:00
parent 087522b840
commit 75a67bf3d0
2 changed files with 383 additions and 204 deletions

@ -191,12 +191,40 @@ SYSCTL_INT(_net_local, OID_AUTO, deferred, CTLFLAG_RD,
/*
* Locking and synchronization:
*
* Two types of locks exist in the local domain socket implementation: a
* a global linkage rwlock and per-unpcb mutexes. The linkage lock protects
* the socket count, global generation number, stream/datagram global lists and
* interconnection of unpcbs, the v_socket and unp_vnode pointers, and can be
* held exclusively over the acquisition of multiple unpcb locks to prevent
* deadlock.
* Three types of locks exist in the local domain socket implementation: a
* a global linkage rwlock, the mtxpool lock, and per-unpcb mutexes.
* The linkage lock protects the socket count, global generation number,
* and stream/datagram global lists.
*
* The mtxpool lock protects the vnode from being modified while referenced.
* Lock ordering requires that it be acquired before any unpcb locks.
*
* The unpcb lock (unp_mtx) protects all fields in the unpcb. Of particular
* note is that this includes the unp_conn field. So long as the unpcb lock
* is held the reference to the unpcb pointed to by unp_conn is valid. If we
* require that the unpcb pointed to by unp_conn remain live in cases where
* we need to drop the unp_mtx as when we need to acquire the lock for a
* second unpcb the caller must first acquire an additional reference on the
* second unpcb and then revalidate any state (typically check that unp_conn
* is non-NULL) upon requiring the initial unpcb lock. The lock ordering
* between unpcbs is the conventional ascending address order. Two helper
* routines exist for this:
*
* - unp_pcb_lock2(unp, unp2) - which just acquires the two locks in the
* safe ordering.
*
* - unp_pcb_owned_lock2(unp, unp2, freed) - the lock for unp is held
* when called. If unp is unlocked and unp2 is subsequently freed
* freed will be set to 1.
*
* The helper routines for references are:
*
* - unp_pcb_hold(unp): Can be called any time we currently hold a valid
* reference to unp.
*
* - unp_pcb_rele(unp): The caller must hold the unp lock. If we are
* releasing the last reference, detach must have been called thus
* unp->unp_socket be NULL.
*
* UNIX domain sockets each have an unpcb hung off of their so_pcb pointer,
* allocated in pru_attach() and freed in pru_detach(). The validity of that
@ -210,15 +238,8 @@ SYSCTL_INT(_net_local, OID_AUTO, deferred, CTLFLAG_RD,
* to the unpcb is held. Typically, this reference will be from the socket,
* or from another unpcb when the referring unpcb's lock is held (in order
* that the reference not be invalidated during use). For example, to follow
* unp->unp_conn->unp_socket, you need unlock the lock on unp, not unp_conn,
* as unp_socket remains valid as long as the reference to unp_conn is valid.
*
* Fields of unpcbss are locked using a per-unpcb lock, unp_mtx. Individual
* atomic reads without the lock may be performed "lockless", but more
* complex reads and read-modify-writes require the mutex to be held. No
* lock order is defined between unpcb locks -- multiple unpcb locks may be
* acquired at the same time only when holding the linkage rwlock
* exclusively, which prevents deadlocks.
* unp->unp_conn->unp_socket, you need to hold a lock on unp_conn to guarantee
* that detach is not run clearing unp_socket.
*
* Blocking with UNIX domain sockets is a tricky issue: unlike most network
* protocols, bind() is a non-atomic operation, and connect() requires
@ -257,13 +278,19 @@ static struct mtx unp_defers_lock;
#define UNP_DEFERRED_LOCK() mtx_lock(&unp_defers_lock)
#define UNP_DEFERRED_UNLOCK() mtx_unlock(&unp_defers_lock)
#define UNP_REF_LIST_LOCK() UNP_DEFERRED_LOCK();
#define UNP_REF_LIST_UNLOCK() UNP_DEFERRED_UNLOCK();
#define UNP_PCB_LOCK_INIT(unp) mtx_init(&(unp)->unp_mtx, \
"unp_mtx", "unp_mtx", \
MTX_DUPOK|MTX_DEF|MTX_RECURSE)
MTX_DUPOK|MTX_DEF)
#define UNP_PCB_LOCK_DESTROY(unp) mtx_destroy(&(unp)->unp_mtx)
#define UNP_PCB_LOCK(unp) mtx_lock(&(unp)->unp_mtx)
#define UNP_PCB_TRYLOCK(unp) mtx_trylock(&(unp)->unp_mtx)
#define UNP_PCB_UNLOCK(unp) mtx_unlock(&(unp)->unp_mtx)
#define UNP_PCB_OWNED(unp) mtx_owned(&(unp)->unp_mtx)
#define UNP_PCB_LOCK_ASSERT(unp) mtx_assert(&(unp)->unp_mtx, MA_OWNED)
#define UNP_PCB_UNLOCK_ASSERT(unp) mtx_assert(&(unp)->unp_mtx, MA_NOTOWNED)
static int uipc_connect2(struct socket *, struct socket *);
static int uipc_ctloutput(struct socket *, struct sockopt *);
@ -289,6 +316,75 @@ static int unp_externalize_fp(struct file *);
static struct mbuf *unp_addsockcred(struct thread *, struct mbuf *);
static void unp_process_defers(void * __unused, int);
static void
unp_pcb_hold(struct unpcb *unp)
{
MPASS(unp->unp_refcount);
refcount_acquire(&unp->unp_refcount);
}
static int
unp_pcb_rele(struct unpcb *unp)
{
int freed;
UNP_PCB_LOCK_ASSERT(unp);
MPASS(unp->unp_refcount);
if ((freed = refcount_release(&unp->unp_refcount))) {
/* we got here with having detached? */
MPASS(unp->unp_socket == NULL);
UNP_PCB_UNLOCK(unp);
UNP_PCB_LOCK_DESTROY(unp);
uma_zfree(unp_zone, unp);
}
return (freed);
}
static void
unp_pcb_lock2(struct unpcb *unp, struct unpcb *unp2)
{
UNP_PCB_UNLOCK_ASSERT(unp);
UNP_PCB_UNLOCK_ASSERT(unp2);
if ((uintptr_t)unp2 > (uintptr_t)unp) {
UNP_PCB_LOCK(unp);
UNP_PCB_LOCK(unp2);
} else {
UNP_PCB_LOCK(unp2);
UNP_PCB_LOCK(unp);
}
}
static __noinline void
unp_pcb_owned_lock2_slowpath(struct unpcb *unp, struct unpcb **unp2p, int *freed)
{
struct unpcb *unp2;
unp2 = *unp2p;
unp_pcb_hold((unp2));
UNP_PCB_UNLOCK((unp));
UNP_PCB_LOCK((unp2));
UNP_PCB_LOCK((unp));
*freed = unp_pcb_rele((unp2));
if (*freed)
*unp2p = NULL;
}
#define unp_pcb_owned_lock2(unp, unp2, freed) do { \
freed = 0; \
UNP_PCB_LOCK_ASSERT((unp)); \
UNP_PCB_UNLOCK_ASSERT((unp2)); \
if (__predict_true(UNP_PCB_TRYLOCK((unp2)))) \
break; \
else if ((uintptr_t)(unp2) > (uintptr_t)(unp)) \
UNP_PCB_LOCK((unp2)); \
else { \
unp_pcb_owned_lock2_slowpath((unp), &(unp2), &freed); \
} \
} while (0)
/*
* Definitions of protocols supported in the LOCAL domain.
*/
@ -344,17 +440,16 @@ uipc_abort(struct socket *so)
unp = sotounpcb(so);
KASSERT(unp != NULL, ("uipc_abort: unp == NULL"));
UNP_PCB_UNLOCK_ASSERT(unp);
UNP_LINK_WLOCK();
UNP_PCB_LOCK(unp);
unp2 = unp->unp_conn;
if (unp2 != NULL) {
UNP_PCB_LOCK(unp2);
unp_pcb_hold(unp2);
UNP_PCB_UNLOCK(unp);
unp_drop(unp2);
UNP_PCB_UNLOCK(unp2);
}
UNP_PCB_UNLOCK(unp);
UNP_LINK_WUNLOCK();
} else
UNP_PCB_UNLOCK(unp);
}
static int
@ -551,14 +646,12 @@ restart:
ASSERT_VOP_ELOCKED(vp, "uipc_bind");
soun = (struct sockaddr_un *)sodupsockaddr(nam, M_WAITOK);
UNP_LINK_WLOCK();
UNP_PCB_LOCK(unp);
VOP_UNP_BIND(vp, unp);
unp->unp_vnode = vp;
unp->unp_addr = soun;
unp->unp_flags &= ~UNP_BINDING;
UNP_PCB_UNLOCK(unp);
UNP_LINK_WUNLOCK();
VOP_UNLOCK(vp, 0);
vn_finished_write(mp);
free(buf, M_TEMP);
@ -585,9 +678,7 @@ uipc_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
int error;
KASSERT(td == curthread, ("uipc_connect: td != curthread"));
UNP_LINK_WLOCK();
error = unp_connect(so, nam, td);
UNP_LINK_WUNLOCK();
return (error);
}
@ -598,9 +689,7 @@ uipc_connectat(int fd, struct socket *so, struct sockaddr *nam,
int error;
KASSERT(td == curthread, ("uipc_connectat: td != curthread"));
UNP_LINK_WLOCK();
error = unp_connectat(fd, so, nam, td);
UNP_LINK_WUNLOCK();
return (error);
}
@ -609,26 +698,41 @@ uipc_close(struct socket *so)
{
struct unpcb *unp, *unp2;
struct vnode *vp = NULL;
struct mtx *vplock;
int freed;
unp = sotounpcb(so);
KASSERT(unp != NULL, ("uipc_close: unp == NULL"));
UNP_LINK_WLOCK();
UNP_PCB_LOCK(unp);
unp2 = unp->unp_conn;
if (unp2 != NULL) {
UNP_PCB_LOCK(unp2);
unp_disconnect(unp, unp2);
UNP_PCB_UNLOCK(unp2);
vplock = NULL;
if ((vp = unp->unp_vnode) != NULL) {
vplock = mtx_pool_find(mtxpool_sleep, vp);
mtx_lock(vplock);
}
if (SOLISTENING(so) && ((vp = unp->unp_vnode) != NULL)) {
UNP_PCB_LOCK(unp);
if (vp && unp->unp_vnode == NULL) {
mtx_unlock(vplock);
vp = NULL;
}
if (vp != NULL) {
VOP_UNP_DETACH(vp);
unp->unp_vnode = NULL;
}
UNP_PCB_UNLOCK(unp);
UNP_LINK_WUNLOCK();
if (vp)
unp2 = unp->unp_conn;
unp_pcb_hold(unp);
if (unp2 != NULL) {
unp_pcb_hold(unp2);
unp_pcb_owned_lock2(unp, unp2, freed);
unp_disconnect(unp, unp2);
if (unp_pcb_rele(unp2) == 0)
UNP_PCB_UNLOCK(unp2);
}
if (unp_pcb_rele(unp) == 0)
UNP_PCB_UNLOCK(unp);
if (vp) {
mtx_unlock(vplock);
vrele(vp);
}
}
static int
@ -637,17 +741,14 @@ uipc_connect2(struct socket *so1, struct socket *so2)
struct unpcb *unp, *unp2;
int error;
UNP_LINK_WLOCK();
unp = so1->so_pcb;
KASSERT(unp != NULL, ("uipc_connect2: unp == NULL"));
UNP_PCB_LOCK(unp);
unp2 = so2->so_pcb;
KASSERT(unp2 != NULL, ("uipc_connect2: unp2 == NULL"));
UNP_PCB_LOCK(unp2);
unp_pcb_lock2(unp, unp2);
error = unp_connect2(so1, so2, PRU_CONNECT2);
UNP_PCB_UNLOCK(unp2);
UNP_PCB_UNLOCK(unp);
UNP_LINK_WUNLOCK();
return (error);
}
@ -655,6 +756,7 @@ static void
uipc_detach(struct socket *so)
{
struct unpcb *unp, *unp2;
struct mtx *vplock;
struct sockaddr_un *saved_unp_addr;
struct vnode *vp;
int freeunp, local_unp_rights;
@ -669,49 +771,77 @@ uipc_detach(struct socket *so)
LIST_REMOVE(unp, unp_link);
unp->unp_gencnt = ++unp_gencnt;
--unp_count;
UNP_PCB_LOCK(unp);
if ((unp->unp_flags & UNP_NASCENT) != 0)
goto teardown;
UNP_LINK_WUNLOCK();
UNP_PCB_UNLOCK_ASSERT(unp);
restart:
if ((vp = unp->unp_vnode) != NULL) {
vplock = mtx_pool_find(mtxpool_sleep, vp);
mtx_lock(vplock);
}
UNP_PCB_LOCK(unp);
if ((unp2 = unp->unp_conn) != NULL) {
unp_pcb_owned_lock2(unp, unp2, freeunp);
if (freeunp)
unp2 = NULL;
}
if (unp->unp_vnode != vp &&
unp->unp_vnode != NULL) {
mtx_unlock(vplock);
UNP_PCB_UNLOCK(unp);
if (unp2)
UNP_PCB_UNLOCK(unp2);
goto restart;
}
if ((unp->unp_flags & UNP_NASCENT) != 0) {
if (unp2)
UNP_PCB_UNLOCK(unp2);
goto teardown;
}
if ((vp = unp->unp_vnode) != NULL) {
VOP_UNP_DETACH(vp);
unp->unp_vnode = NULL;
}
unp2 = unp->unp_conn;
unp_pcb_hold(unp);
if (unp2 != NULL) {
UNP_PCB_LOCK(unp2);
unp_pcb_hold(unp2);
unp_disconnect(unp, unp2);
UNP_PCB_UNLOCK(unp2);
if (unp_pcb_rele(unp2) == 0)
UNP_PCB_UNLOCK(unp2);
}
/*
* We hold the linkage lock exclusively, so it's OK to acquire
* multiple pcb locks at a time.
*/
UNP_PCB_UNLOCK(unp);
UNP_REF_LIST_LOCK();
while (!LIST_EMPTY(&unp->unp_refs)) {
struct unpcb *ref = LIST_FIRST(&unp->unp_refs);
UNP_PCB_LOCK(ref);
unp_pcb_hold(ref);
UNP_REF_LIST_UNLOCK();
MPASS(ref != unp);
UNP_PCB_UNLOCK_ASSERT(ref);
unp_drop(ref);
UNP_PCB_UNLOCK(ref);
UNP_REF_LIST_LOCK();
}
UNP_REF_LIST_UNLOCK();
UNP_PCB_LOCK(unp);
freeunp = unp_pcb_rele(unp);
MPASS(freeunp == 0);
local_unp_rights = unp_rights;
teardown:
UNP_LINK_WUNLOCK();
unp->unp_socket->so_pcb = NULL;
saved_unp_addr = unp->unp_addr;
unp->unp_addr = NULL;
unp->unp_refcount--;
freeunp = (unp->unp_refcount == 0);
unp->unp_socket = NULL;
freeunp = unp_pcb_rele(unp);
if (saved_unp_addr != NULL)
free(saved_unp_addr, M_SONAME);
if (freeunp) {
UNP_PCB_LOCK_DESTROY(unp);
uma_zfree(unp_zone, unp);
} else
if (!freeunp)
UNP_PCB_UNLOCK(unp);
if (vp)
if (vp) {
mtx_unlock(vplock);
vrele(vp);
}
if (local_unp_rights)
taskqueue_enqueue_timeout(taskqueue_thread, &unp_gc_task, -1);
}
@ -720,20 +850,28 @@ static int
uipc_disconnect(struct socket *so)
{
struct unpcb *unp, *unp2;
int freed;
unp = sotounpcb(so);
KASSERT(unp != NULL, ("uipc_disconnect: unp == NULL"));
UNP_LINK_WLOCK();
UNP_PCB_LOCK(unp);
unp2 = unp->unp_conn;
if (unp2 != NULL) {
UNP_PCB_LOCK(unp2);
unp_disconnect(unp, unp2);
UNP_PCB_UNLOCK(unp2);
if ((unp2 = unp->unp_conn) == NULL) {
UNP_PCB_UNLOCK(unp);
return (0);
}
UNP_PCB_UNLOCK(unp);
UNP_LINK_WUNLOCK();
unp_pcb_owned_lock2(unp, unp2, freed);
if (__predict_false(freed)) {
UNP_PCB_UNLOCK(unp);
return (0);
}
unp_pcb_hold(unp2);
unp_pcb_hold(unp);
unp_disconnect(unp, unp2);
if (unp_pcb_rele(unp) == 0)
UNP_PCB_UNLOCK(unp);
if (unp_pcb_rele(unp2) == 0)
UNP_PCB_UNLOCK(unp2);
return (0);
}
@ -851,6 +989,28 @@ uipc_rcvd(struct socket *so, int flags)
return (0);
}
static int
connect_internal(struct socket *so, struct sockaddr *nam, struct thread *td)
{
int error;
struct unpcb *unp;
unp = so->so_pcb;
if (unp->unp_conn != NULL)
return (EISCONN);
error = unp_connect(so, nam, td);
if (error)
return (error);
UNP_PCB_LOCK(unp);
if (unp->unp_conn == NULL) {
UNP_PCB_UNLOCK(unp);
if (error == 0)
error = ENOTCONN;
}
return (error);
}
static int
uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
struct mbuf *control, struct thread *td)
@ -858,7 +1018,7 @@ uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
struct unpcb *unp, *unp2;
struct socket *so2;
u_int mbcnt, sbcc;
int error = 0;
int freed, error;
unp = sotounpcb(so);
KASSERT(unp != NULL, ("%s: unp == NULL", __func__));
@ -866,49 +1026,66 @@ uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
so->so_type == SOCK_SEQPACKET,
("%s: socktype %d", __func__, so->so_type));
freed = error = 0;
if (flags & PRUS_OOB) {
error = EOPNOTSUPP;
goto release;
}
if (control != NULL && (error = unp_internalize(&control, td)))
goto release;
if ((nam != NULL) || (flags & PRUS_EOF))
UNP_LINK_WLOCK();
else
UNP_LINK_RLOCK();
unp2 = NULL;
switch (so->so_type) {
case SOCK_DGRAM:
{
const struct sockaddr *from;
unp2 = unp->unp_conn;
if (nam != NULL) {
UNP_LINK_WLOCK_ASSERT();
if (unp2 != NULL) {
error = EISCONN;
/*
* We return with UNP_PCB_LOCK_HELD so we know that
* the reference is live if the pointer is valid.
*/
if ((error = connect_internal(so, nam, td)))
break;
MPASS(unp->unp_conn != NULL);
unp2 = unp->unp_conn;
} else {
UNP_PCB_LOCK(unp);
/*
* Because connect() and send() are non-atomic in a sendto()
* with a target address, it's possible that the socket will
* have disconnected before the send() can run. In that case
* return the slightly counter-intuitive but otherwise
* correct error that the socket is not connected.
*/
if ((unp2 = unp->unp_conn) == NULL) {
UNP_PCB_UNLOCK(unp);
error = ENOTCONN;
break;
}
error = unp_connect(so, nam, td);
if (error)
break;
unp2 = unp->unp_conn;
}
/*
* Because connect() and send() are non-atomic in a sendto()
* with a target address, it's possible that the socket will
* have disconnected before the send() can run. In that case
* return the slightly counter-intuitive but otherwise
* correct error that the socket is not connected.
*/
if (unp2 == NULL) {
unp_pcb_owned_lock2(unp, unp2, freed);
if (__predict_false(freed)) {
UNP_PCB_UNLOCK(unp);
error = ENOTCONN;
break;
}
/*
* The socket referencing unp2 may have been closed
* or unp may have been disconnected if the unp lock
* was dropped to acquire unp2.
*/
if (__predict_false(unp->unp_conn == NULL) ||
unp2->unp_socket == NULL) {
UNP_PCB_UNLOCK(unp);
if (unp_pcb_rele(unp2) == 0)
UNP_PCB_UNLOCK(unp2);
error = ENOTCONN;
break;
}
/* Lockless read. */
if (unp2->unp_flags & UNP_WANTCRED)
control = unp_addsockcred(td, control);
UNP_PCB_LOCK(unp);
if (unp->unp_addr != NULL)
from = (struct sockaddr *)unp->unp_addr;
else
@ -924,12 +1101,9 @@ uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
SOCKBUF_UNLOCK(&so2->so_rcv);
error = ENOBUFS;
}
if (nam != NULL) {
UNP_LINK_WLOCK_ASSERT();
UNP_PCB_LOCK(unp2);
if (nam != NULL)
unp_disconnect(unp, unp2);
UNP_PCB_UNLOCK(unp2);
}
UNP_PCB_UNLOCK(unp2);
UNP_PCB_UNLOCK(unp);
break;
}
@ -938,42 +1112,37 @@ uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
case SOCK_STREAM:
if ((so->so_state & SS_ISCONNECTED) == 0) {
if (nam != NULL) {
UNP_LINK_WLOCK_ASSERT();
error = unp_connect(so, nam, td);
if (error)
break; /* XXX */
} else {
if ((error = connect_internal(so, nam, td)))
break;
} else {
error = ENOTCONN;
break;
}
} else if ((unp2 = unp->unp_conn) == NULL) {
error = ENOTCONN;
break;
} else if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
error = EPIPE;
break;
} else {
UNP_PCB_LOCK(unp);
if ((unp2 = unp->unp_conn) == NULL) {
UNP_PCB_UNLOCK(unp);
error = ENOTCONN;
break;
}
}
/* Lockless read. */
if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
error = EPIPE;
break;
}
/*
* Because connect() and send() are non-atomic in a sendto()
* with a target address, it's possible that the socket will
* have disconnected before the send() can run. In that case
* return the slightly counter-intuitive but otherwise
* correct error that the socket is not connected.
*
* Locking here must be done carefully: the linkage lock
* prevents interconnections between unpcbs from changing, so
* we can traverse from unp to unp2 without acquiring unp's
* lock. Socket buffer locks follow unpcb locks, so we can
* acquire both remote and lock socket buffer locks.
*/
unp2 = unp->unp_conn;
if (unp2 == NULL) {
unp_pcb_owned_lock2(unp, unp2, freed);
UNP_PCB_UNLOCK(unp);
if (__predict_false(freed)) {
error = ENOTCONN;
break;
}
if ((so2 = unp2->unp_socket) == NULL) {
UNP_PCB_UNLOCK(unp2);
error = ENOTCONN;
break;
}
so2 = unp2->unp_socket;
UNP_PCB_LOCK(unp2);
SOCKBUF_LOCK(&so2->so_rcv);
if (unp2->unp_flags & UNP_WANTCRED) {
/*
@ -1046,12 +1215,6 @@ uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
unp_shutdown(unp);
UNP_PCB_UNLOCK(unp);
}
if ((nam != NULL) || (flags & PRUS_EOF))
UNP_LINK_WUNLOCK();
else
UNP_LINK_RUNLOCK();
if (control != NULL && error != 0)
unp_dispose_mbuf(control);
@ -1124,12 +1287,10 @@ uipc_shutdown(struct socket *so)
unp = sotounpcb(so);
KASSERT(unp != NULL, ("uipc_shutdown: unp == NULL"));
UNP_LINK_WLOCK();
UNP_PCB_LOCK(unp);
socantsendmore(so);
unp_shutdown(unp);
UNP_PCB_UNLOCK(unp);
UNP_LINK_WUNLOCK();
return (0);
}
@ -1333,16 +1494,11 @@ unp_connectat(int fd, struct socket *so, struct sockaddr *nam,
char buf[SOCK_MAXADDRLEN];
struct sockaddr *sa;
cap_rights_t rights;
int error, len;
int error, len, freed;
struct mtx *vplock;
if (nam->sa_family != AF_UNIX)
return (EAFNOSUPPORT);
UNP_LINK_WLOCK_ASSERT();
unp = sotounpcb(so);
KASSERT(unp != NULL, ("unp_connect: unp == NULL"));
if (nam->sa_len > sizeof(struct sockaddr_un))
return (EINVAL);
len = nam->sa_len - offsetof(struct sockaddr_un, sun_path);
@ -1351,12 +1507,12 @@ unp_connectat(int fd, struct socket *so, struct sockaddr *nam,
bcopy(soun->sun_path, buf, len);
buf[len] = 0;
unp = sotounpcb(so);
UNP_PCB_LOCK(unp);
if (unp->unp_flags & UNP_CONNECTING) {
UNP_PCB_UNLOCK(unp);
return (EALREADY);
}
UNP_LINK_WUNLOCK();
unp->unp_flags |= UNP_CONNECTING;
UNP_PCB_UNLOCK(unp);
@ -1389,11 +1545,8 @@ unp_connectat(int fd, struct socket *so, struct sockaddr *nam,
unp = sotounpcb(so);
KASSERT(unp != NULL, ("unp_connect: unp == NULL"));
/*
* Lock linkage lock for two reasons: make sure v_socket is stable,
* and to protect simultaneous locking of multiple pcbs.
*/
UNP_LINK_WLOCK();
vplock = mtx_pool_find(mtxpool_sleep, vp);
mtx_lock(vplock);
VOP_UNP_CONNECT(vp, &unp2);
if (unp2 == NULL) {
error = ECONNREFUSED;
@ -1404,8 +1557,7 @@ unp_connectat(int fd, struct socket *so, struct sockaddr *nam,
error = EPROTOTYPE;
goto bad2;
}
UNP_PCB_LOCK(unp);
UNP_PCB_LOCK(unp2);
unp_pcb_lock2(unp, unp2);
if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
if (so2->so_options & SO_ACCEPTCONN) {
CURVNET_SET(so2->so_vnet);
@ -1418,7 +1570,9 @@ unp_connectat(int fd, struct socket *so, struct sockaddr *nam,
goto bad3;
}
unp3 = sotounpcb(so2);
UNP_PCB_LOCK(unp3);
UNP_PCB_UNLOCK(unp);
unp_pcb_owned_lock2(unp2, unp3, freed);
MPASS(!freed);
if (unp2->unp_addr != NULL) {
bcopy(unp2->unp_addr, sa, unp2->unp_addr->sun_len);
unp3->unp_addr = (struct sockaddr_un *) sa;
@ -1445,6 +1599,8 @@ unp_connectat(int fd, struct socket *so, struct sockaddr *nam,
unp3->unp_flags |= UNP_WANTCRED;
UNP_PCB_UNLOCK(unp2);
unp2 = unp3;
unp_pcb_owned_lock2(unp2, unp, freed);
MPASS(!freed);
#ifdef MAC
mac_socketpeer_set_from_socket(so, so2);
mac_socketpeer_set_from_socket(so2, so);
@ -1459,12 +1615,12 @@ bad3:
UNP_PCB_UNLOCK(unp2);
UNP_PCB_UNLOCK(unp);
bad2:
UNP_LINK_WUNLOCK();
mtx_unlock(vplock);
bad:
if (vp != NULL)
if (vp != NULL) {
vput(vp);
}
free(sa, M_SONAME);
UNP_LINK_WLOCK();
UNP_PCB_LOCK(unp);
unp->unp_flags &= ~UNP_CONNECTING;
UNP_PCB_UNLOCK(unp);
@ -1482,7 +1638,6 @@ unp_connect2(struct socket *so, struct socket *so2, int req)
unp2 = sotounpcb(so2);
KASSERT(unp2 != NULL, ("unp_connect2: unp2 == NULL"));
UNP_LINK_WLOCK_ASSERT();
UNP_PCB_LOCK_ASSERT(unp);
UNP_PCB_LOCK_ASSERT(unp2);
@ -1490,10 +1645,13 @@ unp_connect2(struct socket *so, struct socket *so2, int req)
return (EPROTOTYPE);
unp2->unp_flags &= ~UNP_NASCENT;
unp->unp_conn = unp2;
unp_pcb_hold(unp2);
unp_pcb_hold(unp);
switch (so->so_type) {
case SOCK_DGRAM:
UNP_REF_LIST_LOCK();
LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink);
UNP_REF_LIST_UNLOCK();
soisconnected(so);
break;
@ -1517,31 +1675,48 @@ unp_connect2(struct socket *so, struct socket *so2, int req)
static void
unp_disconnect(struct unpcb *unp, struct unpcb *unp2)
{
struct socket *so;
struct socket *so, *so2;
int rele, freed;
KASSERT(unp2 != NULL, ("unp_disconnect: unp2 == NULL"));
UNP_LINK_WLOCK_ASSERT();
UNP_PCB_LOCK_ASSERT(unp);
UNP_PCB_LOCK_ASSERT(unp2);
if (unp->unp_conn == NULL && unp2->unp_conn == NULL)
return;
MPASS(unp->unp_conn == unp2);
unp->unp_conn = NULL;
rele = 0;
so = unp->unp_socket;
so2 = unp2->unp_socket;
switch (unp->unp_socket->so_type) {
case SOCK_DGRAM:
UNP_REF_LIST_LOCK();
LIST_REMOVE(unp, unp_reflink);
so = unp->unp_socket;
SOCK_LOCK(so);
so->so_state &= ~SS_ISCONNECTED;
SOCK_UNLOCK(so);
UNP_REF_LIST_UNLOCK();
if (so) {
SOCK_LOCK(so);
so->so_state &= ~SS_ISCONNECTED;
SOCK_UNLOCK(so);
}
break;
case SOCK_STREAM:
case SOCK_SEQPACKET:
soisdisconnected(unp->unp_socket);
if (so)
soisdisconnected(so);
MPASS(unp2->unp_conn == unp);
unp2->unp_conn = NULL;
soisdisconnected(unp2->unp_socket);
if (so2)
soisdisconnected(so2);
break;
}
freed = unp_pcb_rele(unp);
MPASS(freed == 0);
freed = unp_pcb_rele(unp2);
MPASS(freed == 0);
}
/*
@ -1625,7 +1800,7 @@ unp_pcblist(SYSCTL_HANDLER_ARGS)
continue;
}
unp_list[i++] = unp;
unp->unp_refcount++;
unp_pcb_hold(unp);
}
UNP_PCB_UNLOCK(unp);
}
@ -1637,8 +1812,9 @@ unp_pcblist(SYSCTL_HANDLER_ARGS)
for (i = 0; i < n; i++) {
unp = unp_list[i];
UNP_PCB_LOCK(unp);
unp->unp_refcount--;
if (unp->unp_refcount != 0 && unp->unp_gencnt <= gencnt) {
freeunp = unp_pcb_rele(unp);
if (freeunp == 0 && unp->unp_gencnt <= gencnt) {
xu->xu_len = sizeof *xu;
xu->xu_unpp = unp;
/*
@ -1665,14 +1841,8 @@ unp_pcblist(SYSCTL_HANDLER_ARGS)
sotoxsocket(unp->unp_socket, &xu->xu_socket);
UNP_PCB_UNLOCK(unp);
error = SYSCTL_OUT(req, xu, sizeof *xu);
} else {
freeunp = (unp->unp_refcount == 0);
} else if (freeunp == 0)
UNP_PCB_UNLOCK(unp);
if (freeunp) {
UNP_PCB_LOCK_DESTROY(unp);
uma_zfree(unp_zone, unp);
}
}
}
free(xu, M_TEMP);
if (!error) {
@ -1709,7 +1879,6 @@ unp_shutdown(struct unpcb *unp)
struct unpcb *unp2;
struct socket *so;
UNP_LINK_WLOCK_ASSERT();
UNP_PCB_LOCK_ASSERT(unp);
unp2 = unp->unp_conn;
@ -1726,22 +1895,28 @@ unp_drop(struct unpcb *unp)
{
struct socket *so = unp->unp_socket;
struct unpcb *unp2;
UNP_LINK_WLOCK_ASSERT();
UNP_PCB_LOCK_ASSERT(unp);
int freed;
/*
* Regardless of whether the socket's peer dropped the connection
* with this socket by aborting or disconnecting, POSIX requires
* that ECONNRESET is returned.
*/
so->so_error = ECONNRESET;
/* acquire a reference so that unp isn't freed from underneath us */
UNP_PCB_LOCK(unp);
if (so)
so->so_error = ECONNRESET;
unp2 = unp->unp_conn;
if (unp2 == NULL)
return;
UNP_PCB_LOCK(unp2);
unp_disconnect(unp, unp2);
UNP_PCB_UNLOCK(unp2);
if (unp2 != NULL) {
unp_pcb_hold(unp2);
unp_pcb_owned_lock2(unp, unp2, freed);
unp_disconnect(unp, unp2);
if (unp_pcb_rele(unp2) == 0)
UNP_PCB_UNLOCK(unp2);
}
if (unp_pcb_rele(unp) == 0)
UNP_PCB_UNLOCK(unp);
}
static void
@ -1881,7 +2056,7 @@ unp_init(void)
return;
#endif
unp_zone = uma_zcreate("unpcb", sizeof(struct unpcb), NULL, NULL,
NULL, NULL, UMA_ALIGN_PTR, 0);
NULL, NULL, UMA_ALIGN_CACHE, 0);
if (unp_zone == NULL)
panic("unp_init");
uma_zone_set_max(unp_zone, maxsockets);
@ -2464,13 +2639,15 @@ vfs_unp_reclaim(struct vnode *vp)
{
struct unpcb *unp;
int active;
struct mtx *vplock;
ASSERT_VOP_ELOCKED(vp, "vfs_unp_reclaim");
KASSERT(vp->v_type == VSOCK,
("vfs_unp_reclaim: vp->v_type != VSOCK"));
active = 0;
UNP_LINK_WLOCK();
vplock = mtx_pool_find(mtxpool_sleep, vp);
mtx_lock(vplock);
VOP_UNP_CONNECT(vp, &unp);
if (unp == NULL)
goto done;
@ -2481,8 +2658,8 @@ vfs_unp_reclaim(struct vnode *vp)
active = 1;
}
UNP_PCB_UNLOCK(unp);
done:
UNP_LINK_WUNLOCK();
done:
mtx_unlock(vplock);
if (active)
vunref(vp);
}

@ -69,23 +69,25 @@ typedef uint64_t unp_gen_t;
LIST_HEAD(unp_head, unpcb);
struct unpcb {
LIST_ENTRY(unpcb) unp_link; /* glue on list of all PCBs */
struct socket *unp_socket; /* pointer back to socket */
struct file *unp_file; /* back-pointer to file for gc. */
struct vnode *unp_vnode; /* if associated with file */
ino_t unp_ino; /* fake inode number */
/* Cache line 1 */
struct mtx unp_mtx; /* mutex */
struct unpcb *unp_conn; /* control block of connected socket */
struct unp_head unp_refs; /* referencing socket linked list */
LIST_ENTRY(unpcb) unp_reflink; /* link in unp_refs list */
struct sockaddr_un *unp_addr; /* bound address of socket */
unp_gen_t unp_gencnt; /* generation count of this instance */
volatile u_int unp_refcount;
short unp_flags; /* flags */
short unp_gcflag; /* Garbage collector flags. */
struct sockaddr_un *unp_addr; /* bound address of socket */
struct socket *unp_socket; /* pointer back to socket */
/* Cache line 2 */
struct vnode *unp_vnode; /* if associated with file */
struct xucred unp_peercred; /* peer credentials, if applicable */
u_int unp_refcount;
LIST_ENTRY(unpcb) unp_reflink; /* link in unp_refs list */
LIST_ENTRY(unpcb) unp_link; /* glue on list of all PCBs */
struct unp_head unp_refs; /* referencing socket linked list */
unp_gen_t unp_gencnt; /* generation count of this instance */
struct file *unp_file; /* back-pointer to file for gc. */
u_int unp_msgcount; /* references from message queue */
struct mtx unp_mtx; /* mutex */
};
ino_t unp_ino; /* fake inode number */
} __aligned(CACHE_LINE_SIZE);
/*
* Flags in unp_flags.