From d861372b14c2778e8a18ec80fc4e330eb576d2f9 Mon Sep 17 00:00:00 2001 From: Robert Watson Date: Sun, 11 Jul 2004 18:29:47 +0000 Subject: [PATCH] Add additional annotations to soreceive(), documenting the effects of locking on 'nextrecord' and concerns regarding potentially inconsistent or stale use of socket buffer or stack fields if they aren't carefully synchronized whenever the socket buffer mutex is released. Document that the high-level sblock() prevents races against other readers on the socket. Also document the 'type' logic as to how soreceive() guarantees that it will only return one of normal data or inline out-of-band data. --- sys/kern/uipc_socket.c | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c index 35355d3a00ae..160c89eec8c0 100644 --- a/sys/kern/uipc_socket.c +++ b/sys/kern/uipc_socket.c @@ -838,7 +838,7 @@ out: * data from a socket. For more complete comments, see soreceive(), from * which this code originated. * - * XXXRW: Note that soreceive_rcvoob(), unlike the remainder of soreceve(), + * XXXRW: Note that soreceive_rcvoob(), unlike the remainder of soreiceve(), * is unable to return an mbuf chain to the caller. */ static int @@ -1007,6 +1007,21 @@ restart: goto restart; } dontblock: + /* + * From this point onward, we maintain 'nextrecord' as a cache of the + * pointer to the next record in the socket buffer. We must keep the + * various socket buffer pointers and local stack versions of the + * pointers in sync, pushing out modifications before dropping the + * socket buffer mutex, and re-reading them when picking it up. + * + * Otherwise, we will race with the network stack appending new data + * or records onto the socket buffer by using inconsistent/stale + * versions of the field, possibly resulting in socket buffer + * corruption. + * + * By holding the high-level sblock(), we prevent simultaneous + * readers from pulling off the front of the socket buffer. + */ SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (uio->uio_td) uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++; @@ -1031,6 +1046,13 @@ dontblock: m->m_nextpkt = nextrecord; } } + + /* + * Process one or more MT_CONTROL mbufs present before any data mbufs + * in the first mbuf chain on the socket buffer. If MSG_PEEK, we + * just copy the data; if !MSG_PEEK, we call into the protocol to + * perform externalization. + */ while (m != NULL && m->m_type == MT_CONTROL && error == 0) { if (flags & MSG_PEEK) { if (controlp != NULL) @@ -1085,9 +1107,21 @@ dontblock: SBLASTRECORDCHK(&so->so_rcv); SBLASTMBUFCHK(&so->so_rcv); + /* + * Now continue to read any data mbufs off of the head of the socket + * buffer until the read request is satisfied. Note that 'type' is + * used to store the type of any mbuf reads that have happened so far + * such that soreceive() can stop reading if the type changes, which + * causes soreceive() to return only one of regular data and inline + * out-of-band data in a single socket receive operation. + */ moff = 0; offset = 0; while (m != NULL && uio->uio_resid > 0 && error == 0) { + /* + * If the type of mbuf has changed since the last mbuf + * examined ('type'), end the receive operation. + */ SOCKBUF_LOCK_ASSERT(&so->so_rcv); if (m->m_type == MT_OOBDATA) { if (type != MT_OOBDATA)