freebsd-skq/sys/kern/uipc_accf.c
glebius e35d543ec1 Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
  fields that belong to normal dataflow, and unionize them.  This
  shrinks the structure a bit.
  - Take out selinfo's from the socket buffers into the socket. The
    first reason is to support braindamaged scenario when a socket is
    added to kevent(2) and then listen(2) is cast on it. The second
    reason is that there is future plan to make socket buffers pluggable,
    so that for a dataflow socket a socket buffer can be changed, and
    in this case we also want to keep same selinfos through the lifetime
    of a socket.
  - Remove struct struct so_accf. Since now listening stuff no longer
    affects struct socket size, just move its fields into listening part
    of the union.
  - Provide sol_upcall field and enforce that so_upcall_set() may be called
    only on a dataflow socket, which has buffers, and for listening sockets
    provide solisten_upcall_set().

o Remove ACCEPT_LOCK() global.
  - Add a mutex to socket, to be used instead of socket buffer lock to lock
    fields of struct socket that don't belong to a socket buffer.
  - Allow to acquire two socket locks, but the first one must belong to a
    listening socket.
  - Make soref()/sorele() to use atomic(9).  This allows in some situations
    to do soref() without owning socket lock.  There is place for improvement
    here, it is possible to make sorele() also to lock optionally.
  - Most protocols aren't touched by this change, except UNIX local sockets.
    See below for more information.

o Reduce copy-and-paste in kernel modules that accept connections from
  listening sockets: provide function solisten_dequeue(), and use it in
  the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
  infiniband, rpc.

o UNIX local sockets.
  - Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
    local sockets.  Most races exist around spawning a new socket, when we
    are connecting to a local listening socket.  To cover them, we need to
    hold locks on both PCBs when spawning a third one.  This means holding
    them across sonewconn().  This creates a LOR between pcb locks and
    unp_list_lock.
  - To fix the new LOR, abandon the global unp_list_lock in favor of global
    unp_link_lock.  Indeed, separating these two locks didn't provide us any
    extra parralelism in the UNIX sockets.
  - Now call into uipc_attach() may happen with unp_link_lock hold if, we
    are accepting, or without unp_link_lock in case if we are just creating
    a socket.
  - Another problem in UNIX sockets is that uipc_close() basicly did nothing
    for a listening socket.  The vnode remained opened for connections.  This
    is fixed by removing vnode in uipc_close().  Maybe the right way would be
    to do it for all sockets (not only listening), simply move the vnode
    teardown from uipc_detach() to uipc_close()?

Sponsored by:		Netflix
Differential Revision:	https://reviews.freebsd.org/D9770
2017-06-08 21:30:34 +00:00

307 lines
8.0 KiB
C

/*-
* Copyright (c) 2000 Paycounter, Inc.
* Copyright (c) 2005 Robert N. M. Watson
* Author: Alfred Perlstein <alfred@paycounter.com>, <alfred@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#define ACCEPT_FILTER_MOD
#include "opt_param.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/protosw.h>
#include <sys/sysctl.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/queue.h>
static struct mtx accept_filter_mtx;
MTX_SYSINIT(accept_filter, &accept_filter_mtx, "accept_filter_mtx",
MTX_DEF);
#define ACCEPT_FILTER_LOCK() mtx_lock(&accept_filter_mtx)
#define ACCEPT_FILTER_UNLOCK() mtx_unlock(&accept_filter_mtx)
static SLIST_HEAD(, accept_filter) accept_filtlsthd =
SLIST_HEAD_INITIALIZER(accept_filtlsthd);
MALLOC_DEFINE(M_ACCF, "accf", "accept filter data");
static int unloadable = 0;
SYSCTL_NODE(_net, OID_AUTO, accf, CTLFLAG_RW, 0, "Accept filters");
SYSCTL_INT(_net_accf, OID_AUTO, unloadable, CTLFLAG_RW, &unloadable, 0,
"Allow unload of accept filters (not recommended)");
/*
* Must be passed a malloc'd structure so we don't explode if the kld is
* unloaded, we leak the struct on deallocation to deal with this, but if a
* filter is loaded with the same name as a leaked one we re-use the entry.
*/
int
accept_filt_add(struct accept_filter *filt)
{
struct accept_filter *p;
ACCEPT_FILTER_LOCK();
SLIST_FOREACH(p, &accept_filtlsthd, accf_next)
if (strcmp(p->accf_name, filt->accf_name) == 0) {
if (p->accf_callback != NULL) {
ACCEPT_FILTER_UNLOCK();
return (EEXIST);
} else {
p->accf_callback = filt->accf_callback;
ACCEPT_FILTER_UNLOCK();
free(filt, M_ACCF);
return (0);
}
}
if (p == NULL)
SLIST_INSERT_HEAD(&accept_filtlsthd, filt, accf_next);
ACCEPT_FILTER_UNLOCK();
return (0);
}
int
accept_filt_del(char *name)
{
struct accept_filter *p;
p = accept_filt_get(name);
if (p == NULL)
return (ENOENT);
p->accf_callback = NULL;
return (0);
}
struct accept_filter *
accept_filt_get(char *name)
{
struct accept_filter *p;
ACCEPT_FILTER_LOCK();
SLIST_FOREACH(p, &accept_filtlsthd, accf_next)
if (strcmp(p->accf_name, name) == 0)
break;
ACCEPT_FILTER_UNLOCK();
return (p);
}
int
accept_filt_generic_mod_event(module_t mod, int event, void *data)
{
struct accept_filter *p;
struct accept_filter *accfp = (struct accept_filter *) data;
int error;
switch (event) {
case MOD_LOAD:
p = malloc(sizeof(*p), M_ACCF, M_WAITOK);
bcopy(accfp, p, sizeof(*p));
error = accept_filt_add(p);
break;
case MOD_UNLOAD:
/*
* Do not support unloading yet. we don't keep track of
* refcounts and unloading an accept filter callback and then
* having it called is a bad thing. A simple fix would be to
* track the refcount in the struct accept_filter.
*/
if (unloadable != 0) {
error = accept_filt_del(accfp->accf_name);
} else
error = EOPNOTSUPP;
break;
case MOD_SHUTDOWN:
error = 0;
break;
default:
error = EOPNOTSUPP;
break;
}
return (error);
}
int
accept_filt_getopt(struct socket *so, struct sockopt *sopt)
{
struct accept_filter_arg *afap;
int error;
error = 0;
afap = malloc(sizeof(*afap), M_TEMP, M_WAITOK | M_ZERO);
SOCK_LOCK(so);
if ((so->so_options & SO_ACCEPTCONN) == 0) {
error = EINVAL;
goto out;
}
if (so->sol_accept_filter == NULL) {
error = EINVAL;
goto out;
}
strcpy(afap->af_name, so->sol_accept_filter->accf_name);
if (so->sol_accept_filter_str != NULL)
strcpy(afap->af_arg, so->sol_accept_filter_str);
out:
SOCK_UNLOCK(so);
if (error == 0)
error = sooptcopyout(sopt, afap, sizeof(*afap));
free(afap, M_TEMP);
return (error);
}
int
accept_filt_setopt(struct socket *so, struct sockopt *sopt)
{
struct accept_filter_arg *afap;
struct accept_filter *afp;
char *accept_filter_str = NULL;
void *accept_filter_arg = NULL;
int error;
/*
* Handle the simple delete case first.
*/
if (sopt == NULL || sopt->sopt_val == NULL) {
struct socket *sp, *sp1;
int wakeup;
SOCK_LOCK(so);
if ((so->so_options & SO_ACCEPTCONN) == 0) {
SOCK_UNLOCK(so);
return (EINVAL);
}
if (so->sol_accept_filter == NULL) {
SOCK_UNLOCK(so);
return (0);
}
if (so->sol_accept_filter->accf_destroy != NULL)
so->sol_accept_filter->accf_destroy(so);
if (so->sol_accept_filter_str != NULL)
free(so->sol_accept_filter_str, M_ACCF);
so->sol_accept_filter = NULL;
so->sol_accept_filter_arg = NULL;
so->sol_accept_filter_str = NULL;
so->so_options &= ~SO_ACCEPTFILTER;
/*
* Move from incomplete queue to complete only those
* connections, that are blocked by us.
*/
wakeup = 0;
TAILQ_FOREACH_SAFE(sp, &so->sol_incomp, so_list, sp1) {
SOCK_LOCK(sp);
if (sp->so_options & SO_ACCEPTFILTER) {
TAILQ_REMOVE(&so->sol_incomp, sp, so_list);
TAILQ_INSERT_TAIL(&so->sol_comp, sp, so_list);
sp->so_qstate = SQ_COMP;
sp->so_options &= ~SO_ACCEPTFILTER;
so->sol_incqlen--;
so->sol_qlen++;
wakeup = 1;
}
SOCK_UNLOCK(sp);
}
if (wakeup)
solisten_wakeup(so); /* unlocks */
else
SOLISTEN_UNLOCK(so);
return (0);
}
/*
* Pre-allocate any memory we may need later to avoid blocking at
* untimely moments. This does not optimize for invalid arguments.
*/
afap = malloc(sizeof(*afap), M_TEMP, M_WAITOK);
error = sooptcopyin(sopt, afap, sizeof *afap, sizeof *afap);
afap->af_name[sizeof(afap->af_name)-1] = '\0';
afap->af_arg[sizeof(afap->af_arg)-1] = '\0';
if (error) {
free(afap, M_TEMP);
return (error);
}
afp = accept_filt_get(afap->af_name);
if (afp == NULL) {
free(afap, M_TEMP);
return (ENOENT);
}
if (afp->accf_create != NULL && afap->af_name[0] != '\0') {
size_t len = strlen(afap->af_name) + 1;
accept_filter_str = malloc(len, M_ACCF, M_WAITOK);
strcpy(accept_filter_str, afap->af_name);
}
/*
* Require a listen socket; don't try to replace an existing filter
* without first removing it.
*/
SOCK_LOCK(so);
if ((so->so_options & SO_ACCEPTCONN) == 0 ||
so->sol_accept_filter != NULL) {
error = EINVAL;
goto out;
}
/*
* Invoke the accf_create() method of the filter if required. The
* socket mutex is held over this call, so create methods for filters
* can't block.
*/
if (afp->accf_create != NULL) {
accept_filter_arg = afp->accf_create(so, afap->af_arg);
if (accept_filter_arg == NULL) {
error = EINVAL;
goto out;
}
}
so->sol_accept_filter = afp;
so->sol_accept_filter_arg = accept_filter_arg;
so->sol_accept_filter_str = accept_filter_str;
so->so_options |= SO_ACCEPTFILTER;
out:
SOCK_UNLOCK(so);
if (accept_filter_str != NULL)
free(accept_filter_str, M_ACCF);
free(afap, M_TEMP);
return (error);
}