Continue to refine inpcb reference counting and locking, in preparation for
reworking of inpcbinfo locking:

(1) Convert inpcb reference counting from manually manipulated integers to
    the refcount(9) KPI.  This allows the refcount to be managed atomically
    with an inpcb read lock rather than write lock, or even with no inpcb
    lock at all.  As a result, in_pcbref() also no longer requires an inpcb
    lock, so it can be performed while holding only the lock used to look up
    the inpcb.

(2) Shift more inpcb freeing activity from the in_pcbrele() context (via
    in_pcbfree_internal) to the explicit in_pcbfree() context.  This means
    that the inpcb refcount is increasingly used only to maintain memory
    stability, not to defer the cleanup of inpcb protocol parts.
    This is desirable as many of those protocol parts required the pcbinfo
    lock, which we'd like not to acquire in in_pcbrele() contexts.  Document
    this in comments better.

(3) Introduce new read-locked and write-locked in_pcbrele() variations,
    in_pcbrele_rlocked() and in_pcbrele_wlocked(), which allow the inpcb to
    be properly unlocked as needed.  in_pcbrele() is a wrapper around the
    latter, and should probably go away at some point.  This makes it
    easier to use this weak reference model when holding only a read lock,
    as will happen in the future.

This may well be safe to MFC, but some more KBI analysis is required.

Reviewed by:    bz
MFC after:      3 weeks
Sponsored by:   Juniper Networks, Inc.
This commit is contained in:
Robert Watson 2011-05-23 19:32:02 +00:00
parent f53edc909e
commit 79bdc6e5d3
2 changed files with 115 additions and 85 deletions

View File

@ -2,8 +2,12 @@
* Copyright (c) 1982, 1986, 1991, 1993, 1995
* The Regents of the University of California.
* Copyright (c) 2007-2009 Robert N. M. Watson
* Copyright (c) 2010-2011 Juniper Networks, Inc.
* All rights reserved.
*
* Portions of this software were developed by Robert N. M. Watson under
* contract to Juniper Networks, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@ -50,6 +54,7 @@ __FBSDID("$FreeBSD$");
#include <sys/socketvar.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/refcount.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
@ -287,7 +292,7 @@ in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
#endif
INP_WLOCK(inp);
inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
inp->inp_refcount = 1; /* Reference from the inpcbinfo */
refcount_init(&inp->inp_refcount, 1); /* Reference from inpcbinfo */
#if defined(IPSEC) || defined(MAC)
out:
if (error != 0) {
@ -1028,26 +1033,121 @@ in_pcbdetach(struct inpcb *inp)
}
/*
* in_pcbfree_internal() frees an inpcb that has been detached from its
* socket, and whose reference count has reached 0. It will also remove the
* inpcb from any global lists it might remain on.
* in_pcbref() bumps the reference count on an inpcb in order to maintain
* stability of an inpcb pointer despite the inpcb lock being released. This
* is used in TCP when the inpcbinfo lock needs to be acquired or upgraded,
* but where the inpcb lock is already held.
*
* in_pcbref() should be used only to provide brief memory stability, and
* must always be followed by a call to INP_WLOCK() and in_pcbrele() to
* garbage collect the inpcb if it has been in_pcbfree()'d from another
* context. Until in_pcbrele() has returned that the inpcb is still valid,
* lock and rele are the *only* safe operations that may be performed on the
* inpcb.
*
* While the inpcb will not be freed, releasing the inpcb lock means that the
* connection's state may change, so the caller should be careful to
* revalidate any cached state on reacquiring the lock. Drop the reference
* using in_pcbrele().
*/
static void
in_pcbfree_internal(struct inpcb *inp)
void
in_pcbref(struct inpcb *inp)
{
struct inpcbinfo *ipi = inp->inp_pcbinfo;
KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
KASSERT(inp->inp_refcount == 0, ("%s: refcount !0", __func__));
INP_INFO_WLOCK_ASSERT(ipi);
INP_WLOCK_ASSERT(inp);
KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
refcount_acquire(&inp->inp_refcount);
}
/*
* Drop a refcount on an inpcb elevated using in_pcbref(); because a call to
* in_pcbfree() may have been made between in_pcbref() and in_pcbrele(), we
* return a flag indicating whether the inpcb was freed: 0 means it remains
* valid and we return with the inpcb lock still held; 1 means the inpcb was
* unlocked and freed, and must not be touched again.
*
* Notice that, unlike in_pcbref(), the inpcb lock must be held to drop a
* reference on an inpcb. Historically more work was done here (actually, in
* in_pcbfree_internal()) but has been moved to in_pcbfree() to avoid the
* need for the pcbinfo lock in in_pcbrele(). Deferring the free is entirely
* about memory stability (and continued use of the write lock).
*/
int
in_pcbrele_rlocked(struct inpcb *inp)
{
	struct inpcbinfo *ipi;

	/* Caller must hold a reference and the inpcb read lock. */
	KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
	INP_RLOCK_ASSERT(inp);

	if (!refcount_release(&inp->inp_refcount)) {
		/* Other references remain: inpcb is still valid and locked. */
		return (0);
	}

	/* Last reference dropped; the inpcb must already be detached. */
	KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));

	/* Drop the read lock, then hand the memory back to the pcbinfo zone. */
	INP_RUNLOCK(inp);
	ipi = inp->inp_pcbinfo;
	uma_zfree(ipi->ipi_zone, inp);
	return (1);
}
/*
 * Write-locked counterpart of in_pcbrele_rlocked(): drop one reference on an
 * inpcb whose write lock is held by the caller.
 *
 * Returns 0 when other references remain (the inpcb is still valid and still
 * write-locked), or 1 when this was the last reference, in which case the
 * write lock has been released and the inpcb freed via uma_zfree().
 */
int
in_pcbrele_wlocked(struct inpcb *inp)
{
	struct inpcbinfo *ipi;

	/* Caller must hold a reference and the inpcb write lock. */
	KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
	INP_WLOCK_ASSERT(inp);

	if (!refcount_release(&inp->inp_refcount)) {
		/* Other references remain: inpcb is still valid and locked. */
		return (0);
	}

	/* Last reference dropped; the inpcb must already be detached. */
	KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));

	/* Drop the write lock, then hand the memory back to the pcbinfo zone. */
	INP_WUNLOCK(inp);
	ipi = inp->inp_pcbinfo;
	uma_zfree(ipi->ipi_zone, inp);
	return (1);
}
/*
 * Temporary wrapper: in_pcbrele() forwards to the write-locked release,
 * in_pcbrele_wlocked(), and passes its return value through unchanged.
 */
int
in_pcbrele(struct inpcb *inp)
{
	int freed;

	freed = in_pcbrele_wlocked(inp);
	return (freed);
}
/*
* Unconditionally schedule an inpcb to be freed by decrementing its
* reference count, which should occur only after the inpcb has been detached
* from its socket. If another thread holds a temporary reference (acquired
* using in_pcbref()) then the free is deferred until that reference is
* released using in_pcbrele(), but the inpcb is still unlocked. Almost all
* work, including removal from global lists, is done in this context, where
* the pcbinfo lock is held.
*/
void
in_pcbfree(struct inpcb *inp)
{
struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
INP_INFO_WLOCK_ASSERT(pcbinfo);
INP_WLOCK_ASSERT(inp);
/* XXXRW: Do as much as possible here. */
#ifdef IPSEC
if (inp->inp_sp != NULL)
ipsec_delete_pcbpolicy(inp);
#endif /* IPSEC */
inp->inp_gencnt = ++ipi->ipi_gencnt;
inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
in_pcbremlists(inp);
#ifdef INET6
if (inp->inp_vflag & INP_IPV6PROTO) {
@ -1064,82 +1164,10 @@ in_pcbfree_internal(struct inpcb *inp)
#endif
inp->inp_vflag = 0;
crfree(inp->inp_cred);
#ifdef MAC
mac_inpcb_destroy(inp);
#endif
INP_WUNLOCK(inp);
uma_zfree(ipi->ipi_zone, inp);
}
/*
* in_pcbref() bumps the reference count on an inpcb in order to maintain
* stability of an inpcb pointer despite the inpcb lock being released. This
* is used in TCP when the inpcbinfo lock needs to be acquired or upgraded,
* but where the inpcb lock is already held.
*
* While the inpcb will not be freed, releasing the inpcb lock means that the
* connection's state may change, so the caller should be careful to
* revalidate any cached state on reacquiring the lock. Drop the reference
* using in_pcbrele().
*/
void
in_pcbref(struct inpcb *inp)
{
INP_WLOCK_ASSERT(inp);
KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
inp->inp_refcount++;
}
/*
* Drop a refcount on an inpcb elevated using in_pcbref(); because a call to
* in_pcbfree() may have been made between in_pcbref() and in_pcbrele(), we
* return a flag indicating whether or not the inpcb remains valid. If it is
* valid, we return with the inpcb lock held.
*/
int
in_pcbrele(struct inpcb *inp)
{
#ifdef INVARIANTS
struct inpcbinfo *ipi = inp->inp_pcbinfo;
#endif
KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));
INP_INFO_WLOCK_ASSERT(ipi);
INP_WLOCK_ASSERT(inp);
inp->inp_refcount--;
if (inp->inp_refcount > 0)
return (0);
in_pcbfree_internal(inp);
return (1);
}
/*
* Unconditionally schedule an inpcb to be freed by decrementing its
* reference count, which should occur only after the inpcb has been detached
* from its socket. If another thread holds a temporary reference (acquired
* using in_pcbref()) then the free is deferred until that reference is
* released using in_pcbrele(), but the inpcb is still unlocked.
*/
void
in_pcbfree(struct inpcb *inp)
{
#ifdef INVARIANTS
struct inpcbinfo *ipi = inp->inp_pcbinfo;
#endif
KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL",
__func__));
INP_INFO_WLOCK_ASSERT(ipi);
INP_WLOCK_ASSERT(inp);
if (!in_pcbrele(inp))
if (!in_pcbrele_wlocked(inp))
INP_WUNLOCK(inp);
}

View File

@ -534,6 +534,8 @@ void in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr,
void in_pcbref(struct inpcb *);
void in_pcbrehash(struct inpcb *);
int in_pcbrele(struct inpcb *);
int in_pcbrele_rlocked(struct inpcb *);
int in_pcbrele_wlocked(struct inpcb *);
void in_pcbsetsolabel(struct socket *so);
int in_getpeeraddr(struct socket *so, struct sockaddr **nam);
int in_getsockaddr(struct socket *so, struct sockaddr **nam);