jail: Use refcount(9) for prison references.

Use refcount(9) for both pr_ref and pr_uref in struct prison.  This
allows prisons to be held and freed without requiring the prison mutex.
An exception to this is that dropping the last reference will still
lock the prison, to keep the guarantee that a locked prison remains
valid and alive (provided it was at the time it was locked).

Among other things, this honors the promise made in a comment in
crcopy(9) that it will not block, a promise which hasn't been true for
two decades.
Author: Jamie Gritton
Date:   2021-01-20 15:08:27 -08:00
Commit: 6754ae2572 (parent e3dd8ed77b)

2 changed files with 91 additions and 63 deletions
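
The pattern, in outline: taking a reference, and dropping a reference that cannot be the last one, touch only the counter, while a release that may reach zero takes the prison mutex first; that is what preserves the locked-prison guarantee described above. The following is a minimal sketch of the scheme using the same refcount(9) primitives the diff below introduces. The struct obj type, the M_TEMP allocation, and obj_destroy() are illustrative stand-ins, not code from this commit.

    #include <sys/param.h>
    #include <sys/lock.h>
    #include <sys/malloc.h>
    #include <sys/mutex.h>
    #include <sys/refcount.h>

    struct obj {
            struct mtx      obj_mtx;        /* serializes the final release */
            volatile u_int  obj_ref;        /* managed by refcount(9) */
    };

    static void
    obj_hold(struct obj *o)
    {
            /* Lock-free: the count only moves between nonzero values. */
            refcount_acquire(&o->obj_ref);
    }

    static void
    obj_destroy(struct obj *o)
    {
            /* Illustrative teardown; assumes a malloc(9) allocation. */
            mtx_destroy(&o->obj_mtx);
            free(o, M_TEMP);
    }

    static void
    obj_free(struct obj *o)
    {
            /* Fast path: drop a reference known not to be the last one. */
            if (refcount_release_if_not_last(&o->obj_ref))
                    return;
            /*
             * Slow path: take the lock around the possible 1 -> 0
             * transition, so a locked object stays valid until unlocked.
             */
            mtx_lock(&o->obj_mtx);
            if (refcount_release(&o->obj_ref)) {
                    mtx_unlock(&o->obj_mtx);
                    obj_destroy(o);
                    return;
            }
            mtx_unlock(&o->obj_mtx);
    }

The prison code below differs in one respect: the final teardown is handed to a taskqueue, because the last release can arrive from a context that already holds other locks.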

--- a/sys/kern/kern_jail.c
+++ b/sys/kern/kern_jail.c

@@ -1116,7 +1116,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
 name_again:
                 deadpr = NULL;
                 FOREACH_PRISON_CHILD(ppr, tpr) {
-                        if (tpr != pr && tpr->pr_ref > 0 &&
+                        if (tpr != pr &&
                             !strcmp(tpr->pr_name + pnamelen, namelc)) {
                                 mtx_lock(&tpr->pr_mtx);
                                 if (prison_isalive(tpr)) {
@@ -1199,8 +1199,8 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
                             prison_name(mypr, ppr));
                         goto done_deref;
                 }
-                ppr->pr_ref++;
-                ppr->pr_uref++;
+                prison_hold(ppr);
+                refcount_acquire(&ppr->pr_uref);
                 mtx_unlock(&ppr->pr_mtx);
 
                 if (jid == 0 && (jid = get_next_prid(&inspr)) == 0) {
@@ -1315,7 +1315,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
                  * Grab a reference for existing prisons, to ensure they
                  * continue to exist for the duration of the call.
                  */
-                pr->pr_ref++;
+                prison_hold(pr);
                 drflags |= PD_DEREF;
 #if defined(VIMAGE) && (defined(INET) || defined(INET6))
                 if ((pr->pr_flags & PR_VNET) &&
@@ -1434,7 +1434,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
 #ifdef VIMAGE
                             (tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
 #endif
-                            tpr->pr_uref == 0) {
+                            refcount_load(&tpr->pr_uref) == 0) {
                                 descend = 0;
                                 continue;
                         }
@@ -1502,7 +1502,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
 #ifdef VIMAGE
                             (tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
 #endif
-                            tpr->pr_uref == 0) {
+                            refcount_load(&tpr->pr_uref) == 0) {
                                 descend = 0;
                                 continue;
                         }
@@ -1738,11 +1738,11 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
         born = !prison_isalive(pr);
         if (!created && (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags))) {
                 if (pr_flags & PR_PERSIST) {
-                        pr->pr_ref++;
-                        pr->pr_uref++;
+                        prison_hold(pr);
+                        refcount_acquire(&pr->pr_uref);
                 } else {
-                        pr->pr_ref--;
-                        pr->pr_uref--;
+                        refcount_release(&pr->pr_uref);
+                        refcount_release(&pr->pr_ref);
                 }
         }
         pr->pr_flags = (pr->pr_flags & ~ch_flags) | pr_flags;
@@ -1857,8 +1857,8 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
         if (pr_flags & PR_PERSIST) {
                 mtx_lock(&pr->pr_mtx);
                 drflags |= PD_LOCKED;
-                pr->pr_ref++;
-                pr->pr_uref++;
+                refcount_acquire(&pr->pr_ref);
+                refcount_acquire(&pr->pr_uref);
         } else {
                 /* Non-persistent jails need no further changes. */
                 pr = NULL;
@@ -1933,7 +1933,8 @@ get_next_prid(struct prison **insprp)
         TAILQ_FOREACH(inspr, &allprison, pr_list) {
                 if (inspr->pr_id < jid)
                         continue;
-                if (inspr->pr_id > jid || inspr->pr_ref == 0) {
+                if (inspr->pr_id > jid ||
+                    refcount_load(&inspr->pr_ref) == 0) {
                         /*
                          * Found an opening.  This may be a gap
                          * in the list, or a dead jail with the
@@ -2096,7 +2097,7 @@ kern_jail_get(struct thread *td, struct uio *optuio, int flags)
 
 found_prison:
         /* Get the parameters of the prison. */
-        pr->pr_ref++;
+        prison_hold(pr);
         drflags |= PD_DEREF;
         td->td_retval[0] = pr->pr_id;
         error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id));
@@ -2309,7 +2310,7 @@ sys_jail_remove(struct thread *td, struct jail_remove_args *uap)
         }
 
         /* Remove all descendants of this prison, then remove this prison. */
-        pr->pr_ref++;
+        prison_hold(pr);
         if (!LIST_EMPTY(&pr->pr_children)) {
                 mtx_unlock(&pr->pr_mtx);
                 lpr = NULL;
@@ -2317,7 +2318,7 @@ sys_jail_remove(struct thread *td, struct jail_remove_args *uap)
                 mtx_lock(&cpr->pr_mtx);
                 if (prison_isvalid(cpr)) {
                         tpr = cpr;
-                        cpr->pr_ref++;
+                        prison_hold(cpr);
                 } else {
                         /* Already removed - do not do it again. */
                         tpr = NULL;
@@ -2351,18 +2352,19 @@ prison_remove_one(struct prison *pr)
 
         /* If the prison was persistent, it is not anymore. */
         if (pr->pr_flags & PR_PERSIST) {
-                pr->pr_ref--;
+                refcount_release(&pr->pr_ref);
                 drflags |= PD_DEUREF;
                 pr->pr_flags &= ~PR_PERSIST;
         }
 
         /*
          * jail_remove added a reference.  If that's the only one, remove
-         * the prison now.
+         * the prison now.  refcount(9) doesn't guarantee the cache coherence
+         * of non-zero counters, so force it here.
          */
-        KASSERT(pr->pr_ref > 0,
+        KASSERT(refcount_load(&pr->pr_ref) > 0,
             ("prison_remove_one removing a dead prison (jid=%d)", pr->pr_id));
-        if (pr->pr_ref == 1) {
+        if (atomic_load_acq_int(&pr->pr_ref) == 1) {
                 prison_deref(pr, drflags);
                 return;
         }
@@ -2440,8 +2442,8 @@ do_jail_attach(struct thread *td, struct prison *pr)
          * a process root from one prison, but attached to the jail
          * of another.
          */
-        pr->pr_ref++;
-        pr->pr_uref++;
+        refcount_acquire(&pr->pr_ref);
+        refcount_acquire(&pr->pr_uref);
         mtx_unlock(&pr->pr_mtx);
 
         /* Let modules do whatever they need to prepare for attaching. */
@@ -2614,19 +2616,21 @@ void
 prison_hold_locked(struct prison *pr)
 {
 
-        mtx_assert(&pr->pr_mtx, MA_OWNED);
-        KASSERT(pr->pr_ref > 0,
-            ("Trying to hold dead prison %p (jid=%d).", pr, pr->pr_id));
-        pr->pr_ref++;
+        /* Locking is no longer required. */
+        prison_hold(pr);
 }
 
 void
 prison_hold(struct prison *pr)
 {
+#ifdef INVARIANTS
+        int was_valid = refcount_acquire_if_not_zero(&pr->pr_ref);
 
-        mtx_lock(&pr->pr_mtx);
-        prison_hold_locked(pr);
-        mtx_unlock(&pr->pr_mtx);
+        KASSERT(was_valid,
+            ("Trying to hold dead prison %p (jid=%d).", pr, pr->pr_id));
+#else
+        refcount_acquire(&pr->pr_ref);
+#endif
 }
 
 /*
@@ -2637,14 +2641,17 @@ prison_hold(struct prison *pr)
 void
 prison_free_locked(struct prison *pr)
 {
-        int ref;
+        int lastref;
 
         mtx_assert(&pr->pr_mtx, MA_OWNED);
-        ref = --pr->pr_ref;
+        KASSERT(refcount_load(&pr->pr_ref) > 0,
+            ("Trying to free dead prison %p (jid=%d).",
+             pr, pr->pr_id));
+        lastref = refcount_release(&pr->pr_ref);
         mtx_unlock(&pr->pr_mtx);
-        if (ref == 0) {
+        if (lastref) {
                 /*
-                 * Don't remove the last reference in this context,
+                 * Don't remove the prison itself in this context,
                  * in case there are locks held.
                  */
                 taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
@@ -2655,6 +2662,16 @@ void
 prison_free(struct prison *pr)
 {
 
+        /*
+         * Locking is only required when releasing the last reference.
+         * This allows assurance that a locked prison will remain valid
+         * until it is unlocked.
+         */
+        KASSERT(refcount_load(&pr->pr_ref) > 0,
+            ("Trying to free dead prison %p (jid=%d).",
+             pr, pr->pr_id));
+        if (refcount_release_if_not_last(&pr->pr_ref))
+                return;
         mtx_lock(&pr->pr_mtx);
         prison_free_locked(pr);
 }
@@ -2670,12 +2687,14 @@ prison_free(struct prison *pr)
 void
 prison_proc_hold(struct prison *pr)
 {
+#ifdef INVARIANTS
+        int was_alive = refcount_acquire_if_not_zero(&pr->pr_uref);
 
-        mtx_lock(&pr->pr_mtx);
-        KASSERT(pr->pr_uref > 0,
+        KASSERT(was_alive,
             ("Cannot add a process to a non-alive prison (jid=%d)", pr->pr_id));
-        pr->pr_uref++;
-        mtx_unlock(&pr->pr_mtx);
+#else
+        refcount_acquire(&pr->pr_uref);
+#endif
 }
 
 /*
@@ -2686,20 +2705,27 @@ prison_proc_hold(struct prison *pr)
 void
 prison_proc_free(struct prison *pr)
 {
+        int lasturef;
 
-        mtx_lock(&pr->pr_mtx);
-        KASSERT(pr->pr_uref > 0,
+        /*
+         * Locking is only required when releasing the last reference.
+         * This allows assurance that a locked prison will remain alive
+         * until it is unlocked.
+         */
+        KASSERT(refcount_load(&pr->pr_uref) > 0,
             ("Trying to kill a process in a dead prison (jid=%d)", pr->pr_id));
-        if (pr->pr_uref > 1)
-                pr->pr_uref--;
-        else {
+        if (refcount_release_if_not_last(&pr->pr_uref))
+                return;
+        mtx_lock(&pr->pr_mtx);
+        lasturef = refcount_release(&pr->pr_uref);
+        if (lasturef) {
                 /*
                  * Don't remove the last user reference in this context,
                  * which is expected to be a process that is not only locked,
                  * but also half dead.  Add a reference so any calls to
                  * prison_free() won't re-submit the task.
                  */
-                pr->pr_ref++;
+                refcount_acquire(&pr->pr_ref);
                 mtx_unlock(&pr->pr_mtx);
                 taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
                 return;
@@ -2723,7 +2749,7 @@ prison_complete(void *context, int pending)
          * was added.  No references are expected if this is completing a call
          * to prison_free, but prison_deref is still called for the cleanup.
          */
-        prison_deref(pr, pr->pr_uref > 0
+        prison_deref(pr, refcount_load(&pr->pr_uref) > 0
             ? PD_DEREF | PD_DEUREF | PD_LOCKED | PD_LIST_XLOCKED
             : PD_LOCKED | PD_LIST_XLOCKED);
 }
@@ -2740,29 +2766,30 @@ static void
 prison_deref(struct prison *pr, int flags)
 {
         struct prison *ppr, *tpr;
-        int ref, lasturef;
+        int lastref, lasturef;
 
         if (!(flags & PD_LOCKED))
                 mtx_lock(&pr->pr_mtx);
         for (;;) {
                 if (flags & PD_DEUREF) {
-                        KASSERT(pr->pr_uref > 0,
+                        KASSERT(refcount_load(&pr->pr_uref) > 0,
                             ("prison_deref PD_DEUREF on a dead prison (jid=%d)",
                              pr->pr_id));
-                        pr->pr_uref--;
-                        lasturef = pr->pr_uref == 0;
+                        lasturef = refcount_release(&pr->pr_uref);
                         if (lasturef)
-                                pr->pr_ref++;
-                        KASSERT(prison0.pr_uref != 0, ("prison0 pr_uref=0"));
+                                refcount_acquire(&pr->pr_ref);
+                        KASSERT(refcount_load(&prison0.pr_uref) > 0,
+                            ("prison0 pr_uref=0"));
                 } else
                         lasturef = 0;
                 if (flags & PD_DEREF) {
-                        KASSERT(pr->pr_ref > 0,
+                        KASSERT(refcount_load(&pr->pr_ref) > 0,
                             ("prison_deref PD_DEREF on a dead prison (jid=%d)",
                              pr->pr_id));
-                        pr->pr_ref--;
+                        lastref = refcount_release(&pr->pr_ref);
                 }
-                ref = pr->pr_ref;
+                else
+                        lastref = refcount_load(&pr->pr_ref) == 0;
                 mtx_unlock(&pr->pr_mtx);
 
                 /*
@@ -2771,7 +2798,7 @@ prison_deref(struct prison *pr, int flags)
                  */
                 if (lasturef) {
                         if (!(flags & (PD_LIST_SLOCKED | PD_LIST_XLOCKED))) {
-                                if (ref > 1) {
+                                if (atomic_load_acq_int(&pr->pr_ref) > 1) {
                                         sx_slock(&allprison_lock);
                                         flags |= PD_LIST_SLOCKED;
                                 } else {
@@ -2781,12 +2808,12 @@ prison_deref(struct prison *pr, int flags)
                         }
                         (void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL);
                         mtx_lock(&pr->pr_mtx);
-                        ref = --pr->pr_ref;
+                        lastref = refcount_release(&pr->pr_ref);
                         mtx_unlock(&pr->pr_mtx);
                 }
 
                 /* If the prison still has references, nothing else to do. */
-                if (ref > 0) {
+                if (!lastref) {
                         if (flags & PD_LIST_SLOCKED)
                                 sx_sunlock(&allprison_lock);
                         else if (flags & PD_LIST_XLOCKED)
@@ -3008,9 +3035,9 @@ prison_isalive(struct prison *pr)
 {
 
         mtx_assert(&pr->pr_mtx, MA_OWNED);
-        if (__predict_false(pr->pr_ref == 0))
+        if (__predict_false(refcount_load(&pr->pr_ref) == 0))
                 return (false);
-        if (__predict_false(pr->pr_uref == 0))
+        if (__predict_false(refcount_load(&pr->pr_uref) == 0))
                 return (false);
         return (true);
 }
@@ -3025,7 +3052,7 @@ prison_isvalid(struct prison *pr)
 {
 
         mtx_assert(&pr->pr_mtx, MA_OWNED);
-        if (__predict_false(pr->pr_ref == 0))
+        if (__predict_false(refcount_load(&pr->pr_ref) == 0))
                 return (false);
         return (true);
 }
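
One subtlety in the hunks above: most converted reads use refcount_load(), a plain atomic load that is sufficient for assertions, while prison_remove_one() and prison_deref() test for the final reference with atomic_load_acq_int(), because, as the updated comment notes, refcount(9) does not guarantee the cache coherence of non-zero counter values. A rough sketch of the distinction follows; sole_reference() is a hypothetical helper, not code from this commit.

    #include <sys/param.h>
    #include <sys/systm.h>
    #include <machine/atomic.h>
    #include <sys/refcount.h>

    /*
     * A possibly-stale value is fine for a sanity check, but the
     * "is this the only remaining reference?" decision uses an
     * acquire load so it is ordered against releases on other CPUs.
     */
    static bool
    sole_reference(volatile u_int *cnt)
    {
            KASSERT(refcount_load(cnt) > 0, ("counter already zero"));
            return (atomic_load_acq_int(cnt) == 1);
    }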

--- a/sys/sys/jail.h
+++ b/sys/sys/jail.h

@@ -150,17 +150,18 @@ struct prison_racct;
  *
  * Lock key:
  *   (a) allprison_lock
+ *   (c) set only during creation before the structure is shared, no mutex
+ *       required to read
  *   (m) locked by pr_mtx
  *   (p) locked by pr_mtx, and also at least shared allprison_lock required
  *       to update
- *   (c) set only during creation before the structure is shared, no mutex
- *       required to read
+ *   (r) atomic via refcount(9), pr_mtx required to decrement to zero
  */
 struct prison {
         TAILQ_ENTRY(prison) pr_list;                    /* (a) all prisons */
         int              pr_id;                         /* (c) prison id */
-        int              pr_ref;                        /* (m) refcount */
-        int              pr_uref;                       /* (m) user (alive) refcount */
+        volatile u_int   pr_ref;                        /* (r) refcount */
+        volatile u_int   pr_uref;                       /* (r) user (alive) refcount */
         unsigned         pr_flags;                      /* (p) PR_* flags */
         LIST_HEAD(, prison) pr_children;                /* (a) list of child jails */
         LIST_ENTRY(prison) pr_sibling;                  /* (a) next in parent's list */
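
The new "(r)" lock key leans on two conditional primitives from sys/sys/refcount.h, refcount_acquire_if_not_zero() and refcount_release_if_not_last(), to keep the common paths lock-free. Their semantics can be sketched with C11 atomics as follows; this illustrates the contract only and is not the kernel's implementation, which is built on the machine/atomic.h primitives.

    #include <stdatomic.h>
    #include <stdbool.h>

    /* Decrement only if this cannot drop the final reference. */
    static bool
    release_if_not_last(_Atomic unsigned *count)
    {
            unsigned old = atomic_load(count);

            while (old > 1) {
                    /* Try old -> old - 1; on a race, old is reloaded. */
                    if (atomic_compare_exchange_weak(count, &old, old - 1))
                            return (true);
            }
            return (false); /* count is 1: caller must lock and release */
    }

    /* Increment only if the object still has at least one reference. */
    static bool
    acquire_if_not_zero(_Atomic unsigned *count)
    {
            unsigned old = atomic_load(count);

            while (old > 0) {
                    if (atomic_compare_exchange_weak(count, &old, old + 1))
                            return (true);
            }
            return (false); /* count is 0: object is already being torn down */
    }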