jail: allow root to implicitly widen its cpuset to attach

The default behavior for attaching processes to jails is that the jail's
cpuset augments the attaching processes, so that it cannot be used to
escalate a user's ability to take advantage of more CPUs than the
administrator wanted them to.

This is problematic when root needs to manage jails that have disjoint
sets with whatever process is attaching, as this would otherwise result
in a deadlock. Therefore, if we did not have an appropriate common
subset of cpus/domains for our new policy, we now allow the process to
simply take on the jail set *if* it has the privilege to widen its mask
anyways.

With the new logic, root can still usefully cpuset a process that
attaches to a jail with the desire of maintaining the set it was given
pre-attachment while still retaining the ability to manage child jails
without jumping through hoops.

A test has been added to demonstrate the issue; cpuset of a process
down to just the first CPU and attempting to attach to a jail without
access to any of the same CPUs previously resulted in EDEADLK and now
results in taking on the jail's mask for privileged users.

PR:		253724
Reviewed by:	jamie (also discussed with)
MFC after:	3 days
Differential Revision:	https://reviews.freebsd.org/D28952
This commit is contained in:
Kyle Evans 2021-02-26 15:46:47 -06:00
parent af11c20290
commit 60c4ec806d
2 changed files with 210 additions and 1 deletions

View File

@ -1,7 +1,7 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause
*
* Copyright (c) 2020 Kyle Evans <kevans@FreeBSD.org>
* Copyright (c) 2020-2021 Kyle Evans <kevans@FreeBSD.org>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@ -31,6 +31,8 @@ __FBSDID("$FreeBSD");
#include <sys/param.h>
#include <sys/cpuset.h>
#include <sys/jail.h>
#include <sys/procdesc.h>
#include <sys/select.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <sys/wait.h>
@ -64,6 +66,10 @@ typedef void (*jail_test_cb)(struct jail_test_cb_params *);
#define FAILURE_JAILSET 44
#define FAILURE_PIDSET 45
#define FAILURE_SEND 46
#define FAILURE_DEADLK 47
#define FAILURE_ATTACH 48
#define FAILURE_BADAFFIN 49
#define FAILURE_SUCCESS 50
static const char *
do_jail_errstr(int error)
@ -80,6 +86,14 @@ do_jail_errstr(int error)
return ("Failed to get the pid setid");
case FAILURE_SEND:
return ("Failed to send(2) cpuset information");
case FAILURE_DEADLK:
return ("Deadlock hit trying to attach to jail");
case FAILURE_ATTACH:
return ("jail_attach(2) failed");
case FAILURE_BADAFFIN:
return ("Unexpected post-attach affinity");
case FAILURE_SUCCESS:
return ("jail_attach(2) succeeded, but should have failed.");
default:
return (NULL);
}
@ -444,6 +458,192 @@ ATF_TC_BODY(jail_attach_plain, tc)
do_jail_test(1, false, &jail_attach_plain_pro, &jail_attach_jset_epi);
}
static int
jail_attach_disjoint_newjail(int fd)
{
struct iovec iov[2];
char *name;
int jid;
if (asprintf(&name, "cpuset_%d", getpid()) == -1)
_exit(42);
iov[0].iov_base = "name";
iov[0].iov_len = sizeof("name");
iov[1].iov_base = name;
iov[1].iov_len = strlen(name) + 1;
if ((jid = jail_set(iov, 2, JAIL_CREATE | JAIL_ATTACH)) < 0)
return (FAILURE_JAIL);
/* Signal that we're ready. */
write(fd, &jid, sizeof(jid));
for (;;) {
/* Spin */
}
}
static int
wait_jail(int fd, int pfd)
{
fd_set lset;
struct timeval tv;
int error, jid, maxfd;
FD_ZERO(&lset);
FD_SET(fd, &lset);
FD_SET(pfd, &lset);
maxfd = MAX(fd, pfd);
tv.tv_sec = 5;
tv.tv_usec = 0;
/* Wait for jid to be written. */
do {
error = select(maxfd + 1, &lset, NULL, NULL, &tv);
} while (error == -1 && errno == EINTR);
if (error == 0) {
atf_tc_fail("Jail creator did not respond in time.");
}
ATF_REQUIRE_MSG(error > 0, "Unexpected error %d from select()", errno);
if (FD_ISSET(pfd, &lset)) {
/* Process died */
atf_tc_fail("Jail creator died unexpectedly.");
}
ATF_REQUIRE(FD_ISSET(fd, &lset));
ATF_REQUIRE_EQ(sizeof(jid), recv(fd, &jid, sizeof(jid), 0));
return (jid);
}
static int
try_attach_child(int jid, cpuset_t *expected_mask)
{
cpuset_t mask;
if (jail_attach(jid) == -1) {
if (errno == EDEADLK)
return (FAILURE_DEADLK);
return (FAILURE_ATTACH);
}
if (expected_mask == NULL)
return (FAILURE_SUCCESS);
/* If we had an expected mask, check it against the new process mask. */
CPU_ZERO(&mask);
if (cpuset_getaffinity(CPU_LEVEL_CPUSET, CPU_WHICH_PID,
-1, sizeof(mask), &mask) != 0) {
return (FAILURE_MASK);
}
if (CPU_CMP(expected_mask, &mask) != 0)
return (FAILURE_BADAFFIN);
return (0);
}
static void
try_attach(int jid, cpuset_t *expected_mask)
{
const char *errstr;
pid_t pid;
int error, fail, status;
ATF_REQUIRE(expected_mask != NULL);
ATF_REQUIRE((pid = fork()) != -1);
if (pid == 0)
_exit(try_attach_child(jid, expected_mask));
while ((error = waitpid(pid, &status, 0)) == -1 && errno == EINTR) {
/* Try again. */
}
/* Sanity check the exit info. */
ATF_REQUIRE_EQ(pid, error);
ATF_REQUIRE(WIFEXITED(status));
if ((fail = WEXITSTATUS(status)) != 0) {
errstr = do_jail_errstr(fail);
if (errstr != NULL)
atf_tc_fail("%s", errstr);
else
atf_tc_fail("Unknown error '%d'", WEXITSTATUS(status));
}
}
ATF_TC(jail_attach_disjoint);
ATF_TC_HEAD(jail_attach_disjoint, tc)
{
atf_tc_set_md_var(tc, "descr",
"Test root attachment into completely disjoint jail cpuset.");
atf_tc_set_md_var(tc, "require.user", "root");
}
ATF_TC_BODY(jail_attach_disjoint, tc)
{
cpuset_t smask, jmask;
int sockpair[2];
cpusetid_t setid;
pid_t pid;
int fcpu, jid, pfd, sock, scpu;
ATF_REQUIRE_EQ(0, cpuset(&setid));
skip_ltncpu(2, &jmask);
fcpu = CPU_FFS(&jmask) - 1;
ATF_REQUIRE_EQ(0, socketpair(PF_UNIX, SOCK_STREAM, 0, sockpair));
/* We'll wait on the procdesc, too, so we can fail faster if it dies. */
ATF_REQUIRE((pid = pdfork(&pfd, 0)) != -1);
if (pid == 0) {
/* First child sets up the jail. */
sock = sockpair[SP_CHILD];
close(sockpair[SP_PARENT]);
_exit(jail_attach_disjoint_newjail(sock));
}
close(sockpair[SP_CHILD]);
sock = sockpair[SP_PARENT];
ATF_REQUIRE((jid = wait_jail(sock, pfd)) > 0);
/*
* This process will be clamped down to the first cpu, while the jail
* will simply have the first CPU removed to make it a completely
* disjoint operation.
*/
CPU_ZERO(&smask);
CPU_SET(fcpu, &smask);
CPU_CLR(fcpu, &jmask);
/*
* We'll test with the first and second cpu set as well. Only the
* second cpu should be used.
*/
scpu = CPU_FFS(&jmask) - 1;
ATF_REQUIRE_EQ(0, cpuset_setaffinity(CPU_LEVEL_ROOT, CPU_WHICH_JAIL,
jid, sizeof(jmask), &jmask));
ATF_REQUIRE_EQ(0, cpuset_setaffinity(CPU_LEVEL_CPUSET, CPU_WHICH_CPUSET,
setid, sizeof(smask), &smask));
try_attach(jid, &jmask);
CPU_SET(scpu, &smask);
ATF_REQUIRE_EQ(0, cpuset_setaffinity(CPU_LEVEL_CPUSET, CPU_WHICH_CPUSET,
setid, sizeof(smask), &smask));
CPU_CLR(fcpu, &smask);
try_attach(jid, &smask);
}
ATF_TC(badparent);
ATF_TC_HEAD(badparent, tc)
{
@ -488,6 +688,7 @@ ATF_TP_ADD_TCS(tp)
ATF_TP_ADD_TC(tp, jail_attach_newbase_plain);
ATF_TP_ADD_TC(tp, jail_attach_prevbase);
ATF_TP_ADD_TC(tp, jail_attach_plain);
ATF_TP_ADD_TC(tp, jail_attach_disjoint);
ATF_TP_ADD_TC(tp, badparent);
return (atf_no_error());
}

View File

@ -1255,6 +1255,11 @@ cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask,
* as the parent, then we'll check if the process was previously using
* the root set and, if it wasn't, create a new base with the process's
* mask applied to it.
*
* If the new root is incompatible with the existing mask, then we allow
* the process to take on the new root if and only if they have
* privilege to widen their mask anyways. Unprivileged processes get
* rejected with EDEADLK.
*/
if (set != NULL && rebase && nroot != tdroot) {
cpusetid_t base_id, root_id;
@ -1265,6 +1270,9 @@ cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask,
if (base_id != root_id) {
error = cpuset_setproc_newbase(td, set, nroot, &base,
&freelist, &domainlist);
if (error == EDEADLK &&
priv_check(td, PRIV_SCHED_CPUSET) == 0)
error = 0;
if (error != 0)
goto unlock_out;
}