9425 allow channel programs to be stopped via signals

illumos/illumos-gate@d0cb1fb926
d0cb1fb926

https://www.illumos.org/issues/9425
  Problem Statement

  ZFS channel program scripts currently require a timeout, so that hung
  or long-running scripts return a timeout error instead of causing ZFS
  to get wedged.  This limit can currently be set up to 100 million Lua
  instructions. Even with a limit in place, it would be desirable for
  a sysadmin (support engineer) to be able to cancel a script that is
  taking a long time.

  Proposed Solution

  Make it possible to abort a channel program by sending an interrupt
  signal. In the underlying txg_wait_synced function, switch the cv_wait
  to a cv_wait_sig to catch the signal. Once a signal is encountered, the
  dsl_sync_task function can install a Lua hook that will get called
  before the Lua interpreter executes a new line of code. The
  dsl_sync_task can resume with a standard txg_wait_synced call and wait
  for the txg to complete. Meanwhile, the hook will abort the script and
  indicate that the channel program was canceled. The kernel returns
  EINTR to indicate that the channel program run was canceled.

Author: Don Brady <don.brady@delphix.com>
This commit is contained in:
Andriy Gapon 2019-10-16 06:44:37 +00:00
parent 66c1f9ba9c
commit 35b885b060
7 changed files with 188 additions and 89 deletions

View File

@ -41,7 +41,7 @@ dsl_null_checkfunc(void *arg, dmu_tx_t *tx)
static int
dsl_sync_task_common(const char *pool, dsl_checkfunc_t *checkfunc,
dsl_syncfunc_t *syncfunc, void *arg,
dsl_syncfunc_t *syncfunc, dsl_sigfunc_t *sigfunc, void *arg,
int blocks_modified, zfs_space_check_t space_check, boolean_t early)
{
spa_t *spa;
@ -85,6 +85,11 @@ top:
dmu_tx_commit(tx);
if (sigfunc != NULL && txg_wait_synced_sig(dp, dst.dst_txg)) {
/* current contract is to call func once */
sigfunc(arg, tx);
sigfunc = NULL; /* in case of an EAGAIN retry */
}
txg_wait_synced(dp, dst.dst_txg);
if (dst.dst_error == EAGAIN) {
@ -124,7 +129,7 @@ dsl_sync_task(const char *pool, dsl_checkfunc_t *checkfunc,
dsl_syncfunc_t *syncfunc, void *arg,
int blocks_modified, zfs_space_check_t space_check)
{
return (dsl_sync_task_common(pool, checkfunc, syncfunc, arg,
return (dsl_sync_task_common(pool, checkfunc, syncfunc, NULL, arg,
blocks_modified, space_check, B_FALSE));
}
@ -146,10 +151,23 @@ dsl_early_sync_task(const char *pool, dsl_checkfunc_t *checkfunc,
dsl_syncfunc_t *syncfunc, void *arg,
int blocks_modified, zfs_space_check_t space_check)
{
return (dsl_sync_task_common(pool, checkfunc, syncfunc, arg,
return (dsl_sync_task_common(pool, checkfunc, syncfunc, NULL, arg,
blocks_modified, space_check, B_TRUE));
}
/*
 * Interruptible flavor of the standard (non-early) synctask.  All arguments
 * are forwarded unchanged to dsl_sync_task_common(); if a signal arrives
 * while waiting for the task to sync, sigfunc is invoked a single time.
 *
 * Returns the value produced by dsl_sync_task_common().
 */
int
dsl_sync_task_sig(const char *pool, dsl_checkfunc_t *checkfunc,
    dsl_syncfunc_t *syncfunc, dsl_sigfunc_t *sigfunc, void *arg,
    int blocks_modified, zfs_space_check_t space_check)
{
	int error;

	/* B_FALSE: this is a regular sync task, not an "early" one. */
	error = dsl_sync_task_common(pool, checkfunc, syncfunc, sigfunc, arg,
	    blocks_modified, space_check, B_FALSE);

	return (error);
}
static void
dsl_sync_task_nowait_common(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg,
int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx,

View File

@ -37,6 +37,7 @@ struct dsl_pool;
typedef int (dsl_checkfunc_t)(void *, dmu_tx_t *);
typedef void (dsl_syncfunc_t)(void *, dmu_tx_t *);
typedef void (dsl_sigfunc_t)(void *, dmu_tx_t *);
typedef enum zfs_space_check {
/*
@ -116,6 +117,8 @@ int dsl_early_sync_task(const char *, dsl_checkfunc_t *,
dsl_syncfunc_t *, void *, int, zfs_space_check_t);
void dsl_early_sync_task_nowait(struct dsl_pool *, dsl_syncfunc_t *,
void *, int, zfs_space_check_t, dmu_tx_t *);
int dsl_sync_task_sig(const char *, dsl_checkfunc_t *, dsl_syncfunc_t *,
dsl_sigfunc_t *, void *, int, zfs_space_check_t);
#ifdef __cplusplus
}

View File

@ -87,6 +87,11 @@ extern void txg_kick(struct dsl_pool *dp);
*/
extern void txg_wait_synced(struct dsl_pool *dp, uint64_t txg);
/*
* Wait as above. Returns true if the thread was signaled while waiting.
*/
extern boolean_t txg_wait_synced_sig(struct dsl_pool *dp, uint64_t txg);
/*
* Wait until the given transaction group, or one after it, is
* the open transaction group. Try to make this happen as soon

View File

@ -52,6 +52,12 @@ typedef struct zcp_cleanup_handler {
list_node_t zch_node;
} zcp_cleanup_handler_t;
/*
 * Arguments for the Lua memory allocator used by a channel program run.
 * A pointer to this structure is kept in zcp_run_info_t (zri_allocargs).
 */
typedef struct zcp_alloc_arg {
/*
 * Set to B_FALSE right before control is handed to the channel program
 * and back to B_TRUE while its return values are interpreted, so that
 * Lua may use KM_SLEEP during the latter.
 */
boolean_t aa_must_succeed;
/* NOTE(review): presumably the memory still available to the script; confirm */
int64_t aa_alloc_remaining;
/* NOTE(review): presumably the total memory limit for the script; confirm */
int64_t aa_alloc_limit;
} zcp_alloc_arg_t;
typedef struct zcp_run_info {
dsl_pool_t *zri_pool;
@ -93,6 +99,11 @@ typedef struct zcp_run_info {
*/
boolean_t zri_timed_out;
/*
* Channel program was canceled by user
*/
boolean_t zri_canceled;
/*
* Boolean indicating whether or not we are running in syncing
* context.
@ -104,6 +115,26 @@ typedef struct zcp_run_info {
* triggered in the event of a fatal error.
*/
list_t zri_cleanup_handlers;
/*
* The Lua state context of our channel program.
*/
lua_State *zri_state;
/*
* Lua memory allocator arguments.
*/
zcp_alloc_arg_t *zri_allocargs;
/*
* Contains output values from zcp script or error string.
*/
nvlist_t *zri_outnvl;
/*
* The errno number returned to caller of zcp_eval().
*/
int zri_result;
} zcp_run_info_t;
zcp_run_info_t *zcp_run_info(lua_State *);

View File

@ -632,8 +632,8 @@ txg_delay(dsl_pool_t *dp, uint64_t txg, hrtime_t delay, hrtime_t resolution)
mutex_exit(&tx->tx_sync_lock);
}
void
txg_wait_synced(dsl_pool_t *dp, uint64_t txg)
static boolean_t
txg_wait_synced_impl(dsl_pool_t *dp, uint64_t txg, boolean_t wait_sig)
{
tx_state_t *tx = &dp->dp_tx;
@ -652,9 +652,39 @@ txg_wait_synced(dsl_pool_t *dp, uint64_t txg)
"tx_synced=%llu waiting=%llu dp=%p\n",
tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
cv_broadcast(&tx->tx_sync_more_cv);
cv_wait(&tx->tx_sync_done_cv, &tx->tx_sync_lock);
if (wait_sig) {
/*
* Condition wait here but stop if the thread receives a
* signal. The caller may call txg_wait_synced*() again
* to resume waiting for this txg.
*/
if (cv_wait_sig(&tx->tx_sync_done_cv,
&tx->tx_sync_lock) == 0) {
mutex_exit(&tx->tx_sync_lock);
return (B_TRUE);
}
} else {
cv_wait(&tx->tx_sync_done_cv, &tx->tx_sync_lock);
}
}
mutex_exit(&tx->tx_sync_lock);
return (B_FALSE);
}
/*
 * Wait for the given txg to sync without reacting to signals.  Because the
 * wait is requested as non-interruptible, txg_wait_synced_impl() is
 * expected to report that no signal cut the wait short; this is verified.
 */
void
txg_wait_synced(dsl_pool_t *dp, uint64_t txg)
{
	boolean_t signaled;

	signaled = txg_wait_synced_impl(dp, txg, B_FALSE);
	VERIFY0(signaled);
}
/*
 * Signal-aware variant of txg_wait_synced(): the wait is abandoned if the
 * calling thread receives a signal.  Returns B_TRUE when the wait was cut
 * short by a signal and B_FALSE otherwise.
 */
boolean_t
txg_wait_synced_sig(dsl_pool_t *dp, uint64_t txg)
{
	boolean_t interrupted = txg_wait_synced_impl(dp, txg, B_TRUE);

	return (interrupted);
}
void

View File

@ -115,21 +115,6 @@ static int zcp_nvpair_value_to_lua(lua_State *, nvpair_t *, char *, int);
static int zcp_lua_to_nvlist_impl(lua_State *, int, nvlist_t *, const char *,
int);
typedef struct zcp_alloc_arg {
boolean_t aa_must_succeed;
int64_t aa_alloc_remaining;
int64_t aa_alloc_limit;
} zcp_alloc_arg_t;
typedef struct zcp_eval_arg {
lua_State *ea_state;
zcp_alloc_arg_t *ea_allocargs;
cred_t *ea_cred;
nvlist_t *ea_outnvl;
int ea_result;
uint64_t ea_instrlimit;
} zcp_eval_arg_t;
/*
* The outer-most error callback handler for use with lua_pcall(). On
* error Lua will call this callback with a single argument that
@ -449,7 +434,7 @@ zcp_lua_to_nvlist_helper(lua_State *state)
static void
zcp_convert_return_values(lua_State *state, nvlist_t *nvl,
const char *key, zcp_eval_arg_t *evalargs)
const char *key, int *result)
{
int err;
VERIFY3U(1, ==, lua_gettop(state));
@ -461,7 +446,7 @@ zcp_convert_return_values(lua_State *state, nvlist_t *nvl,
err = lua_pcall(state, 3, 0, 0); /* zcp_lua_to_nvlist_helper */
if (err != 0) {
zcp_lua_to_nvlist(state, 1, nvl, ZCP_RET_ERROR);
evalargs->ea_result = SET_ERROR(ECHRNG);
*result = SET_ERROR(ECHRNG);
}
}
@ -788,13 +773,24 @@ zcp_lua_alloc(void *ud, void *ptr, size_t osize, size_t nsize)
static void
zcp_lua_counthook(lua_State *state, lua_Debug *ar)
{
/*
* If we're called, check how many instructions the channel program has
* executed so far, and compare against the limit.
*/
lua_getfield(state, LUA_REGISTRYINDEX, ZCP_RUN_INFO_KEY);
zcp_run_info_t *ri = lua_touserdata(state, -1);
/*
* Check if we were canceled while waiting for the
* txg to sync or from our open context thread
*/
if (ri->zri_canceled ||
(!ri->zri_sync && issig(JUSTLOOKING) && issig(FORREAL))) {
ri->zri_canceled = B_TRUE;
(void) lua_pushstring(state, "Channel program was canceled.");
(void) lua_error(state);
}
/*
* Check how many instructions the channel program has
* executed so far, and compare against the limit.
*/
ri->zri_curinstrs += zfs_lua_check_instrlimit_interval;
if (ri->zri_maxinstrs != 0 && ri->zri_curinstrs > ri->zri_maxinstrs) {
ri->zri_timed_out = B_TRUE;
@ -813,31 +809,25 @@ zcp_panic_cb(lua_State *state)
}
static void
zcp_eval_impl(dmu_tx_t *tx, boolean_t sync, zcp_eval_arg_t *evalargs)
zcp_eval_impl(dmu_tx_t *tx, zcp_run_info_t *ri)
{
int err;
zcp_run_info_t ri;
lua_State *state = evalargs->ea_state;
lua_State *state = ri->zri_state;
VERIFY3U(3, ==, lua_gettop(state));
/* finish initializing our runtime state */
ri->zri_pool = dmu_tx_pool(tx);
ri->zri_tx = tx;
list_create(&ri->zri_cleanup_handlers, sizeof (zcp_cleanup_handler_t),
offsetof(zcp_cleanup_handler_t, zch_node));
/*
* Store the zcp_run_info_t struct for this run in the Lua registry.
* Registry entries are not directly accessible by the Lua scripts but
* can be accessed by our callbacks.
*/
ri.zri_space_used = 0;
ri.zri_pool = dmu_tx_pool(tx);
ri.zri_cred = evalargs->ea_cred;
ri.zri_tx = tx;
ri.zri_timed_out = B_FALSE;
ri.zri_sync = sync;
list_create(&ri.zri_cleanup_handlers, sizeof (zcp_cleanup_handler_t),
offsetof(zcp_cleanup_handler_t, zch_node));
ri.zri_curinstrs = 0;
ri.zri_maxinstrs = evalargs->ea_instrlimit;
lua_pushlightuserdata(state, &ri);
lua_pushlightuserdata(state, ri);
lua_setfield(state, LUA_REGISTRYINDEX, ZCP_RUN_INFO_KEY);
VERIFY3U(3, ==, lua_gettop(state));
@ -854,7 +844,7 @@ zcp_eval_impl(dmu_tx_t *tx, boolean_t sync, zcp_eval_arg_t *evalargs)
* off control to the channel program. Channel programs that use too
* much memory should die with ENOSPC.
*/
evalargs->ea_allocargs->aa_must_succeed = B_FALSE;
ri->zri_allocargs->aa_must_succeed = B_FALSE;
/*
* Call the Lua function that open-context passed us. This pops the
@ -866,14 +856,14 @@ zcp_eval_impl(dmu_tx_t *tx, boolean_t sync, zcp_eval_arg_t *evalargs)
/*
* Let Lua use KM_SLEEP while we interpret the return values.
*/
evalargs->ea_allocargs->aa_must_succeed = B_TRUE;
ri->zri_allocargs->aa_must_succeed = B_TRUE;
/*
* Remove the error handler callback from the stack. At this point,
* there shouldn't be any cleanup handler registered in the handler
* list (zri_cleanup_handlers), regardless of whether it ran or not.
*/
list_destroy(&ri.zri_cleanup_handlers);
list_destroy(&ri->zri_cleanup_handlers);
lua_remove(state, 1);
switch (err) {
@ -893,16 +883,16 @@ zcp_eval_impl(dmu_tx_t *tx, boolean_t sync, zcp_eval_arg_t *evalargs)
int return_count = lua_gettop(state);
if (return_count == 1) {
evalargs->ea_result = 0;
zcp_convert_return_values(state, evalargs->ea_outnvl,
ZCP_RET_RETURN, evalargs);
ri->zri_result = 0;
zcp_convert_return_values(state, ri->zri_outnvl,
ZCP_RET_RETURN, &ri->zri_result);
} else if (return_count > 1) {
evalargs->ea_result = SET_ERROR(ECHRNG);
ri->zri_result = SET_ERROR(ECHRNG);
lua_settop(state, 0);
(void) lua_pushfstring(state, "Multiple return "
"values not supported");
zcp_convert_return_values(state, evalargs->ea_outnvl,
ZCP_RET_ERROR, evalargs);
zcp_convert_return_values(state, ri->zri_outnvl,
ZCP_RET_ERROR, &ri->zri_result);
}
break;
}
@ -916,14 +906,16 @@ zcp_eval_impl(dmu_tx_t *tx, boolean_t sync, zcp_eval_arg_t *evalargs)
* stack.
*/
VERIFY3U(1, ==, lua_gettop(state));
if (ri.zri_timed_out) {
evalargs->ea_result = SET_ERROR(ETIME);
if (ri->zri_timed_out) {
ri->zri_result = SET_ERROR(ETIME);
} else if (ri->zri_canceled) {
ri->zri_result = SET_ERROR(EINTR);
} else {
evalargs->ea_result = SET_ERROR(ECHRNG);
ri->zri_result = SET_ERROR(ECHRNG);
}
zcp_convert_return_values(state, evalargs->ea_outnvl,
ZCP_RET_ERROR, evalargs);
zcp_convert_return_values(state, ri->zri_outnvl,
ZCP_RET_ERROR, &ri->zri_result);
break;
}
case LUA_ERRERR: {
@ -934,14 +926,16 @@ zcp_eval_impl(dmu_tx_t *tx, boolean_t sync, zcp_eval_arg_t *evalargs)
* return the error message.
*/
VERIFY3U(1, ==, lua_gettop(state));
if (ri.zri_timed_out) {
evalargs->ea_result = SET_ERROR(ETIME);
if (ri->zri_timed_out) {
ri->zri_result = SET_ERROR(ETIME);
} else if (ri->zri_canceled) {
ri->zri_result = SET_ERROR(EINTR);
} else {
evalargs->ea_result = SET_ERROR(ECHRNG);
ri->zri_result = SET_ERROR(ECHRNG);
}
zcp_convert_return_values(state, evalargs->ea_outnvl,
ZCP_RET_ERROR, evalargs);
zcp_convert_return_values(state, ri->zri_outnvl,
ZCP_RET_ERROR, &ri->zri_result);
break;
}
case LUA_ERRMEM:
@ -949,7 +943,7 @@ zcp_eval_impl(dmu_tx_t *tx, boolean_t sync, zcp_eval_arg_t *evalargs)
* Lua ran out of memory while running the channel program.
* There's not much we can do.
*/
evalargs->ea_result = SET_ERROR(ENOSPC);
ri->zri_result = SET_ERROR(ENOSPC);
break;
default:
VERIFY0(err);
@ -957,21 +951,35 @@ zcp_eval_impl(dmu_tx_t *tx, boolean_t sync, zcp_eval_arg_t *evalargs)
}
static void
zcp_pool_error(zcp_eval_arg_t *evalargs, const char *poolname)
zcp_pool_error(zcp_run_info_t *ri, const char *poolname)
{
evalargs->ea_result = SET_ERROR(ECHRNG);
lua_settop(evalargs->ea_state, 0);
(void) lua_pushfstring(evalargs->ea_state, "Could not open pool: %s",
ri->zri_result = SET_ERROR(ECHRNG);
lua_settop(ri->zri_state, 0);
(void) lua_pushfstring(ri->zri_state, "Could not open pool: %s",
poolname);
zcp_convert_return_values(evalargs->ea_state, evalargs->ea_outnvl,
ZCP_RET_ERROR, evalargs);
zcp_convert_return_values(ri->zri_state, ri->zri_outnvl,
ZCP_RET_ERROR, &ri->zri_result);
}
/*
 * Signal callback handed to dsl_sync_task_sig().  It is invoked when the
 * wait in txg_wait_synced_sig() is interrupted by a signal and simply marks
 * the channel program run as canceled.  Once this callback returns, the
 * sync task code resumes waiting for the txg with txg_wait_synced().
 *
 * arg is the zcp_run_info_t of the run; tx is unused.
 */
/* ARGSUSED */
static void
zcp_eval_sig(void *arg, dmu_tx_t *tx)
{
	((zcp_run_info_t *)arg)->zri_canceled = B_TRUE;
}
static void
zcp_eval_sync(void *arg, dmu_tx_t *tx)
{
zcp_eval_arg_t *evalargs = arg;
zcp_run_info_t *ri = arg;
/*
* Open context should have setup the stack to contain:
@ -979,15 +987,14 @@ zcp_eval_sync(void *arg, dmu_tx_t *tx)
* 2: Script to run (converted to a Lua function)
* 3: nvlist input to function (converted to Lua table or nil)
*/
VERIFY3U(3, ==, lua_gettop(evalargs->ea_state));
VERIFY3U(3, ==, lua_gettop(ri->zri_state));
zcp_eval_impl(tx, B_TRUE, evalargs);
zcp_eval_impl(tx, ri);
}
static void
zcp_eval_open(zcp_eval_arg_t *evalargs, const char *poolname)
zcp_eval_open(zcp_run_info_t *ri, const char *poolname)
{
int error;
dsl_pool_t *dp;
dmu_tx_t *tx;
@ -995,11 +1002,11 @@ zcp_eval_open(zcp_eval_arg_t *evalargs, const char *poolname)
/*
* See comment from the same assertion in zcp_eval_sync().
*/
VERIFY3U(3, ==, lua_gettop(evalargs->ea_state));
VERIFY3U(3, ==, lua_gettop(ri->zri_state));
error = dsl_pool_hold(poolname, FTAG, &dp);
if (error != 0) {
zcp_pool_error(evalargs, poolname);
zcp_pool_error(ri, poolname);
return;
}
@ -1014,7 +1021,7 @@ zcp_eval_open(zcp_eval_arg_t *evalargs, const char *poolname)
*/
tx = dmu_tx_create_dd(dp->dp_mos_dir);
zcp_eval_impl(tx, B_FALSE, evalargs);
zcp_eval_impl(tx, ri);
dmu_tx_abort(tx);
@ -1027,7 +1034,7 @@ zcp_eval(const char *poolname, const char *program, boolean_t sync,
{
int err;
lua_State *state;
zcp_eval_arg_t evalargs;
zcp_run_info_t runinfo;
if (instrlimit > zfs_lua_max_instrlimit)
return (SET_ERROR(EINVAL));
@ -1127,24 +1134,29 @@ zcp_eval(const char *poolname, const char *program, boolean_t sync,
}
VERIFY3U(3, ==, lua_gettop(state));
evalargs.ea_state = state;
evalargs.ea_allocargs = &allocargs;
evalargs.ea_instrlimit = instrlimit;
evalargs.ea_cred = CRED();
evalargs.ea_outnvl = outnvl;
evalargs.ea_result = 0;
runinfo.zri_state = state;
runinfo.zri_allocargs = &allocargs;
runinfo.zri_outnvl = outnvl;
runinfo.zri_result = 0;
runinfo.zri_cred = CRED();
runinfo.zri_timed_out = B_FALSE;
runinfo.zri_canceled = B_FALSE;
runinfo.zri_sync = sync;
runinfo.zri_space_used = 0;
runinfo.zri_curinstrs = 0;
runinfo.zri_maxinstrs = instrlimit;
if (sync) {
err = dsl_sync_task(poolname, NULL,
zcp_eval_sync, &evalargs, 0, ZFS_SPACE_CHECK_ZCP_EVAL);
err = dsl_sync_task_sig(poolname, NULL, zcp_eval_sync,
zcp_eval_sig, &runinfo, 0, ZFS_SPACE_CHECK_ZCP_EVAL);
if (err != 0)
zcp_pool_error(&evalargs, poolname);
zcp_pool_error(&runinfo, poolname);
} else {
zcp_eval_open(&evalargs, poolname);
zcp_eval_open(&runinfo, poolname);
}
lua_close(state);
return (evalargs.ea_result);
return (runinfo.zri_result);
}
/*

View File

@ -6347,7 +6347,7 @@ zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
error = vec->zvec_func(zc->zc_name, innvl, outnvl);
/*
* Some commands can partially execute, modfiy state, and still
* Some commands can partially execute, modify state, and still
* return an error. In these cases, attempt to record what
* was modified.
*/