MFV r328220: 8677 Open-Context Channel Programs

illumos/illumos-gate@a3b2868063

https://www.illumos.org/issues/8677
  We want to be able to run channel programs outside of synching context.
  This would greatly improve performance of channel program that just gather
  information, as we won't have to wait for synching context anymore.

  This feature should introduce the following:
  - A new command line flag in "zfs program" to specify our intention to
  run in open context.
  - A new flag/option within the channel program ioctl which selects the
  context.
  - Appropriate error handling whenever we try a channel program in
  open-context that contains zfs.sync* expressions.
  - Documentation for the new feature in the manual pages.

Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: Chris Williamson <chris.williamson@delphix.com>
Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Author: Serapheim Dimitropoulos <serapheim@delphix.com>
This commit is contained in:
Alexander Motin 2018-01-21 23:02:05 +00:00
commit 442814b7e7
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=328224
12 changed files with 233 additions and 86 deletions

View File

@ -18,6 +18,7 @@
.Nd executes ZFS channel programs
.Sh SYNOPSIS
.Cm zfs program
.Op Fl n
.Op Fl t Ar instruction-limit
.Op Fl m Ar memory-limit
.Ar pool
@ -45,6 +46,14 @@ will be run on
and any attempts to access or modify other pools will cause an error.
.Sh OPTIONS
.Bl -tag -width "-t"
.It Fl n
Executes a read-only channel program, which runs faster.
The program cannot change on-disk state by calling functions from the
zfs.sync submodule.
The program can be used to gather information such as properties and
determining if changes would succeed (zfs.check.*).
Without this flag, all pending changes must be synced to disk before a
channel program can complete.
.It Fl t Ar instruction-limit
Execution time limit, in number of Lua instructions to execute.
If a channel program executes more than the specified number of instructions,

View File

@ -287,6 +287,7 @@
.Op Ar snapshot Ns | Ns Ar filesystem
.Nm
.Cm program
.Op Fl n
.Op Fl t Ar timeout
.Op Fl m Ar memory_limit
.Ar pool script
@ -3294,6 +3295,7 @@ Display the path's inode change time as the first column of output.
.It Xo
.Nm
.Cm program
.Op Fl n
.Op Fl t Ar timeout
.Op Fl m Ar memory_limit
.Ar pool script
@ -3316,6 +3318,14 @@ For full documentation of the ZFS channel program interface, see the manual
page for
.Xr zfs-program 8 .
.Bl -tag -width indent
.It Fl n
Executes a read-only channel program, which runs faster.
The program cannot change on-disk state by calling functions from
the zfs.sync submodule.
The program can be used to gather information such as properties and
determining if changes would succeed (zfs.check.*).
Without this flag, all pending changes must be synced to disk before
a channel program can complete.
.It Fl t Ar timeout
Execution time limit, in milliseconds.
If a channel program executes for longer than the provided timeout, it will

View File

@ -345,7 +345,7 @@ get_usage(zfs_help_t idx)
case HELP_BOOKMARK:
return (gettext("\tbookmark <snapshot> <bookmark>\n"));
case HELP_CHANNEL_PROGRAM:
return (gettext("\tprogram [-t <instruction limit>] "
return (gettext("\tprogram [-n] [-t <instruction limit>] "
"[-m <memory limit (b)>] <pool> <program file> "
"[lua args...]\n"));
}
@ -7131,11 +7131,12 @@ zfs_do_channel_program(int argc, char **argv)
nvlist_t *outnvl;
uint64_t instrlimit = ZCP_DEFAULT_INSTRLIMIT;
uint64_t memlimit = ZCP_DEFAULT_MEMLIMIT;
boolean_t sync_flag = B_TRUE;
zpool_handle_t *zhp;
/* check options */
while (-1 !=
(c = getopt(argc, argv, "t:(instr-limit)m:(memory-limit)"))) {
(c = getopt(argc, argv, "nt:(instr-limit)m:(memory-limit)"))) {
switch (c) {
case 't':
case 'm': {
@ -7173,6 +7174,10 @@ zfs_do_channel_program(int argc, char **argv)
}
break;
}
case 'n': {
sync_flag = B_FALSE;
break;
}
case '?':
(void) fprintf(stderr, gettext("invalid option '%c'\n"),
optopt);
@ -7244,8 +7249,13 @@ zfs_do_channel_program(int argc, char **argv)
nvlist_t *argnvl = fnvlist_alloc();
fnvlist_add_string_array(argnvl, ZCP_ARG_CLIARGV, argv + 2, argc - 2);
ret = lzc_channel_program(poolname, progbuf, instrlimit, memlimit,
argnvl, &outnvl);
if (sync_flag) {
ret = lzc_channel_program(poolname, progbuf,
instrlimit, memlimit, argnvl, &outnvl);
} else {
ret = lzc_channel_program_nosync(poolname, progbuf,
instrlimit, memlimit, argnvl, &outnvl);
}
if (ret != 0) {
/*

View File

@ -2381,7 +2381,7 @@ zcp_check(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t intval,
fnvlist_add_string(argnvl, "dataset", zhp->zfs_name);
fnvlist_add_string(argnvl, "property", zfs_prop_to_name(prop));
error = lzc_channel_program(poolname, program,
error = lzc_channel_program_nosync(poolname, program,
10 * 1000 * 1000, 10 * 1024 * 1024, argnvl, &outnvl);
if (error == 0) {

View File

@ -918,6 +918,25 @@ lzc_destroy_bookmarks(nvlist_t *bmarks, nvlist_t **errlist)
return (error);
}
static int
lzc_channel_program_impl(const char *pool, const char *program, boolean_t sync,
uint64_t instrlimit, uint64_t memlimit, nvlist_t *argnvl, nvlist_t **outnvl)
{
int error;
nvlist_t *args;
args = fnvlist_alloc();
fnvlist_add_string(args, ZCP_ARG_PROGRAM, program);
fnvlist_add_nvlist(args, ZCP_ARG_ARGLIST, argnvl);
fnvlist_add_boolean_value(args, ZCP_ARG_SYNC, sync);
fnvlist_add_uint64(args, ZCP_ARG_INSTRLIMIT, instrlimit);
fnvlist_add_uint64(args, ZCP_ARG_MEMLIMIT, memlimit);
error = lzc_ioctl(ZFS_IOC_CHANNEL_PROGRAM, pool, args, outnvl);
fnvlist_free(args);
return (error);
}
/*
* Executes a channel program.
*
@ -955,16 +974,26 @@ int
lzc_channel_program(const char *pool, const char *program, uint64_t instrlimit,
uint64_t memlimit, nvlist_t *argnvl, nvlist_t **outnvl)
{
int error;
nvlist_t *args;
args = fnvlist_alloc();
fnvlist_add_string(args, ZCP_ARG_PROGRAM, program);
fnvlist_add_nvlist(args, ZCP_ARG_ARGLIST, argnvl);
fnvlist_add_uint64(args, ZCP_ARG_INSTRLIMIT, instrlimit);
fnvlist_add_uint64(args, ZCP_ARG_MEMLIMIT, memlimit);
error = lzc_ioctl(ZFS_IOC_CHANNEL_PROGRAM, pool, args, outnvl);
fnvlist_free(args);
return (error);
return (lzc_channel_program_impl(pool, program, B_TRUE, instrlimit,
memlimit, argnvl, outnvl));
}
/*
* Executes a read-only channel program.
*
* A read-only channel program works programmatically the same way as a
* normal channel program executed with lzc_channel_program(). The only
* difference is it runs exclusively in open-context and therefore can
* return faster. The downside to that, is that the program cannot change
* on-disk state by calling functions from the zfs.sync submodule.
*
* The return values of this function (and their meaning) are exactly the
* same as the ones described in lzc_channel_program().
*/
int
lzc_channel_program_nosync(const char *pool, const char *program,
uint64_t timeout, uint64_t memlimit, nvlist_t *argnvl, nvlist_t **outnvl)
{
return (lzc_channel_program_impl(pool, program, B_FALSE, timeout,
memlimit, argnvl, outnvl));
}

View File

@ -86,8 +86,10 @@ boolean_t lzc_exists(const char *);
int lzc_rollback(const char *, char *, int);
int lzc_rollback_to(const char *, const char *);
int lzc_channel_program(const char *, const char *, uint64_t, uint64_t,
nvlist_t *, nvlist_t **);
int lzc_channel_program(const char *, const char *, uint64_t,
uint64_t, nvlist_t *, nvlist_t **);
int lzc_channel_program_nosync(const char *, const char *, uint64_t,
uint64_t, nvlist_t *, nvlist_t **);
#ifdef __cplusplus
}

View File

@ -542,6 +542,7 @@ dsl_destroy_snapshots_nvl(nvlist_t *snaps, boolean_t defer,
nvlist_t *result = fnvlist_alloc();
int error = zcp_eval(nvpair_name(nvlist_next_nvpair(snaps, NULL)),
program,
B_TRUE,
0,
zfs_lua_max_memlimit,
nvlist_next_nvpair(wrapper, NULL), result);

View File

@ -38,20 +38,25 @@ extern uint64_t zfs_lua_max_memlimit;
int zcp_argerror(lua_State *, int, const char *, ...);
int zcp_eval(const char *, const char *, uint64_t, uint64_t, nvpair_t *,
nvlist_t *);
int zcp_eval(const char *, const char *, boolean_t, uint64_t, uint64_t,
nvpair_t *, nvlist_t *);
int zcp_load_list_lib(lua_State *);
int zcp_load_synctask_lib(lua_State *, boolean_t);
typedef void (zcp_cleanup_t)(void *);
typedef struct zcp_cleanup_handler {
zcp_cleanup_t *zch_cleanup_func;
void *zch_cleanup_arg;
list_node_t zch_node;
} zcp_cleanup_handler_t;
typedef struct zcp_run_info {
dsl_pool_t *zri_pool;
/*
* An estimate of the total ammount of space consumed by all
* An estimate of the total amount of space consumed by all
* synctasks we have successfully performed so far in this
* channel program. Used to generate ENOSPC errors for syncfuncs.
*/
@ -89,16 +94,21 @@ typedef struct zcp_run_info {
boolean_t zri_timed_out;
/*
* The currently registered cleanup function, which will be called
* with the stored argument if a fatal error occurs.
* Boolean indicating whether or not we are running in syncing
* context.
*/
zcp_cleanup_t *zri_cleanup;
void *zri_cleanup_arg;
boolean_t zri_sync;
/*
* List of currently registered cleanup handlers, which will be
* triggered in the event of a fatal error.
*/
list_t zri_cleanup_handlers;
} zcp_run_info_t;
zcp_run_info_t *zcp_run_info(lua_State *);
void zcp_register_cleanup(lua_State *, zcp_cleanup_t, void *);
void zcp_clear_cleanup(lua_State *);
zcp_cleanup_handler_t *zcp_register_cleanup(lua_State *, zcp_cleanup_t, void *);
void zcp_deregister_cleanup(lua_State *, zcp_cleanup_handler_t *);
void zcp_cleanup(lua_State *);
/*

View File

@ -137,13 +137,6 @@ typedef struct zcp_eval_arg {
uint64_t ea_instrlimit;
} zcp_eval_arg_t;
/*ARGSUSED*/
static int
zcp_eval_check(void *arg, dmu_tx_t *tx)
{
return (0);
}
/*
* The outer-most error callback handler for use with lua_pcall(). On
* error Lua will call this callback with a single argument that
@ -187,41 +180,45 @@ zcp_argerror(lua_State *state, int narg, const char *msg, ...)
*
* If an error occurs, the cleanup function will be invoked exactly once and
* then unreigstered.
*
* Returns the registered cleanup handler so the caller can deregister it
* if no error occurs.
*/
void
zcp_cleanup_handler_t *
zcp_register_cleanup(lua_State *state, zcp_cleanup_t cleanfunc, void *cleanarg)
{
zcp_run_info_t *ri = zcp_run_info(state);
/*
* A cleanup function should always be explicitly removed before
* installing a new one to avoid accidental clobbering.
*/
ASSERT3P(ri->zri_cleanup, ==, NULL);
ri->zri_cleanup = cleanfunc;
ri->zri_cleanup_arg = cleanarg;
zcp_cleanup_handler_t *zch = kmem_alloc(sizeof (*zch), KM_SLEEP);
zch->zch_cleanup_func = cleanfunc;
zch->zch_cleanup_arg = cleanarg;
list_insert_head(&ri->zri_cleanup_handlers, zch);
return (zch);
}
void
zcp_clear_cleanup(lua_State *state)
zcp_deregister_cleanup(lua_State *state, zcp_cleanup_handler_t *zch)
{
zcp_run_info_t *ri = zcp_run_info(state);
ri->zri_cleanup = NULL;
ri->zri_cleanup_arg = NULL;
list_remove(&ri->zri_cleanup_handlers, zch);
kmem_free(zch, sizeof (*zch));
}
/*
* If it exists, execute the currently set cleanup function then unregister it.
* Execute the currently registered cleanup handlers then free them and
* destroy the handler list.
*/
void
zcp_cleanup(lua_State *state)
{
zcp_run_info_t *ri = zcp_run_info(state);
if (ri->zri_cleanup != NULL) {
ri->zri_cleanup(ri->zri_cleanup_arg);
zcp_clear_cleanup(state);
for (zcp_cleanup_handler_t *zch =
list_remove_head(&ri->zri_cleanup_handlers); zch != NULL;
zch = list_remove_head(&ri->zri_cleanup_handlers)) {
zch->zch_cleanup_func(zch->zch_cleanup_arg);
kmem_free(zch, sizeof (*zch));
}
}
@ -822,19 +819,12 @@ zcp_panic_cb(lua_State *state)
}
static void
zcp_eval_sync(void *arg, dmu_tx_t *tx)
zcp_eval_impl(dmu_tx_t *tx, boolean_t sync, zcp_eval_arg_t *evalargs)
{
int err;
zcp_run_info_t ri;
zcp_eval_arg_t *evalargs = arg;
lua_State *state = evalargs->ea_state;
/*
* Open context should have setup the stack to contain:
* 1: Error handler callback
* 2: Script to run (converted to a Lua function)
* 3: nvlist input to function (converted to Lua table or nil)
*/
VERIFY3U(3, ==, lua_gettop(state));
/*
@ -847,8 +837,9 @@ zcp_eval_sync(void *arg, dmu_tx_t *tx)
ri.zri_cred = evalargs->ea_cred;
ri.zri_tx = tx;
ri.zri_timed_out = B_FALSE;
ri.zri_cleanup = NULL;
ri.zri_cleanup_arg = NULL;
ri.zri_sync = sync;
list_create(&ri.zri_cleanup_handlers, sizeof (zcp_cleanup_handler_t),
offsetof(zcp_cleanup_handler_t, zch_node));
ri.zri_curinstrs = 0;
ri.zri_maxinstrs = evalargs->ea_instrlimit;
@ -885,10 +876,10 @@ zcp_eval_sync(void *arg, dmu_tx_t *tx)
/*
* Remove the error handler callback from the stack. At this point,
* if there is a cleanup function registered, then it was registered
* but never run or removed, which should never occur.
* there shouldn't be any cleanup handler registered in the handler
* list (zri_cleanup_handlers), regardless of whether it ran or not.
*/
ASSERT3P(ri.zri_cleanup, ==, NULL);
list_destroy(&ri.zri_cleanup_handlers);
lua_remove(state, 1);
switch (err) {
@ -970,9 +961,73 @@ zcp_eval_sync(void *arg, dmu_tx_t *tx)
}
}
static void
zcp_pool_error(zcp_eval_arg_t *evalargs, const char *poolname)
{
evalargs->ea_result = SET_ERROR(ECHRNG);
(void) lua_pushfstring(evalargs->ea_state, "Could not open pool: %s",
poolname);
zcp_convert_return_values(evalargs->ea_state, evalargs->ea_outnvl,
ZCP_RET_ERROR, evalargs);
}
static void
zcp_eval_sync(void *arg, dmu_tx_t *tx)
{
zcp_eval_arg_t *evalargs = arg;
/*
* Open context should have setup the stack to contain:
* 1: Error handler callback
* 2: Script to run (converted to a Lua function)
* 3: nvlist input to function (converted to Lua table or nil)
*/
VERIFY3U(3, ==, lua_gettop(evalargs->ea_state));
zcp_eval_impl(tx, B_TRUE, evalargs);
}
static void
zcp_eval_open(zcp_eval_arg_t *evalargs, const char *poolname)
{
int error;
dsl_pool_t *dp;
dmu_tx_t *tx;
/*
* See comment from the same assertion in zcp_eval_sync().
*/
VERIFY3U(3, ==, lua_gettop(evalargs->ea_state));
error = dsl_pool_hold(poolname, FTAG, &dp);
if (error != 0) {
zcp_pool_error(evalargs, poolname);
return;
}
/*
* As we are running in open-context, we have no transaction associated
* with the channel program. At the same time, functions from the
* zfs.check submodule need to be associated with a transaction as
* they are basically dry-runs of their counterparts in the zfs.sync
* submodule. These functions should be able to run in open-context.
* Therefore we create a new transaction that we later abort once
* the channel program has been evaluated.
*/
tx = dmu_tx_create_dd(dp->dp_mos_dir);
zcp_eval_impl(tx, B_FALSE, evalargs);
dmu_tx_abort(tx);
dsl_pool_rele(dp, FTAG);
}
int
zcp_eval(const char *poolname, const char *program, uint64_t instrlimit,
uint64_t memlimit, nvpair_t *nvarg, nvlist_t *outnvl)
zcp_eval(const char *poolname, const char *program, boolean_t sync,
uint64_t instrlimit, uint64_t memlimit, nvpair_t *nvarg, nvlist_t *outnvl)
{
int err;
lua_State *state;
@ -1083,9 +1138,14 @@ zcp_eval(const char *poolname, const char *program, uint64_t instrlimit,
evalargs.ea_outnvl = outnvl;
evalargs.ea_result = 0;
VERIFY0(dsl_sync_task(poolname, zcp_eval_check,
zcp_eval_sync, &evalargs, 0, ZFS_SPACE_CHECK_NONE));
if (sync) {
err = dsl_sync_task(poolname, NULL,
zcp_eval_sync, &evalargs, 0, ZFS_SPACE_CHECK_NONE);
if (err != 0)
zcp_pool_error(&evalargs, poolname);
} else {
zcp_eval_open(&evalargs, poolname);
}
lua_close(state);
return (evalargs.ea_result);

View File

@ -55,6 +55,10 @@ typedef struct zcp_synctask_info {
*
* If 'sync' is false, executes a dry run and returns the error code.
*
* If we are not running in syncing context and we are not doing a dry run
* (meaning we are running a zfs.sync function in open-context) then we
* return a Lua error.
*
* This function also handles common fatal error cases for channel program
* library functions. If a fatal error occurs, err_dsname will be the dataset
* name reported in error messages, if supplied.
@ -70,6 +74,13 @@ zcp_sync_task(lua_State *state, dsl_checkfunc_t *checkfunc,
if (!sync)
return (err);
if (!ri->zri_sync) {
return (luaL_error(state, "running functions from the zfs.sync "
"submodule requires passing sync=TRUE to "
"lzc_channel_program() (i.e. do not specify the \"-n\" "
"command line argument)"));
}
if (err == 0) {
syncfunc(arg, ri->zri_tx);
} else if (err == EIO) {
@ -233,6 +244,15 @@ zcp_synctask_snapshot(lua_State *state, boolean_t sync, nvlist_t *err_details)
const char *dsname = lua_tostring(state, 1);
zcp_run_info_t *ri = zcp_run_info(state);
/*
* On old pools, the ZIL must not be active when a snapshot is created,
* but we can't suspend the ZIL because we're already in syncing
* context.
*/
if (spa_version(ri->zri_pool->dp_spa) < SPA_VERSION_FAST_SNAP) {
return (ENOTSUP);
}
/*
* We only allow for a single snapshot rather than a list, so the
* error list output is unnecessary.
@ -243,33 +263,23 @@ zcp_synctask_snapshot(lua_State *state, boolean_t sync, nvlist_t *err_details)
ddsa.ddsa_snaps = fnvlist_alloc();
fnvlist_add_boolean(ddsa.ddsa_snaps, dsname);
/*
* On old pools, the ZIL must not be active when a snapshot is created,
* but we can't suspend the ZIL because we're already in syncing
* context.
*/
if (spa_version(ri->zri_pool->dp_spa) < SPA_VERSION_FAST_SNAP) {
return (ENOTSUP);
}
zcp_cleanup_handler_t *zch = zcp_register_cleanup(state,
(zcp_cleanup_t *)&fnvlist_free, ddsa.ddsa_snaps);
err = zcp_sync_task(state, dsl_dataset_snapshot_check,
dsl_dataset_snapshot_sync, &ddsa, sync, dsname);
zcp_deregister_cleanup(state, zch);
fnvlist_free(ddsa.ddsa_snaps);
return (err);
}
void
zcp_synctask_wrapper_cleanup(void *arg)
{
fnvlist_free(arg);
}
static int
zcp_synctask_wrapper(lua_State *state)
{
int err;
zcp_cleanup_handler_t *zch;
int num_ret = 1;
nvlist_t *err_details = fnvlist_alloc();
@ -277,7 +287,8 @@ zcp_synctask_wrapper(lua_State *state)
* Make sure err_details is properly freed, even if a fatal error is
* thrown during the synctask.
*/
zcp_register_cleanup(state, &zcp_synctask_wrapper_cleanup, err_details);
zch = zcp_register_cleanup(state,
(zcp_cleanup_t *)&fnvlist_free, err_details);
zcp_synctask_info_t *info = lua_touserdata(state, lua_upvalueindex(1));
boolean_t sync = lua_toboolean(state, lua_upvalueindex(2));
@ -317,7 +328,7 @@ zcp_synctask_wrapper(lua_State *state)
num_ret++;
}
zcp_clear_cleanup(state);
zcp_deregister_cleanup(state, zch);
fnvlist_free(err_details);
return (num_ret);

View File

@ -3777,11 +3777,15 @@ zfs_ioc_channel_program(const char *poolname, nvlist_t *innvl,
{
char *program;
uint64_t instrlimit, memlimit;
boolean_t sync_flag;
nvpair_t *nvarg = NULL;
if (0 != nvlist_lookup_string(innvl, ZCP_ARG_PROGRAM, &program)) {
return (EINVAL);
}
if (0 != nvlist_lookup_boolean_value(innvl, ZCP_ARG_SYNC, &sync_flag)) {
sync_flag = B_TRUE;
}
if (0 != nvlist_lookup_uint64(innvl, ZCP_ARG_INSTRLIMIT, &instrlimit)) {
instrlimit = ZCP_DEFAULT_INSTRLIMIT;
}
@ -3797,7 +3801,7 @@ zfs_ioc_channel_program(const char *poolname, nvlist_t *innvl,
if (memlimit == 0 || memlimit > zfs_lua_max_memlimit)
return (EINVAL);
return (zcp_eval(poolname, program, instrlimit, memlimit,
return (zcp_eval(poolname, program, sync_flag, instrlimit, memlimit,
nvarg, outnvl));
}

View File

@ -984,6 +984,7 @@ typedef enum {
*/
#define ZCP_ARG_PROGRAM "program"
#define ZCP_ARG_ARGLIST "arg"
#define ZCP_ARG_SYNC "sync"
#define ZCP_ARG_INSTRLIMIT "instrlimit"
#define ZCP_ARG_MEMLIMIT "memlimit"