rework how ZVOLs are updated in response to DSL operations

With this change all ZVOL updates are initiated from the SPA sync
context instead of a mix of the sync and open contexts.  The updates are
queued to be applied by a dedicated thread in the original order.  This
should ensure that ZVOLs always accurately reflect the corresponding
datasets.  ZFS ioctl operations wait on the mentioned thread to complete
its work.  Thus, the illusion of the synchronous ZVOL update is
preserved.  At the same time, the SPA sync thread never blocks on ZVOL
related operations avoiding problems like reported in bug 203864.

This change is based on earlier work in the same direction: D7179 and
D14669 by Anthoine Bourgeois.  D7179 tried to perform ZVOL operations
in the open context and that opened races between them.  D14669 uses a
design very similar to this change but with different implementation
details.

This change also heavily borrows from similar code in ZoL, but there are
many differences too.  See:
- a0bd735adb
- https://github.com/zfsonlinux/zfs/issues/3681
- https://github.com/zfsonlinux/zfs/issues/2217

PR:		203864
MFC after:	5 weeks
Sponsored by:	CyberSecure
Differential Revision: https://reviews.freebsd.org/D23478
This commit is contained in:
Andriy Gapon 2020-06-11 10:41:31 +00:00
parent 6fe9e470bb
commit f51f07e1ec
10 changed files with 212 additions and 72 deletions

View File

@ -1053,6 +1053,9 @@ dmu_objset_create_sync(void *arg, dmu_tx_t *tx)
doca->doca_cred, tx);
}
#if defined(__FreeBSD__) && defined(_KERNEL)
zvol_create_minors(dp->dp_spa, doca->doca_name);
#endif
spa_history_log_internal_ds(ds, "create", tx, "");
dsl_dataset_rele(ds, FTAG);
dsl_dir_rele(pdd, FTAG);
@ -1148,6 +1151,9 @@ dmu_objset_clone_sync(void *arg, dmu_tx_t *tx)
VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds));
dsl_dataset_name(origin, namebuf);
#if defined(__FreeBSD__) && defined(_KERNEL)
zvol_create_minors(dp->dp_spa, doca->doca_clone);
#endif
spa_history_log_internal_ds(ds, "clone", tx,
"origin=%s (%llu)", namebuf, origin->ds_object);
dsl_dataset_rele(ds, FTAG);

View File

@ -57,6 +57,9 @@
#include <sys/dsl_bookmark.h>
#include <sys/zfeature.h>
#include <sys/bqueue.h>
#ifdef __FreeBSD__
#include <sys/zvol.h>
#endif
#ifdef __FreeBSD__
#undef dump_write
@ -3445,6 +3448,11 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx)
drc->drc_newsnapobj =
dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj;
}
#if defined(__FreeBSD__) && defined(_KERNEL)
zvol_create_minors(dp->dp_spa, drc->drc_tofs);
#endif
/*
* Release the hold from dmu_recv_begin. This must be done before
* we return to open context, so that when we free the dataset's dnode,

View File

@ -1572,6 +1572,9 @@ dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx)
dsl_props_set_sync_impl(ds->ds_prev,
ZPROP_SRC_LOCAL, ddsa->ddsa_props, tx);
}
#if defined(__FreeBSD__) && defined(_KERNEL)
zvol_create_minors(dp->dp_spa, name);
#endif
dsl_dataset_rele(ds, FTAG);
}
}
@ -1646,17 +1649,6 @@ dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors)
fnvlist_free(suspended);
}
#ifdef __FreeBSD__
#ifdef _KERNEL
if (error == 0) {
for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
pair = nvlist_next_nvpair(snaps, pair)) {
char *snapname = nvpair_name(pair);
zvol_create_minors(snapname);
}
}
#endif
#endif
return (error);
}
@ -2535,7 +2527,7 @@ dsl_dataset_rename_snapshot_sync_impl(dsl_pool_t *dp,
snprintf(newname, ZFS_MAX_DATASET_NAME_LEN, "%s@%s",
ddrsa->ddrsa_fsname, ddrsa->ddrsa_newsnapname);
zfsvfs_update_fromname(oldname, newname);
zvol_rename_minors(oldname, newname);
zvol_rename_minors(dp->dp_spa, oldname, newname);
kmem_free(newname, ZFS_MAX_DATASET_NAME_LEN);
kmem_free(oldname, ZFS_MAX_DATASET_NAME_LEN);
#endif
@ -3087,9 +3079,6 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)
}
#if defined(__FreeBSD__) && defined(_KERNEL)
/* Take the spa_namespace_lock early so zvol renames don't deadlock. */
mutex_enter(&spa_namespace_lock);
oldname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
newname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
#endif
@ -3135,7 +3124,7 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)
#if defined(__FreeBSD__) && defined(_KERNEL)
dsl_dataset_name(ds, newname);
zfsvfs_update_fromname(oldname, newname);
zvol_rename_minors(oldname, newname);
zvol_rename_minors(dp->dp_spa, oldname, newname);
#endif
/* move any clone references */
@ -3177,8 +3166,6 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)
}
#if defined(__FreeBSD__) && defined(_KERNEL)
mutex_exit(&spa_namespace_lock);
kmem_free(newname, ZFS_MAX_DATASET_NAME_LEN);
kmem_free(oldname, ZFS_MAX_DATASET_NAME_LEN);
#endif

View File

@ -43,6 +43,10 @@
#include <sys/dsl_deleg.h>
#include <sys/dmu_impl.h>
#include <sys/zcp.h>
#if defined(__FreeBSD__) && defined(_KERNEL)
#include <sys/zvol.h>
#endif
int
dsl_destroy_snapshot_check_impl(dsl_dataset_t *ds, boolean_t defer)
@ -489,6 +493,14 @@ dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx)
if (dsl_dataset_phys(ds)->ds_userrefs_obj != 0)
VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_userrefs_obj,
tx));
#if defined(__FreeBSD__) && defined(_KERNEL)
char dsname[ZFS_MAX_DATASET_NAME_LEN];
dsl_dataset_name(ds, dsname);
zvol_remove_minors(dp->dp_spa, dsname);
#endif
dsl_dir_rele(ds->ds_dir, ds);
ds->ds_dir = NULL;
dmu_object_free_zapified(mos, obj, tx);
@ -979,6 +991,9 @@ dsl_destroy_head_sync(void *arg, dmu_tx_t *tx)
VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds));
dsl_destroy_head_sync_impl(ds, tx);
#if defined(__FreeBSD__) && defined(_KERNEL)
zvol_remove_minors(dp->dp_spa, ddha->ddha_name);
#endif
dsl_dataset_rele(ds, FTAG);
}

View File

@ -2093,7 +2093,7 @@ dsl_dir_rename_sync(void *arg, dmu_tx_t *tx)
#ifdef __FreeBSD__
#ifdef _KERNEL
zfsvfs_update_fromname(ddra->ddra_oldname, ddra->ddra_newname);
zvol_rename_minors(ddra->ddra_oldname, ddra->ddra_newname);
zvol_rename_minors(dp->dp_spa, ddra->ddra_oldname, ddra->ddra_newname);
#endif
#endif

View File

@ -32,6 +32,7 @@
* Copyright (c) 2017, Intel Corporation.
* Copyright (c) 2017 Datto Inc.
* Copyright 2018 OmniOS Community Edition (OmniOSce) Association.
* Copyright (c) 2016 Actifio, Inc. All rights reserved.
*/
/*
@ -1280,6 +1281,24 @@ spa_activate(spa_t *spa, int mode)
*/
trim_thread_create(spa);
/*
* This taskq is used to perform zvol-minor-related tasks
* asynchronously. This has several advantages, including easy
* resolution of various deadlocks (zfsonlinux bug #3681).
*
* The taskq must be single threaded to ensure tasks are always
* processed in the order in which they were dispatched.
*
* A taskq per pool allows one to keep the pools independent.
* This way if one pool is suspended, it will not impact another.
*
* The preferred location to dispatch a zvol minor task is a sync
* task. In this context, there is easy access to the spa_t and minimal
* error handling is required because the sync task must succeed.
*/
spa->spa_zvol_taskq = taskq_create("z_zvol", 1, minclsyspri,
1, INT_MAX, 0);
for (size_t i = 0; i < TXG_SIZE; i++) {
spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL,
ZIO_FLAG_CANFAIL);
@ -1323,6 +1342,11 @@ spa_deactivate(spa_t *spa)
spa_evicting_os_wait(spa);
if (spa->spa_zvol_taskq) {
taskq_destroy(spa->spa_zvol_taskq);
spa->spa_zvol_taskq = NULL;
}
txg_list_destroy(&spa->spa_vdev_txg_list);
list_destroy(&spa->spa_config_dirty_list);
@ -4614,7 +4638,7 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
#ifdef __FreeBSD__
#ifdef _KERNEL
if (firstopen)
zvol_create_minors(spa->spa_name);
zvol_create_minors(spa, spa->spa_name);
#endif
#endif
}
@ -5970,7 +5994,7 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
#ifdef __FreeBSD__
#ifdef _KERNEL
zvol_create_minors(pool);
zvol_create_minors(spa, pool);
#endif
#endif
return (0);
@ -6119,6 +6143,10 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
spa_open_ref(spa, FTAG);
mutex_exit(&spa_namespace_lock);
spa_async_suspend(spa);
if (spa->spa_zvol_taskq) {
zvol_remove_minors(spa, spa_name(spa));
taskq_wait(spa->spa_zvol_taskq);
}
mutex_enter(&spa_namespace_lock);
spa_close(spa, FTAG);

View File

@ -27,6 +27,7 @@
* Copyright 2013 Saso Kiselkov. All rights reserved.
* Copyright (c) 2017 Datto Inc.
* Copyright (c) 2017, Intel Corporation.
* Copyright (c) 2016 Actifio, Inc. All rights reserved.
*/
#ifndef _SYS_SPA_IMPL_H
@ -399,6 +400,8 @@ struct spa {
hrtime_t spa_ccw_fail_time; /* Conf cache write fail time */
taskq_t *spa_zvol_taskq; /* Taskq for minor management */
uint64_t spa_multihost; /* multihost aware (mmp) */
mmp_thread_t spa_mmp; /* multihost mmp thread */
list_t spa_leaf_list; /* list of leaf vdevs */

View File

@ -21,6 +21,7 @@
/*
* Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2016 Actifio, Inc. All rights reserved.
*/
#ifndef _SYS_ZVOL_H
@ -40,9 +41,6 @@ extern int zvol_check_volsize(uint64_t volsize, uint64_t blocksize);
extern int zvol_check_volblocksize(uint64_t volblocksize);
extern int zvol_get_stats(objset_t *os, nvlist_t *nv);
extern void zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx);
extern int zvol_create_minor(const char *);
extern int zvol_remove_minor(const char *);
extern void zvol_remove_minors(const char *);
extern int zvol_set_volsize(const char *, uint64_t);
#ifdef illumos
@ -72,8 +70,10 @@ extern void zvol_log_write_minor(void *minor_hdl, dmu_tx_t *tx, offset_t off,
#endif /* illumos */
#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
extern int zvol_create_minors(const char *name);
extern void zvol_rename_minors(const char *oldname, const char *newname);
extern void zvol_create_minors(spa_t *spa, const char *name);
extern void zvol_remove_minors(spa_t *spa, const char *name);
extern void zvol_rename_minors(spa_t *spa, const char *oldname,
const char *newname);
#endif
#endif /* _KERNEL */

View File

@ -1642,8 +1642,10 @@ zfs_ioc_pool_destroy(zfs_cmd_t *zc)
int error;
zfs_log_history(zc);
error = spa_destroy(zc->zc_name);
#ifndef __FreeBSD__
if (error == 0)
zvol_remove_minors(zc->zc_name);
#endif
return (error);
}
@ -1694,8 +1696,10 @@ zfs_ioc_pool_export(zfs_cmd_t *zc)
zfs_log_history(zc);
error = spa_export(zc->zc_name, NULL, force, hardforce);
#ifndef __FreeBSD__
if (error == 0)
zvol_remove_minors(zc->zc_name);
#endif
return (error);
}
@ -3395,13 +3399,23 @@ zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
if (error == 0) {
error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL,
nvprops, outnvl);
#if defined(__FreeBSD__) && defined(_KERNEL)
/*
* Wait for ZVOL operations to settle down before destroying.
*/
if (error != 0) {
spa_t *spa;
if (spa_open(fsname, &spa, FTAG) == 0) {
taskqueue_drain_all(
spa->spa_zvol_taskq->tq_queue);
spa_close(spa, FTAG);
}
}
#endif
if (error != 0)
(void) dsl_destroy_head(fsname);
}
#ifdef __FreeBSD__
if (error == 0 && type == DMU_OST_ZVOL)
zvol_create_minors(fsname);
#endif
return (error);
}
@ -3443,10 +3457,6 @@ zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
if (error != 0)
(void) dsl_destroy_head(fsname);
}
#ifdef __FreeBSD__
if (error == 0)
zvol_create_minors(fsname);
#endif
return (error);
}
@ -3738,9 +3748,6 @@ zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
return (SET_ERROR(EXDEV));
zfs_unmount_snap(nvpair_name(pair));
#if defined(__FreeBSD__)
zvol_remove_minors(name);
#endif
}
return (dsl_destroy_snapshots_nvl(snaps, defer, outnvl));
@ -3924,10 +3931,8 @@ zfs_ioc_destroy(zfs_cmd_t *zc)
err = dsl_destroy_snapshot(zc->zc_name, zc->zc_defer_destroy);
else
err = dsl_destroy_head(zc->zc_name);
#ifndef __FreeBSD__
if (ost == DMU_OST_ZVOL && err == 0)
#ifdef __FreeBSD__
zvol_remove_minors(zc->zc_name);
#else
(void) zvol_remove_minor(zc->zc_name);
#endif
return (err);
@ -4822,11 +4827,6 @@ zfs_ioc_recv(zfs_cmd_t *zc)
}
#endif
#ifdef __FreeBSD__
if (error == 0)
zvol_create_minors(tofs);
#endif
/*
* On error, restore the original props.
*/
@ -6968,6 +6968,24 @@ zfsdev_ioctl(struct cdev *dev, u_long zcmd, caddr_t arg, int flag,
out:
nvlist_free(innvl);
#if defined(__FreeBSD__) && defined(_KERNEL)
/*
* Wait for ZVOL changes to get applied.
* NB: taskqueue_drain_all() does less than taskq_wait(),
* but enough for what we want.
* And there is no equivalent illumos API.
*/
if (error == 0) {
spa_t *spa;
if (spa_open(saved_poolname, &spa, FTAG) == 0) {
taskqueue_drain_all(
spa->spa_zvol_taskq->tq_queue);
spa_close(spa, FTAG);
}
}
#endif
#ifdef illumos
rc = ddi_copyout(zc, (void *)arg, sizeof (zfs_cmd_t), flag);
if (error == 0 && rc != 0)

View File

@ -30,6 +30,7 @@
* Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
* Copyright (c) 2014 Integros [integros.com]
* Copyright (c) 2016 Actifio, Inc. All rights reserved.
*/
/* Portions Copyright 2011 Martin Matuska <mm@FreeBSD.org> */
@ -185,6 +186,20 @@ typedef struct zvol_state {
#endif
} zvol_state_t;
typedef enum {
ZVOL_ASYNC_CREATE_MINORS,
ZVOL_ASYNC_REMOVE_MINORS,
ZVOL_ASYNC_RENAME_MINORS,
ZVOL_ASYNC_MAX
} zvol_async_op_t;
typedef struct {
zvol_async_op_t op;
char pool[ZFS_MAX_DATASET_NAME_LEN];
char name1[ZFS_MAX_DATASET_NAME_LEN];
char name2[ZFS_MAX_DATASET_NAME_LEN];
} zvol_task_t;
#ifndef illumos
static LIST_HEAD(, zvol_state) all_zvols;
#endif
@ -607,7 +622,7 @@ zvol_name2minor(const char *name, minor_t *minor)
/*
* Create a minor node (plus a whole lot more) for the specified volume.
*/
int
static int
zvol_create_minor(const char *name)
{
zfs_soft_state_t *zs;
@ -691,7 +706,6 @@ zvol_create_minor(const char *name)
if (error != 0 || mode == ZFS_VOLMODE_DEFAULT)
mode = volmode;
DROP_GIANT();
zv->zv_volmode = mode;
if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
g_topology_lock();
@ -766,7 +780,6 @@ zvol_create_minor(const char *name)
zvol_geom_run(zv);
g_topology_unlock();
}
PICKUP_GIANT();
ZFS_LOG(1, "ZVOL %s created.", name);
#endif
@ -819,22 +832,6 @@ zvol_remove_zv(zvol_state_t *zv)
return (0);
}
int
zvol_remove_minor(const char *name)
{
zvol_state_t *zv;
int rc;
mutex_enter(&zfsdev_state_lock);
if ((zv = zvol_minor_lookup(name)) == NULL) {
mutex_exit(&zfsdev_state_lock);
return (SET_ERROR(ENXIO));
}
rc = zvol_remove_zv(zv);
mutex_exit(&zfsdev_state_lock);
return (rc);
}
int
zvol_first_open(zvol_state_t *zv)
{
@ -976,7 +973,7 @@ zvol_update_volsize(objset_t *os, uint64_t volsize)
}
void
zvol_remove_minors(const char *name)
zvol_remove_minors_impl(const char *name)
{
#ifdef illumos
zvol_state_t *zv;
@ -1004,7 +1001,6 @@ zvol_remove_minors(const char *name)
namelen = strlen(name);
DROP_GIANT();
mutex_enter(&zfsdev_state_lock);
LIST_FOREACH_SAFE(zv, &all_zvols, zv_links, tzv) {
@ -1017,7 +1013,6 @@ zvol_remove_minors(const char *name)
}
mutex_exit(&zfsdev_state_lock);
PICKUP_GIANT();
#endif /* illumos */
}
@ -2920,7 +2915,7 @@ zvol_create_snapshots(objset_t *os, const char *name)
}
int
zvol_create_minors(const char *name)
zvol_create_minors_impl(const char *name)
{
uint64_t cookie;
objset_t *os;
@ -2976,7 +2971,7 @@ zvol_create_minors(const char *name)
while (dmu_dir_list_next(os, MAXPATHLEN - (p - osname), p, NULL,
&cookie) == 0) {
dmu_objset_rele(os, FTAG);
(void)zvol_create_minors(osname);
(void)zvol_create_minors_impl(osname);
if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) {
printf("ZFS WARNING: Unable to put hold on %s (error=%d).\n",
name, error);
@ -3045,7 +3040,7 @@ zvol_rename_minor(zvol_state_t *zv, const char *newname)
}
void
zvol_rename_minors(const char *oldname, const char *newname)
zvol_rename_minors_impl(const char *oldname, const char *newname)
{
char name[MAXPATHLEN];
struct g_provider *pp;
@ -3058,7 +3053,6 @@ zvol_rename_minors(const char *oldname, const char *newname)
oldnamelen = strlen(oldname);
newnamelen = strlen(newname);
DROP_GIANT();
/* See comment in zvol_open(). */
if (!MUTEX_HELD(&zfsdev_state_lock)) {
mutex_enter(&zfsdev_state_lock);
@ -3080,7 +3074,88 @@ zvol_rename_minors(const char *oldname, const char *newname)
if (locked)
mutex_exit(&zfsdev_state_lock);
PICKUP_GIANT();
}
static zvol_task_t *
zvol_task_alloc(zvol_async_op_t op, const char *name1, const char *name2)
{
zvol_task_t *task;
char *delim;
task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP);
task->op = op;
delim = strchr(name1, '/');
strlcpy(task->pool, name1, delim ? (delim - name1 + 1) : MAXNAMELEN);
strlcpy(task->name1, name1, MAXNAMELEN);
if (name2 != NULL)
strlcpy(task->name2, name2, MAXNAMELEN);
return (task);
}
static void
zvol_task_free(zvol_task_t *task)
{
kmem_free(task, sizeof (zvol_task_t));
}
/*
* The worker thread function performed asynchronously.
*/
static void
zvol_task_cb(void *param)
{
zvol_task_t *task = (zvol_task_t *)param;
switch (task->op) {
case ZVOL_ASYNC_CREATE_MINORS:
(void) zvol_create_minors_impl(task->name1);
break;
case ZVOL_ASYNC_REMOVE_MINORS:
zvol_remove_minors_impl(task->name1);
break;
case ZVOL_ASYNC_RENAME_MINORS:
zvol_rename_minors_impl(task->name1, task->name2);
break;
default:
VERIFY(0);
break;
}
zvol_task_free(task);
}
static void
zvol_minors_helper(spa_t *spa, zvol_async_op_t op, const char *name1,
const char *name2)
{
zvol_task_t *task;
if (dataset_name_hidden(name1))
return;
if (name2 != NULL && dataset_name_hidden(name2))
return;
task = zvol_task_alloc(op, name1, name2);
(void)taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP);
}
void
zvol_create_minors(spa_t *spa, const char *name)
{
zvol_minors_helper(spa, ZVOL_ASYNC_CREATE_MINORS, name, NULL);
}
void
zvol_remove_minors(spa_t *spa, const char *name)
{
zvol_minors_helper(spa, ZVOL_ASYNC_REMOVE_MINORS, name, NULL);
}
void
zvol_rename_minors(spa_t *spa, const char *oldname, const char *newname)
{
zvol_minors_helper(spa, ZVOL_ASYNC_RENAME_MINORS, oldname, newname);
}
static int