From 55c12724d377e6c7ace5b4bd42cd728d4a60af3e Mon Sep 17 00:00:00 2001 From: Ameer Hamza <106930537+ixhamza@users.noreply.github.com> Date: Wed, 28 Sep 2022 21:48:46 +0500 Subject: [PATCH] zed: mark disks as REMOVED when they are removed ZED does not take any action for disk removal events if there is no spare VDEV available. Added zpool_vdev_remove_wanted() in libzfs and vdev_remove_wanted() in vdev.c to remove the VDEV through ZED on removal event. This means that if you are running zed and remove a disk, it will be properly marked as REMOVED. Reviewed-by: Alexander Motin Reviewed-by: Ryan Moeller Reviewed-by: Tony Hutter Signed-off-by: Ameer Hamza Closes #13797 --- cmd/zed/agents/zfs_agents.c | 54 ++++++++++++----- cmd/zed/agents/zfs_retire.c | 36 ++++++++--- cmd/ztest.c | 2 +- config/kernel-blkdev.m4 | 55 +++++++++++++++++ include/libzfs.h | 1 + include/os/linux/Makefile.am | 1 + include/os/linux/kernel/linux/blkdev_compat.h | 26 ++++++++ include/os/linux/spl/sys/misc.h | 29 +++++++++ include/sys/spa.h | 2 +- include/sys/vdev.h | 3 + include/sys/vdev_impl.h | 3 + include/sys/zfs_context.h | 1 + lib/libzfs/libzfs.abi | 6 ++ lib/libzfs/libzfs_pool.c | 37 ++++++++++++ module/os/linux/spl/spl-generic.c | 33 +++++++++++ module/os/linux/zfs/vdev_disk.c | 24 +++++++- module/zfs/spa.c | 19 ++++-- module/zfs/spa_config.c | 14 ++++- module/zfs/spa_misc.c | 4 +- module/zfs/vdev.c | 59 +++++++++++++++++++ module/zfs/zfs_ioctl.c | 6 +- module/zfs/zio.c | 2 +- tests/zfs-tests/include/libtest.shlib | 2 +- .../functional/fault/auto_offline_001_pos.ksh | 27 ++++----- 24 files changed, 395 insertions(+), 51 deletions(-) create mode 100644 include/os/linux/spl/sys/misc.h diff --git a/cmd/zed/agents/zfs_agents.c b/cmd/zed/agents/zfs_agents.c index 2bc84a4f57d1..fb07266dae21 100644 --- a/cmd/zed/agents/zfs_agents.c +++ b/cmd/zed/agents/zfs_agents.c @@ -80,6 +80,7 @@ zfs_agent_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *arg) char *path = NULL; uint_t c, children; nvlist_t 
**child; + uint64_t vdev_guid; /* * First iterate over any children. @@ -100,7 +101,7 @@ zfs_agent_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *arg) &child, &children) == 0) { for (c = 0; c < children; c++) { if (zfs_agent_iter_vdev(zhp, child[c], gsp)) { - gsp->gs_vdev_type = DEVICE_TYPE_L2ARC; + gsp->gs_vdev_type = DEVICE_TYPE_SPARE; return (B_TRUE); } } @@ -109,7 +110,7 @@ zfs_agent_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *arg) &child, &children) == 0) { for (c = 0; c < children; c++) { if (zfs_agent_iter_vdev(zhp, child[c], gsp)) { - gsp->gs_vdev_type = DEVICE_TYPE_SPARE; + gsp->gs_vdev_type = DEVICE_TYPE_L2ARC; return (B_TRUE); } } @@ -126,6 +127,21 @@ zfs_agent_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *arg) &gsp->gs_vdev_expandtime); return (B_TRUE); } + /* + * Otherwise, on a vdev guid match, grab the devid and expansion + * time. The devid might be missing on removal since its not part + * of blkid cache and L2ARC VDEV does not contain pool guid in its + * blkid, so this is a special case for L2ARC VDEV. 
+ */ + else if (gsp->gs_vdev_guid != 0 && gsp->gs_devid == NULL && + nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID, &vdev_guid) == 0 && + gsp->gs_vdev_guid == vdev_guid) { + (void) nvlist_lookup_string(nvl, ZPOOL_CONFIG_DEVID, + &gsp->gs_devid); + (void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_EXPANSION_TIME, + &gsp->gs_vdev_expandtime); + return (B_TRUE); + } return (B_FALSE); } @@ -148,7 +164,7 @@ zfs_agent_iter_pool(zpool_handle_t *zhp, void *arg) /* * if a match was found then grab the pool guid */ - if (gsp->gs_vdev_guid) { + if (gsp->gs_vdev_guid && gsp->gs_devid) { (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &gsp->gs_pool_guid); } @@ -195,11 +211,13 @@ zfs_agent_post_event(const char *class, const char *subclass, nvlist_t *nvl) uint64_t pool_guid = 0, vdev_guid = 0; guid_search_t search = { 0 }; device_type_t devtype = DEVICE_TYPE_PRIMARY; + char *devid = NULL; class = "resource.fs.zfs.removed"; subclass = ""; (void) nvlist_add_string(payload, FM_CLASS, class); + (void) nvlist_lookup_string(nvl, DEV_IDENTIFIER, &devid); (void) nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &pool_guid); (void) nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &vdev_guid); @@ -209,20 +227,24 @@ zfs_agent_post_event(const char *class, const char *subclass, nvlist_t *nvl) (void) nvlist_add_int64_array(payload, FM_EREPORT_TIME, tod, 2); /* + * If devid is missing but vdev_guid is available, find devid + * and pool_guid from vdev_guid. * For multipath, spare and l2arc devices ZFS_EV_VDEV_GUID or * ZFS_EV_POOL_GUID may be missing so find them. 
*/ - if (pool_guid == 0 || vdev_guid == 0) { - if ((nvlist_lookup_string(nvl, DEV_IDENTIFIER, - &search.gs_devid) == 0) && - (zpool_iter(g_zfs_hdl, zfs_agent_iter_pool, &search) - == 1)) { - if (pool_guid == 0) - pool_guid = search.gs_pool_guid; - if (vdev_guid == 0) - vdev_guid = search.gs_vdev_guid; - devtype = search.gs_vdev_type; - } + if (devid == NULL || pool_guid == 0 || vdev_guid == 0) { + if (devid == NULL) + search.gs_vdev_guid = vdev_guid; + else + search.gs_devid = devid; + zpool_iter(g_zfs_hdl, zfs_agent_iter_pool, &search); + if (devid == NULL) + devid = search.gs_devid; + if (pool_guid == 0) + pool_guid = search.gs_pool_guid; + if (vdev_guid == 0) + vdev_guid = search.gs_vdev_guid; + devtype = search.gs_vdev_type; } /* @@ -235,7 +257,9 @@ zfs_agent_post_event(const char *class, const char *subclass, nvlist_t *nvl) search.gs_vdev_expandtime + 10 > tv.tv_sec) { zed_log_msg(LOG_INFO, "agent post event: ignoring '%s' " "for recently expanded device '%s'", EC_DEV_REMOVE, - search.gs_devid); + devid); + fnvlist_free(payload); + free(event); goto out; } diff --git a/cmd/zed/agents/zfs_retire.c b/cmd/zed/agents/zfs_retire.c index 5fe56fe81562..f4b6dff48176 100644 --- a/cmd/zed/agents/zfs_retire.c +++ b/cmd/zed/agents/zfs_retire.c @@ -323,6 +323,9 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, boolean_t is_disk; vdev_aux_t aux; uint64_t state = 0; + int l2arc; + vdev_stat_t *vs; + unsigned int c; fmd_hdl_debug(hdl, "zfs_retire_recv: '%s'", class); @@ -352,13 +355,32 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, devname = zpool_vdev_name(NULL, zhp, vdev, B_FALSE); - /* Can't replace l2arc with a spare: offline the device */ - if (nvlist_lookup_string(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, - &devtype) == 0 && strcmp(devtype, VDEV_TYPE_L2CACHE) == 0) { - fmd_hdl_debug(hdl, "zpool_vdev_offline '%s'", devname); - zpool_vdev_offline(zhp, devname, B_TRUE); - } else if (!fmd_prop_get_int32(hdl, "spare_on_remove") || - 
replace_with_spare(hdl, zhp, vdev) == B_FALSE) { + nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &c); + + /* + * If state removed is requested for already removed vdev, + * its a loopback event from spa_async_remove(). Just + * ignore it. + */ + if (vs->vs_state == VDEV_STATE_REMOVED && + state == VDEV_STATE_REMOVED) + return; + + l2arc = (nvlist_lookup_string(nvl, + FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, &devtype) == 0 && + strcmp(devtype, VDEV_TYPE_L2CACHE) == 0); + + /* Remove the vdev since device is unplugged */ + if (l2arc || (strcmp(class, "resource.fs.zfs.removed") == 0)) { + int status = zpool_vdev_remove_wanted(zhp, devname); + fmd_hdl_debug(hdl, "zpool_vdev_remove_wanted '%s'" + ", ret:%d", devname, status); + } + + /* Replace the vdev with a spare if its not a l2arc */ + if (!l2arc && (!fmd_prop_get_int32(hdl, "spare_on_remove") || + replace_with_spare(hdl, zhp, vdev) == B_FALSE)) { /* Could not handle with spare */ fmd_hdl_debug(hdl, "no spare for '%s'", devname); } diff --git a/cmd/ztest.c b/cmd/ztest.c index 436f5c156ec3..033bcfd33576 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -1166,7 +1166,7 @@ ztest_kill(ztest_shared_t *zs) * See comment above spa_write_cachefile(). 
*/ mutex_enter(&spa_namespace_lock); - spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE); + spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE, B_FALSE); mutex_exit(&spa_namespace_lock); (void) raise(SIGKILL); diff --git a/config/kernel-blkdev.m4 b/config/kernel-blkdev.m4 index 462d6c6efa8e..28e5364581ea 100644 --- a/config/kernel-blkdev.m4 +++ b/config/kernel-blkdev.m4 @@ -103,6 +103,57 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_CHECK_DISK_CHANGE], [ ]) ]) +dnl # +dnl # bdev_kobj() is introduced from 5.12 +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_KOBJ], [ + ZFS_LINUX_TEST_SRC([bdev_kobj], [ + #include + #include + #include + ], [ + struct block_device *bdev = NULL; + struct kobject *disk_kobj; + disk_kobj = bdev_kobj(bdev); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEV_KOBJ], [ + AC_MSG_CHECKING([whether bdev_kobj() exists]) + ZFS_LINUX_TEST_RESULT([bdev_kobj], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BDEV_KOBJ, 1, + [bdev_kobj() exists]) + ], [ + AC_MSG_RESULT(no) + ]) +]) + +dnl # +dnl # part_to_dev() was removed in 5.12 +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_PART_TO_DEV], [ + ZFS_LINUX_TEST_SRC([part_to_dev], [ + #include + #include + ], [ + struct hd_struct *p = NULL; + struct device *pdev; + pdev = part_to_dev(p); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_PART_TO_DEV], [ + AC_MSG_CHECKING([whether part_to_dev() exists]) + ZFS_LINUX_TEST_RESULT([part_to_dev], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_PART_TO_DEV, 1, + [part_to_dev() exists]) + ], [ + AC_MSG_RESULT(no) + ]) +]) + dnl # dnl # 5.10 API, check_disk_change() is removed, in favor of dnl # bdev_check_media_change(), which doesn't force revalidation @@ -405,6 +456,8 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [ ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_WHOLE ZFS_AC_KERNEL_SRC_BLKDEV_BDEVNAME ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_SECURE_ERASE + ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_KOBJ + ZFS_AC_KERNEL_SRC_BLKDEV_PART_TO_DEV ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV], [ @@ -421,4 +474,6 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV], [ 
ZFS_AC_KERNEL_BLKDEV_BDEVNAME ZFS_AC_KERNEL_BLKDEV_GET_ERESTARTSYS ZFS_AC_KERNEL_BLKDEV_ISSUE_SECURE_ERASE + ZFS_AC_KERNEL_BLKDEV_BDEV_KOBJ + ZFS_AC_KERNEL_BLKDEV_PART_TO_DEV ]) diff --git a/include/libzfs.h b/include/libzfs.h index 9bd4613c1091..df17873369ad 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -310,6 +310,7 @@ _LIBZFS_H int zpool_vdev_indirect_size(zpool_handle_t *, const char *, uint64_t *); _LIBZFS_H int zpool_vdev_split(zpool_handle_t *, char *, nvlist_t **, nvlist_t *, splitflags_t); +_LIBZFS_H int zpool_vdev_remove_wanted(zpool_handle_t *, const char *); _LIBZFS_H int zpool_vdev_fault(zpool_handle_t *, uint64_t, vdev_aux_t); _LIBZFS_H int zpool_vdev_degrade(zpool_handle_t *, uint64_t, vdev_aux_t); diff --git a/include/os/linux/Makefile.am b/include/os/linux/Makefile.am index 4d6901c694c8..13ba8060c62d 100644 --- a/include/os/linux/Makefile.am +++ b/include/os/linux/Makefile.am @@ -71,6 +71,7 @@ kernel_spl_sys_HEADERS = \ %D%/spl/sys/kmem_cache.h \ %D%/spl/sys/kstat.h \ %D%/spl/sys/list.h \ + %D%/spl/sys/misc.h \ %D%/spl/sys/mod_os.h \ %D%/spl/sys/mutex.h \ %D%/spl/sys/param.h \ diff --git a/include/os/linux/kernel/linux/blkdev_compat.h b/include/os/linux/kernel/linux/blkdev_compat.h index cdcea166903d..3276796537a4 100644 --- a/include/os/linux/kernel/linux/blkdev_compat.h +++ b/include/os/linux/kernel/linux/blkdev_compat.h @@ -261,6 +261,32 @@ bio_set_bi_error(struct bio *bio, int error) #define BIO_END_IO(bio, error) bio_endio(bio, error); #endif /* HAVE_1ARG_BIO_END_IO_T */ +/* + * 5.15 MACRO, + * GD_DEAD + * + * 2.6.36 - 5.14 MACRO, + * GENHD_FL_UP + * + * Check the disk status and return B_TRUE if alive + * otherwise B_FALSE + */ +static inline boolean_t +zfs_check_disk_status(struct block_device *bdev) +{ +#if defined(GENHD_FL_UP) + return (!!(bdev->bd_disk->flags & GENHD_FL_UP)); +#elif defined(GD_DEAD) + return (!test_bit(GD_DEAD, &bdev->bd_disk->state)); +#else +/* + * This is encountered if neither GENHD_FL_UP nor GD_DEAD is 
available in + * the kernel - likely due to a macro change that needs to be chased down. + */ +#error "Unsupported kernel: no usable disk status check" +#endif +} + /* * 4.1 API, * 3.10.0 CentOS 7.x API, diff --git a/include/os/linux/spl/sys/misc.h b/include/os/linux/spl/sys/misc.h new file mode 100644 index 000000000000..299fe9c1ab07 --- /dev/null +++ b/include/os/linux/spl/sys/misc.h @@ -0,0 +1,29 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#ifndef _OS_LINUX_SPL_MISC_H +#define _OS_LINUX_SPL_MISC_H + +#include + +extern void spl_signal_kobj_evt(struct block_device *bdev); + +#endif diff --git a/include/sys/spa.h b/include/sys/spa.h index 76fae788bc1f..3e68cb8c6511 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -838,7 +838,7 @@ extern kmutex_t spa_namespace_lock; #define SPA_CONFIG_UPDATE_POOL 0 #define SPA_CONFIG_UPDATE_VDEVS 1 -extern void spa_write_cachefile(spa_t *, boolean_t, boolean_t); +extern void spa_write_cachefile(spa_t *, boolean_t, boolean_t, boolean_t); extern void spa_config_load(void); extern nvlist_t *spa_all_configs(uint64_t *); extern void spa_config_set(spa_t *spa, nvlist_t *config); diff --git a/include/sys/vdev.h b/include/sys/vdev.h index 5fec1d51a5f2..7a7c70dc1598 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -148,6 +148,7 @@ extern int vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux); extern int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *); extern int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags); +extern int vdev_remove_wanted(spa_t *spa, uint64_t guid); extern void vdev_clear(spa_t *spa, vdev_t *vd); extern boolean_t vdev_is_dead(vdev_t *vd); @@ -190,6 +191,8 @@ typedef enum vdev_config_flag { VDEV_CONFIG_MISSING = 1 << 4 } vdev_config_flag_t; +extern void vdev_post_kobj_evt(vdev_t *vd); +extern void vdev_clear_kobj_evt(vdev_t *vd); extern void vdev_top_config_generate(spa_t *spa, nvlist_t *config); extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, vdev_config_flag_t flags); diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 84c7363ddef0..b789d2c05d59 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ 
-69,6 +69,7 @@ extern uint_t zfs_vdev_async_write_max_active; * Virtual device operations */ typedef int vdev_init_func_t(spa_t *spa, nvlist_t *nv, void **tsd); +typedef void vdev_kobj_post_evt_func_t(vdev_t *vd); typedef void vdev_fini_func_t(vdev_t *vd); typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *max_size, uint64_t *ashift, uint64_t *pshift); @@ -123,6 +124,7 @@ typedef const struct vdev_ops { vdev_config_generate_func_t *vdev_op_config_generate; vdev_nparity_func_t *vdev_op_nparity; vdev_ndisks_func_t *vdev_op_ndisks; + vdev_kobj_post_evt_func_t *vdev_op_kobj_evt_post; char vdev_op_type[16]; boolean_t vdev_op_leaf; } vdev_ops_t; @@ -436,6 +438,7 @@ struct vdev { boolean_t vdev_isl2cache; /* was a l2cache device */ boolean_t vdev_copy_uberblocks; /* post expand copy uberblocks */ boolean_t vdev_resilver_deferred; /* resilver deferred */ + boolean_t vdev_kobj_flag; /* kobj event record */ vdev_queue_t vdev_queue; /* I/O deadline schedule queue */ vdev_cache_t vdev_cache; /* physical block cache */ spa_aux_vdev_t *vdev_aux; /* for l2cache and spares vdevs */ diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index 83ed97fbec7f..1f7e8bffabdb 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -50,6 +50,7 @@ extern "C" { #include #include #include +#include #include #include #include diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 7dd12df81718..3471bcac9412 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -550,6 +550,7 @@ + @@ -3505,6 +3506,11 @@ + + + + + diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index eea388cf348f..b9806dc30dac 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -3073,6 +3073,43 @@ zpool_vdev_offline(zpool_handle_t *zhp, const char *path, boolean_t istmp) } } +/* + * Remove the specified vdev asynchronously from the configuration, so + * that it may come ONLINE if reinserted. 
This is called from zed on + * Udev remove event. + * Note: We also have a similar function zpool_vdev_remove() that + * removes the vdev from the pool. + */ +int +zpool_vdev_remove_wanted(zpool_handle_t *zhp, const char *path) +{ + zfs_cmd_t zc = {"\0"}; + char errbuf[ERRBUFLEN]; + nvlist_t *tgt; + boolean_t avail_spare, l2cache; + libzfs_handle_t *hdl = zhp->zpool_hdl; + + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "cannot remove %s"), path); + + (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, + NULL)) == NULL) + return (zfs_error(hdl, EZFS_NODEVICE, errbuf)); + + zc.zc_guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID); + + if (avail_spare) + return (zfs_error(hdl, EZFS_ISSPARE, errbuf)); + + zc.zc_cookie = VDEV_STATE_REMOVED; + + if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) + return (0); + + return (zpool_standard_error(hdl, errno, errbuf)); +} + /* * Mark the given vdev faulted. */ diff --git a/module/os/linux/spl/spl-generic.c b/module/os/linux/spl/spl-generic.c index de91c44257aa..bc39ece9a427 100644 --- a/module/os/linux/spl/spl-generic.c +++ b/module/os/linux/spl/spl-generic.c @@ -47,6 +47,7 @@ #include #include #include +#include unsigned long spl_hostid = 0; EXPORT_SYMBOL(spl_hostid); @@ -517,6 +518,38 @@ ddi_copyin(const void *from, void *to, size_t len, int flags) } EXPORT_SYMBOL(ddi_copyin); +/* + * Post a uevent to userspace whenever a new vdev adds to the pool. It is + * necessary to sync blkid information with udev, which zed daemon uses + * during device hotplug to identify the vdev. 
+ */ +void +spl_signal_kobj_evt(struct block_device *bdev) +{ +#if defined(HAVE_BDEV_KOBJ) || defined(HAVE_PART_TO_DEV) +#ifdef HAVE_BDEV_KOBJ + struct kobject *disk_kobj = bdev_kobj(bdev); +#else + struct kobject *disk_kobj = &part_to_dev(bdev->bd_part)->kobj; +#endif + if (disk_kobj) { + int ret = kobject_uevent(disk_kobj, KOBJ_CHANGE); + if (ret) { + pr_warn("ZFS: Sending event '%d' to kobject: '%s'" + " (%p): failed(ret:%d)\n", KOBJ_CHANGE, + kobject_name(disk_kobj), disk_kobj, ret); + } + } +#else +/* + * This is encountered if neither bdev_kobj() nor part_to_dev() is available + * in the kernel - likely due to an API change that needs to be chased down. + */ +#error "Unsupported kernel: unable to get struct kobj from bdev" +#endif +} +EXPORT_SYMBOL(spl_signal_kobj_evt); + int ddi_copyout(const void *from, void *to, size_t len, int flags) { diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index ba7adcc1b576..0fed09df5203 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -179,6 +179,18 @@ vdev_disk_error(zio_t *zio) zio->io_flags); } +static void +vdev_disk_kobj_evt_post(vdev_t *v) +{ + vdev_disk_t *vd = v->vdev_tsd; + if (vd && vd->vd_bdev) { + spl_signal_kobj_evt(vd->vd_bdev); + } else { + vdev_dbgmsg(v, "vdev_disk_t is NULL for VDEV:%s\n", + v->vdev_path); + } +} + static int vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, uint64_t *logical_ashift, uint64_t *physical_ashift) @@ -290,6 +302,13 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, bdev = blkdev_get_by_path(v->vdev_path, mode | FMODE_EXCL, zfs_vdev_holder); if (unlikely(PTR_ERR(bdev) == -ENOENT)) { + /* + * There is no point of waiting since device is removed + * explicitly + */ + if (v->vdev_removed) + break; + schedule_timeout(MSEC_TO_TICK(10)); } else if (unlikely(PTR_ERR(bdev) == -ERESTARTSYS)) { timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms * 10); @@ -901,7 +920,7 @@ vdev_disk_io_done(zio_t *zio) 
vdev_t *v = zio->io_vd; vdev_disk_t *vd = v->vdev_tsd; - if (zfs_check_media_change(vd->vd_bdev)) { + if (!zfs_check_disk_status(vd->vd_bdev)) { invalidate_bdev(vd->vd_bdev); v->vdev_remove_wanted = B_TRUE; spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); @@ -957,7 +976,8 @@ vdev_ops_t vdev_disk_ops = { .vdev_op_nparity = NULL, .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ - .vdev_op_leaf = B_TRUE /* leaf vdev */ + .vdev_op_leaf = B_TRUE, /* leaf vdev */ + .vdev_op_kobj_evt_post = vdev_disk_kobj_evt_post }; /* diff --git a/module/zfs/spa.c b/module/zfs/spa.c index b8e054a5f2a2..cc367745e486 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -909,7 +909,16 @@ spa_change_guid(spa_t *spa) spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED); if (error == 0) { - spa_write_cachefile(spa, B_FALSE, B_TRUE); + /* + * Clear the kobj flag from all the vdevs to allow + * vdev_post_kobj_evt() to post events to all the + * vdevs since GUID is updated. 
+ */ + vdev_clear_kobj_evt(spa->spa_root_vdev); + for (int i = 0; i < spa->spa_l2cache.sav_count; i++) + vdev_clear_kobj_evt(spa->spa_l2cache.sav_vdevs[i]); + + spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE); spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID); } @@ -5220,7 +5229,7 @@ spa_open_common(const char *pool, spa_t **spapp, const void *tag, */ spa_unload(spa); spa_deactivate(spa); - spa_write_cachefile(spa, B_TRUE, B_TRUE); + spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE); spa_remove(spa); if (locked) mutex_exit(&spa_namespace_lock); @@ -6044,7 +6053,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa_spawn_aux_threads(spa); - spa_write_cachefile(spa, B_FALSE, B_TRUE); + spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE); /* * Don't count references from objsets that are already closed @@ -6107,7 +6116,7 @@ spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) if (props != NULL) spa_configfile_set(spa, props, B_FALSE); - spa_write_cachefile(spa, B_FALSE, B_TRUE); + spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE); spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); zfs_dbgmsg("spa_import: verbatim import of %s", pool); mutex_exit(&spa_namespace_lock); @@ -6503,7 +6512,7 @@ spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, if (new_state != POOL_STATE_UNINITIALIZED) { if (!hardforce) - spa_write_cachefile(spa, B_TRUE, B_TRUE); + spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE); spa_remove(spa); } else { /* diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c index 91ac5c05e8af..5165c370403b 100644 --- a/module/zfs/spa_config.c +++ b/module/zfs/spa_config.c @@ -240,7 +240,8 @@ spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl) * would be required. 
*/ void -spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent) +spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent, + boolean_t postblkidevent) { spa_config_dirent_t *dp, *tdp; nvlist_t *nvl; @@ -346,6 +347,16 @@ spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent) if (postsysevent) spa_event_notify(target, NULL, NULL, ESC_ZFS_CONFIG_SYNC); + + /* + * Post udev event to sync blkid information if the pool is created + * or a new vdev is added to the pool. + */ + if ((target->spa_root_vdev) && postblkidevent) { + vdev_post_kobj_evt(target->spa_root_vdev); + for (int i = 0; i < target->spa_l2cache.sav_count; i++) + vdev_post_kobj_evt(target->spa_l2cache.sav_vdevs[i]); + } } /* @@ -600,6 +611,7 @@ spa_config_update(spa_t *spa, int what) */ if (!spa->spa_is_root) { spa_write_cachefile(spa, B_FALSE, + what != SPA_CONFIG_UPDATE_POOL, what != SPA_CONFIG_UPDATE_POOL); } diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index f7865bc49bc4..102070013404 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -1290,7 +1290,7 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, * If the config changed, update the config cache. 
*/ if (config_changed) - spa_write_cachefile(spa, B_FALSE, B_TRUE); + spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE); } /* @@ -1385,7 +1385,7 @@ spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error) */ if (config_changed) { mutex_enter(&spa_namespace_lock); - spa_write_cachefile(spa, B_FALSE, B_TRUE); + spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE); mutex_exit(&spa_namespace_lock); } diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 53c767e3bcb1..b097e09210af 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -1948,6 +1948,14 @@ vdev_open(vdev_t *vd) error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, &logical_ashift, &physical_ashift); + + /* Keep the device in removed state if unplugged */ + if (error == ENOENT && vd->vdev_removed) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_REMOVED, + VDEV_AUX_NONE); + return (error); + } + /* * Physical volume size should never be larger than its max size, unless * the disk has shrunk while we were reading it or the device is buggy @@ -3166,6 +3174,34 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, mutex_exit(&vd->vdev_dtl_lock); } +/* + * Iterate over all the vdevs except spare, and post kobj events + */ +void +vdev_post_kobj_evt(vdev_t *vd) +{ + if (vd->vdev_ops->vdev_op_kobj_evt_post && + vd->vdev_kobj_flag == B_FALSE) { + vd->vdev_kobj_flag = B_TRUE; + vd->vdev_ops->vdev_op_kobj_evt_post(vd); + } + + for (int c = 0; c < vd->vdev_children; c++) + vdev_post_kobj_evt(vd->vdev_child[c]); +} + +/* + * Iterate over all the vdevs except spare, and clear kobj events + */ +void +vdev_clear_kobj_evt(vdev_t *vd) +{ + vd->vdev_kobj_flag = B_FALSE; + + for (int c = 0; c < vd->vdev_children; c++) + vdev_clear_kobj_evt(vd->vdev_child[c]); +} + int vdev_dtl_load(vdev_t *vd) { @@ -3947,6 +3983,29 @@ vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux) return (spa_vdev_state_exit(spa, vd, 0)); } +int +vdev_remove_wanted(spa_t *spa, uint64_t guid) +{ + vdev_t *vd; + + 
spa_vdev_state_enter(spa, SCL_NONE); + + if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) + return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); + + /* + * If the vdev is already removed, then don't do anything. + */ + if (vd->vdev_removed) + return (spa_vdev_state_exit(spa, NULL, 0)); + + vd->vdev_remove_wanted = B_TRUE; + spa_async_request(spa, SPA_ASYNC_REMOVE); + + return (spa_vdev_state_exit(spa, vd, 0)); +} + + /* * Online the given vdev. * diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 620238b7265f..c3266c09306b 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -1912,6 +1912,10 @@ zfs_ioc_vdev_set_state(zfs_cmd_t *zc) error = vdev_degrade(spa, zc->zc_guid, zc->zc_obj); break; + case VDEV_STATE_REMOVED: + error = vdev_remove_wanted(spa, zc->zc_guid); + break; + default: error = SET_ERROR(EINVAL); } @@ -2928,7 +2932,7 @@ zfs_ioc_pool_set_props(zfs_cmd_t *zc) mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(zc->zc_name)) != NULL) { spa_configfile_set(spa, props, B_FALSE); - spa_write_cachefile(spa, B_FALSE, B_TRUE); + spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE); } mutex_exit(&spa_namespace_lock); if (spa != NULL) { diff --git a/module/zfs/zio.c b/module/zfs/zio.c index cc2b61f2520b..c2e3c6169fa3 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -3923,7 +3923,7 @@ zio_vdev_io_done(zio_t *zio) ops->vdev_op_io_done(zio); - if (unexpected_error) + if (unexpected_error && vd->vdev_remove_wanted == B_FALSE) VERIFY(vdev_probe(vd, zio) == NULL); return (zio); diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index d163fc7c8ccc..02e6a500a71a 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -1884,7 +1884,7 @@ function wait_hotspare_state # pool disk state timeout # # Return 0 is pool/disk matches expected state, 1 otherwise # -function check_vdev_state # pool disk state{online,offline,unavail} +function 
check_vdev_state # pool disk state{online,offline,unavail,removed} { typeset pool=$1 typeset disk=${2#*$DEV_DSKDIR/} diff --git a/tests/zfs-tests/tests/functional/fault/auto_offline_001_pos.ksh b/tests/zfs-tests/tests/functional/fault/auto_offline_001_pos.ksh index 17bde9a70636..0ab9317c0a06 100755 --- a/tests/zfs-tests/tests/functional/fault/auto_offline_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/fault/auto_offline_001_pos.ksh @@ -24,29 +24,28 @@ # # DESCRIPTION: -# Testing Fault Management Agent ZED Logic - Physically removed device is -# made unavail and onlined when reattached +# Testing Fault Management Agent ZED Logic - Physically detached device is +# made removed and onlined when reattached # # STRATEGY: # 1. Create a pool # 2. Simulate physical removal of one device -# 3. Verify the device is unavailable +# 3. Verify the device is removed when detached # 4. Reattach the device # 5. Verify the device is onlined # 6. Repeat the same tests with a spare device: # zed will use the spare to handle the removed data device # 7. Repeat the same tests again with a faulted spare device: -# the removed data device should be unavailable +# the removed data device should be removed # # NOTE: the use of 'block_device_wait' throughout the test helps avoid race # conditions caused by mixing creation/removal events from partitioning the # disk (zpool create) and events from physically removing it (remove_disk). # -# NOTE: the test relies on 'zpool sync' to prompt the kmods to transition a -# vdev to the unavailable state. The ZED does receive a removal notification -# but only relies on it to activate a hot spare. Additional work is planned -# to extend an existing ioctl interface to allow the ZED to transition the -# vdev in to a removed state. +# NOTE: the test relies on the ZED to transition the vdev to the removed state +# on a device removal event. 
The ZED receives the removal notification, marks +# the vdev as removed through the vdev set-state ioctl interface, and then +# activates a hot spare if one is available. # verify_runnable "both" @@ -103,8 +102,8 @@ do log_must mkfile 1m $mntpnt/file sync_pool $TESTPOOL - # 3. Verify the device is unavailable. - log_must wait_vdev_state $TESTPOOL $removedev "UNAVAIL" + # 3. Verify the device is removed. + log_must wait_vdev_state $TESTPOOL $removedev "REMOVED" # 4. Reattach the device insert_disk $removedev @@ -136,7 +135,7 @@ do # 3. Verify the device is handled by the spare. log_must wait_hotspare_state $TESTPOOL $sparedev "INUSE" - log_must wait_vdev_state $TESTPOOL $removedev "UNAVAIL" + log_must wait_vdev_state $TESTPOOL $removedev "REMOVED" # 4. Reattach the device insert_disk $removedev @@ -170,8 +169,8 @@ do log_must mkfile 1m $mntpnt/file sync_pool $TESTPOOL - # 4. Verify the device is unavailable - log_must wait_vdev_state $TESTPOOL $removedev "UNAVAIL" + # 4. Verify the device is removed + log_must wait_vdev_state $TESTPOOL $removedev "REMOVED" # 5. Reattach the device insert_disk $removedev