From c746ed724d01b439d42aff01cbe88eadacd0ac0d Mon Sep 17 00:00:00 2001 From: "Jason A. Harmening" Date: Sat, 12 Jun 2021 12:42:12 -0700 Subject: [PATCH] Allow stacked filesystems to be recursively unmounted In certain emergency cases such as media failure or removal, UFS will initiate a forced unmount in order to prevent dirty buffers from accumulating against the no-longer-usable filesystem. The presence of a stacked filesystem such as nullfs or unionfs above the UFS mount will prevent this forced unmount from succeeding. This change addresses the situation by allowing stacked filesystems to be recursively unmounted on a taskqueue thread when the MNT_RECURSE flag is specified to dounmount(). This call will block until all upper mounts have been removed unless the caller specifies the MNT_DEFERRED flag to indicate the base filesystem should also be unmounted from the taskqueue. To achieve this, the recently-added vfs_pin_from_vp()/vfs_unpin() KPIs have been combined with the existing 'mnt_uppers' list used by nullfs and renamed to vfs_register_upper_from_vp()/vfs_unregister_upper(). The format of the mnt_uppers list has also been changed to accommodate filesystems such as unionfs in which a given mount may be stacked atop more than one lower mount. Additionally, management of lower FS reclaim/unlink notifications has been split into a separate list managed by a separate set of KPIs, as registration of an upper FS no longer implies interest in these notifications. Reviewed by: kib, mckusick Tested by: pho Differential Revision: https://reviews.freebsd.org/D31016 --- sys/fs/nullfs/null.h | 2 + sys/fs/nullfs/null_vfsops.c | 30 ++-- sys/fs/unionfs/union.h | 2 + sys/fs/unionfs/union_vfsops.c | 14 +- sys/kern/vfs_mount.c | 280 ++++++++++++++++++++++++++++++---- sys/kern/vfs_subr.c | 55 ++++--- sys/sys/mount.h | 51 +++++-- sys/ufs/ffs/ffs_vfsops.c | 2 +- 8 files changed, 348 insertions(+), 88 deletions(-) diff --git a/sys/fs/nullfs/null.h b/sys/fs/nullfs/null.h index 6fdac4b1006e..a41625536d65 100644 --- a/sys/fs/nullfs/null.h +++ b/sys/fs/nullfs/null.h @@ -45,6 +45,8 @@ struct null_mount { struct mount *nullm_vfs; struct vnode *nullm_lowerrootvp; /* Ref to lower root vnode */ uint64_t nullm_flags; + struct mount_upper_node upper_node; + struct mount_upper_node notify_node; }; #ifdef _KERNEL diff --git a/sys/fs/nullfs/null_vfsops.c b/sys/fs/nullfs/null_vfsops.c index 4914e5fc2dbf..73301c9275d2 100644 --- a/sys/fs/nullfs/null_vfsops.c +++ b/sys/fs/nullfs/null_vfsops.c @@ -163,7 +163,8 @@ nullfs_mount(struct mount *mp) * Save pointer to underlying FS and the reference to the * lower root vnode.
*/ - xmp->nullm_vfs = vfs_pin_from_vp(lowerrootvp); + xmp->nullm_vfs = vfs_register_upper_from_vp(lowerrootvp, mp, + &xmp->upper_node); if (xmp->nullm_vfs == NULL) { vput(lowerrootvp); free(xmp, M_NULLFSMNT); @@ -178,7 +179,7 @@ nullfs_mount(struct mount *mp) */ error = null_nodeget(mp, lowerrootvp, &nullm_rootvp); if (error != 0) { - vfs_unpin(xmp->nullm_vfs); + vfs_unregister_upper(xmp->nullm_vfs, &xmp->upper_node); vrele(lowerrootvp); free(xmp, M_NULLFSMNT); return (error); @@ -195,6 +196,11 @@ nullfs_mount(struct mount *mp) (xmp->nullm_vfs->mnt_kern_flag & MNTK_NULL_NOCACHE) != 0) xmp->nullm_flags &= ~NULLM_CACHE; + if ((xmp->nullm_flags & NULLM_CACHE) != 0) { + vfs_register_for_notification(xmp->nullm_vfs, mp, + &xmp->notify_node); + } + MNT_ILOCK(mp); if ((xmp->nullm_flags & NULLM_CACHE) != 0) { mp->mnt_kern_flag |= lowerrootvp->v_mount->mnt_kern_flag & @@ -206,13 +212,6 @@ nullfs_mount(struct mount *mp) (MNTK_USES_BCACHE | MNTK_NO_IOPF | MNTK_UNMAPPED_BUFS); MNT_IUNLOCK(mp); vfs_getnewfsid(mp); - if ((xmp->nullm_flags & NULLM_CACHE) != 0) { - MNT_ILOCK(xmp->nullm_vfs); - TAILQ_INSERT_TAIL(&xmp->nullm_vfs->mnt_uppers, mp, - mnt_upper_link); - MNT_IUNLOCK(xmp->nullm_vfs); - } - vfs_mountedfrom(mp, target); vput(nullm_rootvp); @@ -230,7 +229,6 @@ nullfs_unmount(mp, mntflags) int mntflags; { struct null_mount *mntdata; - struct mount *ump; int error, flags; NULLFSDEBUG("nullfs_unmount: mp = %p\n", (void *)mp); @@ -259,17 +257,11 @@ nullfs_unmount(mp, mntflags) * Finally, throw away the null_mount structure */ mntdata = mp->mnt_data; - ump = mntdata->nullm_vfs; if ((mntdata->nullm_flags & NULLM_CACHE) != 0) { - MNT_ILOCK(ump); - while ((ump->mnt_kern_flag & MNTK_VGONE_UPPER) != 0) { - ump->mnt_kern_flag |= MNTK_VGONE_WAITER; - msleep(&ump->mnt_uppers, &ump->mnt_mtx, 0, "vgnupw", 0); - } - TAILQ_REMOVE(&ump->mnt_uppers, mp, mnt_upper_link); - MNT_IUNLOCK(ump); + vfs_unregister_for_notification(mntdata->nullm_vfs, + &mntdata->notify_node); } - vfs_unpin(ump); + vfs_unregister_upper(mntdata->nullm_vfs, &mntdata->upper_node); vrele(mntdata->nullm_lowerrootvp); mp->mnt_data = NULL; free(mntdata, M_NULLFSMNT); diff --git a/sys/fs/unionfs/union.h b/sys/fs/unionfs/union.h index 64706b2b21a2..96180480dbec 100644 --- a/sys/fs/unionfs/union.h +++ b/sys/fs/unionfs/union.h @@ -57,6 +57,8 @@ struct unionfs_mount { struct vnode *um_lowervp; /* VREFed once */ struct vnode *um_uppervp; /* VREFed once */ struct vnode *um_rootvp; /* ROOT vnode */ + struct mount_upper_node um_lower_link; /* node in lower FS list of uppers */ + struct mount_upper_node um_upper_link; /* node in upper FS list of uppers */ unionfs_copymode um_copymode; unionfs_whitemode um_whitemode; uid_t um_uid; diff --git a/sys/fs/unionfs/union_vfsops.c b/sys/fs/unionfs/union_vfsops.c index 96a30f0ae8b5..c17650dedc63 100644 --- a/sys/fs/unionfs/union_vfsops.c +++ b/sys/fs/unionfs/union_vfsops.c @@ -292,14 +292,16 @@ unionfs_domount(struct mount *mp) return (error); } - lowermp = vfs_pin_from_vp(ump->um_lowervp); - uppermp = vfs_pin_from_vp(ump->um_uppervp); + lowermp = vfs_register_upper_from_vp(ump->um_lowervp, mp, + &ump->um_lower_link); + uppermp = vfs_register_upper_from_vp(ump->um_uppervp, mp, + &ump->um_upper_link); if (lowermp == NULL || uppermp == NULL) { if (lowermp != NULL) - vfs_unpin(lowermp); + vfs_unregister_upper(lowermp, &ump->um_lower_link); if (uppermp != NULL) - vfs_unpin(uppermp); + vfs_unregister_upper(uppermp, &ump->um_upper_link); free(ump, M_UNIONFSMNT); mp->mnt_data = NULL; return (ENOENT); @@ -357,8 +359,8 @@ 
unionfs_unmount(struct mount *mp, int mntflags) if (error) return (error); - vfs_unpin(ump->um_lowervp->v_mount); - vfs_unpin(ump->um_uppervp->v_mount); + vfs_unregister_upper(ump->um_lowervp->v_mount, &ump->um_lower_link); + vfs_unregister_upper(ump->um_uppervp->v_mount, &ump->um_upper_link); free(ump, M_UNIONFSMNT); mp->mnt_data = NULL; diff --git a/sys/kern/vfs_mount.c b/sys/kern/vfs_mount.c index 354113eb3277..3c546392b213 100644 --- a/sys/kern/vfs_mount.c +++ b/sys/kern/vfs_mount.c @@ -65,6 +65,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include @@ -89,6 +90,11 @@ static bool default_autoro = false; SYSCTL_BOOL(_vfs, OID_AUTO, default_autoro, CTLFLAG_RW, &default_autoro, 0, "Retry failed r/w mount as r/o if no explicit ro/rw option is specified"); +static bool recursive_forced_unmount = false; +SYSCTL_BOOL(_vfs, OID_AUTO, recursive_forced_unmount, CTLFLAG_RW, + &recursive_forced_unmount, 0, "Recursively unmount stacked upper mounts" + " when a file system is forcibly unmounted"); + MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount structure"); MALLOC_DEFINE(M_STATFS, "statfs", "statfs structure"); static uma_zone_t mount_zone; @@ -103,6 +109,16 @@ MTX_SYSINIT(mountlist, &mountlist_mtx, "mountlist", MTX_DEF); EVENTHANDLER_LIST_DEFINE(vfs_mounted); EVENTHANDLER_LIST_DEFINE(vfs_unmounted); +static void vfs_deferred_unmount(void *arg, int pending); +static struct task deferred_unmount_task = + TASK_INITIALIZER(0, vfs_deferred_unmount, NULL);; +static struct mtx deferred_unmount_lock; +MTX_SYSINIT(deferred_unmount, &deferred_unmount_lock, "deferred_unmount", + MTX_DEF); +static STAILQ_HEAD(, mount) deferred_unmount_list = + STAILQ_HEAD_INITIALIZER(deferred_unmount_list); +TASKQUEUE_DEFINE_THREAD(deferred_unmount); + static void mount_devctl_event(const char *type, struct mount *mp, bool donew); /* @@ -505,8 +521,21 @@ vfs_ref(struct mount *mp) MNT_IUNLOCK(mp); } +/* + * Register ump as an upper mount of the mount associated with + * vnode vp. This registration will be tracked through + * mount_upper_node upper, which should be allocated by the + * caller and stored in per-mount data associated with mp. + * + * If successful, this function will return the mount associated + * with vp, and will ensure that it cannot be unmounted until + * ump has been unregistered as one of its upper mounts. + * + * Upon failure this function will return NULL. + */ struct mount * -vfs_pin_from_vp(struct vnode *vp) +vfs_register_upper_from_vp(struct vnode *vp, struct mount *ump, + struct mount_upper_node *upper) { struct mount *mp; @@ -514,26 +543,81 @@ vfs_pin_from_vp(struct vnode *vp) if (mp == NULL) return (NULL); MNT_ILOCK(mp); - if (mp != vp->v_mount || (mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) { + if (mp != vp->v_mount || + ((mp->mnt_kern_flag & (MNTK_UNMOUNT | MNTK_RECURSE)) != 0)) { MNT_IUNLOCK(mp); return (NULL); } + KASSERT(ump != mp, ("upper and lower mounts are identical")); + upper->mp = ump; MNT_REF(mp); - KASSERT(mp->mnt_pinned_count < INT_MAX, - ("mount pinned count overflow")); - ++mp->mnt_pinned_count; + TAILQ_INSERT_TAIL(&mp->mnt_uppers, upper, mnt_upper_link); MNT_IUNLOCK(mp); return (mp); } +/* + * Register upper mount ump to receive vnode unlink/reclaim + * notifications from lower mount mp. This registration will + * be tracked through mount_upper_node upper, which should be + * allocated by the caller and stored in per-mount data + * associated with mp. 
+ * + * ump must already be registered as an upper mount of mp + * through a call to vfs_register_upper_from_vp(). + */ void -vfs_unpin(struct mount *mp) +vfs_register_for_notification(struct mount *mp, struct mount *ump, + struct mount_upper_node *upper) +{ + upper->mp = ump; + MNT_ILOCK(mp); + TAILQ_INSERT_TAIL(&mp->mnt_notify, upper, mnt_upper_link); + MNT_IUNLOCK(mp); +} + +static void +vfs_drain_upper_locked(struct mount *mp) +{ + mtx_assert(MNT_MTX(mp), MA_OWNED); + while (mp->mnt_upper_pending != 0) { + mp->mnt_kern_flag |= MNTK_UPPER_WAITER; + msleep(&mp->mnt_uppers, MNT_MTX(mp), 0, "mntupw", 0); + } +} + +/* + * Undo a previous call to vfs_register_for_notification(). + * The mount represented by upper must be currently registered + * as an upper mount for mp. + */ +void +vfs_unregister_for_notification(struct mount *mp, + struct mount_upper_node *upper) +{ + MNT_ILOCK(mp); + vfs_drain_upper_locked(mp); + TAILQ_REMOVE(&mp->mnt_notify, upper, mnt_upper_link); + MNT_IUNLOCK(mp); +} + +/* + * Undo a previous call to vfs_register_upper_from_vp(). + * This must be done before mp can be unmounted. + */ +void +vfs_unregister_upper(struct mount *mp, struct mount_upper_node *upper) { MNT_ILOCK(mp); - KASSERT(mp->mnt_pinned_count > 0, ("mount pinned count underflow")); KASSERT((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0, - ("mount pinned with pending unmount")); - --mp->mnt_pinned_count; + ("registered upper with pending unmount")); + vfs_drain_upper_locked(mp); + TAILQ_REMOVE(&mp->mnt_uppers, upper, mnt_upper_link); + if ((mp->mnt_kern_flag & MNTK_TASKQUEUE_WAITER) != 0 && + TAILQ_EMPTY(&mp->mnt_uppers)) { + mp->mnt_kern_flag &= ~MNTK_TASKQUEUE_WAITER; + wakeup(&mp->mnt_taskqueue_link); + } MNT_REL(mp); MNT_IUNLOCK(mp); } @@ -600,8 +684,10 @@ vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp, const char *fspath, mac_mount_create(cred, mp); #endif arc4rand(&mp->mnt_hashseed, sizeof mp->mnt_hashseed, 0); + mp->mnt_upper_pending = 0; TAILQ_INIT(&mp->mnt_uppers); - mp->mnt_pinned_count = 0; + TAILQ_INIT(&mp->mnt_notify); + mp->mnt_taskqueue_flags = 0; return (mp); } @@ -640,9 +726,9 @@ vfs_mount_destroy(struct mount *mp) vn_printf(vp, "dangling vnode "); panic("unmount: dangling vnode"); } - KASSERT(mp->mnt_pinned_count == 0, - ("mnt_pinned_count = %d", mp->mnt_pinned_count)); + KASSERT(mp->mnt_upper_pending == 0, ("mnt_upper_pending")); KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), ("mnt_uppers")); + KASSERT(TAILQ_EMPTY(&mp->mnt_notify), ("mnt_notify")); if (mp->mnt_nvnodelistsize != 0) panic("vfs_mount_destroy: nonzero nvnodelistsize"); if (mp->mnt_lazyvnodelistsize != 0) @@ -1799,17 +1885,166 @@ vfs_mount_fetch_counter(struct mount *mp, enum mount_counter which) return (sum); } +static bool +deferred_unmount_enqueue(struct mount *mp, uint64_t flags, bool requeue) +{ + bool enqueued; + + enqueued = false; + mtx_lock(&deferred_unmount_lock); + if ((mp->mnt_taskqueue_flags & MNT_DEFERRED) == 0 || requeue) { + mp->mnt_taskqueue_flags = flags | MNT_DEFERRED; + STAILQ_INSERT_TAIL(&deferred_unmount_list, mp, + mnt_taskqueue_link); + enqueued = true; + } + mtx_unlock(&deferred_unmount_lock); + + if (enqueued) { + taskqueue_enqueue(taskqueue_deferred_unmount, + &deferred_unmount_task); + } + + return (enqueued); +} + +/* + * Taskqueue handler for processing async/recursive unmounts + */ +static void +vfs_deferred_unmount(void *argi __unused, int pending __unused) +{ + STAILQ_HEAD(, mount) local_unmounts; + uint64_t flags; + struct mount *mp, *tmp; + bool unmounted; + + STAILQ_INIT(&local_unmounts); + 
mtx_lock(&deferred_unmount_lock); + STAILQ_CONCAT(&local_unmounts, &deferred_unmount_list); + mtx_unlock(&deferred_unmount_lock); + + STAILQ_FOREACH_SAFE(mp, &local_unmounts, mnt_taskqueue_link, tmp) { + flags = mp->mnt_taskqueue_flags; + KASSERT((flags & MNT_DEFERRED) != 0, + ("taskqueue unmount without MNT_DEFERRED")); + if (dounmount(mp, flags, curthread) != 0) { + MNT_ILOCK(mp); + unmounted = ((mp->mnt_kern_flag & MNTK_REFEXPIRE) != 0); + MNT_IUNLOCK(mp); + if (!unmounted) + deferred_unmount_enqueue(mp, flags, true); + else + vfs_rel(mp); + } + } +} + /* * Do the actual filesystem unmount. */ int -dounmount(struct mount *mp, int flags, struct thread *td) +dounmount(struct mount *mp, uint64_t flags, struct thread *td) { + struct mount_upper_node *upper; struct vnode *coveredvp, *rootvp; int error; uint64_t async_flag; int mnt_gen_r; + KASSERT((flags & MNT_DEFERRED) == 0 || + (flags & (MNT_RECURSE | MNT_FORCE)) == (MNT_RECURSE | MNT_FORCE), + ("MNT_DEFERRED requires MNT_RECURSE | MNT_FORCE")); + + /* + * If the caller has explicitly requested the unmount to be handled by + * the taskqueue and we're not already in taskqueue context, queue + * up the unmount request and exit. This is done prior to any + * credential checks; MNT_DEFERRED should be used only for kernel- + * initiated unmounts and will therefore be processed with the + * (kernel) credentials of the taskqueue thread. Still, callers + * should be sure this is the behavior they want. + */ + if ((flags & MNT_DEFERRED) != 0 && + taskqueue_member(taskqueue_deferred_unmount, curthread) == 0) { + if (!deferred_unmount_enqueue(mp, flags, false)) + vfs_rel(mp); + return (EINPROGRESS); + } + + /* + * Only privileged root, or (if MNT_USER is set) the user that did the + * original mount is permitted to unmount this filesystem. + * This check should be made prior to queueing up any recursive + * unmounts of upper filesystems. Those unmounts will be executed + * with kernel thread credentials and are expected to succeed, so + * we must at least ensure the originating context has sufficient + * privilege to unmount the base filesystem before proceeding with + * the uppers. + */ + error = vfs_suser(mp, td); + if (error != 0) { + KASSERT((flags & MNT_DEFERRED) == 0, + ("taskqueue unmount with insufficient privilege")); + vfs_rel(mp); + return (error); + } + + if (recursive_forced_unmount && ((flags & MNT_FORCE) != 0)) + flags |= MNT_RECURSE; + + if ((flags & MNT_RECURSE) != 0) { + KASSERT((flags & MNT_FORCE) != 0, + ("MNT_RECURSE requires MNT_FORCE")); + + MNT_ILOCK(mp); + /* + * Set MNTK_RECURSE to prevent new upper mounts from being + * added, and note that an operation on the uppers list is in + * progress. This will ensure that unregistration from the + * uppers list, and therefore any pending unmount of the upper + * FS, can't complete until after we finish walking the list. + */ + mp->mnt_kern_flag |= MNTK_RECURSE; + mp->mnt_upper_pending++; + TAILQ_FOREACH(upper, &mp->mnt_uppers, mnt_upper_link) { + MNT_IUNLOCK(mp); + vfs_ref(upper->mp); + if (!deferred_unmount_enqueue(upper->mp, flags, false)) + vfs_rel(upper->mp); + MNT_ILOCK(mp); + } + mp->mnt_upper_pending--; + if ((mp->mnt_kern_flag & MNTK_UPPER_WAITER) != 0 && + mp->mnt_upper_pending == 0) { + mp->mnt_kern_flag &= ~MNTK_UPPER_WAITER; + wakeup(&mp->mnt_uppers); + } + /* + * If we're not on the taskqueue, wait until the uppers list + * is drained before proceeding with unmount. 
Otherwise, if + * we are on the taskqueue and there are still pending uppers, + * just re-enqueue on the end of the taskqueue. + */ + if ((flags & MNT_DEFERRED) == 0) { + while (!TAILQ_EMPTY(&mp->mnt_uppers)) { + mp->mnt_kern_flag |= MNTK_TASKQUEUE_WAITER; + msleep(&mp->mnt_taskqueue_link, MNT_MTX(mp), 0, + "umntqw", 0); + } + } else if (!TAILQ_EMPTY(&mp->mnt_uppers)) { + MNT_IUNLOCK(mp); + deferred_unmount_enqueue(mp, flags, true); + return (0); + } + MNT_IUNLOCK(mp); + KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), ("mnt_uppers not empty")); + } + + /* Allow the taskqueue to safely re-enqueue on failure */ + if ((flags & MNT_DEFERRED) != 0) + vfs_ref(mp); + if ((coveredvp = mp->mnt_vnodecovered) != NULL) { mnt_gen_r = mp->mnt_gen; VI_LOCK(coveredvp); @@ -1828,27 +2063,13 @@ dounmount(struct mount *mp, int flags, struct thread *td) } } - /* - * Only privileged root, or (if MNT_USER is set) the user that did the - * original mount is permitted to unmount this filesystem. - */ - error = vfs_suser(mp, td); - if (error != 0) { - if (coveredvp != NULL) { - VOP_UNLOCK(coveredvp); - vdrop(coveredvp); - } - vfs_rel(mp); - return (error); - } - vfs_op_enter(mp); vn_start_write(NULL, &mp, V_WAIT | V_MNTREF); MNT_ILOCK(mp); if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 || (mp->mnt_flag & MNT_UPDATE) != 0 || - mp->mnt_pinned_count != 0) { + !TAILQ_EMPTY(&mp->mnt_uppers)) { dounmount_cleanup(mp, coveredvp, 0); return (EBUSY); } @@ -1952,6 +2173,7 @@ dounmount(struct mount *mp, int flags, struct thread *td) } return (error); } + mtx_lock(&mountlist_mtx); TAILQ_REMOVE(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); @@ -1977,6 +2199,8 @@ dounmount(struct mount *mp, int flags, struct thread *td) } if (mp == rootdevmp) rootdevmp = NULL; + if ((flags & MNT_DEFERRED) != 0) + vfs_rel(mp); vfs_mount_destroy(mp); return (0); } diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index a2f25bf78495..8add6951645f 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -831,9 +831,9 @@ vfs_busy(struct mount *mp, int flags) * valid. */ while (mp->mnt_kern_flag & MNTK_UNMOUNT) { - KASSERT(mp->mnt_pinned_count == 0, - ("%s: non-zero pinned count %d with pending unmount", - __func__, mp->mnt_pinned_count)); + KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), + ("%s: non-empty upper mount list with pending unmount", + __func__)); if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) { MNT_REL(mp); MNT_IUNLOCK(mp); @@ -3897,6 +3897,11 @@ notify_lowervp_vfs_dummy(struct mount *mp __unused, { } +struct notify_mount { + struct mount mp; + struct mount_upper_node upper; +}; + /* * Notify upper mounts about reclaimed or unlinked vnode. 
*/ @@ -3907,45 +3912,52 @@ vfs_notify_upper(struct vnode *vp, int event) .vfs_reclaim_lowervp = notify_lowervp_vfs_dummy, .vfs_unlink_lowervp = notify_lowervp_vfs_dummy, }; - struct mount *mp, *ump, *mmp; + struct mount *mp; + struct mount_upper_node *ump; + struct notify_mount *mmp; mp = vp->v_mount; if (mp == NULL) return; - if (TAILQ_EMPTY(&mp->mnt_uppers)) + if (TAILQ_EMPTY(&mp->mnt_notify)) return; - mmp = malloc(sizeof(struct mount), M_TEMP, M_WAITOK | M_ZERO); - mmp->mnt_op = &vgonel_vfsops; - mmp->mnt_kern_flag |= MNTK_MARKER; + mmp = malloc(sizeof(*mmp), M_TEMP, M_WAITOK | M_ZERO); + mmp->mp.mnt_op = &vgonel_vfsops; + mmp->mp.mnt_kern_flag |= MNTK_MARKER; + mmp->upper.mp = &mmp->mp; MNT_ILOCK(mp); - mp->mnt_kern_flag |= MNTK_VGONE_UPPER; - for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) { - if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) { + mp->mnt_upper_pending++; + KASSERT(mp->mnt_upper_pending > 0, + ("%s: mnt_upper_pending %d", __func__, mp->mnt_upper_pending)); + for (ump = TAILQ_FIRST(&mp->mnt_notify); ump != NULL;) { + if ((ump->mp->mnt_kern_flag & MNTK_MARKER) != 0) { ump = TAILQ_NEXT(ump, mnt_upper_link); continue; } - TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link); + TAILQ_INSERT_AFTER(&mp->mnt_notify, ump, &mmp->upper, + mnt_upper_link); MNT_IUNLOCK(mp); switch (event) { case VFS_NOTIFY_UPPER_RECLAIM: - VFS_RECLAIM_LOWERVP(ump, vp); + VFS_RECLAIM_LOWERVP(ump->mp, vp); break; case VFS_NOTIFY_UPPER_UNLINK: - VFS_UNLINK_LOWERVP(ump, vp); + VFS_UNLINK_LOWERVP(ump->mp, vp); break; default: KASSERT(0, ("invalid event %d", event)); break; } MNT_ILOCK(mp); - ump = TAILQ_NEXT(mmp, mnt_upper_link); - TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link); + ump = TAILQ_NEXT(&mmp->upper, mnt_upper_link); + TAILQ_REMOVE(&mp->mnt_notify, &mmp->upper, mnt_upper_link); } free(mmp, M_TEMP); - mp->mnt_kern_flag &= ~MNTK_VGONE_UPPER; - if ((mp->mnt_kern_flag & MNTK_VGONE_WAITER) != 0) { - mp->mnt_kern_flag &= ~MNTK_VGONE_WAITER; + mp->mnt_upper_pending--; + if ((mp->mnt_kern_flag & MNTK_UPPER_WAITER) != 0 && + mp->mnt_upper_pending == 0) { + mp->mnt_kern_flag &= ~MNTK_UPPER_WAITER; wakeup(&mp->mnt_uppers); } MNT_IUNLOCK(mp); @@ -4376,12 +4388,13 @@ DB_SHOW_COMMAND(mount, db_show_mount) MNT_KERN_FLAG(MNTK_EXTENDED_SHARED); MNT_KERN_FLAG(MNTK_SHARED_WRITES); MNT_KERN_FLAG(MNTK_NO_IOPF); - MNT_KERN_FLAG(MNTK_VGONE_UPPER); - MNT_KERN_FLAG(MNTK_VGONE_WAITER); + MNT_KERN_FLAG(MNTK_RECURSE); + MNT_KERN_FLAG(MNTK_UPPER_WAITER); MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT); MNT_KERN_FLAG(MNTK_MARKER); MNT_KERN_FLAG(MNTK_USES_BCACHE); MNT_KERN_FLAG(MNTK_FPLOOKUP); + MNT_KERN_FLAG(MNTK_TASKQUEUE_WAITER); MNT_KERN_FLAG(MNTK_NOASYNC); MNT_KERN_FLAG(MNTK_UNMOUNT); MNT_KERN_FLAG(MNTK_MWAIT); diff --git a/sys/sys/mount.h b/sys/sys/mount.h index 693293b12370..2082ff089d69 100644 --- a/sys/sys/mount.h +++ b/sys/sys/mount.h @@ -190,6 +190,19 @@ struct mount_pcpu { _Static_assert(sizeof(struct mount_pcpu) == 16, "the struct is allocated from pcpu 16 zone"); +/* + * Structure for tracking a stacked filesystem mounted above another + * filesystem. This is expected to be stored in the upper FS' per-mount data. + * + * Lock reference: + * i - lower mount interlock + * c - constant from node initialization + */ +struct mount_upper_node { + struct mount *mp; /* (c) mount object for upper FS */ + TAILQ_ENTRY(mount_upper_node) mnt_upper_link; /* (i) position in uppers list */ +}; + /* * Structure per mounted filesystem. Each mounted filesystem has an * array of operations and an instance record. 
The filesystems are @@ -199,8 +212,8 @@ _Static_assert(sizeof(struct mount_pcpu) == 16, * l - mnt_listmtx * m - mountlist_mtx * i - interlock - * i* - interlock of uppers' list head * v - vnode freelist mutex + * d - deferred unmount list mutex * * Unmarked fields are considered stable as long as a ref is held. * @@ -242,10 +255,12 @@ struct mount { struct mtx mnt_listmtx; struct vnodelst mnt_lazyvnodelist; /* (l) list of lazy vnodes */ int mnt_lazyvnodelistsize; /* (l) # of lazy vnodes */ - int mnt_pinned_count; /* (i) unmount prevented */ + int mnt_upper_pending; /* (i) # of pending ops on mnt_uppers */ struct lock mnt_explock; /* vfs_export walkers lock */ - TAILQ_ENTRY(mount) mnt_upper_link; /* (i*) we in the all uppers */ - TAILQ_HEAD(, mount) mnt_uppers; /* (i) upper mounts over us */ + TAILQ_HEAD(, mount_upper_node) mnt_uppers; /* (i) upper mounts over us */ + TAILQ_HEAD(, mount_upper_node) mnt_notify; /* (i) upper mounts for notification */ + STAILQ_ENTRY(mount) mnt_taskqueue_link; /* (d) our place in deferred unmount list */ + uint64_t mnt_taskqueue_flags; /* (d) unmount flags passed from taskqueue */ }; #endif /* _WANT_MOUNT || _KERNEL */ @@ -438,9 +453,13 @@ struct mntoptnames { #define MNT_BYFSID 0x0000000008000000ULL /* specify filesystem by ID. */ #define MNT_NOCOVER 0x0000001000000000ULL /* Do not cover a mount point */ #define MNT_EMPTYDIR 0x0000002000000000ULL /* Only mount on empty dir */ -#define MNT_CMDFLAGS (MNT_UPDATE | MNT_DELEXPORT | MNT_RELOAD | \ +#define MNT_RECURSE 0x0000100000000000ULL /* recursively unmount uppers */ +#define MNT_DEFERRED 0x0000200000000000ULL /* unmount in async context */ +#define MNT_CMDFLAGS (MNT_UPDATE | MNT_DELEXPORT | MNT_RELOAD | \ MNT_FORCE | MNT_SNAPSHOT | MNT_NONBUSY | \ - MNT_BYFSID | MNT_NOCOVER | MNT_EMPTYDIR) + MNT_BYFSID | MNT_NOCOVER | MNT_EMPTYDIR | \ + MNT_RECURSE | MNT_DEFERRED) + /* * Internal filesystem control flags stored in mnt_kern_flag. * @@ -466,8 +485,8 @@ struct mntoptnames { #define MNTK_NO_IOPF 0x00000100 /* Disallow page faults during reads and writes. Filesystem shall properly handle i/o state on EFAULT. 
*/ -#define MNTK_VGONE_UPPER 0x00000200 -#define MNTK_VGONE_WAITER 0x00000400 +#define MNTK_RECURSE 0x00000200 /* pending recursive unmount */ +#define MNTK_UPPER_WAITER 0x00000400 /* waiting to drain MNTK_UPPER_PENDING */ #define MNTK_LOOKUP_EXCL_DOTDOT 0x00000800 #define MNTK_MARKER 0x00001000 #define MNTK_UNMAPPED_BUFS 0x00002000 @@ -477,8 +496,9 @@ struct mntoptnames { #define MNTK_UNIONFS 0x00020000 /* A hack for F_ISUNIONSTACK */ #define MNTK_FPLOOKUP 0x00040000 /* fast path lookup is supported */ #define MNTK_SUSPEND_ALL 0x00080000 /* Suspended by all-fs suspension */ -#define MNTK_NOASYNC 0x00800000 /* disable async */ -#define MNTK_UNMOUNT 0x01000000 /* unmount in progress */ +#define MNTK_TASKQUEUE_WAITER 0x00100000 /* Waiting on unmount taskqueue */ +#define MNTK_NOASYNC 0x00800000 /* disable async */ +#define MNTK_UNMOUNT 0x01000000 /* unmount in progress */ #define MNTK_MWAIT 0x02000000 /* waiting for unmount to finish */ #define MNTK_SUSPEND 0x08000000 /* request write suspension */ #define MNTK_SUSPEND2 0x04000000 /* block secondary writes */ @@ -952,7 +972,7 @@ vfs_statfs_t __vfs_statfs; * exported vnode operations */ -int dounmount(struct mount *, int, struct thread *); +int dounmount(struct mount *, uint64_t, struct thread *); int kernel_mount(struct mntarg *ma, uint64_t flags); int kernel_vmount(int flags, ...); @@ -1012,8 +1032,13 @@ struct mount *vfs_mount_alloc(struct vnode *, struct vfsconf *, const char *, int vfs_suser(struct mount *, struct thread *); void vfs_unbusy(struct mount *); void vfs_unmountall(void); -struct mount *vfs_pin_from_vp(struct vnode *); -void vfs_unpin(struct mount *); +struct mount *vfs_register_upper_from_vp(struct vnode *, + struct mount *ump, struct mount_upper_node *); +void vfs_register_for_notification(struct mount *, struct mount *, + struct mount_upper_node *); +void vfs_unregister_for_notification(struct mount *, + struct mount_upper_node *); +void vfs_unregister_upper(struct mount *, struct mount_upper_node *); extern TAILQ_HEAD(mntlist, mount) mountlist; /* mounted filesystem list */ extern struct mtx_padalign mountlist_mtx; extern struct nfs_public nfs_pub; diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c index 6b7407eb88f9..689c85d7bb1f 100644 --- a/sys/ufs/ffs/ffs_vfsops.c +++ b/sys/ufs/ffs/ffs_vfsops.c @@ -297,7 +297,7 @@ ffs_fsfail_unmount(void *v, int pending) */ mp = vfs_getvfs(&etp->fsid); if (mp != NULL) - dounmount(mp, MNT_FORCE, curthread); + dounmount(mp, MNT_FORCE | MNT_RECURSE, curthread); free(etp, M_UFSMNT); }
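
Not part of the diff above: a minimal usage sketch, assuming a hypothetical stacked filesystem "myfs", showing how the renamed KPIs are meant to be consumed, following the nullfs hunks in this patch. The filesystem name, its per-mount structure, and the M_MYFSMNT malloc type are invented for illustration; vfs_register_upper_from_vp(), vfs_register_for_notification(), their unregister counterparts, and the embedded struct mount_upper_node fields are the interfaces introduced here. Locking and reference handling for the lower root vnode are omitted for brevity.

/*
 * Illustrative only: hypothetical stacked filesystem "myfs" registering
 * itself as an upper mount of the filesystem backing lowerrootvp, so that
 * a forced unmount of the lower FS (e.g. on UFS media failure) can
 * recursively tear this mount down via the deferred-unmount taskqueue.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/vnode.h>

static MALLOC_DEFINE(M_MYFSMNT, "myfs_mount", "MYFS mount structure");

struct myfs_mount {
	struct mount		*mym_lowervfs;	 /* lower FS we are stacked on */
	struct mount_upper_node	mym_upper_node;	 /* our entry in lower mnt_uppers */
	struct mount_upper_node	mym_notify_node; /* our entry in lower mnt_notify */
};

static int
myfs_mount_example(struct mount *mp, struct vnode *lowerrootvp)
{
	struct myfs_mount *xmp;

	xmp = malloc(sizeof(*xmp), M_MYFSMNT, M_WAITOK | M_ZERO);

	/*
	 * Pin the lower mount: it cannot complete an unmount until we
	 * unregister, and a recursive forced unmount of the lower FS
	 * will first enqueue this mount on the taskqueue.
	 */
	xmp->mym_lowervfs = vfs_register_upper_from_vp(lowerrootvp, mp,
	    &xmp->mym_upper_node);
	if (xmp->mym_lowervfs == NULL) {
		/* Lower FS is unmounting or already being recursed into. */
		free(xmp, M_MYFSMNT);
		return (ENOENT);
	}

	/* Separately opt in to vnode reclaim/unlink notifications. */
	vfs_register_for_notification(xmp->mym_lowervfs, mp,
	    &xmp->mym_notify_node);

	mp->mnt_data = xmp;
	return (0);
}

static int
myfs_unmount_example(struct mount *mp)
{
	struct myfs_mount *xmp;

	xmp = mp->mnt_data;

	/* Unregistration order mirrors nullfs_unmount() above. */
	vfs_unregister_for_notification(xmp->mym_lowervfs,
	    &xmp->mym_notify_node);
	vfs_unregister_upper(xmp->mym_lowervfs, &xmp->mym_upper_node);
	free(xmp, M_MYFSMNT);
	mp->mnt_data = NULL;
	return (0);
}

The ffs_fsfail_unmount() hunk shows the other half of the interaction: passing MNT_FORCE | MNT_RECURSE to dounmount() causes mounts registered this way to be torn down first on the deferred-unmount taskqueue, and a caller that must not block can additionally pass MNT_DEFERRED so the base unmount itself is also handled from the taskqueue.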