freebsd-dev/sys/kern/vfs_subr.c

6956 lines
167 KiB
C
Raw Normal View History

/*-
* SPDX-License-Identifier: BSD-3-Clause
*
1994-05-24 10:09:53 +00:00
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
1994-05-24 10:09:53 +00:00
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95
1994-05-24 10:09:53 +00:00
*/
/*
* External virtual filesystem routines
*/
2003-06-11 00:56:59 +00:00
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
1996-01-04 21:13:23 +00:00
#include "opt_ddb.h"
#include "opt_watchdog.h"
1994-05-24 10:09:53 +00:00
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/asan.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/capsicum.h>
#include <sys/condvar.h>
#include <sys/conf.h>
#include <sys/counter.h>
#include <sys/dirent.h>
#include <sys/event.h>
#include <sys/eventhandler.h>
#include <sys/extattr.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/jail.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/ktr.h>
#include <sys/lockf.h>
#include <sys/malloc.h>
1994-05-24 10:09:53 +00:00
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/pctrie.h>
#include <sys/priv.h>
#include <sys/reboot.h>
#include <sys/refcount.h>
Switch the vm_object mutex to be a rwlock. This will enable in the future further optimizations where the vm_object lock will be held in read mode most of the time the page cache resident pool of pages are accessed for reading purposes. The change is mostly mechanical but few notes are reported: * The KPI changes as follow: - VM_OBJECT_LOCK() -> VM_OBJECT_WLOCK() - VM_OBJECT_TRYLOCK() -> VM_OBJECT_TRYWLOCK() - VM_OBJECT_UNLOCK() -> VM_OBJECT_WUNLOCK() - VM_OBJECT_LOCK_ASSERT(MA_OWNED) -> VM_OBJECT_ASSERT_WLOCKED() (in order to avoid visibility of implementation details) - The read-mode operations are added: VM_OBJECT_RLOCK(), VM_OBJECT_TRYRLOCK(), VM_OBJECT_RUNLOCK(), VM_OBJECT_ASSERT_RLOCKED(), VM_OBJECT_ASSERT_LOCKED() * The vm/vm_pager.h namespace pollution avoidance (forcing requiring sys/mutex.h in consumers directly to cater its inlining functions using VM_OBJECT_LOCK()) imposes that all the vm/vm_pager.h consumers now must include also sys/rwlock.h. * zfs requires a quite convoluted fix to include FreeBSD rwlocks into the compat layer because the name clash between FreeBSD and solaris versions must be avoided. At this purpose zfs redefines the vm_object locking functions directly, isolating the FreeBSD components in specific compat stubs. The KPI results heavilly broken by this commit. Thirdy part ports must be updated accordingly (I can think off-hand of VirtualBox, for example). Sponsored by: EMC / Isilon storage division Reviewed by: jeff Reviewed by: pjd (ZFS specific review) Discussed with: alc Tested by: pho
2013-03-09 02:32:23 +00:00
#include <sys/rwlock.h>
#include <sys/sched.h>
Switch the sleep/wakeup and condition variable implementations to use the sleep queue interface: - Sleep queues attempt to merge some of the benefits of both sleep queues and condition variables. Having sleep qeueus in a hash table avoids having to allocate a queue head for each wait channel. Thus, struct cv has shrunk down to just a single char * pointer now. However, the hash table does not hold threads directly, but queue heads. This means that once you have located a queue in the hash bucket, you no longer have to walk the rest of the hash chain looking for threads. Instead, you have a list of all the threads sleeping on that wait channel. - Outside of the sleepq code and the sleep/cv code the kernel no longer differentiates between cv's and sleep/wakeup. For example, calls to abortsleep() and cv_abort() are replaced with a call to sleepq_abort(). Thus, the TDF_CVWAITQ flag is removed. Also, calls to unsleep() and cv_waitq_remove() have been replaced with calls to sleepq_remove(). - The sched_sleep() function no longer accepts a priority argument as sleep's no longer inherently bump the priority. Instead, this is soley a propery of msleep() which explicitly calls sched_prio() before blocking. - The TDF_ONSLEEPQ flag has been dropped as it was never used. The associated TDF_SET_ONSLEEPQ and TDF_CLR_ON_SLEEPQ macros have also been dropped and replaced with a single explicit clearing of td_wchan. TD_SET_ONSLEEPQ() would really have only made sense if it had taken the wait channel and message as arguments anyway. Now that that only happens in one place, a macro would be overkill.
2004-02-27 18:52:44 +00:00
#include <sys/sleepqueue.h>
#include <sys/smr.h>
#include <sys/smp.h>
1994-05-24 10:09:53 +00:00
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <sys/watchdog.h>
1994-05-24 10:09:53 +00:00
#include <machine/stdarg.h>
#include <security/mac/mac_framework.h>
1994-05-24 10:09:53 +00:00
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_kern.h>
#include <vm/uma.h>
1994-05-24 10:09:53 +00:00
2006-09-04 22:15:44 +00:00
#ifdef DDB
#include <ddb/ddb.h>
#endif
When we traverse the vnodes on a mountpoint we need to look out for our cached 'next vnode' being removed from this mountpoint. If we find that it was recycled, we restart our traversal from the start of the list. Code to do that is in all local disk filesystems (and a few other places) and looks roughly like this: MNT_ILOCK(mp); loop: for (vp = TAILQ_FIRST(&mp...); (vp = nvp) != NULL; nvp = TAILQ_NEXT(vp,...)) { if (vp->v_mount != mp) goto loop; MNT_IUNLOCK(mp); ... MNT_ILOCK(mp); } MNT_IUNLOCK(mp); The code which takes vnodes off a mountpoint looks like this: MNT_ILOCK(vp->v_mount); ... TAILQ_REMOVE(&vp->v_mount->mnt_nvnodelist, vp, v_nmntvnodes); ... MNT_IUNLOCK(vp->v_mount); ... vp->v_mount = something; (Take a moment and try to spot the locking error before you read on.) On a SMP system, one CPU could have removed nvp from our mountlist but not yet gotten to assign a new value to vp->v_mount while another CPU simultaneously get to the top of the traversal loop where it finds that (vp->v_mount != mp) is not true despite the fact that the vnode has indeed been removed from our mountpoint. Fix: Introduce the macro MNT_VNODE_FOREACH() to traverse the list of vnodes on a mountpoint while taking into account that vnodes may be removed from the list as we go. This saves approx 65 lines of duplicated code. Split the insmntque() which potentially moves a vnode from one mount point to another into delmntque() and insmntque() which does just what the names say. Fix delmntque() to set vp->v_mount to NULL while holding the mountpoint lock.
2004-07-04 08:52:35 +00:00
static void delmntque(struct vnode *vp);
static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
int slpflag, int slptimeo);
static void syncer_shutdown(void *arg, int howto);
static int vtryrecycle(struct vnode *vp);
static void v_init_counters(struct vnode *);
static void vn_seqc_init(struct vnode *);
static void vn_seqc_write_end_free(struct vnode *vp);
- Change holdcnt use around vnode recycling. We now always keep a holdcnt ref while we're calling vgone(). This prevents transient refs from re-adding us to the free list. Previously, a vfree() triggered via vinvalbuf() getting rid of all of a vnode's pages could place a partially destructed vnode on the free list where vtryrecycle() could find it. The first call to vtryrecycle would hang up on the vnode lock, but when it failed it would place a now dead vnode onto the free list, and another call to vtryrecycle() would free an already free vnode. There were many complications of having a zero ref count while freeing which can now go away. - Change vdropl() to release the interlock before returning. All callers now respect this, so vdropl() directly frees VI_DOOMED vnodes once the last ref is dropped. This means that we'll never have VI_DOOMED vnodes on the free list. - Seperate v_incr_usecount() into v_incr_usecount(), v_decr_usecount() and v_decr_useonly(). The incr/decr split is so that incr usecount can return with the interlock still held while decr drops the interlock so it can call vdropl() which will potentially free the vnode. The calling function can't drop the lock of an already free'd node. v_decr_useonly() drops a usecount without droping the hold count. This is done so the usecount reaches zero in vput() before we recycle, however the holdcount is still 1 which prevents any new references from placing the vnode back on the free list. - Fix vnlrureclaim() to vhold the vnode since it doesn't do a vget(). We wouldn't want vnlrureclaim() to bump the usecount since this has different semantics. Also change vnlrureclaim() to do a NOWAIT on the vn_lock. When this function runs we're usually in a desperate situation and we wouldn't want to wait for any specific vnode to be released. - Fix a bunch of misc comments to reflect the new behavior. - Add vhold() and vdrop() to vflush() for the same reasons that we do in vlrureclaim(). Previously we held no reference and a vnode could have been freed while we were waiting on the lock. - Get rid of vlruvp() and vfreehead(). Neither are used. vlruvp() should really be rethought before it's reintroduced. - vgonel() always returns with the vnode locked now and never puts the vnode back on a free list. The vnode will be freed as soon as the last reference is released. Sponsored by: Isilon Systems, Inc. Debugging help from: Kris Kennaway, Peter Holm Approved by: re (blanket vfs)
2005-06-16 04:41:42 +00:00
static void vgonel(struct vnode *);
static bool vhold_recycle_free(struct vnode *);
static void vfs_knllock(void *arg);
static void vfs_knlunlock(void *arg);
static void vfs_knl_assert_lock(void *arg, int what);
static void destroy_vpollinfo(struct vpollinfo *vi);
static int v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo,
daddr_t startlbn, daddr_t endlbn);
static void vnlru_recalc(void);
/*
* Number of vnodes in existence. Increased whenever getnewvnode()
* allocates a new vnode, decreased in vdropl() for VIRF_DOOMED vnode.
*/
static u_long __exclusive_cache_line numvnodes;
SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0,
2010-11-14 16:10:15 +00:00
"Number of vnodes in existence");
static counter_u64_t vnodes_created;
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created,
"Number of vnodes created by getnewvnode");
/*
* Conversion tables for conversion from vnode types to inode formats
* and back.
*/
1994-05-24 10:09:53 +00:00
enum vtype iftovt_tab[16] = {
VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
1994-05-24 10:09:53 +00:00
};
int vttoif_tab[10] = {
1994-05-24 10:09:53 +00:00
0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
1994-05-24 10:09:53 +00:00
};
2000-10-05 18:22:46 +00:00
/*
* List of allocates vnodes in the system.
2000-10-05 18:22:46 +00:00
*/
static TAILQ_HEAD(freelst, vnode) vnode_list;
static struct vnode *vnode_list_free_marker;
static struct vnode *vnode_list_reclaim_marker;
/*
* "Free" vnode target. Free vnodes are rarely completely free, but are
* just ones that are cheap to recycle. Usually they are for files which
* have been stat'd but not read; these usually have inode and namecache
* data attached to them. This target is the preferred minimum size of a
* sub-cache consisting mostly of such files. The system balances the size
* of this sub-cache with its complement to try to prevent either from
* thrashing while the other is relatively inactive. The targets express
* a preference for the best balance.
*
* "Above" this target there are 2 further targets (watermarks) related
* to recyling of free vnodes. In the best-operating case, the cache is
* exactly full, the free list has size between vlowat and vhiwat above the
* free target, and recycling from it and normal use maintains this state.
* Sometimes the free list is below vlowat or even empty, but this state
* is even better for immediate use provided the cache is not full.
* Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free
* ones) to reach one of these states. The watermarks are currently hard-
* coded as 4% and 9% of the available space higher. These and the default
* of 25% for wantfreevnodes are too large if the memory size is large.
* E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim
* whenever vnlru_proc() becomes active.
*/
static long wantfreevnodes;
static long __exclusive_cache_line freevnodes;
SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD,
&freevnodes, 0, "Number of \"free\" vnodes");
static long freevnodes_old;
static counter_u64_t recycles_count;
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count,
"Number of vnodes recycled to meet vnode cache targets");
static counter_u64_t recycles_free_count;
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles_free, CTLFLAG_RD, &recycles_free_count,
"Number of free vnodes recycled to meet vnode cache targets");
static counter_u64_t deferred_inact;
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, deferred_inact, CTLFLAG_RD, &deferred_inact,
"Number of times inactive processing was deferred");
/* To keep more than one thread at a time from running vfs_getnewfsid */
static struct mtx mntid_mtx;
2000-10-05 18:22:46 +00:00
/*
* Lock for any access to the following:
* vnode_list
* numvnodes
* freevnodes
*/
static struct mtx __exclusive_cache_line vnode_list_mtx;
/* Publicly exported FS */
struct nfs_public nfs_pub;
2000-10-05 18:22:46 +00:00
static uma_zone_t buf_trie_zone;
static smr_t buf_trie_smr;
/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
static uma_zone_t vnode_zone;
MALLOC_DEFINE(M_VNODEPOLL, "VN POLL", "vnode poll");
2000-10-05 18:22:46 +00:00
__read_frequently smr_t vfs_smr;
/*
* The workitem queue.
2002-06-06 15:46:38 +00:00
*
* It is useful to delay writes of file data and filesystem metadata
* for tens of seconds so that quickly created and deleted files need
* not waste disk bandwidth being created and removed. To realize this,
* we append vnodes to a "workitem" queue. When running with a soft
* updates implementation, most pending metadata dependencies should
* not wait for more than a few seconds. Thus, mounted on block devices
* are delayed only about a half the time that file data is delayed.
* Similarly, directory updates are more critical, so are only delayed
* about a third the time that file data is delayed. Thus, there are
* SYNCER_MAXDELAY queues that are processed round-robin at a rate of
* one each second (driven off the filesystem syncer process). The
* syncer_delayno variable indicates the next queue that is to be processed.
* Items that need to be processed soon are placed in this queue:
*
* syncer_workitem_pending[syncer_delayno]
*
* A delay of fifteen seconds is done by placing the request fifteen
* entries later in the queue:
*
* syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
*
*/
static int syncer_delayno;
2002-06-06 15:46:38 +00:00
static long syncer_mask;
LIST_HEAD(synclist, bufobj);
static struct synclist *syncer_workitem_pending;
/*
* The sync_mtx protects:
* bo->bo_synclist
* sync_vnode_count
* syncer_delayno
* syncer_state
* syncer_workitem_pending
* syncer_worklist_len
* rushjob
*/
static struct mtx sync_mtx;
static struct cv sync_wakeup;
#define SYNCER_MAXDELAY 32
1998-12-21 23:38:33 +00:00
static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */
static int syncdelay = 30; /* max time to delay syncing data */
static int filedelay = 30; /* time to delay syncing files */
SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0,
2010-11-14 16:10:15 +00:00
"Time to delay syncing files (in seconds)");
static int dirdelay = 29; /* time to delay syncing directories */
SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0,
2010-11-14 16:10:15 +00:00
"Time to delay syncing directories (in seconds)");
static int metadelay = 28; /* time to delay syncing metadata */
SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0,
2010-11-14 16:10:15 +00:00
"Time to delay syncing metadata (in seconds)");
static int rushjob; /* number of slots to run ASAP */
static int stat_rush_requests; /* number of times I/O speeded up */
SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0,
2010-11-14 16:10:15 +00:00
"Number of times I/O speeded up (rush requests)");
#define VDBATCH_SIZE 8
struct vdbatch {
u_int index;
long freevnodes;
struct mtx lock;
struct vnode *tab[VDBATCH_SIZE];
};
DPCPU_DEFINE_STATIC(struct vdbatch, vd);
static void vdbatch_dequeue(struct vnode *vp);
/*
* When shutting down the syncer, run it at four times normal speed.
*/
#define SYNCER_SHUTDOWN_SPEEDUP 4
static int sync_vnode_count;
static int syncer_worklist_len;
static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
syncer_state;
/* Target for maximum number of vnodes. */
u_long desiredvnodes;
static u_long gapvnodes; /* gap between wanted and desired */
static u_long vhiwat; /* enough extras after expansion */
static u_long vlowat; /* minimal extras before expansion */
static u_long vstir; /* nonzero to stir non-free vnodes */
static volatile int vsmalltrigger = 8; /* pref to keep if > this many pages */
static u_long vnlru_read_freevnodes(void);
/*
* Note that no attempt is made to sanitize these parameters.
*/
static int
sysctl_maxvnodes(SYSCTL_HANDLER_ARGS)
{
u_long val;
int error;
val = desiredvnodes;
error = sysctl_handle_long(oidp, &val, 0, req);
if (error != 0 || req->newptr == NULL)
return (error);
if (val == desiredvnodes)
return (0);
mtx_lock(&vnode_list_mtx);
desiredvnodes = val;
wantfreevnodes = desiredvnodes / 4;
vnlru_recalc();
mtx_unlock(&vnode_list_mtx);
/*
* XXX There is no protection against multiple threads changing
* desiredvnodes at the same time. Locking above only helps vnlru and
* getnewvnode.
*/
vfs_hash_changesize(desiredvnodes);
cache_changesize(desiredvnodes);
return (0);
}
SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes,
CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_maxvnodes,
"LU", "Target for maximum number of vnodes");
static int
sysctl_wantfreevnodes(SYSCTL_HANDLER_ARGS)
{
u_long val;
int error;
val = wantfreevnodes;
error = sysctl_handle_long(oidp, &val, 0, req);
if (error != 0 || req->newptr == NULL)
return (error);
if (val == wantfreevnodes)
return (0);
mtx_lock(&vnode_list_mtx);
wantfreevnodes = val;
vnlru_recalc();
mtx_unlock(&vnode_list_mtx);
return (0);
}
SYSCTL_PROC(_vfs, OID_AUTO, wantfreevnodes,
CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_wantfreevnodes,
"LU", "Target for minimum number of \"free\" vnodes");
SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
&wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)");
static int vnlru_nowhere;
SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
&vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
1994-05-24 10:09:53 +00:00
static int
sysctl_try_reclaim_vnode(SYSCTL_HANDLER_ARGS)
{
struct vnode *vp;
struct nameidata nd;
char *buf;
unsigned long ndflags;
int error;
if (req->newptr == NULL)
return (EINVAL);
if (req->newlen >= PATH_MAX)
return (E2BIG);
buf = malloc(PATH_MAX, M_TEMP, M_WAITOK);
error = SYSCTL_IN(req, buf, req->newlen);
if (error != 0)
goto out;
buf[req->newlen] = '\0';
ndflags = LOCKLEAF | NOFOLLOW | AUDITVNODE1 | SAVENAME;
NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, buf, curthread);
if ((error = namei(&nd)) != 0)
goto out;
vp = nd.ni_vp;
if (VN_IS_DOOMED(vp)) {
/*
* This vnode is being recycled. Return != 0 to let the caller
* know that the sysctl had no effect. Return EAGAIN because a
* subsequent call will likely succeed (since namei will create
* a new vnode if necessary)
*/
error = EAGAIN;
goto putvnode;
}
counter_u64_add(recycles_count, 1);
vgone(vp);
putvnode:
NDFREE(&nd, 0);
out:
free(buf, M_TEMP);
return (error);
}
static int
sysctl_ftry_reclaim_vnode(SYSCTL_HANDLER_ARGS)
{
struct thread *td = curthread;
struct vnode *vp;
struct file *fp;
int error;
int fd;
if (req->newptr == NULL)
return (EBADF);
error = sysctl_handle_int(oidp, &fd, 0, req);
if (error != 0)
return (error);
error = getvnode(curthread, fd, &cap_fcntl_rights, &fp);
if (error != 0)
return (error);
vp = fp->f_vnode;
error = vn_lock(vp, LK_EXCLUSIVE);
if (error != 0)
goto drop;
counter_u64_add(recycles_count, 1);
vgone(vp);
VOP_UNLOCK(vp);
drop:
fdrop(fp, td);
return (error);
}
SYSCTL_PROC(_debug, OID_AUTO, try_reclaim_vnode,
CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0,
sysctl_try_reclaim_vnode, "A", "Try to reclaim a vnode by its pathname");
SYSCTL_PROC(_debug, OID_AUTO, ftry_reclaim_vnode,
CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0,
sysctl_ftry_reclaim_vnode, "I",
"Try to reclaim a vnode by its file descriptor");
/* Shift count for (uintptr_t)vp to initialize vp->v_hash. */
static int vnsz2log;
/*
* Support for the bufobj clean & dirty pctrie.
*/
static void *
buf_trie_alloc(struct pctrie *ptree)
{
return (uma_zalloc_smr(buf_trie_zone, M_NOWAIT));
}
static void
buf_trie_free(struct pctrie *ptree, void *node)
{
uma_zfree_smr(buf_trie_zone, node);
}
PCTRIE_DEFINE_SMR(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free,
buf_trie_smr);
1994-05-24 10:09:53 +00:00
/*
* Initialize the vnode management data structures.
*
* Reevaluate the following cap on the number of vnodes after the physical
* memory size exceeds 512GB. In the limit, as the physical memory size
* grows, the ratio of the memory size in KB to vnodes approaches 64:1.
1994-05-24 10:09:53 +00:00
*/
2004-08-16 08:33:37 +00:00
#ifndef MAXVNODES_MAX
#define MAXVNODES_MAX (512UL * 1024 * 1024 / 64) /* 8M */
2004-08-16 08:33:37 +00:00
#endif
static MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker");
static struct vnode *
vn_alloc_marker(struct mount *mp)
{
struct vnode *vp;
vp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
vp->v_type = VMARKER;
vp->v_mount = mp;
return (vp);
}
static void
vn_free_marker(struct vnode *vp)
{
MPASS(vp->v_type == VMARKER);
free(vp, M_VNODE_MARKER);
}
#ifdef KASAN
static int
vnode_ctor(void *mem, int size, void *arg __unused, int flags __unused)
{
kasan_mark(mem, size, roundup2(size, UMA_ALIGN_PTR + 1), 0);
return (0);
}
static void
vnode_dtor(void *mem, int size, void *arg __unused)
{
size_t end1, end2, off1, off2;
_Static_assert(offsetof(struct vnode, v_vnodelist) <
offsetof(struct vnode, v_dbatchcpu),
"KASAN marks require updating");
off1 = offsetof(struct vnode, v_vnodelist);
off2 = offsetof(struct vnode, v_dbatchcpu);
end1 = off1 + sizeof(((struct vnode *)NULL)->v_vnodelist);
end2 = off2 + sizeof(((struct vnode *)NULL)->v_dbatchcpu);
/*
* Access to the v_vnodelist and v_dbatchcpu fields are permitted even
* after the vnode has been freed. Try to get some KASAN coverage by
* marking everything except those two fields as invalid. Because
* KASAN's tracking is not byte-granular, any preceding fields sharing
* the same 8-byte aligned word must also be marked valid.
*/
/* Handle the area from the start until v_vnodelist... */
off1 = rounddown2(off1, KASAN_SHADOW_SCALE);
kasan_mark(mem, off1, off1, KASAN_UMA_FREED);
/* ... then the area between v_vnodelist and v_dbatchcpu ... */
off1 = roundup2(end1, KASAN_SHADOW_SCALE);
off2 = rounddown2(off2, KASAN_SHADOW_SCALE);
if (off2 > off1)
kasan_mark((void *)((char *)mem + off1), off2 - off1,
off2 - off1, KASAN_UMA_FREED);
/* ... and finally the area from v_dbatchcpu to the end. */
off2 = roundup2(end2, KASAN_SHADOW_SCALE);
kasan_mark((void *)((char *)mem + off2), size - off2, size - off2,
KASAN_UMA_FREED);
}
#endif /* KASAN */
/*
* Initialize a vnode as it first enters the zone.
*/
static int
vnode_init(void *mem, int size, int flags)
{
struct vnode *vp;
vp = mem;
bzero(vp, size);
/*
* Setup locks.
*/
vp->v_vnlock = &vp->v_lock;
mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
/*
* By default, don't allow shared locks unless filesystems opt-in.
*/
lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT,
LK_NOSHARE | LK_IS_VNODE);
/*
* Initialize bufobj.
*/
bufobj_init(&vp->v_bufobj, vp);
/*
* Initialize namecache.
*/
cache_vnode_init(vp);
/*
* Initialize rangelocks.
*/
rangelock_init(&vp->v_rl);
vp->v_dbatchcpu = NOCPU;
/*
* Check vhold_recycle_free for an explanation.
*/
vp->v_holdcnt = VHOLD_NO_SMR;
vp->v_type = VNON;
mtx_lock(&vnode_list_mtx);
TAILQ_INSERT_BEFORE(vnode_list_free_marker, vp, v_vnodelist);
mtx_unlock(&vnode_list_mtx);
return (0);
}
/*
* Free a vnode when it is cleared from the zone.
*/
static void
vnode_fini(void *mem, int size)
{
struct vnode *vp;
struct bufobj *bo;
vp = mem;
vdbatch_dequeue(vp);
mtx_lock(&vnode_list_mtx);
TAILQ_REMOVE(&vnode_list, vp, v_vnodelist);
mtx_unlock(&vnode_list_mtx);
rangelock_destroy(&vp->v_rl);
lockdestroy(vp->v_vnlock);
mtx_destroy(&vp->v_interlock);
bo = &vp->v_bufobj;
rw_destroy(BO_LOCKPTR(bo));
kasan_mark(mem, size, size, 0);
}
/*
* Provide the size of NFS nclnode and NFS fh for calculation of the
* vnode memory consumption. The size is specified directly to
* eliminate dependency on NFS-private header.
*
* Other filesystems may use bigger or smaller (like UFS and ZFS)
* private inode data, but the NFS-based estimation is ample enough.
* Still, we care about differences in the size between 64- and 32-bit
* platforms.
*
* Namecache structure size is heuristically
* sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1.
*/
#ifdef _LP64
#define NFS_NCLNODE_SZ (528 + 64)
#define NC_SZ 148
#else
#define NFS_NCLNODE_SZ (360 + 32)
#define NC_SZ 92
#endif
static void
vntblinit(void *dummy __unused)
1994-05-24 10:09:53 +00:00
{
struct vdbatch *vd;
uma_ctor ctor;
uma_dtor dtor;
int cpu, physvnodes, virtvnodes;
u_int i;
1994-05-24 10:09:53 +00:00
/*
* Desiredvnodes is a function of the physical memory size and the
* kernel's heap size. Generally speaking, it scales with the
* physical memory size. The ratio of desiredvnodes to the physical
* memory size is 1:16 until desiredvnodes exceeds 98,304.
* Thereafter, the
* marginal ratio of desiredvnodes to the physical memory size is
* 1:64. However, desiredvnodes is limited by the kernel's heap
* size. The memory required by desiredvnodes vnodes and vm objects
* must not exceed 1/10th of the kernel's heap size.
*/
physvnodes = maxproc + pgtok(vm_cnt.v_page_count) / 64 +
3 * min(98304 * 16, pgtok(vm_cnt.v_page_count)) / 64;
virtvnodes = vm_kmem_size / (10 * (sizeof(struct vm_object) +
sizeof(struct vnode) + NC_SZ * ncsizefactor + NFS_NCLNODE_SZ));
desiredvnodes = min(physvnodes, virtvnodes);
2004-08-16 08:33:37 +00:00
if (desiredvnodes > MAXVNODES_MAX) {
if (bootverbose)
printf("Reducing kern.maxvnodes %lu -> %lu\n",
2004-08-16 08:33:37 +00:00
desiredvnodes, MAXVNODES_MAX);
desiredvnodes = MAXVNODES_MAX;
}
wantfreevnodes = desiredvnodes / 4;
mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
TAILQ_INIT(&vnode_list);
mtx_init(&vnode_list_mtx, "vnode_list", NULL, MTX_DEF);
/*
* The lock is taken to appease WITNESS.
*/
mtx_lock(&vnode_list_mtx);
vnlru_recalc();
mtx_unlock(&vnode_list_mtx);
vnode_list_free_marker = vn_alloc_marker(NULL);
TAILQ_INSERT_HEAD(&vnode_list, vnode_list_free_marker, v_vnodelist);
vnode_list_reclaim_marker = vn_alloc_marker(NULL);
TAILQ_INSERT_HEAD(&vnode_list, vnode_list_reclaim_marker, v_vnodelist);
#ifdef KASAN
ctor = vnode_ctor;
dtor = vnode_dtor;
#else
ctor = NULL;
dtor = NULL;
#endif
vnode_zone = uma_zcreate("VNODE", sizeof(struct vnode), ctor, dtor,
vnode_init, vnode_fini, UMA_ALIGN_PTR, UMA_ZONE_NOKASAN);
uma_zone_set_smr(vnode_zone, vfs_smr);
/*
* Preallocate enough nodes to support one-per buf so that
* we can not fail an insert. reassignbuf() callers can not
* tolerate the insertion failure.
*/
buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(),
NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR,
UMA_ZONE_NOFREE | UMA_ZONE_SMR);
buf_trie_smr = uma_zone_get_smr(buf_trie_zone);
uma_prealloc(buf_trie_zone, nbuf);
vnodes_created = counter_u64_alloc(M_WAITOK);
recycles_count = counter_u64_alloc(M_WAITOK);
recycles_free_count = counter_u64_alloc(M_WAITOK);
deferred_inact = counter_u64_alloc(M_WAITOK);
/*
* Initialize the filesystem syncer.
2002-06-06 15:46:38 +00:00
*/
syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
&syncer_mask);
syncer_maxdelay = syncer_mask + 1;
mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
cv_init(&sync_wakeup, "syncer");
for (i = 1; i <= sizeof(struct vnode); i <<= 1)
vnsz2log++;
vnsz2log--;
CPU_FOREACH(cpu) {
vd = DPCPU_ID_PTR((cpu), vd);
bzero(vd, sizeof(*vd));
mtx_init(&vd->lock, "vdbatch", NULL, MTX_DEF);
}
1994-05-24 10:09:53 +00:00
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);
1994-05-24 10:09:53 +00:00
/*
* Mark a mount point as busy. Used to synchronize access and to delay
2008-11-02 10:15:42 +00:00
* unmounting. Eventually, mountlist_mtx is not released on failure.
*
* vfs_busy() is a custom lock, it can block the caller.
* vfs_busy() only sleeps if the unmount is active on the mount point.
* For a mountpoint mp, vfs_busy-enforced lock is before lock of any
* vnode belonging to mp.
*
* Lookup uses vfs_busy() to traverse mount points.
* root fs var fs
* / vnode lock A / vnode lock (/var) D
* /var vnode lock B /log vnode lock(/var/log) E
* vfs_busy lock C vfs_busy lock F
*
* Within each file system, the lock order is C->A->B and F->D->E.
*
* When traversing across mounts, the system follows that lock order:
*
* C->A->B
* |
* +->F->D->E
*
* The lookup() process for namei("/var") illustrates the process:
* VOP_LOOKUP() obtains B while A is held
* vfs_busy() obtains a shared lock on F while A and B are held
* vput() releases lock on B
* vput() releases lock on A
* VFS_ROOT() obtains lock on D while shared lock on F is held
* vfs_unbusy() releases shared lock on F
* vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A.
* Attempt to lock A (instead of vp_crossmp) while D is held would
* violate the global order, causing deadlocks.
*
* dounmount() locks B while F is drained.
1994-05-24 10:09:53 +00:00
*/
int
2008-11-02 10:15:42 +00:00
vfs_busy(struct mount *mp, int flags)
1994-05-24 10:09:53 +00:00
{
struct mount_pcpu *mpcpu;
2008-11-02 10:15:42 +00:00
MPASS((flags & ~MBF_MASK) == 0);
CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags);
1994-05-24 10:09:53 +00:00
if (vfs_op_thread_enter(mp, mpcpu)) {
MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
MPASS((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0);
MPASS((mp->mnt_kern_flag & MNTK_REFEXPIRE) == 0);
vfs_mp_count_add_pcpu(mpcpu, ref, 1);
vfs_mp_count_add_pcpu(mpcpu, lockref, 1);
vfs_op_thread_exit(mp, mpcpu);
if (flags & MBF_MNTLSTLOCK)
mtx_unlock(&mountlist_mtx);
return (0);
}
MNT_ILOCK(mp);
vfs_assert_mount_counters(mp);
MNT_REF(mp);
/*
* If mount point is currently being unmounted, sleep until the
* mount point fate is decided. If thread doing the unmounting fails,
* it will clear MNTK_UNMOUNT flag before waking us up, indicating
* that this mount point has survived the unmount attempt and vfs_busy
* should retry. Otherwise the unmounter thread will set MNTK_REFEXPIRE
* flag in addition to MNTK_UNMOUNT, indicating that mount point is
* about to be really destroyed. vfs_busy needs to release its
* reference on the mount point in this case and return with ENOENT,
* telling the caller that mount mount it tried to busy is no longer
* valid.
*/
while (mp->mnt_kern_flag & MNTK_UNMOUNT) {
KASSERT(TAILQ_EMPTY(&mp->mnt_uppers),
("%s: non-empty upper mount list with pending unmount",
__func__));
if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) {
MNT_REL(mp);
MNT_IUNLOCK(mp);
CTR1(KTR_VFS, "%s: failed busying before sleeping",
__func__);
return (ENOENT);
}
2008-11-02 10:15:42 +00:00
if (flags & MBF_MNTLSTLOCK)
mtx_unlock(&mountlist_mtx);
mp->mnt_kern_flag |= MNTK_MWAIT;
msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0);
2008-11-02 10:15:42 +00:00
if (flags & MBF_MNTLSTLOCK)
mtx_lock(&mountlist_mtx);
MNT_ILOCK(mp);
1994-05-24 10:09:53 +00:00
}
2008-11-02 10:15:42 +00:00
if (flags & MBF_MNTLSTLOCK)
mtx_unlock(&mountlist_mtx);
mp->mnt_lockref++;
2008-11-02 10:15:42 +00:00
MNT_IUNLOCK(mp);
1994-05-24 10:09:53 +00:00
return (0);
}
/*
* Free a busy filesystem.
1994-05-24 10:09:53 +00:00
*/
void
vfs_unbusy(struct mount *mp)
1994-05-24 10:09:53 +00:00
{
struct mount_pcpu *mpcpu;
int c;
1994-05-24 10:09:53 +00:00
CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
if (vfs_op_thread_enter(mp, mpcpu)) {
MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
vfs_mp_count_sub_pcpu(mpcpu, lockref, 1);
vfs_mp_count_sub_pcpu(mpcpu, ref, 1);
vfs_op_thread_exit(mp, mpcpu);
return;
}
2008-11-02 10:15:42 +00:00
MNT_ILOCK(mp);
vfs_assert_mount_counters(mp);
2008-11-02 10:15:42 +00:00
MNT_REL(mp);
c = --mp->mnt_lockref;
if (mp->mnt_vfs_ops == 0) {
MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0);
MNT_IUNLOCK(mp);
return;
}
if (c < 0)
vfs_dump_mount_counters(mp);
if (c == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) {
2008-11-02 10:15:42 +00:00
MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT);
CTR1(KTR_VFS, "%s: waking up waiters", __func__);
2008-11-02 10:15:42 +00:00
mp->mnt_kern_flag &= ~MNTK_DRAINING;
wakeup(&mp->mnt_lockref);
}
MNT_IUNLOCK(mp);
1994-05-24 10:09:53 +00:00
}
/*
* Lookup a mount point by filesystem identifier.
*/
struct mount *
vfs_getvfs(fsid_t *fsid)
1994-05-24 10:09:53 +00:00
{
2005-01-28 12:39:10 +00:00
struct mount *mp;
1994-05-24 10:09:53 +00:00
CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
Change and clean the mutex lock interface. mtx_enter(lock, type) becomes: mtx_lock(lock) for sleep locks (MTX_DEF-initialized locks) mtx_lock_spin(lock) for spin locks (MTX_SPIN-initialized) similarily, for releasing a lock, we now have: mtx_unlock(lock) for MTX_DEF and mtx_unlock_spin(lock) for MTX_SPIN. We change the caller interface for the two different types of locks because the semantics are entirely different for each case, and this makes it explicitly clear and, at the same time, it rids us of the extra `type' argument. The enter->lock and exit->unlock change has been made with the idea that we're "locking data" and not "entering locked code" in mind. Further, remove all additional "flags" previously passed to the lock acquire/release routines with the exception of two: MTX_QUIET and MTX_NOSWITCH The functionality of these flags is preserved and they can be passed to the lock/unlock routines by calling the corresponding wrappers: mtx_{lock, unlock}_flags(lock, flag(s)) and mtx_{lock, unlock}_spin_flags(lock, flag(s)) for MTX_DEF and MTX_SPIN locks, respectively. Re-inline some lock acq/rel code; in the sleep lock case, we only inline the _obtain_lock()s in order to ensure that the inlined code fits into a cache line. In the spin lock case, we inline recursion and actually only perform a function call if we need to spin. This change has been made with the idea that we generally tend to avoid spin locks and that also the spin locks that we do have and are heavily used (i.e. sched_lock) do recurse, and therefore in an effort to reduce function call overhead for some architectures (such as alpha), we inline recursion for this case. Create a new malloc type for the witness code and retire from using the M_DEV type. The new type is called M_WITNESS and is only declared if WITNESS is enabled. Begin cleaning up some machdep/mutex.h code - specifically updated the "optimized" inlined code in alpha/mutex.h and wrote MTX_LOCK_SPIN and MTX_UNLOCK_SPIN asm macros for the i386/mutex.h as we presently need those. Finally, caught up to the interface changes in all sys code. Contributors: jake, jhb, jasone (in no particular order)
2001-02-09 06:11:45 +00:00
mtx_lock(&mountlist_mtx);
TAILQ_FOREACH(mp, &mountlist, mnt_list) {
if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) {
vfs_ref(mp);
Change and clean the mutex lock interface. mtx_enter(lock, type) becomes: mtx_lock(lock) for sleep locks (MTX_DEF-initialized locks) mtx_lock_spin(lock) for spin locks (MTX_SPIN-initialized) similarily, for releasing a lock, we now have: mtx_unlock(lock) for MTX_DEF and mtx_unlock_spin(lock) for MTX_SPIN. We change the caller interface for the two different types of locks because the semantics are entirely different for each case, and this makes it explicitly clear and, at the same time, it rids us of the extra `type' argument. The enter->lock and exit->unlock change has been made with the idea that we're "locking data" and not "entering locked code" in mind. Further, remove all additional "flags" previously passed to the lock acquire/release routines with the exception of two: MTX_QUIET and MTX_NOSWITCH The functionality of these flags is preserved and they can be passed to the lock/unlock routines by calling the corresponding wrappers: mtx_{lock, unlock}_flags(lock, flag(s)) and mtx_{lock, unlock}_spin_flags(lock, flag(s)) for MTX_DEF and MTX_SPIN locks, respectively. Re-inline some lock acq/rel code; in the sleep lock case, we only inline the _obtain_lock()s in order to ensure that the inlined code fits into a cache line. In the spin lock case, we inline recursion and actually only perform a function call if we need to spin. This change has been made with the idea that we generally tend to avoid spin locks and that also the spin locks that we do have and are heavily used (i.e. sched_lock) do recurse, and therefore in an effort to reduce function call overhead for some architectures (such as alpha), we inline recursion for this case. Create a new malloc type for the witness code and retire from using the M_DEV type. The new type is called M_WITNESS and is only declared if WITNESS is enabled. Begin cleaning up some machdep/mutex.h code - specifically updated the "optimized" inlined code in alpha/mutex.h and wrote MTX_LOCK_SPIN and MTX_UNLOCK_SPIN asm macros for the i386/mutex.h as we presently need those. Finally, caught up to the interface changes in all sys code. Contributors: jake, jhb, jasone (in no particular order)
2001-02-09 06:11:45 +00:00
mtx_unlock(&mountlist_mtx);
1994-05-24 10:09:53 +00:00
return (mp);
}
1994-05-24 10:09:53 +00:00
}
Change and clean the mutex lock interface. mtx_enter(lock, type) becomes: mtx_lock(lock) for sleep locks (MTX_DEF-initialized locks) mtx_lock_spin(lock) for spin locks (MTX_SPIN-initialized) similarily, for releasing a lock, we now have: mtx_unlock(lock) for MTX_DEF and mtx_unlock_spin(lock) for MTX_SPIN. We change the caller interface for the two different types of locks because the semantics are entirely different for each case, and this makes it explicitly clear and, at the same time, it rids us of the extra `type' argument. The enter->lock and exit->unlock change has been made with the idea that we're "locking data" and not "entering locked code" in mind. Further, remove all additional "flags" previously passed to the lock acquire/release routines with the exception of two: MTX_QUIET and MTX_NOSWITCH The functionality of these flags is preserved and they can be passed to the lock/unlock routines by calling the corresponding wrappers: mtx_{lock, unlock}_flags(lock, flag(s)) and mtx_{lock, unlock}_spin_flags(lock, flag(s)) for MTX_DEF and MTX_SPIN locks, respectively. Re-inline some lock acq/rel code; in the sleep lock case, we only inline the _obtain_lock()s in order to ensure that the inlined code fits into a cache line. In the spin lock case, we inline recursion and actually only perform a function call if we need to spin. This change has been made with the idea that we generally tend to avoid spin locks and that also the spin locks that we do have and are heavily used (i.e. sched_lock) do recurse, and therefore in an effort to reduce function call overhead for some architectures (such as alpha), we inline recursion for this case. Create a new malloc type for the witness code and retire from using the M_DEV type. The new type is called M_WITNESS and is only declared if WITNESS is enabled. Begin cleaning up some machdep/mutex.h code - specifically updated the "optimized" inlined code in alpha/mutex.h and wrote MTX_LOCK_SPIN and MTX_UNLOCK_SPIN asm macros for the i386/mutex.h as we presently need those. Finally, caught up to the interface changes in all sys code. Contributors: jake, jhb, jasone (in no particular order)
2001-02-09 06:11:45 +00:00
mtx_unlock(&mountlist_mtx);
CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
These changes embody the support of the fully coherent merged VM buffer cache, much higher filesystem I/O performance, and much better paging performance. It represents the culmination of over 6 months of R&D. The majority of the merged VM/cache work is by John Dyson. The following highlights the most significant changes. Additionally, there are (mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to support the new VM/buffer scheme. vfs_bio.c: Significant rewrite of most of vfs_bio to support the merged VM buffer cache scheme. The scheme is almost fully compatible with the old filesystem interface. Significant improvement in the number of opportunities for write clustering. vfs_cluster.c, vfs_subr.c Upgrade and performance enhancements in vfs layer code to support merged VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff. vm_object.c: Yet more improvements in the collapse code. Elimination of some windows that can cause list corruption. vm_pageout.c: Fixed it, it really works better now. Somehow in 2.0, some "enhancements" broke the code. This code has been reworked from the ground-up. vm_fault.c, vm_page.c, pmap.c, vm_object.c Support for small-block filesystems with merged VM/buffer cache scheme. pmap.c vm_map.c Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of kernel PTs. vm_glue.c Much simpler and more effective swapping code. No more gratuitous swapping. proc.h Fixed the problem that the p_lock flag was not being cleared on a fork. swap_pager.c, vnode_pager.c Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the code doesn't need it anymore. machdep.c Changes to better support the parameter values for the merged VM/buffer cache scheme. machdep.c, kern_exec.c, vm_glue.c Implemented a seperate submap for temporary exec string space and another one to contain process upages. This eliminates all map fragmentation problems that previously existed. ffs_inode.c, ufs_inode.c, ufs_readwrite.c Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on busy buffers. Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
return ((struct mount *) 0);
1994-05-24 10:09:53 +00:00
}
/*
* Lookup a mount point by filesystem identifier, busying it before
* returning.
*
* To avoid congestion on mountlist_mtx, implement simple direct-mapped
* cache for popular filesystem identifiers. The cache is lockess, using
* the fact that struct mount's are never freed. In worst case we may
* get pointer to unmounted or even different filesystem, so we have to
* check what we got, and go slow way if so.
*/
struct mount *
vfs_busyfs(fsid_t *fsid)
{
#define FSID_CACHE_SIZE 256
typedef struct mount * volatile vmp_t;
static vmp_t cache[FSID_CACHE_SIZE];
struct mount *mp;
int error;
uint32_t hash;
CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
hash = fsid->val[0] ^ fsid->val[1];
hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1);
mp = cache[hash];
if (mp == NULL || fsidcmp(&mp->mnt_stat.f_fsid, fsid) != 0)
goto slow;
if (vfs_busy(mp, 0) != 0) {
cache[hash] = NULL;
goto slow;
}
if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0)
return (mp);
else
vfs_unbusy(mp);
slow:
mtx_lock(&mountlist_mtx);
TAILQ_FOREACH(mp, &mountlist, mnt_list) {
if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) {
error = vfs_busy(mp, MBF_MNTLSTLOCK);
if (error) {
cache[hash] = NULL;
mtx_unlock(&mountlist_mtx);
return (NULL);
}
cache[hash] = mp;
return (mp);
}
}
CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
mtx_unlock(&mountlist_mtx);
return ((struct mount *) 0);
}
/*
2007-04-10 15:22:40 +00:00
* Check if a user can access privileged mount options.
*/
int
vfs_suser(struct mount *mp, struct thread *td)
{
int error;
if (jailed(td->td_ucred)) {
/*
* If the jail of the calling thread lacks permission for
* this type of file system, deny immediately.
*/
if (!prison_allow(td->td_ucred, mp->mnt_vfc->vfc_prison_flag))
return (EPERM);
/*
* If the file system was mounted outside the jail of the
* calling thread, deny immediately.
*/
if (prison_check(td->td_ucred, mp->mnt_cred) != 0)
return (EPERM);
}
Update ZFS from version 6 to 13 and bring some FreeBSD-specific changes. This bring huge amount of changes, I'll enumerate only user-visible changes: - Delegated Administration Allows regular users to perform ZFS operations, like file system creation, snapshot creation, etc. - L2ARC Level 2 cache for ZFS - allows to use additional disks for cache. Huge performance improvements mostly for random read of mostly static content. - slog Allow to use additional disks for ZFS Intent Log to speed up operations like fsync(2). - vfs.zfs.super_owner Allows regular users to perform privileged operations on files stored on ZFS file systems owned by him. Very careful with this one. - chflags(2) Not all the flags are supported. This still needs work. - ZFSBoot Support to boot off of ZFS pool. Not finished, AFAIK. Submitted by: dfr - Snapshot properties - New failure modes Before if write requested failed, system paniced. Now one can select from one of three failure modes: - panic - panic on write error - wait - wait for disk to reappear - continue - serve read requests if possible, block write requests - Refquota, refreservation properties Just quota and reservation properties, but don't count space consumed by children file systems, clones and snapshots. - Sparse volumes ZVOLs that don't reserve space in the pool. - External attributes Compatible with extattr(2). - NFSv4-ACLs Not sure about the status, might not be complete yet. Submitted by: trasz - Creation-time properties - Regression tests for zpool(8) command. Obtained from: OpenSolaris
2008-11-17 20:49:29 +00:00
/*
* If file system supports delegated administration, we don't check
* for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified
* by the file system itself.
* If this is not the user that did original mount, we check for
* the PRIV_VFS_MOUNT_OWNER privilege.
*/
if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) &&
mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0)
return (error);
}
return (0);
}
1994-05-24 10:09:53 +00:00
/*
* Get a new unique fsid. Try to make its val[0] unique, since this value
* will be used to create fake device numbers for stat(). Also try (but
* not so hard) make its val[0] unique mod 2^16, since some emulators only
* support 16-bit device numbers. We end up with unique val[0]'s for the
* first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
*
* Keep in mind that several mounts may be running in parallel. Starting
* the search one past where the previous search terminated is both a
* micro-optimization and a defense against returning the same fsid to
* different mounts.
1994-05-24 10:09:53 +00:00
*/
void
vfs_getnewfsid(struct mount *mp)
1994-05-24 10:09:53 +00:00
{
static uint16_t mntid_base;
struct mount *nmp;
1994-05-24 10:09:53 +00:00
fsid_t tfsid;
int mtype;
CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
Change and clean the mutex lock interface. mtx_enter(lock, type) becomes: mtx_lock(lock) for sleep locks (MTX_DEF-initialized locks) mtx_lock_spin(lock) for spin locks (MTX_SPIN-initialized) similarily, for releasing a lock, we now have: mtx_unlock(lock) for MTX_DEF and mtx_unlock_spin(lock) for MTX_SPIN. We change the caller interface for the two different types of locks because the semantics are entirely different for each case, and this makes it explicitly clear and, at the same time, it rids us of the extra `type' argument. The enter->lock and exit->unlock change has been made with the idea that we're "locking data" and not "entering locked code" in mind. Further, remove all additional "flags" previously passed to the lock acquire/release routines with the exception of two: MTX_QUIET and MTX_NOSWITCH The functionality of these flags is preserved and they can be passed to the lock/unlock routines by calling the corresponding wrappers: mtx_{lock, unlock}_flags(lock, flag(s)) and mtx_{lock, unlock}_spin_flags(lock, flag(s)) for MTX_DEF and MTX_SPIN locks, respectively. Re-inline some lock acq/rel code; in the sleep lock case, we only inline the _obtain_lock()s in order to ensure that the inlined code fits into a cache line. In the spin lock case, we inline recursion and actually only perform a function call if we need to spin. This change has been made with the idea that we generally tend to avoid spin locks and that also the spin locks that we do have and are heavily used (i.e. sched_lock) do recurse, and therefore in an effort to reduce function call overhead for some architectures (such as alpha), we inline recursion for this case. Create a new malloc type for the witness code and retire from using the M_DEV type. The new type is called M_WITNESS and is only declared if WITNESS is enabled. Begin cleaning up some machdep/mutex.h code - specifically updated the "optimized" inlined code in alpha/mutex.h and wrote MTX_LOCK_SPIN and MTX_UNLOCK_SPIN asm macros for the i386/mutex.h as we presently need those. Finally, caught up to the interface changes in all sys code. Contributors: jake, jhb, jasone (in no particular order)
2001-02-09 06:11:45 +00:00
mtx_lock(&mntid_mtx);
mtype = mp->mnt_vfc->vfc_typenum;
tfsid.val[1] = mtype;
mtype = (mtype & 0xFF) << 24;
for (;;) {
tfsid.val[0] = makedev(255,
mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
mntid_base++;
if ((nmp = vfs_getvfs(&tfsid)) == NULL)
break;
vfs_rel(nmp);
1994-05-24 10:09:53 +00:00
}
mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
Change and clean the mutex lock interface. mtx_enter(lock, type) becomes: mtx_lock(lock) for sleep locks (MTX_DEF-initialized locks) mtx_lock_spin(lock) for spin locks (MTX_SPIN-initialized) similarily, for releasing a lock, we now have: mtx_unlock(lock) for MTX_DEF and mtx_unlock_spin(lock) for MTX_SPIN. We change the caller interface for the two different types of locks because the semantics are entirely different for each case, and this makes it explicitly clear and, at the same time, it rids us of the extra `type' argument. The enter->lock and exit->unlock change has been made with the idea that we're "locking data" and not "entering locked code" in mind. Further, remove all additional "flags" previously passed to the lock acquire/release routines with the exception of two: MTX_QUIET and MTX_NOSWITCH The functionality of these flags is preserved and they can be passed to the lock/unlock routines by calling the corresponding wrappers: mtx_{lock, unlock}_flags(lock, flag(s)) and mtx_{lock, unlock}_spin_flags(lock, flag(s)) for MTX_DEF and MTX_SPIN locks, respectively. Re-inline some lock acq/rel code; in the sleep lock case, we only inline the _obtain_lock()s in order to ensure that the inlined code fits into a cache line. In the spin lock case, we inline recursion and actually only perform a function call if we need to spin. This change has been made with the idea that we generally tend to avoid spin locks and that also the spin locks that we do have and are heavily used (i.e. sched_lock) do recurse, and therefore in an effort to reduce function call overhead for some architectures (such as alpha), we inline recursion for this case. Create a new malloc type for the witness code and retire from using the M_DEV type. The new type is called M_WITNESS and is only declared if WITNESS is enabled. Begin cleaning up some machdep/mutex.h code - specifically updated the "optimized" inlined code in alpha/mutex.h and wrote MTX_LOCK_SPIN and MTX_UNLOCK_SPIN asm macros for the i386/mutex.h as we presently need those. Finally, caught up to the interface changes in all sys code. Contributors: jake, jhb, jasone (in no particular order)
2001-02-09 06:11:45 +00:00
mtx_unlock(&mntid_mtx);
1994-05-24 10:09:53 +00:00
}
/*
* Knob to control the precision of file timestamps:
*
* 0 = seconds only; nanoseconds zeroed.
* 1 = seconds and nanoseconds, accurate within 1/HZ.
* 2 = seconds and nanoseconds, truncated to microseconds.
* >=3 = seconds and nanoseconds, maximum precision.
*/
enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
static int timestamp_precision = TSP_USEC;
SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
&timestamp_precision, 0, "File timestamp precision (0: seconds, "
"1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, "
2010-11-14 16:10:15 +00:00
"3+: sec + ns (max. precision))");
/*
* Get a current timestamp.
*/
void
vfs_timestamp(struct timespec *tsp)
{
struct timeval tv;
switch (timestamp_precision) {
case TSP_SEC:
tsp->tv_sec = time_second;
tsp->tv_nsec = 0;
break;
case TSP_HZ:
getnanotime(tsp);
break;
case TSP_USEC:
microtime(&tv);
TIMEVAL_TO_TIMESPEC(&tv, tsp);
break;
case TSP_NSEC:
default:
nanotime(tsp);
break;
}
}
1994-05-24 10:09:53 +00:00
/*
* Set vnode attributes to VNOVAL
*/
void
vattr_null(struct vattr *vap)
1994-05-24 10:09:53 +00:00
{
vap->va_type = VNON;
vap->va_size = VNOVAL;
vap->va_bytes = VNOVAL;
vap->va_mode = VNOVAL;
vap->va_nlink = VNOVAL;
vap->va_uid = VNOVAL;
vap->va_gid = VNOVAL;
vap->va_fsid = VNOVAL;
vap->va_fileid = VNOVAL;
vap->va_blocksize = VNOVAL;
vap->va_rdev = VNOVAL;
vap->va_atime.tv_sec = VNOVAL;
vap->va_atime.tv_nsec = VNOVAL;
vap->va_mtime.tv_sec = VNOVAL;
vap->va_mtime.tv_nsec = VNOVAL;
vap->va_ctime.tv_sec = VNOVAL;
vap->va_ctime.tv_nsec = VNOVAL;
vap->va_birthtime.tv_sec = VNOVAL;
vap->va_birthtime.tv_nsec = VNOVAL;
vap->va_flags = VNOVAL;
vap->va_gen = VNOVAL;
1994-05-24 10:09:53 +00:00
vap->va_vaflags = 0;
}
/*
* Try to reduce the total number of vnodes.
*
* This routine (and its user) are buggy in at least the following ways:
* - all parameters were picked years ago when RAM sizes were significantly
* smaller
* - it can pick vnodes based on pages used by the vm object, but filesystems
* like ZFS don't use it making the pick broken
* - since ZFS has its own aging policy it gets partially combated by this one
* - a dedicated method should be provided for filesystems to let them decide
* whether the vnode should be recycled
*
* This routine is called when we have too many vnodes. It attempts
* to free <count> vnodes and will potentially free vnodes that still
* have VM backing store (VM backing store is typically the cause
* of a vnode blowout so we want to do this). Therefore, this operation
* is not considered cheap.
*
* A number of conditions may prevent a vnode from being reclaimed.
* the buffer cache may have references on the vnode, a directory
* vnode may still have references due to the namei cache representing
* underlying files, or the vnode may be in active use. It is not
* desirable to reuse such vnodes. These conditions may cause the
* number of vnodes to reach some minimum value regardless of what
* you set kern.maxvnodes to. Do not set kern.maxvnodes too low.
*
* @param reclaim_nc_src Only reclaim directories with outgoing namecache
* entries if this argument is strue
* @param trigger Only reclaim vnodes with fewer than this many resident
* pages.
* @param target How many vnodes to reclaim.
* @return The number of vnodes that were reclaimed.
*/
static int
vlrureclaim(bool reclaim_nc_src, int trigger, u_long target)
{
struct vnode *vp, *mvp;
struct mount *mp;
struct vm_object *object;
u_long done;
bool retried;
mtx_assert(&vnode_list_mtx, MA_OWNED);
retried = false;
done = 0;
mvp = vnode_list_reclaim_marker;
restart:
vp = mvp;
while (done < target) {
vp = TAILQ_NEXT(vp, v_vnodelist);
if (__predict_false(vp == NULL))
break;
if (__predict_false(vp->v_type == VMARKER))
continue;
- Change holdcnt use around vnode recycling. We now always keep a holdcnt ref while we're calling vgone(). This prevents transient refs from re-adding us to the free list. Previously, a vfree() triggered via vinvalbuf() getting rid of all of a vnode's pages could place a partially destructed vnode on the free list where vtryrecycle() could find it. The first call to vtryrecycle would hang up on the vnode lock, but when it failed it would place a now dead vnode onto the free list, and another call to vtryrecycle() would free an already free vnode. There were many complications of having a zero ref count while freeing which can now go away. - Change vdropl() to release the interlock before returning. All callers now respect this, so vdropl() directly frees VI_DOOMED vnodes once the last ref is dropped. This means that we'll never have VI_DOOMED vnodes on the free list. - Seperate v_incr_usecount() into v_incr_usecount(), v_decr_usecount() and v_decr_useonly(). The incr/decr split is so that incr usecount can return with the interlock still held while decr drops the interlock so it can call vdropl() which will potentially free the vnode. The calling function can't drop the lock of an already free'd node. v_decr_useonly() drops a usecount without droping the hold count. This is done so the usecount reaches zero in vput() before we recycle, however the holdcount is still 1 which prevents any new references from placing the vnode back on the free list. - Fix vnlrureclaim() to vhold the vnode since it doesn't do a vget(). We wouldn't want vnlrureclaim() to bump the usecount since this has different semantics. Also change vnlrureclaim() to do a NOWAIT on the vn_lock. When this function runs we're usually in a desperate situation and we wouldn't want to wait for any specific vnode to be released. - Fix a bunch of misc comments to reflect the new behavior. - Add vhold() and vdrop() to vflush() for the same reasons that we do in vlrureclaim(). Previously we held no reference and a vnode could have been freed while we were waiting on the lock. - Get rid of vlruvp() and vfreehead(). Neither are used. vlruvp() should really be rethought before it's reintroduced. - vgonel() always returns with the vnode locked now and never puts the vnode back on a free list. The vnode will be freed as soon as the last reference is released. Sponsored by: Isilon Systems, Inc. Debugging help from: Kris Kennaway, Peter Holm Approved by: re (blanket vfs)
2005-06-16 04:41:42 +00:00
/*
* If it's been deconstructed already, it's still
* referenced, or it exceeds the trigger, skip it.
* Also skip free vnodes. We are trying to make space
* to expand the free list, not reduce it.
- Change holdcnt use around vnode recycling. We now always keep a holdcnt ref while we're calling vgone(). This prevents transient refs from re-adding us to the free list. Previously, a vfree() triggered via vinvalbuf() getting rid of all of a vnode's pages could place a partially destructed vnode on the free list where vtryrecycle() could find it. The first call to vtryrecycle would hang up on the vnode lock, but when it failed it would place a now dead vnode onto the free list, and another call to vtryrecycle() would free an already free vnode. There were many complications of having a zero ref count while freeing which can now go away. - Change vdropl() to release the interlock before returning. All callers now respect this, so vdropl() directly frees VI_DOOMED vnodes once the last ref is dropped. This means that we'll never have VI_DOOMED vnodes on the free list. - Seperate v_incr_usecount() into v_incr_usecount(), v_decr_usecount() and v_decr_useonly(). The incr/decr split is so that incr usecount can return with the interlock still held while decr drops the interlock so it can call vdropl() which will potentially free the vnode. The calling function can't drop the lock of an already free'd node. v_decr_useonly() drops a usecount without droping the hold count. This is done so the usecount reaches zero in vput() before we recycle, however the holdcount is still 1 which prevents any new references from placing the vnode back on the free list. - Fix vnlrureclaim() to vhold the vnode since it doesn't do a vget(). We wouldn't want vnlrureclaim() to bump the usecount since this has different semantics. Also change vnlrureclaim() to do a NOWAIT on the vn_lock. When this function runs we're usually in a desperate situation and we wouldn't want to wait for any specific vnode to be released. - Fix a bunch of misc comments to reflect the new behavior. - Add vhold() and vdrop() to vflush() for the same reasons that we do in vlrureclaim(). Previously we held no reference and a vnode could have been freed while we were waiting on the lock. - Get rid of vlruvp() and vfreehead(). Neither are used. vlruvp() should really be rethought before it's reintroduced. - vgonel() always returns with the vnode locked now and never puts the vnode back on a free list. The vnode will be freed as soon as the last reference is released. Sponsored by: Isilon Systems, Inc. Debugging help from: Kris Kennaway, Peter Holm Approved by: re (blanket vfs)
2005-06-16 04:41:42 +00:00
*/
if (vp->v_usecount > 0 || vp->v_holdcnt == 0 ||
(!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)))
goto next_iter;
if (vp->v_type == VBAD || vp->v_type == VNON)
goto next_iter;
object = atomic_load_ptr(&vp->v_object);
if (object == NULL || object->resident_page_count > trigger) {
goto next_iter;
}
/*
* Handle races against vnode allocation. Filesystems lock the
* vnode some time after it gets returned from getnewvnode,
* despite type and hold count being manipulated earlier.
* Resorting to checking v_mount restores guarantees present
* before the global list was reworked to contain all vnodes.
*/
if (!VI_TRYLOCK(vp))
goto next_iter;
if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) {
VI_UNLOCK(vp);
goto next_iter;
}
if (vp->v_mount == NULL) {
VI_UNLOCK(vp);
goto next_iter;
}
vholdl(vp);
VI_UNLOCK(vp);
TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist);
mtx_unlock(&vnode_list_mtx);
if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
vdrop(vp);
goto next_iter_unlocked;
}
if (VOP_LOCK(vp, LK_EXCLUSIVE|LK_NOWAIT) != 0) {
- Change holdcnt use around vnode recycling. We now always keep a holdcnt ref while we're calling vgone(). This prevents transient refs from re-adding us to the free list. Previously, a vfree() triggered via vinvalbuf() getting rid of all of a vnode's pages could place a partially destructed vnode on the free list where vtryrecycle() could find it. The first call to vtryrecycle would hang up on the vnode lock, but when it failed it would place a now dead vnode onto the free list, and another call to vtryrecycle() would free an already free vnode. There were many complications of having a zero ref count while freeing which can now go away. - Change vdropl() to release the interlock before returning. All callers now respect this, so vdropl() directly frees VI_DOOMED vnodes once the last ref is dropped. This means that we'll never have VI_DOOMED vnodes on the free list. - Seperate v_incr_usecount() into v_incr_usecount(), v_decr_usecount() and v_decr_useonly(). The incr/decr split is so that incr usecount can return with the interlock still held while decr drops the interlock so it can call vdropl() which will potentially free the vnode. The calling function can't drop the lock of an already free'd node. v_decr_useonly() drops a usecount without droping the hold count. This is done so the usecount reaches zero in vput() before we recycle, however the holdcount is still 1 which prevents any new references from placing the vnode back on the free list. - Fix vnlrureclaim() to vhold the vnode since it doesn't do a vget(). We wouldn't want vnlrureclaim() to bump the usecount since this has different semantics. Also change vnlrureclaim() to do a NOWAIT on the vn_lock. When this function runs we're usually in a desperate situation and we wouldn't want to wait for any specific vnode to be released. - Fix a bunch of misc comments to reflect the new behavior. - Add vhold() and vdrop() to vflush() for the same reasons that we do in vlrureclaim(). Previously we held no reference and a vnode could have been freed while we were waiting on the lock. - Get rid of vlruvp() and vfreehead(). Neither are used. vlruvp() should really be rethought before it's reintroduced. - vgonel() always returns with the vnode locked now and never puts the vnode back on a free list. The vnode will be freed as soon as the last reference is released. Sponsored by: Isilon Systems, Inc. Debugging help from: Kris Kennaway, Peter Holm Approved by: re (blanket vfs)
2005-06-16 04:41:42 +00:00
vdrop(vp);
vn_finished_write(mp);
goto next_iter_unlocked;
- Change holdcnt use around vnode recycling. We now always keep a holdcnt ref while we're calling vgone(). This prevents transient refs from re-adding us to the free list. Previously, a vfree() triggered via vinvalbuf() getting rid of all of a vnode's pages could place a partially destructed vnode on the free list where vtryrecycle() could find it. The first call to vtryrecycle would hang up on the vnode lock, but when it failed it would place a now dead vnode onto the free list, and another call to vtryrecycle() would free an already free vnode. There were many complications of having a zero ref count while freeing which can now go away. - Change vdropl() to release the interlock before returning. All callers now respect this, so vdropl() directly frees VI_DOOMED vnodes once the last ref is dropped. This means that we'll never have VI_DOOMED vnodes on the free list. - Seperate v_incr_usecount() into v_incr_usecount(), v_decr_usecount() and v_decr_useonly(). The incr/decr split is so that incr usecount can return with the interlock still held while decr drops the interlock so it can call vdropl() which will potentially free the vnode. The calling function can't drop the lock of an already free'd node. v_decr_useonly() drops a usecount without droping the hold count. This is done so the usecount reaches zero in vput() before we recycle, however the holdcount is still 1 which prevents any new references from placing the vnode back on the free list. - Fix vnlrureclaim() to vhold the vnode since it doesn't do a vget(). We wouldn't want vnlrureclaim() to bump the usecount since this has different semantics. Also change vnlrureclaim() to do a NOWAIT on the vn_lock. When this function runs we're usually in a desperate situation and we wouldn't want to wait for any specific vnode to be released. - Fix a bunch of misc comments to reflect the new behavior. - Add vhold() and vdrop() to vflush() for the same reasons that we do in vlrureclaim(). Previously we held no reference and a vnode could have been freed while we were waiting on the lock. - Get rid of vlruvp() and vfreehead(). Neither are used. vlruvp() should really be rethought before it's reintroduced. - vgonel() always returns with the vnode locked now and never puts the vnode back on a free list. The vnode will be freed as soon as the last reference is released. Sponsored by: Isilon Systems, Inc. Debugging help from: Kris Kennaway, Peter Holm Approved by: re (blanket vfs)
2005-06-16 04:41:42 +00:00
}
- Change holdcnt use around vnode recycling. We now always keep a holdcnt ref while we're calling vgone(). This prevents transient refs from re-adding us to the free list. Previously, a vfree() triggered via vinvalbuf() getting rid of all of a vnode's pages could place a partially destructed vnode on the free list where vtryrecycle() could find it. The first call to vtryrecycle would hang up on the vnode lock, but when it failed it would place a now dead vnode onto the free list, and another call to vtryrecycle() would free an already free vnode. There were many complications of having a zero ref count while freeing which can now go away. - Change vdropl() to release the interlock before returning. All callers now respect this, so vdropl() directly frees VI_DOOMED vnodes once the last ref is dropped. This means that we'll never have VI_DOOMED vnodes on the free list. - Seperate v_incr_usecount() into v_incr_usecount(), v_decr_usecount() and v_decr_useonly(). The incr/decr split is so that incr usecount can return with the interlock still held while decr drops the interlock so it can call vdropl() which will potentially free the vnode. The calling function can't drop the lock of an already free'd node. v_decr_useonly() drops a usecount without droping the hold count. This is done so the usecount reaches zero in vput() before we recycle, however the holdcount is still 1 which prevents any new references from placing the vnode back on the free list. - Fix vnlrureclaim() to vhold the vnode since it doesn't do a vget(). We wouldn't want vnlrureclaim() to bump the usecount since this has different semantics. Also change vnlrureclaim() to do a NOWAIT on the vn_lock. When this function runs we're usually in a desperate situation and we wouldn't want to wait for any specific vnode to be released. - Fix a bunch of misc comments to reflect the new behavior. - Add vhold() and vdrop() to vflush() for the same reasons that we do in vlrureclaim(). Previously we held no reference and a vnode could have been freed while we were waiting on the lock. - Get rid of vlruvp() and vfreehead(). Neither are used. vlruvp() should really be rethought before it's reintroduced. - vgonel() always returns with the vnode locked now and never puts the vnode back on a free list. The vnode will be freed as soon as the last reference is released. Sponsored by: Isilon Systems, Inc. Debugging help from: Kris Kennaway, Peter Holm Approved by: re (blanket vfs)
2005-06-16 04:41:42 +00:00
VI_LOCK(vp);
if (vp->v_usecount > 0 ||
(!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
(vp->v_object != NULL && vp->v_object->handle == vp &&
vp->v_object->resident_page_count > trigger)) {
VOP_UNLOCK(vp);
vdropl(vp);
vn_finished_write(mp);
goto next_iter_unlocked;
}
counter_u64_add(recycles_count, 1);
- Change holdcnt use around vnode recycling. We now always keep a holdcnt ref while we're calling vgone(). This prevents transient refs from re-adding us to the free list. Previously, a vfree() triggered via vinvalbuf() getting rid of all of a vnode's pages could place a partially destructed vnode on the free list where vtryrecycle() could find it. The first call to vtryrecycle would hang up on the vnode lock, but when it failed it would place a now dead vnode onto the free list, and another call to vtryrecycle() would free an already free vnode. There were many complications of having a zero ref count while freeing which can now go away. - Change vdropl() to release the interlock before returning. All callers now respect this, so vdropl() directly frees VI_DOOMED vnodes once the last ref is dropped. This means that we'll never have VI_DOOMED vnodes on the free list. - Seperate v_incr_usecount() into v_incr_usecount(), v_decr_usecount() and v_decr_useonly(). The incr/decr split is so that incr usecount can return with the interlock still held while decr drops the interlock so it can call vdropl() which will potentially free the vnode. The calling function can't drop the lock of an already free'd node. v_decr_useonly() drops a usecount without droping the hold count. This is done so the usecount reaches zero in vput() before we recycle, however the holdcount is still 1 which prevents any new references from placing the vnode back on the free list. - Fix vnlrureclaim() to vhold the vnode since it doesn't do a vget(). We wouldn't want vnlrureclaim() to bump the usecount since this has different semantics. Also change vnlrureclaim() to do a NOWAIT on the vn_lock. When this function runs we're usually in a desperate situation and we wouldn't want to wait for any specific vnode to be released. - Fix a bunch of misc comments to reflect the new behavior. - Add vhold() and vdrop() to vflush() for the same reasons that we do in vlrureclaim(). Previously we held no reference and a vnode could have been freed while we were waiting on the lock. - Get rid of vlruvp() and vfreehead(). Neither are used. vlruvp() should really be rethought before it's reintroduced. - vgonel() always returns with the vnode locked now and never puts the vnode back on a free list. The vnode will be freed as soon as the last reference is released. Sponsored by: Isilon Systems, Inc. Debugging help from: Kris Kennaway, Peter Holm Approved by: re (blanket vfs)
2005-06-16 04:41:42 +00:00
vgonel(vp);
VOP_UNLOCK(vp);
- Change holdcnt use around vnode recycling. We now always keep a holdcnt ref while we're calling vgone(). This prevents transient refs from re-adding us to the free list. Previously, a vfree() triggered via vinvalbuf() getting rid of all of a vnode's pages could place a partially destructed vnode on the free list where vtryrecycle() could find it. The first call to vtryrecycle would hang up on the vnode lock, but when it failed it would place a now dead vnode onto the free list, and another call to vtryrecycle() would free an already free vnode. There were many complications of having a zero ref count while freeing which can now go away. - Change vdropl() to release the interlock before returning. All callers now respect this, so vdropl() directly frees VI_DOOMED vnodes once the last ref is dropped. This means that we'll never have VI_DOOMED vnodes on the free list. - Seperate v_incr_usecount() into v_incr_usecount(), v_decr_usecount() and v_decr_useonly(). The incr/decr split is so that incr usecount can return with the interlock still held while decr drops the interlock so it can call vdropl() which will potentially free the vnode. The calling function can't drop the lock of an already free'd node. v_decr_useonly() drops a usecount without droping the hold count. This is done so the usecount reaches zero in vput() before we recycle, however the holdcount is still 1 which prevents any new references from placing the vnode back on the free list. - Fix vnlrureclaim() to vhold the vnode since it doesn't do a vget(). We wouldn't want vnlrureclaim() to bump the usecount since this has different semantics. Also change vnlrureclaim() to do a NOWAIT on the vn_lock. When this function runs we're usually in a desperate situation and we wouldn't want to wait for any specific vnode to be released. - Fix a bunch of misc comments to reflect the new behavior. - Add vhold() and vdrop() to vflush() for the same reasons that we do in vlrureclaim(). Previously we held no reference and a vnode could have been freed while we were waiting on the lock. - Get rid of vlruvp() and vfreehead(). Neither are used. vlruvp() should really be rethought before it's reintroduced. - vgonel() always returns with the vnode locked now and never puts the vnode back on a free list. The vnode will be freed as soon as the last reference is released. Sponsored by: Isilon Systems, Inc. Debugging help from: Kris Kennaway, Peter Holm Approved by: re (blanket vfs)
2005-06-16 04:41:42 +00:00
vdropl(vp);
vn_finished_write(mp);
- Change holdcnt use around vnode recycling. We now always keep a holdcnt ref while we're calling vgone(). This prevents transient refs from re-adding us to the free list. Previously, a vfree() triggered via vinvalbuf() getting rid of all of a vnode's pages could place a partially destructed vnode on the free list where vtryrecycle() could find it. The first call to vtryrecycle would hang up on the vnode lock, but when it failed it would place a now dead vnode onto the free list, and another call to vtryrecycle() would free an already free vnode. There were many complications of having a zero ref count while freeing which can now go away. - Change vdropl() to release the interlock before returning. All callers now respect this, so vdropl() directly frees VI_DOOMED vnodes once the last ref is dropped. This means that we'll never have VI_DOOMED vnodes on the free list. - Seperate v_incr_usecount() into v_incr_usecount(), v_decr_usecount() and v_decr_useonly(). The incr/decr split is so that incr usecount can return with the interlock still held while decr drops the interlock so it can call vdropl() which will potentially free the vnode. The calling function can't drop the lock of an already free'd node. v_decr_useonly() drops a usecount without droping the hold count. This is done so the usecount reaches zero in vput() before we recycle, however the holdcount is still 1 which prevents any new references from placing the vnode back on the free list. - Fix vnlrureclaim() to vhold the vnode since it doesn't do a vget(). We wouldn't want vnlrureclaim() to bump the usecount since this has different semantics. Also change vnlrureclaim() to do a NOWAIT on the vn_lock. When this function runs we're usually in a desperate situation and we wouldn't want to wait for any specific vnode to be released. - Fix a bunch of misc comments to reflect the new behavior. - Add vhold() and vdrop() to vflush() for the same reasons that we do in vlrureclaim(). Previously we held no reference and a vnode could have been freed while we were waiting on the lock. - Get rid of vlruvp() and vfreehead(). Neither are used. vlruvp() should really be rethought before it's reintroduced. - vgonel() always returns with the vnode locked now and never puts the vnode back on a free list. The vnode will be freed as soon as the last reference is released. Sponsored by: Isilon Systems, Inc. Debugging help from: Kris Kennaway, Peter Holm Approved by: re (blanket vfs)
2005-06-16 04:41:42 +00:00
done++;
next_iter_unlocked:
if (should_yield())
kern_yield(PRI_USER);
mtx_lock(&vnode_list_mtx);
goto restart;
next_iter:
MPASS(vp->v_type != VMARKER);
if (!should_yield())
continue;
TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist);
mtx_unlock(&vnode_list_mtx);
kern_yield(PRI_USER);
mtx_lock(&vnode_list_mtx);
goto restart;
}
if (done == 0 && !retried) {
TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
TAILQ_INSERT_HEAD(&vnode_list, mvp, v_vnodelist);
retried = true;
goto restart;
}
return (done);
}
static int max_vnlru_free = 10000; /* limit on vnode free requests per call */
SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_vnlru_free,
0,
"limit on vnode free requests per call to the vnlru_free routine");
/*
* Attempt to reduce the free list by the requested amount.
*/
static int
vnlru_free_impl(int count, struct vfsops *mnt_op, struct vnode *mvp)
{
struct vnode *vp;
struct mount *mp;
int ocount;
mtx_assert(&vnode_list_mtx, MA_OWNED);
if (count > max_vnlru_free)
count = max_vnlru_free;
ocount = count;
vp = mvp;
for (;;) {
if (count == 0) {
break;
}
vp = TAILQ_NEXT(vp, v_vnodelist);
if (__predict_false(vp == NULL)) {
TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
TAILQ_INSERT_TAIL(&vnode_list, mvp, v_vnodelist);
break;
}
if (__predict_false(vp->v_type == VMARKER))
continue;
if (vp->v_holdcnt > 0)
continue;
/*
* Don't recycle if our vnode is from different type
* of mount point. Note that mp is type-safe, the
* check does not reach unmapped address even if
* vnode is reclaimed.
*/
if (mnt_op != NULL && (mp = vp->v_mount) != NULL &&
mp->mnt_op != mnt_op) {
continue;
}
if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) {
continue;
}
if (!vhold_recycle_free(vp))
continue;
TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist);
mtx_unlock(&vnode_list_mtx);
if (vtryrecycle(vp) == 0)
count--;
mtx_lock(&vnode_list_mtx);
vp = mvp;
}
return (ocount - count);
}
static int
vnlru_free_locked(int count)
{
mtx_assert(&vnode_list_mtx, MA_OWNED);
return (vnlru_free_impl(count, NULL, vnode_list_free_marker));
}
void
vnlru_free_vfsops(int count, struct vfsops *mnt_op, struct vnode *mvp)
{
MPASS(mnt_op != NULL);
MPASS(mvp != NULL);
VNPASS(mvp->v_type == VMARKER, mvp);
mtx_lock(&vnode_list_mtx);
vnlru_free_impl(count, mnt_op, mvp);
mtx_unlock(&vnode_list_mtx);
}
struct vnode *
vnlru_alloc_marker(void)
{
struct vnode *mvp;
mvp = vn_alloc_marker(NULL);
mtx_lock(&vnode_list_mtx);
TAILQ_INSERT_BEFORE(vnode_list_free_marker, mvp, v_vnodelist);
mtx_unlock(&vnode_list_mtx);
return (mvp);
}
void
vnlru_free_marker(struct vnode *mvp)
{
mtx_lock(&vnode_list_mtx);
TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist);
mtx_unlock(&vnode_list_mtx);
vn_free_marker(mvp);
}
static void
vnlru_recalc(void)
{
mtx_assert(&vnode_list_mtx, MA_OWNED);
gapvnodes = imax(desiredvnodes - wantfreevnodes, 100);
vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */
vlowat = vhiwat / 2;
}
/*
* Attempt to recycle vnodes in a context that is always safe to block.
2002-05-16 21:28:32 +00:00
* Calling vlrurecycle() from the bowels of filesystem code has some
* interesting deadlock problems.
*/
static struct proc *vnlruproc;
static int vnlruproc_sig;
/*
* The main freevnodes counter is only updated when threads requeue their vnode
* batches. CPUs are conditionally walked to compute a more accurate total.
*
* Limit how much of a slop are we willing to tolerate. Note: the actual value
* at any given moment can still exceed slop, but it should not be by significant
* margin in practice.
*/
#define VNLRU_FREEVNODES_SLOP 128
static __inline void
vfs_freevnodes_inc(void)
{
struct vdbatch *vd;
critical_enter();
vd = DPCPU_PTR(vd);
vd->freevnodes++;
critical_exit();
}
static __inline void
vfs_freevnodes_dec(void)
{
struct vdbatch *vd;
critical_enter();
vd = DPCPU_PTR(vd);
vd->freevnodes--;
critical_exit();
}
static u_long
vnlru_read_freevnodes(void)
{
struct vdbatch *vd;
long slop;
int cpu;
mtx_assert(&vnode_list_mtx, MA_OWNED);
if (freevnodes > freevnodes_old)
slop = freevnodes - freevnodes_old;
else
slop = freevnodes_old - freevnodes;
if (slop < VNLRU_FREEVNODES_SLOP)
return (freevnodes >= 0 ? freevnodes : 0);
freevnodes_old = freevnodes;
CPU_FOREACH(cpu) {
vd = DPCPU_ID_PTR((cpu), vd);
freevnodes_old += vd->freevnodes;
}
return (freevnodes_old >= 0 ? freevnodes_old : 0);
}
static bool
vnlru_under(u_long rnumvnodes, u_long limit)
{
u_long rfreevnodes, space;
if (__predict_false(rnumvnodes > desiredvnodes))
return (true);
space = desiredvnodes - rnumvnodes;
if (space < limit) {
rfreevnodes = vnlru_read_freevnodes();
if (rfreevnodes > wantfreevnodes)
space += rfreevnodes - wantfreevnodes;
}
return (space < limit);
}
static bool
vnlru_under_unlocked(u_long rnumvnodes, u_long limit)
{
long rfreevnodes, space;
if (__predict_false(rnumvnodes > desiredvnodes))
return (true);
space = desiredvnodes - rnumvnodes;
if (space < limit) {
rfreevnodes = atomic_load_long(&freevnodes);
if (rfreevnodes > wantfreevnodes)
space += rfreevnodes - wantfreevnodes;
}
return (space < limit);
}
static void
vnlru_kick(void)
{
mtx_assert(&vnode_list_mtx, MA_OWNED);
if (vnlruproc_sig == 0) {
vnlruproc_sig = 1;
wakeup(vnlruproc);
}
}
2002-06-06 15:46:38 +00:00
static void
vnlru_proc(void)
{
u_long rnumvnodes, rfreevnodes, target;
unsigned long onumvnodes;
int done, force, trigger, usevnodes;
bool reclaim_nc_src, want_reread;
EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc,
2002-06-06 15:46:38 +00:00
SHUTDOWN_PRI_FIRST);
force = 0;
want_reread = false;
for (;;) {
kproc_suspend_check(vnlruproc);
mtx_lock(&vnode_list_mtx);
rnumvnodes = atomic_load_long(&numvnodes);
if (want_reread) {
force = vnlru_under(numvnodes, vhiwat) ? 1 : 0;
want_reread = false;
}
/*
* If numvnodes is too large (due to desiredvnodes being
* adjusted using its sysctl, or emergency growth), first
* try to reduce it by discarding from the free list.
*/
if (rnumvnodes > desiredvnodes) {
vnlru_free_locked(rnumvnodes - desiredvnodes);
rnumvnodes = atomic_load_long(&numvnodes);
}
/*
* Sleep if the vnode cache is in a good state. This is
* when it is not over-full and has space for about a 4%
* or 9% expansion (by growing its size or inexcessively
* reducing its free list). Otherwise, try to reclaim
* space for a 10% expansion.
*/
if (vstir && force == 0) {
force = 1;
vstir = 0;
}
if (force == 0 && !vnlru_under(rnumvnodes, vlowat)) {
vnlruproc_sig = 0;
wakeup(&vnlruproc_sig);
msleep(vnlruproc, &vnode_list_mtx,
PVFS|PDROP, "vlruwt", hz);
continue;
}
rfreevnodes = vnlru_read_freevnodes();
onumvnodes = rnumvnodes;
/*
* Calculate parameters for recycling. These are the same
* throughout the loop to give some semblance of fairness.
* The trigger point is to avoid recycling vnodes with lots
* of resident pages. We aren't trying to free memory; we
* are trying to recycle or at least free vnodes.
*/
if (rnumvnodes <= desiredvnodes)
usevnodes = rnumvnodes - rfreevnodes;
else
usevnodes = rnumvnodes;
if (usevnodes <= 0)
usevnodes = 1;
/*
* The trigger value is is chosen to give a conservatively
* large value to ensure that it alone doesn't prevent
* making progress. The value can easily be so large that
* it is effectively infinite in some congested and
* misconfigured cases, and this is necessary. Normally
* it is about 8 to 100 (pages), which is quite large.
*/
trigger = vm_cnt.v_page_count * 2 / usevnodes;
if (force < 2)
trigger = vsmalltrigger;
reclaim_nc_src = force >= 3;
target = rnumvnodes * (int64_t)gapvnodes / imax(desiredvnodes, 1);
target = target / 10 + 1;
done = vlrureclaim(reclaim_nc_src, trigger, target);
mtx_unlock(&vnode_list_mtx);
if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes)
uma_reclaim(UMA_RECLAIM_DRAIN);
if (done == 0) {
if (force == 0 || force == 1) {
force = 2;
continue;
}
if (force == 2) {
force = 3;
continue;
}
want_reread = true;
force = 0;
vnlru_nowhere++;
tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
} else {
want_reread = true;
kern_yield(PRI_USER);
}
}
}
static struct kproc_desc vnlru_kp = {
"vnlru",
vnlru_proc,
&vnlruproc
};
SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start,
&vnlru_kp);
1994-05-24 10:09:53 +00:00
/*
* Routines having to do with the management of the vnode table.
*/
/*
- Change holdcnt use around vnode recycling. We now always keep a holdcnt ref while we're calling vgone(). This prevents transient refs from re-adding us to the free list. Previously, a vfree() triggered via vinvalbuf() getting rid of all of a vnode's pages could place a partially destructed vnode on the free list where vtryrecycle() could find it. The first call to vtryrecycle would hang up on the vnode lock, but when it failed it would place a now dead vnode onto the free list, and another call to vtryrecycle() would free an already free vnode. There were many complications of having a zero ref count while freeing which can now go away. - Change vdropl() to release the interlock before returning. All callers now respect this, so vdropl() directly frees VI_DOOMED vnodes once the last ref is dropped. This means that we'll never have VI_DOOMED vnodes on the free list. - Seperate v_incr_usecount() into v_incr_usecount(), v_decr_usecount() and v_decr_useonly(). The incr/decr split is so that incr usecount can return with the interlock still held while decr drops the interlock so it can call vdropl() which will potentially free the vnode. The calling function can't drop the lock of an already free'd node. v_decr_useonly() drops a usecount without droping the hold count. This is done so the usecount reaches zero in vput() before we recycle, however the holdcount is still 1 which prevents any new references from placing the vnode back on the free list. - Fix vnlrureclaim() to vhold the vnode since it doesn't do a vget(). We wouldn't want vnlrureclaim() to bump the usecount since this has different semantics. Also change vnlrureclaim() to do a NOWAIT on the vn_lock. When this function runs we're usually in a desperate situation and we wouldn't want to wait for any specific vnode to be released. - Fix a bunch of misc comments to reflect the new behavior. - Add vhold() and vdrop() to vflush() for the same reasons that we do in vlrureclaim(). Previously we held no reference and a vnode could have been freed while we were waiting on the lock. - Get rid of vlruvp() and vfreehead(). Neither are used. vlruvp() should really be rethought before it's reintroduced. - vgonel() always returns with the vnode locked now and never puts the vnode back on a free list. The vnode will be freed as soon as the last reference is released. Sponsored by: Isilon Systems, Inc. Debugging help from: Kris Kennaway, Peter Holm Approved by: re (blanket vfs)
2005-06-16 04:41:42 +00:00
* Try to recycle a freed vnode. We abort if anyone picks up a reference
* before we actually vgone(). This function must be called with the vnode
* held to prevent the vnode from being returned to the free list midway
* through vgone().
*/
static int
vtryrecycle(struct vnode *vp)
{
struct mount *vnmp;
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
- Change holdcnt use around vnode recycling. We now always keep a holdcnt ref while we're calling vgone(). This prevents transient refs from re-adding us to the free list. Previously, a vfree() triggered via vinvalbuf() getting rid of all of a vnode's pages could place a partially destructed vnode on the free list where vtryrecycle() could find it. The first call to vtryrecycle would hang up on the vnode lock, but when it failed it would place a now dead vnode onto the free list, and another call to vtryrecycle() would free an already free vnode. There were many complications of having a zero ref count while freeing which can now go away. - Change vdropl() to release the interlock before returning. All callers now respect this, so vdropl() directly frees VI_DOOMED vnodes once the last ref is dropped. This means that we'll never have VI_DOOMED vnodes on the free list. - Seperate v_incr_usecount() into v_incr_usecount(), v_decr_usecount() and v_decr_useonly(). The incr/decr split is so that incr usecount can return with the interlock still held while decr drops the interlock so it can call vdropl() which will potentially free the vnode. The calling function can't drop the lock of an already free'd node. v_decr_useonly() drops a usecount without droping the hold count. This is done so the usecount reaches zero in vput() before we recycle, however the holdcount is still 1 which prevents any new references from placing the vnode back on the free list. - Fix vnlrureclaim() to vhold the vnode since it doesn't do a vget(). We wouldn't want vnlrureclaim() to bump the usecount since this has different semantics. Also change vnlrureclaim() to do a NOWAIT on the vn_lock. When this function runs we're usually in a desperate situation and we wouldn't want to wait for any specific vnode to be released. - Fix a bunch of misc comments to reflect the new behavior. - Add vhold() and vdrop() to vflush() for the same reasons that we do in vlrureclaim(). Previously we held no reference and a vnode could have been freed while we were waiting on the lock. - Get rid of vlruvp() and vfreehead(). Neither are used. vlruvp() should really be rethought before it's reintroduced. - vgonel() always returns with the vnode locked now and never puts the vnode back on a free list. The vnode will be freed as soon as the last reference is released. Sponsored by: Isilon Systems, Inc. Debugging help from: Kris Kennaway, Peter Holm Approved by: re (blanket vfs)
2005-06-16 04:41:42 +00:00
VNASSERT(vp->v_holdcnt, vp,
("vtryrecycle: Recycling vp %p without a reference.", vp));
/*
* This vnode may found and locked via some other list, if so we
* can't recycle it yet.
*/
if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
CTR2(KTR_VFS,
"%s: impossible to recycle, vp %p lock is already held",
__func__, vp);
vdrop(vp);
return (EWOULDBLOCK);
}
/*
* Don't recycle if its filesystem is being suspended.
*/
if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
VOP_UNLOCK(vp);
CTR2(KTR_VFS,
"%s: impossible to recycle, cannot start the write for %p",
__func__, vp);
vdrop(vp);
- Change holdcnt use around vnode recycling. We now always keep a holdcnt ref while we're calling vgone(). This prevents transient refs from re-adding us to the free list. Previously, a vfree() triggered via vinvalbuf() getting rid of all of a vnode's pages could place a partially destructed vnode on the free list where vtryrecycle() could find it. The first call to vtryrecycle would hang up on the vnode lock, but when it failed it would place a now dead vnode onto the free list, and another call to vtryrecycle() would free an already free vnode. There were many complications of having a zero ref count while freeing which can now go away. - Change vdropl() to release the interlock before returning. All callers now respect this, so vdropl() directly frees VI_DOOMED vnodes once the last ref is dropped. This means that we'll never have VI_DOOMED vnodes on the free list. - Seperate v_incr_usecount() into v_incr_usecount(), v_decr_usecount() and v_decr_useonly(). The incr/decr split is so that incr usecount can return with the interlock still held while decr drops the interlock so it can call vdropl() which will potentially free the vnode. The calling function can't drop the lock of an already free'd node. v_decr_useonly() drops a usecount without droping the hold count. This is done so the usecount reaches zero in vput() before we recycle, however the holdcount is still 1 which prevents any new references from placing the vnode back on the free list. - Fix vnlrureclaim() to vhold the vnode since it doesn't do a vget(). We wouldn't want vnlrureclaim() to bump the usecount since this has different semantics. Also change vnlrureclaim() to do a NOWAIT on the vn_lock. When this function runs we're usually in a desperate situation and we wouldn't want to wait for any specific vnode to be released. - Fix a bunch of misc comments to reflect the new behavior. - Add vhold() and vdrop() to vflush() for the same reasons that we do in vlrureclaim(). Previously we held no reference and a vnode could have been freed while we were waiting on the lock. - Get rid of vlruvp() and vfreehead(). Neither are used. vlruvp() should really be rethought before it's reintroduced. - vgonel() always returns with the vnode locked now and never puts the vnode back on a free list. The vnode will be freed as soon as the last reference is released. Sponsored by: Isilon Systems, Inc. Debugging help from: Kris Kennaway, Peter Holm Approved by: re (blanket vfs)
2005-06-16 04:41:42 +00:00
return (EBUSY);
}
/*
* If we got this far, we need to acquire the interlock and see if
* anyone picked up this vnode from another list. If not, we will
* mark it with DOOMED via vgonel() so that anyone who does find it
* will skip over it.
*/
VI_LOCK(vp);
- Change holdcnt use around vnode recycling. We now always keep a holdcnt ref while we're calling vgone(). This prevents transient refs from re-adding us to the free list. Previously, a vfree() triggered via vinvalbuf() getting rid of all of a vnode's pages could place a partially destructed vnode on the free list where vtryrecycle() could find it. The first call to vtryrecycle would hang up on the vnode lock, but when it failed it would place a now dead vnode onto the free list, and another call to vtryrecycle() would free an already free vnode. There were many complications of having a zero ref count while freeing which can now go away. - Change vdropl() to release the interlock before returning. All callers now respect this, so vdropl() directly frees VI_DOOMED vnodes once the last ref is dropped. This means that we'll never have VI_DOOMED vnodes on the free list. - Seperate v_incr_usecount() into v_incr_usecount(), v_decr_usecount() and v_decr_useonly(). The incr/decr split is so that incr usecount can return with the interlock still held while decr drops the interlock so it can call vdropl() which will potentially free the vnode. The calling function can't drop the lock of an already free'd node. v_decr_useonly() drops a usecount without droping the hold count. This is done so the usecount reaches zero in vput() before we recycle, however the holdcount is still 1 which prevents any new references from placing the vnode back on the free list. - Fix vnlrureclaim() to vhold the vnode since it doesn't do a vget(). We wouldn't want vnlrureclaim() to bump the usecount since this has different semantics. Also change vnlrureclaim() to do a NOWAIT on the vn_lock. When this function runs we're usually in a desperate situation and we wouldn't want to wait for any specific vnode to be released. - Fix a bunch of misc comments to reflect the new behavior. - Add vhold() and vdrop() to vflush() for the same reasons that we do in vlrureclaim(). Previously we held no reference and a vnode could have been freed while we were waiting on the lock. - Get rid of vlruvp() and vfreehead(). Neither are used. vlruvp() should really be rethought before it's reintroduced. - vgonel() always returns with the vnode locked now and never puts the vnode back on a free list. The vnode will be freed as soon as the last reference is released. Sponsored by: Isilon Systems, Inc. Debugging help from: Kris Kennaway, Peter Holm Approved by: re (blanket vfs)
2005-06-16 04:41:42 +00:00
if (vp->v_usecount) {
VOP_UNLOCK(vp);
vdropl(vp);
- Change holdcnt use around vnode recycling. We now always keep a holdcnt ref while we're calling vgone(). This prevents transient refs from re-adding us to the free list. Previously, a vfree() triggered via vinvalbuf() getting rid of all of a vnode's pages could place a partially destructed vnode on the free list where vtryrecycle() could find it. The first call to vtryrecycle would hang up on the vnode lock, but when it failed it would place a now dead vnode onto the free list, and another call to vtryrecycle() would free an already free vnode. There were many complications of having a zero ref count while freeing which can now go away. - Change vdropl() to release the interlock before returning. All callers now respect this, so vdropl() directly frees VI_DOOMED vnodes once the last ref is dropped. This means that we'll never have VI_DOOMED vnodes on the free list. - Seperate v_incr_usecount() into v_incr_usecount(), v_decr_usecount() and v_decr_useonly(). The incr/decr split is so that incr usecount can return with the interlock still held while decr drops the interlock so it can call vdropl() which will potentially free the vnode. The calling function can't drop the lock of an already free'd node. v_decr_useonly() drops a usecount without droping the hold count. This is done so the usecount reaches zero in vput() before we recycle, however the holdcount is still 1 which prevents any new references from placing the vnode back on the free list. - Fix vnlrureclaim() to vhold the vnode since it doesn't do a vget(). We wouldn't want vnlrureclaim() to bump the usecount since this has different semantics. Also change vnlrureclaim() to do a NOWAIT on the vn_lock. When this function runs we're usually in a desperate situation and we wouldn't want to wait for any specific vnode to be released. - Fix a bunch of misc comments to reflect the new behavior. - Add vhold() and vdrop() to vflush() for the same reasons that we do in vlrureclaim(). Previously we held no reference and a vnode could have been freed while we were waiting on the lock. - Get rid of vlruvp() and vfreehead(). Neither are used. vlruvp() should really be rethought before it's reintroduced. - vgonel() always returns with the vnode locked now and never puts the vnode back on a free list. The vnode will be freed as soon as the last reference is released. Sponsored by: Isilon Systems, Inc. Debugging help from: Kris Kennaway, Peter Holm Approved by: re (blanket vfs)
2005-06-16 04:41:42 +00:00
vn_finished_write(vnmp);
CTR2(KTR_VFS,
"%s: impossible to recycle, %p is already referenced",
__func__, vp);
- Change holdcnt use around vnode recycling. We now always keep a holdcnt ref while we're calling vgone(). This prevents transient refs from re-adding us to the free list. Previously, a vfree() triggered via vinvalbuf() getting rid of all of a vnode's pages could place a partially destructed vnode on the free list where vtryrecycle() could find it. The first call to vtryrecycle would hang up on the vnode lock, but when it failed it would place a now dead vnode onto the free list, and another call to vtryrecycle() would free an already free vnode. There were many complications of having a zero ref count while freeing which can now go away. - Change vdropl() to release the interlock before returning. All callers now respect this, so vdropl() directly frees VI_DOOMED vnodes once the last ref is dropped. This means that we'll never have VI_DOOMED vnodes on the free list. - Seperate v_incr_usecount() into v_incr_usecount(), v_decr_usecount() and v_decr_useonly(). The incr/decr split is so that incr usecount can return with the interlock still held while decr drops the interlock so it can call vdropl() which will potentially free the vnode. The calling function can't drop the lock of an already free'd node. v_decr_useonly() drops a usecount without droping the hold count. This is done so the usecount reaches zero in vput() before we recycle, however the holdcount is still 1 which prevents any new references from placing the vnode back on the free list. - Fix vnlrureclaim() to vhold the vnode since it doesn't do a vget(). We wouldn't want vnlrureclaim() to bump the usecount since this has different semantics. Also change vnlrureclaim() to do a NOWAIT on the vn_lock. When this function runs we're usually in a desperate situation and we wouldn't want to wait for any specific vnode to be released. - Fix a bunch of misc comments to reflect the new behavior. - Add vhold() and vdrop() to vflush() for the same reasons that we do in vlrureclaim(). Previously we held no reference and a vnode could have been freed while we were waiting on the lock. - Get rid of vlruvp() and vfreehead(). Neither are used. vlruvp() should really be rethought before it's reintroduced. - vgonel() always returns with the vnode locked now and never puts the vnode back on a free list. The vnode will be freed as soon as the last reference is released. Sponsored by: Isilon Systems, Inc. Debugging help from: Kris Kennaway, Peter Holm Approved by: re (blanket vfs)
2005-06-16 04:41:42 +00:00
return (EBUSY);
}
if (!VN_IS_DOOMED(vp)) {
counter_u64_add(recycles_free_count, 1);
- Change holdcnt use around vnode recycling. We now always keep a holdcnt ref while we're calling vgone(). This prevents transient refs from re-adding us to the free list. Previously, a vfree() triggered via vinvalbuf() getting rid of all of a vnode's pages could place a partially destructed vnode on the free list where vtryrecycle() could find it. The first call to vtryrecycle would hang up on the vnode lock, but when it failed it would place a now dead vnode onto the free list, and another call to vtryrecycle() would free an already free vnode. There were many complications of having a zero ref count while freeing which can now go away. - Change vdropl() to release the interlock before returning. All callers now respect this, so vdropl() directly frees VI_DOOMED vnodes once the last ref is dropped. This means that we'll never have VI_DOOMED vnodes on the free list. - Seperate v_incr_usecount() into v_incr_usecount(), v_decr_usecount() and v_decr_useonly(). The incr/decr split is so that incr usecount can return with the interlock still held while decr drops the interlock so it can call vdropl() which will potentially free the vnode. The calling function can't drop the lock of an already free'd node. v_decr_useonly() drops a usecount without droping the hold count. This is done so the usecount reaches zero in vput() before we recycle, however the holdcount is still 1 which prevents any new references from placing the vnode back on the free list. - Fix vnlrureclaim() to vhold the vnode since it doesn't do a vget(). We wouldn't want vnlrureclaim() to bump the usecount since this has different semantics. Also change vnlrureclaim() to do a NOWAIT on the vn_lock. When this function runs we're usually in a desperate situation and we wouldn't want to wait for any specific vnode to be released. - Fix a bunch of misc comments to reflect the new behavior. - Add vhold() and vdrop() to vflush() for the same reasons that we do in vlrureclaim(). Previously we held no reference and a vnode could have been freed while we were waiting on the lock. - Get rid of vlruvp() and vfreehead(). Neither are used. vlruvp() should really be rethought before it's reintroduced. - vgonel() always returns with the vnode locked now and never puts the vnode back on a free list. The vnode will be freed as soon as the last reference is released. Sponsored by: Isilon Systems, Inc. Debugging help from: Kris Kennaway, Peter Holm Approved by: re (blanket vfs)
2005-06-16 04:41:42 +00:00
vgonel(vp);
}
VOP_UNLOCK(vp);
vdropl(vp);
vn_finished_write(vnmp);
return (0);
}
1994-05-24 10:09:53 +00:00
/*
* Allocate a new vnode.
*
* The operation never returns an error. Returning an error was disabled
* in r145385 (dated 2005) with the following comment:
*
* XXX Not all VFS_VGET/ffs_vget callers check returns.
*
* Given the age of this commit (almost 15 years at the time of writing this
* comment) restoring the ability to fail requires a significant audit of
* all codepaths.
*
* The routine can try to free a vnode or stall for up to 1 second waiting for
* vnlru to clear things up, but ultimately always performs a M_WAITOK allocation.
1994-05-24 10:09:53 +00:00
*/
static u_long vn_alloc_cyclecount;
static struct vnode * __noinline
vn_alloc_hard(struct mount *mp)
{
u_long rnumvnodes, rfreevnodes;
mtx_lock(&vnode_list_mtx);
rnumvnodes = atomic_load_long(&numvnodes);
if (rnumvnodes + 1 < desiredvnodes) {
vn_alloc_cyclecount = 0;
goto alloc;
}
rfreevnodes = vnlru_read_freevnodes();
if (vn_alloc_cyclecount++ >= rfreevnodes) {
vn_alloc_cyclecount = 0;
vstir = 1;
}
/*
* Grow the vnode cache if it will not be above its target max
* after growing. Otherwise, if the free list is nonempty, try
* to reclaim 1 item from it before growing the cache (possibly
* above its target max if the reclamation failed or is delayed).
* Otherwise, wait for some space. In all cases, schedule
* vnlru_proc() if we are getting short of space. The watermarks
* should be chosen so that we never wait or even reclaim from
* the free list to below its target minimum.
*/
if (vnlru_free_locked(1) > 0)
goto alloc;
if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
/*
* Wait for space for a new vnode.
*/
vnlru_kick();
msleep(&vnlruproc_sig, &vnode_list_mtx, PVFS, "vlruwk", hz);
if (atomic_load_long(&numvnodes) + 1 > desiredvnodes &&
vnlru_read_freevnodes() > 1)
vnlru_free_locked(1);
}
alloc:
rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1;
if (vnlru_under(rnumvnodes, vlowat))
vnlru_kick();
mtx_unlock(&vnode_list_mtx);
return (uma_zalloc_smr(vnode_zone, M_WAITOK));
}
static struct vnode *
vn_alloc(struct mount *mp)
{
u_long rnumvnodes;
if (__predict_false(vn_alloc_cyclecount != 0))
return (vn_alloc_hard(mp));
rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1;
if (__predict_false(vnlru_under_unlocked(rnumvnodes, vlowat))) {
atomic_subtract_long(&numvnodes, 1);
return (vn_alloc_hard(mp));
}
return (uma_zalloc_smr(vnode_zone, M_WAITOK));
}
static void
vn_free(struct vnode *vp)
{
atomic_subtract_long(&numvnodes, 1);
uma_zfree_smr(vnode_zone, vp);
}
/*
* Return the next vnode from the free list.
*/
int
getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
struct vnode **vpp)
{
struct vnode *vp;
struct thread *td;
struct lock_object *lo;
CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);
KASSERT(vops->registered,
("%s: not registered vector op %p\n", __func__, vops));
td = curthread;
if (td->td_vp_reserved != NULL) {
vp = td->td_vp_reserved;
td->td_vp_reserved = NULL;
} else {
vp = vn_alloc(mp);
}
counter_u64_add(vnodes_created, 1);
/*
* Locks are given the generic name "vnode" when created.
* Follow the historic practice of using the filesystem
* name when they allocated, e.g., "zfs", "ufs", "nfs, etc.
*
* Locks live in a witness group keyed on their name. Thus,
* when a lock is renamed, it must also move from the witness
* group of its old name to the witness group of its new name.
*
* The change only needs to be made when the vnode moves
* from one filesystem type to another. We ensure that each
* filesystem use a single static name pointer for its tag so
* that we can compare pointers rather than doing a strcmp().
*/
lo = &vp->v_vnlock->lock_object;
#ifdef WITNESS
if (lo->lo_name != tag) {
#endif
lo->lo_name = tag;
#ifdef WITNESS
WITNESS_DESTROY(lo);
WITNESS_INIT(lo, tag);
}
#endif
/*
* By default, don't allow shared locks unless filesystems opt-in.
*/
vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE;
/*
* Finalize various vnode identity bits.
*/
KASSERT(vp->v_object == NULL, ("stale v_object %p", vp));
KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp));
KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp));
vp->v_type = VNON;
1994-05-24 10:09:53 +00:00
vp->v_op = vops;
vp->v_irflag = 0;
v_init_counters(vp);
vn_seqc_init(vp);
vp->v_bufobj.bo_ops = &buf_ops_bio;
#ifdef DIAGNOSTIC
if (mp == NULL && vops != &dead_vnodeops)
2017-01-22 15:27:14 +00:00
printf("NULL mp in getnewvnode(9), tag %s\n", tag);
#endif
#ifdef MAC
mac_vnode_init(vp);
if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
mac_vnode_associate_singlelabel(mp, vp);
#endif
if (mp != NULL) {
vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize;
if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
vp->v_vflag |= VV_NOKNOTE;
}
/*
* For the filesystems which do not use vfs_hash_insert(),
* still initialize v_hash to have vfs_hash_index() useful.
* E.g., nullfs uses vfs_hash_index() on the lower vnode for
* its own hashing.
*/
vp->v_hash = (uintptr_t)vp >> vnsz2log;
*vpp = vp;
1994-05-24 10:09:53 +00:00
return (0);
}
void
getnewvnode_reserve(void)
{
struct thread *td;
td = curthread;
MPASS(td->td_vp_reserved == NULL);
td->td_vp_reserved = vn_alloc(NULL);
}
void
getnewvnode_drop_reserve(void)
{
struct thread *td;
td = curthread;
if (td->td_vp_reserved != NULL) {
vn_free(td->td_vp_reserved);
td->td_vp_reserved = NULL;
}
}
2020-08-23 11:05:26 +00:00
static void __noinline
freevnode(struct vnode *vp)
{
struct bufobj *bo;
/*
* The vnode has been marked for destruction, so free it.
*
* The vnode will be returned to the zone where it will
* normally remain until it is needed for another vnode. We
* need to cleanup (or verify that the cleanup has already
* been done) any residual data left from its current use
* so as not to contaminate the freshly allocated vnode.
*/
CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp);
/*
* Paired with vgone.
*/
vn_seqc_write_end_free(vp);
bo = &vp->v_bufobj;
VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
VNPASS(vp->v_holdcnt == VHOLD_NO_SMR, vp);
VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0"));
VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp,
("clean blk trie not empty"));
VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp,
("dirty blk trie not empty"));
VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst"));
VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src"));
VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for .."));
VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp,
("Dangling rangelock waiters"));
VNASSERT((vp->v_iflag & (VI_DOINGINACT | VI_OWEINACT)) == 0, vp,
("Leaked inactivation"));
VI_UNLOCK(vp);
#ifdef MAC
mac_vnode_destroy(vp);
#endif
if (vp->v_pollinfo != NULL) {
destroy_vpollinfo(vp->v_pollinfo);
vp->v_pollinfo = NULL;
}
vp->v_mountedhere = NULL;
vp->v_unpcb = NULL;
vp->v_rdev = NULL;
vp->v_fifoinfo = NULL;
vp->v_iflag = 0;
vp->v_vflag = 0;
bo->bo_flag = 0;
vn_free(vp);
}
1994-05-24 10:09:53 +00:00
/*
When we traverse the vnodes on a mountpoint we need to look out for our cached 'next vnode' being removed from this mountpoint. If we find that it was recycled, we restart our traversal from the start of the list. Code to do that is in all local disk filesystems (and a few other places) and looks roughly like this: MNT_ILOCK(mp); loop: for (vp = TAILQ_FIRST(&mp...); (vp = nvp) != NULL; nvp = TAILQ_NEXT(vp,...)) { if (vp->v_mount != mp) goto loop; MNT_IUNLOCK(mp); ... MNT_ILOCK(mp); } MNT_IUNLOCK(mp); The code which takes vnodes off a mountpoint looks like this: MNT_ILOCK(vp->v_mount); ... TAILQ_REMOVE(&vp->v_mount->mnt_nvnodelist, vp, v_nmntvnodes); ... MNT_IUNLOCK(vp->v_mount); ... vp->v_mount = something; (Take a moment and try to spot the locking error before you read on.) On a SMP system, one CPU could have removed nvp from our mountlist but not yet gotten to assign a new value to vp->v_mount while another CPU simultaneously get to the top of the traversal loop where it finds that (vp->v_mount != mp) is not true despite the fact that the vnode has indeed been removed from our mountpoint. Fix: Introduce the macro MNT_VNODE_FOREACH() to traverse the list of vnodes on a mountpoint while taking into account that vnodes may be removed from the list as we go. This saves approx 65 lines of duplicated code. Split the insmntque() which potentially moves a vnode from one mount point to another into delmntque() and insmntque() which does just what the names say. Fix delmntque() to set vp->v_mount to NULL while holding the mountpoint lock.
2004-07-04 08:52:35 +00:00
* Delete from old mount point vnode list, if on one.
1994-05-24 10:09:53 +00:00
*/
1997-11-22 08:35:46 +00:00
static void
When we traverse the vnodes on a mountpoint we need to look out for our cached 'next vnode' being removed from this mountpoint. If we find that it was recycled, we restart our traversal from the start of the list. Code to do that is in all local disk filesystems (and a few other places) and looks roughly like this: MNT_ILOCK(mp); loop: for (vp = TAILQ_FIRST(&mp...); (vp = nvp) != NULL; nvp = TAILQ_NEXT(vp,...)) { if (vp->v_mount != mp) goto loop; MNT_IUNLOCK(mp); ... MNT_ILOCK(mp); } MNT_IUNLOCK(mp); The code which takes vnodes off a mountpoint looks like this: MNT_ILOCK(vp->v_mount); ... TAILQ_REMOVE(&vp->v_mount->mnt_nvnodelist, vp, v_nmntvnodes); ... MNT_IUNLOCK(vp->v_mount); ... vp->v_mount = something; (Take a moment and try to spot the locking error before you read on.) On a SMP system, one CPU could have removed nvp from our mountlist but not yet gotten to assign a new value to vp->v_mount while another CPU simultaneously get to the top of the traversal loop where it finds that (vp->v_mount != mp) is not true despite the fact that the vnode has indeed been removed from our mountpoint. Fix: Introduce the macro MNT_VNODE_FOREACH() to traverse the list of vnodes on a mountpoint while taking into account that vnodes may be removed from the list as we go. This saves approx 65 lines of duplicated code. Split the insmntque() which potentially moves a vnode from one mount point to another into delmntque() and insmntque() which does just what the names say. Fix delmntque() to set vp->v_mount to NULL while holding the mountpoint lock.
2004-07-04 08:52:35 +00:00
delmntque(struct vnode *vp)
1994-05-24 10:09:53 +00:00
{
When we traverse the vnodes on a mountpoint we need to look out for our cached 'next vnode' being removed from this mountpoint. If we find that it was recycled, we restart our traversal from the start of the list. Code to do that is in all local disk filesystems (and a few other places) and looks roughly like this: MNT_ILOCK(mp); loop: for (vp = TAILQ_FIRST(&mp...); (vp = nvp) != NULL; nvp = TAILQ_NEXT(vp,...)) { if (vp->v_mount != mp) goto loop; MNT_IUNLOCK(mp); ... MNT_ILOCK(mp); } MNT_IUNLOCK(mp); The code which takes vnodes off a mountpoint looks like this: MNT_ILOCK(vp->v_mount); ... TAILQ_REMOVE(&vp->v_mount->mnt_nvnodelist, vp, v_nmntvnodes); ... MNT_IUNLOCK(vp->v_mount); ... vp->v_mount = something; (Take a moment and try to spot the locking error before you read on.) On a SMP system, one CPU could have removed nvp from our mountlist but not yet gotten to assign a new value to vp->v_mount while another CPU simultaneously get to the top of the traversal loop where it finds that (vp->v_mount != mp) is not true despite the fact that the vnode has indeed been removed from our mountpoint. Fix: Introduce the macro MNT_VNODE_FOREACH() to traverse the list of vnodes on a mountpoint while taking into account that vnodes may be removed from the list as we go. This saves approx 65 lines of duplicated code. Split the insmntque() which potentially moves a vnode from one mount point to another into delmntque() and insmntque() which does just what the names say. Fix delmntque() to set vp->v_mount to NULL while holding the mountpoint lock.
2004-07-04 08:52:35 +00:00
struct mount *mp;
1994-05-24 10:09:53 +00:00
VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp);
When we traverse the vnodes on a mountpoint we need to look out for our cached 'next vnode' being removed from this mountpoint. If we find that it was recycled, we restart our traversal from the start of the list. Code to do that is in all local disk filesystems (and a few other places) and looks roughly like this: MNT_ILOCK(mp); loop: for (vp = TAILQ_FIRST(&mp...); (vp = nvp) != NULL; nvp = TAILQ_NEXT(vp,...)) { if (vp->v_mount != mp) goto loop; MNT_IUNLOCK(mp); ... MNT_ILOCK(mp); } MNT_IUNLOCK(mp); The code which takes vnodes off a mountpoint looks like this: MNT_ILOCK(vp->v_mount); ... TAILQ_REMOVE(&vp->v_mount->mnt_nvnodelist, vp, v_nmntvnodes); ... MNT_IUNLOCK(vp->v_mount); ... vp->v_mount = something; (Take a moment and try to spot the locking error before you read on.) On a SMP system, one CPU could have removed nvp from our mountlist but not yet gotten to assign a new value to vp->v_mount while another CPU simultaneously get to the top of the traversal loop where it finds that (vp->v_mount != mp) is not true despite the fact that the vnode has indeed been removed from our mountpoint. Fix: Introduce the macro MNT_VNODE_FOREACH() to traverse the list of vnodes on a mountpoint while taking into account that vnodes may be removed from the list as we go. This saves approx 65 lines of duplicated code. Split the insmntque() which potentially moves a vnode from one mount point to another into delmntque() and insmntque() which does just what the names say. Fix delmntque() to set vp->v_mount to NULL while holding the mountpoint lock.
2004-07-04 08:52:35 +00:00
mp = vp->v_mount;
if (mp == NULL)
return;
When we traverse the vnodes on a mountpoint we need to look out for our cached 'next vnode' being removed from this mountpoint. If we find that it was recycled, we restart our traversal from the start of the list. Code to do that is in all local disk filesystems (and a few other places) and looks roughly like this: MNT_ILOCK(mp); loop: for (vp = TAILQ_FIRST(&mp...); (vp = nvp) != NULL; nvp = TAILQ_NEXT(vp,...)) { if (vp->v_mount != mp) goto loop; MNT_IUNLOCK(mp); ... MNT_ILOCK(mp); } MNT_IUNLOCK(mp); The code which takes vnodes off a mountpoint looks like this: MNT_ILOCK(vp->v_mount); ... TAILQ_REMOVE(&vp->v_mount->mnt_nvnodelist, vp, v_nmntvnodes); ... MNT_IUNLOCK(vp->v_mount); ... vp->v_mount = something; (Take a moment and try to spot the locking error before you read on.) On a SMP system, one CPU could have removed nvp from our mountlist but not yet gotten to assign a new value to vp->v_mount while another CPU simultaneously get to the top of the traversal loop where it finds that (vp->v_mount != mp) is not true despite the fact that the vnode has indeed been removed from our mountpoint. Fix: Introduce the macro MNT_VNODE_FOREACH() to traverse the list of vnodes on a mountpoint while taking into account that vnodes may be removed from the list as we go. This saves approx 65 lines of duplicated code. Split the insmntque() which potentially moves a vnode from one mount point to another into delmntque() and insmntque() which does just what the names say. Fix delmntque() to set vp->v_mount to NULL while holding the mountpoint lock.
2004-07-04 08:52:35 +00:00
MNT_ILOCK(mp);
VI_LOCK(vp);
vp->v_mount = NULL;
VI_UNLOCK(vp);
2005-02-17 10:28:58 +00:00
VNASSERT(mp->mnt_nvnodelistsize > 0, vp,
When we traverse the vnodes on a mountpoint we need to look out for our cached 'next vnode' being removed from this mountpoint. If we find that it was recycled, we restart our traversal from the start of the list. Code to do that is in all local disk filesystems (and a few other places) and looks roughly like this: MNT_ILOCK(mp); loop: for (vp = TAILQ_FIRST(&mp...); (vp = nvp) != NULL; nvp = TAILQ_NEXT(vp,...)) { if (vp->v_mount != mp) goto loop; MNT_IUNLOCK(mp); ... MNT_ILOCK(mp); } MNT_IUNLOCK(mp); The code which takes vnodes off a mountpoint looks like this: MNT_ILOCK(vp->v_mount); ... TAILQ_REMOVE(&vp->v_mount->mnt_nvnodelist, vp, v_nmntvnodes); ... MNT_IUNLOCK(vp->v_mount); ... vp->v_mount = something; (Take a moment and try to spot the locking error before you read on.) On a SMP system, one CPU could have removed nvp from our mountlist but not yet gotten to assign a new value to vp->v_mount while another CPU simultaneously get to the top of the traversal loop where it finds that (vp->v_mount != mp) is not true despite the fact that the vnode has indeed been removed from our mountpoint. Fix: Introduce the macro MNT_VNODE_FOREACH() to traverse the list of vnodes on a mountpoint while taking into account that vnodes may be removed from the list as we go. This saves approx 65 lines of duplicated code. Split the insmntque() which potentially moves a vnode from one mount point to another into delmntque() and insmntque() which does just what the names say. Fix delmntque() to set vp->v_mount to NULL while holding the mountpoint lock.
2004-07-04 08:52:35 +00:00
("bad mount point vnode list size"));
TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
mp->mnt_nvnodelistsize--;
MNT_REL(mp);
When we traverse the vnodes on a mountpoint we need to look out for our cached 'next vnode' being removed from this mountpoint. If we find that it was recycled, we restart our traversal from the start of the list. Code to do that is in all local disk filesystems (and a few other places) and looks roughly like this: MNT_ILOCK(mp); loop: for (vp = TAILQ_FIRST(&mp...); (vp = nvp) != NULL; nvp = TAILQ_NEXT(vp,...)) { if (vp->v_mount != mp) goto loop; MNT_IUNLOCK(mp); ... MNT_ILOCK(mp); } MNT_IUNLOCK(mp); The code which takes vnodes off a mountpoint looks like this: MNT_ILOCK(vp->v_mount); ... TAILQ_REMOVE(&vp->v_mount->mnt_nvnodelist, vp, v_nmntvnodes); ... MNT_IUNLOCK(vp->v_mount); ... vp->v_mount = something; (Take a moment and try to spot the locking error before you read on.) On a SMP system, one CPU could have removed nvp from our mountlist but not yet gotten to assign a new value to vp->v_mount while another CPU simultaneously get to the top of the traversal loop where it finds that (vp->v_mount != mp) is not true despite the fact that the vnode has indeed been removed from our mountpoint. Fix: Introduce the macro MNT_VNODE_FOREACH() to traverse the list of vnodes on a mountpoint while taking into account that vnodes may be removed from the list as we go. This saves approx 65 lines of duplicated code. Split the insmntque() which potentially moves a vnode from one mount point to another into delmntque() and insmntque() which does just what the names say. Fix delmntque() to set vp->v_mount to NULL while holding the mountpoint lock.
2004-07-04 08:52:35 +00:00
MNT_IUNLOCK(mp);
}
static void
insmntque_stddtr(struct vnode *vp, void *dtr_arg)
{
vp->v_data = NULL;
vp->v_op = &dead_vnodeops;
vgone(vp);
vput(vp);
}
When we traverse the vnodes on a mountpoint we need to look out for our cached 'next vnode' being removed from this mountpoint. If we find that it was recycled, we restart our traversal from the start of the list. Code to do that is in all local disk filesystems (and a few other places) and looks roughly like this: MNT_ILOCK(mp); loop: for (vp = TAILQ_FIRST(&mp...); (vp = nvp) != NULL; nvp = TAILQ_NEXT(vp,...)) { if (vp->v_mount != mp) goto loop; MNT_IUNLOCK(mp); ... MNT_ILOCK(mp); } MNT_IUNLOCK(mp); The code which takes vnodes off a mountpoint looks like this: MNT_ILOCK(vp->v_mount); ... TAILQ_REMOVE(&vp->v_mount->mnt_nvnodelist, vp, v_nmntvnodes); ... MNT_IUNLOCK(vp->v_mount); ... vp->v_mount = something; (Take a moment and try to spot the locking error before you read on.) On a SMP system, one CPU could have removed nvp from our mountlist but not yet gotten to assign a new value to vp->v_mount while another CPU simultaneously get to the top of the traversal loop where it finds that (vp->v_mount != mp) is not true despite the fact that the vnode has indeed been removed from our mountpoint. Fix: Introduce the macro MNT_VNODE_FOREACH() to traverse the list of vnodes on a mountpoint while taking into account that vnodes may be removed from the list as we go. This saves approx 65 lines of duplicated code. Split the insmntque() which potentially moves a vnode from one mount point to another into delmntque() and insmntque() which does just what the names say. Fix delmntque() to set vp->v_mount to NULL while holding the mountpoint lock.
2004-07-04 08:52:35 +00:00
/*
* Insert into list of vnodes for the new mount point, if available.
*/
int
insmntque1(struct vnode *vp, struct mount *mp,
void (*dtr)(struct vnode *, void *), void *dtr_arg)
When we traverse the vnodes on a mountpoint we need to look out for our cached 'next vnode' being removed from this mountpoint. If we find that it was recycled, we restart our traversal from the start of the list. Code to do that is in all local disk filesystems (and a few other places) and looks roughly like this: MNT_ILOCK(mp); loop: for (vp = TAILQ_FIRST(&mp...); (vp = nvp) != NULL; nvp = TAILQ_NEXT(vp,...)) { if (vp->v_mount != mp) goto loop; MNT_IUNLOCK(mp); ... MNT_ILOCK(mp); } MNT_IUNLOCK(mp); The code which takes vnodes off a mountpoint looks like this: MNT_ILOCK(vp->v_mount); ... TAILQ_REMOVE(&vp->v_mount->mnt_nvnodelist, vp, v_nmntvnodes); ... MNT_IUNLOCK(vp->v_mount); ... vp->v_mount = something; (Take a moment and try to spot the locking error before you read on.) On a SMP system, one CPU could have removed nvp from our mountlist but not yet gotten to assign a new value to vp->v_mount while another CPU simultaneously get to the top of the traversal loop where it finds that (vp->v_mount != mp) is not true despite the fact that the vnode has indeed been removed from our mountpoint. Fix: Introduce the macro MNT_VNODE_FOREACH() to traverse the list of vnodes on a mountpoint while taking into account that vnodes may be removed from the list as we go. This saves approx 65 lines of duplicated code. Split the insmntque() which potentially moves a vnode from one mount point to another into delmntque() and insmntque() which does just what the names say. Fix delmntque() to set vp->v_mount to NULL while holding the mountpoint lock.
2004-07-04 08:52:35 +00:00
{
KASSERT(vp->v_mount == NULL,
("insmntque: vnode already on per mount vnode list"));
2005-02-17 10:28:58 +00:00
VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp");
/*
* We acquire the vnode interlock early to ensure that the
* vnode cannot be recycled by another process releasing a
* holdcnt on it before we get it on both the vnode list
* and the active vnode list. The mount mutex protects only
* manipulation of the vnode list and the vnode freelist
* mutex protects only manipulation of the active vnode list.
* Hence the need to hold the vnode interlock throughout.
*/
MNT_ILOCK(mp);
VI_LOCK(vp);
if (((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 &&
((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 ||
mp->mnt_nvnodelistsize == 0)) &&
(vp->v_vflag & VV_FORCEINSMQ) == 0) {
VI_UNLOCK(vp);
MNT_IUNLOCK(mp);
if (dtr != NULL)
dtr(vp, dtr_arg);
return (EBUSY);
}
vp->v_mount = mp;
MNT_REF(mp);
When we traverse the vnodes on a mountpoint we need to look out for our cached 'next vnode' being removed from this mountpoint. If we find that it was recycled, we restart our traversal from the start of the list. Code to do that is in all local disk filesystems (and a few other places) and looks roughly like this: MNT_ILOCK(mp); loop: for (vp = TAILQ_FIRST(&mp...); (vp = nvp) != NULL; nvp = TAILQ_NEXT(vp,...)) { if (vp->v_mount != mp) goto loop; MNT_IUNLOCK(mp); ... MNT_ILOCK(mp); } MNT_IUNLOCK(mp); The code which takes vnodes off a mountpoint looks like this: MNT_ILOCK(vp->v_mount); ... TAILQ_REMOVE(&vp->v_mount->mnt_nvnodelist, vp, v_nmntvnodes); ... MNT_IUNLOCK(vp->v_mount); ... vp->v_mount = something; (Take a moment and try to spot the locking error before you read on.) On a SMP system, one CPU could have removed nvp from our mountlist but not yet gotten to assign a new value to vp->v_mount while another CPU simultaneously get to the top of the traversal loop where it finds that (vp->v_mount != mp) is not true despite the fact that the vnode has indeed been removed from our mountpoint. Fix: Introduce the macro MNT_VNODE_FOREACH() to traverse the list of vnodes on a mountpoint while taking into account that vnodes may be removed from the list as we go. This saves approx 65 lines of duplicated code. Split the insmntque() which potentially moves a vnode from one mount point to another into delmntque() and insmntque() which does just what the names say. Fix delmntque() to set vp->v_mount to NULL while holding the mountpoint lock.
2004-07-04 08:52:35 +00:00
TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
VNASSERT(mp->mnt_nvnodelistsize >= 0, vp,
("neg mount point vnode list size"));
When we traverse the vnodes on a mountpoint we need to look out for our cached 'next vnode' being removed from this mountpoint. If we find that it was recycled, we restart our traversal from the start of the list. Code to do that is in all local disk filesystems (and a few other places) and looks roughly like this: MNT_ILOCK(mp); loop: for (vp = TAILQ_FIRST(&mp...); (vp = nvp) != NULL; nvp = TAILQ_NEXT(vp,...)) { if (vp->v_mount != mp) goto loop; MNT_IUNLOCK(mp); ... MNT_ILOCK(mp); } MNT_IUNLOCK(mp); The code which takes vnodes off a mountpoint looks like this: MNT_ILOCK(vp->v_mount); ... TAILQ_REMOVE(&vp->v_mount->mnt_nvnodelist, vp, v_nmntvnodes); ... MNT_IUNLOCK(vp->v_mount); ... vp->v_mount = something; (Take a moment and try to spot the locking error before you read on.) On a SMP system, one CPU could have removed nvp from our mountlist but not yet gotten to assign a new value to vp->v_mount while another CPU simultaneously get to the top of the traversal loop where it finds that (vp->v_mount != mp) is not true despite the fact that the vnode has indeed been removed from our mountpoint. Fix: Introduce the macro MNT_VNODE_FOREACH() to traverse the list of vnodes on a mountpoint while taking into account that vnodes may be removed from the list as we go. This saves approx 65 lines of duplicated code. Split the insmntque() which potentially moves a vnode from one mount point to another into delmntque() and insmntque() which does just what the names say. Fix delmntque() to set vp->v_mount to NULL while holding the mountpoint lock.
2004-07-04 08:52:35 +00:00
mp->mnt_nvnodelistsize++;
VI_UNLOCK(vp);
MNT_IUNLOCK(mp);
return (0);
}
int
insmntque(struct vnode *vp, struct mount *mp)
{
return (insmntque1(vp, mp, insmntque_stddtr, NULL));
1994-05-24 10:09:53 +00:00
}
/*
* Flush out and invalidate all buffers associated with a bufobj
1994-05-24 10:09:53 +00:00
* Called with the underlying object locked.
*/
int
bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo)
1994-05-24 10:09:53 +00:00
{
int error;
BO_LOCK(bo);
if (flags & V_SAVE) {
error = bufobj_wwait(bo, slpflag, slptimeo);
if (error) {
2005-01-11 10:16:39 +00:00
BO_UNLOCK(bo);
return (error);
}
if (bo->bo_dirty.bv_cnt > 0) {
2005-01-11 10:16:39 +00:00
BO_UNLOCK(bo);
do {
error = BO_SYNC(bo, MNT_WAIT);
} while (error == ERELOOKUP);
if (error != 0)
return (error);
2005-01-11 10:16:39 +00:00
BO_LOCK(bo);
if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) {
BO_UNLOCK(bo);
return (EBUSY);
}
}
2002-06-06 15:46:38 +00:00
}
/*
* If you alter this loop please notice that interlock is dropped and
* reacquired in flushbuflist. Special care is needed to ensure that
* no race conditions occur from this.
*/
do {
error = flushbuflist(&bo->bo_clean,
flags, bo, slpflag, slptimeo);
Add the posix_fadvise(2) system call. It is somewhat similar to madvise(2) except that it operates on a file descriptor instead of a memory region. It is currently only supported on regular files. Just as with madvise(2), the advice given to posix_fadvise(2) can be divided into two types. The first type provide hints about data access patterns and are used in the file read and write routines to modify the I/O flags passed down to VOP_READ() and VOP_WRITE(). These modes are thus filesystem independent. Note that to ease implementation (and since this API is only advisory anyway), only a single non-normal range is allowed per file descriptor. The second type of hints are used to hint to the OS that data will or will not be used. These hints are implemented via a new VOP_ADVISE(). A default implementation is provided which does nothing for the WILLNEED request and attempts to move any clean pages to the cache page queue for the DONTNEED request. This latter case required two other changes. First, a new V_CLEANONLY flag was added to vinvalbuf(). This requests vinvalbuf() to only flush clean buffers for the vnode from the buffer cache and to not remove any backing pages from the vnode. This is used to ensure clean pages are not wired into the buffer cache before attempting to move them to the cache page queue. The second change adds a new vm_object_page_cache() method. This method is somewhat similar to vm_object_page_remove() except that instead of freeing each page in the specified range, it attempts to move clean pages to the cache queue if possible. To preserve the ABI of struct file, the f_cdevpriv pointer is now reused in a union to point to the currently active advice region if one is present for regular files. Reviewed by: jilles, kib, arch@ Approved by: re (kib) MFC after: 1 month
2011-11-04 04:02:50 +00:00
if (error == 0 && !(flags & V_CLEANONLY))
error = flushbuflist(&bo->bo_dirty,
flags, bo, slpflag, slptimeo);
if (error != 0 && error != EAGAIN) {
BO_UNLOCK(bo);
return (error);
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
}
} while (error != 0);
/*
* Wait for I/O to complete. XXX needs cleaning up. The vnode can
* have write I/O in-progress but if there is a VM object then the
* VM object can also have read-I/O in-progress.
*/
do {
bufobj_wwait(bo, 0, 0);
if ((flags & V_VMIO) == 0 && bo->bo_object != NULL) {
BO_UNLOCK(bo);
vm_object_pip_wait_unlocked(bo->bo_object, "bovlbx");
BO_LOCK(bo);
}
} while (bo->bo_numoutput > 0);
BO_UNLOCK(bo);
/*
* Destroy the copy in the VM cache, too.
*/
Add the posix_fadvise(2) system call. It is somewhat similar to madvise(2) except that it operates on a file descriptor instead of a memory region. It is currently only supported on regular files. Just as with madvise(2), the advice given to posix_fadvise(2) can be divided into two types. The first type provide hints about data access patterns and are used in the file read and write routines to modify the I/O flags passed down to VOP_READ() and VOP_WRITE(). These modes are thus filesystem independent. Note that to ease implementation (and since this API is only advisory anyway), only a single non-normal range is allowed per file descriptor. The second type of hints are used to hint to the OS that data will or will not be used. These hints are implemented via a new VOP_ADVISE(). A default implementation is provided which does nothing for the WILLNEED request and attempts to move any clean pages to the cache page queue for the DONTNEED request. This latter case required two other changes. First, a new V_CLEANONLY flag was added to vinvalbuf(). This requests vinvalbuf() to only flush clean buffers for the vnode from the buffer cache and to not remove any backing pages from the vnode. This is used to ensure clean pages are not wired into the buffer cache before attempting to move them to the cache page queue. The second change adds a new vm_object_page_cache() method. This method is somewhat similar to vm_object_page_remove() except that instead of freeing each page in the specified range, it attempts to move clean pages to the cache queue if possible. To preserve the ABI of struct file, the f_cdevpriv pointer is now reused in a union to point to the currently active advice region if one is present for regular files. Reviewed by: jilles, kib, arch@ Approved by: re (kib) MFC after: 1 month
2011-11-04 04:02:50 +00:00
if (bo->bo_object != NULL &&
(flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0) {
Switch the vm_object mutex to be a rwlock. This will enable in the future further optimizations where the vm_object lock will be held in read mode most of the time the page cache resident pool of pages are accessed for reading purposes. The change is mostly mechanical but few notes are reported: * The KPI changes as follow: - VM_OBJECT_LOCK() -> VM_OBJECT_WLOCK() - VM_OBJECT_TRYLOCK() -> VM_OBJECT_TRYWLOCK() - VM_OBJECT_UNLOCK() -> VM_OBJECT_WUNLOCK() - VM_OBJECT_LOCK_ASSERT(MA_OWNED) -> VM_OBJECT_ASSERT_WLOCKED() (in order to avoid visibility of implementation details) - The read-mode operations are added: VM_OBJECT_RLOCK(), VM_OBJECT_TRYRLOCK(), VM_OBJECT_RUNLOCK(), VM_OBJECT_ASSERT_RLOCKED(), VM_OBJECT_ASSERT_LOCKED() * The vm/vm_pager.h namespace pollution avoidance (forcing requiring sys/mutex.h in consumers directly to cater its inlining functions using VM_OBJECT_LOCK()) imposes that all the vm/vm_pager.h consumers now must include also sys/rwlock.h. * zfs requires a quite convoluted fix to include FreeBSD rwlocks into the compat layer because the name clash between FreeBSD and solaris versions must be avoided. At this purpose zfs redefines the vm_object locking functions directly, isolating the FreeBSD components in specific compat stubs. The KPI results heavilly broken by this commit. Thirdy part ports must be updated accordingly (I can think off-hand of VirtualBox, for example). Sponsored by: EMC / Isilon storage division Reviewed by: jeff Reviewed by: pjd (ZFS specific review) Discussed with: alc Tested by: pho
2013-03-09 02:32:23 +00:00
VM_OBJECT_WLOCK(bo->bo_object);
vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ?
OBJPR_CLEANONLY : 0);
Switch the vm_object mutex to be a rwlock. This will enable in the future further optimizations where the vm_object lock will be held in read mode most of the time the page cache resident pool of pages are accessed for reading purposes. The change is mostly mechanical but few notes are reported: * The KPI changes as follow: - VM_OBJECT_LOCK() -> VM_OBJECT_WLOCK() - VM_OBJECT_TRYLOCK() -> VM_OBJECT_TRYWLOCK() - VM_OBJECT_UNLOCK() -> VM_OBJECT_WUNLOCK() - VM_OBJECT_LOCK_ASSERT(MA_OWNED) -> VM_OBJECT_ASSERT_WLOCKED() (in order to avoid visibility of implementation details) - The read-mode operations are added: VM_OBJECT_RLOCK(), VM_OBJECT_TRYRLOCK(), VM_OBJECT_RUNLOCK(), VM_OBJECT_ASSERT_RLOCKED(), VM_OBJECT_ASSERT_LOCKED() * The vm/vm_pager.h namespace pollution avoidance (forcing requiring sys/mutex.h in consumers directly to cater its inlining functions using VM_OBJECT_LOCK()) imposes that all the vm/vm_pager.h consumers now must include also sys/rwlock.h. * zfs requires a quite convoluted fix to include FreeBSD rwlocks into the compat layer because the name clash between FreeBSD and solaris versions must be avoided. At this purpose zfs redefines the vm_object locking functions directly, isolating the FreeBSD components in specific compat stubs. The KPI results heavilly broken by this commit. Thirdy part ports must be updated accordingly (I can think off-hand of VirtualBox, for example). Sponsored by: EMC / Isilon storage division Reviewed by: jeff Reviewed by: pjd (ZFS specific review) Discussed with: alc Tested by: pho
2013-03-09 02:32:23 +00:00
VM_OBJECT_WUNLOCK(bo->bo_object);
}
#ifdef INVARIANTS
2005-01-11 10:16:39 +00:00
BO_LOCK(bo);
if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO |
V_ALLOWCLEAN)) == 0 && (bo->bo_dirty.bv_cnt > 0 ||
bo->bo_clean.bv_cnt > 0))
1994-05-24 10:09:53 +00:00
panic("vinvalbuf: flush failed");
if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0 &&
bo->bo_dirty.bv_cnt > 0)
panic("vinvalbuf: flush dirty failed");
2005-01-11 10:16:39 +00:00
BO_UNLOCK(bo);
#endif
1994-05-24 10:09:53 +00:00
return (0);
}
/*
* Flush out and invalidate all buffers associated with a vnode.
* Called with the underlying object locked.
*/
int
vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo)
{
CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
ASSERT_VOP_LOCKED(vp, "vinvalbuf");
if (vp->v_object != NULL && vp->v_object->handle != vp)
return (0);
return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo));
}
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
/*
* Flush out buffers on the specified list.
*
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
*/
static int
2011-10-27 17:43:36 +00:00
flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag,
int slptimeo)
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
{
struct buf *bp, *nbp;
int retval, error;
daddr_t lblkno;
b_xflags_t xflags;
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
ASSERT_BO_WLOCKED(bo);
retval = 0;
TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) {
/*
* If we are flushing both V_NORMAL and V_ALT buffers then
* do not skip any buffers. If we are flushing only V_NORMAL
* buffers then skip buffers marked as BX_ALTDATA. If we are
* flushing only V_ALT buffers then skip buffers not marked
* as BX_ALTDATA.
*/
if (((flags & (V_NORMAL | V_ALT)) != (V_NORMAL | V_ALT)) &&
(((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA) != 0) ||
((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0))) {
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
continue;
}
if (nbp != NULL) {
lblkno = nbp->b_lblkno;
xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN);
}
retval = EAGAIN;
error = BUF_TIMELOCK(bp,
LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo),
"flushbuf", slpflag, slptimeo);
if (error) {
2005-01-11 10:16:39 +00:00
BO_LOCK(bo);
return (error != ENOLCK ? error : EAGAIN);
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
}
KASSERT(bp->b_bufobj == bo,
("bp %p wrong b_bufobj %p should be %p",
bp, bp->b_bufobj, bo));
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
/*
* XXX Since there are no node locks for NFS, I
* believe there is a slight chance that a delayed
* write will occur while sleeping just above, so
* check for it.
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
*/
if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
(flags & V_SAVE)) {
bremfree(bp);
bp->b_flags |= B_ASYNC;
bwrite(bp);
2005-01-11 10:16:39 +00:00
BO_LOCK(bo);
return (EAGAIN); /* XXX: why not loop ? */
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
}
bremfree(bp);
bp->b_flags |= (B_INVAL | B_RELBUF);
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
bp->b_flags &= ~B_ASYNC;
brelse(bp);
2005-01-11 10:16:39 +00:00
BO_LOCK(bo);
if (nbp == NULL)
break;
nbp = gbincore(bo, lblkno);
if (nbp == NULL || (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
!= xflags)
break; /* nbp invalid */
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
}
return (retval);
Add support to UFS2 to provide storage for extended attributes. As this code is not actually used by any of the existing interfaces, it seems unlikely to break anything (famous last words). The internal kernel interface to manipulate these attributes is invoked using two new IO_ flags: IO_NORMAL and IO_EXT. These flags may be specified in the ioflags word of VOP_READ, VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that you want to do I/O to the normal data part of the file and IO_EXT means that you want to do I/O to the extended attributes part of the file. IO_NORMAL and IO_EXT are mutually exclusive for VOP_READ and VOP_WRITE, but may be specified individually or together in the case of VOP_TRUNCATE. For example, when removing a file, VOP_TRUNCATE is called with both IO_NORMAL and IO_EXT set. For backward compatibility, if neither IO_NORMAL nor IO_EXT is set, then IO_NORMAL is assumed. Note that the BA_ and IO_ flags have been `merged' so that they may both be used in the same flags word. This merger is possible by assigning the IO_ flags to the low sixteen bits and the BA_ flags the high sixteen bits. This works because the high sixteen bits of the IO_ word is reserved for read-ahead and help with write clustering so will never be used for flags. This merge lets us get away from code of the form: if (ioflags & IO_SYNC) flags |= BA_SYNC; For the future, I have considered adding a new field to the vattr structure, va_extsize. This addition could then be exported through the stat structure to allow applications to find out the size of the extended attribute storage and also would provide a more standard interface for truncating them (via VOP_SETATTR rather than VOP_TRUNCATE). I am also contemplating adding a pathconf parameter (for concreteness, lets call it _PC_MAX_EXTSIZE) which would let an application determine the maximum size of the extended atribute storage. Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
}
int
bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn)
{
struct buf *bp;
int error;
daddr_t lblkno;
ASSERT_BO_LOCKED(bo);
for (lblkno = startn;;) {
again:
bp = BUF_PCTRIE_LOOKUP_GE(&bufv->bv_root, lblkno);
if (bp == NULL || bp->b_lblkno >= endn ||
bp->b_lblkno < startn)
break;
error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
LK_INTERLOCK, BO_LOCKPTR(bo), "brlsfl", 0, 0);
if (error != 0) {
BO_RLOCK(bo);
if (error == ENOLCK)
goto again;
return (error);
}
KASSERT(bp->b_bufobj == bo,
("bp %p wrong b_bufobj %p should be %p",
bp, bp->b_bufobj, bo));
lblkno = bp->b_lblkno + 1;
if ((bp->b_flags & B_MANAGED) == 0)
bremfree(bp);
bp->b_flags |= B_RELBUF;
/*
* In the VMIO case, use the B_NOREUSE flag to hint that the
* pages backing each buffer in the range are unlikely to be
* reused. Dirty buffers will have the hint applied once
* they've been written.
*/
if ((bp->b_flags & B_VMIO) != 0)
bp->b_flags |= B_NOREUSE;
brelse(bp);
BO_RLOCK(bo);
}
return (0);
}
Some VM improvements, including elimination of alot of Sig-11 problems. Tor Egge and others have helped with various VM bugs lately, but don't blame him -- blame me!!! pmap.c: 1) Create an object for kernel page table allocations. This fixes a bogus allocation method previously used for such, by grabbing pages from the kernel object, using bogus pindexes. (This was a code cleanup, and perhaps a minor system stability issue.) pmap.c: 2) Pre-set the modify and accessed bits when prudent. This will decrease bus traffic under certain circumstances. vfs_bio.c, vfs_cluster.c: 3) Rather than calculating the beginning virtual byte offset multiple times, stick the offset into the buffer header, so that the calculated offset can be reused. (Long long multiplies are often expensive, and this is a probably unmeasurable performance improvement, and code cleanup.) vfs_bio.c: 4) Handle write recursion more intelligently (but not perfectly) so that it is less likely to cause a system panic, and is also much more robust. vfs_bio.c: 5) getblk incorrectly wrote out blocks that are incorrectly sized. The problem is fixed, and writes blocks out ONLY when B_DELWRI is true. vfs_bio.c: 6) Check that already constituted buffers have fully valid pages. If not, then make sure that the B_CACHE bit is not set. (This was a major source of Sig-11 type problems.) vfs_bio.c: 7) Fix a potential system deadlock due to an incorrectly specified sleep priority while waiting for a buffer write operation. The change that I made opens the system up to serious problems, and we need to examine the issue of process sleep priorities. vfs_cluster.c, vfs_bio.c: 8) Make clustered reads work more correctly (and more completely) when buffers are already constituted, but not fully valid. (This was another system reliability issue.) vfs_subr.c, ffs_inode.c: 9) Create a vtruncbuf function, which is used by filesystems that can truncate files. The vinvalbuf forced a file sync type operation, while vtruncbuf only invalidates the buffers past the new end of file, and also invalidates the appropriate pages. (This was a system reliabiliy and performance issue.) 10) Modify FFS to use vtruncbuf. vm_object.c: 11) Make the object rundown mechanism for OBJT_VNODE type objects work more correctly. Included in that fix, create pager entries for the OBJT_DEAD pager type, so that paging requests that might slip in during race conditions are properly handled. (This was a system reliability issue.) vm_page.c: 12) Make some of the page validation routines be a little less picky about arguments passed to them. Also, support page invalidation change the object generation count so that we handle generation counts a little more robustly. vm_pageout.c: 13) Further reduce pageout daemon activity when the system doesn't need help from it. There should be no additional performance decrease even when the pageout daemon is running. (This was a significant performance issue.) vnode_pager.c: 14) Teach the vnode pager to handle race conditions during vnode deallocations.
1998-03-16 01:56:03 +00:00
/*
* Truncate a file's buffer and pages to a specified length. This
* is in lieu of the old vinvalbuf mechanism, which performed unneeded
* sync activity.
*/
int
vtruncbuf(struct vnode *vp, off_t length, int blksize)
Some VM improvements, including elimination of alot of Sig-11 problems. Tor Egge and others have helped with various VM bugs lately, but don't blame him -- blame me!!! pmap.c: 1) Create an object for kernel page table allocations. This fixes a bogus allocation method previously used for such, by grabbing pages from the kernel object, using bogus pindexes. (This was a code cleanup, and perhaps a minor system stability issue.) pmap.c: 2) Pre-set the modify and accessed bits when prudent. This will decrease bus traffic under certain circumstances. vfs_bio.c, vfs_cluster.c: 3) Rather than calculating the beginning virtual byte offset multiple times, stick the offset into the buffer header, so that the calculated offset can be reused. (Long long multiplies are often expensive, and this is a probably unmeasurable performance improvement, and code cleanup.) vfs_bio.c: 4) Handle write recursion more intelligently (but not perfectly) so that it is less likely to cause a system panic, and is also much more robust. vfs_bio.c: 5) getblk incorrectly wrote out blocks that are incorrectly sized. The problem is fixed, and writes blocks out ONLY when B_DELWRI is true. vfs_bio.c: 6) Check that already constituted buffers have fully valid pages. If not, then make sure that the B_CACHE bit is not set. (This was a major source of Sig-11 type problems.) vfs_bio.c: 7) Fix a potential system deadlock due to an incorrectly specified sleep priority while waiting for a buffer write operation. The change that I made opens the system up to serious problems, and we need to examine the issue of process sleep priorities. vfs_cluster.c, vfs_bio.c: 8) Make clustered reads work more correctly (and more completely) when buffers are already constituted, but not fully valid. (This was another system reliability issue.) vfs_subr.c, ffs_inode.c: 9) Create a vtruncbuf function, which is used by filesystems that can truncate files. The vinvalbuf forced a file sync type operation, while vtruncbuf only invalidates the buffers past the new end of file, and also invalidates the appropriate pages. (This was a system reliabiliy and performance issue.) 10) Modify FFS to use vtruncbuf. vm_object.c: 11) Make the object rundown mechanism for OBJT_VNODE type objects work more correctly. Included in that fix, create pager entries for the OBJT_DEAD pager type, so that paging requests that might slip in during race conditions are properly handled. (This was a system reliability issue.) vm_page.c: 12) Make some of the page validation routines be a little less picky about arguments passed to them. Also, support page invalidation change the object generation count so that we handle generation counts a little more robustly. vm_pageout.c: 13) Further reduce pageout daemon activity when the system doesn't need help from it. There should be no additional performance decrease even when the pageout daemon is running. (This was a significant performance issue.) vnode_pager.c: 14) Teach the vnode pager to handle race conditions during vnode deallocations.
1998-03-16 01:56:03 +00:00
{
struct buf *bp, *nbp;
struct bufobj *bo;
daddr_t startlbn;
Some VM improvements, including elimination of alot of Sig-11 problems. Tor Egge and others have helped with various VM bugs lately, but don't blame him -- blame me!!! pmap.c: 1) Create an object for kernel page table allocations. This fixes a bogus allocation method previously used for such, by grabbing pages from the kernel object, using bogus pindexes. (This was a code cleanup, and perhaps a minor system stability issue.) pmap.c: 2) Pre-set the modify and accessed bits when prudent. This will decrease bus traffic under certain circumstances. vfs_bio.c, vfs_cluster.c: 3) Rather than calculating the beginning virtual byte offset multiple times, stick the offset into the buffer header, so that the calculated offset can be reused. (Long long multiplies are often expensive, and this is a probably unmeasurable performance improvement, and code cleanup.) vfs_bio.c: 4) Handle write recursion more intelligently (but not perfectly) so that it is less likely to cause a system panic, and is also much more robust. vfs_bio.c: 5) getblk incorrectly wrote out blocks that are incorrectly sized. The problem is fixed, and writes blocks out ONLY when B_DELWRI is true. vfs_bio.c: 6) Check that already constituted buffers have fully valid pages. If not, then make sure that the B_CACHE bit is not set. (This was a major source of Sig-11 type problems.) vfs_bio.c: 7) Fix a potential system deadlock due to an incorrectly specified sleep priority while waiting for a buffer write operation. The change that I made opens the system up to serious problems, and we need to examine the issue of process sleep priorities. vfs_cluster.c, vfs_bio.c: 8) Make clustered reads work more correctly (and more completely) when buffers are already constituted, but not fully valid. (This was another system reliability issue.) vfs_subr.c, ffs_inode.c: 9) Create a vtruncbuf function, which is used by filesystems that can truncate files. The vinvalbuf forced a file sync type operation, while vtruncbuf only invalidates the buffers past the new end of file, and also invalidates the appropriate pages. (This was a system reliabiliy and performance issue.) 10) Modify FFS to use vtruncbuf. vm_object.c: 11) Make the object rundown mechanism for OBJT_VNODE type objects work more correctly. Included in that fix, create pager entries for the OBJT_DEAD pager type, so that paging requests that might slip in during race conditions are properly handled. (This was a system reliability issue.) vm_page.c: 12) Make some of the page validation routines be a little less picky about arguments passed to them. Also, support page invalidation change the object generation count so that we handle generation counts a little more robustly. vm_pageout.c: 13) Further reduce pageout daemon activity when the system doesn't need help from it. There should be no additional performance decrease even when the pageout daemon is running. (This was a significant performance issue.) vnode_pager.c: 14) Teach the vnode pager to handle race conditions during vnode deallocations.
1998-03-16 01:56:03 +00:00
CTR4(KTR_VFS, "%s: vp %p with block %d:%ju", __func__,
vp, blksize, (uintmax_t)length);
Some VM improvements, including elimination of alot of Sig-11 problems. Tor Egge and others have helped with various VM bugs lately, but don't blame him -- blame me!!! pmap.c: 1) Create an object for kernel page table allocations. This fixes a bogus allocation method previously used for such, by grabbing pages from the kernel object, using bogus pindexes. (This was a code cleanup, and perhaps a minor system stability issue.) pmap.c: 2) Pre-set the modify and accessed bits when prudent. This will decrease bus traffic under certain circumstances. vfs_bio.c, vfs_cluster.c: 3) Rather than calculating the beginning virtual byte offset multiple times, stick the offset into the buffer header, so that the calculated offset can be reused. (Long long multiplies are often expensive, and this is a probably unmeasurable performance improvement, and code cleanup.) vfs_bio.c: 4) Handle write recursion more intelligently (but not perfectly) so that it is less likely to cause a system panic, and is also much more robust. vfs_bio.c: 5) getblk incorrectly wrote out blocks that are incorrectly sized. The problem is fixed, and writes blocks out ONLY when B_DELWRI is true. vfs_bio.c: 6) Check that already constituted buffers have fully valid pages. If not, then make sure that the B_CACHE bit is not set. (This was a major source of Sig-11 type problems.) vfs_bio.c: 7) Fix a potential system deadlock due to an incorrectly specified sleep priority while waiting for a buffer write operation. The change that I made opens the system up to serious problems, and we need to examine the issue of process sleep priorities. vfs_cluster.c, vfs_bio.c: 8) Make clustered reads work more correctly (and more completely) when buffers are already constituted, but not fully valid. (This was another system reliability issue.) vfs_subr.c, ffs_inode.c: 9) Create a vtruncbuf function, which is used by filesystems that can truncate files. The vinvalbuf forced a file sync type operation, while vtruncbuf only invalidates the buffers past the new end of file, and also invalidates the appropriate pages. (This was a system reliabiliy and performance issue.) 10) Modify FFS to use vtruncbuf. vm_object.c: 11) Make the object rundown mechanism for OBJT_VNODE type objects work more correctly. Included in that fix, create pager entries for the OBJT_DEAD pager type, so that paging requests that might slip in during race conditions are properly handled. (This was a system reliability issue.) vm_page.c: 12) Make some of the page validation routines be a little less picky about arguments passed to them. Also, support page invalidation change the object generation count so that we handle generation counts a little more robustly. vm_pageout.c: 13) Further reduce pageout daemon activity when the system doesn't need help from it. There should be no additional performance decrease even when the pageout daemon is running. (This was a significant performance issue.) vnode_pager.c: 14) Teach the vnode pager to handle race conditions during vnode deallocations.
1998-03-16 01:56:03 +00:00
/*
* Round up to the *next* lbn.
*/
startlbn = howmany(length, blksize);
Some VM improvements, including elimination of alot of Sig-11 problems. Tor Egge and others have helped with various VM bugs lately, but don't blame him -- blame me!!! pmap.c: 1) Create an object for kernel page table allocations. This fixes a bogus allocation method previously used for such, by grabbing pages from the kernel object, using bogus pindexes. (This was a code cleanup, and perhaps a minor system stability issue.) pmap.c: 2) Pre-set the modify and accessed bits when prudent. This will decrease bus traffic under certain circumstances. vfs_bio.c, vfs_cluster.c: 3) Rather than calculating the beginning virtual byte offset multiple times, stick the offset into the buffer header, so that the calculated offset can be reused. (Long long multiplies are often expensive, and this is a probably unmeasurable performance improvement, and code cleanup.) vfs_bio.c: 4) Handle write recursion more intelligently (but not perfectly) so that it is less likely to cause a system panic, and is also much more robust. vfs_bio.c: 5) getblk incorrectly wrote out blocks that are incorrectly sized. The problem is fixed, and writes blocks out ONLY when B_DELWRI is true. vfs_bio.c: 6) Check that already constituted buffers have fully valid pages. If not, then make sure that the B_CACHE bit is not set. (This was a major source of Sig-11 type problems.) vfs_bio.c: 7) Fix a potential system deadlock due to an incorrectly specified sleep priority while waiting for a buffer write operation. The change that I made opens the system up to serious problems, and we need to examine the issue of process sleep priorities. vfs_cluster.c, vfs_bio.c: 8) Make clustered reads work more correctly (and more completely) when buffers are already constituted, but not fully valid. (This was another system reliability issue.) vfs_subr.c, ffs_inode.c: 9) Create a vtruncbuf function, which is used by filesystems that can truncate files. The vinvalbuf forced a file sync type operation, while vtruncbuf only invalidates the buffers past the new end of file, and also invalidates the appropriate pages. (This was a system reliabiliy and performance issue.) 10) Modify FFS to use vtruncbuf. vm_object.c: 11) Make the object rundown mechanism for OBJT_VNODE type objects work more correctly. Included in that fix, create pager entries for the OBJT_DEAD pager type, so that paging requests that might slip in during race conditions are properly handled. (This was a system reliability issue.) vm_page.c: 12) Make some of the page validation routines be a little less picky about arguments passed to them. Also, support page invalidation change the object generation count so that we handle generation counts a little more robustly. vm_pageout.c: 13) Further reduce pageout daemon activity when the system doesn't need help from it. There should be no additional performance decrease even when the pageout daemon is running. (This was a significant performance issue.) vnode_pager.c: 14) Teach the vnode pager to handle race conditions during vnode deallocations.
1998-03-16 01:56:03 +00:00
ASSERT_VOP_LOCKED(vp, "vtruncbuf");
bo = &vp->v_bufobj;
restart_unlocked:
BO_LOCK(bo);
while (v_inval_buf_range_locked(vp, bo, startlbn, INT64_MAX) == EAGAIN)
;
if (length > 0) {
restartsync:
TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
if (bp->b_lblkno > 0)
continue;
/*
* Since we hold the vnode lock this should only
* fail if we're racing with the buf daemon.
*/
if (BUF_LOCK(bp,
LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
BO_LOCKPTR(bo)) == ENOLCK)
goto restart_unlocked;
VNASSERT((bp->b_flags & B_DELWRI), vp,
("buf(%p) on dirty queue without DELWRI", bp));
bremfree(bp);
bawrite(bp);
BO_LOCK(bo);
goto restartsync;
Some VM improvements, including elimination of alot of Sig-11 problems. Tor Egge and others have helped with various VM bugs lately, but don't blame him -- blame me!!! pmap.c: 1) Create an object for kernel page table allocations. This fixes a bogus allocation method previously used for such, by grabbing pages from the kernel object, using bogus pindexes. (This was a code cleanup, and perhaps a minor system stability issue.) pmap.c: 2) Pre-set the modify and accessed bits when prudent. This will decrease bus traffic under certain circumstances. vfs_bio.c, vfs_cluster.c: 3) Rather than calculating the beginning virtual byte offset multiple times, stick the offset into the buffer header, so that the calculated offset can be reused. (Long long multiplies are often expensive, and this is a probably unmeasurable performance improvement, and code cleanup.) vfs_bio.c: 4) Handle write recursion more intelligently (but not perfectly) so that it is less likely to cause a system panic, and is also much more robust. vfs_bio.c: 5) getblk incorrectly wrote out blocks that are incorrectly sized. The problem is fixed, and writes blocks out ONLY when B_DELWRI is true. vfs_bio.c: 6) Check that already constituted buffers have fully valid pages. If not, then make sure that the B_CACHE bit is not set. (This was a major source of Sig-11 type problems.) vfs_bio.c: 7) Fix a potential system deadlock due to an incorrectly specified sleep priority while waiting for a buffer write operation. The change that I made opens the system up to serious problems, and we need to examine the issue of process sleep priorities. vfs_cluster.c, vfs_bio.c: 8) Make clustered reads work more correctly (and more completely) when buffers are already constituted, but not fully valid. (This was another system reliability issue.) vfs_subr.c, ffs_inode.c: 9) Create a vtruncbuf function, which is used by filesystems that can truncate files. The vinvalbuf forced a file sync type operation, while vtruncbuf only invalidates the buffers past the new end of file, and also invalidates the appropriate pages. (This was a system reliabiliy and performance issue.) 10) Modify FFS to use vtruncbuf. vm_object.c: 11) Make the object rundown mechanism for OBJT_VNODE type objects work more correctly. Included in that fix, create pager entries for the OBJT_DEAD pager type, so that paging requests that might slip in during race conditions are properly handled. (This was a system reliability issue.) vm_page.c: 12) Make some of the page validation routines be a little less picky about arguments passed to them. Also, support page invalidation change the object generation count so that we handle generation counts a little more robustly. vm_pageout.c: 13) Further reduce pageout daemon activity when the system doesn't need help from it. There should be no additional performance decrease even when the pageout daemon is running. (This was a significant performance issue.) vnode_pager.c: 14) Teach the vnode pager to handle race conditions during vnode deallocations.
1998-03-16 01:56:03 +00:00
}
}
Some VM improvements, including elimination of alot of Sig-11 problems. Tor Egge and others have helped with various VM bugs lately, but don't blame him -- blame me!!! pmap.c: 1) Create an object for kernel page table allocations. This fixes a bogus allocation method previously used for such, by grabbing pages from the kernel object, using bogus pindexes. (This was a code cleanup, and perhaps a minor system stability issue.) pmap.c: 2) Pre-set the modify and accessed bits when prudent. This will decrease bus traffic under certain circumstances. vfs_bio.c, vfs_cluster.c: 3) Rather than calculating the beginning virtual byte offset multiple times, stick the offset into the buffer header, so that the calculated offset can be reused. (Long long multiplies are often expensive, and this is a probably unmeasurable performance improvement, and code cleanup.) vfs_bio.c: 4) Handle write recursion more intelligently (but not perfectly) so that it is less likely to cause a system panic, and is also much more robust. vfs_bio.c: 5) getblk incorrectly wrote out blocks that are incorrectly sized. The problem is fixed, and writes blocks out ONLY when B_DELWRI is true. vfs_bio.c: 6) Check that already constituted buffers have fully valid pages. If not, then make sure that the B_CACHE bit is not set. (This was a major source of Sig-11 type problems.) vfs_bio.c: 7) Fix a potential system deadlock due to an incorrectly specified sleep priority while waiting for a buffer write operation. The change that I made opens the system up to serious problems, and we need to examine the issue of process sleep priorities. vfs_cluster.c, vfs_bio.c: 8) Make clustered reads work more correctly (and more completely) when buffers are already constituted, but not fully valid. (This was another system reliability issue.) vfs_subr.c, ffs_inode.c: 9) Create a vtruncbuf function, which is used by filesystems that can truncate files. The vinvalbuf forced a file sync type operation, while vtruncbuf only invalidates the buffers past the new end of file, and also invalidates the appropriate pages. (This was a system reliabiliy and performance issue.) 10) Modify FFS to use vtruncbuf. vm_object.c: 11) Make the object rundown mechanism for OBJT_VNODE type objects work more correctly. Included in that fix, create pager entries for the OBJT_DEAD pager type, so that paging requests that might slip in during race conditions are properly handled. (This was a system reliability issue.) vm_page.c: 12) Make some of the page validation routines be a little less picky about arguments passed to them. Also, support page invalidation change the object generation count so that we handle generation counts a little more robustly. vm_pageout.c: 13) Further reduce pageout daemon activity when the system doesn't need help from it. There should be no additional performance decrease even when the pageout daemon is running. (This was a significant performance issue.) vnode_pager.c: 14) Teach the vnode pager to handle race conditions during vnode deallocations.
1998-03-16 01:56:03 +00:00
bufobj_wwait(bo, 0, 0);
BO_UNLOCK(bo);
vnode_pager_setsize(vp, length);
return (0);
}
/*
* Invalidate the cached pages of a file's buffer within the range of block
* numbers [startlbn, endlbn).
*/
void
v_inval_buf_range(struct vnode *vp, daddr_t startlbn, daddr_t endlbn,
int blksize)
{
struct bufobj *bo;
off_t start, end;
ASSERT_VOP_LOCKED(vp, "v_inval_buf_range");
start = blksize * startlbn;
end = blksize * endlbn;
bo = &vp->v_bufobj;
BO_LOCK(bo);
MPASS(blksize == bo->bo_bsize);
while (v_inval_buf_range_locked(vp, bo, startlbn, endlbn) == EAGAIN)
;
BO_UNLOCK(bo);
vn_pages_remove(vp, OFF_TO_IDX(start), OFF_TO_IDX(end + PAGE_SIZE - 1));
}
static int
v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo,
daddr_t startlbn, daddr_t endlbn)
{
struct buf *bp, *nbp;
bool anyfreed;
ASSERT_VOP_LOCKED(vp, "v_inval_buf_range_locked");
ASSERT_BO_LOCKED(bo);
do {
anyfreed = false;
TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn)
continue;
if (BUF_LOCK(bp,
LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
BO_LOCKPTR(bo)) == ENOLCK) {
BO_LOCK(bo);
return (EAGAIN);
}
bremfree(bp);
bp->b_flags |= B_INVAL | B_RELBUF;
bp->b_flags &= ~B_ASYNC;
brelse(bp);
anyfreed = true;
BO_LOCK(bo);
if (nbp != NULL &&
(((nbp->b_xflags & BX_VNCLEAN) == 0) ||
nbp->b_vp != vp ||
(nbp->b_flags & B_DELWRI) != 0))
return (EAGAIN);
Some VM improvements, including elimination of alot of Sig-11 problems. Tor Egge and others have helped with various VM bugs lately, but don't blame him -- blame me!!! pmap.c: 1) Create an object for kernel page table allocations. This fixes a bogus allocation method previously used for such, by grabbing pages from the kernel object, using bogus pindexes. (This was a code cleanup, and perhaps a minor system stability issue.) pmap.c: 2) Pre-set the modify and accessed bits when prudent. This will decrease bus traffic under certain circumstances. vfs_bio.c, vfs_cluster.c: 3) Rather than calculating the beginning virtual byte offset multiple times, stick the offset into the buffer header, so that the calculated offset can be reused. (Long long multiplies are often expensive, and this is a probably unmeasurable performance improvement, and code cleanup.) vfs_bio.c: 4) Handle write recursion more intelligently (but not perfectly) so that it is less likely to cause a system panic, and is also much more robust. vfs_bio.c: 5) getblk incorrectly wrote out blocks that are incorrectly sized. The problem is fixed, and writes blocks out ONLY when B_DELWRI is true. vfs_bio.c: 6) Check that already constituted buffers have fully valid pages. If not, then make sure that the B_CACHE bit is not set. (This was a major source of Sig-11 type problems.) vfs_bio.c: 7) Fix a potential system deadlock due to an incorrectly specified sleep priority while waiting for a buffer write operation. The change that I made opens the system up to serious problems, and we need to examine the issue of process sleep priorities. vfs_cluster.c, vfs_bio.c: 8) Make clustered reads work more correctly (and more completely) when buffers are already constituted, but not fully valid. (This was another system reliability issue.) vfs_subr.c, ffs_inode.c: 9) Create a vtruncbuf function, which is used by filesystems that can truncate files. The vinvalbuf forced a file sync type operation, while vtruncbuf only invalidates the buffers past the new end of file, and also invalidates the appropriate pages. (This was a system reliabiliy and performance issue.) 10) Modify FFS to use vtruncbuf. vm_object.c: 11) Make the object rundown mechanism for OBJT_VNODE type objects work more correctly. Included in that fix, create pager entries for the OBJT_DEAD pager type, so that paging requests that might slip in during race conditions are properly handled. (This was a system reliability issue.) vm_page.c: 12) Make some of the page validation routines be a little less picky about arguments passed to them. Also, support page invalidation change the object generation count so that we handle generation counts a little more robustly. vm_pageout.c: 13) Further reduce pageout daemon activity when the system doesn't need help from it. There should be no additional performance decrease even when the pageout daemon is running. (This was a significant performance issue.) vnode_pager.c: 14) Teach the vnode pager to handle race conditions during vnode deallocations.
1998-03-16 01:56:03 +00:00
}
TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn)
continue;
if (BUF_LOCK(bp,
LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
BO_LOCKPTR(bo)) == ENOLCK) {
BO_LOCK(bo);
return (EAGAIN);
}
bremfree(bp);
bp->b_flags |= B_INVAL | B_RELBUF;
bp->b_flags &= ~B_ASYNC;
brelse(bp);
anyfreed = true;
BO_LOCK(bo);
if (nbp != NULL &&
(((nbp->b_xflags & BX_VNDIRTY) == 0) ||
(nbp->b_vp != vp) ||
(nbp->b_flags & B_DELWRI) == 0))
return (EAGAIN);
}
} while (anyfreed);
Some VM improvements, including elimination of alot of Sig-11 problems. Tor Egge and others have helped with various VM bugs lately, but don't blame him -- blame me!!! pmap.c: 1) Create an object for kernel page table allocations. This fixes a bogus allocation method previously used for such, by grabbing pages from the kernel object, using bogus pindexes. (This was a code cleanup, and perhaps a minor system stability issue.) pmap.c: 2) Pre-set the modify and accessed bits when prudent. This will decrease bus traffic under certain circumstances. vfs_bio.c, vfs_cluster.c: 3) Rather than calculating the beginning virtual byte offset multiple times, stick the offset into the buffer header, so that the calculated offset can be reused. (Long long multiplies are often expensive, and this is a probably unmeasurable performance improvement, and code cleanup.) vfs_bio.c: 4) Handle write recursion more intelligently (but not perfectly) so that it is less likely to cause a system panic, and is also much more robust. vfs_bio.c: 5) getblk incorrectly wrote out blocks that are incorrectly sized. The problem is fixed, and writes blocks out ONLY when B_DELWRI is true. vfs_bio.c: 6) Check that already constituted buffers have fully valid pages. If not, then make sure that the B_CACHE bit is not set. (This was a major source of Sig-11 type problems.) vfs_bio.c: 7) Fix a potential system deadlock due to an incorrectly specified sleep priority while waiting for a buffer write operation. The change that I made opens the system up to serious problems, and we need to examine the issue of process sleep priorities. vfs_cluster.c, vfs_bio.c: 8) Make clustered reads work more correctly (and more completely) when buffers are already constituted, but not fully valid. (This was another system reliability issue.) vfs_subr.c, ffs_inode.c: 9) Create a vtruncbuf function, which is used by filesystems that can truncate files. The vinvalbuf forced a file sync type operation, while vtruncbuf only invalidates the buffers past the new end of file, and also invalidates the appropriate pages. (This was a system reliabiliy and performance issue.) 10) Modify FFS to use vtruncbuf. vm_object.c: 11) Make the object rundown mechanism for OBJT_VNODE type objects work more correctly. Included in that fix, create pager entries for the OBJT_DEAD pager type, so that paging requests that might slip in during race conditions are properly handled. (This was a system reliability issue.) vm_page.c: 12) Make some of the page validation routines be a little less picky about arguments passed to them. Also, support page invalidation change the object generation count so that we handle generation counts a little more robustly. vm_pageout.c: 13) Further reduce pageout daemon activity when the system doesn't need help from it. There should be no additional performance decrease even when the pageout daemon is running. (This was a significant performance issue.) vnode_pager.c: 14) Teach the vnode pager to handle race conditions during vnode deallocations.
1998-03-16 01:56:03 +00:00
return (0);
}
static void
buf_vlist_remove(struct buf *bp)
{
struct bufv *bv;
b_xflags_t flags;
flags = bp->b_xflags;
KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
ASSERT_BO_WLOCKED(bp->b_bufobj);
KASSERT((flags & (BX_VNDIRTY | BX_VNCLEAN)) != 0 &&
(flags & (BX_VNDIRTY | BX_VNCLEAN)) != (BX_VNDIRTY | BX_VNCLEAN),
("%s: buffer %p has invalid queue state", __func__, bp));
if ((flags & BX_VNDIRTY) != 0)
bv = &bp->b_bufobj->bo_dirty;
else
bv = &bp->b_bufobj->bo_clean;
BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno);
TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs);
bv->bv_cnt--;
bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
}
/*
* Add the buffer to the sorted clean or dirty block list.
*
* NOTE: xflags is passed as a constant, optimizing this inline function!
*/
static void
buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
{
struct bufv *bv;
struct buf *n;
int error;
ASSERT_BO_WLOCKED(bo);
KASSERT((bo->bo_flag & BO_NOBUFS) == 0,
("buf_vlist_add: bo %p does not allow bufs", bo));
KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0,
("dead bo %p", bo));
KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags));
bp->b_xflags |= xflags;
if (xflags & BX_VNDIRTY)
bv = &bo->bo_dirty;
else
bv = &bo->bo_clean;
/*
* Keep the list ordered. Optimize empty list insertion. Assume
* we tend to grow at the tail so lookup_le should usually be cheaper
* than _ge.
*/
if (bv->bv_cnt == 0 ||
bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno)
TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs);
else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL)
TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs);
else
TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs);
error = BUF_PCTRIE_INSERT(&bv->bv_root, bp);
if (error)
panic("buf_vlist_add: Preallocated nodes insufficient.");
bv->bv_cnt++;
}
/*
* Look up a buffer using the buffer tries.
*/
struct buf *
gbincore(struct bufobj *bo, daddr_t lblkno)
{
struct buf *bp;
ASSERT_BO_LOCKED(bo);
bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno);
if (bp != NULL)
return (bp);
return (BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno));
}
/*
* Look up a buf using the buffer tries, without the bufobj lock. This relies
* on SMR for safe lookup, and bufs being in a no-free zone to provide type
* stability of the result. Like other lockless lookups, the found buf may
* already be invalid by the time this function returns.
*/
struct buf *
gbincore_unlocked(struct bufobj *bo, daddr_t lblkno)
{
struct buf *bp;
ASSERT_BO_UNLOCKED(bo);
bp = BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_clean.bv_root, lblkno);
if (bp != NULL)
return (bp);
return (BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_dirty.bv_root, lblkno));
}
1994-05-24 10:09:53 +00:00
/*
* Associate a buffer with a vnode.
*/
void
bgetvp(struct vnode *vp, struct buf *bp)
1994-05-24 10:09:53 +00:00
{
struct bufobj *bo;
2005-02-17 10:28:58 +00:00
bo = &vp->v_bufobj;
ASSERT_BO_WLOCKED(bo);
2005-02-17 10:28:58 +00:00
VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free"));
1999-01-10 01:58:29 +00:00
CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags);
2005-02-17 10:28:58 +00:00
VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp,
("bgetvp: bp already attached! %p", bp));
vhold(vp);
1994-05-24 10:09:53 +00:00
bp->b_vp = vp;
bp->b_bufobj = bo;
1994-05-24 10:09:53 +00:00
/*
* Insert onto list for new vnode.
*/
buf_vlist_add(bp, bo, BX_VNCLEAN);
1994-05-24 10:09:53 +00:00
}
/*
* Disassociate a buffer from a vnode.
*/
void
brelvp(struct buf *bp)
1994-05-24 10:09:53 +00:00
{
struct bufobj *bo;
1994-05-24 10:09:53 +00:00
struct vnode *vp;
CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
1994-05-24 10:09:53 +00:00
/*
* Delete from old vnode list, if on one.
*/
vp = bp->b_vp; /* XXX */
bo = bp->b_bufobj;
BO_LOCK(bo);
buf_vlist_remove(bp);
if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
bo->bo_flag &= ~BO_ONWORKLST;
mtx_lock(&sync_mtx);
LIST_REMOVE(bo, bo_synclist);
syncer_worklist_len--;
mtx_unlock(&sync_mtx);
}
bp->b_vp = NULL;
bp->b_bufobj = NULL;
BO_UNLOCK(bo);
vdrop(vp);
1994-05-24 10:09:53 +00:00
}
/*
* Add an item to the syncer work queue.
*/
static void
vn_syncer_add_to_worklist(struct bufobj *bo, int delay)
{
int slot;
ASSERT_BO_WLOCKED(bo);
mtx_lock(&sync_mtx);
if (bo->bo_flag & BO_ONWORKLST)
LIST_REMOVE(bo, bo_synclist);
else {
bo->bo_flag |= BO_ONWORKLST;
syncer_worklist_len++;
}
if (delay > syncer_maxdelay - 2)
delay = syncer_maxdelay - 2;
slot = (syncer_delayno + delay) & syncer_mask;
LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist);
mtx_unlock(&sync_mtx);
}
static int
sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
{
int error, len;
mtx_lock(&sync_mtx);
len = syncer_worklist_len - sync_vnode_count;
mtx_unlock(&sync_mtx);
error = SYSCTL_OUT(req, &len, sizeof(len));
return (error);
}
SYSCTL_PROC(_vfs, OID_AUTO, worklist_len,
CTLTYPE_INT | CTLFLAG_MPSAFE| CTLFLAG_RD, NULL, 0,
sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");
Un-staticize runningbufwakeup() and staticize updateproc. Add a new private thread flag to indicate that the thread should not sleep if runningbufspace is too large. Set this flag on the bufdaemon and syncer threads so that they skip the waitrunningbufspace() call in bufwrite() rather than than checking the proc pointer vs. the known proc pointers for these two threads. A way of preventing these threads from being starved for I/O but still placing limits on their outstanding I/O would be desirable. Set this flag in ffs_copyonwrite() to prevent bufwrite() calls from blocking on the runningbufspace check while holding snaplk. This prevents snaplk from being held for an arbitrarily long period of time if runningbufspace is high and greatly reduces the contention for snaplk. The disadvantage is that ffs_copyonwrite() can start a large amount of I/O if there are a large number of snapshots, which could cause a deadlock in other parts of the code. Call runningbufwakeup() in ffs_copyonwrite() to decrement runningbufspace before attempting to grab snaplk so that I/O requests waiting on snaplk are not counted in runningbufspace as being in-progress. Increment runningbufspace again before actually launching the original I/O request. Prior to the above two changes, the system could deadlock if enough I/O requests were blocked by snaplk to prevent runningbufspace from falling below lorunningspace and one of the bawrite() calls in ffs_copyonwrite() blocked in waitrunningbufspace() while holding snaplk. See <http://www.holm.cc/stress/log/cons143.html>
2005-09-30 01:30:01 +00:00
static struct proc *updateproc;
2002-03-19 21:25:46 +00:00
static void sched_sync(void);
static struct kproc_desc up_kp = {
"syncer",
sched_sync,
&updateproc
};
SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp);
static int
sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td)
{
struct vnode *vp;
struct mount *mp;
*bo = LIST_FIRST(slp);
if (*bo == NULL)
return (0);
vp = bo2vnode(*bo);
if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0)
return (1);
/*
* We use vhold in case the vnode does not
* successfully sync. vhold prevents the vnode from
* going away when we unlock the sync_mtx so that
* we can acquire the vnode interlock.
*/
vholdl(vp);
mtx_unlock(&sync_mtx);
VI_UNLOCK(vp);
if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
vdrop(vp);
mtx_lock(&sync_mtx);
return (*bo == LIST_FIRST(slp));
}
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
(void) VOP_FSYNC(vp, MNT_LAZY, td);
VOP_UNLOCK(vp);
vn_finished_write(mp);
BO_LOCK(*bo);
if (((*bo)->bo_flag & BO_ONWORKLST) != 0) {
/*
* Put us back on the worklist. The worklist
* routine will remove us from our current
* position and then add us back in at a later
* position.
*/
vn_syncer_add_to_worklist(*bo, syncdelay);
}
BO_UNLOCK(*bo);
vdrop(vp);
mtx_lock(&sync_mtx);
return (0);
}
static int first_printf = 1;
/*
* System filesystem synchronizer daemon.
*/
static void
sched_sync(void)
{
struct synclist *next, *slp;
struct bufobj *bo;
long starttime;
struct thread *td = curthread;
int last_work_seen;
int net_worklist_len;
int syncer_final_iter;
int error;
last_work_seen = 0;
syncer_final_iter = 0;
syncer_state = SYNCER_RUNNING;
starttime = time_uptime;
Un-staticize runningbufwakeup() and staticize updateproc. Add a new private thread flag to indicate that the thread should not sleep if runningbufspace is too large. Set this flag on the bufdaemon and syncer threads so that they skip the waitrunningbufspace() call in bufwrite() rather than than checking the proc pointer vs. the known proc pointers for these two threads. A way of preventing these threads from being starved for I/O but still placing limits on their outstanding I/O would be desirable. Set this flag in ffs_copyonwrite() to prevent bufwrite() calls from blocking on the runningbufspace check while holding snaplk. This prevents snaplk from being held for an arbitrarily long period of time if runningbufspace is high and greatly reduces the contention for snaplk. The disadvantage is that ffs_copyonwrite() can start a large amount of I/O if there are a large number of snapshots, which could cause a deadlock in other parts of the code. Call runningbufwakeup() in ffs_copyonwrite() to decrement runningbufspace before attempting to grab snaplk so that I/O requests waiting on snaplk are not counted in runningbufspace as being in-progress. Increment runningbufspace again before actually launching the original I/O request. Prior to the above two changes, the system could deadlock if enough I/O requests were blocked by snaplk to prevent runningbufspace from falling below lorunningspace and one of the bawrite() calls in ffs_copyonwrite() blocked in waitrunningbufspace() while holding snaplk. See <http://www.holm.cc/stress/log/cons143.html>
2005-09-30 01:30:01 +00:00
td->td_pflags |= TDP_NORUNNINGBUF;
EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
2002-06-06 15:46:38 +00:00
SHUTDOWN_PRI_LAST);
mtx_lock(&sync_mtx);
for (;;) {
if (syncer_state == SYNCER_FINAL_DELAY &&
syncer_final_iter == 0) {
mtx_unlock(&sync_mtx);
kproc_suspend_check(td->td_proc);
mtx_lock(&sync_mtx);
}
net_worklist_len = syncer_worklist_len - sync_vnode_count;
2004-07-15 04:29:48 +00:00
if (syncer_state != SYNCER_RUNNING &&
starttime != time_uptime) {
2004-07-15 04:29:48 +00:00
if (first_printf) {
printf("\nSyncing disks, vnodes remaining... ");
2004-07-15 04:29:48 +00:00
first_printf = 0;
}
printf("%d ", net_worklist_len);
2004-07-15 04:29:48 +00:00
}
starttime = time_uptime;
/*
* Push files whose dirty time has expired. Be careful
* of interrupt race on slp queue.
*
* Skip over empty worklist slots when shutting down.
*/
do {
slp = &syncer_workitem_pending[syncer_delayno];
syncer_delayno += 1;
if (syncer_delayno == syncer_maxdelay)
syncer_delayno = 0;
next = &syncer_workitem_pending[syncer_delayno];
/*
* If the worklist has wrapped since the
* it was emptied of all but syncer vnodes,
* switch to the FINAL_DELAY state and run
* for one more second.
*/
if (syncer_state == SYNCER_SHUTTING_DOWN &&
net_worklist_len == 0 &&
last_work_seen == syncer_delayno) {
syncer_state = SYNCER_FINAL_DELAY;
syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
}
} while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
syncer_worklist_len > 0);
/*
* Keep track of the last time there was anything
* on the worklist other than syncer vnodes.
* Return to the SHUTTING_DOWN state if any
* new work appears.
*/
if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
last_work_seen = syncer_delayno;
if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
syncer_state = SYNCER_SHUTTING_DOWN;
while (!LIST_EMPTY(slp)) {
error = sync_vnode(slp, &bo, td);
if (error == 1) {
LIST_REMOVE(bo, bo_synclist);
LIST_INSERT_HEAD(next, bo, bo_synclist);
continue;
}
if (first_printf == 0) {
/*
* Drop the sync mutex, because some watchdog
* drivers need to sleep while patting
*/
mtx_unlock(&sync_mtx);
wdog_kern_pat(WD_LASTVAL);
mtx_lock(&sync_mtx);
}
}
if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
syncer_final_iter--;
/*
* The variable rushjob allows the kernel to speed up the
* processing of the filesystem syncer process. A rushjob
* value of N tells the filesystem syncer to process the next
* N seconds worth of work on its queue ASAP. Currently rushjob
* is used by the soft update code to speed up the filesystem
* syncer process when the incore state is getting so far
* ahead of the disk that the kernel memory pool is being
* threatened with exhaustion.
*/
if (rushjob > 0) {
rushjob -= 1;
continue;
}
/*
* Just sleep for a short period of time between
* iterations when shutting down to allow some I/O
* to happen.
*
* If it has taken us less than a second to process the
* current work, then wait. Otherwise start right over
* again. We can still lose time if any single round
* takes more than two seconds, but it does not really
* matter as we are just trying to generally pace the
* filesystem activity.
*/
if (syncer_state != SYNCER_RUNNING ||
time_uptime == starttime) {
thread_lock(td);
sched_prio(td, PPAUSE);
thread_unlock(td);
}
if (syncer_state != SYNCER_RUNNING)
cv_timedwait(&sync_wakeup, &sync_mtx,
hz / SYNCER_SHUTDOWN_SPEEDUP);
else if (time_uptime == starttime)
cv_timedwait(&sync_wakeup, &sync_mtx, hz);
}
}
/*
* Request the syncer daemon to speed up its work.
* We never push it to speed up more than half of its
* normal turn time, otherwise it could take over the cpu.
*/
int
speedup_syncer(void)
{
int ret = 0;
mtx_lock(&sync_mtx);
if (rushjob < syncdelay / 2) {
rushjob += 1;
stat_rush_requests += 1;
ret = 1;
}
mtx_unlock(&sync_mtx);
cv_broadcast(&sync_wakeup);
return (ret);
}
/*
* Tell the syncer to speed up its work and run though its work
* list several times, then tell it to shut down.
*/
static void
syncer_shutdown(void *arg, int howto)
{
if (howto & RB_NOSYNC)
return;
mtx_lock(&sync_mtx);
syncer_state = SYNCER_SHUTTING_DOWN;
rushjob = 0;
mtx_unlock(&sync_mtx);
cv_broadcast(&sync_wakeup);
kproc_shutdown(arg, howto);
}
void
syncer_suspend(void)
{
syncer_shutdown(updateproc, 0);
}
void
syncer_resume(void)
{
mtx_lock(&sync_mtx);
first_printf = 1;
syncer_state = SYNCER_RUNNING;
mtx_unlock(&sync_mtx);
cv_broadcast(&sync_wakeup);
kproc_resume(updateproc);
}
1994-05-24 10:09:53 +00:00
/*
* Move the buffer between the clean and dirty lists of its vnode.
1994-05-24 10:09:53 +00:00
*/
void
reassignbuf(struct buf *bp)
1994-05-24 10:09:53 +00:00
{
struct vnode *vp;
struct bufobj *bo;
int delay;
#ifdef INVARIANTS
struct bufv *bv;
#endif
1994-05-24 10:09:53 +00:00
vp = bp->b_vp;
bo = bp->b_bufobj;
KASSERT((bp->b_flags & B_PAGING) == 0,
("%s: cannot reassign paging buffer %p", __func__, bp));
CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X",
bp, bp->b_vp, bp->b_flags);
BO_LOCK(bo);
buf_vlist_remove(bp);
1994-05-24 10:09:53 +00:00
/*
These changes embody the support of the fully coherent merged VM buffer cache, much higher filesystem I/O performance, and much better paging performance. It represents the culmination of over 6 months of R&D. The majority of the merged VM/cache work is by John Dyson. The following highlights the most significant changes. Additionally, there are (mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to support the new VM/buffer scheme. vfs_bio.c: Significant rewrite of most of vfs_bio to support the merged VM buffer cache scheme. The scheme is almost fully compatible with the old filesystem interface. Significant improvement in the number of opportunities for write clustering. vfs_cluster.c, vfs_subr.c Upgrade and performance enhancements in vfs layer code to support merged VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff. vm_object.c: Yet more improvements in the collapse code. Elimination of some windows that can cause list corruption. vm_pageout.c: Fixed it, it really works better now. Somehow in 2.0, some "enhancements" broke the code. This code has been reworked from the ground-up. vm_fault.c, vm_page.c, pmap.c, vm_object.c Support for small-block filesystems with merged VM/buffer cache scheme. pmap.c vm_map.c Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of kernel PTs. vm_glue.c Much simpler and more effective swapping code. No more gratuitous swapping. proc.h Fixed the problem that the p_lock flag was not being cleared on a fork. swap_pager.c, vnode_pager.c Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the code doesn't need it anymore. machdep.c Changes to better support the parameter values for the merged VM/buffer cache scheme. machdep.c, kern_exec.c, vm_glue.c Implemented a seperate submap for temporary exec string space and another one to contain process upages. This eliminates all map fragmentation problems that previously existed. ffs_inode.c, ufs_inode.c, ufs_readwrite.c Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on busy buffers. Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
* If dirty, put on list of dirty buffers; otherwise insert onto list
* of clean buffers.
1994-05-24 10:09:53 +00:00
*/
These changes embody the support of the fully coherent merged VM buffer cache, much higher filesystem I/O performance, and much better paging performance. It represents the culmination of over 6 months of R&D. The majority of the merged VM/cache work is by John Dyson. The following highlights the most significant changes. Additionally, there are (mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to support the new VM/buffer scheme. vfs_bio.c: Significant rewrite of most of vfs_bio to support the merged VM buffer cache scheme. The scheme is almost fully compatible with the old filesystem interface. Significant improvement in the number of opportunities for write clustering. vfs_cluster.c, vfs_subr.c Upgrade and performance enhancements in vfs layer code to support merged VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff. vm_object.c: Yet more improvements in the collapse code. Elimination of some windows that can cause list corruption. vm_pageout.c: Fixed it, it really works better now. Somehow in 2.0, some "enhancements" broke the code. This code has been reworked from the ground-up. vm_fault.c, vm_page.c, pmap.c, vm_object.c Support for small-block filesystems with merged VM/buffer cache scheme. pmap.c vm_map.c Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of kernel PTs. vm_glue.c Much simpler and more effective swapping code. No more gratuitous swapping. proc.h Fixed the problem that the p_lock flag was not being cleared on a fork. swap_pager.c, vnode_pager.c Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the code doesn't need it anymore. machdep.c Changes to better support the parameter values for the merged VM/buffer cache scheme. machdep.c, kern_exec.c, vm_glue.c Implemented a seperate submap for temporary exec string space and another one to contain process upages. This eliminates all map fragmentation problems that previously existed. ffs_inode.c, ufs_inode.c, ufs_readwrite.c Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on busy buffers. Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
if (bp->b_flags & B_DELWRI) {
if ((bo->bo_flag & BO_ONWORKLST) == 0) {
switch (vp->v_type) {
case VDIR:
delay = dirdelay;
break;
case VCHR:
delay = metadelay;
break;
default:
delay = filedelay;
}
vn_syncer_add_to_worklist(bo, delay);
}
buf_vlist_add(bp, bo, BX_VNDIRTY);
These changes embody the support of the fully coherent merged VM buffer cache, much higher filesystem I/O performance, and much better paging performance. It represents the culmination of over 6 months of R&D. The majority of the merged VM/cache work is by John Dyson. The following highlights the most significant changes. Additionally, there are (mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to support the new VM/buffer scheme. vfs_bio.c: Significant rewrite of most of vfs_bio to support the merged VM buffer cache scheme. The scheme is almost fully compatible with the old filesystem interface. Significant improvement in the number of opportunities for write clustering. vfs_cluster.c, vfs_subr.c Upgrade and performance enhancements in vfs layer code to support merged VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff. vm_object.c: Yet more improvements in the collapse code. Elimination of some windows that can cause list corruption. vm_pageout.c: Fixed it, it really works better now. Somehow in 2.0, some "enhancements" broke the code. This code has been reworked from the ground-up. vm_fault.c, vm_page.c, pmap.c, vm_object.c Support for small-block filesystems with merged VM/buffer cache scheme. pmap.c vm_map.c Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of kernel PTs. vm_glue.c Much simpler and more effective swapping code. No more gratuitous swapping. proc.h Fixed the problem that the p_lock flag was not being cleared on a fork. swap_pager.c, vnode_pager.c Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the code doesn't need it anymore. machdep.c Changes to better support the parameter values for the merged VM/buffer cache scheme. machdep.c, kern_exec.c, vm_glue.c Implemented a seperate submap for temporary exec string space and another one to contain process upages. This eliminates all map fragmentation problems that previously existed. ffs_inode.c, ufs_inode.c, ufs_readwrite.c Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on busy buffers. Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
} else {
buf_vlist_add(bp, bo, BX_VNCLEAN);
if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
mtx_lock(&sync_mtx);
LIST_REMOVE(bo, bo_synclist);
syncer_worklist_len--;
mtx_unlock(&sync_mtx);
bo->bo_flag &= ~BO_ONWORKLST;
}
These changes embody the support of the fully coherent merged VM buffer cache, much higher filesystem I/O performance, and much better paging performance. It represents the culmination of over 6 months of R&D. The majority of the merged VM/cache work is by John Dyson. The following highlights the most significant changes. Additionally, there are (mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to support the new VM/buffer scheme. vfs_bio.c: Significant rewrite of most of vfs_bio to support the merged VM buffer cache scheme. The scheme is almost fully compatible with the old filesystem interface. Significant improvement in the number of opportunities for write clustering. vfs_cluster.c, vfs_subr.c Upgrade and performance enhancements in vfs layer code to support merged VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff. vm_object.c: Yet more improvements in the collapse code. Elimination of some windows that can cause list corruption. vm_pageout.c: Fixed it, it really works better now. Somehow in 2.0, some "enhancements" broke the code. This code has been reworked from the ground-up. vm_fault.c, vm_page.c, pmap.c, vm_object.c Support for small-block filesystems with merged VM/buffer cache scheme. pmap.c vm_map.c Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of kernel PTs. vm_glue.c Much simpler and more effective swapping code. No more gratuitous swapping. proc.h Fixed the problem that the p_lock flag was not being cleared on a fork. swap_pager.c, vnode_pager.c Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the code doesn't need it anymore. machdep.c Changes to better support the parameter values for the merged VM/buffer cache scheme. machdep.c, kern_exec.c, vm_glue.c Implemented a seperate submap for temporary exec string space and another one to contain process upages. This eliminates all map fragmentation problems that previously existed. ffs_inode.c, ufs_inode.c, ufs_readwrite.c Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on busy buffers. Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
}
#ifdef INVARIANTS
bv = &bo->bo_clean;
bp = TAILQ_FIRST(&bv->bv_hd);
KASSERT(bp == NULL || bp->b_bufobj == bo,
("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
bp = TAILQ_LAST(&bv->bv_hd, buflists);
KASSERT(bp == NULL || bp->b_bufobj == bo,
("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
bv = &bo->bo_dirty;
bp = TAILQ_FIRST(&bv->bv_hd);
KASSERT(bp == NULL || bp->b_bufobj == bo,
("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
bp = TAILQ_LAST(&bv->bv_hd, buflists);
KASSERT(bp == NULL || bp->b_bufobj == bo,
("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
#endif
BO_UNLOCK(bo);
1994-05-24 10:09:53 +00:00
}
- Change holdcnt use around vnode recycling. We now always keep a holdcnt ref while we're calling vgone(). This prevents transient refs from re-adding us to the free list. Previously, a vfree() triggered via vinvalbuf() getting rid of all of a vnode's pages could place a partially destructed vnode on the free list where vtryrecycle() could find it. The first call to vtryrecycle would hang up on the vnode lock, but when it failed it would place a now dead vnode onto the free list, and another call to vtryrecycle() would free an already free vnode. There were many complications of having a zero ref count while freeing which can now go away. - Change vdropl() to release the interlock before returning. All callers now respect this, so vdropl() directly frees VI_DOOMED vnodes once the last ref is dropped. This means that we'll never have VI_DOOMED vnodes on the free list. - Seperate v_incr_usecount() into v_incr_usecount(), v_decr_usecount() and v_decr_useonly(). The incr/decr split is so that incr usecount can return with the interlock still held while decr drops the interlock so it can call vdropl() which will potentially free the vnode. The calling function can't drop the lock of an already free'd node. v_decr_useonly() drops a usecount without droping the hold count. This is done so the usecount reaches zero in vput() before we recycle, however the holdcount is still 1 which prevents any new references from placing the vnode back on the free list. - Fix vnlrureclaim() to vhold the vnode since it doesn't do a vget(). We wouldn't want vnlrureclaim() to bump the usecount since this has different semantics. Also change vnlrureclaim() to do a NOWAIT on the vn_lock. When this function runs we're usually in a desperate situation and we wouldn't want to wait for any specific vnode to be released. - Fix a bunch of misc comments to reflect the new behavior. - Add vhold() and vdrop() to vflush() for the same reasons that we do in vlrureclaim(). Previously we held no reference and a vnode could have been freed while we were waiting on the lock. - Get rid of vlruvp() and vfreehead(). Neither are used. vlruvp() should really be rethought before it's reintroduced. - vgonel() always returns with the vnode locked now and never puts the vnode back on a free list. The vnode will be freed as soon as the last reference is released. Sponsored by: Isilon Systems, Inc. Debugging help from: Kris Kennaway, Peter Holm Approved by: re (blanket vfs)
2005-06-16 04:41:42 +00:00
static void
v_init_counters(struct vnode *vp)
- Change holdcnt use around vnode recycling. We now always keep a holdcnt ref while we're calling vgone(). This prevents transient refs from re-adding us to the free list. Previously, a vfree() triggered via vinvalbuf() getting rid of all of a vnode's pages could place a partially destructed vnode on the free list where vtryrecycle() could find it. The first call to vtryrecycle would hang up on the vnode lock, but when it failed it would place a now dead vnode onto the free list, and another call to vtryrecycle() would free an already free vnode. There were many complications of having a zero ref count while freeing which can now go away. - Change vdropl() to release the interlock before returning. All callers now respect this, so vdropl() directly frees VI_DOOMED vnodes once the last ref is dropped. This means that we'll never have VI_DOOMED vnodes on the free list. - Seperate v_incr_usecount() into v_incr_usecount(), v_decr_usecount() and v_decr_useonly(). The incr/decr split is so that incr usecount can return with the interlock still held while decr drops the interlock so it can call vdropl() which will potentially free the vnode. The calling function can't drop the lock of an already free'd node. v_decr_useonly() drops a usecount without droping the hold count. This is done so the usecount reaches zero in vput() before we recycle, however the holdcount is still 1 which prevents any new references from placing the vnode back on the free list. - Fix vnlrureclaim() to vhold the vnode since it doesn't do a vget(). We wouldn't want vnlrureclaim() to bump the usecount since this has different semantics. Also change vnlrureclaim() to do a NOWAIT on the vn_lock. When this function runs we're usually in a desperate situation and we wouldn't want to wait for any specific vnode to be released. - Fix a bunch of misc comments to reflect the new behavior. - Add vhold() and vdrop() to vflush() for the same reasons that we do in vlrureclaim(). Previously we held no reference and a vnode could have been freed while we were waiting on the lock. - Get rid of vlruvp() and vfreehead(). Neither are used. vlruvp() should really be rethought before it's reintroduced. - vgonel() always returns with the vnode locked now and never puts the vnode back on a free list. The vnode will be freed as soon as the last reference is released. Sponsored by: Isilon Systems, Inc. Debugging help from: Kris Kennaway, Peter Holm Approved by: re (blanket vfs)
2005-06-16 04:41:42 +00:00
{
VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0,
vp, ("%s called for an initialized vnode", __FUNCTION__));
ASSERT_VI_UNLOCKED(vp, __FUNCTION__);
refcount_init(&vp->v_holdcnt, 1);
refcount_init(&vp->v_usecount, 1);
- Change holdcnt use around vnode recycling. We now always keep a holdcnt ref while we're calling vgone(). This prevents transient refs from re-adding us to the free list. Previously, a vfree() triggered via vinvalbuf() getting rid of all of a vnode's pages could place a partially destructed vnode on the free list where vtryrecycle() could find it. The first call to vtryrecycle would hang up on the vnode lock, but when it failed it would place a now dead vnode onto the free list, and another call to vtryrecycle() would free an already free vnode. There were many complications of having a zero ref count while freeing which can now go away. - Change vdropl() to release the interlock before returning. All callers now respect this, so vdropl() directly frees VI_DOOMED vnodes once the last ref is dropped. This means that we'll never have VI_DOOMED vnodes on the free list. - Seperate v_incr_usecount() into v_incr_usecount(), v_decr_usecount() and v_decr_useonly(). The incr/decr split is so that incr usecount can return with the interlock still held while decr drops the interlock so it can call vdropl() which will potentially free the vnode. The calling function can't drop the lock of an already free'd node. v_decr_useonly() drops a usecount without droping the hold count. This is done so the usecount reaches zero in vput() before we recycle, however the holdcount is still 1 which prevents any new references from placing the vnode back on the free list. - Fix vnlrureclaim() to vhold the vnode since it doesn't do a vget(). We wouldn't want vnlrureclaim() to bump the usecount since this has different semantics. Also change vnlrureclaim() to do a NOWAIT on the vn_lock. When this function runs we're usually in a desperate situation and we wouldn't want to wait for any specific vnode to be released. - Fix a bunch of misc comments to reflect the new behavior. - Add vhold() and vdrop() to vflush() for the same reasons that we do in vlrureclaim(). Previously we held no reference and a vnode could have been freed while we were waiting on the lock. - Get rid of vlruvp() and vfreehead(). Neither are used. vlruvp() should really be rethought before it's reintroduced. - vgonel() always returns with the vnode locked now and never puts the vnode back on a free list. The vnode will be freed as soon as the last reference is released. Sponsored by: Isilon Systems, Inc. Debugging help from: Kris Kennaway, Peter Holm Approved by: re (blanket vfs)
2005-06-16 04:41:42 +00:00
}
1994-05-24 10:09:53 +00:00
/*
* Grab a particular vnode from the free list, increment its
* reference count and lock it. VIRF_DOOMED is set if the vnode
* is being destroyed. Only callers who specify LK_RETRY will
* see doomed vnodes. If inactive processing was delayed in
* vput try to do it here.
*
* usecount is manipulated using atomics without holding any locks.
*
* holdcnt can be manipulated using atomics without holding any locks,
* except when transitioning 1<->0, in which case the interlock is held.
*
* Consumers which don't guarantee liveness of the vnode can use SMR to
* try to get a reference. Note this operation can fail since the vnode
* may be awaiting getting freed by the time they get to it.
1994-05-24 10:09:53 +00:00
*/
enum vgetstate
vget_prep_smr(struct vnode *vp)
{
enum vgetstate vs;
VFS_SMR_ASSERT_ENTERED();
if (refcount_acquire_if_not_zero(&vp->v_usecount)) {
vs = VGET_USECOUNT;
} else {
if (vhold_smr(vp))
vs = VGET_HOLDCNT;
else
vs = VGET_NONE;
}
return (vs);
}
enum vgetstate
vget_prep(struct vnode *vp)
{
enum vgetstate vs;
if (refcount_acquire_if_not_zero(&vp->v_usecount)) {
vs = VGET_USECOUNT;
} else {
vhold(vp);
vs = VGET_HOLDCNT;
}
return (vs);
}
void
vget_abort(struct vnode *vp, enum vgetstate vs)
{
switch (vs) {
case VGET_USECOUNT:
vrele(vp);
break;
case VGET_HOLDCNT:
vdrop(vp);
break;
default:
__assert_unreachable();
}
}
int
vget(struct vnode *vp, int flags)
{
enum vgetstate vs;
vs = vget_prep(vp);
return (vget_finish(vp, flags, vs));
}
int
vget_finish(struct vnode *vp, int flags, enum vgetstate vs)
1994-05-24 10:09:53 +00:00
{
int error;
1994-05-24 10:09:53 +00:00
if ((flags & LK_INTERLOCK) != 0)
ASSERT_VI_LOCKED(vp, __func__);
else
ASSERT_VI_UNLOCKED(vp, __func__);
VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp);
VNPASS(vp->v_holdcnt > 0, vp);
VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp);
error = vn_lock(vp, flags);
if (__predict_false(error != 0)) {
vget_abort(vp, vs);
CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__,
vp);
return (error);
- Remove vx_lock, vx_unlock, vx_wait, etc. - Add a vn_start_write/vn_finished_write around vlrureclaim so we don't do writing ops without suspending. This could suspend the vlruproc which should not be a problem under normal circumstances. - Manually implement VMIGHTFREE in vlrureclaim as this was the only instance where it was used. - Acquire a lock before calling vgone() as it now requires it. - Move the acquisition of the vnode interlock from vtryrecycle() to getnewvnode() so that if it fails we don't drop and reacquire the vnode_free_list_mtx. - Check for a usecount or holdcount at the end of vtryrecycle() in case someone grabbed a ref while we were recycling. Abort the recycle, and on the final ref drop this vnode will be placed on the head of the free list. - Move the redundant VOP_INACTIVE protection code into the local vinactive() routine to avoid code bloat. - Keep the vnode lock held across calls to vgone() in several places. - vgonel() no longer uses XLOCK, instead callers must hold an exclusive vnode lock. The VI_DOOMED flag is set to allow other threads to detect a vnode which is no longer valid. This flag is set until the last reference is gone, and there are no chances for a new ref. vgonel() holds this lock across the entire function, which greatly simplifies logic. _ Only vfree() in one place in vgone() not three. - Adjust vget() to check the VI_DOOMED flag prior to waiting on the lock in the LK_NOWAIT case. In other cases, check after we have slept and acquired an exlusive lock. This will simulate the old vx_wait() behavior. Sponsored by: Isilon Systems, Inc.
2005-03-13 11:54:28 +00:00
}
vget_finish_ref(vp, vs);
return (0);
}
void
vget_finish_ref(struct vnode *vp, enum vgetstate vs)
{
int old;
VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp);
VNPASS(vp->v_holdcnt > 0, vp);
VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp);
if (vs == VGET_USECOUNT)
return;
/*
* We hold the vnode. If the usecount is 0 it will be utilized to keep
* the vnode around. Otherwise someone else lended their hold count and
* we have to drop ours.
*/
old = atomic_fetchadd_int(&vp->v_usecount, 1);
VNASSERT(old >= 0, vp, ("%s: wrong use count %d", __func__, old));
if (old != 0) {
#ifdef INVARIANTS
old = atomic_fetchadd_int(&vp->v_holdcnt, -1);
VNASSERT(old > 1, vp, ("%s: wrong hold count %d", __func__, old));
#else
refcount_release(&vp->v_holdcnt);
#endif
}
}
void
vref(struct vnode *vp)
{
enum vgetstate vs;
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
vs = vget_prep(vp);
vget_finish_ref(vp, vs);
}
void
vrefact(struct vnode *vp)
{
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
#ifdef INVARIANTS
int old = atomic_fetchadd_int(&vp->v_usecount, 1);
VNASSERT(old > 0, vp, ("%s: wrong use count %d", __func__, old));
#else
refcount_acquire(&vp->v_usecount);
#endif
}
void
vlazy(struct vnode *vp)
{
struct mount *mp;
VNASSERT(vp->v_holdcnt > 0, vp, ("%s: vnode not held", __func__));
if ((vp->v_mflag & VMP_LAZYLIST) != 0)
return;
/*
* We may get here for inactive routines after the vnode got doomed.
*/
if (VN_IS_DOOMED(vp))
return;
mp = vp->v_mount;
mtx_lock(&mp->mnt_listmtx);
if ((vp->v_mflag & VMP_LAZYLIST) == 0) {
vp->v_mflag |= VMP_LAZYLIST;
TAILQ_INSERT_TAIL(&mp->mnt_lazyvnodelist, vp, v_lazylist);
mp->mnt_lazyvnodelistsize++;
}
mtx_unlock(&mp->mnt_listmtx);
}
static void
vunlazy(struct vnode *vp)
{
struct mount *mp;
ASSERT_VI_LOCKED(vp, __func__);
VNPASS(!VN_IS_DOOMED(vp), vp);
mp = vp->v_mount;
mtx_lock(&mp->mnt_listmtx);
VNPASS(vp->v_mflag & VMP_LAZYLIST, vp);
/*
* Don't remove the vnode from the lazy list if another thread
* has increased the hold count. It may have re-enqueued the
* vnode to the lazy list and is now responsible for its
* removal.
*/
if (vp->v_holdcnt == 0) {
vp->v_mflag &= ~VMP_LAZYLIST;
TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist);
mp->mnt_lazyvnodelistsize--;
}
mtx_unlock(&mp->mnt_listmtx);
}
/*
* This routine is only meant to be called from vgonel prior to dooming
* the vnode.
*/
static void
vunlazy_gone(struct vnode *vp)
{
struct mount *mp;
ASSERT_VOP_ELOCKED(vp, __func__);
ASSERT_VI_LOCKED(vp, __func__);
VNPASS(!VN_IS_DOOMED(vp), vp);
if (vp->v_mflag & VMP_LAZYLIST) {
mp = vp->v_mount;
mtx_lock(&mp->mnt_listmtx);
VNPASS(vp->v_mflag & VMP_LAZYLIST, vp);
vp->v_mflag &= ~VMP_LAZYLIST;
TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist);
mp->mnt_lazyvnodelistsize--;
mtx_unlock(&mp->mnt_listmtx);
}
}
static void
vdefer_inactive(struct vnode *vp)
{
ASSERT_VI_LOCKED(vp, __func__);
VNASSERT(vp->v_holdcnt > 0, vp,
("%s: vnode without hold count", __func__));
if (VN_IS_DOOMED(vp)) {
vdropl(vp);
return;
}
if (vp->v_iflag & VI_DEFINACT) {
VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count"));
vdropl(vp);
return;
}
if (vp->v_usecount > 0) {
vp->v_iflag &= ~VI_OWEINACT;
vdropl(vp);
return;
}
vlazy(vp);
vp->v_iflag |= VI_DEFINACT;
VI_UNLOCK(vp);
counter_u64_add(deferred_inact, 1);
}
static void
vdefer_inactive_unlocked(struct vnode *vp)
{
VI_LOCK(vp);
if ((vp->v_iflag & VI_OWEINACT) == 0) {
vdropl(vp);
return;
}
vdefer_inactive(vp);
}
enum vput_op { VRELE, VPUT, VUNREF };
/*
* Handle ->v_usecount transitioning to 0.
*
* By releasing the last usecount we take ownership of the hold count which
* provides liveness of the vnode, meaning we have to vdrop.
*
* For all vnodes we may need to perform inactive processing. It requires an
* exclusive lock on the vnode, while it is legal to call here with only a
* shared lock (or no locks). If locking the vnode in an expected manner fails,
* inactive processing gets deferred to the syncer.
*
* XXX Some filesystems pass in an exclusively locked vnode and strongly depend
* on the lock being held all the way until VOP_INACTIVE. This in particular
* happens with UFS which adds half-constructed vnodes to the hash, where they
* can be found by other code.
*/
static void
vput_final(struct vnode *vp, enum vput_op func)
1994-05-24 10:09:53 +00:00
{
int error;
bool want_unlock;
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
VNPASS(vp->v_holdcnt > 0, vp);
VI_LOCK(vp);
/*
* By the time we got here someone else might have transitioned
* the count back to > 0.
*/
if (vp->v_usecount > 0)
goto out;
/*
* If the vnode is doomed vgone already performed inactive processing
* (if needed).
*/
if (VN_IS_DOOMED(vp))
goto out;
if (__predict_true(VOP_NEED_INACTIVE(vp) == 0))
goto out;
if (vp->v_iflag & VI_DOINGINACT)
goto out;
/*
* Locking operations here will drop the interlock and possibly the
* vnode lock, opening a window where the vnode can get doomed all the
* while ->v_usecount is 0. Set VI_OWEINACT to let vgone know to
* perform inactive.
*/
vp->v_iflag |= VI_OWEINACT;
want_unlock = false;
error = 0;
switch (func) {
case VRELE:
switch (VOP_ISLOCKED(vp)) {
case LK_EXCLUSIVE:
break;
case LK_EXCLOTHER:
case 0:
want_unlock = true;
error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK);
VI_LOCK(vp);
break;
default:
/*
* The lock has at least one sharer, but we have no way
* to conclude whether this is us. Play it safe and
* defer processing.
*/
error = EAGAIN;
break;
}
break;
case VPUT:
want_unlock = true;
if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK |
LK_NOWAIT);
VI_LOCK(vp);
}
break;
case VUNREF:
if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK);
VI_LOCK(vp);
}
break;
}
if (error == 0) {
if (func == VUNREF) {
VNASSERT((vp->v_vflag & VV_UNREF) == 0, vp,
("recursive vunref"));
vp->v_vflag |= VV_UNREF;
}
for (;;) {
error = vinactive(vp);
if (want_unlock)
VOP_UNLOCK(vp);
if (error != ERELOOKUP || !want_unlock)
break;
VOP_LOCK(vp, LK_EXCLUSIVE);
}
if (func == VUNREF)
vp->v_vflag &= ~VV_UNREF;
vdropl(vp);
} else {
vdefer_inactive(vp);
}
return;
out:
if (func == VPUT)
VOP_UNLOCK(vp);
vdropl(vp);
1994-05-24 10:09:53 +00:00
}
/*
* Decrement ->v_usecount for a vnode.
*
* Releasing the last use count requires additional processing, see vput_final
* above for details.
*
* Comment above each variant denotes lock state on entry and exit.
*/
/*
* in: any
* out: same as passed in
*/
void
vrele(struct vnode *vp)
{
ASSERT_VI_UNLOCKED(vp, __func__);
if (!refcount_release(&vp->v_usecount))
return;
vput_final(vp, VRELE);
}
2002-06-06 15:46:38 +00:00
/*
* in: locked
* out: unlocked
*/
void
vput(struct vnode *vp)
1994-05-24 10:09:53 +00:00
{
ASSERT_VOP_LOCKED(vp, __func__);
ASSERT_VI_UNLOCKED(vp, __func__);
if (!refcount_release(&vp->v_usecount)) {
VOP_UNLOCK(vp);
return;
}
vput_final(vp, VPUT);
}
/*
* in: locked
* out: locked
*/
void
vunref(struct vnode *vp)
{
ASSERT_VOP_LOCKED(vp, __func__);
ASSERT_VI_UNLOCKED(vp, __func__);
if (!refcount_release(&vp->v_usecount))
return;
vput_final(vp, VUNREF);
}
void
vhold(struct vnode *vp)
1994-05-24 10:09:53 +00:00
{
int old;
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
old = atomic_fetchadd_int(&vp->v_holdcnt, 1);
VNASSERT(old >= 0 && (old & VHOLD_ALL_FLAGS) == 0, vp,
("%s: wrong hold count %d", __func__, old));
if (old == 0)
vfs_freevnodes_dec();
}
void
vholdnz(struct vnode *vp)
{
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
#ifdef INVARIANTS
int old = atomic_fetchadd_int(&vp->v_holdcnt, 1);
VNASSERT(old > 0 && (old & VHOLD_ALL_FLAGS) == 0, vp,
("%s: wrong hold count %d", __func__, old));
#else
atomic_add_int(&vp->v_holdcnt, 1);
#endif
}
/*
2020-07-06 02:00:35 +00:00
* Grab a hold count unless the vnode is freed.
*
* Only use this routine if vfs smr is the only protection you have against
* freeing the vnode.
2020-07-06 02:00:35 +00:00
*
* The code loops trying to add a hold count as long as the VHOLD_NO_SMR flag
* is not set. After the flag is set the vnode becomes immutable to anyone but
* the thread which managed to set the flag.
*
* It may be tempting to replace the loop with:
* count = atomic_fetchadd_int(&vp->v_holdcnt, 1);
* if (count & VHOLD_NO_SMR) {
* backpedal and error out;
* }
*
* However, while this is more performant, it hinders debugging by eliminating
* the previously mentioned invariant.
*/
bool
vhold_smr(struct vnode *vp)
{
int count;
VFS_SMR_ASSERT_ENTERED();
count = atomic_load_int(&vp->v_holdcnt);
for (;;) {
if (count & VHOLD_NO_SMR) {
VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp,
("non-zero hold count with flags %d\n", count));
return (false);
}
VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count));
if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) {
if (count == 0)
vfs_freevnodes_dec();
return (true);
}
}
}
/*
* Hold a free vnode for recycling.
*
* Note: vnode_init references this comment.
*
* Attempts to recycle only need the global vnode list lock and have no use for
* SMR.
*
* However, vnodes get inserted into the global list before they get fully
* initialized and stay there until UMA decides to free the memory. This in
* particular means the target can be found before it becomes usable and after
* it becomes recycled. Picking up such vnodes is guarded with v_holdcnt set to
* VHOLD_NO_SMR.
*
* Note: the vnode may gain more references after we transition the count 0->1.
*/
static bool
vhold_recycle_free(struct vnode *vp)
{
int count;
mtx_assert(&vnode_list_mtx, MA_OWNED);
count = atomic_load_int(&vp->v_holdcnt);
for (;;) {
if (count & VHOLD_NO_SMR) {
VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp,
("non-zero hold count with flags %d\n", count));
return (false);
}
VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count));
if (count > 0) {
return (false);
}
if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) {
vfs_freevnodes_dec();
return (true);
}
}
}
static void __noinline
vdbatch_process(struct vdbatch *vd)
{
struct vnode *vp;
int i;
mtx_assert(&vd->lock, MA_OWNED);
MPASS(curthread->td_pinned > 0);
MPASS(vd->index == VDBATCH_SIZE);
mtx_lock(&vnode_list_mtx);
critical_enter();
freevnodes += vd->freevnodes;
for (i = 0; i < VDBATCH_SIZE; i++) {
vp = vd->tab[i];
TAILQ_REMOVE(&vnode_list, vp, v_vnodelist);
TAILQ_INSERT_TAIL(&vnode_list, vp, v_vnodelist);
MPASS(vp->v_dbatchcpu != NOCPU);
vp->v_dbatchcpu = NOCPU;
}
mtx_unlock(&vnode_list_mtx);
vd->freevnodes = 0;
bzero(vd->tab, sizeof(vd->tab));
vd->index = 0;
critical_exit();
}
static void
vdbatch_enqueue(struct vnode *vp)
{
struct vdbatch *vd;
ASSERT_VI_LOCKED(vp, __func__);
VNASSERT(!VN_IS_DOOMED(vp), vp,
("%s: deferring requeue of a doomed vnode", __func__));
if (vp->v_dbatchcpu != NOCPU) {
VI_UNLOCK(vp);
return;
}
sched_pin();
vd = DPCPU_PTR(vd);
mtx_lock(&vd->lock);
MPASS(vd->index < VDBATCH_SIZE);
MPASS(vd->tab[vd->index] == NULL);
/*
* A hack: we depend on being pinned so that we know what to put in
* ->v_dbatchcpu.
*/
vp->v_dbatchcpu = curcpu;
vd->tab[vd->index] = vp;
vd->index++;
VI_UNLOCK(vp);
if (vd->index == VDBATCH_SIZE)
vdbatch_process(vd);
mtx_unlock(&vd->lock);
sched_unpin();
}
/*
* This routine must only be called for vnodes which are about to be
* deallocated. Supporting dequeue for arbitrary vndoes would require
* validating that the locked batch matches.
*/
static void
vdbatch_dequeue(struct vnode *vp)
{
struct vdbatch *vd;
int i;
short cpu;
VNASSERT(vp->v_type == VBAD || vp->v_type == VNON, vp,
("%s: called for a used vnode\n", __func__));
cpu = vp->v_dbatchcpu;
if (cpu == NOCPU)
return;
vd = DPCPU_ID_PTR(cpu, vd);
mtx_lock(&vd->lock);
for (i = 0; i < vd->index; i++) {
if (vd->tab[i] != vp)
continue;
vp->v_dbatchcpu = NOCPU;
vd->index--;
vd->tab[i] = vd->tab[vd->index];
vd->tab[vd->index] = NULL;
break;
}
mtx_unlock(&vd->lock);
/*
* Either we dequeued the vnode above or the target CPU beat us to it.
*/
MPASS(vp->v_dbatchcpu == NOCPU);
}
- Change holdcnt use around vnode recycling. We now always keep a holdcnt ref while we're calling vgone(). This prevents transient refs from re-adding us to the free list. Previously, a vfree() triggered via vinvalbuf() getting rid of all of a vnode's pages could place a partially destructed vnode on the free list where vtryrecycle() could find it. The first call to vtryrecycle would hang up on the vnode lock, but when it failed it would place a now dead vnode onto the free list, and another call to vtryrecycle() would free an already free vnode. There were many complications of having a zero ref count while freeing which can now go away. - Change vdropl() to release the interlock before returning. All callers now respect this, so vdropl() directly frees VI_DOOMED vnodes once the last ref is dropped. This means that we'll never have VI_DOOMED vnodes on the free list. - Seperate v_incr_usecount() into v_incr_usecount(), v_decr_usecount() and v_decr_useonly(). The incr/decr split is so that incr usecount can return with the interlock still held while decr drops the interlock so it can call vdropl() which will potentially free the vnode. The calling function can't drop the lock of an already free'd node. v_decr_useonly() drops a usecount without droping the hold count. This is done so the usecount reaches zero in vput() before we recycle, however the holdcount is still 1 which prevents any new references from placing the vnode back on the free list. - Fix vnlrureclaim() to vhold the vnode since it doesn't do a vget(). We wouldn't want vnlrureclaim() to bump the usecount since this has different semantics. Also change vnlrureclaim() to do a NOWAIT on the vn_lock. When this function runs we're usually in a desperate situation and we wouldn't want to wait for any specific vnode to be released. - Fix a bunch of misc comments to reflect the new behavior. - Add vhold() and vdrop() to vflush() for the same reasons that we do in vlrureclaim(). Previously we held no reference and a vnode could have been freed while we were waiting on the lock. - Get rid of vlruvp() and vfreehead(). Neither are used. vlruvp() should really be rethought before it's reintroduced. - vgonel() always returns with the vnode locked now and never puts the vnode back on a free list. The vnode will be freed as soon as the last reference is released. Sponsored by: Isilon Systems, Inc. Debugging help from: Kris Kennaway, Peter Holm Approved by: re (blanket vfs)
2005-06-16 04:41:42 +00:00
/*
* Drop the hold count of the vnode. If this is the last reference to
* the vnode we place it on the free list unless it has been vgone'd
* (marked VIRF_DOOMED) in which case we will free it.
*
* Because the vnode vm object keeps a hold reference on the vnode if
* there is at least one resident non-cached page, the vnode cannot
* leave the active list without the page cleanup done.
- Change holdcnt use around vnode recycling. We now always keep a holdcnt ref while we're calling vgone(). This prevents transient refs from re-adding us to the free list. Previously, a vfree() triggered via vinvalbuf() getting rid of all of a vnode's pages could place a partially destructed vnode on the free list where vtryrecycle() could find it. The first call to vtryrecycle would hang up on the vnode lock, but when it failed it would place a now dead vnode onto the free list, and another call to vtryrecycle() would free an already free vnode. There were many complications of having a zero ref count while freeing which can now go away. - Change vdropl() to release the interlock before returning. All callers now respect this, so vdropl() directly frees VI_DOOMED vnodes once the last ref is dropped. This means that we'll never have VI_DOOMED vnodes on the free list. - Seperate v_incr_usecount() into v_incr_usecount(), v_decr_usecount() and v_decr_useonly(). The incr/decr split is so that incr usecount can return with the interlock still held while decr drops the interlock so it can call vdropl() which will potentially free the vnode. The calling function can't drop the lock of an already free'd node. v_decr_useonly() drops a usecount without droping the hold count. This is done so the usecount reaches zero in vput() before we recycle, however the holdcount is still 1 which prevents any new references from placing the vnode back on the free list. - Fix vnlrureclaim() to vhold the vnode since it doesn't do a vget(). We wouldn't want vnlrureclaim() to bump the usecount since this has different semantics. Also change vnlrureclaim() to do a NOWAIT on the vn_lock. When this function runs we're usually in a desperate situation and we wouldn't want to wait for any specific vnode to be released. - Fix a bunch of misc comments to reflect the new behavior. - Add vhold() and vdrop() to vflush() for the same reasons that we do in vlrureclaim(). Previously we held no reference and a vnode could have been freed while we were waiting on the lock. - Get rid of vlruvp() and vfreehead(). Neither are used. vlruvp() should really be rethought before it's reintroduced. - vgonel() always returns with the vnode locked now and never puts the vnode back on a free list. The vnode will be freed as soon as the last reference is released. Sponsored by: Isilon Systems, Inc. Debugging help from: Kris Kennaway, Peter Holm Approved by: re (blanket vfs)
2005-06-16 04:41:42 +00:00
*/
static void __noinline
vdropl_final(struct vnode *vp)
{
ASSERT_VI_LOCKED(vp, __func__);
VNPASS(VN_IS_DOOMED(vp), vp);
/*
* Set the VHOLD_NO_SMR flag.
*
* We may be racing against vhold_smr. If they win we can just pretend
* we never got this far, they will vdrop later.
*/
if (__predict_false(!atomic_cmpset_int(&vp->v_holdcnt, 0, VHOLD_NO_SMR))) {
vfs_freevnodes_inc();
VI_UNLOCK(vp);
/*
* We lost the aforementioned race. Any subsequent access is
* invalid as they might have managed to vdropl on their own.
*/
return;
}
/*
* Don't bump freevnodes as this one is going away.
*/
freevnode(vp);
}
void
vdrop(struct vnode *vp)
{
ASSERT_VI_UNLOCKED(vp, __func__);
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
if (refcount_release_if_not_last(&vp->v_holdcnt))
return;
VI_LOCK(vp);
vdropl(vp);
}
void
vdropl(struct vnode *vp)
{
ASSERT_VI_LOCKED(vp, __func__);
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
if (!refcount_release(&vp->v_holdcnt)) {
VI_UNLOCK(vp);
return;
}
VNPASS((vp->v_iflag & VI_OWEINACT) == 0, vp);
VNPASS((vp->v_iflag & VI_DEFINACT) == 0, vp);
if (VN_IS_DOOMED(vp)) {
vdropl_final(vp);
return;
}
vfs_freevnodes_inc();
if (vp->v_mflag & VMP_LAZYLIST) {
vunlazy(vp);
}
/*
* Also unlocks the interlock. We can't assert on it as we
* released our hold and by now the vnode might have been
* freed.
*/
vdbatch_enqueue(vp);
1994-05-24 10:09:53 +00:00
}
- Change holdcnt use around vnode recycling. We now always keep a holdcnt ref while we're calling vgone(). This prevents transient refs from re-adding us to the free list. Previously, a vfree() triggered via vinvalbuf() getting rid of all of a vnode's pages could place a partially destructed vnode on the free list where vtryrecycle() could find it. The first call to vtryrecycle would hang up on the vnode lock, but when it failed it would place a now dead vnode onto the free list, and another call to vtryrecycle() would free an already free vnode. There were many complications of having a zero ref count while freeing which can now go away. - Change vdropl() to release the interlock before returning. All callers now respect this, so vdropl() directly frees VI_DOOMED vnodes once the last ref is dropped. This means that we'll never have VI_DOOMED vnodes on the free list. - Seperate v_incr_usecount() into v_incr_usecount(), v_decr_usecount() and v_decr_useonly(). The incr/decr split is so that incr usecount can return with the interlock still held while decr drops the interlock so it can call vdropl() which will potentially free the vnode. The calling function can't drop the lock of an already free'd node. v_decr_useonly() drops a usecount without droping the hold count. This is done so the usecount reaches zero in vput() before we recycle, however the holdcount is still 1 which prevents any new references from placing the vnode back on the free list. - Fix vnlrureclaim() to vhold the vnode since it doesn't do a vget(). We wouldn't want vnlrureclaim() to bump the usecount since this has different semantics. Also change vnlrureclaim() to do a NOWAIT on the vn_lock. When this function runs we're usually in a desperate situation and we wouldn't want to wait for any specific vnode to be released. - Fix a bunch of misc comments to reflect the new behavior. - Add vhold() and vdrop() to vflush() for the same reasons that we do in vlrureclaim(). Previously we held no reference and a vnode could have been freed while we were waiting on the lock. - Get rid of vlruvp() and vfreehead(). Neither are used. vlruvp() should really be rethought before it's reintroduced. - vgonel() always returns with the vnode locked now and never puts the vnode back on a free list. The vnode will be freed as soon as the last reference is released. Sponsored by: Isilon Systems, Inc. Debugging help from: Kris Kennaway, Peter Holm Approved by: re (blanket vfs)
2005-06-16 04:41:42 +00:00
/*
* Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT
* flags. DOINGINACT prevents us from recursing in calls to vinactive.
*/
static int
vinactivef(struct vnode *vp)
- Remove vx_lock, vx_unlock, vx_wait, etc. - Add a vn_start_write/vn_finished_write around vlrureclaim so we don't do writing ops without suspending. This could suspend the vlruproc which should not be a problem under normal circumstances. - Manually implement VMIGHTFREE in vlrureclaim as this was the only instance where it was used. - Acquire a lock before calling vgone() as it now requires it. - Move the acquisition of the vnode interlock from vtryrecycle() to getnewvnode() so that if it fails we don't drop and reacquire the vnode_free_list_mtx. - Check for a usecount or holdcount at the end of vtryrecycle() in case someone grabbed a ref while we were recycling. Abort the recycle, and on the final ref drop this vnode will be placed on the head of the free list. - Move the redundant VOP_INACTIVE protection code into the local vinactive() routine to avoid code bloat. - Keep the vnode lock held across calls to vgone() in several places. - vgonel() no longer uses XLOCK, instead callers must hold an exclusive vnode lock. The VI_DOOMED flag is set to allow other threads to detect a vnode which is no longer valid. This flag is set until the last reference is gone, and there are no chances for a new ref. vgonel() holds this lock across the entire function, which greatly simplifies logic. _ Only vfree() in one place in vgone() not three. - Adjust vget() to check the VI_DOOMED flag prior to waiting on the lock in the LK_NOWAIT case. In other cases, check after we have slept and acquired an exlusive lock. This will simulate the old vx_wait() behavior. Sponsored by: Isilon Systems, Inc.
2005-03-13 11:54:28 +00:00
{
struct vm_object *obj;
int error;
- Change holdcnt use around vnode recycling. We now always keep a holdcnt ref while we're calling vgone(). This prevents transient refs from re-adding us to the free list. Previously, a vfree() triggered via vinvalbuf() getting rid of all of a vnode's pages could place a partially destructed vnode on the free list where vtryrecycle() could find it. The first call to vtryrecycle would hang up on the vnode lock, but when it failed it would place a now dead vnode onto the free list, and another call to vtryrecycle() would free an already free vnode. There were many complications of having a zero ref count while freeing which can now go away. - Change vdropl() to release the interlock before returning. All callers now respect this, so vdropl() directly frees VI_DOOMED vnodes once the last ref is dropped. This means that we'll never have VI_DOOMED vnodes on the free list. - Seperate v_incr_usecount() into v_incr_usecount(), v_decr_usecount() and v_decr_useonly(). The incr/decr split is so that incr usecount can return with the interlock still held while decr drops the interlock so it can call vdropl() which will potentially free the vnode. The calling function can't drop the lock of an already free'd node. v_decr_useonly() drops a usecount without droping the hold count. This is done so the usecount reaches zero in vput() before we recycle, however the holdcount is still 1 which prevents any new references from placing the vnode back on the free list. - Fix vnlrureclaim() to vhold the vnode since it doesn't do a vget(). We wouldn't want vnlrureclaim() to bump the usecount since this has different semantics. Also change vnlrureclaim() to do a NOWAIT on the vn_lock. When this function runs we're usually in a desperate situation and we wouldn't want to wait for any specific vnode to be released. - Fix a bunch of misc comments to reflect the new behavior. - Add vhold() and vdrop() to vflush() for the same reasons that we do in vlrureclaim(). Previously we held no reference and a vnode could have been freed while we were waiting on the lock. - Get rid of vlruvp() and vfreehead(). Neither are used. vlruvp() should really be rethought before it's reintroduced. - vgonel() always returns with the vnode locked now and never puts the vnode back on a free list. The vnode will be freed as soon as the last reference is released. Sponsored by: Isilon Systems, Inc. Debugging help from: Kris Kennaway, Peter Holm Approved by: re (blanket vfs)
2005-06-16 04:41:42 +00:00
ASSERT_VOP_ELOCKED(vp, "vinactive");
- Remove vx_lock, vx_unlock, vx_wait, etc. - Add a vn_start_write/vn_finished_write around vlrureclaim so we don't do writing ops without suspending. This could suspend the vlruproc which should not be a problem under normal circumstances. - Manually implement VMIGHTFREE in vlrureclaim as this was the only instance where it was used. - Acquire a lock before calling vgone() as it now requires it. - Move the acquisition of the vnode interlock from vtryrecycle() to getnewvnode() so that if it fails we don't drop and reacquire the vnode_free_list_mtx. - Check for a usecount or holdcount at the end of vtryrecycle() in case someone grabbed a ref while we were recycling. Abort the recycle, and on the final ref drop this vnode will be placed on the head of the free list. - Move the redundant VOP_INACTIVE protection code into the local vinactive() routine to avoid code bloat. - Keep the vnode lock held across calls to vgone() in several places. - vgonel() no longer uses XLOCK, instead callers must hold an exclusive vnode lock. The VI_DOOMED flag is set to allow other threads to detect a vnode which is no longer valid. This flag is set until the last reference is gone, and there are no chances for a new ref. vgonel() holds this lock across the entire function, which greatly simplifies logic. _ Only vfree() in one place in vgone() not three. - Adjust vget() to check the VI_DOOMED flag prior to waiting on the lock in the LK_NOWAIT case. In other cases, check after we have slept and acquired an exlusive lock. This will simulate the old vx_wait() behavior. Sponsored by: Isilon Systems, Inc.
2005-03-13 11:54:28 +00:00
ASSERT_VI_LOCKED(vp, "vinactive");
VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp,
("vinactive: recursed on VI_DOINGINACT"));
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
- Remove vx_lock, vx_unlock, vx_wait, etc. - Add a vn_start_write/vn_finished_write around vlrureclaim so we don't do writing ops without suspending. This could suspend the vlruproc which should not be a problem under normal circumstances. - Manually implement VMIGHTFREE in vlrureclaim as this was the only instance where it was used. - Acquire a lock before calling vgone() as it now requires it. - Move the acquisition of the vnode interlock from vtryrecycle() to getnewvnode() so that if it fails we don't drop and reacquire the vnode_free_list_mtx. - Check for a usecount or holdcount at the end of vtryrecycle() in case someone grabbed a ref while we were recycling. Abort the recycle, and on the final ref drop this vnode will be placed on the head of the free list. - Move the redundant VOP_INACTIVE protection code into the local vinactive() routine to avoid code bloat. - Keep the vnode lock held across calls to vgone() in several places. - vgonel() no longer uses XLOCK, instead callers must hold an exclusive vnode lock. The VI_DOOMED flag is set to allow other threads to detect a vnode which is no longer valid. This flag is set until the last reference is gone, and there are no chances for a new ref. vgonel() holds this lock across the entire function, which greatly simplifies logic. _ Only vfree() in one place in vgone() not three. - Adjust vget() to check the VI_DOOMED flag prior to waiting on the lock in the LK_NOWAIT case. In other cases, check after we have slept and acquired an exlusive lock. This will simulate the old vx_wait() behavior. Sponsored by: Isilon Systems, Inc.
2005-03-13 11:54:28 +00:00
vp->v_iflag |= VI_DOINGINACT;
vp->v_iflag &= ~VI_OWEINACT;
- Remove vx_lock, vx_unlock, vx_wait, etc. - Add a vn_start_write/vn_finished_write around vlrureclaim so we don't do writing ops without suspending. This could suspend the vlruproc which should not be a problem under normal circumstances. - Manually implement VMIGHTFREE in vlrureclaim as this was the only instance where it was used. - Acquire a lock before calling vgone() as it now requires it. - Move the acquisition of the vnode interlock from vtryrecycle() to getnewvnode() so that if it fails we don't drop and reacquire the vnode_free_list_mtx. - Check for a usecount or holdcount at the end of vtryrecycle() in case someone grabbed a ref while we were recycling. Abort the recycle, and on the final ref drop this vnode will be placed on the head of the free list. - Move the redundant VOP_INACTIVE protection code into the local vinactive() routine to avoid code bloat. - Keep the vnode lock held across calls to vgone() in several places. - vgonel() no longer uses XLOCK, instead callers must hold an exclusive vnode lock. The VI_DOOMED flag is set to allow other threads to detect a vnode which is no longer valid. This flag is set until the last reference is gone, and there are no chances for a new ref. vgonel() holds this lock across the entire function, which greatly simplifies logic. _ Only vfree() in one place in vgone() not three. - Adjust vget() to check the VI_DOOMED flag prior to waiting on the lock in the LK_NOWAIT case. In other cases, check after we have slept and acquired an exlusive lock. This will simulate the old vx_wait() behavior. Sponsored by: Isilon Systems, Inc.
2005-03-13 11:54:28 +00:00
VI_UNLOCK(vp);
/*
* Before moving off the active list, we must be sure that any
* modified pages are converted into the vnode's dirty
* buffers, since these will no longer be checked once the
* vnode is on the inactive list.
*
* The write-out of the dirty pages is asynchronous. At the
* point that VOP_INACTIVE() is called, there could still be
* pending I/O and dirty pages in the object.
*/
if ((obj = vp->v_object) != NULL && (vp->v_vflag & VV_NOSYNC) == 0 &&
vm_object_mightbedirty(obj)) {
Switch the vm_object mutex to be a rwlock. This will enable in the future further optimizations where the vm_object lock will be held in read mode most of the time the page cache resident pool of pages are accessed for reading purposes. The change is mostly mechanical but few notes are reported: * The KPI changes as follow: - VM_OBJECT_LOCK() -> VM_OBJECT_WLOCK() - VM_OBJECT_TRYLOCK() -> VM_OBJECT_TRYWLOCK() - VM_OBJECT_UNLOCK() -> VM_OBJECT_WUNLOCK() - VM_OBJECT_LOCK_ASSERT(MA_OWNED) -> VM_OBJECT_ASSERT_WLOCKED() (in order to avoid visibility of implementation details) - The read-mode operations are added: VM_OBJECT_RLOCK(), VM_OBJECT_TRYRLOCK(), VM_OBJECT_RUNLOCK(), VM_OBJECT_ASSERT_RLOCKED(), VM_OBJECT_ASSERT_LOCKED() * The vm/vm_pager.h namespace pollution avoidance (forcing requiring sys/mutex.h in consumers directly to cater its inlining functions using VM_OBJECT_LOCK()) imposes that all the vm/vm_pager.h consumers now must include also sys/rwlock.h. * zfs requires a quite convoluted fix to include FreeBSD rwlocks into the compat layer because the name clash between FreeBSD and solaris versions must be avoided. At this purpose zfs redefines the vm_object locking functions directly, isolating the FreeBSD components in specific compat stubs. The KPI results heavilly broken by this commit. Thirdy part ports must be updated accordingly (I can think off-hand of VirtualBox, for example). Sponsored by: EMC / Isilon storage division Reviewed by: jeff Reviewed by: pjd (ZFS specific review) Discussed with: alc Tested by: pho
2013-03-09 02:32:23 +00:00
VM_OBJECT_WLOCK(obj);
vm_object_page_clean(obj, 0, 0, 0);
Switch the vm_object mutex to be a rwlock. This will enable in the future further optimizations where the vm_object lock will be held in read mode most of the time the page cache resident pool of pages are accessed for reading purposes. The change is mostly mechanical but few notes are reported: * The KPI changes as follow: - VM_OBJECT_LOCK() -> VM_OBJECT_WLOCK() - VM_OBJECT_TRYLOCK() -> VM_OBJECT_TRYWLOCK() - VM_OBJECT_UNLOCK() -> VM_OBJECT_WUNLOCK() - VM_OBJECT_LOCK_ASSERT(MA_OWNED) -> VM_OBJECT_ASSERT_WLOCKED() (in order to avoid visibility of implementation details) - The read-mode operations are added: VM_OBJECT_RLOCK(), VM_OBJECT_TRYRLOCK(), VM_OBJECT_RUNLOCK(), VM_OBJECT_ASSERT_RLOCKED(), VM_OBJECT_ASSERT_LOCKED() * The vm/vm_pager.h namespace pollution avoidance (forcing requiring sys/mutex.h in consumers directly to cater its inlining functions using VM_OBJECT_LOCK()) imposes that all the vm/vm_pager.h consumers now must include also sys/rwlock.h. * zfs requires a quite convoluted fix to include FreeBSD rwlocks into the compat layer because the name clash between FreeBSD and solaris versions must be avoided. At this purpose zfs redefines the vm_object locking functions directly, isolating the FreeBSD components in specific compat stubs. The KPI results heavilly broken by this commit. Thirdy part ports must be updated accordingly (I can think off-hand of VirtualBox, for example). Sponsored by: EMC / Isilon storage division Reviewed by: jeff Reviewed by: pjd (ZFS specific review) Discussed with: alc Tested by: pho
2013-03-09 02:32:23 +00:00
VM_OBJECT_WUNLOCK(obj);
}
error = VOP_INACTIVE(vp);
- Remove vx_lock, vx_unlock, vx_wait, etc. - Add a vn_start_write/vn_finished_write around vlrureclaim so we don't do writing ops without suspending. This could suspend the vlruproc which should not be a problem under normal circumstances. - Manually implement VMIGHTFREE in vlrureclaim as this was the only instance where it was used. - Acquire a lock before calling vgone() as it now requires it. - Move the acquisition of the vnode interlock from vtryrecycle() to getnewvnode() so that if it fails we don't drop and reacquire the vnode_free_list_mtx. - Check for a usecount or holdcount at the end of vtryrecycle() in case someone grabbed a ref while we were recycling. Abort the recycle, and on the final ref drop this vnode will be placed on the head of the free list. - Move the redundant VOP_INACTIVE protection code into the local vinactive() routine to avoid code bloat. - Keep the vnode lock held across calls to vgone() in several places. - vgonel() no longer uses XLOCK, instead callers must hold an exclusive vnode lock. The VI_DOOMED flag is set to allow other threads to detect a vnode which is no longer valid. This flag is set until the last reference is gone, and there are no chances for a new ref. vgonel() holds this lock across the entire function, which greatly simplifies logic. _ Only vfree() in one place in vgone() not three. - Adjust vget() to check the VI_DOOMED flag prior to waiting on the lock in the LK_NOWAIT case. In other cases, check after we have slept and acquired an exlusive lock. This will simulate the old vx_wait() behavior. Sponsored by: Isilon Systems, Inc.
2005-03-13 11:54:28 +00:00
VI_LOCK(vp);
VNASSERT(vp->v_iflag & VI_DOINGINACT, vp,
("vinactive: lost VI_DOINGINACT"));
vp->v_iflag &= ~VI_DOINGINACT;
return (error);
- Remove vx_lock, vx_unlock, vx_wait, etc. - Add a vn_start_write/vn_finished_write around vlrureclaim so we don't do writing ops without suspending. This could suspend the vlruproc which should not be a problem under normal circumstances. - Manually implement VMIGHTFREE in vlrureclaim as this was the only instance where it was used. - Acquire a lock before calling vgone() as it now requires it. - Move the acquisition of the vnode interlock from vtryrecycle() to getnewvnode() so that if it fails we don't drop and reacquire the vnode_free_list_mtx. - Check for a usecount or holdcount at the end of vtryrecycle() in case someone grabbed a ref while we were recycling. Abort the recycle, and on the final ref drop this vnode will be placed on the head of the free list. - Move the redundant VOP_INACTIVE protection code into the local vinactive() routine to avoid code bloat. - Keep the vnode lock held across calls to vgone() in several places. - vgonel() no longer uses XLOCK, instead callers must hold an exclusive vnode lock. The VI_DOOMED flag is set to allow other threads to detect a vnode which is no longer valid. This flag is set until the last reference is gone, and there are no chances for a new ref. vgonel() holds this lock across the entire function, which greatly simplifies logic. _ Only vfree() in one place in vgone() not three. - Adjust vget() to check the VI_DOOMED flag prior to waiting on the lock in the LK_NOWAIT case. In other cases, check after we have slept and acquired an exlusive lock. This will simulate the old vx_wait() behavior. Sponsored by: Isilon Systems, Inc.
2005-03-13 11:54:28 +00:00
}
int
vinactive(struct vnode *vp)
{
ASSERT_VOP_ELOCKED(vp, "vinactive");
ASSERT_VI_LOCKED(vp, "vinactive");
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
if ((vp->v_iflag & VI_OWEINACT) == 0)
return (0);
if (vp->v_iflag & VI_DOINGINACT)
return (0);
if (vp->v_usecount > 0) {
vp->v_iflag &= ~VI_OWEINACT;
return (0);
}
return (vinactivef(vp));
}
1994-05-24 10:09:53 +00:00
/*
* Remove any vnodes in the vnode table belonging to mount point mp.
*
* If FORCECLOSE is not specified, there should not be any active ones,
1994-05-24 10:09:53 +00:00
* return error if any are found (nb: this is a user error, not a
* system error). If FORCECLOSE is specified, detach any active vnodes
1994-05-24 10:09:53 +00:00
* that are found.
*
* If WRITECLOSE is set, only flush out regular file vnodes open for
* writing.
*
* SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
*
* `rootrefs' specifies the base reference count for the root vnode
* of this filesystem. The root vnode is considered busy if its
* v_usecount exceeds this value. On a successful return, vflush(, td)
* will call vrele() on the root vnode exactly rootrefs times.
* If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
* be zero.
1994-05-24 10:09:53 +00:00
*/
#ifdef DIAGNOSTIC
static int busyprt = 0; /* print out busy vnodes */
SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes");
1994-05-24 10:09:53 +00:00
#endif
int
2010-04-03 11:19:20 +00:00
vflush(struct mount *mp, int rootrefs, int flags, struct thread *td)
1994-05-24 10:09:53 +00:00
{
struct vnode *vp, *mvp, *rootvp = NULL;
struct vattr vattr;
int busy = 0, error;
1994-05-24 10:09:53 +00:00
CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp,
rootrefs, flags);
if (rootrefs > 0) {
KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
("vflush: bad args"));
/*
* Get the filesystem root vnode. We can vput() it
* immediately, since with rootrefs > 0, it won't go away.
*/
if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) {
CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d",
__func__, error);
return (error);
}
vput(rootvp);
}
1994-05-24 10:09:53 +00:00
loop:
MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
- Change holdcnt use around vnode recycling. We now always keep a holdcnt ref while we're calling vgone(). This prevents transient refs from re-adding us to the free list. Previously, a vfree() triggered via vinvalbuf() getting rid of all of a vnode's pages could place a partially destructed vnode on the free list where vtryrecycle() could find it. The first call to vtryrecycle would hang up on the vnode lock, but when it failed it would place a now dead vnode onto the free list, and another call to vtryrecycle() would free an already free vnode. There were many complications of having a zero ref count while freeing which can now go away. - Change vdropl() to release the interlock before returning. All callers now respect this, so vdropl() directly frees VI_DOOMED vnodes once the last ref is dropped. This means that we'll never have VI_DOOMED vnodes on the free list. - Seperate v_incr_usecount() into v_incr_usecount(), v_decr_usecount() and v_decr_useonly(). The incr/decr split is so that incr usecount can return with the interlock still held while decr drops the interlock so it can call vdropl() which will potentially free the vnode. The calling function can't drop the lock of an already free'd node. v_decr_useonly() drops a usecount without droping the hold count. This is done so the usecount reaches zero in vput() before we recycle, however the holdcount is still 1 which prevents any new references from placing the vnode back on the free list. - Fix vnlrureclaim() to vhold the vnode since it doesn't do a vget(). We wouldn't want vnlrureclaim() to bump the usecount since this has different semantics. Also change vnlrureclaim() to do a NOWAIT on the vn_lock. When this function runs we're usually in a desperate situation and we wouldn't want to wait for any specific vnode to be released. - Fix a bunch of misc comments to reflect the new behavior. - Add vhold() and vdrop() to vflush() for the same reasons that we do in vlrureclaim(). Previously we held no reference and a vnode could have been freed while we were waiting on the lock. - Get rid of vlruvp() and vfreehead(). Neither are used. vlruvp() should really be rethought before it's reintroduced. - vgonel() always returns with the vnode locked now and never puts the vnode back on a free list. The vnode will be freed as soon as the last reference is released. Sponsored by: Isilon Systems, Inc. Debugging help from: Kris Kennaway, Peter Holm Approved by: re (blanket vfs)
2005-06-16 04:41:42 +00:00
vholdl(vp);
error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE);
if (error) {
- Change holdcnt use around vnode recycling. We now always keep a holdcnt ref while we're calling vgone(). This prevents transient refs from re-adding us to the free list. Previously, a vfree() triggered via vinvalbuf() getting rid of all of a vnode's pages could place a partially destructed vnode on the free list where vtryrecycle() could find it. The first call to vtryrecycle would hang up on the vnode lock, but when it failed it would place a now dead vnode onto the free list, and another call to vtryrecycle() would free an already free vnode. There were many complications of having a zero ref count while freeing which can now go away. - Change vdropl() to release the interlock before returning. All callers now respect this, so vdropl() directly frees VI_DOOMED vnodes once the last ref is dropped. This means that we'll never have VI_DOOMED vnodes on the free list. - Seperate v_incr_usecount() into v_incr_usecount(), v_decr_usecount() and v_decr_useonly(). The incr/decr split is so that incr usecount can return with the interlock still held while decr drops the interlock so it can call vdropl() which will potentially free the vnode. The calling function can't drop the lock of an already free'd node. v_decr_useonly() drops a usecount without droping the hold count. This is done so the usecount reaches zero in vput() before we recycle, however the holdcount is still 1 which prevents any new references from placing the vnode back on the free list. - Fix vnlrureclaim() to vhold the vnode since it doesn't do a vget(). We wouldn't want vnlrureclaim() to bump the usecount since this has different semantics. Also change vnlrureclaim() to do a NOWAIT on the vn_lock. When this function runs we're usually in a desperate situation and we wouldn't want to wait for any specific vnode to be released. - Fix a bunch of misc comments to reflect the new behavior. - Add vhold() and vdrop() to vflush() for the same reasons that we do in vlrureclaim(). Previously we held no reference and a vnode could have been freed while we were waiting on the lock. - Get rid of vlruvp() and vfreehead(). Neither are used. vlruvp() should really be rethought before it's reintroduced. - vgonel() always returns with the vnode locked now and never puts the vnode back on a free list. The vnode will be freed as soon as the last reference is released. Sponsored by: Isilon Systems, Inc. Debugging help from: Kris Kennaway, Peter Holm Approved by: re (blanket vfs)
2005-06-16 04:41:42 +00:00
vdrop(vp);
MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
goto loop;
}
1994-05-24 10:09:53 +00:00
/*
* Skip over a vnodes marked VV_SYSTEM.
1994-05-24 10:09:53 +00:00
*/
if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
VOP_UNLOCK(vp);
- Change holdcnt use around vnode recycling. We now always keep a holdcnt ref while we're calling vgone(). This prevents transient refs from re-adding us to the free list. Previously, a vfree() triggered via vinvalbuf() getting rid of all of a vnode's pages could place a partially destructed vnode on the free list where vtryrecycle() could find it. The first call to vtryrecycle would hang up on the vnode lock, but when it failed it would place a now dead vnode onto the free list, and another call to vtryrecycle() would free an already free vnode. There were many complications of having a zero ref count while freeing which can now go away. - Change vdropl() to release the interlock before returning. All callers now respect this, so vdropl() directly frees VI_DOOMED vnodes once the last ref is dropped. This means that we'll never have VI_DOOMED vnodes on the free list. - Seperate v_incr_usecount() into v_incr_usecount(), v_decr_usecount() and v_decr_useonly(). The incr/decr split is so that incr usecount can return with the interlock still held while decr drops the interlock so it can call vdropl() which will potentially free the vnode. The calling function can't drop the lock of an already free'd node. v_decr_useonly() drops a usecount without droping the hold count. This is done so the usecount reaches zero in vput() before we recycle, however the holdcount is still 1 which prevents any new references from placing the vnode back on the free list. - Fix vnlrureclaim() to vhold the vnode since it doesn't do a vget(). We wouldn't want vnlrureclaim() to bump the usecount since this has different semantics. Also change vnlrureclaim() to do a NOWAIT on the vn_lock. When this function runs we're usually in a desperate situation and we wouldn't want to wait for any specific vnode to be released. - Fix a bunch of misc comments to reflect the new behavior. - Add vhold() and vdrop() to vflush() for the same reasons that we do in vlrureclaim(). Previously we held no reference and a vnode could have been freed while we were waiting on the lock. - Get rid of vlruvp() and vfreehead(). Neither are used. vlruvp() should really be rethought before it's reintroduced. - vgonel() always returns with the vnode locked now and never puts the vnode back on a free list. The vnode will be freed as soon as the last reference is released. Sponsored by: Isilon Systems, Inc. Debugging help from: Kris Kennaway, Peter Holm Approved by: re (blanket vfs)
2005-06-16 04:41:42 +00:00
vdrop(vp);
1994-05-24 10:09:53 +00:00
continue;
}
1994-05-24 10:09:53 +00:00
/*
* If WRITECLOSE is set, flush out unlinked but still open
* files (even if open only for reading) and regular file
2002-06-06 15:46:38 +00:00
* vnodes open for writing.
1994-05-24 10:09:53 +00:00
*/
if (flags & WRITECLOSE) {
if (vp->v_object != NULL) {
Switch the vm_object mutex to be a rwlock. This will enable in the future further optimizations where the vm_object lock will be held in read mode most of the time the page cache resident pool of pages are accessed for reading purposes. The change is mostly mechanical but few notes are reported: * The KPI changes as follow: - VM_OBJECT_LOCK() -> VM_OBJECT_WLOCK() - VM_OBJECT_TRYLOCK() -> VM_OBJECT_TRYWLOCK() - VM_OBJECT_UNLOCK() -> VM_OBJECT_WUNLOCK() - VM_OBJECT_LOCK_ASSERT(MA_OWNED) -> VM_OBJECT_ASSERT_WLOCKED() (in order to avoid visibility of implementation details) - The read-mode operations are added: VM_OBJECT_RLOCK(), VM_OBJECT_TRYRLOCK(), VM_OBJECT_RUNLOCK(), VM_OBJECT_ASSERT_RLOCKED(), VM_OBJECT_ASSERT_LOCKED() * The vm/vm_pager.h namespace pollution avoidance (forcing requiring sys/mutex.h in consumers directly to cater its inlining functions using VM_OBJECT_LOCK()) imposes that all the vm/vm_pager.h consumers now must include also sys/rwlock.h. * zfs requires a quite convoluted fix to include FreeBSD rwlocks into the compat layer because the name clash between FreeBSD and solaris versions must be avoided. At this purpose zfs redefines the vm_object locking functions directly, isolating the FreeBSD components in specific compat stubs. The KPI results heavilly broken by this commit. Thirdy part ports must be updated accordingly (I can think off-hand of VirtualBox, for example). Sponsored by: EMC / Isilon storage division Reviewed by: jeff Reviewed by: pjd (ZFS specific review) Discussed with: alc Tested by: pho
2013-03-09 02:32:23 +00:00
VM_OBJECT_WLOCK(vp->v_object);
vm_object_page_clean(vp->v_object, 0, 0, 0);
Switch the vm_object mutex to be a rwlock. This will enable in the future further optimizations where the vm_object lock will be held in read mode most of the time the page cache resident pool of pages are accessed for reading purposes. The change is mostly mechanical but few notes are reported: * The KPI changes as follow: - VM_OBJECT_LOCK() -> VM_OBJECT_WLOCK() - VM_OBJECT_TRYLOCK() -> VM_OBJECT_TRYWLOCK() - VM_OBJECT_UNLOCK() -> VM_OBJECT_WUNLOCK() - VM_OBJECT_LOCK_ASSERT(MA_OWNED) -> VM_OBJECT_ASSERT_WLOCKED() (in order to avoid visibility of implementation details) - The read-mode operations are added: VM_OBJECT_RLOCK(), VM_OBJECT_TRYRLOCK(), VM_OBJECT_RUNLOCK(), VM_OBJECT_ASSERT_RLOCKED(), VM_OBJECT_ASSERT_LOCKED() * The vm/vm_pager.h namespace pollution avoidance (forcing requiring sys/mutex.h in consumers directly to cater its inlining functions using VM_OBJECT_LOCK()) imposes that all the vm/vm_pager.h consumers now must include also sys/rwlock.h. * zfs requires a quite convoluted fix to include FreeBSD rwlocks into the compat layer because the name clash between FreeBSD and solaris versions must be avoided. At this purpose zfs redefines the vm_object locking functions directly, isolating the FreeBSD components in specific compat stubs. The KPI results heavilly broken by this commit. Thirdy part ports must be updated accordingly (I can think off-hand of VirtualBox, for example). Sponsored by: EMC / Isilon storage division Reviewed by: jeff Reviewed by: pjd (ZFS specific review) Discussed with: alc Tested by: pho
2013-03-09 02:32:23 +00:00
VM_OBJECT_WUNLOCK(vp->v_object);
}
do {
error = VOP_FSYNC(vp, MNT_WAIT, td);
} while (error == ERELOOKUP);
if (error != 0) {
VOP_UNLOCK(vp);
vdrop(vp);
MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
return (error);
}
error = VOP_GETATTR(vp, &vattr, td->td_ucred);
VI_LOCK(vp);
if ((vp->v_type == VNON ||
(error == 0 && vattr.va_nlink > 0)) &&
(vp->v_writecount <= 0 || vp->v_type != VREG)) {
VOP_UNLOCK(vp);
- Change holdcnt use around vnode recycling. We now always keep a holdcnt ref while we're calling vgone(). This prevents transient refs from re-adding us to the free list. Previously, a vfree() triggered via vinvalbuf() getting rid of all of a vnode's pages could place a partially destructed vnode on the free list where vtryrecycle() could find it. The first call to vtryrecycle would hang up on the vnode lock, but when it failed it would place a now dead vnode onto the free list, and another call to vtryrecycle() would free an already free vnode. There were many complications of having a zero ref count while freeing which can now go away. - Change vdropl() to release the interlock before returning. All callers now respect this, so vdropl() directly frees VI_DOOMED vnodes once the last ref is dropped. This means that we'll never have VI_DOOMED vnodes on the free list. - Seperate v_incr_usecount() into v_incr_usecount(), v_decr_usecount() and v_decr_useonly(). The incr/decr split is so that incr usecount can return with the interlock still held while decr drops the interlock so it can call vdropl() which will potentially free the vnode. The calling function can't drop the lock of an already free'd node. v_decr_useonly() drops a usecount without droping the hold count. This is done so the usecount reaches zero in vput() before we recycle, however the holdcount is still 1 which prevents any new references from placing the vnode back on the free list. - Fix vnlrureclaim() to vhold the vnode since it doesn't do a vget(). We wouldn't want vnlrureclaim() to bump the usecount since this has different semantics. Also change vnlrureclaim() to do a NOWAIT on the vn_lock. When this function runs we're usually in a desperate situation and we wouldn't want to wait for any specific vnode to be released. - Fix a bunch of misc comments to reflect the new behavior. - Add vhold() and vdrop() to vflush() for the same reasons that we do in vlrureclaim(). Previously we held no reference and a vnode could have been freed while we were waiting on the lock. - Get rid of vlruvp() and vfreehead(). Neither are used. vlruvp() should really be rethought before it's reintroduced. - vgonel() always returns with the vnode locked now and never puts the vnode back on a free list. The vnode will be freed as soon as the last reference is released. Sponsored by: Isilon Systems, Inc. Debugging help from: Kris Kennaway, Peter Holm Approved by: re (blanket vfs)
2005-06-16 04:41:42 +00:00
vdropl(vp);
continue;
}
} else
VI_LOCK(vp);
1994-05-24 10:09:53 +00:00
/*
These changes embody the support of the fully coherent merged VM buffer cache, much higher filesystem I/O performance, and much better paging performance. It represents the culmination of over 6 months of R&D. The majority of the merged VM/cache work is by John Dyson. The following highlights the most significant changes. Additionally, there are (mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to support the new VM/buffer scheme. vfs_bio.c: Significant rewrite of most of vfs_bio to support the merged VM buffer cache scheme. The scheme is almost fully compatible with the old filesystem interface. Significant improvement in the number of opportunities for write clustering. vfs_cluster.c, vfs_subr.c Upgrade and performance enhancements in vfs layer code to support merged VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff. vm_object.c: Yet more improvements in the collapse code. Elimination of some windows that can cause list corruption. vm_pageout.c: Fixed it, it really works better now. Somehow in 2.0, some "enhancements" broke the code. This code has been reworked from the ground-up. vm_fault.c, vm_page.c, pmap.c, vm_object.c Support for small-block filesystems with merged VM/buffer cache scheme. pmap.c vm_map.c Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of kernel PTs. vm_glue.c Much simpler and more effective swapping code. No more gratuitous swapping. proc.h Fixed the problem that the p_lock flag was not being cleared on a fork. swap_pager.c, vnode_pager.c Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the code doesn't need it anymore. machdep.c Changes to better support the parameter values for the merged VM/buffer cache scheme. machdep.c, kern_exec.c, vm_glue.c Implemented a seperate submap for temporary exec string space and another one to contain process upages. This eliminates all map fragmentation problems that previously existed. ffs_inode.c, ufs_inode.c, ufs_readwrite.c Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on busy buffers. Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
* With v_usecount == 0, all we need to do is clear out the
* vnode data structures and we are done.
- Change holdcnt use around vnode recycling. We now always keep a holdcnt ref while we're calling vgone(). This prevents transient refs from re-adding us to the free list. Previously, a vfree() triggered via vinvalbuf() getting rid of all of a vnode's pages could place a partially destructed vnode on the free list where vtryrecycle() could find it. The first call to vtryrecycle would hang up on the vnode lock, but when it failed it would place a now dead vnode onto the free list, and another call to vtryrecycle() would free an already free vnode. There were many complications of having a zero ref count while freeing which can now go away. - Change vdropl() to release the interlock before returning. All callers now respect this, so vdropl() directly frees VI_DOOMED vnodes once the last ref is dropped. This means that we'll never have VI_DOOMED vnodes on the free list. - Seperate v_incr_usecount() into v_incr_usecount(), v_decr_usecount() and v_decr_useonly(). The incr/decr split is so that incr usecount can return with the interlock still held while decr drops the interlock so it can call vdropl() which will potentially free the vnode. The calling function can't drop the lock of an already free'd node. v_decr_useonly() drops a usecount without droping the hold count. This is done so the usecount reaches zero in vput() before we recycle, however the holdcount is still 1 which prevents any new references from placing the vnode back on the free list. - Fix vnlrureclaim() to vhold the vnode since it doesn't do a vget(). We wouldn't want vnlrureclaim() to bump the usecount since this has different semantics. Also change vnlrureclaim() to do a NOWAIT on the vn_lock. When this function runs we're usually in a desperate situation and we wouldn't want to wait for any specific vnode to be released. - Fix a bunch of misc comments to reflect the new behavior. - Add vhold() and vdrop() to vflush() for the same reasons that we do in vlrureclaim(). Previously we held no reference and a vnode could have been freed while we were waiting on the lock. - Get rid of vlruvp() and vfreehead(). Neither are used. vlruvp() should really be rethought before it's reintroduced. - vgonel() always returns with the vnode locked now and never puts the vnode back on a free list. The vnode will be freed as soon as the last reference is released. Sponsored by: Isilon Systems, Inc. Debugging help from: Kris Kennaway, Peter Holm Approved by: re (blanket vfs)
2005-06-16 04:41:42 +00:00
*
* If FORCECLOSE is set, forcibly close the vnode.
1994-05-24 10:09:53 +00:00
*/
- Change holdcnt use around vnode recycling. We now always keep a holdcnt ref while we're calling vgone(). This prevents transient refs from re-adding us to the free list. Previously, a vfree() triggered via vinvalbuf() getting rid of all of a vnode's pages could place a partially destructed vnode on the free list where vtryrecycle() could find it. The first call to vtryrecycle would hang up on the vnode lock, but when it failed it would place a now dead vnode onto the free list, and another call to vtryrecycle() would free an already free vnode. There were many complications of having a zero ref count while freeing which can now go away. - Change vdropl() to release the interlock before returning. All callers now respect this, so vdropl() directly frees VI_DOOMED vnodes once the last ref is dropped. This means that we'll never have VI_DOOMED vnodes on the free list. - Seperate v_incr_usecount() into v_incr_usecount(), v_decr_usecount() and v_decr_useonly(). The incr/decr split is so that incr usecount can return with the interlock still held while decr drops the interlock so it can call vdropl() which will potentially free the vnode. The calling function can't drop the lock of an already free'd node. v_decr_useonly() drops a usecount without droping the hold count. This is done so the usecount reaches zero in vput() before we recycle, however the holdcount is still 1 which prevents any new references from placing the vnode back on the free list. - Fix vnlrureclaim() to vhold the vnode since it doesn't do a vget(). We wouldn't want vnlrureclaim() to bump the usecount since this has different semantics. Also change vnlrureclaim() to do a NOWAIT on the vn_lock. When this function runs we're usually in a desperate situation and we wouldn't want to wait for any specific vnode to be released. - Fix a bunch of misc comments to reflect the new behavior. - Add vhold() and vdrop() to vflush() for the same reasons that we do in vlrureclaim(). Previously we held no reference and a vnode could have been freed while we were waiting on the lock. - Get rid of vlruvp() and vfreehead(). Neither are used. vlruvp() should really be rethought before it's reintroduced. - vgonel() always returns with the vnode locked now and never puts the vnode back on a free list. The vnode will be freed as soon as the last reference is released. Sponsored by: Isilon Systems, Inc. Debugging help from: Kris Kennaway, Peter Holm Approved by: re (blanket vfs)
2005-06-16 04:41:42 +00:00
if (vp->v_usecount == 0 || (flags & FORCECLOSE)) {
vgonel(vp);
} else {
busy++;
1994-05-24 10:09:53 +00:00
#ifdef DIAGNOSTIC
- Change holdcnt use around vnode recycling. We now always keep a holdcnt ref while we're calling vgone(). This prevents transient refs from re-adding us to the free list. Previously, a vfree() triggered via vinvalbuf() getting rid of all of a vnode's pages could place a partially destructed vnode on the free list where vtryrecycle() could find it. The first call to vtryrecycle would hang up on the vnode lock, but when it failed it would place a now dead vnode onto the free list, and another call to vtryrecycle() would free an already free vnode. There were many complications of having a zero ref count while freeing which can now go away. - Change vdropl() to release the interlock before returning. All callers now respect this, so vdropl() directly frees VI_DOOMED vnodes once the last ref is dropped. This means that we'll never have VI_DOOMED vnodes on the free list. - Seperate v_incr_usecount() into v_incr_usecount(), v_decr_usecount() and v_decr_useonly(). The incr/decr split is so that incr usecount can return with the interlock still held while decr drops the interlock so it can call vdropl() which will potentially free the vnode. The calling function can't drop the lock of an already free'd node. v_decr_useonly() drops a usecount without droping the hold count. This is done so the usecount reaches zero in vput() before we recycle, however the holdcount is still 1 which prevents any new references from placing the vnode back on the free list. - Fix vnlrureclaim() to vhold the vnode since it doesn't do a vget(). We wouldn't want vnlrureclaim() to bump the usecount since this has different semantics. Also change vnlrureclaim() to do a NOWAIT on the vn_lock. When this function runs we're usually in a desperate situation and we wouldn't want to wait for any specific vnode to be released. - Fix a bunch of misc comments to reflect the new behavior. - Add vhold() and vdrop() to vflush() for the same reasons that we do in vlrureclaim(). Previously we held no reference and a vnode could have been freed while we were waiting on the lock. - Get rid of vlruvp() and vfreehead(). Neither are used. vlruvp() should really be rethought before it's reintroduced. - vgonel() always returns with the vnode locked now and never puts the vnode back on a free list. The vnode will be freed as soon as the last reference is released. Sponsored by: Isilon Systems, Inc. Debugging help from: Kris Kennaway, Peter Holm Approved by: re (blanket vfs)
2005-06-16 04:41:42 +00:00
if (busyprt)
vn_printf(vp, "vflush: busy vnode ");
1994-05-24 10:09:53 +00:00
#endif
- Change holdcnt use around vnode recycling. We now always keep a holdcnt ref while we're calling vgone(). This prevents transient refs from re-adding us to the free list. Previously, a vfree() triggered via vinvalbuf() getting rid of all of a vnode's pages could place a partially destructed vnode on the free list where vtryrecycle() could find it. The first call to vtryrecycle would hang up on the vnode lock, but when it failed it would place a now dead vnode onto the free list, and another call to vtryrecycle() would free an already free vnode. There were many complications of having a zero ref count while freeing which can now go away. - Change vdropl() to release the interlock before returning. All callers now respect this, so vdropl() directly frees VI_DOOMED vnodes once the last ref is dropped. This means that we'll never have VI_DOOMED vnodes on the free list. - Seperate v_incr_usecount() into v_incr_usecount(), v_decr_usecount() and v_decr_useonly(). The incr/decr split is so that incr usecount can return with the interlock still held while decr drops the interlock so it can call vdropl() which will potentially free the vnode. The calling function can't drop the lock of an already free'd node. v_decr_useonly() drops a usecount without droping the hold count. This is done so the usecount reaches zero in vput() before we recycle, however the holdcount is still 1 which prevents any new references from placing the vnode back on the free list. - Fix vnlrureclaim() to vhold the vnode since it doesn't do a vget(). We wouldn't want vnlrureclaim() to bump the usecount since this has different semantics. Also change vnlrureclaim() to do a NOWAIT on the vn_lock. When this function runs we're usually in a desperate situation and we wouldn't want to wait for any specific vnode to be released. - Fix a bunch of misc comments to reflect the new behavior. - Add vhold() and vdrop() to vflush() for the same reasons that we do in vlrureclaim(). Previously we held no reference and a vnode could have been freed while we were waiting on the lock. - Get rid of vlruvp() and vfreehead(). Neither are used. vlruvp() should really be rethought before it's reintroduced. - vgonel() always returns with the vnode locked now and never puts the vnode back on a free list. The vnode will be freed as soon as the last reference is released. Sponsored by: Isilon Systems, Inc. Debugging help from: Kris Kennaway, Peter Holm Approved by: re (blanket vfs)
2005-06-16 04:41:42 +00:00
}
VOP_UNLOCK(vp);
- Change holdcnt use around vnode recycling. We now always keep a holdcnt ref while we're calling vgone(). This prevents transient refs from re-adding us to the free list. Previously, a vfree() triggered via vinvalbuf() getting rid of all of a vnode's pages could place a partially destructed vnode on the free list where vtryrecycle() could find it. The first call to vtryrecycle would hang up on the vnode lock, but when it failed it would place a now dead vnode onto the free list, and another call to vtryrecycle() would free an already free vnode. There were many complications of having a zero ref count while freeing which can now go away. - Change vdropl() to release the interlock before returning. All callers now respect this, so vdropl() directly frees VI_DOOMED vnodes once the last ref is dropped. This means that we'll never have VI_DOOMED vnodes on the free list. - Seperate v_incr_usecount() into v_incr_usecount(), v_decr_usecount() and v_decr_useonly(). The incr/decr split is so that incr usecount can return with the interlock still held while decr drops the interlock so it can call vdropl() which will potentially free the vnode. The calling function can't drop the lock of an already free'd node. v_decr_useonly() drops a usecount without droping the hold count. This is done so the usecount reaches zero in vput() before we recycle, however the holdcount is still 1 which prevents any new references from placing the vnode back on the free list. - Fix vnlrureclaim() to vhold the vnode since it doesn't do a vget(). We wouldn't want vnlrureclaim() to bump the usecount since this has different semantics. Also change vnlrureclaim() to do a NOWAIT on the vn_lock. When this function runs we're usually in a desperate situation and we wouldn't want to wait for any specific vnode to be released. - Fix a bunch of misc comments to reflect the new behavior. - Add vhold() and vdrop() to vflush() for the same reasons that we do in vlrureclaim(). Previously we held no reference and a vnode could have been freed while we were waiting on the lock. - Get rid of vlruvp() and vfreehead(). Neither are used. vlruvp() should really be rethought before it's reintroduced. - vgonel() always returns with the vnode locked now and never puts the vnode back on a free list. The vnode will be freed as soon as the last reference is released. Sponsored by: Isilon Systems, Inc. Debugging help from: Kris Kennaway, Peter Holm Approved by: re (blanket vfs)
2005-06-16 04:41:42 +00:00
vdropl(vp);
1994-05-24 10:09:53 +00:00
}
if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
/*
* If just the root vnode is busy, and if its refcount
* is equal to `rootrefs', then go ahead and kill it.
*/
VI_LOCK(rootvp);
KASSERT(busy > 0, ("vflush: not busy"));
2005-02-17 10:28:58 +00:00
VNASSERT(rootvp->v_usecount >= rootrefs, rootvp,
("vflush: usecount %d < rootrefs %d",
rootvp->v_usecount, rootrefs));
if (busy == 1 && rootvp->v_usecount == rootrefs) {
VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK);
- Remove vx_lock, vx_unlock, vx_wait, etc. - Add a vn_start_write/vn_finished_write around vlrureclaim so we don't do writing ops without suspending. This could suspend the vlruproc which should not be a problem under normal circumstances. - Manually implement VMIGHTFREE in vlrureclaim as this was the only instance where it was used. - Acquire a lock before calling vgone() as it now requires it. - Move the acquisition of the vnode interlock from vtryrecycle() to getnewvnode() so that if it fails we don't drop and reacquire the vnode_free_list_mtx. - Check for a usecount or holdcount at the end of vtryrecycle() in case someone grabbed a ref while we were recycling. Abort the recycle, and on the final ref drop this vnode will be placed on the head of the free list. - Move the redundant VOP_INACTIVE protection code into the local vinactive() routine to avoid code bloat. - Keep the vnode lock held across calls to vgone() in several places. - vgonel() no longer uses XLOCK, instead callers must hold an exclusive vnode lock. The VI_DOOMED flag is set to allow other threads to detect a vnode which is no longer valid. This flag is set until the last reference is gone, and there are no chances for a new ref. vgonel() holds this lock across the entire function, which greatly simplifies logic. _ Only vfree() in one place in vgone() not three. - Adjust vget() to check the VI_DOOMED flag prior to waiting on the lock in the LK_NOWAIT case. In other cases, check after we have slept and acquired an exlusive lock. This will simulate the old vx_wait() behavior. Sponsored by: Isilon Systems, Inc.
2005-03-13 11:54:28 +00:00
vgone(rootvp);
VOP_UNLOCK(rootvp);
busy = 0;
} else
VI_UNLOCK(rootvp);
}
if (busy) {
CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__,
busy);
1994-05-24 10:09:53 +00:00
return (EBUSY);
}
for (; rootrefs > 0; rootrefs--)
vrele(rootvp);
1994-05-24 10:09:53 +00:00
return (0);
}
/*
* Recycle an unused vnode to the front of the free list.
*/
int
vrecycle(struct vnode *vp)
{
- Change holdcnt use around vnode recycling. We now always keep a holdcnt ref while we're calling vgone(). This prevents transient refs from re-adding us to the free list. Previously, a vfree() triggered via vinvalbuf() getting rid of all of a vnode's pages could place a partially destructed vnode on the free list where vtryrecycle() could find it. The first call to vtryrecycle would hang up on the vnode lock, but when it failed it would place a now dead vnode onto the free list, and another call to vtryrecycle() would free an already free vnode. There were many complications of having a zero ref count while freeing which can now go away. - Change vdropl() to release the interlock before returning. All callers now respect this, so vdropl() directly frees VI_DOOMED vnodes once the last ref is dropped. This means that we'll never have VI_DOOMED vnodes on the free list. - Seperate v_incr_usecount() into v_incr_usecount(), v_decr_usecount() and v_decr_useonly(). The incr/decr split is so that incr usecount can return with the interlock still held while decr drops the interlock so it can call vdropl() which will potentially free the vnode. The calling function can't drop the lock of an already free'd node. v_decr_useonly() drops a usecount without droping the hold count. This is done so the usecount reaches zero in vput() before we recycle, however the holdcount is still 1 which prevents any new references from placing the vnode back on the free list. - Fix vnlrureclaim() to vhold the vnode since it doesn't do a vget(). We wouldn't want vnlrureclaim() to bump the usecount since this has different semantics. Also change vnlrureclaim() to do a NOWAIT on the vn_lock. When this function runs we're usually in a desperate situation and we wouldn't want to wait for any specific vnode to be released. - Fix a bunch of misc comments to reflect the new behavior. - Add vhold() and vdrop() to vflush() for the same reasons that we do in vlrureclaim(). Previously we held no reference and a vnode could have been freed while we were waiting on the lock. - Get rid of vlruvp() and vfreehead(). Neither are used. vlruvp() should really be rethought before it's reintroduced. - vgonel() always returns with the vnode locked now and never puts the vnode back on a free list. The vnode will be freed as soon as the last reference is released. Sponsored by: Isilon Systems, Inc. Debugging help from: Kris Kennaway, Peter Holm Approved by: re (blanket vfs)
2005-06-16 04:41:42 +00:00
int recycled;
VI_LOCK(vp);
recycled = vrecyclel(vp);
VI_UNLOCK(vp);
return (recycled);
}
/*
* vrecycle, with the vp interlock held.
*/
int
vrecyclel(struct vnode *vp)
{
int recycled;
ASSERT_VOP_ELOCKED(vp, __func__);
ASSERT_VI_LOCKED(vp, __func__);
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
- Change holdcnt use around vnode recycling. We now always keep a holdcnt ref while we're calling vgone(). This prevents transient refs from re-adding us to the free list. Previously, a vfree() triggered via vinvalbuf() getting rid of all of a vnode's pages could place a partially destructed vnode on the free list where vtryrecycle() could find it. The first call to vtryrecycle would hang up on the vnode lock, but when it failed it would place a now dead vnode onto the free list, and another call to vtryrecycle() would free an already free vnode. There were many complications of having a zero ref count while freeing which can now go away. - Change vdropl() to release the interlock before returning. All callers now respect this, so vdropl() directly frees VI_DOOMED vnodes once the last ref is dropped. This means that we'll never have VI_DOOMED vnodes on the free list. - Seperate v_incr_usecount() into v_incr_usecount(), v_decr_usecount() and v_decr_useonly(). The incr/decr split is so that incr usecount can return with the interlock still held while decr drops the interlock so it can call vdropl() which will potentially free the vnode. The calling function can't drop the lock of an already free'd node. v_decr_useonly() drops a usecount without droping the hold count. This is done so the usecount reaches zero in vput() before we recycle, however the holdcount is still 1 which prevents any new references from placing the vnode back on the free list. - Fix vnlrureclaim() to vhold the vnode since it doesn't do a vget(). We wouldn't want vnlrureclaim() to bump the usecount since this has different semantics. Also change vnlrureclaim() to do a NOWAIT on the vn_lock. When this function runs we're usually in a desperate situation and we wouldn't want to wait for any specific vnode to be released. - Fix a bunch of misc comments to reflect the new behavior. - Add vhold() and vdrop() to vflush() for the same reasons that we do in vlrureclaim(). Previously we held no reference and a vnode could have been freed while we were waiting on the lock. - Get rid of vlruvp() and vfreehead(). Neither are used. vlruvp() should really be rethought before it's reintroduced. - vgonel() always returns with the vnode locked now and never puts the vnode back on a free list. The vnode will be freed as soon as the last reference is released. Sponsored by: Isilon Systems, Inc. Debugging help from: Kris Kennaway, Peter Holm Approved by: re (blanket vfs)
2005-06-16 04:41:42 +00:00
recycled = 0;
if (vp->v_usecount == 0) {
recycled = 1;
vgonel(vp);
}
- Change holdcnt use around vnode recycling. We now always keep a holdcnt ref while we're calling vgone(). This prevents transient refs from re-adding us to the free list. Previously, a vfree() triggered via vinvalbuf() getting rid of all of a vnode's pages could place a partially destructed vnode on the free list where vtryrecycle() could find it. The first call to vtryrecycle would hang up on the vnode lock, but when it failed it would place a now dead vnode onto the free list, and another call to vtryrecycle() would free an already free vnode. There were many complications of having a zero ref count while freeing which can now go away. - Change vdropl() to release the interlock before returning. All callers now respect this, so vdropl() directly frees VI_DOOMED vnodes once the last ref is dropped. This means that we'll never have VI_DOOMED vnodes on the free list. - Seperate v_incr_usecount() into v_incr_usecount(), v_decr_usecount() and v_decr_useonly(). The incr/decr split is so that incr usecount can return with the interlock still held while decr drops the interlock so it can call vdropl() which will potentially free the vnode. The calling function can't drop the lock of an already free'd node. v_decr_useonly() drops a usecount without droping the hold count. This is done so the usecount reaches zero in vput() before we recycle, however the holdcount is still 1 which prevents any new references from placing the vnode back on the free list. - Fix vnlrureclaim() to vhold the vnode since it doesn't do a vget(). We wouldn't want vnlrureclaim() to bump the usecount since this has different semantics. Also change vnlrureclaim() to do a NOWAIT on the vn_lock. When this function runs we're usually in a desperate situation and we wouldn't want to wait for any specific vnode to be released. - Fix a bunch of misc comments to reflect the new behavior. - Add vhold() and vdrop() to vflush() for the same reasons that we do in vlrureclaim(). Previously we held no reference and a vnode could have been freed while we were waiting on the lock. - Get rid of vlruvp() and vfreehead(). Neither are used. vlruvp() should really be rethought before it's reintroduced. - vgonel() always returns with the vnode locked now and never puts the vnode back on a free list. The vnode will be freed as soon as the last reference is released. Sponsored by: Isilon Systems, Inc. Debugging help from: Kris Kennaway, Peter Holm Approved by: re (blanket vfs)
2005-06-16 04:41:42 +00:00
return (recycled);
}
1994-05-24 10:09:53 +00:00
/*
* Eliminate all activity associated with a vnode
* in preparation for reuse.
1994-05-24 10:09:53 +00:00
*/
void
vgone(struct vnode *vp)
{
VI_LOCK(vp);
- Change holdcnt use around vnode recycling. We now always keep a holdcnt ref while we're calling vgone(). This prevents transient refs from re-adding us to the free list. Previously, a vfree() triggered via vinvalbuf() getting rid of all of a vnode's pages could place a partially destructed vnode on the free list where vtryrecycle() could find it. The first call to vtryrecycle would hang up on the vnode lock, but when it failed it would place a now dead vnode onto the free list, and another call to vtryrecycle() would free an already free vnode. There were many complications of having a zero ref count while freeing which can now go away. - Change vdropl() to release the interlock before returning. All callers now respect this, so vdropl() directly frees VI_DOOMED vnodes once the last ref is dropped. This means that we'll never have VI_DOOMED vnodes on the free list. - Seperate v_incr_usecount() into v_incr_usecount(), v_decr_usecount() and v_decr_useonly(). The incr/decr split is so that incr usecount can return with the interlock still held while decr drops the interlock so it can call vdropl() which will potentially free the vnode. The calling function can't drop the lock of an already free'd node. v_decr_useonly() drops a usecount without droping the hold count. This is done so the usecount reaches zero in vput() before we recycle, however the holdcount is still 1 which prevents any new references from placing the vnode back on the free list. - Fix vnlrureclaim() to vhold the vnode since it doesn't do a vget(). We wouldn't want vnlrureclaim() to bump the usecount since this has different semantics. Also change vnlrureclaim() to do a NOWAIT on the vn_lock. When this function runs we're usually in a desperate situation and we wouldn't want to wait for any specific vnode to be released. - Fix a bunch of misc comments to reflect the new behavior. - Add vhold() and vdrop() to vflush() for the same reasons that we do in vlrureclaim(). Previously we held no reference and a vnode could have been freed while we were waiting on the lock. - Get rid of vlruvp() and vfreehead(). Neither are used. vlruvp() should really be rethought before it's reintroduced. - vgonel() always returns with the vnode locked now and never puts the vnode back on a free list. The vnode will be freed as soon as the last reference is released. Sponsored by: Isilon Systems, Inc. Debugging help from: Kris Kennaway, Peter Holm Approved by: re (blanket vfs)
2005-06-16 04:41:42 +00:00
vgonel(vp);
VI_UNLOCK(vp);
}
/*
* Notify upper mounts about reclaimed or unlinked vnode.
*/
void
vfs_notify_upper(struct vnode *vp, int event)
{
struct mount *mp;
struct mount_upper_node *ump;
mp = atomic_load_ptr(&vp->v_mount);
if (mp == NULL)
return;
if (TAILQ_EMPTY(&mp->mnt_notify))
return;
MNT_ILOCK(mp);
mp->mnt_upper_pending++;
KASSERT(mp->mnt_upper_pending > 0,
("%s: mnt_upper_pending %d", __func__, mp->mnt_upper_pending));
TAILQ_FOREACH(ump, &mp->mnt_notify, mnt_upper_link) {
MNT_IUNLOCK(mp);
switch (event) {
case VFS_NOTIFY_UPPER_RECLAIM:
VFS_RECLAIM_LOWERVP(ump->mp, vp);
break;
case VFS_NOTIFY_UPPER_UNLINK:
VFS_UNLINK_LOWERVP(ump->mp, vp);
break;
default:
KASSERT(0, ("invalid event %d", event));
break;
}
MNT_ILOCK(mp);
}
mp->mnt_upper_pending--;
if ((mp->mnt_kern_flag & MNTK_UPPER_WAITER) != 0 &&
mp->mnt_upper_pending == 0) {
mp->mnt_kern_flag &= ~MNTK_UPPER_WAITER;
wakeup(&mp->mnt_uppers);
}
MNT_IUNLOCK(mp);
}
/*
* vgone, with the vp interlock held.
*/
static void
- Change holdcnt use around vnode recycling. We now always keep a holdcnt ref while we're calling vgone(). This prevents transient refs from re-adding us to the free list. Previously, a vfree() triggered via vinvalbuf() getting rid of all of a vnode's pages could place a partially destructed vnode on the free list where vtryrecycle() could find it. The first call to vtryrecycle would hang up on the vnode lock, but when it failed it would place a now dead vnode onto the free list, and another call to vtryrecycle() would free an already free vnode. There were many complications of having a zero ref count while freeing which can now go away. - Change vdropl() to release the interlock before returning. All callers now respect this, so vdropl() directly frees VI_DOOMED vnodes once the last ref is dropped. This means that we'll never have VI_DOOMED vnodes on the free list. - Seperate v_incr_usecount() into v_incr_usecount(), v_decr_usecount() and v_decr_useonly(). The incr/decr split is so that incr usecount can return with the interlock still held while decr drops the interlock so it can call vdropl() which will potentially free the vnode. The calling function can't drop the lock of an already free'd node. v_decr_useonly() drops a usecount without droping the hold count. This is done so the usecount reaches zero in vput() before we recycle, however the holdcount is still 1 which prevents any new references from placing the vnode back on the free list. - Fix vnlrureclaim() to vhold the vnode since it doesn't do a vget(). We wouldn't want vnlrureclaim() to bump the usecount since this has different semantics. Also change vnlrureclaim() to do a NOWAIT on the vn_lock. When this function runs we're usually in a desperate situation and we wouldn't want to wait for any specific vnode to be released. - Fix a bunch of misc comments to reflect the new behavior. - Add vhold() and vdrop() to vflush() for the same reasons that we do in vlrureclaim(). Previously we held no reference and a vnode could have been freed while we were waiting on the lock. - Get rid of vlruvp() and vfreehead(). Neither are used. vlruvp() should really be rethought before it's reintroduced. - vgonel() always returns with the vnode locked now and never puts the vnode back on a free list. The vnode will be freed as soon as the last reference is released. Sponsored by: Isilon Systems, Inc. Debugging help from: Kris Kennaway, Peter Holm Approved by: re (blanket vfs)
2005-06-16 04:41:42 +00:00
vgonel(struct vnode *vp)
1994-05-24 10:09:53 +00:00
{
struct thread *td;
struct mount *mp;
vm_object_t object;
bool active, doinginact, oweinact;
1994-05-24 10:09:53 +00:00
ASSERT_VOP_ELOCKED(vp, "vgonel");
ASSERT_VI_LOCKED(vp, "vgonel");
- Change holdcnt use around vnode recycling. We now always keep a holdcnt ref while we're calling vgone(). This prevents transient refs from re-adding us to the free list. Previously, a vfree() triggered via vinvalbuf() getting rid of all of a vnode's pages could place a partially destructed vnode on the free list where vtryrecycle() could find it. The first call to vtryrecycle would hang up on the vnode lock, but when it failed it would place a now dead vnode onto the free list, and another call to vtryrecycle() would free an already free vnode. There were many complications of having a zero ref count while freeing which can now go away. - Change vdropl() to release the interlock before returning. All callers now respect this, so vdropl() directly frees VI_DOOMED vnodes once the last ref is dropped. This means that we'll never have VI_DOOMED vnodes on the free list. - Seperate v_incr_usecount() into v_incr_usecount(), v_decr_usecount() and v_decr_useonly(). The incr/decr split is so that incr usecount can return with the interlock still held while decr drops the interlock so it can call vdropl() which will potentially free the vnode. The calling function can't drop the lock of an already free'd node. v_decr_useonly() drops a usecount without droping the hold count. This is done so the usecount reaches zero in vput() before we recycle, however the holdcount is still 1 which prevents any new references from placing the vnode back on the free list. - Fix vnlrureclaim() to vhold the vnode since it doesn't do a vget(). We wouldn't want vnlrureclaim() to bump the usecount since this has different semantics. Also change vnlrureclaim() to do a NOWAIT on the vn_lock. When this function runs we're usually in a desperate situation and we wouldn't want to wait for any specific vnode to be released. - Fix a bunch of misc comments to reflect the new behavior. - Add vhold() and vdrop() to vflush() for the same reasons that we do in vlrureclaim(). Previously we held no reference and a vnode could have been freed while we were waiting on the lock. - Get rid of vlruvp() and vfreehead(). Neither are used. vlruvp() should really be rethought before it's reintroduced. - vgonel() always returns with the vnode locked now and never puts the vnode back on a free list. The vnode will be freed as soon as the last reference is released. Sponsored by: Isilon Systems, Inc. Debugging help from: Kris Kennaway, Peter Holm Approved by: re (blanket vfs)
2005-06-16 04:41:42 +00:00
VNASSERT(vp->v_holdcnt, vp,
("vgonel: vp %p has no reference.", vp));
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
td = curthread;
/*
* Don't vgonel if we're already doomed.
*/
if (VN_IS_DOOMED(vp))
return;
/*
* Paired with freevnode.
*/
vn_seqc_write_begin_locked(vp);
vunlazy_gone(vp);
vn_irflag_set_locked(vp, VIRF_DOOMED);
1994-05-24 10:09:53 +00:00
/*
* Check to see if the vnode is in use. If so, we have to
* call VOP_CLOSE() and VOP_INACTIVE().
*
* It could be that VOP_INACTIVE() requested reclamation, in
* which case we should avoid recursion, so check
* VI_DOINGINACT. This is not precise but good enough.
1994-05-24 10:09:53 +00:00
*/
active = vp->v_usecount > 0;
oweinact = (vp->v_iflag & VI_OWEINACT) != 0;
doinginact = (vp->v_iflag & VI_DOINGINACT) != 0;
/*
* If we need to do inactive VI_OWEINACT will be set.
*/
if (vp->v_iflag & VI_DEFINACT) {
VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count"));
vp->v_iflag &= ~VI_DEFINACT;
vdropl(vp);
} else {
VNASSERT(vp->v_holdcnt > 0, vp, ("vnode without hold count"));
VI_UNLOCK(vp);
}
cache_purge_vgone(vp);
vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM);
1994-05-24 10:09:53 +00:00
/*
* If purging an active vnode, it must be closed and
- Remove vx_lock, vx_unlock, vx_wait, etc. - Add a vn_start_write/vn_finished_write around vlrureclaim so we don't do writing ops without suspending. This could suspend the vlruproc which should not be a problem under normal circumstances. - Manually implement VMIGHTFREE in vlrureclaim as this was the only instance where it was used. - Acquire a lock before calling vgone() as it now requires it. - Move the acquisition of the vnode interlock from vtryrecycle() to getnewvnode() so that if it fails we don't drop and reacquire the vnode_free_list_mtx. - Check for a usecount or holdcount at the end of vtryrecycle() in case someone grabbed a ref while we were recycling. Abort the recycle, and on the final ref drop this vnode will be placed on the head of the free list. - Move the redundant VOP_INACTIVE protection code into the local vinactive() routine to avoid code bloat. - Keep the vnode lock held across calls to vgone() in several places. - vgonel() no longer uses XLOCK, instead callers must hold an exclusive vnode lock. The VI_DOOMED flag is set to allow other threads to detect a vnode which is no longer valid. This flag is set until the last reference is gone, and there are no chances for a new ref. vgonel() holds this lock across the entire function, which greatly simplifies logic. _ Only vfree() in one place in vgone() not three. - Adjust vget() to check the VI_DOOMED flag prior to waiting on the lock in the LK_NOWAIT case. In other cases, check after we have slept and acquired an exlusive lock. This will simulate the old vx_wait() behavior. Sponsored by: Isilon Systems, Inc.
2005-03-13 11:54:28 +00:00
* deactivated before being reclaimed.
1994-05-24 10:09:53 +00:00
*/
if (active)
VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
if (!doinginact) {
do {
if (oweinact || active) {
VI_LOCK(vp);
vinactivef(vp);
oweinact = (vp->v_iflag & VI_OWEINACT) != 0;
VI_UNLOCK(vp);
}
} while (oweinact);
1994-05-24 10:09:53 +00:00
}
if (vp->v_type == VSOCK)
vfs_unp_reclaim(vp);
/*
* Clean out any buffers associated with the vnode.
* If the flush fails, just toss the buffers.
*/
mp = NULL;
if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd))
(void) vn_start_secondary_write(vp, &mp, V_WAIT);
if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) {
while (vinvalbuf(vp, 0, 0, 0) != 0)
;
}
BO_LOCK(&vp->v_bufobj);
KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) &&
vp->v_bufobj.bo_dirty.bv_cnt == 0 &&
TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) &&
vp->v_bufobj.bo_clean.bv_cnt == 0,
("vp %p bufobj not invalidated", vp));
/*
* For VMIO bufobj, BO_DEAD is set later, or in
* vm_object_terminate() after the object's page queue is
* flushed.
*/
object = vp->v_bufobj.bo_object;
if (object == NULL)
vp->v_bufobj.bo_flag |= BO_DEAD;
BO_UNLOCK(&vp->v_bufobj);
/*
* Handle the VM part. Tmpfs handles v_object on its own (the
* OBJT_VNODE check). Nullfs or other bypassing filesystems
* should not touch the object borrowed from the lower vnode
* (the handle check).
*/
if (object != NULL && object->type == OBJT_VNODE &&
object->handle == vp)
vnode_destroy_vobject(vp);
1994-05-24 10:09:53 +00:00
/*
* Reclaim the vnode.
*/
if (VOP_RECLAIM(vp))
- Remove vx_lock, vx_unlock, vx_wait, etc. - Add a vn_start_write/vn_finished_write around vlrureclaim so we don't do writing ops without suspending. This could suspend the vlruproc which should not be a problem under normal circumstances. - Manually implement VMIGHTFREE in vlrureclaim as this was the only instance where it was used. - Acquire a lock before calling vgone() as it now requires it. - Move the acquisition of the vnode interlock from vtryrecycle() to getnewvnode() so that if it fails we don't drop and reacquire the vnode_free_list_mtx. - Check for a usecount or holdcount at the end of vtryrecycle() in case someone grabbed a ref while we were recycling. Abort the recycle, and on the final ref drop this vnode will be placed on the head of the free list. - Move the redundant VOP_INACTIVE protection code into the local vinactive() routine to avoid code bloat. - Keep the vnode lock held across calls to vgone() in several places. - vgonel() no longer uses XLOCK, instead callers must hold an exclusive vnode lock. The VI_DOOMED flag is set to allow other threads to detect a vnode which is no longer valid. This flag is set until the last reference is gone, and there are no chances for a new ref. vgonel() holds this lock across the entire function, which greatly simplifies logic. _ Only vfree() in one place in vgone() not three. - Adjust vget() to check the VI_DOOMED flag prior to waiting on the lock in the LK_NOWAIT case. In other cases, check after we have slept and acquired an exlusive lock. This will simulate the old vx_wait() behavior. Sponsored by: Isilon Systems, Inc.
2005-03-13 11:54:28 +00:00
panic("vgone: cannot reclaim");
if (mp != NULL)
vn_finished_secondary_write(mp);
2005-02-17 10:28:58 +00:00
VNASSERT(vp->v_object == NULL, vp,
("vop_reclaim left v_object vp=%p", vp));
/*
* Clear the advisory locks and wake up waiting threads.
*/
(void)VOP_ADVLOCKPURGE(vp);
vp->v_lockf = NULL;
/*
* Delete from old mount point vnode list.
*/
When we traverse the vnodes on a mountpoint we need to look out for our cached 'next vnode' being removed from this mountpoint. If we find that it was recycled, we restart our traversal from the start of the list. Code to do that is in all local disk filesystems (and a few other places) and looks roughly like this: MNT_ILOCK(mp); loop: for (vp = TAILQ_FIRST(&mp...); (vp = nvp) != NULL; nvp = TAILQ_NEXT(vp,...)) { if (vp->v_mount != mp) goto loop; MNT_IUNLOCK(mp); ... MNT_ILOCK(mp); } MNT_IUNLOCK(mp); The code which takes vnodes off a mountpoint looks like this: MNT_ILOCK(vp->v_mount); ... TAILQ_REMOVE(&vp->v_mount->mnt_nvnodelist, vp, v_nmntvnodes); ... MNT_IUNLOCK(vp->v_mount); ... vp->v_mount = something; (Take a moment and try to spot the locking error before you read on.) On a SMP system, one CPU could have removed nvp from our mountlist but not yet gotten to assign a new value to vp->v_mount while another CPU simultaneously get to the top of the traversal loop where it finds that (vp->v_mount != mp) is not true despite the fact that the vnode has indeed been removed from our mountpoint. Fix: Introduce the macro MNT_VNODE_FOREACH() to traverse the list of vnodes on a mountpoint while taking into account that vnodes may be removed from the list as we go. This saves approx 65 lines of duplicated code. Split the insmntque() which potentially moves a vnode from one mount point to another into delmntque() and insmntque() which does just what the names say. Fix delmntque() to set vp->v_mount to NULL while holding the mountpoint lock.
2004-07-04 08:52:35 +00:00
delmntque(vp);
1994-05-24 10:09:53 +00:00
/*
* Done with purge, reset to the standard lock and invalidate
* the vnode.
1994-05-24 10:09:53 +00:00
*/
VI_LOCK(vp);
vp->v_vnlock = &vp->v_lock;
vp->v_op = &dead_vnodeops;
- Remove vx_lock, vx_unlock, vx_wait, etc. - Add a vn_start_write/vn_finished_write around vlrureclaim so we don't do writing ops without suspending. This could suspend the vlruproc which should not be a problem under normal circumstances. - Manually implement VMIGHTFREE in vlrureclaim as this was the only instance where it was used. - Acquire a lock before calling vgone() as it now requires it. - Move the acquisition of the vnode interlock from vtryrecycle() to getnewvnode() so that if it fails we don't drop and reacquire the vnode_free_list_mtx. - Check for a usecount or holdcount at the end of vtryrecycle() in case someone grabbed a ref while we were recycling. Abort the recycle, and on the final ref drop this vnode will be placed on the head of the free list. - Move the redundant VOP_INACTIVE protection code into the local vinactive() routine to avoid code bloat. - Keep the vnode lock held across calls to vgone() in several places. - vgonel() no longer uses XLOCK, instead callers must hold an exclusive vnode lock. The VI_DOOMED flag is set to allow other threads to detect a vnode which is no longer valid. This flag is set until the last reference is gone, and there are no chances for a new ref. vgonel() holds this lock across the entire function, which greatly simplifies logic. _ Only vfree() in one place in vgone() not three. - Adjust vget() to check the VI_DOOMED flag prior to waiting on the lock in the LK_NOWAIT case. In other cases, check after we have slept and acquired an exlusive lock. This will simulate the old vx_wait() behavior. Sponsored by: Isilon Systems, Inc.
2005-03-13 11:54:28 +00:00
vp->v_type = VBAD;
1994-05-24 10:09:53 +00:00
}
/*
* Print out a description of a vnode.
*/
static const char * const typename[] =
{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD",
"VMARKER"};
1994-05-24 10:09:53 +00:00
_Static_assert((VHOLD_ALL_FLAGS & ~VHOLD_NO_SMR) == 0,
"new hold count flag not added to vn_printf");
void
vn_printf(struct vnode *vp, const char *fmt, ...)
1994-05-24 10:09:53 +00:00
{
va_list ap;
char buf[256], buf2[16];
u_long flags;
u_int holdcnt;
short irflag;
1994-05-24 10:09:53 +00:00
va_start(ap, fmt);
vprintf(fmt, ap);
va_end(ap);
printf("%p: ", (void *)vp);
printf("type %s\n", typename[vp->v_type]);
holdcnt = atomic_load_int(&vp->v_holdcnt);
printf(" usecount %d, writecount %d, refcount %d seqc users %d",
vp->v_usecount, vp->v_writecount, holdcnt & ~VHOLD_ALL_FLAGS,
vp->v_seqc_users);
switch (vp->v_type) {
case VDIR:
printf(" mountedhere %p\n", vp->v_mountedhere);
break;
case VCHR:
printf(" rdev %p\n", vp->v_rdev);
break;
case VSOCK:
printf(" socket %p\n", vp->v_unpcb);
break;
case VFIFO:
printf(" fifoinfo %p\n", vp->v_fifoinfo);
break;
default:
printf("\n");
break;
}
buf[0] = '\0';
buf[1] = '\0';
if (holdcnt & VHOLD_NO_SMR)
strlcat(buf, "|VHOLD_NO_SMR", sizeof(buf));
printf(" hold count flags (%s)\n", buf + 1);
1994-05-24 10:09:53 +00:00
buf[0] = '\0';
buf[1] = '\0';
irflag = vn_irflag_read(vp);
if (irflag & VIRF_DOOMED)
strlcat(buf, "|VIRF_DOOMED", sizeof(buf));
if (irflag & VIRF_PGREAD)
strlcat(buf, "|VIRF_PGREAD", sizeof(buf));
if (irflag & VIRF_MOUNTPOINT)
strlcat(buf, "|VIRF_MOUNTPOINT", sizeof(buf));
flags = irflag & ~(VIRF_DOOMED | VIRF_PGREAD | VIRF_MOUNTPOINT);
if (flags != 0) {
snprintf(buf2, sizeof(buf2), "|VIRF(0x%lx)", flags);
strlcat(buf, buf2, sizeof(buf));
}
if (vp->v_vflag & VV_ROOT)
strlcat(buf, "|VV_ROOT", sizeof(buf));
if (vp->v_vflag & VV_ISTTY)
strlcat(buf, "|VV_ISTTY", sizeof(buf));
if (vp->v_vflag & VV_NOSYNC)
strlcat(buf, "|VV_NOSYNC", sizeof(buf));
if (vp->v_vflag & VV_ETERNALDEV)
strlcat(buf, "|VV_ETERNALDEV", sizeof(buf));
if (vp->v_vflag & VV_CACHEDLABEL)
strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf));
if (vp->v_vflag & VV_VMSIZEVNLOCK)
strlcat(buf, "|VV_VMSIZEVNLOCK", sizeof(buf));
if (vp->v_vflag & VV_COPYONWRITE)
strlcat(buf, "|VV_COPYONWRITE", sizeof(buf));
if (vp->v_vflag & VV_SYSTEM)
strlcat(buf, "|VV_SYSTEM", sizeof(buf));
if (vp->v_vflag & VV_PROCDEP)
strlcat(buf, "|VV_PROCDEP", sizeof(buf));
if (vp->v_vflag & VV_NOKNOTE)
strlcat(buf, "|VV_NOKNOTE", sizeof(buf));
if (vp->v_vflag & VV_DELETED)
strlcat(buf, "|VV_DELETED", sizeof(buf));
if (vp->v_vflag & VV_MD)
strlcat(buf, "|VV_MD", sizeof(buf));
if (vp->v_vflag & VV_FORCEINSMQ)
strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf));
if (vp->v_vflag & VV_READLINK)
strlcat(buf, "|VV_READLINK", sizeof(buf));
flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV |
VV_CACHEDLABEL | VV_VMSIZEVNLOCK | VV_COPYONWRITE | VV_SYSTEM |
VV_PROCDEP | VV_NOKNOTE | VV_DELETED | VV_MD | VV_FORCEINSMQ |
VV_READLINK);
if (flags != 0) {
snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags);
strlcat(buf, buf2, sizeof(buf));
}
if (vp->v_iflag & VI_TEXT_REF)
strlcat(buf, "|VI_TEXT_REF", sizeof(buf));
if (vp->v_iflag & VI_MOUNT)
strlcat(buf, "|VI_MOUNT", sizeof(buf));
if (vp->v_iflag & VI_DOINGINACT)
strlcat(buf, "|VI_DOINGINACT", sizeof(buf));
if (vp->v_iflag & VI_OWEINACT)
strlcat(buf, "|VI_OWEINACT", sizeof(buf));
if (vp->v_iflag & VI_DEFINACT)
strlcat(buf, "|VI_DEFINACT", sizeof(buf));
if (vp->v_iflag & VI_FOPENING)
strlcat(buf, "|VI_FOPENING", sizeof(buf));
flags = vp->v_iflag & ~(VI_TEXT_REF | VI_MOUNT | VI_DOINGINACT |
VI_OWEINACT | VI_DEFINACT | VI_FOPENING);
if (flags != 0) {
snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags);
strlcat(buf, buf2, sizeof(buf));
}
if (vp->v_mflag & VMP_LAZYLIST)
strlcat(buf, "|VMP_LAZYLIST", sizeof(buf));
flags = vp->v_mflag & ~(VMP_LAZYLIST);
if (flags != 0) {
snprintf(buf2, sizeof(buf2), "|VMP(0x%lx)", flags);
strlcat(buf, buf2, sizeof(buf));
}
printf(" flags (%s)", buf + 1);
if (mtx_owned(VI_MTX(vp)))
printf(" VI_LOCKed");
printf("\n");
if (vp->v_object != NULL)
printf(" v_object %p ref %d pages %d "
"cleanbuf %d dirtybuf %d\n",
- Remove vx_lock, vx_unlock, vx_wait, etc. - Add a vn_start_write/vn_finished_write around vlrureclaim so we don't do writing ops without suspending. This could suspend the vlruproc which should not be a problem under normal circumstances. - Manually implement VMIGHTFREE in vlrureclaim as this was the only instance where it was used. - Acquire a lock before calling vgone() as it now requires it. - Move the acquisition of the vnode interlock from vtryrecycle() to getnewvnode() so that if it fails we don't drop and reacquire the vnode_free_list_mtx. - Check for a usecount or holdcount at the end of vtryrecycle() in case someone grabbed a ref while we were recycling. Abort the recycle, and on the final ref drop this vnode will be placed on the head of the free list. - Move the redundant VOP_INACTIVE protection code into the local vinactive() routine to avoid code bloat. - Keep the vnode lock held across calls to vgone() in several places. - vgonel() no longer uses XLOCK, instead callers must hold an exclusive vnode lock. The VI_DOOMED flag is set to allow other threads to detect a vnode which is no longer valid. This flag is set until the last reference is gone, and there are no chances for a new ref. vgonel() holds this lock across the entire function, which greatly simplifies logic. _ Only vfree() in one place in vgone() not three. - Adjust vget() to check the VI_DOOMED flag prior to waiting on the lock in the LK_NOWAIT case. In other cases, check after we have slept and acquired an exlusive lock. This will simulate the old vx_wait() behavior. Sponsored by: Isilon Systems, Inc.
2005-03-13 11:54:28 +00:00
vp->v_object, vp->v_object->ref_count,
vp->v_object->resident_page_count,
vp->v_bufobj.bo_clean.bv_cnt,
vp->v_bufobj.bo_dirty.bv_cnt);
printf(" ");
lockmgr_printinfo(vp->v_vnlock);
if (vp->v_data != NULL)
1994-05-24 10:09:53 +00:00
VOP_PRINT(vp);
}
#ifdef DDB
1994-05-24 10:09:53 +00:00
/*
* List all of the locked vnodes in the system.
* Called when debugging the kernel.
*/
DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
1994-05-24 10:09:53 +00:00
{
struct mount *mp;
struct vnode *vp;
1994-05-24 10:09:53 +00:00
/*
* Note: because this is DDB, we can't obey the locking semantics
* for these structures, which means we could catch an inconsistent
* state and dereference a nasty pointer. Not much to be done
* about that.
*/
db_printf("Locked vnodes\n");
TAILQ_FOREACH(mp, &mountlist, mnt_list) {
TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
if (vp->v_type != VMARKER && VOP_ISLOCKED(vp))
vn_printf(vp, "vnode ");
}
1994-05-24 10:09:53 +00:00
}
}
2006-09-04 22:15:44 +00:00
/*
* Show details about the given vnode.
*/
DB_SHOW_COMMAND(vnode, db_show_vnode)
{
struct vnode *vp;
if (!have_addr)
return;
vp = (struct vnode *)addr;
vn_printf(vp, "vnode ");
}
/*
* Show details about the given mount point.
*/
DB_SHOW_COMMAND(mount, db_show_mount)
{
struct mount *mp;
struct vfsopt *opt;
struct statfs *sp;
struct vnode *vp;
char buf[512];
uint64_t mflags;
u_int flags;
if (!have_addr) {
/* No address given, print short info about all mount points. */
TAILQ_FOREACH(mp, &mountlist, mnt_list) {
db_printf("%p %s on %s (%s)\n", mp,
mp->mnt_stat.f_mntfromname,
mp->mnt_stat.f_mntonname,
mp->mnt_stat.f_fstypename);
if (db_pager_quit)
break;
}
db_printf("\nMore info: show mount <addr>\n");
return;
}
mp = (struct mount *)addr;
db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname,
mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename);
buf[0] = '\0';
mflags = mp->mnt_flag;
#define MNT_FLAG(flag) do { \
if (mflags & (flag)) { \
if (buf[0] != '\0') \
strlcat(buf, ", ", sizeof(buf)); \
strlcat(buf, (#flag) + 4, sizeof(buf)); \
mflags &= ~(flag); \
} \
} while (0)
MNT_FLAG(MNT_RDONLY);
MNT_FLAG(MNT_SYNCHRONOUS);
MNT_FLAG(MNT_NOEXEC);
MNT_FLAG(MNT_NOSUID);
MNT_FLAG(MNT_NFS4ACLS);
MNT_FLAG(MNT_UNION);
MNT_FLAG(MNT_ASYNC);
MNT_FLAG(MNT_SUIDDIR);
MNT_FLAG(MNT_SOFTDEP);
MNT_FLAG(MNT_NOSYMFOLLOW);
MNT_FLAG(MNT_GJOURNAL);
MNT_FLAG(MNT_MULTILABEL);
MNT_FLAG(MNT_ACLS);
MNT_FLAG(MNT_NOATIME);
MNT_FLAG(MNT_NOCLUSTERR);
MNT_FLAG(MNT_NOCLUSTERW);
MNT_FLAG(MNT_SUJ);
MNT_FLAG(MNT_EXRDONLY);
MNT_FLAG(MNT_EXPORTED);
MNT_FLAG(MNT_DEFEXPORTED);
MNT_FLAG(MNT_EXPORTANON);
MNT_FLAG(MNT_EXKERB);
MNT_FLAG(MNT_EXPUBLIC);
MNT_FLAG(MNT_LOCAL);
MNT_FLAG(MNT_QUOTA);
MNT_FLAG(MNT_ROOTFS);
MNT_FLAG(MNT_USER);
MNT_FLAG(MNT_IGNORE);
MNT_FLAG(MNT_UPDATE);
MNT_FLAG(MNT_DELEXPORT);
MNT_FLAG(MNT_RELOAD);
MNT_FLAG(MNT_FORCE);
MNT_FLAG(MNT_SNAPSHOT);
MNT_FLAG(MNT_BYFSID);
#undef MNT_FLAG
if (mflags != 0) {
if (buf[0] != '\0')
strlcat(buf, ", ", sizeof(buf));
snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
"0x%016jx", mflags);
}
db_printf(" mnt_flag = %s\n", buf);
buf[0] = '\0';
flags = mp->mnt_kern_flag;
#define MNT_KERN_FLAG(flag) do { \
if (flags & (flag)) { \
if (buf[0] != '\0') \
strlcat(buf, ", ", sizeof(buf)); \
strlcat(buf, (#flag) + 5, sizeof(buf)); \
flags &= ~(flag); \
} \
} while (0)
MNT_KERN_FLAG(MNTK_UNMOUNTF);
MNT_KERN_FLAG(MNTK_ASYNC);
MNT_KERN_FLAG(MNTK_SOFTDEP);
MNT_KERN_FLAG(MNTK_DRAINING);
MNT_KERN_FLAG(MNTK_REFEXPIRE);
MNT_KERN_FLAG(MNTK_EXTENDED_SHARED);
MNT_KERN_FLAG(MNTK_SHARED_WRITES);
MNT_KERN_FLAG(MNTK_NO_IOPF);
MNT_KERN_FLAG(MNTK_RECURSE);
MNT_KERN_FLAG(MNTK_UPPER_WAITER);
MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT);
MNT_KERN_FLAG(MNTK_USES_BCACHE);
MNT_KERN_FLAG(MNTK_FPLOOKUP);
MNT_KERN_FLAG(MNTK_TASKQUEUE_WAITER);
MNT_KERN_FLAG(MNTK_NOASYNC);
MNT_KERN_FLAG(MNTK_UNMOUNT);
MNT_KERN_FLAG(MNTK_MWAIT);
MNT_KERN_FLAG(MNTK_SUSPEND);
MNT_KERN_FLAG(MNTK_SUSPEND2);
MNT_KERN_FLAG(MNTK_SUSPENDED);
MNT_KERN_FLAG(MNTK_LOOKUP_SHARED);
MNT_KERN_FLAG(MNTK_NOKNOTE);
#undef MNT_KERN_FLAG
if (flags != 0) {
if (buf[0] != '\0')
strlcat(buf, ", ", sizeof(buf));
snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
"0x%08x", flags);
}
db_printf(" mnt_kern_flag = %s\n", buf);
db_printf(" mnt_opt = ");
opt = TAILQ_FIRST(mp->mnt_opt);
if (opt != NULL) {
db_printf("%s", opt->name);
opt = TAILQ_NEXT(opt, link);
while (opt != NULL) {
db_printf(", %s", opt->name);
opt = TAILQ_NEXT(opt, link);
}
}
db_printf("\n");
sp = &mp->mnt_stat;
db_printf(" mnt_stat = { version=%u type=%u flags=0x%016jx "
"bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju "
"ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju "
"asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n",
(u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags,
(uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize,
(uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree,
(intmax_t)sp->f_bavail, (uintmax_t)sp->f_files,
(intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites,
(uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads,
(uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax,
(u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]);
db_printf(" mnt_cred = { uid=%u ruid=%u",
(u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid);
if (jailed(mp->mnt_cred))
db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id);
db_printf(" }\n");
db_printf(" mnt_ref = %d (with %d in the struct)\n",
vfs_mount_fetch_counter(mp, MNT_COUNT_REF), mp->mnt_ref);
db_printf(" mnt_gen = %d\n", mp->mnt_gen);
db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize);
db_printf(" mnt_lazyvnodelistsize = %d\n",
mp->mnt_lazyvnodelistsize);
db_printf(" mnt_writeopcount = %d (with %d in the struct)\n",
vfs_mount_fetch_counter(mp, MNT_COUNT_WRITEOPCOUNT), mp->mnt_writeopcount);
db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max);
db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed);
db_printf(" mnt_lockref = %d (with %d in the struct)\n",
vfs_mount_fetch_counter(mp, MNT_COUNT_LOCKREF), mp->mnt_lockref);
db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes);
db_printf(" mnt_secondary_accwrites = %d\n",
mp->mnt_secondary_accwrites);
db_printf(" mnt_gjprovider = %s\n",
mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL");
db_printf(" mnt_vfs_ops = %d\n", mp->mnt_vfs_ops);
db_printf("\n\nList of active vnodes\n");
TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
if (vp->v_type != VMARKER && vp->v_holdcnt > 0) {
vn_printf(vp, "vnode ");
if (db_pager_quit)
break;
}
}
db_printf("\n\nList of inactive vnodes\n");
TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
if (vp->v_type != VMARKER && vp->v_holdcnt == 0) {
vn_printf(vp, "vnode ");
if (db_pager_quit)
break;
}
}
}
2006-09-04 22:15:44 +00:00
#endif /* DDB */
1994-05-24 10:09:53 +00:00
/*
* Fill in a struct xvfsconf based on a struct vfsconf.
*/
static int
vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp)
{
struct xvfsconf xvfsp;
bzero(&xvfsp, sizeof(xvfsp));
strcpy(xvfsp.vfc_name, vfsp->vfc_name);
xvfsp.vfc_typenum = vfsp->vfc_typenum;
xvfsp.vfc_refcount = vfsp->vfc_refcount;
xvfsp.vfc_flags = vfsp->vfc_flags;
/*
* These are unused in userland, we keep them
* to not break binary compatibility.
*/
xvfsp.vfc_vfsops = NULL;
xvfsp.vfc_next = NULL;
return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
}
#ifdef COMPAT_FREEBSD32
struct xvfsconf32 {
uint32_t vfc_vfsops;
char vfc_name[MFSNAMELEN];
int32_t vfc_typenum;
int32_t vfc_refcount;
int32_t vfc_flags;
uint32_t vfc_next;
};
static int
vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp)
{
struct xvfsconf32 xvfsp;
bzero(&xvfsp, sizeof(xvfsp));
strcpy(xvfsp.vfc_name, vfsp->vfc_name);
xvfsp.vfc_typenum = vfsp->vfc_typenum;
xvfsp.vfc_refcount = vfsp->vfc_refcount;
xvfsp.vfc_flags = vfsp->vfc_flags;
return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
}
#endif
/*
* Top level filesystem related information gathering.
*/
static int
sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
{
struct vfsconf *vfsp;
int error;
error = 0;
vfsconf_slock();
TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
#ifdef COMPAT_FREEBSD32
if (req->flags & SCTL_MASK32)
error = vfsconf2x32(req, vfsp);
else
#endif
error = vfsconf2x(req, vfsp);
if (error)
break;
}
vfsconf_sunlock();
return (error);
}
SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD |
CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist,
"S,xvfsconf", "List of all configured filesystems");
#ifndef BURN_BRIDGES
2002-03-19 21:25:46 +00:00
static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
static int
vfs_sysctl(SYSCTL_HANDLER_ARGS)
{
int *name = (int *)arg1 - 1; /* XXX */
u_int namelen = arg2 + 1; /* XXX */
struct vfsconf *vfsp;
log(LOG_WARNING, "userland calling deprecated sysctl, "
"please rebuild world\n");
#if 1 || defined(COMPAT_PRELITE2)
/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
if (namelen == 1)
return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
#endif
switch (name[1]) {
case VFS_MAXTYPENUM:
if (namelen != 2)
return (ENOTDIR);
return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
case VFS_CONF:
if (namelen != 3)
return (ENOTDIR); /* overloaded */
vfsconf_slock();
TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
if (vfsp->vfc_typenum == name[2])
break;
}
vfsconf_sunlock();
if (vfsp == NULL)
return (EOPNOTSUPP);
#ifdef COMPAT_FREEBSD32
if (req->flags & SCTL_MASK32)
return (vfsconf2x32(req, vfsp));
else
#endif
return (vfsconf2x(req, vfsp));
}
return (EOPNOTSUPP);
}
static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP |
CTLFLAG_MPSAFE, vfs_sysctl,
"Generic filesystem");
#if 1 || defined(COMPAT_PRELITE2)
static int
sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
{
int error;
struct vfsconf *vfsp;
struct ovfsconf ovfs;
vfsconf_slock();
TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
bzero(&ovfs, sizeof(ovfs));
ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */
strcpy(ovfs.vfc_name, vfsp->vfc_name);
ovfs.vfc_index = vfsp->vfc_typenum;
ovfs.vfc_refcount = vfsp->vfc_refcount;
ovfs.vfc_flags = vfsp->vfc_flags;
error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
if (error != 0) {
vfsconf_sunlock();
return (error);
}
}
vfsconf_sunlock();
return (0);
}
#endif /* 1 || COMPAT_PRELITE2 */
#endif /* !BURN_BRIDGES */
#define KINFO_VNODESLOP 10
#ifdef notyet
1994-05-24 10:09:53 +00:00
/*
* Dump vnode list (via sysctl).
*/
/* ARGSUSED */
static int
sysctl_vnode(SYSCTL_HANDLER_ARGS)
1994-05-24 10:09:53 +00:00
{
struct xvnode *xvn;
struct mount *mp;
struct vnode *vp;
int error, len, n;
/*
* Stale numvnodes access is not fatal here.
*/
req->lock = 0;
len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn;
if (!req->oldptr)
/* Make an estimate */
return (SYSCTL_OUT(req, 0, len));
These changes embody the support of the fully coherent merged VM buffer cache, much higher filesystem I/O performance, and much better paging performance. It represents the culmination of over 6 months of R&D. The majority of the merged VM/cache work is by John Dyson. The following highlights the most significant changes. Additionally, there are (mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to support the new VM/buffer scheme. vfs_bio.c: Significant rewrite of most of vfs_bio to support the merged VM buffer cache scheme. The scheme is almost fully compatible with the old filesystem interface. Significant improvement in the number of opportunities for write clustering. vfs_cluster.c, vfs_subr.c Upgrade and performance enhancements in vfs layer code to support merged VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff. vm_object.c: Yet more improvements in the collapse code. Elimination of some windows that can cause list corruption. vm_pageout.c: Fixed it, it really works better now. Somehow in 2.0, some "enhancements" broke the code. This code has been reworked from the ground-up. vm_fault.c, vm_page.c, pmap.c, vm_object.c Support for small-block filesystems with merged VM/buffer cache scheme. pmap.c vm_map.c Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of kernel PTs. vm_glue.c Much simpler and more effective swapping code. No more gratuitous swapping. proc.h Fixed the problem that the p_lock flag was not being cleared on a fork. swap_pager.c, vnode_pager.c Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the code doesn't need it anymore. machdep.c Changes to better support the parameter values for the merged VM/buffer cache scheme. machdep.c, kern_exec.c, vm_glue.c Implemented a seperate submap for temporary exec string space and another one to contain process upages. This eliminates all map fragmentation problems that previously existed. ffs_inode.c, ufs_inode.c, ufs_readwrite.c Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on busy buffers. Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
error = sysctl_wire_old_buffer(req, 0);
if (error != 0)
return (error);
xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
n = 0;
Change and clean the mutex lock interface. mtx_enter(lock, type) becomes: mtx_lock(lock) for sleep locks (MTX_DEF-initialized locks) mtx_lock_spin(lock) for spin locks (MTX_SPIN-initialized) similarily, for releasing a lock, we now have: mtx_unlock(lock) for MTX_DEF and mtx_unlock_spin(lock) for MTX_SPIN. We change the caller interface for the two different types of locks because the semantics are entirely different for each case, and this makes it explicitly clear and, at the same time, it rids us of the extra `type' argument. The enter->lock and exit->unlock change has been made with the idea that we're "locking data" and not "entering locked code" in mind. Further, remove all additional "flags" previously passed to the lock acquire/release routines with the exception of two: MTX_QUIET and MTX_NOSWITCH The functionality of these flags is preserved and they can be passed to the lock/unlock routines by calling the corresponding wrappers: mtx_{lock, unlock}_flags(lock, flag(s)) and mtx_{lock, unlock}_spin_flags(lock, flag(s)) for MTX_DEF and MTX_SPIN locks, respectively. Re-inline some lock acq/rel code; in the sleep lock case, we only inline the _obtain_lock()s in order to ensure that the inlined code fits into a cache line. In the spin lock case, we inline recursion and actually only perform a function call if we need to spin. This change has been made with the idea that we generally tend to avoid spin locks and that also the spin locks that we do have and are heavily used (i.e. sched_lock) do recurse, and therefore in an effort to reduce function call overhead for some architectures (such as alpha), we inline recursion for this case. Create a new malloc type for the witness code and retire from using the M_DEV type. The new type is called M_WITNESS and is only declared if WITNESS is enabled. Begin cleaning up some machdep/mutex.h code - specifically updated the "optimized" inlined code in alpha/mutex.h and wrote MTX_LOCK_SPIN and MTX_UNLOCK_SPIN asm macros for the i386/mutex.h as we presently need those. Finally, caught up to the interface changes in all sys code. Contributors: jake, jhb, jasone (in no particular order)
2001-02-09 06:11:45 +00:00
mtx_lock(&mountlist_mtx);
TAILQ_FOREACH(mp, &mountlist, mnt_list) {
2008-11-02 10:15:42 +00:00
if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
1994-05-24 10:09:53 +00:00
continue;
MNT_ILOCK(mp);
TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
if (n == len)
break;
vref(vp);
xvn[n].xv_size = sizeof *xvn;
xvn[n].xv_vnode = vp;
xvn[n].xv_id = 0; /* XXX compat */
#define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
XV_COPY(usecount);
XV_COPY(writecount);
XV_COPY(holdcnt);
XV_COPY(mount);
XV_COPY(numoutput);
XV_COPY(type);
#undef XV_COPY
xvn[n].xv_flag = vp->v_vflag;
switch (vp->v_type) {
case VREG:
case VDIR:
case VLNK:
break;
case VBLK:
case VCHR:
if (vp->v_rdev == NULL) {
vrele(vp);
continue;
}
xvn[n].xv_dev = dev2udev(vp->v_rdev);
break;
case VSOCK:
xvn[n].xv_socket = vp->v_socket;
break;
case VFIFO:
xvn[n].xv_fifo = vp->v_fifoinfo;
break;
case VNON:
case VBAD:
default:
/* shouldn't happen? */
vrele(vp);
continue;
}
vrele(vp);
++n;
1994-05-24 10:09:53 +00:00
}
MNT_IUNLOCK(mp);
Change and clean the mutex lock interface. mtx_enter(lock, type) becomes: mtx_lock(lock) for sleep locks (MTX_DEF-initialized locks) mtx_lock_spin(lock) for spin locks (MTX_SPIN-initialized) similarily, for releasing a lock, we now have: mtx_unlock(lock) for MTX_DEF and mtx_unlock_spin(lock) for MTX_SPIN. We change the caller interface for the two different types of locks because the semantics are entirely different for each case, and this makes it explicitly clear and, at the same time, it rids us of the extra `type' argument. The enter->lock and exit->unlock change has been made with the idea that we're "locking data" and not "entering locked code" in mind. Further, remove all additional "flags" previously passed to the lock acquire/release routines with the exception of two: MTX_QUIET and MTX_NOSWITCH The functionality of these flags is preserved and they can be passed to the lock/unlock routines by calling the corresponding wrappers: mtx_{lock, unlock}_flags(lock, flag(s)) and mtx_{lock, unlock}_spin_flags(lock, flag(s)) for MTX_DEF and MTX_SPIN locks, respectively. Re-inline some lock acq/rel code; in the sleep lock case, we only inline the _obtain_lock()s in order to ensure that the inlined code fits into a cache line. In the spin lock case, we inline recursion and actually only perform a function call if we need to spin. This change has been made with the idea that we generally tend to avoid spin locks and that also the spin locks that we do have and are heavily used (i.e. sched_lock) do recurse, and therefore in an effort to reduce function call overhead for some architectures (such as alpha), we inline recursion for this case. Create a new malloc type for the witness code and retire from using the M_DEV type. The new type is called M_WITNESS and is only declared if WITNESS is enabled. Begin cleaning up some machdep/mutex.h code - specifically updated the "optimized" inlined code in alpha/mutex.h and wrote MTX_LOCK_SPIN and MTX_UNLOCK_SPIN asm macros for the i386/mutex.h as we presently need those. Finally, caught up to the interface changes in all sys code. Contributors: jake, jhb, jasone (in no particular order)
2001-02-09 06:11:45 +00:00
mtx_lock(&mountlist_mtx);
vfs_unbusy(mp);
if (n == len)
break;
1994-05-24 10:09:53 +00:00
}
Change and clean the mutex lock interface. mtx_enter(lock, type) becomes: mtx_lock(lock) for sleep locks (MTX_DEF-initialized locks) mtx_lock_spin(lock) for spin locks (MTX_SPIN-initialized) similarily, for releasing a lock, we now have: mtx_unlock(lock) for MTX_DEF and mtx_unlock_spin(lock) for MTX_SPIN. We change the caller interface for the two different types of locks because the semantics are entirely different for each case, and this makes it explicitly clear and, at the same time, it rids us of the extra `type' argument. The enter->lock and exit->unlock change has been made with the idea that we're "locking data" and not "entering locked code" in mind. Further, remove all additional "flags" previously passed to the lock acquire/release routines with the exception of two: MTX_QUIET and MTX_NOSWITCH The functionality of these flags is preserved and they can be passed to the lock/unlock routines by calling the corresponding wrappers: mtx_{lock, unlock}_flags(lock, flag(s)) and mtx_{lock, unlock}_spin_flags(lock, flag(s)) for MTX_DEF and MTX_SPIN locks, respectively. Re-inline some lock acq/rel code; in the sleep lock case, we only inline the _obtain_lock()s in order to ensure that the inlined code fits into a cache line. In the spin lock case, we inline recursion and actually only perform a function call if we need to spin. This change has been made with the idea that we generally tend to avoid spin locks and that also the spin locks that we do have and are heavily used (i.e. sched_lock) do recurse, and therefore in an effort to reduce function call overhead for some architectures (such as alpha), we inline recursion for this case. Create a new malloc type for the witness code and retire from using the M_DEV type. The new type is called M_WITNESS and is only declared if WITNESS is enabled. Begin cleaning up some machdep/mutex.h code - specifically updated the "optimized" inlined code in alpha/mutex.h and wrote MTX_LOCK_SPIN and MTX_UNLOCK_SPIN asm macros for the i386/mutex.h as we presently need those. Finally, caught up to the interface changes in all sys code. Contributors: jake, jhb, jasone (in no particular order)
2001-02-09 06:11:45 +00:00
mtx_unlock(&mountlist_mtx);
1994-05-24 10:09:53 +00:00
error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
free(xvn, M_TEMP);
return (error);
1994-05-24 10:09:53 +00:00
}
SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE | CTLFLAG_RD |
CTLFLAG_MPSAFE, 0, 0, sysctl_vnode, "S,xvnode",
"");
#endif
static void
unmount_or_warn(struct mount *mp)
{
int error;
error = dounmount(mp, MNT_FORCE, curthread);
if (error != 0) {
printf("unmount of %s failed (", mp->mnt_stat.f_mntonname);
if (error == EBUSY)
printf("BUSY)\n");
else
printf("%d)\n", error);
}
}
/*
* Unmount all filesystems. The list is traversed in reverse order
* of mounting to avoid dependencies.
*/
void
vfs_unmountall(void)
{
struct mount *mp, *tmp;
CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__);
/*
* Since this only runs when rebooting, it is not interlocked.
*/
TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) {
vfs_ref(mp);
/*
* Forcibly unmounting "/dev" before "/" would prevent clean
* unmount of the latter.
*/
if (mp == rootdevmp)
continue;
unmount_or_warn(mp);
}
if (rootdevmp != NULL)
unmount_or_warn(rootdevmp);
1994-05-24 10:09:53 +00:00
}
static void
vfs_deferred_inactive(struct vnode *vp, int lkflags)
{
ASSERT_VI_LOCKED(vp, __func__);
VNASSERT((vp->v_iflag & VI_DEFINACT) == 0, vp, ("VI_DEFINACT still set"));
if ((vp->v_iflag & VI_OWEINACT) == 0) {
vdropl(vp);
return;
}
if (vn_lock(vp, lkflags) == 0) {
VI_LOCK(vp);
vinactive(vp);
VOP_UNLOCK(vp);
vdropl(vp);
return;
}
vdefer_inactive_unlocked(vp);
}
static int
vfs_periodic_inactive_filter(struct vnode *vp, void *arg)
{
return (vp->v_iflag & VI_DEFINACT);
}
static void __noinline
vfs_periodic_inactive(struct mount *mp, int flags)
{
struct vnode *vp, *mvp;
int lkflags;
lkflags = LK_EXCLUSIVE | LK_INTERLOCK;
if (flags != MNT_WAIT)
lkflags |= LK_NOWAIT;
MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_inactive_filter, NULL) {
if ((vp->v_iflag & VI_DEFINACT) == 0) {
VI_UNLOCK(vp);
continue;
}
vp->v_iflag &= ~VI_DEFINACT;
vfs_deferred_inactive(vp, lkflags);
}
}
static inline bool
vfs_want_msync(struct vnode *vp)
{
struct vm_object *obj;
/*
* This test may be performed without any locks held.
* We rely on vm_object's type stability.
*/
if (vp->v_vflag & VV_NOSYNC)
return (false);
obj = vp->v_object;
return (obj != NULL && vm_object_mightbedirty(obj));
}
static int
vfs_periodic_msync_inactive_filter(struct vnode *vp, void *arg __unused)
{
if (vp->v_vflag & VV_NOSYNC)
return (false);
if (vp->v_iflag & VI_DEFINACT)
return (true);
return (vfs_want_msync(vp));
}
static void __noinline
vfs_periodic_msync_inactive(struct mount *mp, int flags)
{
struct vnode *vp, *mvp;
struct vm_object *obj;
int lkflags, objflags;
bool seen_defer;
lkflags = LK_EXCLUSIVE | LK_INTERLOCK;
if (flags != MNT_WAIT) {
lkflags |= LK_NOWAIT;
objflags = OBJPC_NOSYNC;
} else {
objflags = OBJPC_SYNC;
}
MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_msync_inactive_filter, NULL) {
seen_defer = false;
if (vp->v_iflag & VI_DEFINACT) {
vp->v_iflag &= ~VI_DEFINACT;
seen_defer = true;
}
if (!vfs_want_msync(vp)) {
if (seen_defer)
vfs_deferred_inactive(vp, lkflags);
else
VI_UNLOCK(vp);
continue;
}
if (vget(vp, lkflags) == 0) {
obj = vp->v_object;
if (obj != NULL && (vp->v_vflag & VV_NOSYNC) == 0) {
VM_OBJECT_WLOCK(obj);
vm_object_page_clean(obj, 0, 0, objflags);
VM_OBJECT_WUNLOCK(obj);
}
vput(vp);
if (seen_defer)
vdrop(vp);
} else {
if (seen_defer)
vdefer_inactive_unlocked(vp);
}
}
}
void
vfs_periodic(struct mount *mp, int flags)
{
CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
if ((mp->mnt_kern_flag & MNTK_NOMSYNC) != 0)
vfs_periodic_inactive(mp, flags);
else
vfs_periodic_msync_inactive(mp, flags);
}
static void
destroy_vpollinfo_free(struct vpollinfo *vi)
{
knlist_destroy(&vi->vpi_selinfo.si_note);
mtx_destroy(&vi->vpi_lock);
free(vi, M_VNODEPOLL);
}
static void
destroy_vpollinfo(struct vpollinfo *vi)
{
knlist_clear(&vi->vpi_selinfo.si_note, 1);
seldrain(&vi->vpi_selinfo);
destroy_vpollinfo_free(vi);
}
/*
* Initialize per-vnode helper structure to hold poll-related state.
*/
void
v_addpollinfo(struct vnode *vp)
{
struct vpollinfo *vi;
if (vp->v_pollinfo != NULL)
return;
vi = malloc(sizeof(*vi), M_VNODEPOLL, M_WAITOK | M_ZERO);
mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock,
vfs_knlunlock, vfs_knl_assert_lock);
VI_LOCK(vp);
if (vp->v_pollinfo != NULL) {
VI_UNLOCK(vp);
destroy_vpollinfo_free(vi);
return;
}
vp->v_pollinfo = vi;
VI_UNLOCK(vp);
}
/*
* Record a process's interest in events which might happen to
* a vnode. Because poll uses the historic select-style interface
* internally, this routine serves as both the ``check for any
* pending events'' and the ``record my interest in future events''
* functions. (These are done together, while the lock is held,
* to avoid race conditions.)
*/
int
vn_pollrecord(struct vnode *vp, struct thread *td, int events)
{
v_addpollinfo(vp);
mtx_lock(&vp->v_pollinfo->vpi_lock);
if (vp->v_pollinfo->vpi_revents & events) {
/*
* This leaves events we are not interested
* in available for the other process which
* which presumably had requested them
* (otherwise they would never have been
* recorded).
*/
events &= vp->v_pollinfo->vpi_revents;
vp->v_pollinfo->vpi_revents &= ~events;
mtx_unlock(&vp->v_pollinfo->vpi_lock);
return (events);
}
vp->v_pollinfo->vpi_events |= events;
selrecord(td, &vp->v_pollinfo->vpi_selinfo);
mtx_unlock(&vp->v_pollinfo->vpi_lock);
return (0);
}
/*
* Routine to create and manage a filesystem syncer vnode.
*/
2002-03-19 21:25:46 +00:00
#define sync_close ((int (*)(struct vop_close_args *))nullop)
static int sync_fsync(struct vop_fsync_args *);
static int sync_inactive(struct vop_inactive_args *);
static int sync_reclaim(struct vop_reclaim_args *);
static struct vop_vector sync_vnodeops = {
.vop_bypass = VOP_EOPNOTSUPP,
.vop_close = sync_close, /* close */
.vop_fsync = sync_fsync, /* fsync */
.vop_inactive = sync_inactive, /* inactive */
.vop_need_inactive = vop_stdneed_inactive, /* need_inactive */
.vop_reclaim = sync_reclaim, /* reclaim */
.vop_lock1 = vop_stdlock, /* lock */
.vop_unlock = vop_stdunlock, /* unlock */
.vop_islocked = vop_stdislocked, /* islocked */
};
VFS_VOP_VECTOR_REGISTER(sync_vnodeops);
/*
* Create a new filesystem syncer vnode for the specified mount point.
*/
void
vfs_allocate_syncvnode(struct mount *mp)
{
struct vnode *vp;
struct bufobj *bo;
static long start, incr, next;
int error;
/* Allocate a new vnode */
error = getnewvnode("syncer", mp, &sync_vnodeops, &vp);
if (error != 0)
panic("vfs_allocate_syncvnode: getnewvnode() failed");
vp->v_type = VNON;
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
vp->v_vflag |= VV_FORCEINSMQ;
error = insmntque(vp, mp);
if (error != 0)
panic("vfs_allocate_syncvnode: insmntque() failed");
vp->v_vflag &= ~VV_FORCEINSMQ;
VOP_UNLOCK(vp);
/*
* Place the vnode onto the syncer worklist. We attempt to
* scatter them about on the list so that they will go off
* at evenly distributed times even if all the filesystems
* are mounted at once.
*/
next += incr;
if (next == 0 || next > syncer_maxdelay) {
start /= 2;
incr /= 2;
if (start == 0) {
start = syncer_maxdelay / 2;
incr = syncer_maxdelay;
}
next = start;
}
bo = &vp->v_bufobj;
BO_LOCK(bo);
vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0);
/* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
mtx_lock(&sync_mtx);
sync_vnode_count++;
if (mp->mnt_syncer == NULL) {
mp->mnt_syncer = vp;
vp = NULL;
}
mtx_unlock(&sync_mtx);
BO_UNLOCK(bo);
if (vp != NULL) {
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
vgone(vp);
vput(vp);
}
}
void
vfs_deallocate_syncvnode(struct mount *mp)
{
struct vnode *vp;
mtx_lock(&sync_mtx);
vp = mp->mnt_syncer;
if (vp != NULL)
mp->mnt_syncer = NULL;
mtx_unlock(&sync_mtx);
if (vp != NULL)
vrele(vp);
}
/*
* Do a lazy sync of the filesystem.
*/
1998-12-21 23:38:33 +00:00
static int
sync_fsync(struct vop_fsync_args *ap)
{
struct vnode *syncvp = ap->a_vp;
struct mount *mp = syncvp->v_mount;
int error, save;
struct bufobj *bo;
/*
* We only need to do something if this is a lazy evaluation.
*/
if (ap->a_waitfor != MNT_LAZY)
return (0);
/*
* Move ourselves to the back of the sync list.
*/
bo = &syncvp->v_bufobj;
BO_LOCK(bo);
vn_syncer_add_to_worklist(bo, syncdelay);
BO_UNLOCK(bo);
/*
* Walk the list of vnodes pushing all that are dirty and
* not already on the sync list.
*/
if (vfs_busy(mp, MBF_NOWAIT) != 0)
return (0);
if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
vfs_unbusy(mp);
return (0);
}
save = curthread_pflags_set(TDP_SYNCIO);
/*
* The filesystem at hand may be idle with free vnodes stored in the
* batch. Return them instead of letting them stay there indefinitely.
*/
vfs_periodic(mp, MNT_NOWAIT);
error = VFS_SYNC(mp, MNT_LAZY);
curthread_pflags_restore(save);
vn_finished_write(mp);
vfs_unbusy(mp);
return (error);
}
/*
* The syncer vnode is no referenced.
*/
1998-12-21 23:38:33 +00:00
static int
sync_inactive(struct vop_inactive_args *ap)
{
vgone(ap->a_vp);
return (0);
}
/*
* The syncer vnode is no longer needed and is being decommissioned.
*
* Modifications to the worklist must be protected by sync_mtx.
*/
1998-12-21 23:38:33 +00:00
static int
sync_reclaim(struct vop_reclaim_args *ap)
{
struct vnode *vp = ap->a_vp;
struct bufobj *bo;
bo = &vp->v_bufobj;
BO_LOCK(bo);
mtx_lock(&sync_mtx);
if (vp->v_mount->mnt_syncer == vp)
vp->v_mount->mnt_syncer = NULL;
if (bo->bo_flag & BO_ONWORKLST) {
LIST_REMOVE(bo, bo_synclist);
syncer_worklist_len--;
sync_vnode_count--;
bo->bo_flag &= ~BO_ONWORKLST;
}
mtx_unlock(&sync_mtx);
BO_UNLOCK(bo);
return (0);
}
int
vn_need_pageq_flush(struct vnode *vp)
{
struct vm_object *obj;
obj = vp->v_object;
return (obj != NULL && (vp->v_vflag & VV_NOSYNC) == 0 &&
vm_object_mightbedirty(obj));
}
/*
* Check if vnode represents a disk device
*/
bool
vn_isdisk_error(struct vnode *vp, int *errp)
{
2003-10-12 14:04:39 +00:00
int error;
if (vp->v_type != VCHR) {
error = ENOTBLK;
goto out;
}
2003-10-12 14:04:39 +00:00
error = 0;
dev_lock();
if (vp->v_rdev == NULL)
2003-10-12 14:04:39 +00:00
error = ENXIO;
else if (vp->v_rdev->si_devsw == NULL)
error = ENXIO;
else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK))
2003-10-12 14:04:39 +00:00
error = ENOTBLK;
dev_unlock();
out:
*errp = error;
2003-10-12 14:04:39 +00:00
return (error == 0);
}
bool
vn_isdisk(struct vnode *vp)
{
int error;
return (vn_isdisk_error(vp, &error));
}
/*
* VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see
* the comment above cache_fplookup for details.
*/
int
vaccess_vexec_smr(mode_t file_mode, uid_t file_uid, gid_t file_gid, struct ucred *cred)
{
int error;
VFS_SMR_ASSERT_ENTERED();
/* Check the owner. */
if (cred->cr_uid == file_uid) {
if (file_mode & S_IXUSR)
return (0);
goto out_error;
}
/* Otherwise, check the groups (first match) */
if (groupmember(file_gid, cred)) {
if (file_mode & S_IXGRP)
return (0);
goto out_error;
}
/* Otherwise, check everyone else. */
if (file_mode & S_IXOTH)
return (0);
out_error:
/*
* Permission check failed, but it is possible denial will get overwritten
* (e.g., when root is traversing through a 700 directory owned by someone
* else).
*
* vaccess() calls priv_check_cred which in turn can descent into MAC
* modules overriding this result. It's quite unclear what semantics
* are allowed for them to operate, thus for safety we don't call them
* from within the SMR section. This also means if any such modules
* are present, we have to let the regular lookup decide.
*/
error = priv_check_cred_vfs_lookup_nomac(cred);
switch (error) {
case 0:
return (0);
case EAGAIN:
/*
* MAC modules present.
*/
return (EAGAIN);
case EPERM:
return (EACCES);
default:
return (error);
}
}
/*
2002-05-16 21:28:32 +00:00
* Common filesystem object access control check routine. Accepts a
* vnode's type, "mode", uid and gid, requested access mode, and credentials.
* Returns 0 on success, or an errno on failure.
*/
int
vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid,
accmode_t accmode, struct ucred *cred)
{
accmode_t dac_granted;
accmode_t priv_granted;
KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0,
("invalid bit in accmode"));
KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE),
2010-04-03 11:19:20 +00:00
("VAPPEND without VWRITE"));
/*
* Look for a normal, non-privileged way to access the file/directory
* as requested. If it exists, go with that.
*/
dac_granted = 0;
/* Check the owner. */
if (cred->cr_uid == file_uid) {
dac_granted |= VADMIN;
if (file_mode & S_IXUSR)
dac_granted |= VEXEC;
if (file_mode & S_IRUSR)
dac_granted |= VREAD;
if (file_mode & S_IWUSR)
dac_granted |= (VWRITE | VAPPEND);
if ((accmode & dac_granted) == accmode)
return (0);
goto privcheck;
}
/* Otherwise, check the groups (first match) */
if (groupmember(file_gid, cred)) {
if (file_mode & S_IXGRP)
dac_granted |= VEXEC;
if (file_mode & S_IRGRP)
dac_granted |= VREAD;
if (file_mode & S_IWGRP)
dac_granted |= (VWRITE | VAPPEND);
if ((accmode & dac_granted) == accmode)
return (0);
goto privcheck;
}
/* Otherwise, check everyone else. */
if (file_mode & S_IXOTH)
dac_granted |= VEXEC;
if (file_mode & S_IROTH)
dac_granted |= VREAD;
if (file_mode & S_IWOTH)
dac_granted |= (VWRITE | VAPPEND);
if ((accmode & dac_granted) == accmode)
return (0);
privcheck:
/*
* Build a privilege mask to determine if the set of privileges
* satisfies the requirements when combined with the granted mask
* from above. For each privilege, if the privilege is required,
* bitwise or the request type onto the priv_granted mask.
*/
priv_granted = 0;
if (type == VDIR) {
/*
* For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC
* requests, instead of PRIV_VFS_EXEC.
*/
if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
!priv_check_cred(cred, PRIV_VFS_LOOKUP))
priv_granted |= VEXEC;
} else {
/*
* Ensure that at least one execute bit is on. Otherwise,
* a privileged user will always succeed, and we don't want
* this to happen unless the file really is executable.
*/
if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
(file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 &&
!priv_check_cred(cred, PRIV_VFS_EXEC))
priv_granted |= VEXEC;
}
if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) &&
!priv_check_cred(cred, PRIV_VFS_READ))
priv_granted |= VREAD;
if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
!priv_check_cred(cred, PRIV_VFS_WRITE))
priv_granted |= (VWRITE | VAPPEND);
if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
!priv_check_cred(cred, PRIV_VFS_ADMIN))
priv_granted |= VADMIN;
if ((accmode & (priv_granted | dac_granted)) == accmode) {
return (0);
}
return ((accmode & VADMIN) ? EPERM : EACCES);
}
/*
* Credential check based on process requesting service, and per-attribute
* permissions.
*/
int
extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred,
struct thread *td, accmode_t accmode)
{
/*
* Kernel-invoked always succeeds.
*/
if (cred == NOCRED)
return (0);
/*
* Do not allow privileged processes in jail to directly manipulate
* system attributes.
*/
switch (attrnamespace) {
case EXTATTR_NAMESPACE_SYSTEM:
/* Potentially should be: return (EPERM); */
return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM));
case EXTATTR_NAMESPACE_USER:
return (VOP_ACCESS(vp, accmode, cred, td));
default:
return (EPERM);
}
}
#ifdef DEBUG_VFS_LOCKS
/*
* This only exists to suppress warnings from unlocked specfs accesses. It is
* no longer ok to have an unlocked VFS.
*/
#define IGNORE_LOCK(vp) (KERNEL_PANICKED() || (vp) == NULL || \
(vp)->v_type == VCHR || (vp)->v_type == VBAD)
int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */
2010-11-14 16:10:15 +00:00
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0,
"Drop into debugger on lock violation");
2004-01-05 23:40:46 +00:00
int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */
2010-11-14 16:10:15 +00:00
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex,
0, "Check for interlock across VOPs");
int vfs_badlock_print = 1; /* Print lock violations. */
2010-11-14 16:10:15 +00:00
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print,
0, "Print lock violations");
int vfs_badlock_vnode = 1; /* Print vnode details on lock violations. */
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_vnode, CTLFLAG_RW, &vfs_badlock_vnode,
0, "Print vnode details on lock violations");
#ifdef KDB
int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. */
2010-11-14 16:10:15 +00:00
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW,
&vfs_badlock_backtrace, 0, "Print backtrace at lock violations");
#endif
static void
vfs_badlock(const char *msg, const char *str, struct vnode *vp)
{
#ifdef KDB
if (vfs_badlock_backtrace)
kdb_backtrace();
#endif
if (vfs_badlock_vnode)
vn_printf(vp, "vnode ");
if (vfs_badlock_print)
printf("%s: %p %s\n", str, (void *)vp, msg);
if (vfs_badlock_ddb)
kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
}
void
assert_vi_locked(struct vnode *vp, const char *str)
{
if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp)))
vfs_badlock("interlock is not locked but should be", str, vp);
}
void
assert_vi_unlocked(struct vnode *vp, const char *str)
{
if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp)))
vfs_badlock("interlock is locked but should not be", str, vp);
}
void
assert_vop_locked(struct vnode *vp, const char *str)
{
int locked;
if (!IGNORE_LOCK(vp)) {
locked = VOP_ISLOCKED(vp);
if (locked == 0 || locked == LK_EXCLOTHER)
vfs_badlock("is not locked but should be", str, vp);
}
}
void
assert_vop_unlocked(struct vnode *vp, const char *str)
{
if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE)
vfs_badlock("is locked but should not be", str, vp);
}
void
assert_vop_elocked(struct vnode *vp, const char *str)
{
if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
vfs_badlock("is not exclusive locked but should be", str, vp);
}
#endif /* DEBUG_VFS_LOCKS */
void
vop_rename_fail(struct vop_rename_args *ap)
{
if (ap->a_tvp != NULL)
vput(ap->a_tvp);
if (ap->a_tdvp == ap->a_tvp)
vrele(ap->a_tdvp);
else
vput(ap->a_tdvp);
vrele(ap->a_fdvp);
vrele(ap->a_fvp);
}
void
vop_rename_pre(void *ap)
{
struct vop_rename_args *a = ap;
#ifdef DEBUG_VFS_LOCKS
if (a->a_tvp)
ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
/* Check the source (from). */
if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock &&
(a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock))
ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock)
ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");
/* Check the target. */
if (a->a_tvp)
ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
#endif
/*
* It may be tempting to add vn_seqc_write_begin/end calls here and
* in vop_rename_post but that's not going to work out since some
* filesystems relookup vnodes mid-rename. This is probably a bug.
*
* For now filesystems are expected to do the relevant calls after they
* decide what vnodes to operate on.
*/
if (a->a_tdvp != a->a_fdvp)
vhold(a->a_fdvp);
if (a->a_tvp != a->a_fvp)
vhold(a->a_fvp);
vhold(a->a_tdvp);
if (a->a_tvp)
vhold(a->a_tvp);
}
#ifdef DEBUG_VFS_LOCKS
void
vop_fplookup_vexec_debugpre(void *ap __unused)
{
VFS_SMR_ASSERT_ENTERED();
}
void
vop_fplookup_vexec_debugpost(void *ap __unused, int rc __unused)
{
VFS_SMR_ASSERT_ENTERED();
}
void
vop_fplookup_symlink_debugpre(void *ap __unused)
{
VFS_SMR_ASSERT_ENTERED();
}
void
vop_fplookup_symlink_debugpost(void *ap __unused, int rc __unused)
{
VFS_SMR_ASSERT_ENTERED();
}
void
vop_strategy_debugpre(void *ap)
{
2004-01-05 23:40:46 +00:00
struct vop_strategy_args *a;
struct buf *bp;
2004-01-05 23:40:46 +00:00
a = ap;
bp = a->a_bp;
/*
* Cluster ops lock their component buffers but not the IO container.
*/
if ((bp->b_flags & B_CLUSTER) != 0)
return;
if (!KERNEL_PANICKED() && !BUF_ISLOCKED(bp)) {
if (vfs_badlock_print)
printf(
2004-01-05 23:40:46 +00:00
"VOP_STRATEGY: bp is not locked but should be\n");
if (vfs_badlock_ddb)
kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
}
}
void
vop_lock_debugpre(void *ap)
{
struct vop_lock1_args *a = ap;
if ((a->a_flags & LK_INTERLOCK) == 0)
ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
else
ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK");
}
void
vop_lock_debugpost(void *ap, int rc)
{
struct vop_lock1_args *a = ap;
ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0)
ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
}
void
vop_unlock_debugpre(void *ap)
{
struct vop_unlock_args *a = ap;
ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK");
}
void
vop_need_inactive_debugpre(void *ap)
{
struct vop_need_inactive_args *a = ap;
ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE");
}
void
vop_need_inactive_debugpost(void *ap, int rc)
{
struct vop_need_inactive_args *a = ap;
ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE");
}
#endif
void
vop_create_pre(void *ap)
{
struct vop_create_args *a;
struct vnode *dvp;
a = ap;
dvp = a->a_dvp;
vn_seqc_write_begin(dvp);
}
void
vop_create_post(void *ap, int rc)
{
struct vop_create_args *a;
struct vnode *dvp;
a = ap;
dvp = a->a_dvp;
vn_seqc_write_end(dvp);
if (!rc)
VFS_KNOTE_LOCKED(dvp, NOTE_WRITE);
}
void
vop_whiteout_pre(void *ap)
{
struct vop_whiteout_args *a;
struct vnode *dvp;
a = ap;
dvp = a->a_dvp;
vn_seqc_write_begin(dvp);
}
void
vop_whiteout_post(void *ap, int rc)
{
struct vop_whiteout_args *a;
struct vnode *dvp;
a = ap;
dvp = a->a_dvp;
vn_seqc_write_end(dvp);
}
void
vop_deleteextattr_pre(void *ap)
{
struct vop_deleteextattr_args *a;
struct vnode *vp;
a = ap;
vp = a->a_vp;
vn_seqc_write_begin(vp);
}
void
vop_deleteextattr_post(void *ap, int rc)
{
struct vop_deleteextattr_args *a;
struct vnode *vp;
a = ap;
vp = a->a_vp;
vn_seqc_write_end(vp);
if (!rc)
VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
}
void
vop_link_pre(void *ap)
{
struct vop_link_args *a;
struct vnode *vp, *tdvp;
a = ap;
vp = a->a_vp;
tdvp = a->a_tdvp;
vn_seqc_write_begin(vp);
vn_seqc_write_begin(tdvp);
}
void
vop_link_post(void *ap, int rc)
{
struct vop_link_args *a;
struct vnode *vp, *tdvp;
a = ap;
vp = a->a_vp;
tdvp = a->a_tdvp;
vn_seqc_write_end(vp);
vn_seqc_write_end(tdvp);
if (!rc) {
VFS_KNOTE_LOCKED(vp, NOTE_LINK);
VFS_KNOTE_LOCKED(tdvp, NOTE_WRITE);
}
}
void
vop_mkdir_pre(void *ap)
{
struct vop_mkdir_args *a;
struct vnode *dvp;
a = ap;
dvp = a->a_dvp;
vn_seqc_write_begin(dvp);
}
void
vop_mkdir_post(void *ap, int rc)
{
struct vop_mkdir_args *a;
struct vnode *dvp;
a = ap;
dvp = a->a_dvp;
vn_seqc_write_end(dvp);
if (!rc)
VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK);
}
#ifdef DEBUG_VFS_LOCKS
void
vop_mkdir_debugpost(void *ap, int rc)
{
struct vop_mkdir_args *a;
a = ap;
if (!rc)
cache_validate(a->a_dvp, *a->a_vpp, a->a_cnp);
}
#endif
void
vop_mknod_pre(void *ap)
{
struct vop_mknod_args *a;
struct vnode *dvp;
a = ap;
dvp = a->a_dvp;
vn_seqc_write_begin(dvp);
}
void
vop_mknod_post(void *ap, int rc)
{
struct vop_mknod_args *a;
struct vnode *dvp;
a = ap;
dvp = a->a_dvp;
vn_seqc_write_end(dvp);
if (!rc)
VFS_KNOTE_LOCKED(dvp, NOTE_WRITE);
}
void
vop_reclaim_post(void *ap, int rc)
{
struct vop_reclaim_args *a;
struct vnode *vp;
a = ap;
vp = a->a_vp;
ASSERT_VOP_IN_SEQC(vp);
if (!rc)
VFS_KNOTE_LOCKED(vp, NOTE_REVOKE);
}
void
vop_remove_pre(void *ap)
{
struct vop_remove_args *a;
struct vnode *dvp, *vp;
a = ap;
dvp = a->a_dvp;
vp = a->a_vp;
vn_seqc_write_begin(dvp);
vn_seqc_write_begin(vp);
}
void
vop_remove_post(void *ap, int rc)
{
struct vop_remove_args *a;
struct vnode *dvp, *vp;
a = ap;
dvp = a->a_dvp;
vp = a->a_vp;
vn_seqc_write_end(dvp);
vn_seqc_write_end(vp);
if (!rc) {
VFS_KNOTE_LOCKED(dvp, NOTE_WRITE);
VFS_KNOTE_LOCKED(vp, NOTE_DELETE);
}
}
void
vop_rename_post(void *ap, int rc)
{
struct vop_rename_args *a = ap;
long hint;
if (!rc) {
hint = NOTE_WRITE;
if (a->a_fdvp == a->a_tdvp) {
if (a->a_tvp != NULL && a->a_tvp->v_type == VDIR)
hint |= NOTE_LINK;
VFS_KNOTE_UNLOCKED(a->a_fdvp, hint);
VFS_KNOTE_UNLOCKED(a->a_tdvp, hint);
} else {
hint |= NOTE_EXTEND;
if (a->a_fvp->v_type == VDIR)
hint |= NOTE_LINK;
VFS_KNOTE_UNLOCKED(a->a_fdvp, hint);
if (a->a_fvp->v_type == VDIR && a->a_tvp != NULL &&
a->a_tvp->v_type == VDIR)
hint &= ~NOTE_LINK;
VFS_KNOTE_UNLOCKED(a->a_tdvp, hint);
}
VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME);
if (a->a_tvp)
VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE);
}
if (a->a_tdvp != a->a_fdvp)
vdrop(a->a_fdvp);
if (a->a_tvp != a->a_fvp)
vdrop(a->a_fvp);
vdrop(a->a_tdvp);
if (a->a_tvp)
vdrop(a->a_tvp);
}
void
vop_rmdir_pre(void *ap)
{
struct vop_rmdir_args *a;
struct vnode *dvp, *vp;
a = ap;
dvp = a->a_dvp;
vp = a->a_vp;
vn_seqc_write_begin(dvp);
vn_seqc_write_begin(vp);
}
void
vop_rmdir_post(void *ap, int rc)
{
struct vop_rmdir_args *a;
struct vnode *dvp, *vp;
a = ap;
dvp = a->a_dvp;
vp = a->a_vp;
vn_seqc_write_end(dvp);
vn_seqc_write_end(vp);
if (!rc) {
VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK);
VFS_KNOTE_LOCKED(vp, NOTE_DELETE);
}
}
void
vop_setattr_pre(void *ap)
{
struct vop_setattr_args *a;
struct vnode *vp;
a = ap;
vp = a->a_vp;
vn_seqc_write_begin(vp);
}
void
vop_setattr_post(void *ap, int rc)
{
struct vop_setattr_args *a;
struct vnode *vp;
a = ap;
vp = a->a_vp;
vn_seqc_write_end(vp);
if (!rc)
VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB);
}
void
vop_setacl_pre(void *ap)
{
struct vop_setacl_args *a;
struct vnode *vp;
a = ap;
vp = a->a_vp;
vn_seqc_write_begin(vp);
}
void
vop_setacl_post(void *ap, int rc __unused)
{
struct vop_setacl_args *a;
struct vnode *vp;
a = ap;
vp = a->a_vp;
vn_seqc_write_end(vp);
}
void
vop_setextattr_pre(void *ap)
{
struct vop_setextattr_args *a;
struct vnode *vp;
a = ap;
vp = a->a_vp;
vn_seqc_write_begin(vp);
}
void
vop_setextattr_post(void *ap, int rc)
{
struct vop_setextattr_args *a;
struct vnode *vp;
a = ap;
vp = a->a_vp;
vn_seqc_write_end(vp);
if (!rc)
VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB);
}
void
vop_symlink_pre(void *ap)
{
struct vop_symlink_args *a;
struct vnode *dvp;
a = ap;
dvp = a->a_dvp;
vn_seqc_write_begin(dvp);
}
void
vop_symlink_post(void *ap, int rc)
{
struct vop_symlink_args *a;
struct vnode *dvp;
a = ap;
dvp = a->a_dvp;
vn_seqc_write_end(dvp);
if (!rc)
VFS_KNOTE_LOCKED(dvp, NOTE_WRITE);
}
void
vop_open_post(void *ap, int rc)
{
struct vop_open_args *a = ap;
if (!rc)
VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN);
}
void
vop_close_post(void *ap, int rc)
{
struct vop_close_args *a = ap;
if (!rc && (a->a_cred != NOCRED || /* filter out revokes */
!VN_IS_DOOMED(a->a_vp))) {
VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) != 0 ?
NOTE_CLOSE_WRITE : NOTE_CLOSE);
}
}
void
vop_read_post(void *ap, int rc)
{
struct vop_read_args *a = ap;
if (!rc)
VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ);
}
void
vop_read_pgcache_post(void *ap, int rc)
{
struct vop_read_pgcache_args *a = ap;
if (!rc)
VFS_KNOTE_UNLOCKED(a->a_vp, NOTE_READ);
}
void
vop_readdir_post(void *ap, int rc)
{
struct vop_readdir_args *a = ap;
if (!rc)
VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ);
}
static struct knlist fs_knlist;
static void
vfs_event_init(void *arg)
{
knlist_init_mtx(&fs_knlist, NULL);
}
/* XXX - correct order? */
SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL);
void
vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused)
{
KNOTE_UNLOCKED(&fs_knlist, event);
}
static int filt_fsattach(struct knote *kn);
static void filt_fsdetach(struct knote *kn);
static int filt_fsevent(struct knote *kn, long hint);
struct filterops fs_filtops = {
.f_isfd = 0,
.f_attach = filt_fsattach,
.f_detach = filt_fsdetach,
.f_event = filt_fsevent
};
static int
filt_fsattach(struct knote *kn)
{
kn->kn_flags |= EV_CLEAR;
knlist_add(&fs_knlist, kn, 0);
return (0);
}
static void
filt_fsdetach(struct knote *kn)
{
knlist_remove(&fs_knlist, kn, 0);
}
static int
filt_fsevent(struct knote *kn, long hint)
{
kn->kn_fflags |= kn->kn_sfflags & hint;
return (kn->kn_fflags != 0);
}
static int
sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS)
{
struct vfsidctl vc;
int error;
struct mount *mp;
error = SYSCTL_IN(req, &vc, sizeof(vc));
if (error)
return (error);
if (vc.vc_vers != VFS_CTL_VERS1)
return (EINVAL);
mp = vfs_getvfs(&vc.vc_fsid);
if (mp == NULL)
return (ENOENT);
/* ensure that a specific sysctl goes to the right filesystem. */
if (strcmp(vc.vc_fstypename, "*") != 0 &&
strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) {
vfs_rel(mp);
return (EINVAL);
}
VCTLTOREQ(&vc, req);
error = VFS_SYSCTL(mp, vc.vc_op, req);
vfs_rel(mp);
return (error);
}
SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | CTLFLAG_WR,
NULL, 0, sysctl_vfs_ctl, "",
"Sysctl by fsid");
/*
* Function to initialize a va_filerev field sensibly.
* XXX: Wouldn't a random number make a lot more sense ??
*/
u_quad_t
init_va_filerev(void)
{
struct bintime bt;
getbinuptime(&bt);
return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL));
}
static int filt_vfsread(struct knote *kn, long hint);
static int filt_vfswrite(struct knote *kn, long hint);
static int filt_vfsvnode(struct knote *kn, long hint);
static void filt_vfsdetach(struct knote *kn);
static struct filterops vfsread_filtops = {
.f_isfd = 1,
.f_detach = filt_vfsdetach,
.f_event = filt_vfsread
};
static struct filterops vfswrite_filtops = {
.f_isfd = 1,
.f_detach = filt_vfsdetach,
.f_event = filt_vfswrite
};
static struct filterops vfsvnode_filtops = {
.f_isfd = 1,
.f_detach = filt_vfsdetach,
.f_event = filt_vfsvnode
};
static void
vfs_knllock(void *arg)
{
struct vnode *vp = arg;
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
}
static void
vfs_knlunlock(void *arg)
{
struct vnode *vp = arg;
VOP_UNLOCK(vp);
}
static void
vfs_knl_assert_lock(void *arg, int what)
{
#ifdef DEBUG_VFS_LOCKS
struct vnode *vp = arg;
if (what == LA_LOCKED)
ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked");
else
ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked");
#endif
}
int
vfs_kqfilter(struct vop_kqfilter_args *ap)
{
struct vnode *vp = ap->a_vp;
struct knote *kn = ap->a_kn;
struct knlist *knl;
switch (kn->kn_filter) {
case EVFILT_READ:
kn->kn_fop = &vfsread_filtops;
break;
case EVFILT_WRITE:
kn->kn_fop = &vfswrite_filtops;
break;
case EVFILT_VNODE:
kn->kn_fop = &vfsvnode_filtops;
break;
default:
return (EINVAL);
}
kn->kn_hook = (caddr_t)vp;
v_addpollinfo(vp);
if (vp->v_pollinfo == NULL)
return (ENOMEM);
knl = &vp->v_pollinfo->vpi_selinfo.si_note;
vhold(vp);
knlist_add(knl, kn, 0);
return (0);
}
/*
* Detach knote from vnode
*/
static void
filt_vfsdetach(struct knote *kn)
{
struct vnode *vp = (struct vnode *)kn->kn_hook;
KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo"));
knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
vdrop(vp);
}
/*ARGSUSED*/
static int
filt_vfsread(struct knote *kn, long hint)
{
struct vnode *vp = (struct vnode *)kn->kn_hook;
struct vattr va;
int res;
/*
* filesystem is gone, so set the EOF flag and schedule
* the knote for deletion.
*/
if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) {
VI_LOCK(vp);
kn->kn_flags |= (EV_EOF | EV_ONESHOT);
VI_UNLOCK(vp);
return (1);
}
if (VOP_GETATTR(vp, &va, curthread->td_ucred))
return (0);
VI_LOCK(vp);
kn->kn_data = va.va_size - kn->kn_fp->f_offset;
res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0;
VI_UNLOCK(vp);
return (res);
}
/*ARGSUSED*/
static int
filt_vfswrite(struct knote *kn, long hint)
{
struct vnode *vp = (struct vnode *)kn->kn_hook;
VI_LOCK(vp);
/*
* filesystem is gone, so set the EOF flag and schedule
* the knote for deletion.
*/
if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD))
kn->kn_flags |= (EV_EOF | EV_ONESHOT);
kn->kn_data = 0;
VI_UNLOCK(vp);
return (1);
}
static int
filt_vfsvnode(struct knote *kn, long hint)
{
struct vnode *vp = (struct vnode *)kn->kn_hook;
int res;
VI_LOCK(vp);
if (kn->kn_sfflags & hint)
kn->kn_fflags |= hint;
if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) {
kn->kn_flags |= EV_EOF;
VI_UNLOCK(vp);
return (1);
}
res = (kn->kn_fflags != 0);
VI_UNLOCK(vp);
return (res);
}
/*
* Returns whether the directory is empty or not.
* If it is empty, the return value is 0; otherwise
* the return value is an error value (which may
* be ENOTEMPTY).
*/
int
vfs_emptydir(struct vnode *vp)
{
struct uio uio;
struct iovec iov;
struct dirent *dirent, *dp, *endp;
int error, eof;
error = 0;
eof = 0;
ASSERT_VOP_LOCKED(vp, "vfs_emptydir");
dirent = malloc(sizeof(struct dirent), M_TEMP, M_WAITOK);
iov.iov_base = dirent;
iov.iov_len = sizeof(struct dirent);
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_offset = 0;
uio.uio_resid = sizeof(struct dirent);
uio.uio_segflg = UIO_SYSSPACE;
uio.uio_rw = UIO_READ;
uio.uio_td = curthread;
while (eof == 0 && error == 0) {
error = VOP_READDIR(vp, &uio, curthread->td_ucred, &eof,
NULL, NULL);
if (error != 0)
break;
endp = (void *)((uint8_t *)dirent +
sizeof(struct dirent) - uio.uio_resid);
for (dp = dirent; dp < endp;
dp = (void *)((uint8_t *)dp + GENERIC_DIRSIZ(dp))) {
if (dp->d_type == DT_WHT)
continue;
if (dp->d_namlen == 0)
continue;
if (dp->d_type != DT_DIR &&
dp->d_type != DT_UNKNOWN) {
error = ENOTEMPTY;
break;
}
if (dp->d_namlen > 2) {
error = ENOTEMPTY;
break;
}
if (dp->d_namlen == 1 &&
dp->d_name[0] != '.') {
error = ENOTEMPTY;
break;
}
if (dp->d_namlen == 2 &&
dp->d_name[1] != '.') {
error = ENOTEMPTY;
break;
}
uio.uio_resid = sizeof(struct dirent);
}
}
free(dirent, M_TEMP);
return (error);
}
int
vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off)
{
int error;
if (dp->d_reclen > ap->a_uio->uio_resid)
return (ENAMETOOLONG);
error = uiomove(dp, dp->d_reclen, ap->a_uio);
if (error) {
if (ap->a_ncookies != NULL) {
if (ap->a_cookies != NULL)
free(ap->a_cookies, M_TEMP);
ap->a_cookies = NULL;
*ap->a_ncookies = 0;
}
return (error);
}
if (ap->a_ncookies == NULL)
return (0);
KASSERT(ap->a_cookies,
("NULL ap->a_cookies value with non-NULL ap->a_ncookies!"));
*ap->a_cookies = realloc(*ap->a_cookies,
(*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO);
(*ap->a_cookies)[*ap->a_ncookies] = off;
*ap->a_ncookies += 1;
return (0);
}
/*
* The purpose of this routine is to remove granularity from accmode_t,
* reducing it into standard unix access bits - VEXEC, VREAD, VWRITE,
* VADMIN and VAPPEND.
*
* If it returns 0, the caller is supposed to continue with the usual
* access checks using 'accmode' as modified by this routine. If it
* returns nonzero value, the caller is supposed to return that value
* as errno.
*
* Note that after this routine runs, accmode may be zero.
*/
int
vfs_unixify_accmode(accmode_t *accmode)
{
/*
* There is no way to specify explicit "deny" rule using
* file mode or POSIX.1e ACLs.
*/
if (*accmode & VEXPLICIT_DENY) {
*accmode = 0;
return (0);
}
/*
* None of these can be translated into usual access bits.
* Also, the common case for NFSv4 ACLs is to not contain
* either of these bits. Caller should check for VWRITE
* on the containing directory instead.
*/
if (*accmode & (VDELETE_CHILD | VDELETE))
return (EPERM);
if (*accmode & VADMIN_PERMS) {
*accmode &= ~VADMIN_PERMS;
*accmode |= VADMIN;
}
/*
* There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL
* or VSYNCHRONIZE using file mode or POSIX.1e ACL.
*/
*accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE);
return (0);
}
/*
* Clear out a doomed vnode (if any) and replace it with a new one as long
* as the fs is not being unmounted. Return the root vnode to the caller.
*/
static int __noinline
vfs_cache_root_fallback(struct mount *mp, int flags, struct vnode **vpp)
{
struct vnode *vp;
int error;
restart:
if (mp->mnt_rootvnode != NULL) {
MNT_ILOCK(mp);
vp = mp->mnt_rootvnode;
if (vp != NULL) {
if (!VN_IS_DOOMED(vp)) {
vrefact(vp);
MNT_IUNLOCK(mp);
error = vn_lock(vp, flags);
if (error == 0) {
*vpp = vp;
return (0);
}
vrele(vp);
goto restart;
}
/*
* Clear the old one.
*/
mp->mnt_rootvnode = NULL;
}
MNT_IUNLOCK(mp);
if (vp != NULL) {
vfs_op_barrier_wait(mp);
vrele(vp);
}
}
error = VFS_CACHEDROOT(mp, flags, vpp);
if (error != 0)
return (error);
if (mp->mnt_vfs_ops == 0) {
MNT_ILOCK(mp);
if (mp->mnt_vfs_ops != 0) {
MNT_IUNLOCK(mp);
return (0);
}
if (mp->mnt_rootvnode == NULL) {
vrefact(*vpp);
mp->mnt_rootvnode = *vpp;
} else {
if (mp->mnt_rootvnode != *vpp) {
if (!VN_IS_DOOMED(mp->mnt_rootvnode)) {
panic("%s: mismatch between vnode returned "
" by VFS_CACHEDROOT and the one cached "
" (%p != %p)",
__func__, *vpp, mp->mnt_rootvnode);
}
}
}
MNT_IUNLOCK(mp);
}
return (0);
}
int
vfs_cache_root(struct mount *mp, int flags, struct vnode **vpp)
{
struct mount_pcpu *mpcpu;
struct vnode *vp;
int error;
if (!vfs_op_thread_enter(mp, mpcpu))
return (vfs_cache_root_fallback(mp, flags, vpp));
vp = atomic_load_ptr(&mp->mnt_rootvnode);
if (vp == NULL || VN_IS_DOOMED(vp)) {
vfs_op_thread_exit(mp, mpcpu);
return (vfs_cache_root_fallback(mp, flags, vpp));
}
vrefact(vp);
vfs_op_thread_exit(mp, mpcpu);
error = vn_lock(vp, flags);
if (error != 0) {
vrele(vp);
return (vfs_cache_root_fallback(mp, flags, vpp));
}
*vpp = vp;
return (0);
}
struct vnode *
vfs_cache_root_clear(struct mount *mp)
{
struct vnode *vp;
/*
* ops > 0 guarantees there is nobody who can see this vnode
*/
MPASS(mp->mnt_vfs_ops > 0);
vp = mp->mnt_rootvnode;
if (vp != NULL)
vn_seqc_write_begin(vp);
mp->mnt_rootvnode = NULL;
return (vp);
}
void
vfs_cache_root_set(struct mount *mp, struct vnode *vp)
{
MPASS(mp->mnt_vfs_ops > 0);
vrefact(vp);
mp->mnt_rootvnode = vp;
}
/*
* These are helper functions for filesystems to traverse all
* their vnodes. See MNT_VNODE_FOREACH_ALL() in sys/mount.h.
*
* This interface replaces MNT_VNODE_FOREACH.
*/
struct vnode *
__mnt_vnode_next_all(struct vnode **mvp, struct mount *mp)
{
struct vnode *vp;
if (should_yield())
kern_yield(PRI_USER);
MNT_ILOCK(mp);
KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
for (vp = TAILQ_NEXT(*mvp, v_nmntvnodes); vp != NULL;
vp = TAILQ_NEXT(vp, v_nmntvnodes)) {
/* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */
if (vp->v_type == VMARKER || VN_IS_DOOMED(vp))
continue;
VI_LOCK(vp);
if (VN_IS_DOOMED(vp)) {
VI_UNLOCK(vp);
continue;
}
break;
}
if (vp == NULL) {
__mnt_vnode_markerfree_all(mvp, mp);
/* MNT_IUNLOCK(mp); -- done in above function */
mtx_assert(MNT_MTX(mp), MA_NOTOWNED);
return (NULL);
}
TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
MNT_IUNLOCK(mp);
return (vp);
}
struct vnode *
__mnt_vnode_first_all(struct vnode **mvp, struct mount *mp)
{
struct vnode *vp;
*mvp = vn_alloc_marker(mp);
MNT_ILOCK(mp);
MNT_REF(mp);
TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
/* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */
if (vp->v_type == VMARKER || VN_IS_DOOMED(vp))
continue;
VI_LOCK(vp);
if (VN_IS_DOOMED(vp)) {
VI_UNLOCK(vp);
continue;
}
break;
}
if (vp == NULL) {
MNT_REL(mp);
MNT_IUNLOCK(mp);
vn_free_marker(*mvp);
*mvp = NULL;
return (NULL);
}
TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
MNT_IUNLOCK(mp);
return (vp);
}
void
__mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp)
{
if (*mvp == NULL) {
MNT_IUNLOCK(mp);
return;
}
mtx_assert(MNT_MTX(mp), MA_OWNED);
KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
MNT_REL(mp);
MNT_IUNLOCK(mp);
vn_free_marker(*mvp);
*mvp = NULL;
}
/*
* These are helper functions for filesystems to traverse their
* lazy vnodes. See MNT_VNODE_FOREACH_LAZY() in sys/mount.h
*/
static void
mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp)
{
KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
MNT_ILOCK(mp);
MNT_REL(mp);
MNT_IUNLOCK(mp);
vn_free_marker(*mvp);
*mvp = NULL;
}
/*
* Relock the mp mount vnode list lock with the vp vnode interlock in the
* conventional lock order during mnt_vnode_next_lazy iteration.
*
* On entry, the mount vnode list lock is held and the vnode interlock is not.
* The list lock is dropped and reacquired. On success, both locks are held.
* On failure, the mount vnode list lock is held but the vnode interlock is
* not, and the procedure may have yielded.
*/
static bool
mnt_vnode_next_lazy_relock(struct vnode *mvp, struct mount *mp,
struct vnode *vp)
{
VNASSERT(mvp->v_mount == mp && mvp->v_type == VMARKER &&
TAILQ_NEXT(mvp, v_lazylist) != NULL, mvp,
("%s: bad marker", __func__));
VNASSERT(vp->v_mount == mp && vp->v_type != VMARKER, vp,
("%s: inappropriate vnode", __func__));
ASSERT_VI_UNLOCKED(vp, __func__);
mtx_assert(&mp->mnt_listmtx, MA_OWNED);
TAILQ_REMOVE(&mp->mnt_lazyvnodelist, mvp, v_lazylist);
TAILQ_INSERT_BEFORE(vp, mvp, v_lazylist);
/*
* Note we may be racing against vdrop which transitioned the hold
* count to 0 and now waits for the ->mnt_listmtx lock. This is fine,
* if we are the only user after we get the interlock we will just
* vdrop.
*/
vhold(vp);
mtx_unlock(&mp->mnt_listmtx);
VI_LOCK(vp);
if (VN_IS_DOOMED(vp)) {
VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp);
goto out_lost;
}
VNPASS(vp->v_mflag & VMP_LAZYLIST, vp);
/*
* There is nothing to do if we are the last user.
*/
if (!refcount_release_if_not_last(&vp->v_holdcnt))
goto out_lost;
mtx_lock(&mp->mnt_listmtx);
return (true);
out_lost:
vdropl(vp);
maybe_yield();
mtx_lock(&mp->mnt_listmtx);
return (false);
}
static struct vnode *
mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb,
void *cbarg)
{
struct vnode *vp;
mtx_assert(&mp->mnt_listmtx, MA_OWNED);
KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
restart:
vp = TAILQ_NEXT(*mvp, v_lazylist);
while (vp != NULL) {
if (vp->v_type == VMARKER) {
vp = TAILQ_NEXT(vp, v_lazylist);
continue;
}
/*
* See if we want to process the vnode. Note we may encounter a
* long string of vnodes we don't care about and hog the list
* as a result. Check for it and requeue the marker.
*/
VNPASS(!VN_IS_DOOMED(vp), vp);
if (!cb(vp, cbarg)) {
if (!should_yield()) {
vp = TAILQ_NEXT(vp, v_lazylist);
continue;
}
TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp,
v_lazylist);
TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp,
v_lazylist);
mtx_unlock(&mp->mnt_listmtx);
kern_yield(PRI_USER);
mtx_lock(&mp->mnt_listmtx);
goto restart;
}
/*
* Try-lock because this is the wrong lock order.
*/
if (!VI_TRYLOCK(vp) &&
!mnt_vnode_next_lazy_relock(*mvp, mp, vp))
goto restart;
KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp));
KASSERT(vp->v_mount == mp || vp->v_mount == NULL,
("alien vnode on the lazy list %p %p", vp, mp));
VNPASS(vp->v_mount == mp, vp);
VNPASS(!VN_IS_DOOMED(vp), vp);
break;
}
TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist);
/* Check if we are done */
if (vp == NULL) {
mtx_unlock(&mp->mnt_listmtx);
mnt_vnode_markerfree_lazy(mvp, mp);
return (NULL);
}
TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, v_lazylist);
mtx_unlock(&mp->mnt_listmtx);
ASSERT_VI_LOCKED(vp, "lazy iter");
return (vp);
}
struct vnode *
__mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb,
void *cbarg)
{
if (should_yield())
kern_yield(PRI_USER);
mtx_lock(&mp->mnt_listmtx);
return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg));
}
struct vnode *
__mnt_vnode_first_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb,
void *cbarg)
{
struct vnode *vp;
if (TAILQ_EMPTY(&mp->mnt_lazyvnodelist))
return (NULL);
*mvp = vn_alloc_marker(mp);
MNT_ILOCK(mp);
MNT_REF(mp);
MNT_IUNLOCK(mp);
mtx_lock(&mp->mnt_listmtx);
vp = TAILQ_FIRST(&mp->mnt_lazyvnodelist);
if (vp == NULL) {
mtx_unlock(&mp->mnt_listmtx);
mnt_vnode_markerfree_lazy(mvp, mp);
return (NULL);
}
TAILQ_INSERT_BEFORE(vp, *mvp, v_lazylist);
return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg));
}
void
__mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp)
{
if (*mvp == NULL)
return;
mtx_lock(&mp->mnt_listmtx);
TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist);
mtx_unlock(&mp->mnt_listmtx);
mnt_vnode_markerfree_lazy(mvp, mp);
}
int
vn_dir_check_exec(struct vnode *vp, struct componentname *cnp)
{
if ((cnp->cn_flags & NOEXECCHECK) != 0) {
cnp->cn_flags &= ~NOEXECCHECK;
return (0);
}
return (VOP_ACCESS(vp, VEXEC, cnp->cn_cred, curthread));
}
/*
* Do not use this variant unless you have means other than the hold count
* to prevent the vnode from getting freed.
*/
void
vn_seqc_write_begin_locked(struct vnode *vp)
{
ASSERT_VI_LOCKED(vp, __func__);
VNPASS(vp->v_holdcnt > 0, vp);
VNPASS(vp->v_seqc_users >= 0, vp);
vp->v_seqc_users++;
if (vp->v_seqc_users == 1)
seqc_sleepable_write_begin(&vp->v_seqc);
}
void
vn_seqc_write_begin(struct vnode *vp)
{
VI_LOCK(vp);
vn_seqc_write_begin_locked(vp);
VI_UNLOCK(vp);
}
void
vn_seqc_write_end_locked(struct vnode *vp)
{
ASSERT_VI_LOCKED(vp, __func__);
VNPASS(vp->v_seqc_users > 0, vp);
vp->v_seqc_users--;
if (vp->v_seqc_users == 0)
seqc_sleepable_write_end(&vp->v_seqc);
}
void
vn_seqc_write_end(struct vnode *vp)
{
VI_LOCK(vp);
vn_seqc_write_end_locked(vp);
VI_UNLOCK(vp);
}
/*
* Special case handling for allocating and freeing vnodes.
*
* The counter remains unchanged on free so that a doomed vnode will
* keep testing as in modify as long as it is accessible with SMR.
*/
static void
vn_seqc_init(struct vnode *vp)
{
vp->v_seqc = 0;
vp->v_seqc_users = 0;
}
static void
vn_seqc_write_end_free(struct vnode *vp)
{
VNPASS(seqc_in_modify(vp->v_seqc), vp);
VNPASS(vp->v_seqc_users == 1, vp);
}
void
vn_irflag_set_locked(struct vnode *vp, short toset)
{
short flags;
ASSERT_VI_LOCKED(vp, __func__);
flags = vn_irflag_read(vp);
VNASSERT((flags & toset) == 0, vp,
("%s: some of the passed flags already set (have %d, passed %d)\n",
__func__, flags, toset));
atomic_store_short(&vp->v_irflag, flags | toset);
}
void
vn_irflag_set(struct vnode *vp, short toset)
{
VI_LOCK(vp);
vn_irflag_set_locked(vp, toset);
VI_UNLOCK(vp);
}
void
vn_irflag_set_cond_locked(struct vnode *vp, short toset)
{
short flags;
ASSERT_VI_LOCKED(vp, __func__);
flags = vn_irflag_read(vp);
atomic_store_short(&vp->v_irflag, flags | toset);
}
void
vn_irflag_set_cond(struct vnode *vp, short toset)
{
VI_LOCK(vp);
vn_irflag_set_cond_locked(vp, toset);
VI_UNLOCK(vp);
}
void
vn_irflag_unset_locked(struct vnode *vp, short tounset)
{
short flags;
ASSERT_VI_LOCKED(vp, __func__);
flags = vn_irflag_read(vp);
VNASSERT((flags & tounset) == tounset, vp,
("%s: some of the passed flags not set (have %d, passed %d)\n",
__func__, flags, tounset));
atomic_store_short(&vp->v_irflag, flags & ~tounset);
}
void
vn_irflag_unset(struct vnode *vp, short tounset)
{
VI_LOCK(vp);
vn_irflag_unset_locked(vp, tounset);
VI_UNLOCK(vp);
}