2005-01-06 23:35:40 +00:00
|
|
|
/*-
|
1994-05-24 10:09:53 +00:00
|
|
|
* Copyright (c) 1989, 1993
|
|
|
|
* The Regents of the University of California. All rights reserved.
|
|
|
|
* (c) UNIX System Laboratories, Inc.
|
|
|
|
* All or some portions of this file are derived from material licensed
|
|
|
|
* to the University of California by American Telephone and Telegraph
|
|
|
|
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
|
|
|
|
* the permission of UNIX System Laboratories, Inc.
|
|
|
|
*
|
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions
|
|
|
|
* are met:
|
|
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer.
|
|
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
|
|
* documentation and/or other materials provided with the distribution.
|
2016-09-15 13:16:20 +00:00
|
|
|
* 3. Neither the name of the University nor the names of its contributors
|
1994-05-24 10:09:53 +00:00
|
|
|
* may be used to endorse or promote products derived from this software
|
|
|
|
* without specific prior written permission.
|
|
|
|
*
|
|
|
|
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
|
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
|
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
|
|
* SUCH DAMAGE.
|
|
|
|
*
|
1997-02-10 02:22:35 +00:00
|
|
|
* @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* External virtual filesystem routines
|
|
|
|
*/
|
2003-06-11 00:56:59 +00:00
|
|
|
|
|
|
|
#include <sys/cdefs.h>
|
|
|
|
__FBSDID("$FreeBSD$");
|
|
|
|
|
2012-08-22 20:05:34 +00:00
|
|
|
#include "opt_compat.h"
|
1996-01-04 21:13:23 +00:00
|
|
|
#include "opt_ddb.h"
|
2011-04-28 16:02:05 +00:00
|
|
|
#include "opt_watchdog.h"
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
#include <sys/param.h>
|
|
|
|
#include <sys/systm.h>
|
2000-05-05 09:59:14 +00:00
|
|
|
#include <sys/bio.h>
|
2000-01-07 08:36:44 +00:00
|
|
|
#include <sys/buf.h>
|
2008-07-30 12:39:18 +00:00
|
|
|
#include <sys/condvar.h>
|
2000-01-07 08:36:44 +00:00
|
|
|
#include <sys/conf.h>
|
2016-12-31 19:59:31 +00:00
|
|
|
#include <sys/counter.h>
|
2005-09-12 08:46:07 +00:00
|
|
|
#include <sys/dirent.h>
|
2004-07-04 10:52:54 +00:00
|
|
|
#include <sys/event.h>
|
2000-01-07 08:36:44 +00:00
|
|
|
#include <sys/eventhandler.h>
|
2002-09-05 20:38:57 +00:00
|
|
|
#include <sys/extattr.h>
|
2005-06-09 20:20:31 +00:00
|
|
|
#include <sys/file.h>
|
1998-12-24 12:07:16 +00:00
|
|
|
#include <sys/fcntl.h>
|
2007-04-13 23:54:22 +00:00
|
|
|
#include <sys/jail.h>
|
2004-07-10 21:47:53 +00:00
|
|
|
#include <sys/kdb.h>
|
1995-11-16 09:45:23 +00:00
|
|
|
#include <sys/kernel.h>
|
1999-07-01 13:21:46 +00:00
|
|
|
#include <sys/kthread.h>
|
Move the head of byte-level advisory lock list from the
filesystem-specific vnode data to the struct vnode. Provide the
default implementation for the vop_advlock and vop_advlockasync.
Purge the locks on the vnode reclaim by using the lf_purgelocks().
The default implementation is augmented for the nfs and smbfs.
In the nfs_advlock, push the Giant inside the nfs_dolock.
Before the change, the vop_advlock and vop_advlockasync have taken the
unlocked vnode and dereferenced the fs-private inode data, racing with
the vnode reclamation due to forced unmount. Now, the vop_getattr
under the shared vnode lock is used to obtain the inode size, and
later, in the lf_advlockasync, after locking the vnode interlock, the
VI_DOOMED flag is checked to prevent an operation on the doomed vnode.
The implementation of the lf_purgelocks() is submitted by dfr.
Reported by: kris
Tested by: kris, pho
Discussed with: jeff, dfr
MFC after: 2 weeks
2008-04-16 11:33:32 +00:00
|
|
|
#include <sys/lockf.h>
|
2002-08-01 17:47:56 +00:00
|
|
|
#include <sys/malloc.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <sys/mount.h>
|
2000-01-08 16:20:06 +00:00
|
|
|
#include <sys/namei.h>
|
2013-05-12 04:05:01 +00:00
|
|
|
#include <sys/pctrie.h>
|
2006-11-06 13:42:10 +00:00
|
|
|
#include <sys/priv.h>
|
2004-08-20 19:21:47 +00:00
|
|
|
#include <sys/reboot.h>
|
2015-07-16 13:57:05 +00:00
|
|
|
#include <sys/refcount.h>
|
2013-02-20 10:38:34 +00:00
|
|
|
#include <sys/rwlock.h>
|
2011-01-06 22:17:07 +00:00
|
|
|
#include <sys/sched.h>
|
Switch the sleep/wakeup and condition variable implementations to use the
sleep queue interface:
- Sleep queues attempt to merge some of the benefits of both sleep queues
and condition variables. Having sleep queues in a hash table avoids
having to allocate a queue head for each wait channel. Thus, struct cv
has shrunk down to just a single char * pointer now. However, the
hash table does not hold threads directly, but queue heads. This means
that once you have located a queue in the hash bucket, you no longer have
to walk the rest of the hash chain looking for threads. Instead, you have
a list of all the threads sleeping on that wait channel.
- Outside of the sleepq code and the sleep/cv code the kernel no longer
differentiates between cv's and sleep/wakeup. For example, calls to
abortsleep() and cv_abort() are replaced with a call to sleepq_abort().
Thus, the TDF_CVWAITQ flag is removed. Also, calls to unsleep() and
cv_waitq_remove() have been replaced with calls to sleepq_remove().
- The sched_sleep() function no longer accepts a priority argument as
sleeps no longer inherently bump the priority. Instead, this is solely
a property of msleep() which explicitly calls sched_prio() before
blocking.
- The TDF_ONSLEEPQ flag has been dropped as it was never used. The
associated TDF_SET_ONSLEEPQ and TDF_CLR_ON_SLEEPQ macros have also been
dropped and replaced with a single explicit clearing of td_wchan.
TD_SET_ONSLEEPQ() would really have only made sense if it had taken
the wait channel and message as arguments anyway. Now that that only
happens in one place, a macro would be overkill.
2004-02-27 18:52:44 +00:00
|
|
|
#include <sys/sleepqueue.h>
|
2012-12-15 02:04:46 +00:00
|
|
|
#include <sys/smp.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <sys/stat.h>
|
2000-01-07 08:36:44 +00:00
|
|
|
#include <sys/sysctl.h>
|
2001-12-18 20:48:54 +00:00
|
|
|
#include <sys/syslog.h>
|
1997-12-29 00:25:11 +00:00
|
|
|
#include <sys/vmmeter.h>
|
2000-01-07 08:36:44 +00:00
|
|
|
#include <sys/vnode.h>
|
2011-04-28 16:02:05 +00:00
|
|
|
#include <sys/watchdog.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2005-01-24 13:58:08 +00:00
|
|
|
#include <machine/stdarg.h>
|
|
|
|
|
2006-10-22 11:52:19 +00:00
|
|
|
#include <security/mac/mac_framework.h>
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <vm/vm.h>
|
1995-12-07 12:48:31 +00:00
|
|
|
#include <vm/vm_object.h>
|
|
|
|
#include <vm/vm_extern.h>
|
1997-12-19 09:03:37 +00:00
|
|
|
#include <vm/pmap.h>
|
|
|
|
#include <vm/vm_map.h>
|
1999-01-21 08:29:12 +00:00
|
|
|
#include <vm/vm_page.h>
|
2003-05-23 19:54:02 +00:00
|
|
|
#include <vm/vm_kern.h>
|
2002-03-20 04:09:59 +00:00
|
|
|
#include <vm/uma.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2006-09-04 22:15:44 +00:00
|
|
|
#ifdef DDB
|
|
|
|
#include <ddb/ddb.h>
|
|
|
|
#endif
|
|
|
|
|
2004-07-04 08:52:35 +00:00
|
|
|
static void delmntque(struct vnode *vp);
|
2005-02-19 11:44:57 +00:00
|
|
|
static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
|
2005-01-11 10:01:54 +00:00
|
|
|
int slpflag, int slptimeo);
|
2004-07-01 23:59:19 +00:00
|
|
|
static void syncer_shutdown(void *arg, int howto);
|
2003-10-05 05:35:41 +00:00
|
|
|
static int vtryrecycle(struct vnode *vp);
|
2015-07-16 13:57:05 +00:00
|
|
|
static void v_init_counters(struct vnode *);
|
2005-06-16 04:41:42 +00:00
|
|
|
static void v_incr_usecount(struct vnode *);
|
2016-01-18 22:21:46 +00:00
|
|
|
static void v_incr_usecount_locked(struct vnode *);
|
2015-07-11 16:28:12 +00:00
|
|
|
static void v_incr_devcount(struct vnode *);
|
|
|
|
static void v_decr_devcount(struct vnode *);
|
2005-06-16 04:41:42 +00:00
|
|
|
static void vgonel(struct vnode *);
|
2005-07-01 16:28:32 +00:00
|
|
|
static void vfs_knllock(void *arg);
|
|
|
|
static void vfs_knlunlock(void *arg);
|
2009-06-10 20:59:32 +00:00
|
|
|
static void vfs_knl_assert_locked(void *arg);
|
|
|
|
static void vfs_knl_assert_unlocked(void *arg);
|
2016-09-30 17:27:17 +00:00
|
|
|
static void vnlru_return_batches(struct vfsops *mnt_op);
|
2008-10-28 12:08:36 +00:00
|
|
|
static void destroy_vpollinfo(struct vpollinfo *vi);
|
2000-09-22 12:22:36 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Number of vnodes in existence. Increased whenever getnewvnode()
|
2012-04-17 21:46:59 +00:00
|
|
|
* allocates a new vnode, decreased in vdropl() for VI_DOOMED vnode.
|
2000-09-22 12:22:36 +00:00
|
|
|
*/
|
1997-11-22 08:35:46 +00:00
|
|
|
static unsigned long numvnodes;
|
2002-01-10 18:31:53 +00:00
|
|
|
|
2011-01-12 19:54:19 +00:00
|
|
|
SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0,
|
2010-11-14 16:10:15 +00:00
|
|
|
"Number of vnodes in existence");
|
1995-12-02 18:58:56 +00:00
|
|
|
|
2016-12-31 19:59:31 +00:00
|
|
|
static counter_u64_t vnodes_created;
|
|
|
|
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created,
|
|
|
|
"Number of vnodes created by getnewvnode");
|
2015-02-14 17:02:51 +00:00
|
|
|
|
2016-09-30 17:27:17 +00:00
|
|
|
static u_long mnt_free_list_batch = 128;
|
|
|
|
SYSCTL_ULONG(_vfs, OID_AUTO, mnt_free_list_batch, CTLFLAG_RW,
|
|
|
|
&mnt_free_list_batch, 0, "Limit of vnodes held on mnt's free list");
|
|
|
|
|
2000-09-22 12:22:36 +00:00
|
|
|
/*
|
|
|
|
* Conversion tables for conversion from vnode types to inode formats
|
|
|
|
* and back.
|
|
|
|
*/
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
 * File-format-to-vnode-type table with 16 entries.
 * NOTE(review): presumably indexed by the file-type bits of a mode
 * ((mode & S_IFMT) >> 12), which would explain the VNON filler slots --
 * confirm against the IFTOVT() users.
 */
enum vtype iftovt_tab[16] = {
|
|
|
|
VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
|
|
|
|
VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
|
|
|
|
};
|
2006-01-09 20:42:19 +00:00
|
|
|
/*
 * Vnode-type-to-file-format table with 10 entries.
 * NOTE(review): presumably indexed by enum vtype, so the entry order must
 * track that enum's declaration order -- confirm against sys/vnode.h.
 */
int vttoif_tab[10] = {
|
1994-05-24 10:09:53 +00:00
|
|
|
0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
|
2006-01-09 20:42:19 +00:00
|
|
|
S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
|
1994-05-24 10:09:53 +00:00
|
|
|
};
|
|
|
|
|
2000-10-05 18:22:46 +00:00
|
|
|
/*
|
|
|
|
* List of vnodes that are ready for recycling.
|
|
|
|
*/
|
2000-09-22 12:22:36 +00:00
|
|
|
static TAILQ_HEAD(freelst, vnode) vnode_free_list;
|
1998-01-12 01:46:33 +00:00
|
|
|
|
2000-09-22 12:22:36 +00:00
|
|
|
/*
|
2015-11-24 09:45:36 +00:00
|
|
|
* "Free" vnode target. Free vnodes are rarely completely free, but are
|
|
|
|
* just ones that are cheap to recycle. Usually they are for files which
|
|
|
|
* have been stat'd but not read; these usually have inode and namecache
|
|
|
|
* data attached to them. This target is the preferred minimum size of a
|
|
|
|
* sub-cache consisting mostly of such files. The system balances the size
|
|
|
|
* of this sub-cache with its complement to try to prevent either from
|
|
|
|
* thrashing while the other is relatively inactive. The targets express
|
|
|
|
* a preference for the best balance.
|
|
|
|
*
|
|
|
|
* "Above" this target there are 2 further targets (watermarks) related
|
|
|
|
* to recycling of free vnodes. In the best-operating case, the cache is
|
|
|
|
* exactly full, the free list has size between vlowat and vhiwat above the
|
|
|
|
* free target, and recycling from it and normal use maintains this state.
|
|
|
|
* Sometimes the free list is below vlowat or even empty, but this state
|
|
|
|
* is even better for immediate use provided the cache is not full.
|
|
|
|
* Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free
|
|
|
|
* ones) to reach one of these states. The watermarks are currently hard-
|
|
|
|
* coded as 4% and 9% of the available space higher. These and the default
|
|
|
|
* of 25% for wantfreevnodes are too large if the memory size is large.
|
|
|
|
* E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim
|
|
|
|
* whenever vnlru_proc() becomes active.
|
2000-09-22 12:22:36 +00:00
|
|
|
*/
|
2005-03-25 05:34:39 +00:00
|
|
|
static u_long wantfreevnodes;
|
2015-11-24 09:45:36 +00:00
|
|
|
SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW,
|
|
|
|
&wantfreevnodes, 0, "Target for minimum number of \"free\" vnodes");
|
2001-12-19 01:35:18 +00:00
|
|
|
static u_long freevnodes;
|
2015-11-24 09:45:36 +00:00
|
|
|
SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD,
|
|
|
|
&freevnodes, 0, "Number of \"free\" vnodes");
|
2009-12-28 15:35:39 +00:00
|
|
|
|
2016-12-31 19:59:31 +00:00
|
|
|
static counter_u64_t recycles_count;
|
|
|
|
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count,
|
2015-11-24 09:45:36 +00:00
|
|
|
"Number of vnodes recycled to meet vnode cache targets");
|
2015-02-14 17:02:51 +00:00
|
|
|
|
2000-09-22 12:22:36 +00:00
|
|
|
/*
|
2000-10-05 18:22:46 +00:00
|
|
|
* Various variables used for debugging the new implementation of
|
|
|
|
* reassignbuf().
|
|
|
|
* XXX these are probably of (very) limited utility now.
|
2000-09-22 12:22:36 +00:00
|
|
|
*/
|
The buffer queue mechanism has been reformulated. Instead of having
QUEUE_AGE, QUEUE_LRU, and QUEUE_EMPTY we instead have QUEUE_CLEAN,
QUEUE_DIRTY, QUEUE_EMPTY, and QUEUE_EMPTYKVA. With this patch clean
and dirty buffers have been separated. Empty buffers with KVM
assignments have been separated from truly empty buffers. getnewbuf()
has been rewritten and now operates in a 100% optimal fashion. That is,
it is able to find precisely the right kind of buffer it needs to
allocate a new buffer, defragment KVM, or to free-up an existing buffer
when the buffer cache is full (which is a steady-state situation for
the buffer cache).
Buffer flushing has been reorganized. Previously buffers were flushed
in the context of whatever process hit the conditions forcing buffer
flushing to occur. This resulted in processes blocking on conditions
unrelated to what they were doing. This also resulted in inappropriate
VFS stacking chains due to multiple processes getting stuck trying to
flush dirty buffers or due to a single process getting into a situation
where it might attempt to flush buffers recursively - a situation that
was only partially fixed in prior commits. We have added a new daemon
called the buf_daemon which is responsible for flushing dirty buffers
when the number of dirty buffers exceeds the vfs.hidirtybuffers limit.
This daemon attempts to dynamically adjust the rate at which dirty buffers
are flushed such that getnewbuf() calls (almost) never block.
The number of nbufs and amount of buffer space is now scaled past the
8MB limit that was previously imposed for systems with over 64MB of
memory, and the vfs.{lo,hi}dirtybuffers limits have been relaxed
somewhat. The number of physical buffers has been increased with the
intention that we will manage physical I/O differently in the future.
reassignbuf previously attempted to keep the dirtyblkhd list sorted which
could result in non-deterministic operation under certain conditions,
such as when a large number of dirty buffers are being managed. This
algorithm has been changed. reassignbuf now keeps buffers locally sorted
if it can do so cheaply, and otherwise gives up and adds buffers to
the head of the dirtyblkhd list. The new algorithm is deterministic but
not perfect. The new algorithm greatly reduces problems that previously
occurred when write_behind was turned off in the system.
The P_FLSINPROG proc->p_flag bit has been replaced by the more descriptive
P_BUFEXHAUST bit. This bit allows processes working with filesystem
buffers to use available emergency reserves. Normal processes do not set
this bit and are not allowed to dig into emergency reserves. The purpose
of this bit is to avoid low-memory deadlocks.
A small race condition was fixed in getpbuf() in vm/vm_pager.c.
Submitted by: Matthew Dillon <dillon@apollo.backplane.com>
Reviewed by: Kirk McKusick <mckusick@mckusick.com>
1999-07-04 00:25:38 +00:00
|
|
|
static int reassignbufcalls;
|
2010-11-14 08:06:29 +00:00
|
|
|
SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0,
|
2010-11-14 16:10:15 +00:00
|
|
|
"Number of calls to reassignbuf");
|
The buffer queue mechanism has been reformulated. Instead of having
QUEUE_AGE, QUEUE_LRU, and QUEUE_EMPTY we instead have QUEUE_CLEAN,
QUEUE_DIRTY, QUEUE_EMPTY, and QUEUE_EMPTYKVA. With this patch clean
and dirty buffers have been separated. Empty buffers with KVM
assignments have been separated from truly empty buffers. getnewbuf()
has been rewritten and now operates in a 100% optimal fashion. That is,
it is able to find precisely the right kind of buffer it needs to
allocate a new buffer, defragment KVM, or to free-up an existing buffer
when the buffer cache is full (which is a steady-state situation for
the buffer cache).
Buffer flushing has been reorganized. Previously buffers were flushed
in the context of whatever process hit the conditions forcing buffer
flushing to occur. This resulted in processes blocking on conditions
unrelated to what they were doing. This also resulted in inappropriate
VFS stacking chains due to multiple processes getting stuck trying to
flush dirty buffers or due to a single process getting into a situation
where it might attempt to flush buffers recursively - a situation that
was only partially fixed in prior commits. We have added a new daemon
called the buf_daemon which is responsible for flushing dirty buffers
when the number of dirty buffers exceeds the vfs.hidirtybuffers limit.
This daemon attempts to dynamically adjust the rate at which dirty buffers
are flushed such that getnewbuf() calls (almost) never block.
The number of nbufs and amount of buffer space is now scaled past the
8MB limit that was previously imposed for systems with over 64MB of
memory, and the vfs.{lo,hi}dirtybuffers limits have been relaxed
somewhat. The number of physical buffers has been increased with the
intention that we will manage physical I/O differently in the future.
reassignbuf previously attempted to keep the dirtyblkhd list sorted which
could result in non-deterministic operation under certain conditions,
such as when a large number of dirty buffers are being managed. This
algorithm has been changed. reassignbuf now keeps buffers locally sorted
if it can do so cheaply, and otherwise gives up and adds buffers to
the head of the dirtyblkhd list. The new algorithm is deterministic but
not perfect. The new algorithm greatly reduces problems that previously
occurred when write_behind was turned off in the system.
The P_FLSINPROG proc->p_flag bit has been replaced by the more descriptive
P_BUFEXHAUST bit. This bit allows processes working with filesystem
buffers to use available emergency reserves. Normal processes do not set
this bit and are not allowed to dig into emergency reserves. The purpose
of this bit is to avoid low-memory deadlocks.
A small race condition was fixed in getpbuf() in vm/vm_pager.c.
Submitted by: Matthew Dillon <dillon@apollo.backplane.com>
Reviewed by: Kirk McKusick <mckusick@mckusick.com>
1999-07-04 00:25:38 +00:00
|
|
|
|
2016-12-31 19:59:31 +00:00
|
|
|
static counter_u64_t free_owe_inact;
|
|
|
|
SYSCTL_COUNTER_U64(_vfs, OID_AUTO, free_owe_inact, CTLFLAG_RD, &free_owe_inact,
|
vfs_msync(), called from syncer vnode fsync VOP, only iterates over
the active vnode list for the given mount point, with the assumption
that vnodes with dirty pages are active. This is enforced by
vinactive() doing vm_object_page_clean() pass over the vnode pages.
The issue is, if vinactive() cannot be called during vput() due to the
vnode being only shared-locked, we might end up with the dirty pages
for the vnode on the free list. Such vnode is invisible to syncer,
and pages are only cleaned on the vnode reactivation. In other words,
the race results in the broken guarantee that user data, written
through the mmap(2), is written to the disk not later than in 30
seconds after the write.
Fix this by keeping the vnode which is freed but still owing
inactivation, on the active list. When syncer loops find such vnode,
it is deactivated and cleaned by the final vput() call.
Tested by: pho
Sponsored by: The FreeBSD Foundation
MFC after: 2 weeks
2015-06-17 04:46:58 +00:00
|
|
|
"Number of times free vnodes kept on active list due to VFS "
|
|
|
|
"owing inactivation");
|
|
|
|
|
2000-09-22 12:22:36 +00:00
|
|
|
/* To keep more than one thread at a time from running vfs_getnewfsid */
|
2001-01-24 12:35:55 +00:00
|
|
|
static struct mtx mntid_mtx;
|
2000-10-05 18:22:46 +00:00
|
|
|
|
2002-08-13 05:29:48 +00:00
|
|
|
/*
|
|
|
|
* Lock for any access to the following:
|
|
|
|
* vnode_free_list
|
|
|
|
* numvnodes
|
|
|
|
* freevnodes
|
|
|
|
*/
|
2001-01-24 12:35:55 +00:00
|
|
|
static struct mtx vnode_free_list_mtx;
|
2000-09-22 12:22:36 +00:00
|
|
|
|
|
|
|
/* Publicly exported FS */
|
|
|
|
struct nfs_public nfs_pub;
|
2000-10-05 18:22:46 +00:00
|
|
|
|
2013-05-12 04:05:01 +00:00
|
|
|
static uma_zone_t buf_trie_zone;
|
|
|
|
|
2000-09-22 12:22:36 +00:00
|
|
|
/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
|
2002-03-20 04:09:59 +00:00
|
|
|
static uma_zone_t vnode_zone;
|
|
|
|
static uma_zone_t vnodepoll_zone;
|
2000-10-05 18:22:46 +00:00
|
|
|
|
1998-03-08 09:59:44 +00:00
|
|
|
/*
|
|
|
|
* The workitem queue.
|
2002-06-06 15:46:38 +00:00
|
|
|
*
|
2000-09-22 12:22:36 +00:00
|
|
|
* It is useful to delay writes of file data and filesystem metadata
|
|
|
|
* for tens of seconds so that quickly created and deleted files need
|
|
|
|
* not waste disk bandwidth being created and removed. To realize this,
|
|
|
|
* we append vnodes to a "workitem" queue. When running with a soft
|
|
|
|
* updates implementation, most pending metadata dependencies should
|
|
|
|
* not wait for more than a few seconds. Thus, metadata on mounted block
|
|
|
|
* devices is delayed only about half the time that file data is delayed.
|
|
|
|
* Similarly, directory updates are more critical, so are only delayed
|
|
|
|
* about a third the time that file data is delayed. Thus, there are
|
|
|
|
* SYNCER_MAXDELAY queues that are processed round-robin at a rate of
|
|
|
|
* one each second (driven off the filesystem syncer process). The
|
|
|
|
* syncer_delayno variable indicates the next queue that is to be processed.
|
|
|
|
* Items that need to be processed soon are placed in this queue:
|
|
|
|
*
|
|
|
|
* syncer_workitem_pending[syncer_delayno]
|
|
|
|
*
|
|
|
|
* A delay of fifteen seconds is done by placing the request fifteen
|
|
|
|
* entries later in the queue:
|
|
|
|
*
|
|
|
|
* syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
|
|
|
|
*
|
1998-03-08 09:59:44 +00:00
|
|
|
*/
|
2001-12-19 01:35:18 +00:00
|
|
|
static int syncer_delayno;
|
2002-06-06 15:46:38 +00:00
|
|
|
static long syncer_mask;
|
2004-10-27 08:05:02 +00:00
|
|
|
LIST_HEAD(synclist, bufobj);
|
2012-10-22 17:50:54 +00:00
|
|
|
static struct synclist *syncer_workitem_pending;
|
2002-09-25 02:22:21 +00:00
|
|
|
/*
|
|
|
|
* The sync_mtx protects:
|
2004-10-27 08:05:02 +00:00
|
|
|
* bo->bo_synclist
|
2004-07-01 23:59:19 +00:00
|
|
|
* sync_vnode_count
|
2002-09-25 02:22:21 +00:00
|
|
|
* syncer_delayno
|
2004-07-05 01:07:33 +00:00
|
|
|
* syncer_state
|
2002-09-25 02:22:21 +00:00
|
|
|
* syncer_workitem_pending
|
2004-07-01 23:59:19 +00:00
|
|
|
* syncer_worklist_len
|
2002-09-25 02:22:21 +00:00
|
|
|
* rushjob
|
|
|
|
*/
|
|
|
|
static struct mtx sync_mtx;
|
2008-07-30 12:39:18 +00:00
|
|
|
static struct cv sync_wakeup;
|
2000-09-22 12:22:36 +00:00
|
|
|
|
1998-03-08 09:59:44 +00:00
|
|
|
#define SYNCER_MAXDELAY 32
|
1998-12-21 23:38:33 +00:00
|
|
|
static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */
|
2001-10-27 19:58:56 +00:00
|
|
|
static int syncdelay = 30; /* max time to delay syncing data */
|
|
|
|
static int filedelay = 30; /* time to delay syncing files */
|
2010-11-14 08:06:29 +00:00
|
|
|
SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0,
|
2010-11-14 16:10:15 +00:00
|
|
|
"Time to delay syncing files (in seconds)");
|
2001-10-27 19:58:56 +00:00
|
|
|
static int dirdelay = 29; /* time to delay syncing directories */
|
2010-11-14 08:06:29 +00:00
|
|
|
SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0,
|
2010-11-14 16:10:15 +00:00
|
|
|
"Time to delay syncing directories (in seconds)");
|
2001-10-27 19:58:56 +00:00
|
|
|
static int metadelay = 28; /* time to delay syncing metadata */
|
2010-11-14 08:06:29 +00:00
|
|
|
SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0,
|
2010-11-14 16:10:15 +00:00
|
|
|
"Time to delay syncing metadata (in seconds)");
|
2000-09-22 12:22:36 +00:00
|
|
|
static int rushjob; /* number of slots to run ASAP */
|
1999-06-15 23:37:29 +00:00
|
|
|
static int stat_rush_requests; /* number of times I/O speeded up */
|
2010-11-14 08:06:29 +00:00
|
|
|
SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0,
|
2010-11-14 16:10:15 +00:00
|
|
|
"Number of times I/O speeded up (rush requests)");
|
1998-03-08 09:59:44 +00:00
|
|
|
|
2004-07-01 23:59:19 +00:00
|
|
|
/*
|
2004-07-05 01:07:33 +00:00
|
|
|
* When shutting down the syncer, run it at four times normal speed.
|
2004-07-01 23:59:19 +00:00
|
|
|
*/
|
2004-07-05 01:07:33 +00:00
|
|
|
#define SYNCER_SHUTDOWN_SPEEDUP 4
|
2004-07-01 23:59:19 +00:00
|
|
|
static int sync_vnode_count;
|
|
|
|
static int syncer_worklist_len;
|
2004-07-05 01:07:33 +00:00
|
|
|
static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
|
|
|
|
syncer_state;
|
2004-07-01 23:59:19 +00:00
|
|
|
|
2015-11-24 09:45:36 +00:00
|
|
|
/* Target for maximum number of vnodes. */
|
These changes embody the support of the fully coherent merged VM buffer cache,
much higher filesystem I/O performance, and much better paging performance. It
represents the culmination of over 6 months of R&D.
The majority of the merged VM/cache work is by John Dyson.
The following highlights the most significant changes. Additionally, there are
(mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to
support the new VM/buffer scheme.
vfs_bio.c:
Significant rewrite of most of vfs_bio to support the merged VM buffer cache
scheme. The scheme is almost fully compatible with the old filesystem
interface. Significant improvement in the number of opportunities for write
clustering.
vfs_cluster.c, vfs_subr.c
Upgrade and performance enhancements in vfs layer code to support merged
VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff.
vm_object.c:
Yet more improvements in the collapse code. Elimination of some windows that
can cause list corruption.
vm_pageout.c:
Fixed it, it really works better now. Somehow in 2.0, some "enhancements"
broke the code. This code has been reworked from the ground-up.
vm_fault.c, vm_page.c, pmap.c, vm_object.c
Support for small-block filesystems with merged VM/buffer cache scheme.
pmap.c vm_map.c
Dynamic kernel VM size, now we don't have to pre-allocate excessive numbers of
kernel PTs.
vm_glue.c
Much simpler and more effective swapping code. No more gratuitous swapping.
proc.h
Fixed the problem that the p_lock flag was not being cleared on a fork.
swap_pager.c, vnode_pager.c
Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the
code doesn't need it anymore.
machdep.c
Changes to better support the parameter values for the merged VM/buffer cache
scheme.
machdep.c, kern_exec.c, vm_glue.c
Implemented a separate submap for temporary exec string space and another one
to contain process upages. This eliminates all map fragmentation problems
that previously existed.
ffs_inode.c, ufs_inode.c, ufs_readwrite.c
Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on
busy buffers.
Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
|
|
|
int desiredvnodes;
|
2015-11-24 09:45:36 +00:00
|
|
|
static int gapvnodes; /* gap between wanted and desired */
|
|
|
|
static int vhiwat; /* enough extras after expansion */
|
|
|
|
static int vlowat; /* minimal extras before expansion */
|
|
|
|
static int vstir; /* nonzero to stir non-free vnodes */
|
|
|
|
static volatile int vsmalltrigger = 8; /* pref to keep if > this many pages */
|
2015-09-06 05:50:51 +00:00
|
|
|
|
|
|
|
/*
 * Sysctl handler that wraps sysctl_handle_int() and reacts to a change of
 * desiredvnodes.
 *
 * The current value of desiredvnodes is saved, the request is handed to
 * sysctl_handle_int(), and any error it reports is returned unchanged.
 * If desiredvnodes differs from the saved value afterwards, wantfreevnodes
 * is reset to one quarter of the new value and the new value is passed to
 * vfs_hash_changesize() and cache_changesize().
 *
 * Returns 0 on success, otherwise the error from sysctl_handle_int().
 */
static int
|
|
|
|
sysctl_update_desiredvnodes(SYSCTL_HANDLER_ARGS)
|
|
|
|
{
|
|
|
|
int error, old_desiredvnodes;
|
|
|
|
|
|
|
|
/*
 * Remember the value before the request is processed so that a change
 * can be detected below.  NOTE(review): this relies on the OID being
 * registered with &desiredvnodes as arg1 -- confirm at the SYSCTL_PROC.
 */
old_desiredvnodes = desiredvnodes;
|
|
|
|
if ((error = sysctl_handle_int(oidp, arg1, arg2, req)) != 0)
|
|
|
|
return (error);
|
|
|
|
/* Only redo the dependent sizing when the value actually changed. */
if (old_desiredvnodes != desiredvnodes) {
|
2015-11-24 09:45:36 +00:00
|
|
|
wantfreevnodes = desiredvnodes / 4;
|
|
|
|
/* XXX locking seems to be incomplete. */
|
2015-09-06 05:50:51 +00:00
|
|
|
vfs_hash_changesize(desiredvnodes);
|
|
|
|
cache_changesize(desiredvnodes);
|
|
|
|
}
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes,
|
|
|
|
CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, &desiredvnodes, 0,
|
2015-11-24 09:45:36 +00:00
|
|
|
sysctl_update_desiredvnodes, "I", "Target for maximum number of vnodes");
|
2011-01-12 19:54:19 +00:00
|
|
|
SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
|
2015-11-24 09:45:36 +00:00
|
|
|
&wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)");
|
2001-12-19 01:35:18 +00:00
|
|
|
static int vnlru_nowhere;
|
2004-01-05 19:04:29 +00:00
|
|
|
SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
|
|
|
|
&vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2013-01-14 05:52:23 +00:00
|
|
|
/* Shift count for (uintptr_t)vp to initialize vp->v_hash. */
|
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index 7c243b6..0bdaf36 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -279,6 +279,7 @@ SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
#define VSHOULDFREE(vp) (!((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
#define VSHOULDBUSY(vp) (((vp)->v_iflag & VI_FREE) && (vp)->v_holdcnt)
+static int vnsz2log;
/*
* Initialize the vnode management data structures.
@@ -293,6 +294,7 @@ SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
static void
vntblinit(void *dummy __unused)
{
+ u_int i;
int physvnodes, virtvnodes;
/*
@@ -332,6 +334,9 @@ vntblinit(void *dummy __unused)
syncer_maxdelay = syncer_mask + 1;
mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
cv_init(&sync_wakeup, "syncer");
+ for (i = 1; i <= sizeof(struct vnode); i <<= 1)
+ vnsz2log++;
+ vnsz2log--;
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);
@@ -1067,6 +1072,14 @@ alloc:
}
rangelock_init(&vp->v_rl);
+ /*
+ * For the filesystems which do not use vfs_hash_insert(),
+ * still initialize v_hash to have vfs_hash_index() useful.
+ * E.g., nullfs uses vfs_hash_index() on the lower vnode for
+ * its own hashing.
+ */
+ vp->v_hash = (uintptr_t)vp >> vnsz2log;
+
*vpp = vp;
return (0);
}
2013-01-14 05:42:54 +00:00
|
|
|
static int vnsz2log;
|
2005-03-15 14:38:16 +00:00
|
|
|
|
2013-05-12 04:05:01 +00:00
|
|
|
/*
|
|
|
|
* Support for the bufobj clean & dirty pctrie.
|
|
|
|
*/
|
|
|
|
/*
 * Node allocator handed to PCTRIE_DEFINE() for the buf trie.
 *
 * Returns the result of uma_zalloc() on buf_trie_zone with M_NOWAIT.
 * The "ptree" argument is not used.
 * NOTE(review): with M_NOWAIT the allocation can presumably fail and
 * return NULL -- confirm that the pctrie callers handle that.
 */
static void *
|
|
|
|
buf_trie_alloc(struct pctrie *ptree)
|
|
|
|
{
|
|
|
|
|
|
|
|
return uma_zalloc(buf_trie_zone, M_NOWAIT);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
buf_trie_free(struct pctrie *ptree, void *node)
|
|
|
|
{
|
|
|
|
|
|
|
|
uma_zfree(buf_trie_zone, node);
|
|
|
|
}
|
|
|
|
PCTRIE_DEFINE(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free);
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Initialize the vnode management data structures.
|
2010-08-02 21:33:36 +00:00
|
|
|
*
|
|
|
|
* Reevaluate the following cap on the number of vnodes after the physical
|
|
|
|
* memory size exceeds 512GB. In the limit, as the physical memory size
|
2015-11-24 09:45:36 +00:00
|
|
|
* grows, the ratio of the memory size in KB to vnodes approaches 64:1.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
2004-08-16 08:33:37 +00:00
|
|
|
#ifndef MAXVNODES_MAX
|
2015-11-24 09:45:36 +00:00
|
|
|
#define MAXVNODES_MAX (512 * 1024 * 1024 / 64) /* 8M */
|
2004-08-16 08:33:37 +00:00
|
|
|
#endif
|
2015-11-29 21:42:26 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Initialize a vnode as it first enters the zone.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
vnode_init(void *mem, int size, int flags)
|
|
|
|
{
|
|
|
|
struct vnode *vp;
|
|
|
|
struct bufobj *bo;
|
|
|
|
|
|
|
|
vp = mem;
|
|
|
|
bzero(vp, size);
|
|
|
|
/*
|
|
|
|
* Setup locks.
|
|
|
|
*/
|
|
|
|
vp->v_vnlock = &vp->v_lock;
|
|
|
|
mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
|
|
|
|
/*
|
|
|
|
* By default, don't allow shared locks unless filesystems opt-in.
|
|
|
|
*/
|
|
|
|
lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT,
|
|
|
|
LK_NOSHARE | LK_IS_VNODE);
|
|
|
|
/*
|
|
|
|
* Initialize bufobj.
|
|
|
|
*/
|
|
|
|
bo = &vp->v_bufobj;
|
|
|
|
rw_init(BO_LOCKPTR(bo), "bufobj interlock");
|
|
|
|
bo->bo_private = vp;
|
|
|
|
TAILQ_INIT(&bo->bo_clean.bv_hd);
|
|
|
|
TAILQ_INIT(&bo->bo_dirty.bv_hd);
|
|
|
|
/*
|
|
|
|
* Initialize namecache.
|
|
|
|
*/
|
|
|
|
LIST_INIT(&vp->v_cache_src);
|
|
|
|
TAILQ_INIT(&vp->v_cache_dst);
|
|
|
|
/*
|
|
|
|
* Initialize rangelocks.
|
|
|
|
*/
|
|
|
|
rangelock_init(&vp->v_rl);
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Free a vnode when it is cleared from the zone.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
vnode_fini(void *mem, int size)
|
|
|
|
{
|
|
|
|
struct vnode *vp;
|
|
|
|
struct bufobj *bo;
|
|
|
|
|
|
|
|
vp = mem;
|
|
|
|
rangelock_destroy(&vp->v_rl);
|
|
|
|
lockdestroy(vp->v_vnlock);
|
|
|
|
mtx_destroy(&vp->v_interlock);
|
|
|
|
bo = &vp->v_bufobj;
|
|
|
|
rw_destroy(BO_LOCKPTR(bo));
|
|
|
|
}
|
|
|
|
|
2016-02-24 15:15:46 +00:00
|
|
|
/*
|
|
|
|
* Provide the size of NFS nclnode and NFS fh for calculation of the
|
|
|
|
* vnode memory consumption. The size is specified directly to
|
|
|
|
* eliminate dependency on NFS-private header.
|
|
|
|
*
|
|
|
|
* Other filesystems may use bigger or smaller (like UFS and ZFS)
|
|
|
|
* private inode data, but the NFS-based estimation is ample enough.
|
|
|
|
* Still, we care about differences in the size between 64- and 32-bit
|
|
|
|
* platforms.
|
|
|
|
*
|
|
|
|
* Namecache structure size is heuristically
|
|
|
|
* sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1.
|
|
|
|
*/
|
|
|
|
#ifdef _LP64
|
|
|
|
#define NFS_NCLNODE_SZ (528 + 64)
|
|
|
|
#define NC_SZ 148
|
|
|
|
#else
|
|
|
|
#define NFS_NCLNODE_SZ (360 + 32)
|
|
|
|
#define NC_SZ 92
|
|
|
|
#endif
|
|
|
|
|
2000-12-06 07:09:08 +00:00
|
|
|
static void
|
|
|
|
vntblinit(void *dummy __unused)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index 7c243b6..0bdaf36 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -279,6 +279,7 @@ SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
#define VSHOULDFREE(vp) (!((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
#define VSHOULDBUSY(vp) (((vp)->v_iflag & VI_FREE) && (vp)->v_holdcnt)
+static int vnsz2log;
/*
* Initialize the vnode management data structures.
@@ -293,6 +294,7 @@ SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
static void
vntblinit(void *dummy __unused)
{
+ u_int i;
int physvnodes, virtvnodes;
/*
@@ -332,6 +334,9 @@ vntblinit(void *dummy __unused)
syncer_maxdelay = syncer_mask + 1;
mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
cv_init(&sync_wakeup, "syncer");
+ for (i = 1; i <= sizeof(struct vnode); i <<= 1)
+ vnsz2log++;
+ vnsz2log--;
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);
@@ -1067,6 +1072,14 @@ alloc:
}
rangelock_init(&vp->v_rl);
+ /*
+ * For the filesystems which do not use vfs_hash_insert(),
+ * still initialize v_hash to have vfs_hash_index() useful.
+ * E.g., nullfs uses vfs_hash_index() on the lower vnode for
+ * its own hashing.
+ */
+ vp->v_hash = (uintptr_t)vp >> vnsz2log;
+
*vpp = vp;
return (0);
}
2013-01-14 05:42:54 +00:00
|
|
|
u_int i;
|
2010-08-02 21:33:36 +00:00
|
|
|
int physvnodes, virtvnodes;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2003-05-23 19:54:02 +00:00
|
|
|
/*
|
2010-08-02 21:33:36 +00:00
|
|
|
* Desiredvnodes is a function of the physical memory size and the
|
|
|
|
* kernel's heap size. Generally speaking, it scales with the
|
2015-11-24 09:45:36 +00:00
|
|
|
* physical memory size. The ratio of desiredvnodes to the physical
|
|
|
|
* memory size is 1:16 until desiredvnodes exceeds 98,304.
|
|
|
|
* Thereafter, the
|
|
|
|
* marginal ratio of desiredvnodes to the physical memory size is
|
|
|
|
* 1:64. However, desiredvnodes is limited by the kernel's heap
|
2010-08-02 21:33:36 +00:00
|
|
|
* size. The memory required by desiredvnodes vnodes and vm objects
|
2016-02-24 15:15:46 +00:00
|
|
|
* must not exceed 1/10th of the kernel's heap size.
|
2003-05-23 19:54:02 +00:00
|
|
|
*/
|
2015-11-24 09:45:36 +00:00
|
|
|
physvnodes = maxproc + pgtok(vm_cnt.v_page_count) / 64 +
|
|
|
|
3 * min(98304 * 16, pgtok(vm_cnt.v_page_count)) / 64;
|
2016-02-24 15:15:46 +00:00
|
|
|
virtvnodes = vm_kmem_size / (10 * (sizeof(struct vm_object) +
|
|
|
|
sizeof(struct vnode) + NC_SZ * ncsizefactor + NFS_NCLNODE_SZ));
|
2010-08-02 21:33:36 +00:00
|
|
|
desiredvnodes = min(physvnodes, virtvnodes);
|
2004-08-16 08:33:37 +00:00
|
|
|
if (desiredvnodes > MAXVNODES_MAX) {
|
2004-08-02 21:52:43 +00:00
|
|
|
if (bootverbose)
|
|
|
|
printf("Reducing kern.maxvnodes %d -> %d\n",
|
2004-08-16 08:33:37 +00:00
|
|
|
desiredvnodes, MAXVNODES_MAX);
|
|
|
|
desiredvnodes = MAXVNODES_MAX;
|
2004-08-02 21:52:43 +00:00
|
|
|
}
|
2007-04-10 15:29:37 +00:00
|
|
|
wantfreevnodes = desiredvnodes / 4;
|
2002-04-04 21:03:38 +00:00
|
|
|
mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
|
1994-05-24 10:09:53 +00:00
|
|
|
TAILQ_INIT(&vnode_free_list);
|
2002-04-04 21:03:38 +00:00
|
|
|
mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
|
2002-03-20 04:09:59 +00:00
|
|
|
vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
|
2015-11-29 21:42:26 +00:00
|
|
|
vnode_init, vnode_fini, UMA_ALIGN_PTR, 0);
|
2002-03-20 04:09:59 +00:00
|
|
|
vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
|
2008-04-24 09:58:33 +00:00
|
|
|
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
|
2013-05-12 04:05:01 +00:00
|
|
|
/*
|
|
|
|
* Preallocate enough nodes to support one-per buf so that
|
|
|
|
* we can not fail an insert. reassignbuf() callers can not
|
|
|
|
* tolerate the insertion failure.
|
|
|
|
*/
|
|
|
|
buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(),
|
|
|
|
NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR,
|
|
|
|
UMA_ZONE_NOFREE | UMA_ZONE_VM);
|
|
|
|
uma_prealloc(buf_trie_zone, nbuf);
|
2016-12-31 19:59:31 +00:00
|
|
|
|
|
|
|
vnodes_created = counter_u64_alloc(M_WAITOK);
|
|
|
|
recycles_count = counter_u64_alloc(M_WAITOK);
|
|
|
|
free_owe_inact = counter_u64_alloc(M_WAITOK);
|
|
|
|
|
1998-03-08 09:59:44 +00:00
|
|
|
/*
|
|
|
|
* Initialize the filesystem syncer.
|
2002-06-06 15:46:38 +00:00
|
|
|
*/
|
2012-10-22 17:50:54 +00:00
|
|
|
syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
|
2008-05-04 13:54:55 +00:00
|
|
|
&syncer_mask);
|
1998-03-08 09:59:44 +00:00
|
|
|
syncer_maxdelay = syncer_mask + 1;
|
2002-09-25 02:22:21 +00:00
|
|
|
mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
|
2008-07-30 12:39:18 +00:00
|
|
|
cv_init(&sync_wakeup, "syncer");
|
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index 7c243b6..0bdaf36 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -279,6 +279,7 @@ SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
#define VSHOULDFREE(vp) (!((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
#define VSHOULDBUSY(vp) (((vp)->v_iflag & VI_FREE) && (vp)->v_holdcnt)
+static int vnsz2log;
/*
* Initialize the vnode management data structures.
@@ -293,6 +294,7 @@ SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
static void
vntblinit(void *dummy __unused)
{
+ u_int i;
int physvnodes, virtvnodes;
/*
@@ -332,6 +334,9 @@ vntblinit(void *dummy __unused)
syncer_maxdelay = syncer_mask + 1;
mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
cv_init(&sync_wakeup, "syncer");
+ for (i = 1; i <= sizeof(struct vnode); i <<= 1)
+ vnsz2log++;
+ vnsz2log--;
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);
@@ -1067,6 +1072,14 @@ alloc:
}
rangelock_init(&vp->v_rl);
+ /*
+ * For the filesystems which do not use vfs_hash_insert(),
+ * still initialize v_hash to have vfs_hash_index() useful.
+ * E.g., nullfs uses vfs_hash_index() on the lower vnode for
+ * its own hashing.
+ */
+ vp->v_hash = (uintptr_t)vp >> vnsz2log;
+
*vpp = vp;
return (0);
}
2013-01-14 05:42:54 +00:00
|
|
|
for (i = 1; i <= sizeof(struct vnode); i <<= 1)
|
|
|
|
vnsz2log++;
|
|
|
|
vnsz2log--;
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2008-03-16 10:58:09 +00:00
|
|
|
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);
|
2000-12-06 07:09:08 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
/*
|
1997-02-10 02:22:35 +00:00
|
|
|
* Mark a mount point as busy. Used to synchronize access and to delay
|
2008-11-02 10:15:42 +00:00
|
|
|
* unmounting.  On failure, mountlist_mtx is not released.
|
2011-10-04 18:45:29 +00:00
|
|
|
*
|
|
|
|
* vfs_busy() is a custom lock, it can block the caller.
|
|
|
|
* vfs_busy() only sleeps if the unmount is active on the mount point.
|
|
|
|
* For a mountpoint mp, vfs_busy-enforced lock is before lock of any
|
|
|
|
* vnode belonging to mp.
|
|
|
|
*
|
|
|
|
* Lookup uses vfs_busy() to traverse mount points.
|
|
|
|
* root fs var fs
|
|
|
|
* / vnode lock A / vnode lock (/var) D
|
|
|
|
* /var vnode lock B /log vnode lock(/var/log) E
|
|
|
|
* vfs_busy lock C vfs_busy lock F
|
|
|
|
*
|
|
|
|
* Within each file system, the lock order is C->A->B and F->D->E.
|
|
|
|
*
|
|
|
|
* When traversing across mounts, the system follows that lock order:
|
|
|
|
*
|
|
|
|
* C->A->B
|
|
|
|
* |
|
|
|
|
* +->F->D->E
|
|
|
|
*
|
|
|
|
* The lookup() process for namei("/var") illustrates the process:
|
|
|
|
* VOP_LOOKUP() obtains B while A is held
|
|
|
|
* vfs_busy() obtains a shared lock on F while A and B are held
|
|
|
|
* vput() releases lock on B
|
|
|
|
* vput() releases lock on A
|
|
|
|
* VFS_ROOT() obtains lock on D while shared lock on F is held
|
|
|
|
* vfs_unbusy() releases shared lock on F
|
|
|
|
* vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A.
|
|
|
|
* Attempt to lock A (instead of vp_crossmp) while D is held would
|
|
|
|
* violate the global order, causing deadlocks.
|
|
|
|
*
|
|
|
|
* dounmount() locks B while F is drained.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2008-11-02 10:15:42 +00:00
|
|
|
vfs_busy(struct mount *mp, int flags)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2008-11-02 10:15:42 +00:00
|
|
|
|
|
|
|
MPASS((flags & ~MBF_MASK) == 0);
|
2009-02-05 15:03:35 +00:00
|
|
|
CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags);
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2005-01-24 10:41:01 +00:00
|
|
|
MNT_ILOCK(mp);
|
2006-02-22 06:20:12 +00:00
|
|
|
MNT_REF(mp);
|
2009-03-02 20:51:39 +00:00
|
|
|
/*
|
2016-04-29 22:15:33 +00:00
|
|
|
* If mount point is currently being unmounted, sleep until the
|
2009-03-02 20:51:39 +00:00
|
|
|
* mount point fate is decided. If thread doing the unmounting fails,
|
|
|
|
* it will clear MNTK_UNMOUNT flag before waking us up, indicating
|
|
|
|
* that this mount point has survived the unmount attempt and vfs_busy
|
|
|
|
* should retry. Otherwise the unmounter thread will set MNTK_REFEXPIRE
|
|
|
|
* flag in addition to MNTK_UNMOUNT, indicating that mount point is
|
|
|
|
* about to be really destroyed. vfs_busy needs to release its
|
|
|
|
* reference on the mount point in this case and return with ENOENT,
|
|
|
|
* telling the caller that mount mount it tried to busy is no longer
|
|
|
|
* valid.
|
|
|
|
*/
|
|
|
|
while (mp->mnt_kern_flag & MNTK_UNMOUNT) {
|
1) Fix a deadlock in the VFS:
- threadA runs vfs_rel(mp1)
- threadB does unmount the mp1 fs, sets MNTK_UNMOUNT and drop MNT_ILOCK()
- threadA runs vfs_busy(mp1) and, as long as, MNTK_UNMOUNT is set, sleeps
waiting for threadB to complete the unmount
- threadB, in vfs_mount_destroy(), finds mnt_lock > 0 and sleeps waiting
for the refcount to expire.
Fix the deadlock by adding a flag called MNTK_REFEXPIRE which signals the
unmounter is waiting for mnt_ref to expire.
The vfs_busy contenders got awake, fails, and if they retry the
MNTK_REFEXPIRE won't allow them to sleep again.
2) Simplify significantly the code of vfs_mount_destroy() trimming
unnecessary codes:
- as long as any reference exited, it is no-more possible to have
write-op (primarty and secondary) in progress.
- it is no needed to drop and reacquire the mount lock.
- filling the structures with dummy values is unuseful as long as
it is going to be freed.
Tested by: pho, Andrea Barberio <insomniac at slackware dot it>
Discussed with: kib
2008-12-16 23:16:10 +00:00
|
|
|
if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) {
|
2006-02-22 06:20:12 +00:00
|
|
|
MNT_REL(mp);
|
2005-01-24 10:41:01 +00:00
|
|
|
MNT_IUNLOCK(mp);
|
2009-02-05 15:03:35 +00:00
|
|
|
CTR1(KTR_VFS, "%s: failed busying before sleeping",
|
|
|
|
__func__);
|
1997-02-10 02:22:35 +00:00
|
|
|
return (ENOENT);
|
2005-01-24 10:41:01 +00:00
|
|
|
}
|
2008-11-02 10:15:42 +00:00
|
|
|
if (flags & MBF_MNTLSTLOCK)
|
|
|
|
mtx_unlock(&mountlist_mtx);
|
1997-11-12 05:42:33 +00:00
|
|
|
mp->mnt_kern_flag |= MNTK_MWAIT;
|
2011-04-23 11:22:48 +00:00
|
|
|
msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0);
|
2008-11-02 10:15:42 +00:00
|
|
|
if (flags & MBF_MNTLSTLOCK)
|
|
|
|
mtx_lock(&mountlist_mtx);
|
2011-04-23 11:22:48 +00:00
|
|
|
MNT_ILOCK(mp);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2008-11-02 10:15:42 +00:00
|
|
|
if (flags & MBF_MNTLSTLOCK)
|
|
|
|
mtx_unlock(&mountlist_mtx);
|
|
|
|
mp->mnt_lockref++;
|
|
|
|
MNT_IUNLOCK(mp);
|
1994-05-24 10:09:53 +00:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
1997-02-10 02:22:35 +00:00
|
|
|
* Free a busy filesystem.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
|
|
|
void
|
2008-08-31 14:26:08 +00:00
|
|
|
vfs_unbusy(struct mount *mp)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
|
|
|
|
2009-02-05 15:03:35 +00:00
|
|
|
CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
|
2008-11-02 10:15:42 +00:00
|
|
|
MNT_ILOCK(mp);
|
|
|
|
MNT_REL(mp);
|
2009-02-06 18:16:01 +00:00
|
|
|
KASSERT(mp->mnt_lockref > 0, ("negative mnt_lockref"));
|
2008-11-02 10:15:42 +00:00
|
|
|
mp->mnt_lockref--;
|
|
|
|
if (mp->mnt_lockref == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) {
|
|
|
|
MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT);
|
2009-02-05 15:03:35 +00:00
|
|
|
CTR1(KTR_VFS, "%s: waking up waiters", __func__);
|
2008-11-02 10:15:42 +00:00
|
|
|
mp->mnt_kern_flag &= ~MNTK_DRAINING;
|
|
|
|
wakeup(&mp->mnt_lockref);
|
|
|
|
}
|
|
|
|
MNT_IUNLOCK(mp);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Lookup a mount point by filesystem identifier.
|
|
|
|
*/
|
|
|
|
struct mount *
|
2006-01-21 19:42:10 +00:00
|
|
|
vfs_getvfs(fsid_t *fsid)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2005-01-28 12:39:10 +00:00
|
|
|
struct mount *mp;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2009-02-05 15:03:35 +00:00
|
|
|
CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
|
Change and clean the mutex lock interface.
mtx_enter(lock, type) becomes:
mtx_lock(lock) for sleep locks (MTX_DEF-initialized locks)
mtx_lock_spin(lock) for spin locks (MTX_SPIN-initialized)
similarily, for releasing a lock, we now have:
mtx_unlock(lock) for MTX_DEF and mtx_unlock_spin(lock) for MTX_SPIN.
We change the caller interface for the two different types of locks
because the semantics are entirely different for each case, and this
makes it explicitly clear and, at the same time, it rids us of the
extra `type' argument.
The enter->lock and exit->unlock change has been made with the idea
that we're "locking data" and not "entering locked code" in mind.
Further, remove all additional "flags" previously passed to the
lock acquire/release routines with the exception of two:
MTX_QUIET and MTX_NOSWITCH
The functionality of these flags is preserved and they can be passed
to the lock/unlock routines by calling the corresponding wrappers:
mtx_{lock, unlock}_flags(lock, flag(s)) and
mtx_{lock, unlock}_spin_flags(lock, flag(s)) for MTX_DEF and MTX_SPIN
locks, respectively.
Re-inline some lock acq/rel code; in the sleep lock case, we only
inline the _obtain_lock()s in order to ensure that the inlined code
fits into a cache line. In the spin lock case, we inline recursion and
actually only perform a function call if we need to spin. This change
has been made with the idea that we generally tend to avoid spin locks
and that also the spin locks that we do have and are heavily used
(i.e. sched_lock) do recurse, and therefore in an effort to reduce
function call overhead for some architectures (such as alpha), we
inline recursion for this case.
Create a new malloc type for the witness code and retire from using
the M_DEV type. The new type is called M_WITNESS and is only declared
if WITNESS is enabled.
Begin cleaning up some machdep/mutex.h code - specifically updated the
"optimized" inlined code in alpha/mutex.h and wrote MTX_LOCK_SPIN
and MTX_UNLOCK_SPIN asm macros for the i386/mutex.h as we presently
need those.
Finally, caught up to the interface changes in all sys code.
Contributors: jake, jhb, jasone (in no particular order)
2001-02-09 06:11:45 +00:00
|
|
|
mtx_lock(&mountlist_mtx);
|
1999-11-20 10:00:46 +00:00
|
|
|
TAILQ_FOREACH(mp, &mountlist, mnt_list) {
|
1994-05-24 10:09:53 +00:00
|
|
|
if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
|
1997-02-10 02:22:35 +00:00
|
|
|
mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
|
2006-03-31 03:53:25 +00:00
|
|
|
vfs_ref(mp);
|
Change and clean the mutex lock interface.
mtx_enter(lock, type) becomes:
mtx_lock(lock) for sleep locks (MTX_DEF-initialized locks)
mtx_lock_spin(lock) for spin locks (MTX_SPIN-initialized)
similarily, for releasing a lock, we now have:
mtx_unlock(lock) for MTX_DEF and mtx_unlock_spin(lock) for MTX_SPIN.
We change the caller interface for the two different types of locks
because the semantics are entirely different for each case, and this
makes it explicitly clear and, at the same time, it rids us of the
extra `type' argument.
The enter->lock and exit->unlock change has been made with the idea
that we're "locking data" and not "entering locked code" in mind.
Further, remove all additional "flags" previously passed to the
lock acquire/release routines with the exception of two:
MTX_QUIET and MTX_NOSWITCH
The functionality of these flags is preserved and they can be passed
to the lock/unlock routines by calling the corresponding wrappers:
mtx_{lock, unlock}_flags(lock, flag(s)) and
mtx_{lock, unlock}_spin_flags(lock, flag(s)) for MTX_DEF and MTX_SPIN
locks, respectively.
Re-inline some lock acq/rel code; in the sleep lock case, we only
inline the _obtain_lock()s in order to ensure that the inlined code
fits into a cache line. In the spin lock case, we inline recursion and
actually only perform a function call if we need to spin. This change
has been made with the idea that we generally tend to avoid spin locks
and that also the spin locks that we do have and are heavily used
(i.e. sched_lock) do recurse, and therefore in an effort to reduce
function call overhead for some architectures (such as alpha), we
inline recursion for this case.
Create a new malloc type for the witness code and retire from using
the M_DEV type. The new type is called M_WITNESS and is only declared
if WITNESS is enabled.
Begin cleaning up some machdep/mutex.h code - specifically updated the
"optimized" inlined code in alpha/mutex.h and wrote MTX_LOCK_SPIN
and MTX_UNLOCK_SPIN asm macros for the i386/mutex.h as we presently
need those.
Finally, caught up to the interface changes in all sys code.
Contributors: jake, jhb, jasone (in no particular order)
2001-02-09 06:11:45 +00:00
|
|
|
mtx_unlock(&mountlist_mtx);
|
1994-05-24 10:09:53 +00:00
|
|
|
return (mp);
|
2002-10-01 15:48:31 +00:00
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
Change and clean the mutex lock interface.
mtx_enter(lock, type) becomes:
mtx_lock(lock) for sleep locks (MTX_DEF-initialized locks)
mtx_lock_spin(lock) for spin locks (MTX_SPIN-initialized)
similarily, for releasing a lock, we now have:
mtx_unlock(lock) for MTX_DEF and mtx_unlock_spin(lock) for MTX_SPIN.
We change the caller interface for the two different types of locks
because the semantics are entirely different for each case, and this
makes it explicitly clear and, at the same time, it rids us of the
extra `type' argument.
The enter->lock and exit->unlock change has been made with the idea
that we're "locking data" and not "entering locked code" in mind.
Further, remove all additional "flags" previously passed to the
lock acquire/release routines with the exception of two:
MTX_QUIET and MTX_NOSWITCH
The functionality of these flags is preserved and they can be passed
to the lock/unlock routines by calling the corresponding wrappers:
mtx_{lock, unlock}_flags(lock, flag(s)) and
mtx_{lock, unlock}_spin_flags(lock, flag(s)) for MTX_DEF and MTX_SPIN
locks, respectively.
Re-inline some lock acq/rel code; in the sleep lock case, we only
inline the _obtain_lock()s in order to ensure that the inlined code
fits into a cache line. In the spin lock case, we inline recursion and
actually only perform a function call if we need to spin. This change
has been made with the idea that we generally tend to avoid spin locks
and that also the spin locks that we do have and are heavily used
(i.e. sched_lock) do recurse, and therefore in an effort to reduce
function call overhead for some architectures (such as alpha), we
inline recursion for this case.
Create a new malloc type for the witness code and retire from using
the M_DEV type. The new type is called M_WITNESS and is only declared
if WITNESS is enabled.
Begin cleaning up some machdep/mutex.h code - specifically updated the
"optimized" inlined code in alpha/mutex.h and wrote MTX_LOCK_SPIN
and MTX_UNLOCK_SPIN asm macros for the i386/mutex.h as we presently
need those.
Finally, caught up to the interface changes in all sys code.
Contributors: jake, jhb, jasone (in no particular order)
2001-02-09 06:11:45 +00:00
|
|
|
mtx_unlock(&mountlist_mtx);
|
2009-02-05 15:03:35 +00:00
|
|
|
CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
|
These changes embody the support of the fully coherent merged VM buffer cache,
much higher filesystem I/O performance, and much better paging performance. It
represents the culmination of over 6 months of R&D.
The majority of the merged VM/cache work is by John Dyson.
The following highlights the most significant changes. Additionally, there are
(mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to
support the new VM/buffer scheme.
vfs_bio.c:
Significant rewrite of most of vfs_bio to support the merged VM buffer cache
scheme. The scheme is almost fully compatible with the old filesystem
interface. Significant improvement in the number of opportunities for write
clustering.
vfs_cluster.c, vfs_subr.c
Upgrade and performance enhancements in vfs layer code to support merged
VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff.
vm_object.c:
Yet more improvements in the collapse code. Elimination of some windows that
can cause list corruption.
vm_pageout.c:
Fixed it, it really works better now. Somehow in 2.0, some "enhancements"
broke the code. This code has been reworked from the ground-up.
vm_fault.c, vm_page.c, pmap.c, vm_object.c
Support for small-block filesystems with merged VM/buffer cache scheme.
pmap.c vm_map.c
Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of
kernel PTs.
vm_glue.c
Much simpler and more effective swapping code. No more gratuitous swapping.
proc.h
Fixed the problem that the p_lock flag was not being cleared on a fork.
swap_pager.c, vnode_pager.c
Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the
code doesn't need it anymore.
machdep.c
Changes to better support the parameter values for the merged VM/buffer cache
scheme.
machdep.c, kern_exec.c, vm_glue.c
Implemented a seperate submap for temporary exec string space and another one
to contain process upages. This eliminates all map fragmentation problems
that previously existed.
ffs_inode.c, ufs_inode.c, ufs_readwrite.c
Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on
busy buffers.
Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
|
|
|
return ((struct mount *) 0);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
2008-11-29 13:34:59 +00:00
|
|
|
/*
|
|
|
|
* Lookup a mount point by filesystem identifier, busying it before
|
|
|
|
* returning.
|
2014-06-12 12:43:48 +00:00
|
|
|
*
|
|
|
|
* To avoid congestion on mountlist_mtx, implement simple direct-mapped
|
|
|
|
* cache for popular filesystem identifiers.  The cache is lockless, using
|
|
|
|
* the fact that struct mount's are never freed. In worst case we may
|
|
|
|
* get pointer to unmounted or even different filesystem, so we have to
|
|
|
|
* check what we got, and go slow way if so.
|
2008-11-29 13:34:59 +00:00
|
|
|
*/
|
|
|
|
struct mount *
|
|
|
|
vfs_busyfs(fsid_t *fsid)
|
|
|
|
{
|
2014-06-12 12:43:48 +00:00
|
|
|
#define FSID_CACHE_SIZE 256
|
|
|
|
typedef struct mount * volatile vmp_t;
|
|
|
|
static vmp_t cache[FSID_CACHE_SIZE];
|
2008-11-29 13:34:59 +00:00
|
|
|
struct mount *mp;
|
|
|
|
int error;
|
2014-06-12 12:43:48 +00:00
|
|
|
uint32_t hash;
|
2008-11-29 13:34:59 +00:00
|
|
|
|
2009-02-05 15:03:35 +00:00
|
|
|
CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
|
2014-06-12 12:43:48 +00:00
|
|
|
hash = fsid->val[0] ^ fsid->val[1];
|
|
|
|
hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1);
|
|
|
|
mp = cache[hash];
|
|
|
|
if (mp == NULL ||
|
|
|
|
mp->mnt_stat.f_fsid.val[0] != fsid->val[0] ||
|
|
|
|
mp->mnt_stat.f_fsid.val[1] != fsid->val[1])
|
|
|
|
goto slow;
|
|
|
|
if (vfs_busy(mp, 0) != 0) {
|
|
|
|
cache[hash] = NULL;
|
|
|
|
goto slow;
|
|
|
|
}
|
|
|
|
if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
|
|
|
|
mp->mnt_stat.f_fsid.val[1] == fsid->val[1])
|
|
|
|
return (mp);
|
|
|
|
else
|
|
|
|
vfs_unbusy(mp);
|
|
|
|
|
|
|
|
slow:
|
2008-11-29 13:34:59 +00:00
|
|
|
mtx_lock(&mountlist_mtx);
|
|
|
|
TAILQ_FOREACH(mp, &mountlist, mnt_list) {
|
|
|
|
if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
|
|
|
|
mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
|
|
|
|
error = vfs_busy(mp, MBF_MNTLSTLOCK);
|
|
|
|
if (error) {
|
2014-06-12 12:43:48 +00:00
|
|
|
cache[hash] = NULL;
|
2008-11-29 13:34:59 +00:00
|
|
|
mtx_unlock(&mountlist_mtx);
|
|
|
|
return (NULL);
|
|
|
|
}
|
2014-06-12 12:43:48 +00:00
|
|
|
cache[hash] = mp;
|
2008-11-29 13:34:59 +00:00
|
|
|
return (mp);
|
|
|
|
}
|
|
|
|
}
|
2009-02-05 15:03:35 +00:00
|
|
|
CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
|
2008-11-29 13:34:59 +00:00
|
|
|
mtx_unlock(&mountlist_mtx);
|
|
|
|
return ((struct mount *) 0);
|
|
|
|
}
|
|
|
|
|
2004-07-06 09:37:43 +00:00
|
|
|
/*
|
2007-04-10 15:22:40 +00:00
|
|
|
* Check if a user can access privileged mount options.
|
2004-07-06 09:37:43 +00:00
|
|
|
*/
|
|
|
|
int
|
|
|
|
vfs_suser(struct mount *mp, struct thread *td)
|
|
|
|
{
|
|
|
|
int error;
|
|
|
|
|
2007-04-13 23:54:22 +00:00
|
|
|
/*
|
|
|
|
* If the thread is jailed, but this is not a jail-friendly file
|
|
|
|
* system, deny immediately.
|
|
|
|
*/
|
Update ZFS from version 6 to 13 and bring some FreeBSD-specific changes.
This bring huge amount of changes, I'll enumerate only user-visible changes:
- Delegated Administration
Allows regular users to perform ZFS operations, like file system
creation, snapshot creation, etc.
- L2ARC
Level 2 cache for ZFS - allows to use additional disks for cache.
Huge performance improvements mostly for random read of mostly
static content.
- slog
Allow to use additional disks for ZFS Intent Log to speed up
operations like fsync(2).
- vfs.zfs.super_owner
Allows regular users to perform privileged operations on files stored
on ZFS file systems owned by him. Very careful with this one.
- chflags(2)
Not all the flags are supported. This still needs work.
- ZFSBoot
Support to boot off of ZFS pool. Not finished, AFAIK.
Submitted by: dfr
- Snapshot properties
- New failure modes
Before if write requested failed, system paniced. Now one
can select from one of three failure modes:
- panic - panic on write error
- wait - wait for disk to reappear
- continue - serve read requests if possible, block write requests
- Refquota, refreservation properties
Just quota and reservation properties, but don't count space consumed
by children file systems, clones and snapshots.
- Sparse volumes
ZVOLs that don't reserve space in the pool.
- External attributes
Compatible with extattr(2).
- NFSv4-ACLs
Not sure about the status, might not be complete yet.
Submitted by: trasz
- Creation-time properties
- Regression tests for zpool(8) command.
Obtained from: OpenSolaris
2008-11-17 20:49:29 +00:00
|
|
|
if (!(mp->mnt_vfc->vfc_flags & VFCF_JAIL) && jailed(td->td_ucred))
|
2007-04-13 23:54:22 +00:00
|
|
|
return (EPERM);
|
|
|
|
|
|
|
|
/*
|
2009-05-27 14:11:23 +00:00
|
|
|
* If the file system was mounted outside the jail of the calling
|
|
|
|
* thread, deny immediately.
|
2007-04-13 23:54:22 +00:00
|
|
|
*/
|
2009-07-02 14:19:33 +00:00
|
|
|
if (prison_check(td->td_ucred, mp->mnt_cred) != 0)
|
2007-04-13 23:54:22 +00:00
|
|
|
return (EPERM);
|
|
|
|
|
Update ZFS from version 6 to 13 and bring some FreeBSD-specific changes.
This bring huge amount of changes, I'll enumerate only user-visible changes:
- Delegated Administration
Allows regular users to perform ZFS operations, like file system
creation, snapshot creation, etc.
- L2ARC
Level 2 cache for ZFS - allows to use additional disks for cache.
Huge performance improvements mostly for random read of mostly
static content.
- slog
Allow to use additional disks for ZFS Intent Log to speed up
operations like fsync(2).
- vfs.zfs.super_owner
Allows regular users to perform privileged operations on files stored
on ZFS file systems owned by him. Very careful with this one.
- chflags(2)
Not all the flags are supported. This still needs work.
- ZFSBoot
Support to boot off of ZFS pool. Not finished, AFAIK.
Submitted by: dfr
- Snapshot properties
- New failure modes
Before, if a write request failed, the system panicked. Now one
can select from one of three failure modes:
- panic - panic on write error
- wait - wait for disk to reappear
- continue - serve read requests if possible, block write requests
- Refquota, refreservation properties
Just quota and reservation properties, but don't count space consumed
by children file systems, clones and snapshots.
- Sparse volumes
ZVOLs that don't reserve space in the pool.
- External attributes
Compatible with extattr(2).
- NFSv4-ACLs
Not sure about the status, might not be complete yet.
Submitted by: trasz
- Creation-time properties
- Regression tests for zpool(8) command.
Obtained from: OpenSolaris
2008-11-17 20:49:29 +00:00
|
|
|
/*
|
|
|
|
* If file system supports delegated administration, we don't check
|
|
|
|
* for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified
|
|
|
|
* by the file system itself.
|
|
|
|
* If this is not the user that did original mount, we check for
|
|
|
|
* the PRIV_VFS_MOUNT_OWNER privilege.
|
|
|
|
*/
|
|
|
|
if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) &&
|
2004-07-06 09:37:43 +00:00
|
|
|
mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
|
2006-11-06 13:42:10 +00:00
|
|
|
if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0)
|
2004-07-06 09:37:43 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
2000-03-14 14:19:49 +00:00
|
|
|
* Get a new unique fsid. Try to make its val[0] unique, since this value
|
|
|
|
* will be used to create fake device numbers for stat(). Also try (but
|
|
|
|
* not so hard) make its val[0] unique mod 2^16, since some emulators only
|
|
|
|
* support 16-bit device numbers. We end up with unique val[0]'s for the
|
|
|
|
* first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
|
1999-09-19 06:24:21 +00:00
|
|
|
*
|
2000-03-12 14:23:21 +00:00
|
|
|
* Keep in mind that several mounts may be running in parallel. Starting
|
2000-03-14 14:19:49 +00:00
|
|
|
* the search one past where the previous search terminated is both a
|
|
|
|
* micro-optimization and a defense against returning the same fsid to
|
|
|
|
* different mounts.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
|
|
|
void
|
2006-01-21 19:42:10 +00:00
|
|
|
vfs_getnewfsid(struct mount *mp)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2010-06-21 09:55:56 +00:00
|
|
|
static uint16_t mntid_base;
|
2006-03-31 03:53:25 +00:00
|
|
|
struct mount *nmp;
|
1994-05-24 10:09:53 +00:00
|
|
|
fsid_t tfsid;
|
2000-03-14 14:19:49 +00:00
|
|
|
int mtype;
|
1999-09-19 06:24:21 +00:00
|
|
|
|
2009-02-05 15:03:35 +00:00
|
|
|
CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
|
Change and clean the mutex lock interface.
mtx_enter(lock, type) becomes:
mtx_lock(lock) for sleep locks (MTX_DEF-initialized locks)
mtx_lock_spin(lock) for spin locks (MTX_SPIN-initialized)
similarily, for releasing a lock, we now have:
mtx_unlock(lock) for MTX_DEF and mtx_unlock_spin(lock) for MTX_SPIN.
We change the caller interface for the two different types of locks
because the semantics are entirely different for each case, and this
makes it explicitly clear and, at the same time, it rids us of the
extra `type' argument.
The enter->lock and exit->unlock change has been made with the idea
that we're "locking data" and not "entering locked code" in mind.
Further, remove all additional "flags" previously passed to the
lock acquire/release routines with the exception of two:
MTX_QUIET and MTX_NOSWITCH
The functionality of these flags is preserved and they can be passed
to the lock/unlock routines by calling the corresponding wrappers:
mtx_{lock, unlock}_flags(lock, flag(s)) and
mtx_{lock, unlock}_spin_flags(lock, flag(s)) for MTX_DEF and MTX_SPIN
locks, respectively.
Re-inline some lock acq/rel code; in the sleep lock case, we only
inline the _obtain_lock()s in order to ensure that the inlined code
fits into a cache line. In the spin lock case, we inline recursion and
actually only perform a function call if we need to spin. This change
has been made with the idea that we generally tend to avoid spin locks
and that also the spin locks that we do have and are heavily used
(i.e. sched_lock) do recurse, and therefore in an effort to reduce
function call overhead for some architectures (such as alpha), we
inline recursion for this case.
Create a new malloc type for the witness code and retire from using
the M_DEV type. The new type is called M_WITNESS and is only declared
if WITNESS is enabled.
Begin cleaning up some machdep/mutex.h code - specifically updated the
"optimized" inlined code in alpha/mutex.h and wrote MTX_LOCK_SPIN
and MTX_UNLOCK_SPIN asm macros for the i386/mutex.h as we presently
need those.
Finally, caught up to the interface changes in all sys code.
Contributors: jake, jhb, jasone (in no particular order)
2001-02-09 06:11:45 +00:00
|
|
|
mtx_lock(&mntid_mtx);
|
1997-02-10 02:22:35 +00:00
|
|
|
mtype = mp->mnt_vfc->vfc_typenum;
|
2000-03-12 14:23:21 +00:00
|
|
|
tfsid.val[1] = mtype;
|
2000-07-07 14:01:08 +00:00
|
|
|
mtype = (mtype & 0xFF) << 24;
|
2000-03-14 14:19:49 +00:00
|
|
|
for (;;) {
|
2004-06-17 17:16:53 +00:00
|
|
|
tfsid.val[0] = makedev(255,
|
2000-07-07 14:01:08 +00:00
|
|
|
mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
|
|
|
|
mntid_base++;
|
2006-03-31 03:53:25 +00:00
|
|
|
if ((nmp = vfs_getvfs(&tfsid)) == NULL)
|
1999-09-19 06:24:21 +00:00
|
|
|
break;
|
2006-03-31 03:53:25 +00:00
|
|
|
vfs_rel(nmp);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
|
1999-09-19 06:24:21 +00:00
|
|
|
mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
|
Change and clean the mutex lock interface.
mtx_enter(lock, type) becomes:
mtx_lock(lock) for sleep locks (MTX_DEF-initialized locks)
mtx_lock_spin(lock) for spin locks (MTX_SPIN-initialized)
similarily, for releasing a lock, we now have:
mtx_unlock(lock) for MTX_DEF and mtx_unlock_spin(lock) for MTX_SPIN.
We change the caller interface for the two different types of locks
because the semantics are entirely different for each case, and this
makes it explicitly clear and, at the same time, it rids us of the
extra `type' argument.
The enter->lock and exit->unlock change has been made with the idea
that we're "locking data" and not "entering locked code" in mind.
Further, remove all additional "flags" previously passed to the
lock acquire/release routines with the exception of two:
MTX_QUIET and MTX_NOSWITCH
The functionality of these flags is preserved and they can be passed
to the lock/unlock routines by calling the corresponding wrappers:
mtx_{lock, unlock}_flags(lock, flag(s)) and
mtx_{lock, unlock}_spin_flags(lock, flag(s)) for MTX_DEF and MTX_SPIN
locks, respectively.
Re-inline some lock acq/rel code; in the sleep lock case, we only
inline the _obtain_lock()s in order to ensure that the inlined code
fits into a cache line. In the spin lock case, we inline recursion and
actually only perform a function call if we need to spin. This change
has been made with the idea that we generally tend to avoid spin locks
and that also the spin locks that we do have and are heavily used
(i.e. sched_lock) do recurse, and therefore in an effort to reduce
function call overhead for some architectures (such as alpha), we
inline recursion for this case.
Create a new malloc type for the witness code and retire from using
the M_DEV type. The new type is called M_WITNESS and is only declared
if WITNESS is enabled.
Begin cleaning up some machdep/mutex.h code - specifically updated the
"optimized" inlined code in alpha/mutex.h and wrote MTX_LOCK_SPIN
and MTX_UNLOCK_SPIN asm macros for the i386/mutex.h as we presently
need those.
Finally, caught up to the interface changes in all sys code.
Contributors: jake, jhb, jasone (in no particular order)
2001-02-09 06:11:45 +00:00
|
|
|
mtx_unlock(&mntid_mtx);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
Support full-precision file timestamps. Until now, only the seconds
have been maintained, and that is still the default. A new sysctl
variable "vfs.timestamp_precision" can be used to enable higher
levels of precision:
0 = seconds only; nanoseconds zeroed (default).
1 = seconds and nanoseconds, accurate within 1/HZ.
2 = seconds and nanoseconds, truncated to microseconds.
>=3 = seconds and nanoseconds, maximum precision.
Level 1 uses getnanotime(), which is fast but can be wrong by up
to 1/HZ. Level 2 uses microtime(). It might be desirable for
consistency with utimes() and friends, which take timeval structures
rather than timespecs. Level 3 uses nanotime() for the highest
precision.
I benchmarked levels 0, 1, and 3 by copying a 550 MB tree with
"cpio -pdu". There was almost negligible difference in the system
times -- much less than 1%, and less than the variation among
multiple runs at the same level. Bruce Evans dreamed up a torture
test involving 1-byte reads with intervening fstat() calls, but
the cpio test seems more realistic to me.
This feature is currently implemented only for the UFS (FFS and
MFS) filesystems. But I think it should be easy to support it in
the others as well.
An earlier version of this was reviewed by Bruce. He's not to
blame for any breakage I've introduced since then.
Reviewed by: bde (an earlier version of the code)
1999-08-22 00:15:16 +00:00
|
|
|
/*
|
|
|
|
* Knob to control the precision of file timestamps:
|
|
|
|
*
|
|
|
|
* 0 = seconds only; nanoseconds zeroed.
|
|
|
|
* 1 = seconds and nanoseconds, accurate within 1/HZ.
|
|
|
|
* 2 = seconds and nanoseconds, truncated to microseconds.
|
|
|
|
* >=3 = seconds and nanoseconds, maximum precision.
|
|
|
|
*/
|
|
|
|
enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
|
|
|
|
|
2015-01-25 19:56:45 +00:00
|
|
|
static int timestamp_precision = TSP_USEC;
|
Support full-precision file timestamps. Until now, only the seconds
have been maintained, and that is still the default. A new sysctl
variable "vfs.timestamp_precision" can be used to enable higher
levels of precision:
0 = seconds only; nanoseconds zeroed (default).
1 = seconds and nanoseconds, accurate within 1/HZ.
2 = seconds and nanoseconds, truncated to microseconds.
>=3 = seconds and nanoseconds, maximum precision.
Level 1 uses getnanotime(), which is fast but can be wrong by up
to 1/HZ. Level 2 uses microtime(). It might be desirable for
consistency with utimes() and friends, which take timeval structures
rather than timespecs. Level 3 uses nanotime() for the higest
precision.
I benchmarked levels 0, 1, and 3 by copying a 550 MB tree with
"cpio -pdu". There was almost negligible difference in the system
times -- much less than 1%, and less than the variation among
multiple runs at the same level. Bruce Evans dreamed up a torture
test involving 1-byte reads with intervening fstat() calls, but
the cpio test seems more realistic to me.
This feature is currently implemented only for the UFS (FFS and
MFS) filesystems. But I think it should be easy to support it in
the others as well.
An earlier version of this was reviewed by Bruce. He's not to
blame for any breakage I've introduced since then.
Reviewed by: bde (an earlier version of the code)
1999-08-22 00:15:16 +00:00
|
|
|
SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
|
2010-11-14 08:06:29 +00:00
|
|
|
×tamp_precision, 0, "File timestamp precision (0: seconds, "
|
2017-03-12 18:07:03 +00:00
|
|
|
"1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, "
|
2010-11-14 16:10:15 +00:00
|
|
|
"3+: sec + ns (max. precision))");
|
Support full-precision file timestamps. Until now, only the seconds
have been maintained, and that is still the default. A new sysctl
variable "vfs.timestamp_precision" can be used to enable higher
levels of precision:
0 = seconds only; nanoseconds zeroed (default).
1 = seconds and nanoseconds, accurate within 1/HZ.
2 = seconds and nanoseconds, truncated to microseconds.
>=3 = seconds and nanoseconds, maximum precision.
Level 1 uses getnanotime(), which is fast but can be wrong by up
to 1/HZ. Level 2 uses microtime(). It might be desirable for
consistency with utimes() and friends, which take timeval structures
rather than timespecs. Level 3 uses nanotime() for the highest
precision.
I benchmarked levels 0, 1, and 3 by copying a 550 MB tree with
"cpio -pdu". There was almost negligible difference in the system
times -- much less than 1%, and less than the variation among
multiple runs at the same level. Bruce Evans dreamed up a torture
test involving 1-byte reads with intervening fstat() calls, but
the cpio test seems more realistic to me.
This feature is currently implemented only for the UFS (FFS and
MFS) filesystems. But I think it should be easy to support it in
the others as well.
An earlier version of this was reviewed by Bruce. He's not to
blame for any breakage I've introduced since then.
Reviewed by: bde (an earlier version of the code)
1999-08-22 00:15:16 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Get a current timestamp.
|
|
|
|
*/
|
|
|
|
void
|
2006-01-21 19:42:10 +00:00
|
|
|
vfs_timestamp(struct timespec *tsp)
|
Support full-precision file timestamps. Until now, only the seconds
have been maintained, and that is still the default. A new sysctl
variable "vfs.timestamp_precision" can be used to enable higher
levels of precision:
0 = seconds only; nanoseconds zeroed (default).
1 = seconds and nanoseconds, accurate within 1/HZ.
2 = seconds and nanoseconds, truncated to microseconds.
>=3 = seconds and nanoseconds, maximum precision.
Level 1 uses getnanotime(), which is fast but can be wrong by up
to 1/HZ. Level 2 uses microtime(). It might be desirable for
consistency with utimes() and friends, which take timeval structures
rather than timespecs. Level 3 uses nanotime() for the higest
precision.
I benchmarked levels 0, 1, and 3 by copying a 550 MB tree with
"cpio -pdu". There was almost negligible difference in the system
times -- much less than 1%, and less than the variation among
multiple runs at the same level. Bruce Evans dreamed up a torture
test involving 1-byte reads with intervening fstat() calls, but
the cpio test seems more realistic to me.
This feature is currently implemented only for the UFS (FFS and
MFS) filesystems. But I think it should be easy to support it in
the others as well.
An earlier version of this was reviewed by Bruce. He's not to
blame for any breakage I've introduced since then.
Reviewed by: bde (an earlier version of the code)
1999-08-22 00:15:16 +00:00
|
|
|
{
|
|
|
|
struct timeval tv;
|
|
|
|
|
|
|
|
switch (timestamp_precision) {
|
|
|
|
case TSP_SEC:
|
|
|
|
tsp->tv_sec = time_second;
|
|
|
|
tsp->tv_nsec = 0;
|
|
|
|
break;
|
|
|
|
case TSP_HZ:
|
|
|
|
getnanotime(tsp);
|
|
|
|
break;
|
|
|
|
case TSP_USEC:
|
|
|
|
microtime(&tv);
|
|
|
|
TIMEVAL_TO_TIMESPEC(&tv, tsp);
|
|
|
|
break;
|
|
|
|
case TSP_NSEC:
|
|
|
|
default:
|
|
|
|
nanotime(tsp);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Set vnode attributes to VNOVAL
|
|
|
|
*/
|
1994-05-25 09:21:21 +00:00
|
|
|
void
|
2006-01-21 19:42:10 +00:00
|
|
|
vattr_null(struct vattr *vap)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
|
|
|
|
|
|
|
vap->va_type = VNON;
|
1994-05-25 09:21:21 +00:00
|
|
|
vap->va_size = VNOVAL;
|
|
|
|
vap->va_bytes = VNOVAL;
|
1998-07-12 16:45:39 +00:00
|
|
|
vap->va_mode = VNOVAL;
|
|
|
|
vap->va_nlink = VNOVAL;
|
|
|
|
vap->va_uid = VNOVAL;
|
|
|
|
vap->va_gid = VNOVAL;
|
|
|
|
vap->va_fsid = VNOVAL;
|
|
|
|
vap->va_fileid = VNOVAL;
|
|
|
|
vap->va_blocksize = VNOVAL;
|
|
|
|
vap->va_rdev = VNOVAL;
|
|
|
|
vap->va_atime.tv_sec = VNOVAL;
|
|
|
|
vap->va_atime.tv_nsec = VNOVAL;
|
|
|
|
vap->va_mtime.tv_sec = VNOVAL;
|
|
|
|
vap->va_mtime.tv_nsec = VNOVAL;
|
|
|
|
vap->va_ctime.tv_sec = VNOVAL;
|
|
|
|
vap->va_ctime.tv_nsec = VNOVAL;
|
2002-07-17 02:03:19 +00:00
|
|
|
vap->va_birthtime.tv_sec = VNOVAL;
|
|
|
|
vap->va_birthtime.tv_nsec = VNOVAL;
|
1998-07-12 16:45:39 +00:00
|
|
|
vap->va_flags = VNOVAL;
|
|
|
|
vap->va_gen = VNOVAL;
|
1994-05-24 10:09:53 +00:00
|
|
|
vap->va_vaflags = 0;
|
|
|
|
}
|
|
|
|
|
2001-10-26 00:08:05 +00:00
|
|
|
/*
|
|
|
|
* This routine is called when we have too many vnodes. It attempts
|
|
|
|
* to free <count> vnodes and will potentially free vnodes that still
|
|
|
|
* have VM backing store (VM backing store is typically the cause
|
|
|
|
* of a vnode blowout so we want to do this). Therefore, this operation
|
|
|
|
* is not considered cheap.
|
|
|
|
*
|
|
|
|
* A number of conditions may prevent a vnode from being reclaimed.
|
|
|
|
* the buffer cache may have references on the vnode, a directory
|
|
|
|
* vnode may still have references due to the namei cache representing
|
|
|
|
* underlying files, or the vnode may be in active use. It is not
|
2016-04-29 22:15:33 +00:00
|
|
|
* desirable to reuse such vnodes. These conditions may cause the
|
2001-10-26 00:08:05 +00:00
|
|
|
* number of vnodes to reach some minimum value regardless of what
|
2001-12-18 20:48:54 +00:00
|
|
|
* you set kern.maxvnodes to. Do not set kern.maxvnodes too low.
|
2001-10-26 00:08:05 +00:00
|
|
|
*/
|
2001-12-18 20:48:54 +00:00
|
|
|
static int
|
2015-11-24 09:45:36 +00:00
|
|
|
vlrureclaim(struct mount *mp, int reclaim_nc_src, int trigger)
|
2001-10-26 00:08:05 +00:00
|
|
|
{
|
|
|
|
struct vnode *vp;
|
2015-11-24 09:45:36 +00:00
|
|
|
int count, done, target;
|
2002-01-10 18:31:53 +00:00
|
|
|
|
2001-12-18 20:48:54 +00:00
|
|
|
done = 0;
|
2005-03-13 11:54:28 +00:00
|
|
|
vn_start_write(NULL, &mp, V_WAIT);
|
2003-11-05 04:30:08 +00:00
|
|
|
MNT_ILOCK(mp);
|
2015-11-24 09:45:36 +00:00
|
|
|
count = mp->mnt_nvnodelistsize;
|
|
|
|
target = count * (int64_t)gapvnodes / imax(desiredvnodes, 1);
|
|
|
|
target = target / 10 + 1;
|
|
|
|
while (count != 0 && done < target) {
|
2006-01-09 20:42:19 +00:00
|
|
|
vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
|
|
|
|
while (vp != NULL && vp->v_type == VMARKER)
|
|
|
|
vp = TAILQ_NEXT(vp, v_nmntvnodes);
|
|
|
|
if (vp == NULL)
|
|
|
|
break;
|
2015-11-24 09:45:36 +00:00
|
|
|
/*
|
|
|
|
* XXX LRU is completely broken for non-free vnodes. First
|
|
|
|
* by calling here in mountpoint order, then by moving
|
|
|
|
* unselected vnodes to the end here, and most grossly by
|
|
|
|
* removing the vlruvp() function that was supposed to
|
|
|
|
* maintain the order. (This function was born broken
|
|
|
|
* since syncer problems prevented it doing anything.) The
|
|
|
|
* order is closer to LRC (C = Created).
|
|
|
|
*
|
|
|
|
* LRU reclaiming of vnodes seems to have last worked in
|
|
|
|
* FreeBSD-3 where LRU wasn't mentioned under any spelling.
|
|
|
|
* Then there was no hold count, and inactive vnodes were
|
|
|
|
* simply put on the free list in LRU order. The separate
|
|
|
|
* lists also break LRU. We prefer to reclaim from the
|
|
|
|
* free list for technical reasons. This tends to thrash
|
|
|
|
* the free list to keep very unrecently used held vnodes.
|
|
|
|
* The problem is mitigated by keeping the free list large.
|
|
|
|
*/
|
2001-10-26 00:08:05 +00:00
|
|
|
TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
|
|
|
|
TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
|
|
|
|
--count;
|
2005-06-16 04:41:42 +00:00
|
|
|
if (!VI_TRYLOCK(vp))
|
2005-08-23 03:44:06 +00:00
|
|
|
goto next_iter;
|
2005-06-16 04:41:42 +00:00
|
|
|
/*
|
|
|
|
* If it's been deconstructed already, it's still
|
|
|
|
* referenced, or it exceeds the trigger, skip it.
|
2015-11-24 09:45:36 +00:00
|
|
|
* Also skip free vnodes. We are trying to make space
|
|
|
|
* to expand the free list, not reduce it.
|
2005-06-16 04:41:42 +00:00
|
|
|
*/
|
2009-12-28 15:35:39 +00:00
|
|
|
if (vp->v_usecount ||
|
2015-11-24 09:45:36 +00:00
|
|
|
(!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
|
|
|
|
((vp->v_iflag & VI_FREE) != 0) ||
|
2005-08-23 03:44:06 +00:00
|
|
|
(vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL &&
|
2005-06-16 04:41:42 +00:00
|
|
|
vp->v_object->resident_page_count > trigger)) {
|
|
|
|
VI_UNLOCK(vp);
|
2005-08-23 03:44:06 +00:00
|
|
|
goto next_iter;
|
2005-06-16 04:41:42 +00:00
|
|
|
}
|
|
|
|
MNT_IUNLOCK(mp);
|
|
|
|
vholdl(vp);
|
2008-01-13 14:44:15 +00:00
|
|
|
if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT)) {
|
2005-06-16 04:41:42 +00:00
|
|
|
vdrop(vp);
|
2005-08-23 03:44:06 +00:00
|
|
|
goto next_iter_mntunlocked;
|
2005-06-16 04:41:42 +00:00
|
|
|
}
|
|
|
|
VI_LOCK(vp);
|
2005-08-23 03:44:06 +00:00
|
|
|
/*
|
|
|
|
* v_usecount may have been bumped after VOP_LOCK() dropped
|
|
|
|
* the vnode interlock and before it was locked again.
|
|
|
|
*
|
|
|
|
* It is not necessary to recheck VI_DOOMED because it can
|
|
|
|
* only be set by another thread that holds both the vnode
|
|
|
|
* lock and vnode interlock. If another thread has the
|
|
|
|
* vnode lock before we get to VOP_LOCK() and obtains the
|
|
|
|
* vnode interlock after VOP_LOCK() drops the vnode
|
|
|
|
* interlock, the other thread will be unable to drop the
|
|
|
|
* vnode lock before our VOP_LOCK() call fails.
|
|
|
|
*/
|
2009-12-28 15:35:39 +00:00
|
|
|
if (vp->v_usecount ||
|
2015-11-24 09:45:36 +00:00
|
|
|
(!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
|
|
|
|
(vp->v_iflag & VI_FREE) != 0 ||
|
2007-04-10 15:29:37 +00:00
|
|
|
(vp->v_object != NULL &&
|
2005-08-23 03:44:06 +00:00
|
|
|
vp->v_object->resident_page_count > trigger)) {
|
2008-01-13 14:44:15 +00:00
|
|
|
VOP_UNLOCK(vp, LK_INTERLOCK);
|
2013-05-04 18:38:16 +00:00
|
|
|
vdrop(vp);
|
2005-08-23 03:44:06 +00:00
|
|
|
goto next_iter_mntunlocked;
|
|
|
|
}
|
|
|
|
KASSERT((vp->v_iflag & VI_DOOMED) == 0,
|
|
|
|
("VI_DOOMED unexpectedly detected in vlrureclaim()"));
|
2016-12-31 19:59:31 +00:00
|
|
|
counter_u64_add(recycles_count, 1);
|
2005-06-16 04:41:42 +00:00
|
|
|
vgonel(vp);
|
2008-01-13 14:44:15 +00:00
|
|
|
VOP_UNLOCK(vp, 0);
|
2005-06-16 04:41:42 +00:00
|
|
|
vdropl(vp);
|
|
|
|
done++;
|
2005-08-23 03:44:06 +00:00
|
|
|
next_iter_mntunlocked:
|
2011-02-08 00:16:36 +00:00
|
|
|
if (!should_yield())
|
2005-08-23 03:44:06 +00:00
|
|
|
goto relock_mnt;
|
|
|
|
goto yield;
|
|
|
|
next_iter:
|
2011-02-08 00:16:36 +00:00
|
|
|
if (!should_yield())
|
2005-08-23 03:44:06 +00:00
|
|
|
continue;
|
|
|
|
MNT_IUNLOCK(mp);
|
|
|
|
yield:
|
2012-12-21 13:14:12 +00:00
|
|
|
kern_yield(PRI_USER);
|
2005-08-23 03:44:06 +00:00
|
|
|
relock_mnt:
|
2005-06-16 04:41:42 +00:00
|
|
|
MNT_ILOCK(mp);
|
2001-10-26 00:08:05 +00:00
|
|
|
}
|
2003-11-05 04:30:08 +00:00
|
|
|
MNT_IUNLOCK(mp);
|
2005-03-13 11:54:28 +00:00
|
|
|
vn_finished_write(mp);
|
2001-12-18 20:48:54 +00:00
|
|
|
return done;
|
|
|
|
}
|
|
|
|
|
2016-06-17 17:33:25 +00:00
|
|
|
static int max_vnlru_free = 10000; /* limit on vnode free requests per call */
|
|
|
|
SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_vnlru_free,
|
|
|
|
0,
|
|
|
|
"limit on vnode free requests per call to the vnlru_free routine");
|
|
|
|
|
2005-03-25 05:34:39 +00:00
|
|
|
/*
|
2015-11-24 09:45:36 +00:00
|
|
|
* Attempt to reduce the free list by the requested amount.
|
2005-03-25 05:34:39 +00:00
|
|
|
*/
|
|
|
|
static void
|
2016-06-17 17:33:25 +00:00
|
|
|
vnlru_free_locked(int count, struct vfsops *mnt_op)
|
2005-03-25 05:34:39 +00:00
|
|
|
{
|
|
|
|
struct vnode *vp;
|
2016-06-17 17:33:25 +00:00
|
|
|
struct mount *mp;
|
2016-09-30 17:27:17 +00:00
|
|
|
bool tried_batches;
|
2005-03-25 05:34:39 +00:00
|
|
|
|
2016-09-30 17:27:17 +00:00
|
|
|
tried_batches = false;
|
2005-03-25 05:34:39 +00:00
|
|
|
mtx_assert(&vnode_free_list_mtx, MA_OWNED);
|
2016-06-17 17:33:25 +00:00
|
|
|
if (count > max_vnlru_free)
|
|
|
|
count = max_vnlru_free;
|
2005-03-25 05:34:39 +00:00
|
|
|
for (; count > 0; count--) {
|
|
|
|
vp = TAILQ_FIRST(&vnode_free_list);
|
|
|
|
/*
|
|
|
|
* The list can be modified while the free_list_mtx
|
|
|
|
* has been dropped and vp could be NULL here.
|
|
|
|
*/
|
2016-09-30 17:27:17 +00:00
|
|
|
if (vp == NULL) {
|
|
|
|
if (tried_batches)
|
|
|
|
break;
|
|
|
|
mtx_unlock(&vnode_free_list_mtx);
|
|
|
|
vnlru_return_batches(mnt_op);
|
|
|
|
tried_batches = true;
|
|
|
|
mtx_lock(&vnode_free_list_mtx);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2005-06-11 01:16:46 +00:00
|
|
|
VNASSERT(vp->v_op != NULL, vp,
|
|
|
|
("vnlru_free: vnode already reclaimed."));
|
2012-04-20 06:50:44 +00:00
|
|
|
KASSERT((vp->v_iflag & VI_FREE) != 0,
|
|
|
|
("Removing vnode not on freelist"));
|
|
|
|
KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
|
|
|
|
("Mangling active vnode"));
|
|
|
|
TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
|
2016-06-17 17:33:25 +00:00
|
|
|
|
2005-03-25 05:34:39 +00:00
|
|
|
/*
|
2016-06-17 17:33:25 +00:00
|
|
|
* Don't recycle if our vnode is from different type
|
|
|
|
* of mount point. Note that mp is type-safe, the
|
|
|
|
* check does not reach unmapped address even if
|
|
|
|
* vnode is reclaimed.
|
|
|
|
* Don't recycle if we can't get the interlock without
|
|
|
|
* blocking.
|
2005-03-25 05:34:39 +00:00
|
|
|
*/
|
2016-06-17 17:33:25 +00:00
|
|
|
if ((mnt_op != NULL && (mp = vp->v_mount) != NULL &&
|
|
|
|
mp->mnt_op != mnt_op) || !VI_TRYLOCK(vp)) {
|
2012-04-20 06:50:44 +00:00
|
|
|
TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist);
|
2005-03-25 05:34:39 +00:00
|
|
|
continue;
|
2005-04-30 11:22:40 +00:00
|
|
|
}
|
2014-07-29 16:42:34 +00:00
|
|
|
VNASSERT((vp->v_iflag & VI_FREE) != 0 && vp->v_holdcnt == 0,
|
|
|
|
vp, ("vp inconsistent on freelist"));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The clear of VI_FREE prevents activation of the
|
|
|
|
* vnode. There is no sense in putting the vnode on
|
|
|
|
* the mount point active list, only to remove it
|
|
|
|
* later during recycling. Inline the relevant part
|
|
|
|
* of vholdl(), to avoid triggering assertions or
|
|
|
|
* activating.
|
|
|
|
*/
|
2005-03-25 05:34:39 +00:00
|
|
|
freevnodes--;
|
2005-04-30 11:22:40 +00:00
|
|
|
vp->v_iflag &= ~VI_FREE;
|
2015-07-16 13:57:05 +00:00
|
|
|
refcount_acquire(&vp->v_holdcnt);
|
2014-07-29 16:42:34 +00:00
|
|
|
|
2005-03-25 05:34:39 +00:00
|
|
|
mtx_unlock(&vnode_free_list_mtx);
|
2005-06-16 04:41:42 +00:00
|
|
|
VI_UNLOCK(vp);
|
|
|
|
vtryrecycle(vp);
|
|
|
|
/*
|
|
|
|
* If the recycled succeeded this vdrop will actually free
|
|
|
|
* the vnode. If not it will simply place it back on
|
|
|
|
* the free list.
|
|
|
|
*/
|
|
|
|
vdrop(vp);
|
2005-03-25 05:34:39 +00:00
|
|
|
mtx_lock(&vnode_free_list_mtx);
|
|
|
|
}
|
|
|
|
}
|
2015-11-24 09:45:36 +00:00
|
|
|
|
2016-06-17 17:33:25 +00:00
|
|
|
void
|
|
|
|
vnlru_free(int count, struct vfsops *mnt_op)
|
|
|
|
{
|
|
|
|
|
|
|
|
mtx_lock(&vnode_free_list_mtx);
|
|
|
|
vnlru_free_locked(count, mnt_op);
|
|
|
|
mtx_unlock(&vnode_free_list_mtx);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2015-11-24 09:45:36 +00:00
|
|
|
/* XXX some names and initialization are bad for limits and watermarks. */
|
|
|
|
static int
|
|
|
|
vspace(void)
|
|
|
|
{
|
|
|
|
int space;
|
|
|
|
|
|
|
|
gapvnodes = imax(desiredvnodes - wantfreevnodes, 100);
|
|
|
|
vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */
|
|
|
|
vlowat = vhiwat / 2;
|
|
|
|
if (numvnodes > desiredvnodes)
|
|
|
|
return (0);
|
|
|
|
space = desiredvnodes - numvnodes;
|
|
|
|
if (freevnodes > wantfreevnodes)
|
|
|
|
space += freevnodes - wantfreevnodes;
|
|
|
|
return (space);
|
|
|
|
}
|
|
|
|
|
2016-09-30 17:27:17 +00:00
|
|
|
static void
|
|
|
|
vnlru_return_batch_locked(struct mount *mp)
|
|
|
|
{
|
|
|
|
struct vnode *vp;
|
|
|
|
|
|
|
|
mtx_assert(&mp->mnt_listmtx, MA_OWNED);
|
|
|
|
|
|
|
|
if (mp->mnt_tmpfreevnodelistsize == 0)
|
|
|
|
return;
|
|
|
|
|
|
|
|
TAILQ_FOREACH(vp, &mp->mnt_tmpfreevnodelist, v_actfreelist) {
|
|
|
|
VNASSERT((vp->v_mflag & VMP_TMPMNTFREELIST) != 0, vp,
|
|
|
|
("vnode without VMP_TMPMNTFREELIST on mnt_tmpfreevnodelist"));
|
|
|
|
vp->v_mflag &= ~VMP_TMPMNTFREELIST;
|
|
|
|
}
|
2016-10-08 13:36:59 +00:00
|
|
|
mtx_lock(&vnode_free_list_mtx);
|
2016-09-30 17:27:17 +00:00
|
|
|
TAILQ_CONCAT(&vnode_free_list, &mp->mnt_tmpfreevnodelist, v_actfreelist);
|
|
|
|
freevnodes += mp->mnt_tmpfreevnodelistsize;
|
|
|
|
mtx_unlock(&vnode_free_list_mtx);
|
2016-10-08 13:36:59 +00:00
|
|
|
mp->mnt_tmpfreevnodelistsize = 0;
|
2016-09-30 17:27:17 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
vnlru_return_batch(struct mount *mp)
|
|
|
|
{
|
|
|
|
|
|
|
|
mtx_lock(&mp->mnt_listmtx);
|
|
|
|
vnlru_return_batch_locked(mp);
|
|
|
|
mtx_unlock(&mp->mnt_listmtx);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
vnlru_return_batches(struct vfsops *mnt_op)
|
|
|
|
{
|
|
|
|
struct mount *mp, *nmp;
|
|
|
|
bool need_unbusy;
|
|
|
|
|
|
|
|
mtx_lock(&mountlist_mtx);
|
|
|
|
for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
|
|
|
|
need_unbusy = false;
|
|
|
|
if (mnt_op != NULL && mp->mnt_op != mnt_op)
|
|
|
|
goto next;
|
|
|
|
if (mp->mnt_tmpfreevnodelistsize == 0)
|
|
|
|
goto next;
|
|
|
|
if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) == 0) {
|
|
|
|
vnlru_return_batch(mp);
|
|
|
|
need_unbusy = true;
|
|
|
|
mtx_lock(&mountlist_mtx);
|
|
|
|
}
|
|
|
|
next:
|
|
|
|
nmp = TAILQ_NEXT(mp, mnt_list);
|
|
|
|
if (need_unbusy)
|
|
|
|
vfs_unbusy(mp);
|
|
|
|
}
|
|
|
|
mtx_unlock(&mountlist_mtx);
|
|
|
|
}
|
|
|
|
|
2001-12-18 20:48:54 +00:00
|
|
|
/*
|
|
|
|
* Attempt to recycle vnodes in a context that is always safe to block.
|
2002-05-16 21:28:32 +00:00
|
|
|
 * Calling vlrureclaim() from the bowels of filesystem code has some
|
2001-12-18 20:48:54 +00:00
|
|
|
* interesting deadlock problems.
|
|
|
|
*/
|
|
|
|
/*
 * The vnlru kernel process.  vnlru_proc() passes it to
 * kproc_suspend_check() and sleeps on it as its msleep() wait channel.
 */
static struct proc *vnlruproc;
/*
 * Cleared by vnlru_proc() just before it goes to sleep; its address is
 * the wakeup() channel used at that point.
 */
static int vnlruproc_sig;
|
|
|
|
|
2002-06-06 15:46:38 +00:00
|
|
|
static void
|
2001-12-18 20:48:54 +00:00
|
|
|
vnlru_proc(void)
|
|
|
|
{
|
|
|
|
struct mount *mp, *nmp;
|
2015-11-24 09:45:36 +00:00
|
|
|
unsigned long ofreevnodes, onumvnodes;
|
|
|
|
int done, force, reclaim_nc_src, trigger, usevnodes;
|
2001-12-18 20:48:54 +00:00
|
|
|
|
2015-11-24 09:45:36 +00:00
|
|
|
EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc,
|
2002-06-06 15:46:38 +00:00
|
|
|
SHUTDOWN_PRI_FIRST);
|
2001-12-18 20:48:54 +00:00
|
|
|
|
2015-11-24 09:45:36 +00:00
|
|
|
force = 0;
|
2001-12-18 20:48:54 +00:00
|
|
|
for (;;) {
|
2015-11-24 09:45:36 +00:00
|
|
|
kproc_suspend_check(vnlruproc);
|
2002-08-13 05:29:48 +00:00
|
|
|
mtx_lock(&vnode_free_list_mtx);
|
2015-11-24 09:45:36 +00:00
|
|
|
/*
|
|
|
|
* If numvnodes is too large (due to desiredvnodes being
|
|
|
|
* adjusted using its sysctl, or emergency growth), first
|
|
|
|
* try to reduce it by discarding from the free list.
|
|
|
|
*/
|
2016-09-30 17:27:17 +00:00
|
|
|
if (numvnodes > desiredvnodes)
|
|
|
|
vnlru_free_locked(numvnodes - desiredvnodes, NULL);
|
2015-11-24 09:45:36 +00:00
|
|
|
/*
|
|
|
|
* Sleep if the vnode cache is in a good state. This is
|
|
|
|
* when it is not over-full and has space for about a 4%
|
|
|
|
* or 9% expansion (by growing its size or inexcessively
|
|
|
|
* reducing its free list). Otherwise, try to reclaim
|
|
|
|
* space for a 10% expansion.
|
|
|
|
*/
|
|
|
|
if (vstir && force == 0) {
|
|
|
|
force = 1;
|
|
|
|
vstir = 0;
|
|
|
|
}
|
|
|
|
if (vspace() >= vlowat && force == 0) {
|
2001-12-18 20:48:54 +00:00
|
|
|
vnlruproc_sig = 0;
|
2002-12-29 10:39:05 +00:00
|
|
|
wakeup(&vnlruproc_sig);
|
2005-01-24 10:41:01 +00:00
|
|
|
msleep(vnlruproc, &vnode_free_list_mtx,
|
|
|
|
PVFS|PDROP, "vlruwt", hz);
|
2001-12-18 20:48:54 +00:00
|
|
|
continue;
|
|
|
|
}
|
2002-08-13 05:29:48 +00:00
|
|
|
mtx_unlock(&vnode_free_list_mtx);
|
2001-12-18 20:48:54 +00:00
|
|
|
done = 0;
|
2015-11-24 09:45:36 +00:00
|
|
|
ofreevnodes = freevnodes;
|
|
|
|
onumvnodes = numvnodes;
|
|
|
|
/*
|
|
|
|
* Calculate parameters for recycling. These are the same
|
|
|
|
* throughout the loop to give some semblance of fairness.
|
|
|
|
* The trigger point is to avoid recycling vnodes with lots
|
|
|
|
* of resident pages. We aren't trying to free memory; we
|
|
|
|
* are trying to recycle or at least free vnodes.
|
|
|
|
*/
|
|
|
|
if (numvnodes <= desiredvnodes)
|
|
|
|
usevnodes = numvnodes - freevnodes;
|
|
|
|
else
|
|
|
|
usevnodes = numvnodes;
|
|
|
|
if (usevnodes <= 0)
|
|
|
|
usevnodes = 1;
|
|
|
|
/*
|
|
|
|
* The trigger value is is chosen to give a conservatively
|
|
|
|
* large value to ensure that it alone doesn't prevent
|
|
|
|
* making progress. The value can easily be so large that
|
|
|
|
* it is effectively infinite in some congested and
|
|
|
|
* misconfigured cases, and this is necessary. Normally
|
|
|
|
* it is about 8 to 100 (pages), which is quite large.
|
|
|
|
*/
|
|
|
|
trigger = vm_cnt.v_page_count * 2 / usevnodes;
|
|
|
|
if (force < 2)
|
|
|
|
trigger = vsmalltrigger;
|
|
|
|
reclaim_nc_src = force >= 3;
|
2001-12-18 20:48:54 +00:00
|
|
|
mtx_lock(&mountlist_mtx);
|
|
|
|
for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
|
2008-11-02 10:15:42 +00:00
|
|
|
if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
|
2001-12-18 20:48:54 +00:00
|
|
|
nmp = TAILQ_NEXT(mp, mnt_list);
|
|
|
|
continue;
|
|
|
|
}
|
2015-11-24 09:45:36 +00:00
|
|
|
done += vlrureclaim(mp, reclaim_nc_src, trigger);
|
2001-12-18 20:48:54 +00:00
|
|
|
mtx_lock(&mountlist_mtx);
|
|
|
|
nmp = TAILQ_NEXT(mp, mnt_list);
|
2008-08-31 14:26:08 +00:00
|
|
|
vfs_unbusy(mp);
|
2001-12-18 20:48:54 +00:00
|
|
|
}
|
|
|
|
mtx_unlock(&mountlist_mtx);
|
2015-11-24 09:45:36 +00:00
|
|
|
if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes)
|
|
|
|
uma_reclaim();
|
2001-12-18 20:48:54 +00:00
|
|
|
if (done == 0) {
|
2015-11-24 09:45:36 +00:00
|
|
|
if (force == 0 || force == 1) {
|
|
|
|
force = 2;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (force == 2) {
|
|
|
|
force = 3;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
force = 0;
|
2001-12-19 01:31:12 +00:00
|
|
|
vnlru_nowhere++;
|
2001-12-25 01:23:25 +00:00
|
|
|
tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
|
2007-04-10 15:29:37 +00:00
|
|
|
} else
|
2012-12-21 13:14:12 +00:00
|
|
|
kern_yield(PRI_USER);
|
2015-11-24 09:45:36 +00:00
|
|
|
/*
|
|
|
|
* After becoming active to expand above low water, keep
|
|
|
|
* active until above high water.
|
|
|
|
*/
|
|
|
|
force = vspace() < vhiwat;
|
2001-12-18 20:48:54 +00:00
|
|
|
}
|
2001-10-26 00:08:05 +00:00
|
|
|
}
|
|
|
|
|
2001-12-18 20:48:54 +00:00
|
|
|
static struct kproc_desc vnlru_kp = {
|
|
|
|
"vnlru",
|
|
|
|
vnlru_proc,
|
|
|
|
&vnlruproc
|
|
|
|
};
|
2008-03-16 10:58:09 +00:00
|
|
|
SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start,
|
|
|
|
&vnlru_kp);
|
2010-05-12 16:42:28 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Routines having to do with the management of the vnode table.
|
|
|
|
*/
|
|
|
|
|
2002-08-05 10:15:56 +00:00
|
|
|
/*
|
2005-06-16 04:41:42 +00:00
|
|
|
* Try to recycle a freed vnode. We abort if anyone picks up a reference
|
|
|
|
* before we actually vgone(). This function must be called with the vnode
|
|
|
|
* held to prevent the vnode from being returned to the free list midway
|
|
|
|
* through vgone().
|
2002-08-05 10:15:56 +00:00
|
|
|
*/
|
|
|
|
static int
|
2003-10-05 05:35:41 +00:00
|
|
|
vtryrecycle(struct vnode *vp)
|
2002-08-05 10:15:56 +00:00
|
|
|
{
|
2003-10-05 05:35:41 +00:00
|
|
|
struct mount *vnmp;
|
2002-08-05 10:15:56 +00:00
|
|
|
|
2009-02-05 15:03:35 +00:00
|
|
|
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
|
2005-06-16 04:41:42 +00:00
|
|
|
VNASSERT(vp->v_holdcnt, vp,
|
|
|
|
("vtryrecycle: Recycling vp %p without a reference.", vp));
|
2003-10-04 15:10:40 +00:00
|
|
|
/*
|
|
|
|
* This vnode may found and locked via some other list, if so we
|
|
|
|
* can't recycle it yet.
|
|
|
|
*/
|
2009-02-05 15:03:35 +00:00
|
|
|
if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
|
|
|
|
CTR2(KTR_VFS,
|
|
|
|
"%s: impossible to recycle, vp %p lock is already held",
|
|
|
|
__func__, vp);
|
2002-08-05 10:15:56 +00:00
|
|
|
return (EWOULDBLOCK);
|
2009-02-05 15:03:35 +00:00
|
|
|
}
|
2002-10-11 01:04:14 +00:00
|
|
|
/*
|
|
|
|
* Don't recycle if its filesystem is being suspended.
|
|
|
|
*/
|
2003-10-05 05:35:41 +00:00
|
|
|
if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
|
2008-01-13 14:44:15 +00:00
|
|
|
VOP_UNLOCK(vp, 0);
|
2009-02-05 15:03:35 +00:00
|
|
|
CTR2(KTR_VFS,
|
|
|
|
"%s: impossible to recycle, cannot start the write for %p",
|
|
|
|
__func__, vp);
|
2005-06-16 04:41:42 +00:00
|
|
|
return (EBUSY);
|
2002-10-11 01:04:14 +00:00
|
|
|
}
|
2003-10-05 05:35:41 +00:00
|
|
|
/*
|
|
|
|
* If we got this far, we need to acquire the interlock and see if
|
|
|
|
* anyone picked up this vnode from another list. If not, we will
|
2005-03-15 14:38:16 +00:00
|
|
|
* mark it with DOOMED via vgonel() so that anyone who does find it
|
2003-10-05 05:35:41 +00:00
|
|
|
* will skip over it.
|
|
|
|
*/
|
|
|
|
VI_LOCK(vp);
|
2005-06-16 04:41:42 +00:00
|
|
|
if (vp->v_usecount) {
|
2008-01-13 14:44:15 +00:00
|
|
|
VOP_UNLOCK(vp, LK_INTERLOCK);
|
2005-06-16 04:41:42 +00:00
|
|
|
vn_finished_write(vnmp);
|
2009-02-05 15:03:35 +00:00
|
|
|
CTR2(KTR_VFS,
|
|
|
|
"%s: impossible to recycle, %p is already referenced",
|
|
|
|
__func__, vp);
|
2005-06-16 04:41:42 +00:00
|
|
|
return (EBUSY);
|
2005-04-30 11:22:40 +00:00
|
|
|
}
|
2015-02-14 17:02:51 +00:00
|
|
|
if ((vp->v_iflag & VI_DOOMED) == 0) {
|
2016-12-31 19:59:31 +00:00
|
|
|
counter_u64_add(recycles_count, 1);
|
2005-06-16 04:41:42 +00:00
|
|
|
vgonel(vp);
|
2015-02-14 17:02:51 +00:00
|
|
|
}
|
2008-01-13 14:44:15 +00:00
|
|
|
VOP_UNLOCK(vp, LK_INTERLOCK);
|
2004-03-06 04:09:54 +00:00
|
|
|
vn_finished_write(vnmp);
|
2005-04-30 11:22:40 +00:00
|
|
|
return (0);
|
2002-08-05 10:15:56 +00:00
|
|
|
}
|
|
|
|
|
2015-11-24 09:45:36 +00:00
|
|
|
static void
|
|
|
|
vcheckspace(void)
|
|
|
|
{
|
|
|
|
|
|
|
|
if (vspace() < vlowat && vnlruproc_sig == 0) {
|
|
|
|
vnlruproc_sig = 1;
|
|
|
|
wakeup(vnlruproc);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
2015-11-24 09:45:36 +00:00
|
|
|
* Wait if necessary for space for a new vnode.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
2012-10-14 19:43:37 +00:00
|
|
|
static int
|
|
|
|
getnewvnode_wait(int suspended)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
|
|
|
|
2012-10-14 19:43:37 +00:00
|
|
|
mtx_assert(&vnode_free_list_mtx, MA_OWNED);
|
2015-11-24 09:45:36 +00:00
|
|
|
if (numvnodes >= desiredvnodes) {
|
2012-10-14 19:43:37 +00:00
|
|
|
if (suspended) {
|
2006-08-09 12:47:30 +00:00
|
|
|
/*
|
2015-11-24 09:45:36 +00:00
|
|
|
* The file system is being suspended. We cannot
|
|
|
|
* risk a deadlock here, so allow allocation of
|
|
|
|
* another vnode even if this would give too many.
|
2006-08-09 12:47:30 +00:00
|
|
|
*/
|
2012-10-14 19:43:37 +00:00
|
|
|
return (0);
|
2006-08-09 12:47:30 +00:00
|
|
|
}
|
2002-12-29 10:39:05 +00:00
|
|
|
if (vnlruproc_sig == 0) {
|
2007-04-10 15:29:37 +00:00
|
|
|
vnlruproc_sig = 1; /* avoid unnecessary wakeups */
|
2002-12-29 10:39:05 +00:00
|
|
|
wakeup(vnlruproc);
|
|
|
|
}
|
2005-01-24 10:41:01 +00:00
|
|
|
msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
|
|
|
|
"vlruwk", hz);
|
2012-10-14 19:43:37 +00:00
|
|
|
}
|
2015-11-24 09:45:36 +00:00
|
|
|
/* Post-adjust like the pre-adjust in getnewvnode(). */
|
|
|
|
if (numvnodes + 1 > desiredvnodes && freevnodes > 1)
|
2016-06-17 17:33:25 +00:00
|
|
|
vnlru_free_locked(1, NULL);
|
2015-11-24 09:45:36 +00:00
|
|
|
return (numvnodes >= desiredvnodes ? ENFILE : 0);
|
2012-10-14 19:43:37 +00:00
|
|
|
}
|
|
|
|
|
2015-11-24 09:45:36 +00:00
|
|
|
/*
|
|
|
|
* This hack is fragile, and probably not needed any more now that the
|
|
|
|
* watermark handling works.
|
|
|
|
*/
|
2012-10-14 19:43:37 +00:00
|
|
|
void
|
|
|
|
getnewvnode_reserve(u_int count)
|
|
|
|
{
|
|
|
|
struct thread *td;
|
|
|
|
|
2015-11-24 09:45:36 +00:00
|
|
|
/* Pre-adjust like the pre-adjust in getnewvnode(), with any count. */
|
|
|
|
/* XXX no longer so quick, but this part is not racy. */
|
|
|
|
mtx_lock(&vnode_free_list_mtx);
|
|
|
|
if (numvnodes + count > desiredvnodes && freevnodes > wantfreevnodes)
|
2016-06-17 17:33:25 +00:00
|
|
|
vnlru_free_locked(ulmin(numvnodes + count - desiredvnodes,
|
|
|
|
freevnodes - wantfreevnodes), NULL);
|
2015-11-24 09:45:36 +00:00
|
|
|
mtx_unlock(&vnode_free_list_mtx);
|
|
|
|
|
2012-10-14 19:43:37 +00:00
|
|
|
td = curthread;
|
2014-06-08 15:38:40 +00:00
|
|
|
/* First try to be quick and racy. */
|
2014-06-08 19:01:37 +00:00
|
|
|
if (atomic_fetchadd_long(&numvnodes, count) + count <= desiredvnodes) {
|
|
|
|
td->td_vp_reserv += count;
|
2015-11-24 09:45:36 +00:00
|
|
|
vcheckspace(); /* XXX no longer so quick, but more racy */
|
2014-06-08 19:01:37 +00:00
|
|
|
return;
|
|
|
|
} else
|
|
|
|
atomic_subtract_long(&numvnodes, count);
|
2014-06-08 15:38:40 +00:00
|
|
|
|
2012-10-14 19:43:37 +00:00
|
|
|
mtx_lock(&vnode_free_list_mtx);
|
|
|
|
while (count > 0) {
|
|
|
|
if (getnewvnode_wait(0) == 0) {
|
|
|
|
count--;
|
|
|
|
td->td_vp_reserv++;
|
2014-06-08 15:38:40 +00:00
|
|
|
atomic_add_long(&numvnodes, 1);
|
2005-04-04 12:07:16 +00:00
|
|
|
}
|
2001-12-18 20:48:54 +00:00
|
|
|
}
|
2015-11-24 09:45:36 +00:00
|
|
|
vcheckspace();
|
2012-10-14 19:43:37 +00:00
|
|
|
mtx_unlock(&vnode_free_list_mtx);
|
|
|
|
}
|
|
|
|
|
2015-11-24 09:45:36 +00:00
|
|
|
/*
|
|
|
|
* This hack is fragile, especially if desiredvnodes or wantvnodes are
|
|
|
|
* misconfgured or changed significantly. Reducing desiredvnodes below
|
|
|
|
* the reserved amount should cause bizarre behaviour like reducing it
|
|
|
|
* below the number of active vnodes -- the system will try to reduce
|
|
|
|
* numvnodes to match, but should fail, so the subtraction below should
|
|
|
|
* not overflow.
|
|
|
|
*/
|
2012-10-14 19:43:37 +00:00
|
|
|
void
|
|
|
|
getnewvnode_drop_reserve(void)
|
|
|
|
{
|
|
|
|
struct thread *td;
|
|
|
|
|
|
|
|
td = curthread;
|
2014-06-08 15:38:40 +00:00
|
|
|
atomic_subtract_long(&numvnodes, td->td_vp_reserv);
|
2012-10-14 19:43:37 +00:00
|
|
|
td->td_vp_reserv = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Return the next vnode from the free list.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
|
|
|
|
struct vnode **vpp)
|
|
|
|
{
|
|
|
|
struct vnode *vp;
|
|
|
|
struct thread *td;
|
2015-11-29 21:42:26 +00:00
|
|
|
struct lock_object *lo;
|
2015-11-24 09:45:36 +00:00
|
|
|
static int cyclecount;
|
2012-10-14 19:43:37 +00:00
|
|
|
int error;
|
|
|
|
|
|
|
|
CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);
|
|
|
|
vp = NULL;
|
|
|
|
td = curthread;
|
|
|
|
if (td->td_vp_reserv > 0) {
|
|
|
|
td->td_vp_reserv -= 1;
|
|
|
|
goto alloc;
|
|
|
|
}
|
|
|
|
mtx_lock(&vnode_free_list_mtx);
|
2015-11-24 09:45:36 +00:00
|
|
|
if (numvnodes < desiredvnodes)
|
|
|
|
cyclecount = 0;
|
|
|
|
else if (cyclecount++ >= freevnodes) {
|
|
|
|
cyclecount = 0;
|
|
|
|
vstir = 1;
|
|
|
|
}
|
2012-10-14 19:43:37 +00:00
|
|
|
/*
|
2015-11-24 09:45:36 +00:00
|
|
|
* Grow the vnode cache if it will not be above its target max
|
|
|
|
* after growing. Otherwise, if the free list is nonempty, try
|
|
|
|
* to reclaim 1 item from it before growing the cache (possibly
|
|
|
|
* above its target max if the reclamation failed or is delayed).
|
|
|
|
* Otherwise, wait for some space. In all cases, schedule
|
|
|
|
* vnlru_proc() if we are getting short of space. The watermarks
|
|
|
|
* should be chosen so that we never wait or even reclaim from
|
|
|
|
* the free list to below its target minimum.
|
2012-10-14 19:43:37 +00:00
|
|
|
*/
|
2015-11-24 09:45:36 +00:00
|
|
|
if (numvnodes + 1 <= desiredvnodes)
|
|
|
|
;
|
|
|
|
else if (freevnodes > 0)
|
2016-06-17 17:33:25 +00:00
|
|
|
vnlru_free_locked(1, NULL);
|
2015-11-24 09:45:36 +00:00
|
|
|
else {
|
|
|
|
error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag &
|
|
|
|
MNTK_SUSPEND));
|
2012-10-14 19:43:37 +00:00
|
|
|
#if 0 /* XXX Not all VFS_VGET/ffs_vget callers check returns. */
|
2015-11-24 09:45:36 +00:00
|
|
|
if (error != 0) {
|
|
|
|
mtx_unlock(&vnode_free_list_mtx);
|
|
|
|
return (error);
|
|
|
|
}
|
2012-10-14 19:43:37 +00:00
|
|
|
#endif
|
2015-11-24 09:45:36 +00:00
|
|
|
}
|
|
|
|
vcheckspace();
|
2014-06-08 15:38:40 +00:00
|
|
|
atomic_add_long(&numvnodes, 1);
|
2005-03-25 05:34:39 +00:00
|
|
|
mtx_unlock(&vnode_free_list_mtx);
|
2012-10-14 19:43:37 +00:00
|
|
|
alloc:
|
2016-12-31 19:59:31 +00:00
|
|
|
counter_u64_add(vnodes_created, 1);
|
2015-11-29 21:42:26 +00:00
|
|
|
vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK);
|
2005-03-25 05:34:39 +00:00
|
|
|
/*
|
2015-11-29 21:42:26 +00:00
|
|
|
* Locks are given the generic name "vnode" when created.
|
|
|
|
* Follow the historic practice of using the filesystem
|
|
|
|
* name when they allocated, e.g., "zfs", "ufs", "nfs, etc.
|
|
|
|
*
|
|
|
|
* Locks live in a witness group keyed on their name. Thus,
|
|
|
|
* when a lock is renamed, it must also move from the witness
|
|
|
|
* group of its old name to the witness group of its new name.
|
|
|
|
*
|
|
|
|
* The change only needs to be made when the vnode moves
|
|
|
|
* from one filesystem type to another. We ensure that each
|
|
|
|
* filesystem use a single static name pointer for its tag so
|
|
|
|
* that we can compare pointers rather than doing a strcmp().
|
2005-03-25 05:34:39 +00:00
|
|
|
*/
|
2015-11-29 21:42:26 +00:00
|
|
|
lo = &vp->v_vnlock->lock_object;
|
|
|
|
if (lo->lo_name != tag) {
|
|
|
|
lo->lo_name = tag;
|
|
|
|
WITNESS_DESTROY(lo);
|
|
|
|
WITNESS_INIT(lo, tag);
|
|
|
|
}
|
2005-03-25 05:34:39 +00:00
|
|
|
/*
|
2015-11-29 21:42:26 +00:00
|
|
|
* By default, don't allow shared locks unless filesystems opt-in.
|
2005-03-25 05:34:39 +00:00
|
|
|
*/
|
2015-11-29 21:42:26 +00:00
|
|
|
vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE;
|
2005-03-25 05:34:39 +00:00
|
|
|
/*
|
|
|
|
* Finalize various vnode identity bits.
|
|
|
|
*/
|
2015-11-29 21:42:26 +00:00
|
|
|
KASSERT(vp->v_object == NULL, ("stale v_object %p", vp));
|
|
|
|
KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp));
|
|
|
|
KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp));
|
1995-02-27 06:50:08 +00:00
|
|
|
vp->v_type = VNON;
|
1994-05-24 10:09:53 +00:00
|
|
|
vp->v_tag = tag;
|
|
|
|
vp->v_op = vops;
|
2015-07-16 13:57:05 +00:00
|
|
|
v_init_counters(vp);
|
2015-11-29 21:42:26 +00:00
|
|
|
vp->v_bufobj.bo_ops = &buf_ops_bio;
|
2017-01-21 16:59:50 +00:00
|
|
|
#ifdef DIAGNOSTIC
|
|
|
|
if (mp == NULL && vops != &dead_vnodeops)
|
2017-01-22 15:27:14 +00:00
|
|
|
printf("NULL mp in getnewvnode(9), tag %s\n", tag);
|
2017-01-21 16:59:50 +00:00
|
|
|
#endif
|
2002-09-30 20:51:48 +00:00
|
|
|
#ifdef MAC
|
2007-10-24 19:04:04 +00:00
|
|
|
mac_vnode_init(vp);
|
Slightly change the semantics of vnode labels for MAC: rather than
"refreshing" the label on the vnode before use, just get the label
right from inception. For single-label file systems, set the label
in the generic VFS getnewvnode() code; for multi-label file systems,
leave the labeling up to the file system. With UFS1/2, this means
reading the extended attribute during vfs_vget() as the inode is
pulled off disk, rather than hitting the extended attributes
frequently during operations later, improving performance. This
also corrects sematics for shared vnode locks, which were not
previously present in the system. This chances the cache
coherrency properties WRT out-of-band access to label data, but in
an acceptable form. With UFS1, there is a small race condition
during automatic extended attribute start -- this is not present
with UFS2, and occurs because EAs aren't available at vnode
inception. We'll introduce a work around for this shortly.
Approved by: re
Obtained from: TrustedBSD Project
Sponsored by: DARPA, Network Associates Laboratories
2002-10-26 14:38:24 +00:00
|
|
|
if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
|
2007-10-24 19:04:04 +00:00
|
|
|
mac_vnode_associate_singlelabel(mp, vp);
|
2002-09-30 20:51:48 +00:00
|
|
|
#endif
|
2004-10-26 07:39:12 +00:00
|
|
|
if (mp != NULL) {
|
2015-11-29 21:42:26 +00:00
|
|
|
vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize;
|
2005-08-06 01:42:04 +00:00
|
|
|
if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
|
|
|
|
vp->v_vflag |= VV_NOKNOTE;
|
2004-10-26 07:39:12 +00:00
|
|
|
}
|
1998-02-23 06:59:52 +00:00
|
|
|
|
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index 7c243b6..0bdaf36 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -279,6 +279,7 @@ SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
#define VSHOULDFREE(vp) (!((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
#define VSHOULDBUSY(vp) (((vp)->v_iflag & VI_FREE) && (vp)->v_holdcnt)
+static int vnsz2log;
/*
* Initialize the vnode management data structures.
@@ -293,6 +294,7 @@ SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
static void
vntblinit(void *dummy __unused)
{
+ u_int i;
int physvnodes, virtvnodes;
/*
@@ -332,6 +334,9 @@ vntblinit(void *dummy __unused)
syncer_maxdelay = syncer_mask + 1;
mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
cv_init(&sync_wakeup, "syncer");
+ for (i = 1; i <= sizeof(struct vnode); i <<= 1)
+ vnsz2log++;
+ vnsz2log--;
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);
@@ -1067,6 +1072,14 @@ alloc:
}
rangelock_init(&vp->v_rl);
+ /*
+ * For the filesystems which do not use vfs_hash_insert(),
+ * still initialize v_hash to have vfs_hash_index() useful.
+ * E.g., nullfs uses vfs_hash_index() on the lower vnode for
+ * its own hashing.
+ */
+ vp->v_hash = (uintptr_t)vp >> vnsz2log;
+
*vpp = vp;
return (0);
}
2013-01-14 05:42:54 +00:00
|
|
|
/*
|
|
|
|
* For the filesystems which do not use vfs_hash_insert(),
|
|
|
|
* still initialize v_hash to have vfs_hash_index() useful.
|
|
|
|
* E.g., nullfs uses vfs_hash_index() on the lower vnode for
|
|
|
|
* its own hashing.
|
|
|
|
*/
|
|
|
|
vp->v_hash = (uintptr_t)vp >> vnsz2log;
|
|
|
|
|
2005-03-25 05:34:39 +00:00
|
|
|
*vpp = vp;
|
1994-05-24 10:09:53 +00:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2004-07-04 08:52:35 +00:00
|
|
|
* Delete from old mount point vnode list, if on one.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
1997-11-22 08:35:46 +00:00
|
|
|
static void
|
2004-07-04 08:52:35 +00:00
|
|
|
delmntque(struct vnode *vp)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2004-07-04 08:52:35 +00:00
|
|
|
struct mount *mp;
|
2012-04-20 06:50:44 +00:00
|
|
|
int active;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2004-07-04 08:52:35 +00:00
|
|
|
mp = vp->v_mount;
|
2006-02-06 10:19:50 +00:00
|
|
|
if (mp == NULL)
|
|
|
|
return;
|
2004-07-04 08:52:35 +00:00
|
|
|
MNT_ILOCK(mp);
|
2012-04-20 06:50:44 +00:00
|
|
|
VI_LOCK(vp);
|
|
|
|
KASSERT(mp->mnt_activevnodelistsize <= mp->mnt_nvnodelistsize,
|
|
|
|
("Active vnode list size %d > Vnode list size %d",
|
|
|
|
mp->mnt_activevnodelistsize, mp->mnt_nvnodelistsize));
|
|
|
|
active = vp->v_iflag & VI_ACTIVE;
|
|
|
|
vp->v_iflag &= ~VI_ACTIVE;
|
|
|
|
if (active) {
|
2016-09-30 17:27:17 +00:00
|
|
|
mtx_lock(&mp->mnt_listmtx);
|
2012-04-20 06:50:44 +00:00
|
|
|
TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist);
|
|
|
|
mp->mnt_activevnodelistsize--;
|
2016-09-30 17:27:17 +00:00
|
|
|
mtx_unlock(&mp->mnt_listmtx);
|
2012-04-20 06:50:44 +00:00
|
|
|
}
|
2006-02-23 05:15:37 +00:00
|
|
|
vp->v_mount = NULL;
|
2012-04-20 06:50:44 +00:00
|
|
|
VI_UNLOCK(vp);
|
2005-02-17 10:28:58 +00:00
|
|
|
VNASSERT(mp->mnt_nvnodelistsize > 0, vp,
|
2004-07-04 08:52:35 +00:00
|
|
|
("bad mount point vnode list size"));
|
|
|
|
TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
|
|
|
|
mp->mnt_nvnodelistsize--;
|
2006-02-23 05:15:37 +00:00
|
|
|
MNT_REL(mp);
|
2004-07-04 08:52:35 +00:00
|
|
|
MNT_IUNLOCK(mp);
|
|
|
|
}
|
|
|
|
|
2007-03-13 01:50:27 +00:00
|
|
|
static void
|
|
|
|
insmntque_stddtr(struct vnode *vp, void *dtr_arg)
|
|
|
|
{
|
|
|
|
|
|
|
|
vp->v_data = NULL;
|
|
|
|
vp->v_op = &dead_vnodeops;
|
|
|
|
vgone(vp);
|
|
|
|
vput(vp);
|
|
|
|
}
|
|
|
|
|
2004-07-04 08:52:35 +00:00
|
|
|
/*
|
|
|
|
* Insert into list of vnodes for the new mount point, if available.
|
|
|
|
*/
|
2007-03-13 01:50:27 +00:00
|
|
|
int
|
|
|
|
insmntque1(struct vnode *vp, struct mount *mp,
|
|
|
|
void (*dtr)(struct vnode *, void *), void *dtr_arg)
|
2004-07-04 08:52:35 +00:00
|
|
|
{
|
|
|
|
|
2007-03-13 01:50:27 +00:00
|
|
|
KASSERT(vp->v_mount == NULL,
|
|
|
|
("insmntque: vnode already on per mount vnode list"));
|
2005-02-17 10:28:58 +00:00
|
|
|
VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
|
2012-10-22 17:50:54 +00:00
|
|
|
ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp");
|
|
|
|
|
2012-04-20 06:50:44 +00:00
|
|
|
/*
|
|
|
|
* We acquire the vnode interlock early to ensure that the
|
|
|
|
* vnode cannot be recycled by another process releasing a
|
|
|
|
* holdcnt on it before we get it on both the vnode list
|
|
|
|
* and the active vnode list. The mount mutex protects only
|
|
|
|
* manipulation of the vnode list and the vnode freelist
|
|
|
|
* mutex protects only manipulation of the active vnode list.
|
|
|
|
* Hence the need to hold the vnode interlock throughout.
|
|
|
|
*/
|
2006-02-06 10:19:50 +00:00
|
|
|
MNT_ILOCK(mp);
|
2012-04-20 06:50:44 +00:00
|
|
|
VI_LOCK(vp);
|
2012-11-19 20:43:19 +00:00
|
|
|
if (((mp->mnt_kern_flag & MNTK_NOINSMNTQ) != 0 &&
|
2008-08-28 09:08:15 +00:00
|
|
|
((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 ||
|
2012-11-19 20:43:19 +00:00
|
|
|
mp->mnt_nvnodelistsize == 0)) &&
|
|
|
|
(vp->v_vflag & VV_FORCEINSMQ) == 0) {
|
|
|
|
VI_UNLOCK(vp);
|
|
|
|
MNT_IUNLOCK(mp);
|
|
|
|
if (dtr != NULL)
|
|
|
|
dtr(vp, dtr_arg);
|
|
|
|
return (EBUSY);
|
2007-03-13 01:50:27 +00:00
|
|
|
}
|
|
|
|
vp->v_mount = mp;
|
2006-02-06 10:19:50 +00:00
|
|
|
MNT_REF(mp);
|
2004-07-04 08:52:35 +00:00
|
|
|
TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
|
2006-01-09 20:42:19 +00:00
|
|
|
VNASSERT(mp->mnt_nvnodelistsize >= 0, vp,
|
|
|
|
("neg mount point vnode list size"));
|
2004-07-04 08:52:35 +00:00
|
|
|
mp->mnt_nvnodelistsize++;
|
2012-04-20 06:50:44 +00:00
|
|
|
KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
|
|
|
|
("Activating already active vnode"));
|
|
|
|
vp->v_iflag |= VI_ACTIVE;
|
2016-09-30 17:27:17 +00:00
|
|
|
mtx_lock(&mp->mnt_listmtx);
|
2012-04-20 06:50:44 +00:00
|
|
|
TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist);
|
|
|
|
mp->mnt_activevnodelistsize++;
|
2016-09-30 17:27:17 +00:00
|
|
|
mtx_unlock(&mp->mnt_listmtx);
|
2012-04-20 06:50:44 +00:00
|
|
|
VI_UNLOCK(vp);
|
2006-02-06 10:19:50 +00:00
|
|
|
MNT_IUNLOCK(mp);
|
2007-03-13 01:50:27 +00:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
insmntque(struct vnode *vp, struct mount *mp)
|
|
|
|
{
|
|
|
|
|
|
|
|
return (insmntque1(vp, mp, insmntque_stddtr, NULL));
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2005-02-19 11:44:57 +00:00
|
|
|
* Flush out and invalidate all buffers associated with a bufobj
|
1994-05-24 10:09:53 +00:00
|
|
|
* Called with the underlying object locked.
|
|
|
|
*/
|
|
|
|
int
|
2008-10-10 21:23:50 +00:00
|
|
|
bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2003-09-19 23:52:06 +00:00
|
|
|
int error;
|
2002-09-25 02:22:21 +00:00
|
|
|
|
2004-10-27 08:05:02 +00:00
|
|
|
BO_LOCK(bo);
|
1998-06-10 18:13:19 +00:00
|
|
|
if (flags & V_SAVE) {
|
2004-10-21 15:53:54 +00:00
|
|
|
error = bufobj_wwait(bo, slpflag, slptimeo);
|
|
|
|
if (error) {
|
2005-01-11 10:16:39 +00:00
|
|
|
BO_UNLOCK(bo);
|
2004-10-21 15:53:54 +00:00
|
|
|
return (error);
|
1998-06-10 18:13:19 +00:00
|
|
|
}
|
2004-10-21 15:53:54 +00:00
|
|
|
if (bo->bo_dirty.bv_cnt > 0) {
|
2005-01-11 10:16:39 +00:00
|
|
|
BO_UNLOCK(bo);
|
2008-10-10 21:23:50 +00:00
|
|
|
if ((error = BO_SYNC(bo, MNT_WAIT)) != 0)
|
1998-06-10 18:13:19 +00:00
|
|
|
return (error);
|
2002-08-04 10:29:36 +00:00
|
|
|
/*
|
|
|
|
* XXX We could save a lock/unlock if this was only
|
|
|
|
* enabled under INVARIANTS
|
|
|
|
*/
|
2005-01-11 10:16:39 +00:00
|
|
|
BO_LOCK(bo);
|
2004-10-21 15:53:54 +00:00
|
|
|
if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
|
1998-06-10 18:13:19 +00:00
|
|
|
panic("vinvalbuf: dirty bufs");
|
|
|
|
}
|
2002-06-06 15:46:38 +00:00
|
|
|
}
|
2002-09-25 02:22:21 +00:00
|
|
|
/*
|
|
|
|
* If you alter this loop please notice that interlock is dropped and
|
|
|
|
* reacquired in flushbuflist. Special care is needed to ensure that
|
|
|
|
* no race conditions occur from this.
|
|
|
|
*/
|
2005-01-11 10:01:54 +00:00
|
|
|
do {
|
|
|
|
error = flushbuflist(&bo->bo_clean,
|
2005-02-19 11:44:57 +00:00
|
|
|
flags, bo, slpflag, slptimeo);
|
2011-11-04 04:02:50 +00:00
|
|
|
if (error == 0 && !(flags & V_CLEANONLY))
|
2005-01-11 10:01:54 +00:00
|
|
|
error = flushbuflist(&bo->bo_dirty,
|
2005-02-19 11:44:57 +00:00
|
|
|
flags, bo, slpflag, slptimeo);
|
2005-01-16 21:09:39 +00:00
|
|
|
if (error != 0 && error != EAGAIN) {
|
2005-01-11 10:01:54 +00:00
|
|
|
BO_UNLOCK(bo);
|
|
|
|
return (error);
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
}
|
2005-01-11 10:01:54 +00:00
|
|
|
} while (error != 0);
|
1994-08-29 06:09:15 +00:00
|
|
|
|
2001-10-05 20:10:32 +00:00
|
|
|
/*
|
|
|
|
* Wait for I/O to complete. XXX needs cleaning up. The vnode can
|
|
|
|
* have write I/O in-progress but if there is a VM object then the
|
|
|
|
* VM object can also have read-I/O in-progress.
|
|
|
|
*/
|
|
|
|
do {
|
2004-10-21 15:53:54 +00:00
|
|
|
bufobj_wwait(bo, 0, 0);
|
2017-04-05 16:57:53 +00:00
|
|
|
if ((flags & V_VMIO) == 0) {
|
|
|
|
BO_UNLOCK(bo);
|
|
|
|
if (bo->bo_object != NULL) {
|
|
|
|
VM_OBJECT_WLOCK(bo->bo_object);
|
|
|
|
vm_object_pip_wait(bo->bo_object, "bovlbx");
|
|
|
|
VM_OBJECT_WUNLOCK(bo->bo_object);
|
|
|
|
}
|
|
|
|
BO_LOCK(bo);
|
2001-10-05 20:10:32 +00:00
|
|
|
}
|
2004-10-25 09:14:03 +00:00
|
|
|
} while (bo->bo_numoutput > 0);
|
2005-02-07 10:04:06 +00:00
|
|
|
BO_UNLOCK(bo);
|
1997-03-05 04:54:54 +00:00
|
|
|
|
1995-03-20 02:08:24 +00:00
|
|
|
/*
|
|
|
|
* Destroy the copy in the VM cache, too.
|
|
|
|
*/
|
2011-11-04 04:02:50 +00:00
|
|
|
if (bo->bo_object != NULL &&
|
2017-04-05 16:57:53 +00:00
|
|
|
(flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0) {
|
2013-02-20 12:03:20 +00:00
|
|
|
VM_OBJECT_WLOCK(bo->bo_object);
|
2011-06-29 16:40:41 +00:00
|
|
|
vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ?
|
|
|
|
OBJPR_CLEANONLY : 0);
|
2013-02-20 12:03:20 +00:00
|
|
|
VM_OBJECT_WUNLOCK(bo->bo_object);
|
1994-08-29 06:09:15 +00:00
|
|
|
}
|
Make our v_usecount vnode reference count work identically to the
original BSD code. The association between the vnode and the vm_object
no longer includes reference counts. The major difference is that
vm_object's are no longer freed gratuitiously from the vnode, and so
once an object is created for the vnode, it will last as long as the
vnode does.
When a vnode object reference count is incremented, then the underlying
vnode reference count is incremented also. The two "objects" are now
more intimately related, and so the interactions are now much less
complex.
When vnodes are now normally placed onto the free queue with an object still
attached. The rundown of the object happens at vnode rundown time, and
happens with exactly the same filesystem semantics of the original VFS
code. There is absolutely no need for vnode_pager_uncache and other
travesties like that anymore.
A side-effect of these changes is that SMP locking should be much simpler,
the I/O copyin/copyout optimizations work, NFS should be more ponderable,
and further work on layered filesystems should be less frustrating, because
of the totally coherent management of the vnode objects and vnodes.
Please be careful with your system while running this code, but I would
greatly appreciate feedback as soon a reasonably possible.
1998-01-06 05:26:17 +00:00
|
|
|
|
2002-09-25 02:22:21 +00:00
|
|
|
#ifdef INVARIANTS
|
2005-01-11 10:16:39 +00:00
|
|
|
BO_LOCK(bo);
|
2017-04-05 16:57:53 +00:00
|
|
|
if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0 &&
|
2005-01-11 10:16:39 +00:00
|
|
|
(bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0))
|
1994-05-24 10:09:53 +00:00
|
|
|
panic("vinvalbuf: flush failed");
|
2005-01-11 10:16:39 +00:00
|
|
|
BO_UNLOCK(bo);
|
2002-09-25 02:22:21 +00:00
|
|
|
#endif
|
1994-05-24 10:09:53 +00:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2005-02-19 11:44:57 +00:00
|
|
|
/*
|
|
|
|
* Flush out and invalidate all buffers associated with a vnode.
|
|
|
|
* Called with the underlying object locked.
|
|
|
|
*/
|
|
|
|
int
|
2008-10-10 21:23:50 +00:00
|
|
|
vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo)
|
2005-02-19 11:44:57 +00:00
|
|
|
{
|
|
|
|
|
2009-02-05 15:03:35 +00:00
|
|
|
CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
|
2005-02-19 11:44:57 +00:00
|
|
|
ASSERT_VOP_LOCKED(vp, "vinvalbuf");
|
2013-10-09 18:43:29 +00:00
|
|
|
if (vp->v_object != NULL && vp->v_object->handle != vp)
|
|
|
|
return (0);
|
2008-10-10 21:23:50 +00:00
|
|
|
return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo));
|
2005-02-19 11:44:57 +00:00
|
|
|
}
|
|
|
|
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word are reserved for read-ahead and to help with
write clustering, so they will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, let's call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
attribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
/*
|
|
|
|
* Flush out buffers on the specified list.
|
2002-09-25 02:22:21 +00:00
|
|
|
*
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
*/
|
|
|
|
static int
|
2011-10-27 17:43:36 +00:00
|
|
|
flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag,
|
2006-01-21 19:42:10 +00:00
|
|
|
int slptimeo)
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
{
|
|
|
|
struct buf *bp, *nbp;
|
2005-01-16 21:09:39 +00:00
|
|
|
int retval, error;
|
2005-09-16 18:28:12 +00:00
|
|
|
daddr_t lblkno;
|
|
|
|
b_xflags_t xflags;
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
|
2013-05-31 00:43:41 +00:00
|
|
|
ASSERT_BO_WLOCKED(bo);
|
2002-09-25 02:22:21 +00:00
|
|
|
|
2005-01-16 21:09:39 +00:00
|
|
|
retval = 0;
|
2005-01-11 10:01:54 +00:00
|
|
|
TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) {
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) ||
|
2002-09-25 02:22:21 +00:00
|
|
|
((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) {
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
continue;
|
2002-09-25 02:22:21 +00:00
|
|
|
}
|
2005-09-16 18:28:12 +00:00
|
|
|
lblkno = 0;
|
|
|
|
xflags = 0;
|
|
|
|
if (nbp != NULL) {
|
|
|
|
lblkno = nbp->b_lblkno;
|
2013-04-06 22:21:23 +00:00
|
|
|
xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN);
|
2005-09-16 18:28:12 +00:00
|
|
|
}
|
2005-01-16 21:09:39 +00:00
|
|
|
retval = EAGAIN;
|
2003-02-25 03:37:48 +00:00
|
|
|
error = BUF_TIMELOCK(bp,
|
2013-05-31 00:43:41 +00:00
|
|
|
LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo),
|
2003-02-25 03:37:48 +00:00
|
|
|
"flushbuf", slpflag, slptimeo);
|
|
|
|
if (error) {
|
2005-01-11 10:16:39 +00:00
|
|
|
BO_LOCK(bo);
|
2005-01-11 10:01:54 +00:00
|
|
|
return (error != ENOLCK ? error : EAGAIN);
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
}
|
2005-04-06 06:49:46 +00:00
|
|
|
KASSERT(bp->b_bufobj == bo,
|
2007-04-10 15:29:37 +00:00
|
|
|
("bp %p wrong b_bufobj %p should be %p",
|
2005-06-14 20:31:53 +00:00
|
|
|
bp, bp->b_bufobj, bo));
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
/*
|
|
|
|
* XXX Since there are no node locks for NFS, I
|
|
|
|
* believe there is a slight chance that a delayed
|
|
|
|
* write will occur while sleeping just above, so
|
2005-02-19 11:44:57 +00:00
|
|
|
* check for it.
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
*/
|
|
|
|
if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
|
2005-02-19 11:44:57 +00:00
|
|
|
(flags & V_SAVE)) {
|
|
|
|
bremfree(bp);
|
|
|
|
bp->b_flags |= B_ASYNC;
|
|
|
|
bwrite(bp);
|
2005-01-11 10:16:39 +00:00
|
|
|
BO_LOCK(bo);
|
2005-02-19 11:44:57 +00:00
|
|
|
return (EAGAIN); /* XXX: why not loop ? */
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
}
|
|
|
|
bremfree(bp);
|
2006-05-25 01:00:35 +00:00
|
|
|
bp->b_flags |= (B_INVAL | B_RELBUF);
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
bp->b_flags &= ~B_ASYNC;
|
|
|
|
brelse(bp);
|
2005-01-11 10:16:39 +00:00
|
|
|
BO_LOCK(bo);
|
2015-12-16 08:39:51 +00:00
|
|
|
nbp = gbincore(bo, lblkno);
|
|
|
|
if (nbp == NULL || (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
|
|
|
|
!= xflags)
|
2005-09-16 18:28:12 +00:00
|
|
|
break; /* nbp invalid */
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
}
|
2005-01-16 21:09:39 +00:00
|
|
|
return (retval);
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
}
|
|
|
|
|
2015-12-16 08:48:37 +00:00
|
|
|
int
|
|
|
|
bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn)
|
|
|
|
{
|
|
|
|
struct buf *bp;
|
|
|
|
int error;
|
|
|
|
daddr_t lblkno;
|
|
|
|
|
|
|
|
ASSERT_BO_LOCKED(bo);
|
|
|
|
|
2016-01-05 14:48:40 +00:00
|
|
|
for (lblkno = startn;;) {
|
|
|
|
again:
|
2015-12-16 08:48:37 +00:00
|
|
|
bp = BUF_PCTRIE_LOOKUP_GE(&bufv->bv_root, lblkno);
|
2016-02-17 19:39:57 +00:00
|
|
|
if (bp == NULL || bp->b_lblkno >= endn ||
|
|
|
|
bp->b_lblkno < startn)
|
2015-12-16 08:48:37 +00:00
|
|
|
break;
|
|
|
|
error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
|
|
|
|
LK_INTERLOCK, BO_LOCKPTR(bo), "brlsfl", 0, 0);
|
|
|
|
if (error != 0) {
|
|
|
|
BO_RLOCK(bo);
|
2016-01-05 14:48:40 +00:00
|
|
|
if (error == ENOLCK)
|
|
|
|
goto again;
|
|
|
|
return (error);
|
2015-12-16 08:48:37 +00:00
|
|
|
}
|
|
|
|
KASSERT(bp->b_bufobj == bo,
|
|
|
|
("bp %p wrong b_bufobj %p should be %p",
|
|
|
|
bp, bp->b_bufobj, bo));
|
2016-01-05 14:48:40 +00:00
|
|
|
lblkno = bp->b_lblkno + 1;
|
2015-12-16 08:48:37 +00:00
|
|
|
if ((bp->b_flags & B_MANAGED) == 0)
|
|
|
|
bremfree(bp);
|
|
|
|
bp->b_flags |= B_RELBUF;
|
|
|
|
/*
|
|
|
|
* In the VMIO case, use the B_NOREUSE flag to hint that the
|
|
|
|
* pages backing each buffer in the range are unlikely to be
|
|
|
|
* reused. Dirty buffers will have the hint applied once
|
|
|
|
* they've been written.
|
|
|
|
*/
|
|
|
|
if (bp->b_vp->v_object != NULL)
|
|
|
|
bp->b_flags |= B_NOREUSE;
|
|
|
|
brelse(bp);
|
|
|
|
BO_RLOCK(bo);
|
|
|
|
}
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
Some VM improvements, including elimination of a lot of Sig-11
problems. Tor Egge and others have helped with various VM bugs
lately, but don't blame him -- blame me!!!
pmap.c:
1) Create an object for kernel page table allocations. This
fixes a bogus allocation method previously used for such, by
grabbing pages from the kernel object, using bogus pindexes.
(This was a code cleanup, and perhaps a minor system stability
issue.)
pmap.c:
2) Pre-set the modify and accessed bits when prudent. This will
decrease bus traffic under certain circumstances.
vfs_bio.c, vfs_cluster.c:
3) Rather than calculating the beginning virtual byte offset
multiple times, stick the offset into the buffer header, so
that the calculated offset can be reused. (Long long multiplies
are often expensive, and this is a probably unmeasurable performance
improvement, and code cleanup.)
vfs_bio.c:
4) Handle write recursion more intelligently (but not perfectly) so
that it is less likely to cause a system panic, and is also
much more robust.
vfs_bio.c:
5) getblk incorrectly wrote out blocks that are incorrectly sized.
The problem is fixed, and writes blocks out ONLY when B_DELWRI
is true.
vfs_bio.c:
6) Check that already constituted buffers have fully valid pages. If
not, then make sure that the B_CACHE bit is not set. (This was
a major source of Sig-11 type problems.)
vfs_bio.c:
7) Fix a potential system deadlock due to an incorrectly specified
sleep priority while waiting for a buffer write operation. The
change that I made opens the system up to serious problems, and
we need to examine the issue of process sleep priorities.
vfs_cluster.c, vfs_bio.c:
8) Make clustered reads work more correctly (and more completely)
when buffers are already constituted, but not fully valid.
(This was another system reliability issue.)
vfs_subr.c, ffs_inode.c:
9) Create a vtruncbuf function, which is used by filesystems that
can truncate files. The vinvalbuf forced a file sync type operation,
while vtruncbuf only invalidates the buffers past the new end of file,
and also invalidates the appropriate pages. (This was a system reliability
and performance issue.)
10) Modify FFS to use vtruncbuf.
vm_object.c:
11) Make the object rundown mechanism for OBJT_VNODE type objects work
more correctly. Included in that fix, create pager entries for
the OBJT_DEAD pager type, so that paging requests that might slip
in during race conditions are properly handled. (This was a system
reliability issue.)
vm_page.c:
12) Make some of the page validation routines be a little less picky
about arguments passed to them. Also, support page invalidation
change the object generation count so that we handle generation
counts a little more robustly.
vm_pageout.c:
13) Further reduce pageout daemon activity when the system doesn't
need help from it. There should be no additional performance
decrease even when the pageout daemon is running. (This was
a significant performance issue.)
vnode_pager.c:
14) Teach the vnode pager to handle race conditions during vnode
deallocations.
1998-03-16 01:56:03 +00:00
|
|
|
/*
|
|
|
|
* Truncate a file's buffer and pages to a specified length. This
|
|
|
|
* is in lieu of the old vinvalbuf mechanism, which performed unneeded
|
|
|
|
* sync activity.
|
|
|
|
*/
|
|
|
|
int
|
2012-04-23 13:21:28 +00:00
|
|
|
vtruncbuf(struct vnode *vp, struct ucred *cred, off_t length, int blksize)
|
Some VM improvements, including elimination of a lot of Sig-11
problems. Tor Egge and others have helped with various VM bugs
lately, but don't blame him -- blame me!!!
pmap.c:
1) Create an object for kernel page table allocations. This
fixes a bogus allocation method previously used for such, by
grabbing pages from the kernel object, using bogus pindexes.
(This was a code cleanup, and perhaps a minor system stability
issue.)
pmap.c:
2) Pre-set the modify and accessed bits when prudent. This will
decrease bus traffic under certain circumstances.
vfs_bio.c, vfs_cluster.c:
3) Rather than calculating the beginning virtual byte offset
multiple times, stick the offset into the buffer header, so
that the calculated offset can be reused. (Long long multiplies
are often expensive, and this is a probably unmeasurable performance
improvement, and code cleanup.)
vfs_bio.c:
4) Handle write recursion more intelligently (but not perfectly) so
that it is less likely to cause a system panic, and is also
much more robust.
vfs_bio.c:
5) getblk incorrectly wrote out blocks that are incorrectly sized.
The problem is fixed, and writes blocks out ONLY when B_DELWRI
is true.
vfs_bio.c:
6) Check that already constituted buffers have fully valid pages. If
not, then make sure that the B_CACHE bit is not set. (This was
a major source of Sig-11 type problems.)
vfs_bio.c:
7) Fix a potential system deadlock due to an incorrectly specified
sleep priority while waiting for a buffer write operation. The
change that I made opens the system up to serious problems, and
we need to examine the issue of process sleep priorities.
vfs_cluster.c, vfs_bio.c:
8) Make clustered reads work more correctly (and more completely)
when buffers are already constituted, but not fully valid.
(This was another system reliability issue.)
vfs_subr.c, ffs_inode.c:
9) Create a vtruncbuf function, which is used by filesystems that
can truncate files. The vinvalbuf forced a file sync type operation,
while vtruncbuf only invalidates the buffers past the new end of file,
and also invalidates the appropriate pages. (This was a system reliability
and performance issue.)
10) Modify FFS to use vtruncbuf.
vm_object.c:
11) Make the object rundown mechanism for OBJT_VNODE type objects work
more correctly. Included in that fix, create pager entries for
the OBJT_DEAD pager type, so that paging requests that might slip
in during race conditions are properly handled. (This was a system
reliability issue.)
vm_page.c:
12) Make some of the page validation routines be a little less picky
about arguments passed to them. Also, support page invalidation
change the object generation count so that we handle generation
counts a little more robustly.
vm_pageout.c:
13) Further reduce pageout daemon activity when the system doesn't
need help from it. There should be no additional performance
decrease even when the pageout daemon is running. (This was
a significant performance issue.)
vnode_pager.c:
14) Teach the vnode pager to handle race conditions during vnode
deallocations.
1998-03-16 01:56:03 +00:00
|
|
|
{
|
2004-10-21 14:13:54 +00:00
|
|
|
struct buf *bp, *nbp;
|
2003-09-19 23:52:06 +00:00
|
|
|
int anyfreed;
|
Some VM improvements, including elimination of a lot of Sig-11
problems. Tor Egge and others have helped with various VM bugs
lately, but don't blame him -- blame me!!!
pmap.c:
1) Create an object for kernel page table allocations. This
fixes a bogus allocation method previously used for such, by
grabbing pages from the kernel object, using bogus pindexes.
(This was a code cleanup, and perhaps a minor system stability
issue.)
pmap.c:
2) Pre-set the modify and accessed bits when prudent. This will
decrease bus traffic under certain circumstances.
vfs_bio.c, vfs_cluster.c:
3) Rather than calculating the beginning virtual byte offset
multiple times, stick the offset into the buffer header, so
that the calculated offset can be reused. (Long long multiplies
are often expensive, and this is a probably unmeasurable performance
improvement, and code cleanup.)
vfs_bio.c:
4) Handle write recursion more intelligently (but not perfectly) so
that it is less likely to cause a system panic, and is also
much more robust.
vfs_bio.c:
5) getblk incorrectly wrote out blocks that are incorrectly sized.
The problem is fixed, and writes blocks out ONLY when B_DELWRI
is true.
vfs_bio.c:
6) Check that already constituted buffers have fully valid pages. If
not, then make sure that the B_CACHE bit is not set. (This was
a major source of Sig-11 type problems.)
vfs_bio.c:
7) Fix a potential system deadlock due to an incorrectly specified
sleep priority while waiting for a buffer write operation. The
change that I made opens the system up to serious problems, and
we need to examine the issue of process sleep priorities.
vfs_cluster.c, vfs_bio.c:
8) Make clustered reads work more correctly (and more completely)
when buffers are already constituted, but not fully valid.
(This was another system reliability issue.)
vfs_subr.c, ffs_inode.c:
9) Create a vtruncbuf function, which is used by filesystems that
can truncate files. The vinvalbuf forced a file sync type operation,
while vtruncbuf only invalidates the buffers past the new end of file,
and also invalidates the appropriate pages. (This was a system reliability
and performance issue.)
10) Modify FFS to use vtruncbuf.
vm_object.c:
11) Make the object rundown mechanism for OBJT_VNODE type objects work
more correctly. Included in that fix, create pager entries for
the OBJT_DEAD pager type, so that paging requests that might slip
in during race conditions are properly handled. (This was a system
reliability issue.)
vm_page.c:
12) Make some of the page validation routines be a little less picky
about arguments passed to them. Also, support page invalidation
change the object generation count so that we handle generation
counts a little more robustly.
vm_pageout.c:
13) Further reduce pageout daemon activity when the system doesn't
need help from it. There should be no additional performance
decrease even when the pageout daemon is running. (This was
a significant performance issue.)
vnode_pager.c:
14) Teach the vnode pager to handle race conditions during vnode
deallocations.
1998-03-16 01:56:03 +00:00
|
|
|
int trunclbn;
|
2004-10-21 14:13:54 +00:00
|
|
|
struct bufobj *bo;
|
Some VM improvements, including elimination of a lot of Sig-11
problems. Tor Egge and others have helped with various VM bugs
lately, but don't blame him -- blame me!!!
pmap.c:
1) Create an object for kernel page table allocations. This
fixes a bogus allocation method previously used for such, by
grabbing pages from the kernel object, using bogus pindexes.
(This was a code cleanup, and perhaps a minor system stability
issue.)
pmap.c:
2) Pre-set the modify and accessed bits when prudent. This will
decrease bus traffic under certain circumstances.
vfs_bio.c, vfs_cluster.c:
3) Rather than calculating the beginning virtual byte offset
multiple times, stick the offset into the buffer header, so
that the calculated offset can be reused. (Long long multiplies
are often expensive, and this is a probably unmeasurable performance
improvement, and code cleanup.)
vfs_bio.c:
4) Handle write recursion more intelligently (but not perfectly) so
that it is less likely to cause a system panic, and is also
much more robust.
vfs_bio.c:
5) getblk incorrectly wrote out blocks that are incorrectly sized.
The problem is fixed, and writes blocks out ONLY when B_DELWRI
is true.
vfs_bio.c:
6) Check that already constituted buffers have fully valid pages. If
not, then make sure that the B_CACHE bit is not set. (This was
a major source of Sig-11 type problems.)
vfs_bio.c:
7) Fix a potential system deadlock due to an incorrectly specified
sleep priority while waiting for a buffer write operation. The
change that I made opens the system up to serious problems, and
we need to examine the issue of process sleep priorities.
vfs_cluster.c, vfs_bio.c:
8) Make clustered reads work more correctly (and more completely)
when buffers are already constituted, but not fully valid.
(This was another system reliability issue.)
vfs_subr.c, ffs_inode.c:
9) Create a vtruncbuf function, which is used by filesystems that
can truncate files. The vinvalbuf forced a file sync type operation,
while vtruncbuf only invalidates the buffers past the new end of file,
and also invalidates the appropriate pages. (This was a system reliability
and performance issue.)
10) Modify FFS to use vtruncbuf.
vm_object.c:
11) Make the object rundown mechanism for OBJT_VNODE type objects work
more correctly. Included in that fix, create pager entries for
the OBJT_DEAD pager type, so that paging requests that might slip
in during race conditions are properly handled. (This was a system
reliability issue.)
vm_page.c:
12) Make some of the page validation routines be a little less picky
about arguments passed to them. Also, support page invalidation
change the object generation count so that we handle generation
counts a little more robustly.
vm_pageout.c:
13) Further reduce pageout daemon activity when the system doesn't
need help from it. There should be no additional performance
decrease even when the pageout daemon is running. (This was
a significant performance issue.)
vnode_pager.c:
14) Teach the vnode pager to handle race conditions during vnode
deallocations.
1998-03-16 01:56:03 +00:00
|
|
|
|
2009-02-05 15:03:35 +00:00
|
|
|
CTR5(KTR_VFS, "%s: vp %p with cred %p and block %d:%ju", __func__,
|
|
|
|
vp, cred, blksize, (uintmax_t)length);
|
|
|
|
|
Some VM improvements, including elimination of a lot of Sig-11
problems. Tor Egge and others have helped with various VM bugs
lately, but don't blame him -- blame me!!!
pmap.c:
1) Create an object for kernel page table allocations. This
fixes a bogus allocation method previously used for such, by
grabbing pages from the kernel object, using bogus pindexes.
(This was a code cleanup, and perhaps a minor system stability
issue.)
pmap.c:
2) Pre-set the modify and accessed bits when prudent. This will
decrease bus traffic under certain circumstances.
vfs_bio.c, vfs_cluster.c:
3) Rather than calculating the beginning virtual byte offset
multiple times, stick the offset into the buffer header, so
that the calculated offset can be reused. (Long long multiplies
are often expensive, and this is a probably unmeasurable performance
improvement, and code cleanup.)
vfs_bio.c:
4) Handle write recursion more intelligently (but not perfectly) so
that it is less likely to cause a system panic, and is also
much more robust.
vfs_bio.c:
5) getblk incorrectly wrote out blocks that are incorrectly sized.
The problem is fixed, and writes blocks out ONLY when B_DELWRI
is true.
vfs_bio.c:
6) Check that already constituted buffers have fully valid pages. If
not, then make sure that the B_CACHE bit is not set. (This was
a major source of Sig-11 type problems.)
vfs_bio.c:
7) Fix a potential system deadlock due to an incorrectly specified
sleep priority while waiting for a buffer write operation. The
change that I made opens the system up to serious problems, and
we need to examine the issue of process sleep priorities.
vfs_cluster.c, vfs_bio.c:
8) Make clustered reads work more correctly (and more completely)
when buffers are already constituted, but not fully valid.
(This was another system reliability issue.)
vfs_subr.c, ffs_inode.c:
9) Create a vtruncbuf function, which is used by filesystems that
can truncate files. The vinvalbuf forced a file sync type operation,
while vtruncbuf only invalidates the buffers past the new end of file,
and also invalidates the appropriate pages. (This was a system reliability
and performance issue.)
10) Modify FFS to use vtruncbuf.
vm_object.c:
11) Make the object rundown mechanism for OBJT_VNODE type objects work
more correctly. Included in that fix, create pager entries for
the OBJT_DEAD pager type, so that paging requests that might slip
in during race conditions are properly handled. (This was a system
reliability issue.)
vm_page.c:
12) Make some of the page validation routines be a little less picky
about arguments passed to them. Also, support page invalidation
change the object generation count so that we handle generation
counts a little more robustly.
vm_pageout.c:
13) Further reduce pageout daemon activity when the system doesn't
need help from it. There should be no additional performance
decrease even when the pageout daemon is running. (This was
a significant performance issue.)
vnode_pager.c:
14) Teach the vnode pager to handle race conditions during vnode
deallocations.
1998-03-16 01:56:03 +00:00
|
|
|
/*
|
|
|
|
* Round up to the *next* lbn.
|
|
|
|
*/
|
2016-04-26 15:38:17 +00:00
|
|
|
trunclbn = howmany(length, blksize);
|
Some VM improvements, including elimination of a lot of Sig-11
problems. Tor Egge and others have helped with various VM bugs
lately, but don't blame him -- blame me!!!
pmap.c:
1) Create an object for kernel page table allocations. This
fixes a bogus allocation method previously used for such, by
grabbing pages from the kernel object, using bogus pindexes.
(This was a code cleanup, and perhaps a minor system stability
issue.)
pmap.c:
2) Pre-set the modify and accessed bits when prudent. This will
decrease bus traffic under certain circumstances.
vfs_bio.c, vfs_cluster.c:
3) Rather than calculating the beginning virtual byte offset
multiple times, stick the offset into the buffer header, so
that the calculated offset can be reused. (Long long multiplies
are often expensive, and this is a probably unmeasurable performance
improvement, and code cleanup.)
vfs_bio.c:
4) Handle write recursion more intelligently (but not perfectly) so
that it is less likely to cause a system panic, and is also
much more robust.
vfs_bio.c:
5) getblk incorrectly wrote out blocks that are incorrectly sized.
The problem is fixed, and writes blocks out ONLY when B_DELWRI
is true.
vfs_bio.c:
6) Check that already constituted buffers have fully valid pages. If
not, then make sure that the B_CACHE bit is not set. (This was
a major source of Sig-11 type problems.)
vfs_bio.c:
7) Fix a potential system deadlock due to an incorrectly specified
sleep priority while waiting for a buffer write operation. The
change that I made opens the system up to serious problems, and
we need to examine the issue of process sleep priorities.
vfs_cluster.c, vfs_bio.c:
8) Make clustered reads work more correctly (and more completely)
when buffers are already constituted, but not fully valid.
(This was another system reliability issue.)
vfs_subr.c, ffs_inode.c:
9) Create a vtruncbuf function, which is used by filesystems that
can truncate files. The vinvalbuf forced a file sync type operation,
while vtruncbuf only invalidates the buffers past the new end of file,
and also invalidates the appropriate pages. (This was a system reliability
and performance issue.)
10) Modify FFS to use vtruncbuf.
vm_object.c:
11) Make the object rundown mechanism for OBJT_VNODE type objects work
more correctly. Included in that fix, create pager entries for
the OBJT_DEAD pager type, so that paging requests that might slip
in during race conditions are properly handled. (This was a system
reliability issue.)
vm_page.c:
12) Make some of the page validation routines be a little less picky
about arguments passed to them. Also, support page invalidation
change the object generation count so that we handle generation
counts a little more robustly.
vm_pageout.c:
13) Further reduce pageout daemon activity when the system doesn't
need help from it. There should be no additional performance
decrease even when the pageout daemon is running. (This was
a significant performance issue.)
vnode_pager.c:
14) Teach the vnode pager to handle race conditions during vnode
deallocations.
1998-03-16 01:56:03 +00:00
|
|
|
|
2002-09-25 02:22:21 +00:00
|
|
|
ASSERT_VOP_LOCKED(vp, "vtruncbuf");
|
Some VM improvements, including elimination of a lot of Sig-11
problems. Tor Egge and others have helped with various VM bugs
lately, but don't blame him -- blame me!!!
pmap.c:
1) Create an object for kernel page table allocations. This
fixes a bogus allocation method previously used for such, by
grabbing pages from the kernel object, using bogus pindexes.
(This was a code cleanup, and perhaps a minor system stability
issue.)
pmap.c:
2) Pre-set the modify and accessed bits when prudent. This will
decrease bus traffic under certain circumstances.
vfs_bio.c, vfs_cluster.c:
3) Rather than calculating the beginning virtual byte offset
multiple times, stick the offset into the buffer header, so
that the calculated offset can be reused. (Long long multiplies
are often expensive, and this is a probably unmeasurable performance
improvement, and code cleanup.)
vfs_bio.c:
4) Handle write recursion more intelligently (but not perfectly) so
that it is less likely to cause a system panic, and is also
much more robust.
vfs_bio.c:
5) getblk incorrectly wrote out blocks that are incorrectly sized.
The problem is fixed, and writes blocks out ONLY when B_DELWRI
is true.
vfs_bio.c:
6) Check that already constituted buffers have fully valid pages. If
not, then make sure that the B_CACHE bit is not set. (This was
a major source of Sig-11 type problems.)
vfs_bio.c:
7) Fix a potential system deadlock due to an incorrectly specified
sleep priority while waiting for a buffer write operation. The
change that I made opens the system up to serious problems, and
we need to examine the issue of process sleep priorities.
vfs_cluster.c, vfs_bio.c:
8) Make clustered reads work more correctly (and more completely)
when buffers are already constituted, but not fully valid.
(This was another system reliability issue.)
vfs_subr.c, ffs_inode.c:
9) Create a vtruncbuf function, which is used by filesystems that
can truncate files. The vinvalbuf forced a file sync type operation,
while vtruncbuf only invalidates the buffers past the new end of file,
and also invalidates the appropriate pages. (This was a system reliability
and performance issue.)
10) Modify FFS to use vtruncbuf.
vm_object.c:
11) Make the object rundown mechanism for OBJT_VNODE type objects work
more correctly. Included in that fix, create pager entries for
the OBJT_DEAD pager type, so that paging requests that might slip
in during race conditions are properly handled. (This was a system
reliability issue.)
vm_page.c:
12) Make some of the page validation routines be a little less picky
about arguments passed to them. Also, support page invalidation
change the object generation count so that we handle generation
counts a little more robustly.
vm_pageout.c:
13) Further reduce pageout daemon activity when the system doesn't
need help from it. There should be no additional performance
decrease even when the pageout daemon is running. (This was
a significant performance issue.)
vnode_pager.c:
14) Teach the vnode pager to handle race conditions during vnode
deallocations.
1998-03-16 01:56:03 +00:00
|
|
|
restart:
|
2004-10-21 14:13:54 +00:00
|
|
|
bo = &vp->v_bufobj;
|
2008-03-22 09:15:16 +00:00
|
|
|
BO_LOCK(bo);
|
Some VM improvements, including elimination of a lot of Sig-11
problems. Tor Egge and others have helped with various VM bugs
lately, but don't blame him -- blame me!!!
pmap.c:
1) Create an object for kernel page table allocations. This
fixes a bogus allocation method previously used for such, by
grabbing pages from the kernel object, using bogus pindexes.
(This was a code cleanup, and perhaps a minor system stability
issue.)
pmap.c:
2) Pre-set the modify and accessed bits when prudent. This will
decrease bus traffic under certain circumstances.
vfs_bio.c, vfs_cluster.c:
3) Rather than calculating the beginning virtual byte offset
multiple times, stick the offset into the buffer header, so
that the calculated offset can be reused. (Long long multiplies
are often expensive, and this is a probably unmeasurable performance
improvement, and code cleanup.)
vfs_bio.c:
4) Handle write recursion more intelligently (but not perfectly) so
that it is less likely to cause a system panic, and is also
much more robust.
vfs_bio.c:
5) getblk incorrectly wrote out blocks that are incorrectly sized.
The problem is fixed, and writes blocks out ONLY when B_DELWRI
is true.
vfs_bio.c:
6) Check that already constituted buffers have fully valid pages. If
not, then make sure that the B_CACHE bit is not set. (This was
a major source of Sig-11 type problems.)
vfs_bio.c:
7) Fix a potential system deadlock due to an incorrectly specified
sleep priority while waiting for a buffer write operation. The
change that I made opens the system up to serious problems, and
we need to examine the issue of process sleep priorities.
vfs_cluster.c, vfs_bio.c:
8) Make clustered reads work more correctly (and more completely)
when buffers are already constituted, but not fully valid.
(This was another system reliability issue.)
vfs_subr.c, ffs_inode.c:
9) Create a vtruncbuf function, which is used by filesystems that
can truncate files. The vinvalbuf forced a file sync type operation,
while vtruncbuf only invalidates the buffers past the new end of file,
and also invalidates the appropriate pages. (This was a system reliability
and performance issue.)
10) Modify FFS to use vtruncbuf.
vm_object.c:
11) Make the object rundown mechanism for OBJT_VNODE type objects work
more correctly. Included in that fix, create pager entries for
the OBJT_DEAD pager type, so that paging requests that might slip
in during race conditions are properly handled. (This was a system
reliability issue.)
vm_page.c:
12) Make some of the page validation routines be a little less picky
about arguments passed to them. Also, support page invalidation
change the object generation count so that we handle generation
counts a little more robustly.
vm_pageout.c:
13) Further reduce pageout daemon activity when the system doesn't
need help from it. There should be no additional performance
decrease even when the pageout daemon is running. (This was
a significant performance issue.)
vnode_pager.c:
14) Teach the vnode pager to handle race conditions during vnode
deallocations.
1998-03-16 01:56:03 +00:00
|
|
|
anyfreed = 1;
|
|
|
|
for (;anyfreed;) {
|
|
|
|
anyfreed = 0;
|
2004-10-21 15:53:54 +00:00
|
|
|
TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
|
2004-10-21 14:13:54 +00:00
|
|
|
if (bp->b_lblkno < trunclbn)
|
|
|
|
continue;
|
|
|
|
if (BUF_LOCK(bp,
|
|
|
|
LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
|
2013-05-31 00:43:41 +00:00
|
|
|
BO_LOCKPTR(bo)) == ENOLCK)
|
2004-10-21 14:13:54 +00:00
|
|
|
goto restart;
|
|
|
|
|
|
|
|
bremfree(bp);
|
|
|
|
bp->b_flags |= (B_INVAL | B_RELBUF);
|
|
|
|
bp->b_flags &= ~B_ASYNC;
|
|
|
|
brelse(bp);
|
|
|
|
anyfreed = 1;
|
|
|
|
|
2011-01-25 14:04:02 +00:00
|
|
|
BO_LOCK(bo);
|
2004-10-21 14:13:54 +00:00
|
|
|
if (nbp != NULL &&
|
|
|
|
(((nbp->b_xflags & BX_VNCLEAN) == 0) ||
|
|
|
|
(nbp->b_vp != vp) ||
|
|
|
|
(nbp->b_flags & B_DELWRI))) {
|
2011-01-25 14:04:02 +00:00
|
|
|
BO_UNLOCK(bo);
|
2004-10-21 14:13:54 +00:00
|
|
|
goto restart;
|
Some VM improvements, including elimination of a lot of Sig-11
problems. Tor Egge and others have helped with various VM bugs
lately, but don't blame him -- blame me!!!
pmap.c:
1) Create an object for kernel page table allocations. This
fixes a bogus allocation method previously used for such, by
grabbing pages from the kernel object, using bogus pindexes.
(This was a code cleanup, and perhaps a minor system stability
issue.)
pmap.c:
2) Pre-set the modify and accessed bits when prudent. This will
decrease bus traffic under certain circumstances.
vfs_bio.c, vfs_cluster.c:
3) Rather than calculating the beginning virtual byte offset
multiple times, stick the offset into the buffer header, so
that the calculated offset can be reused. (Long long multiplies
are often expensive, and this is a probably unmeasurable performance
improvement, and code cleanup.)
vfs_bio.c:
4) Handle write recursion more intelligently (but not perfectly) so
that it is less likely to cause a system panic, and is also
much more robust.
vfs_bio.c:
5) getblk incorrectly wrote out blocks that are incorrectly sized.
The problem is fixed, and writes blocks out ONLY when B_DELWRI
is true.
vfs_bio.c:
6) Check that already constituted buffers have fully valid pages. If
not, then make sure that the B_CACHE bit is not set. (This was
a major source of Sig-11 type problems.)
vfs_bio.c:
7) Fix a potential system deadlock due to an incorrectly specified
sleep priority while waiting for a buffer write operation. The
change that I made opens the system up to serious problems, and
we need to examine the issue of process sleep priorities.
vfs_cluster.c, vfs_bio.c:
8) Make clustered reads work more correctly (and more completely)
when buffers are already constituted, but not fully valid.
(This was another system reliability issue.)
vfs_subr.c, ffs_inode.c:
9) Create a vtruncbuf function, which is used by filesystems that
can truncate files. The vinvalbuf forced a file sync type operation,
while vtruncbuf only invalidates the buffers past the new end of file,
and also invalidates the appropriate pages. (This was a system reliability
and performance issue.)
10) Modify FFS to use vtruncbuf.
vm_object.c:
11) Make the object rundown mechanism for OBJT_VNODE type objects work
more correctly. Included in that fix, create pager entries for
the OBJT_DEAD pager type, so that paging requests that might slip
in during race conditions are properly handled. (This was a system
reliability issue.)
vm_page.c:
12) Make some of the page validation routines be a little less picky
about arguments passed to them. Also, support page invalidation
change the object generation count so that we handle generation
counts a little more robustly.
vm_pageout.c:
13) Further reduce pageout daemon activity when the system doesn't
need help from it. There should be no additional performance
decrease even when the pageout daemon is running. (This was
a significant performance issue.)
vnode_pager.c:
14) Teach the vnode pager to handle race conditions during vnode
deallocations.
1998-03-16 01:56:03 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2004-10-21 15:53:54 +00:00
|
|
|
TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
|
2004-10-21 14:13:54 +00:00
|
|
|
if (bp->b_lblkno < trunclbn)
|
|
|
|
continue;
|
|
|
|
if (BUF_LOCK(bp,
|
|
|
|
LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
|
2013-05-31 00:43:41 +00:00
|
|
|
BO_LOCKPTR(bo)) == ENOLCK)
|
2004-10-21 14:13:54 +00:00
|
|
|
goto restart;
|
|
|
|
bremfree(bp);
|
|
|
|
bp->b_flags |= (B_INVAL | B_RELBUF);
|
|
|
|
bp->b_flags &= ~B_ASYNC;
|
|
|
|
brelse(bp);
|
|
|
|
anyfreed = 1;
|
2011-01-25 14:04:02 +00:00
|
|
|
|
|
|
|
BO_LOCK(bo);
|
2004-10-21 14:13:54 +00:00
|
|
|
if (nbp != NULL &&
|
|
|
|
(((nbp->b_xflags & BX_VNDIRTY) == 0) ||
|
|
|
|
(nbp->b_vp != vp) ||
|
|
|
|
(nbp->b_flags & B_DELWRI) == 0)) {
|
2011-01-25 14:04:02 +00:00
|
|
|
BO_UNLOCK(bo);
|
2004-10-21 14:13:54 +00:00
|
|
|
goto restart;
|
Some VM improvements, including elimination of a lot of Sig-11
problems. Tor Egge and others have helped with various VM bugs
lately, but don't blame him -- blame me!!!
pmap.c:
1) Create an object for kernel page table allocations. This
fixes a bogus allocation method previously used for such, by
grabbing pages from the kernel object, using bogus pindexes.
(This was a code cleanup, and perhaps a minor system stability
issue.)
pmap.c:
2) Pre-set the modify and accessed bits when prudent. This will
decrease bus traffic under certain circumstances.
vfs_bio.c, vfs_cluster.c:
3) Rather than calculating the beginning virtual byte offset
multiple times, stick the offset into the buffer header, so
that the calculated offset can be reused. (Long long multiplies
are often expensive, and this is a probably unmeasurable performance
improvement, and code cleanup.)
vfs_bio.c:
4) Handle write recursion more intelligently (but not perfectly) so
that it is less likely to cause a system panic, and is also
much more robust.
vfs_bio.c:
5) getblk incorrectly wrote out blocks that are incorrectly sized.
The problem is fixed, and writes blocks out ONLY when B_DELWRI
is true.
vfs_bio.c:
6) Check that already constituted buffers have fully valid pages. If
not, then make sure that the B_CACHE bit is not set. (This was
a major source of Sig-11 type problems.)
vfs_bio.c:
7) Fix a potential system deadlock due to an incorrectly specified
sleep priority while waiting for a buffer write operation. The
change that I made opens the system up to serious problems, and
we need to examine the issue of process sleep priorities.
vfs_cluster.c, vfs_bio.c:
8) Make clustered reads work more correctly (and more completely)
when buffers are already constituted, but not fully valid.
(This was another system reliability issue.)
vfs_subr.c, ffs_inode.c:
9) Create a vtruncbuf function, which is used by filesystems that
can truncate files. The vinvalbuf forced a file sync type operation,
while vtruncbuf only invalidates the buffers past the new end of file,
and also invalidates the appropriate pages. (This was a system reliability
and performance issue.)
10) Modify FFS to use vtruncbuf.
vm_object.c:
11) Make the object rundown mechanism for OBJT_VNODE type objects work
more correctly. Included in that fix, create pager entries for
the OBJT_DEAD pager type, so that paging requests that might slip
in during race conditions are properly handled. (This was a system
reliability issue.)
vm_page.c:
12) Make some of the page validation routines be a little less picky
about arguments passed to them. Also, support page invalidation
change the object generation count so that we handle generation
counts a little more robustly.
vm_pageout.c:
13) Further reduce pageout daemon activity when the system doesn't
need help from it. There should be no additional performance
decrease even when the pageout daemon is running. (This was
a significant performance issue.)
vnode_pager.c:
14) Teach the vnode pager to handle race conditions during vnode
deallocations.
1998-03-16 01:56:03 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
1998-03-17 06:30:52 +00:00
|
|
|
|
In kern_physio.c fix tsleep priority messup.
In vfs_bio.c, remove b_generation count usage,
remove redundant reassignbuf,
remove redundant spl(s),
manage page PG_ZERO flags more correctly,
utilize an invalid value for b_offset until it
is properly initialized. Add asserts
for #ifdef DIAGNOSTIC, when b_offset is
improperly used.
when a process is not performing I/O, and just waiting
on a buffer generally, make the sleep priority
low.
only check page validity in getblk for B_VMIO buffers.
In vfs_cluster, add b_offset asserts, correct pointer calculation
for clustered reads. Improve readability of certain parts of
the code. Remove redundant spl(s).
In vfs_subr, correct usage of vfs_bio_awrite (From Andrew Gallatin
<gallatin@cs.duke.edu>). More vtruncbuf problems fixed.
1998-03-19 22:48:16 +00:00
|
|
|
if (length > 0) {
|
|
|
|
restartsync:
|
2004-10-21 15:53:54 +00:00
|
|
|
TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
|
2003-03-13 07:22:53 +00:00
|
|
|
if (bp->b_lblkno > 0)
|
|
|
|
continue;
|
|
|
|
/*
|
|
|
|
* Since we hold the vnode lock this should only
|
|
|
|
* fail if we're racing with the buf daemon.
|
|
|
|
*/
|
|
|
|
if (BUF_LOCK(bp,
|
|
|
|
LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
|
2013-05-31 00:43:41 +00:00
|
|
|
BO_LOCKPTR(bo)) == ENOLCK) {
|
2003-03-13 07:22:53 +00:00
|
|
|
goto restart;
|
1998-03-17 06:30:52 +00:00
|
|
|
}
|
2005-02-17 10:28:58 +00:00
|
|
|
VNASSERT((bp->b_flags & B_DELWRI), vp,
|
2004-01-05 19:04:29 +00:00
|
|
|
("buf(%p) on dirty queue without DELWRI", bp));
|
2003-03-13 07:22:53 +00:00
|
|
|
|
|
|
|
bremfree(bp);
|
|
|
|
bawrite(bp);
|
2008-03-22 09:15:16 +00:00
|
|
|
BO_LOCK(bo);
|
2003-03-13 07:22:53 +00:00
|
|
|
goto restartsync;
|
1998-03-17 06:30:52 +00:00
|
|
|
}
|
|
|
|
}
|
2004-01-05 19:04:29 +00:00
|
|
|
|
2004-10-21 15:53:54 +00:00
|
|
|
bufobj_wwait(bo, 0, 0);
|
2008-03-22 09:15:16 +00:00
|
|
|
BO_UNLOCK(bo);
|
Some VM improvements, including elimination of a lot of Sig-11
problems. Tor Egge and others have helped with various VM bugs
lately, but don't blame him -- blame me!!!
pmap.c:
1) Create an object for kernel page table allocations. This
fixes a bogus allocation method previously used for such, by
grabbing pages from the kernel object, using bogus pindexes.
(This was a code cleanup, and perhaps a minor system stability
issue.)
pmap.c:
2) Pre-set the modify and accessed bits when prudent. This will
decrease bus traffic under certain circumstances.
vfs_bio.c, vfs_cluster.c:
3) Rather than calculating the beginning virtual byte offset
multiple times, stick the offset into the buffer header, so
that the calculated offset can be reused. (Long long multiplies
are often expensive, and this is a probably unmeasurable performance
improvement, and code cleanup.)
vfs_bio.c:
4) Handle write recursion more intelligently (but not perfectly) so
that it is less likely to cause a system panic, and is also
much more robust.
vfs_bio.c:
5) getblk incorrectly wrote out blocks that are incorrectly sized.
The problem is fixed, and writes blocks out ONLY when B_DELWRI
is true.
vfs_bio.c:
6) Check that already constituted buffers have fully valid pages. If
not, then make sure that the B_CACHE bit is not set. (This was
a major source of Sig-11 type problems.)
vfs_bio.c:
7) Fix a potential system deadlock due to an incorrectly specified
sleep priority while waiting for a buffer write operation. The
change that I made opens the system up to serious problems, and
we need to examine the issue of process sleep priorities.
vfs_cluster.c, vfs_bio.c:
8) Make clustered reads work more correctly (and more completely)
when buffers are already constituted, but not fully valid.
(This was another system reliability issue.)
vfs_subr.c, ffs_inode.c:
9) Create a vtruncbuf function, which is used by filesystems that
can truncate files. The vinvalbuf forced a file sync type operation,
while vtruncbuf only invalidates the buffers past the new end of file,
and also invalidates the appropriate pages. (This was a system reliability
and performance issue.)
10) Modify FFS to use vtruncbuf.
vm_object.c:
11) Make the object rundown mechanism for OBJT_VNODE type objects work
more correctly. Included in that fix, create pager entries for
the OBJT_DEAD pager type, so that paging requests that might slip
in during race conditions are properly handled. (This was a system
reliability issue.)
vm_page.c:
12) Make some of the page validation routines be a little less picky
about arguments passed to them. Also, make page invalidation
change the object generation count so that we handle generation
counts a little more robustly.
vm_pageout.c:
13) Further reduce pageout daemon activity when the system doesn't
need help from it. There should be no additional performance
decrease even when the pageout daemon is running. (This was
a significant performance issue.)
vnode_pager.c:
14) Teach the vnode pager to handle race conditions during vnode
deallocations.
1998-03-16 01:56:03 +00:00
|
|
|
vnode_pager_setsize(vp, length);
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2004-10-22 08:47:20 +00:00
|
|
|
static void
|
2002-07-10 17:02:32 +00:00
|
|
|
buf_vlist_remove(struct buf *bp)
|
|
|
|
{
|
2004-10-21 13:48:50 +00:00
|
|
|
struct bufv *bv;
|
2002-07-10 17:02:32 +00:00
|
|
|
|
2004-10-22 08:47:20 +00:00
|
|
|
KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
|
2013-05-31 00:43:41 +00:00
|
|
|
ASSERT_BO_WLOCKED(bp->b_bufobj);
|
2005-06-18 18:17:03 +00:00
|
|
|
KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) !=
|
|
|
|
(BX_VNDIRTY|BX_VNCLEAN),
|
|
|
|
("buf_vlist_remove: Buf %p is on two lists", bp));
|
2007-04-10 15:29:37 +00:00
|
|
|
if (bp->b_xflags & BX_VNDIRTY)
|
2004-10-22 08:47:20 +00:00
|
|
|
bv = &bp->b_bufobj->bo_dirty;
|
2004-10-21 13:48:50 +00:00
|
|
|
else
|
2004-10-22 08:47:20 +00:00
|
|
|
bv = &bp->b_bufobj->bo_clean;
|
2013-05-12 04:05:01 +00:00
|
|
|
BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno);
|
2004-10-21 15:53:54 +00:00
|
|
|
TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs);
|
2004-10-21 13:48:50 +00:00
|
|
|
bv->bv_cnt--;
|
2002-07-10 17:02:32 +00:00
|
|
|
bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2013-05-12 04:05:01 +00:00
|
|
|
* Add the buffer to the sorted clean or dirty block list.
|
2002-07-10 17:02:32 +00:00
|
|
|
*
|
|
|
|
* NOTE: xflags is passed as a constant, optimizing this inline function!
|
|
|
|
*/
|
2004-10-22 08:47:20 +00:00
|
|
|
static void
|
|
|
|
buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
|
2002-07-10 17:02:32 +00:00
|
|
|
{
|
2004-10-22 08:47:20 +00:00
|
|
|
struct bufv *bv;
|
2013-05-12 04:05:01 +00:00
|
|
|
struct buf *n;
|
|
|
|
int error;
|
2002-07-10 17:02:32 +00:00
|
|
|
|
2013-05-31 00:43:41 +00:00
|
|
|
ASSERT_BO_WLOCKED(bo);
|
2015-07-11 11:21:56 +00:00
|
|
|
KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0,
|
|
|
|
("dead bo %p", bo));
|
2005-06-18 18:17:03 +00:00
|
|
|
KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
|
|
|
|
("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags));
|
2002-07-10 17:02:32 +00:00
|
|
|
bp->b_xflags |= xflags;
|
2004-10-22 08:47:20 +00:00
|
|
|
if (xflags & BX_VNDIRTY)
|
|
|
|
bv = &bo->bo_dirty;
|
|
|
|
else
|
|
|
|
bv = &bo->bo_clean;
|
|
|
|
|
2013-05-12 04:05:01 +00:00
|
|
|
/*
|
|
|
|
* Keep the list ordered. Optimize empty list insertion. Assume
|
|
|
|
* we tend to grow at the tail so lookup_le should usually be cheaper
|
|
|
|
* than _ge.
|
|
|
|
*/
|
|
|
|
if (bv->bv_cnt == 0 ||
|
|
|
|
bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno)
|
2004-10-22 08:47:20 +00:00
|
|
|
TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs);
|
2013-05-12 04:05:01 +00:00
|
|
|
else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL)
|
|
|
|
TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs);
|
|
|
|
else
|
|
|
|
TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs);
|
|
|
|
error = BUF_PCTRIE_INSERT(&bv->bv_root, bp);
|
|
|
|
if (error)
|
|
|
|
panic("buf_vlist_add: Preallocated nodes insufficient.");
|
2004-10-22 08:47:20 +00:00
|
|
|
bv->bv_cnt++;
|
2002-07-10 17:02:32 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2015-07-05 22:44:41 +00:00
|
|
|
* Look up a buffer using the buffer tries.
|
2002-07-10 17:02:32 +00:00
|
|
|
*/
|
|
|
|
struct buf *
|
2004-10-22 08:47:20 +00:00
|
|
|
gbincore(struct bufobj *bo, daddr_t lblkno)
|
2002-07-10 17:02:32 +00:00
|
|
|
{
|
|
|
|
struct buf *bp;
|
|
|
|
|
2004-10-22 08:47:20 +00:00
|
|
|
ASSERT_BO_LOCKED(bo);
|
2013-05-12 04:05:01 +00:00
|
|
|
bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno);
|
|
|
|
if (bp != NULL)
|
2003-05-13 04:36:02 +00:00
|
|
|
return (bp);
|
2013-05-12 04:05:01 +00:00
|
|
|
return BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno);
|
2002-07-10 17:02:32 +00:00
|
|
|
}
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Associate a buffer with a vnode.
|
|
|
|
*/
|
1994-05-25 09:21:21 +00:00
|
|
|
void
|
2004-10-22 08:47:20 +00:00
|
|
|
bgetvp(struct vnode *vp, struct buf *bp)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2008-03-22 09:15:16 +00:00
|
|
|
struct bufobj *bo;
|
2005-02-17 10:28:58 +00:00
|
|
|
|
2008-03-22 09:15:16 +00:00
|
|
|
bo = &vp->v_bufobj;
|
2013-05-31 00:43:41 +00:00
|
|
|
ASSERT_BO_WLOCKED(bo);
|
2005-02-17 10:28:58 +00:00
|
|
|
VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free"));
|
1999-01-10 01:58:29 +00:00
|
|
|
|
2005-01-24 10:41:01 +00:00
|
|
|
CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags);
|
2005-02-17 10:28:58 +00:00
|
|
|
VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp,
|
2002-07-10 17:02:32 +00:00
|
|
|
("bgetvp: bp already attached! %p", bp));
|
|
|
|
|
2008-03-22 09:15:16 +00:00
|
|
|
vhold(vp);
|
1994-05-24 10:09:53 +00:00
|
|
|
bp->b_vp = vp;
|
2008-03-22 09:15:16 +00:00
|
|
|
bp->b_bufobj = bo;
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Insert onto list for new vnode.
|
|
|
|
*/
|
2008-03-22 09:15:16 +00:00
|
|
|
buf_vlist_add(bp, bo, BX_VNCLEAN);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Disassociate a buffer from a vnode.
|
|
|
|
*/
|
2008-03-28 12:30:12 +00:00
|
|
|
void
|
2004-10-22 08:47:20 +00:00
|
|
|
brelvp(struct buf *bp)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2004-10-22 08:47:20 +00:00
|
|
|
struct bufobj *bo;
|
1994-05-24 10:09:53 +00:00
|
|
|
struct vnode *vp;
|
|
|
|
|
2005-01-24 10:41:01 +00:00
|
|
|
CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
|
1999-01-08 17:31:30 +00:00
|
|
|
KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
|
1997-12-29 00:25:11 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Delete from old vnode list, if on one.
|
|
|
|
*/
|
2004-10-27 08:05:02 +00:00
|
|
|
vp = bp->b_vp; /* XXX */
|
2004-10-22 08:47:20 +00:00
|
|
|
bo = bp->b_bufobj;
|
2004-10-27 08:05:02 +00:00
|
|
|
BO_LOCK(bo);
|
2002-07-10 17:02:32 +00:00
|
|
|
if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
|
|
|
|
buf_vlist_remove(bp);
|
2005-05-01 12:00:36 +00:00
|
|
|
else
|
|
|
|
panic("brelvp: Buffer %p not on queue.", bp);
|
2004-10-27 08:05:02 +00:00
|
|
|
if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
|
|
|
|
bo->bo_flag &= ~BO_ONWORKLST;
|
2002-09-25 02:22:21 +00:00
|
|
|
mtx_lock(&sync_mtx);
|
2004-10-27 08:05:02 +00:00
|
|
|
LIST_REMOVE(bo, bo_synclist);
|
2007-04-10 15:29:37 +00:00
|
|
|
syncer_worklist_len--;
|
2002-09-25 02:22:21 +00:00
|
|
|
mtx_unlock(&sync_mtx);
|
1998-03-08 09:59:44 +00:00
|
|
|
}
|
2004-10-22 08:47:20 +00:00
|
|
|
bp->b_vp = NULL;
|
|
|
|
bp->b_bufobj = NULL;
|
2008-03-22 09:15:16 +00:00
|
|
|
BO_UNLOCK(bo);
|
|
|
|
vdrop(vp);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
1998-03-08 09:59:44 +00:00
|
|
|
/*
|
|
|
|
* Add an item to the syncer work queue.
|
|
|
|
*/
|
1999-02-19 17:36:58 +00:00
|
|
|
static void
|
2004-10-27 08:05:02 +00:00
|
|
|
vn_syncer_add_to_worklist(struct bufobj *bo, int delay)
|
1998-03-08 09:59:44 +00:00
|
|
|
{
|
2012-10-22 17:50:54 +00:00
|
|
|
int slot;
|
1998-03-08 09:59:44 +00:00
|
|
|
|
2013-05-31 00:43:41 +00:00
|
|
|
ASSERT_BO_WLOCKED(bo);
|
1998-03-08 09:59:44 +00:00
|
|
|
|
2002-09-25 02:22:21 +00:00
|
|
|
mtx_lock(&sync_mtx);
|
2004-10-27 08:05:02 +00:00
|
|
|
if (bo->bo_flag & BO_ONWORKLST)
|
|
|
|
LIST_REMOVE(bo, bo_synclist);
|
2004-07-01 23:59:19 +00:00
|
|
|
else {
|
2004-10-27 08:05:02 +00:00
|
|
|
bo->bo_flag |= BO_ONWORKLST;
|
2007-04-10 15:29:37 +00:00
|
|
|
syncer_worklist_len++;
|
2004-07-01 23:59:19 +00:00
|
|
|
}
|
1998-03-08 09:59:44 +00:00
|
|
|
|
|
|
|
if (delay > syncer_maxdelay - 2)
|
|
|
|
delay = syncer_maxdelay - 2;
|
|
|
|
slot = (syncer_delayno + delay) & syncer_mask;
|
|
|
|
|
2012-10-22 17:50:54 +00:00
|
|
|
LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist);
|
2002-09-25 02:22:21 +00:00
|
|
|
mtx_unlock(&sync_mtx);
|
1998-03-08 09:59:44 +00:00
|
|
|
}
|
|
|
|
|
2004-07-05 01:07:33 +00:00
|
|
|
static int
|
|
|
|
sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
|
|
|
|
{
|
|
|
|
int error, len;
|
|
|
|
|
|
|
|
mtx_lock(&sync_mtx);
|
|
|
|
len = syncer_worklist_len - sync_vnode_count;
|
|
|
|
mtx_unlock(&sync_mtx);
|
|
|
|
error = SYSCTL_OUT(req, &len, sizeof(len));
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
|
|
|
SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
|
|
|
|
sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");
|
|
|
|
|
2005-09-30 01:30:01 +00:00
|
|
|
static struct proc *updateproc;
|
2002-03-19 21:25:46 +00:00
|
|
|
static void sched_sync(void);
|
1999-07-01 13:21:46 +00:00
|
|
|
static struct kproc_desc up_kp = {
|
1998-03-08 09:59:44 +00:00
|
|
|
"syncer",
|
|
|
|
sched_sync,
|
|
|
|
&updateproc
|
|
|
|
};
|
2008-03-16 10:58:09 +00:00
|
|
|
SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp);
|
1998-03-08 09:59:44 +00:00
|
|
|
|
2004-11-14 15:24:38 +00:00
|
|
|
static int
|
2007-12-05 09:34:04 +00:00
|
|
|
sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td)
|
2004-11-14 15:24:38 +00:00
|
|
|
{
|
|
|
|
struct vnode *vp;
|
|
|
|
struct mount *mp;
|
|
|
|
|
2007-12-05 09:34:04 +00:00
|
|
|
*bo = LIST_FIRST(slp);
|
2008-05-04 13:54:55 +00:00
|
|
|
if (*bo == NULL)
|
2007-12-05 09:34:04 +00:00
|
|
|
return (0);
|
2016-09-30 17:11:03 +00:00
|
|
|
vp = bo2vnode(*bo);
|
2008-05-04 13:54:55 +00:00
|
|
|
if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0)
|
2004-11-14 15:24:38 +00:00
|
|
|
return (1);
|
|
|
|
/*
|
|
|
|
* We use vhold in case the vnode does not
|
|
|
|
* successfully sync. vhold prevents the vnode from
|
|
|
|
* going away when we unlock the sync_mtx so that
|
|
|
|
* we can acquire the vnode interlock.
|
|
|
|
*/
|
|
|
|
vholdl(vp);
|
|
|
|
mtx_unlock(&sync_mtx);
|
2005-01-24 10:41:01 +00:00
|
|
|
VI_UNLOCK(vp);
|
|
|
|
if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
|
|
|
|
vdrop(vp);
|
|
|
|
mtx_lock(&sync_mtx);
|
2008-03-23 01:44:28 +00:00
|
|
|
return (*bo == LIST_FIRST(slp));
|
2005-01-24 10:41:01 +00:00
|
|
|
}
|
2008-01-10 01:10:58 +00:00
|
|
|
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
|
2005-01-11 07:36:22 +00:00
|
|
|
(void) VOP_FSYNC(vp, MNT_LAZY, td);
|
2008-01-13 14:44:15 +00:00
|
|
|
VOP_UNLOCK(vp, 0);
|
2004-11-14 15:24:38 +00:00
|
|
|
vn_finished_write(mp);
|
2008-03-22 09:15:16 +00:00
|
|
|
BO_LOCK(*bo);
|
2007-12-05 09:34:04 +00:00
|
|
|
if (((*bo)->bo_flag & BO_ONWORKLST) != 0) {
|
2004-11-14 15:24:38 +00:00
|
|
|
/*
|
|
|
|
* Put us back on the worklist. The worklist
|
|
|
|
* routine will remove us from our current
|
|
|
|
* position and then add us back in at a later
|
|
|
|
* position.
|
|
|
|
*/
|
2007-12-05 09:34:04 +00:00
|
|
|
vn_syncer_add_to_worklist(*bo, syncdelay);
|
2004-11-14 15:24:38 +00:00
|
|
|
}
|
2008-03-22 09:15:16 +00:00
|
|
|
BO_UNLOCK(*bo);
|
|
|
|
vdrop(vp);
|
2004-11-14 15:24:38 +00:00
|
|
|
mtx_lock(&sync_mtx);
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2014-12-08 16:48:57 +00:00
|
|
|
static int first_printf = 1;
|
|
|
|
|
1998-03-08 09:59:44 +00:00
|
|
|
/*
|
|
|
|
* System filesystem synchronizer daemon.
|
|
|
|
*/
|
2002-09-28 17:15:38 +00:00
|
|
|
static void
|
1998-03-08 09:59:44 +00:00
|
|
|
sched_sync(void)
|
|
|
|
{
|
2012-10-22 17:50:54 +00:00
|
|
|
struct synclist *next, *slp;
|
2004-10-27 08:05:02 +00:00
|
|
|
struct bufobj *bo;
|
1998-03-08 09:59:44 +00:00
|
|
|
long starttime;
|
2007-12-05 09:34:04 +00:00
|
|
|
struct thread *td = curthread;
|
2004-07-05 01:07:33 +00:00
|
|
|
int last_work_seen;
|
|
|
|
int net_worklist_len;
|
|
|
|
int syncer_final_iter;
|
2004-11-14 15:24:38 +00:00
|
|
|
int error;
|
1998-03-08 09:59:44 +00:00
|
|
|
|
2004-07-05 21:32:01 +00:00
|
|
|
last_work_seen = 0;
|
2004-07-05 01:07:33 +00:00
|
|
|
syncer_final_iter = 0;
|
|
|
|
syncer_state = SYNCER_RUNNING;
|
2005-09-12 15:31:28 +00:00
|
|
|
starttime = time_uptime;
|
2005-09-30 01:30:01 +00:00
|
|
|
td->td_pflags |= TDP_NORUNNINGBUF;
|
2000-09-07 01:33:02 +00:00
|
|
|
|
2004-07-01 23:59:19 +00:00
|
|
|
EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
|
2002-06-06 15:46:38 +00:00
|
|
|
SHUTDOWN_PRI_LAST);
|
2000-01-07 08:36:44 +00:00
|
|
|
|
2006-11-07 19:45:05 +00:00
|
|
|
mtx_lock(&sync_mtx);
|
1998-03-08 09:59:44 +00:00
|
|
|
for (;;) {
|
2004-07-05 01:07:33 +00:00
|
|
|
if (syncer_state == SYNCER_FINAL_DELAY &&
|
|
|
|
syncer_final_iter == 0) {
|
2004-07-01 23:59:19 +00:00
|
|
|
mtx_unlock(&sync_mtx);
|
2007-10-20 23:23:23 +00:00
|
|
|
kproc_suspend_check(td->td_proc);
|
2004-07-01 23:59:19 +00:00
|
|
|
mtx_lock(&sync_mtx);
|
|
|
|
}
|
2004-07-05 01:07:33 +00:00
|
|
|
net_worklist_len = syncer_worklist_len - sync_vnode_count;
|
2004-07-15 04:29:48 +00:00
|
|
|
if (syncer_state != SYNCER_RUNNING &&
|
2005-09-12 15:31:28 +00:00
|
|
|
starttime != time_uptime) {
|
2004-07-15 04:29:48 +00:00
|
|
|
if (first_printf) {
|
2016-05-31 15:27:33 +00:00
|
|
|
printf("\nSyncing disks, vnodes remaining... ");
|
2004-07-15 04:29:48 +00:00
|
|
|
first_printf = 0;
|
|
|
|
}
|
2004-07-05 01:07:33 +00:00
|
|
|
printf("%d ", net_worklist_len);
|
2004-07-15 04:29:48 +00:00
|
|
|
}
|
2005-09-12 15:31:28 +00:00
|
|
|
starttime = time_uptime;
|
1998-03-08 09:59:44 +00:00
|
|
|
|
|
|
|
/*
|
1999-02-19 17:36:58 +00:00
|
|
|
* Push files whose dirty time has expired. Be careful
|
|
|
|
* of interrupt race on slp queue.
|
2004-07-05 01:07:33 +00:00
|
|
|
*
|
|
|
|
* Skip over empty worklist slots when shutting down.
|
1998-03-08 09:59:44 +00:00
|
|
|
*/
|
2004-07-05 01:07:33 +00:00
|
|
|
do {
|
2012-10-22 17:50:54 +00:00
|
|
|
slp = &syncer_workitem_pending[syncer_delayno];
|
2004-07-05 01:07:33 +00:00
|
|
|
syncer_delayno += 1;
|
|
|
|
if (syncer_delayno == syncer_maxdelay)
|
|
|
|
syncer_delayno = 0;
|
2012-10-22 17:50:54 +00:00
|
|
|
next = &syncer_workitem_pending[syncer_delayno];
|
2004-07-05 01:07:33 +00:00
|
|
|
/*
|
|
|
|
* If the worklist has wrapped since the
|
2007-04-10 15:29:37 +00:00
|
|
|
* it was emptied of all but syncer vnodes,
|
2004-07-05 01:07:33 +00:00
|
|
|
* switch to the FINAL_DELAY state and run
|
|
|
|
* for one more second.
|
|
|
|
*/
|
|
|
|
if (syncer_state == SYNCER_SHUTTING_DOWN &&
|
|
|
|
net_worklist_len == 0 &&
|
|
|
|
last_work_seen == syncer_delayno) {
|
|
|
|
syncer_state = SYNCER_FINAL_DELAY;
|
|
|
|
syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
|
|
|
|
}
|
|
|
|
} while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
|
2012-10-22 17:50:54 +00:00
|
|
|
syncer_worklist_len > 0);
|
1998-03-08 09:59:44 +00:00
|
|
|
|
2004-07-05 01:07:33 +00:00
|
|
|
/*
|
|
|
|
* Keep track of the last time there was anything
|
|
|
|
* on the worklist other than syncer vnodes.
|
|
|
|
* Return to the SHUTTING_DOWN state if any
|
|
|
|
* new work appears.
|
|
|
|
*/
|
2004-07-05 21:32:01 +00:00
|
|
|
if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
|
2004-07-05 01:07:33 +00:00
|
|
|
last_work_seen = syncer_delayno;
|
2004-07-05 21:32:01 +00:00
|
|
|
if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
|
|
|
|
syncer_state = SYNCER_SHUTTING_DOWN;
|
2007-12-05 09:34:04 +00:00
|
|
|
while (!LIST_EMPTY(slp)) {
|
|
|
|
error = sync_vnode(slp, &bo, td);
|
2004-11-14 15:24:38 +00:00
|
|
|
if (error == 1) {
|
2004-10-27 08:05:02 +00:00
|
|
|
LIST_REMOVE(bo, bo_synclist);
|
|
|
|
LIST_INSERT_HEAD(next, bo, bo_synclist);
|
2003-10-04 18:03:53 +00:00
|
|
|
continue;
|
|
|
|
}
|
2012-06-03 08:01:12 +00:00
|
|
|
|
2014-10-01 15:32:28 +00:00
|
|
|
if (first_printf == 0) {
|
|
|
|
/*
|
|
|
|
* Drop the sync mutex, because some watchdog
|
|
|
|
* drivers need to sleep while patting
|
|
|
|
*/
|
|
|
|
mtx_unlock(&sync_mtx);
|
2011-04-28 16:02:05 +00:00
|
|
|
wdog_kern_pat(WD_LASTVAL);
|
2014-10-01 15:32:28 +00:00
|
|
|
mtx_lock(&sync_mtx);
|
|
|
|
}
|
2012-06-03 08:01:12 +00:00
|
|
|
|
1998-03-08 09:59:44 +00:00
|
|
|
}
|
2004-07-05 01:07:33 +00:00
|
|
|
if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
|
|
|
|
syncer_final_iter--;
|
1998-03-08 09:59:44 +00:00
|
|
|
/*
|
|
|
|
* The variable rushjob allows the kernel to speed up the
|
|
|
|
* processing of the filesystem syncer process. A rushjob
|
|
|
|
* value of N tells the filesystem syncer to process the next
|
|
|
|
* N seconds worth of work on its queue ASAP. Currently rushjob
|
|
|
|
* is used by the soft update code to speed up the filesystem
|
|
|
|
* syncer process when the incore state is getting so far
|
|
|
|
* ahead of the disk that the kernel memory pool is being
|
|
|
|
* threatened with exhaustion.
|
|
|
|
*/
|
|
|
|
if (rushjob > 0) {
|
|
|
|
rushjob -= 1;
|
|
|
|
continue;
|
2004-07-05 01:07:33 +00:00
|
|
|
}
|
1998-03-08 09:59:44 +00:00
|
|
|
/*
|
2006-11-07 19:07:33 +00:00
|
|
|
* Just sleep for a short period of time between
|
2004-07-05 01:07:33 +00:00
|
|
|
* iterations when shutting down to allow some I/O
|
|
|
|
* to happen.
|
|
|
|
*
|
1998-03-08 09:59:44 +00:00
|
|
|
* If it has taken us less than a second to process the
|
|
|
|
* current work, then wait. Otherwise start right over
|
|
|
|
* again. We can still lose time if any single round
|
|
|
|
* takes more than two seconds, but it does not really
|
|
|
|
* matter as we are just trying to generally pace the
|
|
|
|
* filesystem activity.
|
|
|
|
*/
|
2011-01-06 22:17:07 +00:00
|
|
|
if (syncer_state != SYNCER_RUNNING ||
|
|
|
|
time_uptime == starttime) {
|
|
|
|
thread_lock(td);
|
|
|
|
sched_prio(td, PPAUSE);
|
|
|
|
thread_unlock(td);
|
|
|
|
}
|
2004-07-05 01:07:33 +00:00
|
|
|
if (syncer_state != SYNCER_RUNNING)
|
2008-07-30 12:39:18 +00:00
|
|
|
cv_timedwait(&sync_wakeup, &sync_mtx,
|
2004-07-05 01:07:33 +00:00
|
|
|
hz / SYNCER_SHUTDOWN_SPEEDUP);
|
2005-09-12 15:31:28 +00:00
|
|
|
else if (time_uptime == starttime)
|
2008-07-30 12:39:18 +00:00
|
|
|
cv_timedwait(&sync_wakeup, &sync_mtx, hz);
|
1998-03-08 09:59:44 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
1999-06-15 23:37:29 +00:00
|
|
|
/*
|
|
|
|
* Request the syncer daemon to speed up its work.
|
|
|
|
* We never push it to speed up more than half of its
|
|
|
|
* normal turn time, otherwise it could take over the cpu.
|
|
|
|
*/
|
|
|
|
int
|
2006-11-07 19:07:33 +00:00
|
|
|
speedup_syncer(void)
|
1999-06-15 23:37:29 +00:00
|
|
|
{
|
2002-09-25 02:22:21 +00:00
|
|
|
int ret = 0;
|
1999-06-15 23:37:29 +00:00
|
|
|
|
2002-09-25 02:22:21 +00:00
|
|
|
mtx_lock(&sync_mtx);
|
1999-06-15 23:37:29 +00:00
|
|
|
if (rushjob < syncdelay / 2) {
|
|
|
|
rushjob += 1;
|
|
|
|
stat_rush_requests += 1;
|
2002-09-25 02:22:21 +00:00
|
|
|
ret = 1;
|
1999-06-15 23:37:29 +00:00
|
|
|
}
|
2002-09-25 02:22:21 +00:00
|
|
|
mtx_unlock(&sync_mtx);
|
2008-07-30 12:39:18 +00:00
|
|
|
cv_broadcast(&sync_wakeup);
|
2002-09-25 02:22:21 +00:00
|
|
|
return (ret);
|
1999-06-15 23:37:29 +00:00
|
|
|
}
|
|
|
|
|
2004-07-01 23:59:19 +00:00
|
|
|
/*
|
|
|
|
* Tell the syncer to speed up its work and run though its work
|
|
|
|
* list several times, then tell it to shut down.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
syncer_shutdown(void *arg, int howto)
|
|
|
|
{
|
|
|
|
|
2004-08-20 19:21:47 +00:00
|
|
|
if (howto & RB_NOSYNC)
|
|
|
|
return;
|
2004-07-01 23:59:19 +00:00
|
|
|
mtx_lock(&sync_mtx);
|
2004-07-05 01:07:33 +00:00
|
|
|
syncer_state = SYNCER_SHUTTING_DOWN;
|
|
|
|
rushjob = 0;
|
2004-07-01 23:59:19 +00:00
|
|
|
mtx_unlock(&sync_mtx);
|
2008-07-30 12:39:18 +00:00
|
|
|
cv_broadcast(&sync_wakeup);
|
2004-07-01 23:59:19 +00:00
|
|
|
kproc_shutdown(arg, howto);
|
|
|
|
}
|
|
|
|
|
2014-12-08 16:48:57 +00:00
|
|
|
void
|
|
|
|
syncer_suspend(void)
|
|
|
|
{
|
|
|
|
|
|
|
|
syncer_shutdown(updateproc, 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
syncer_resume(void)
|
|
|
|
{
|
|
|
|
|
|
|
|
mtx_lock(&sync_mtx);
|
|
|
|
first_printf = 1;
|
|
|
|
syncer_state = SYNCER_RUNNING;
|
|
|
|
mtx_unlock(&sync_mtx);
|
|
|
|
cv_broadcast(&sync_wakeup);
|
|
|
|
kproc_resume(updateproc);
|
|
|
|
}
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Reassign a buffer from one vnode to another.
|
|
|
|
* Used to assign file specific control information
|
|
|
|
* (indirect blocks) to the vnode to which they belong.
|
|
|
|
*/
|
1994-05-25 09:21:21 +00:00
|
|
|
void
|
2004-07-25 21:24:23 +00:00
|
|
|
reassignbuf(struct buf *bp)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2003-09-20 00:21:48 +00:00
|
|
|
struct vnode *vp;
|
2004-10-22 08:47:20 +00:00
|
|
|
struct bufobj *bo;
|
1998-03-08 09:59:44 +00:00
|
|
|
int delay;
|
2005-06-14 20:31:53 +00:00
|
|
|
#ifdef INVARIANTS
|
|
|
|
struct bufv *bv;
|
|
|
|
#endif
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2003-09-20 00:21:48 +00:00
|
|
|
vp = bp->b_vp;
|
2004-10-22 08:47:20 +00:00
|
|
|
bo = bp->b_bufobj;
|
The buffer queue mechanism has been reformulated. Instead of having
QUEUE_AGE, QUEUE_LRU, and QUEUE_EMPTY we instead have QUEUE_CLEAN,
QUEUE_DIRTY, QUEUE_EMPTY, and QUEUE_EMPTYKVA. With this patch clean
and dirty buffers have been separated. Empty buffers with KVM
assignments have been separated from truely empty buffers. getnewbuf()
has been rewritten and now operates in a 100% optimal fashion. That is,
it is able to find precisely the right kind of buffer it needs to
allocate a new buffer, defragment KVM, or to free-up an existing buffer
when the buffer cache is full (which is a steady-state situation for
the buffer cache).
Buffer flushing has been reorganized. Previously buffers were flushed
in the context of whatever process hit the conditions forcing buffer
flushing to occur. This resulted in processes blocking on conditions
unrelated to what they were doing. This also resulted in inappropriate
VFS stacking chains due to multiple processes getting stuck trying to
flush dirty buffers or due to a single process getting into a situation
where it might attempt to flush buffers recursively - a situation that
was only partially fixed in prior commits. We have added a new daemon
called the buf_daemon which is responsible for flushing dirty buffers
when the number of dirty buffers exceeds the vfs.hidirtybuffers limit.
This daemon attempts to dynamically adjust the rate at which dirty buffers
are flushed such that getnewbuf() calls (almost) never block.
The number of nbufs and amount of buffer space is now scaled past the
8MB limit that was previously imposed for systems with over 64MB of
memory, and the vfs.{lo,hi}dirtybuffers limits have been relaxed
somewhat. The number of physical buffers has been increased with the
intention that we will manage physical I/O differently in the future.
reassignbuf previously attempted to keep the dirtyblkhd list sorted which
could result in non-deterministic operation under certain conditions,
such as when a large number of dirty buffers are being managed. This
algorithm has been changed. reassignbuf now keeps buffers locally sorted
if it can do so cheaply, and otherwise gives up and adds buffers to
the head of the dirtyblkhd list. The new algorithm is deterministic but
not perfect. The new algorithm greatly reduces problems that previously
occured when write_behind was turned off in the system.
The P_FLSINPROG proc->p_flag bit has been replaced by the more descriptive
P_BUFEXHAUST bit. This bit allows processes working with filesystem
buffers to use available emergency reserves. Normal processes do not set
this bit and are not allowed to dig into emergency reserves. The purpose
of this bit is to avoid low-memory deadlocks.
A small race condition was fixed in getpbuf() in vm/vm_pager.c.
Submitted by: Matthew Dillon <dillon@apollo.backplane.com>
Reviewed by: Kirk McKusick <mckusick@mckusick.com>
1999-07-04 00:25:38 +00:00
|
|
|
++reassignbufcalls;
|
1996-08-15 06:45:01 +00:00
|
|
|
|
2005-01-24 10:41:01 +00:00
|
|
|
CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X",
|
|
|
|
bp, bp->b_vp, bp->b_flags);
|
1999-01-21 08:29:12 +00:00
|
|
|
/*
|
|
|
|
* B_PAGING flagged buffers cannot be reassigned because their vp
|
|
|
|
* is not fully linked in.
|
|
|
|
*/
|
|
|
|
if (bp->b_flags & B_PAGING)
|
|
|
|
panic("cannot reassign paging buffer");
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Delete from old vnode list, if on one.
|
|
|
|
*/
|
2008-03-22 09:15:16 +00:00
|
|
|
BO_LOCK(bo);
|
2004-07-25 21:24:23 +00:00
|
|
|
if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
|
2002-07-10 17:02:32 +00:00
|
|
|
buf_vlist_remove(bp);
|
2005-05-01 12:00:36 +00:00
|
|
|
else
|
|
|
|
panic("reassignbuf: Buffer %p not on queue.", bp);
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
These changes embody the support of the fully coherent merged VM buffer cache,
much higher filesystem I/O performance, and much better paging performance. It
represents the culmination of over 6 months of R&D.
The majority of the merged VM/cache work is by John Dyson.
The following highlights the most significant changes. Additionally, there are
(mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to
support the new VM/buffer scheme.
vfs_bio.c:
Significant rewrite of most of vfs_bio to support the merged VM buffer cache
scheme. The scheme is almost fully compatible with the old filesystem
interface. Significant improvement in the number of opportunities for write
clustering.
vfs_cluster.c, vfs_subr.c
Upgrade and performance enhancements in vfs layer code to support merged
VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff.
vm_object.c:
Yet more improvements in the collapse code. Elimination of some windows that
can cause list corruption.
vm_pageout.c:
Fixed it, it really works better now. Somehow in 2.0, some "enhancements"
broke the code. This code has been reworked from the ground-up.
vm_fault.c, vm_page.c, pmap.c, vm_object.c
Support for small-block filesystems with merged VM/buffer cache scheme.
pmap.c vm_map.c
Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of
kernel PTs.
vm_glue.c
Much simpler and more effective swapping code. No more gratuitous swapping.
proc.h
Fixed the problem that the p_lock flag was not being cleared on a fork.
swap_pager.c, vnode_pager.c
Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the
code doesn't need it anymore.
machdep.c
Changes to better support the parameter values for the merged VM/buffer cache
scheme.
machdep.c, kern_exec.c, vm_glue.c
Implemented a seperate submap for temporary exec string space and another one
to contain process upages. This eliminates all map fragmentation problems
that previously existed.
ffs_inode.c, ufs_inode.c, ufs_readwrite.c
Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on
busy buffers.
Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
|
|
|
* If dirty, put on list of dirty buffers; otherwise insert onto list
|
|
|
|
* of clean buffers.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
These changes embody the support of the fully coherent merged VM buffer cache,
much higher filesystem I/O performance, and much better paging performance. It
represents the culmination of over 6 months of R&D.
The majority of the merged VM/cache work is by John Dyson.
The following highlights the most significant changes. Additionally, there are
(mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to
support the new VM/buffer scheme.
vfs_bio.c:
Significant rewrite of most of vfs_bio to support the merged VM buffer cache
scheme. The scheme is almost fully compatible with the old filesystem
interface. Significant improvement in the number of opportunities for write
clustering.
vfs_cluster.c, vfs_subr.c
Upgrade and performance enhancements in vfs layer code to support merged
VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff.
vm_object.c:
Yet more improvements in the collapse code. Elimination of some windows that
can cause list corruption.
vm_pageout.c:
Fixed it, it really works better now. Somehow in 2.0, some "enhancements"
broke the code. This code has been reworked from the ground-up.
vm_fault.c, vm_page.c, pmap.c, vm_object.c
Support for small-block filesystems with merged VM/buffer cache scheme.
pmap.c vm_map.c
Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of
kernel PTs.
vm_glue.c
Much simpler and more effective swapping code. No more gratuitous swapping.
proc.h
Fixed the problem that the p_lock flag was not being cleared on a fork.
swap_pager.c, vnode_pager.c
Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the
code doesn't need it anymore.
machdep.c
Changes to better support the parameter values for the merged VM/buffer cache
scheme.
machdep.c, kern_exec.c, vm_glue.c
Implemented a seperate submap for temporary exec string space and another one
to contain process upages. This eliminates all map fragmentation problems
that previously existed.
ffs_inode.c, ufs_inode.c, ufs_readwrite.c
Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on
busy buffers.
Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
|
|
|
if (bp->b_flags & B_DELWRI) {
|
2004-10-27 08:05:02 +00:00
|
|
|
if ((bo->bo_flag & BO_ONWORKLST) == 0) {
|
2004-07-25 21:24:23 +00:00
|
|
|
switch (vp->v_type) {
|
1998-03-08 09:59:44 +00:00
|
|
|
case VDIR:
|
1999-06-15 23:37:29 +00:00
|
|
|
delay = dirdelay;
|
1998-03-08 09:59:44 +00:00
|
|
|
break;
|
1999-11-22 10:33:55 +00:00
|
|
|
case VCHR:
|
2004-06-14 14:25:03 +00:00
|
|
|
delay = metadelay;
|
|
|
|
break;
|
1998-03-08 09:59:44 +00:00
|
|
|
default:
|
1999-06-15 23:37:29 +00:00
|
|
|
delay = filedelay;
|
1998-03-08 09:59:44 +00:00
|
|
|
}
|
2004-10-27 08:05:02 +00:00
|
|
|
vn_syncer_add_to_worklist(bo, delay);
|
1998-03-08 09:59:44 +00:00
|
|
|
}
|
2004-10-22 08:47:20 +00:00
|
|
|
buf_vlist_add(bp, bo, BX_VNDIRTY);
|
These changes embody the support of the fully coherent merged VM buffer cache,
much higher filesystem I/O performance, and much better paging performance. It
represents the culmination of over 6 months of R&D.
The majority of the merged VM/cache work is by John Dyson.
The following highlights the most significant changes. Additionally, there are
(mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to
support the new VM/buffer scheme.
vfs_bio.c:
Significant rewrite of most of vfs_bio to support the merged VM buffer cache
scheme. The scheme is almost fully compatible with the old filesystem
interface. Significant improvement in the number of opportunities for write
clustering.
vfs_cluster.c, vfs_subr.c
Upgrade and performance enhancements in vfs layer code to support merged
VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff.
vm_object.c:
Yet more improvements in the collapse code. Elimination of some windows that
can cause list corruption.
vm_pageout.c:
Fixed it, it really works better now. Somehow in 2.0, some "enhancements"
broke the code. This code has been reworked from the ground-up.
vm_fault.c, vm_page.c, pmap.c, vm_object.c
Support for small-block filesystems with merged VM/buffer cache scheme.
pmap.c vm_map.c
Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of
kernel PTs.
vm_glue.c
Much simpler and more effective swapping code. No more gratuitous swapping.
proc.h
Fixed the problem that the p_lock flag was not being cleared on a fork.
swap_pager.c, vnode_pager.c
Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the
code doesn't need it anymore.
machdep.c
Changes to better support the parameter values for the merged VM/buffer cache
scheme.
machdep.c, kern_exec.c, vm_glue.c
Implemented a seperate submap for temporary exec string space and another one
to contain process upages. This eliminates all map fragmentation problems
that previously existed.
ffs_inode.c, ufs_inode.c, ufs_readwrite.c
Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on
busy buffers.
Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
|
|
|
} else {
|
2004-10-22 08:47:20 +00:00
|
|
|
buf_vlist_add(bp, bo, BX_VNCLEAN);
|
2002-07-10 17:02:32 +00:00
|
|
|
|
2004-10-27 08:05:02 +00:00
|
|
|
if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
|
2002-09-25 02:22:21 +00:00
|
|
|
mtx_lock(&sync_mtx);
|
2004-10-27 08:05:02 +00:00
|
|
|
LIST_REMOVE(bo, bo_synclist);
|
2007-04-10 15:29:37 +00:00
|
|
|
syncer_worklist_len--;
|
2002-09-25 02:22:21 +00:00
|
|
|
mtx_unlock(&sync_mtx);
|
2004-10-27 08:05:02 +00:00
|
|
|
bo->bo_flag &= ~BO_ONWORKLST;
|
1998-03-08 09:59:44 +00:00
|
|
|
}
|
These changes embody the support of the fully coherent merged VM buffer cache,
much higher filesystem I/O performance, and much better paging performance. It
represents the culmination of over 6 months of R&D.
The majority of the merged VM/cache work is by John Dyson.
The following highlights the most significant changes. Additionally, there are
(mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to
support the new VM/buffer scheme.
vfs_bio.c:
Significant rewrite of most of vfs_bio to support the merged VM buffer cache
scheme. The scheme is almost fully compatible with the old filesystem
interface. Significant improvement in the number of opportunities for write
clustering.
vfs_cluster.c, vfs_subr.c
Upgrade and performance enhancements in vfs layer code to support merged
VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff.
vm_object.c:
Yet more improvements in the collapse code. Elimination of some windows that
can cause list corruption.
vm_pageout.c:
Fixed it, it really works better now. Somehow in 2.0, some "enhancements"
broke the code. This code has been reworked from the ground-up.
vm_fault.c, vm_page.c, pmap.c, vm_object.c
Support for small-block filesystems with merged VM/buffer cache scheme.
pmap.c vm_map.c
Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of
kernel PTs.
vm_glue.c
Much simpler and more effective swapping code. No more gratuitous swapping.
proc.h
Fixed the problem that the p_lock flag was not being cleared on a fork.
swap_pager.c, vnode_pager.c
Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the
code doesn't need it anymore.
machdep.c
Changes to better support the parameter values for the merged VM/buffer cache
scheme.
machdep.c, kern_exec.c, vm_glue.c
Implemented a seperate submap for temporary exec string space and another one
to contain process upages. This eliminates all map fragmentation problems
that previously existed.
ffs_inode.c, ufs_inode.c, ufs_readwrite.c
Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on
busy buffers.
Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
|
|
|
}
|
2005-06-14 20:31:53 +00:00
|
|
|
#ifdef INVARIANTS
|
|
|
|
bv = &bo->bo_clean;
|
|
|
|
bp = TAILQ_FIRST(&bv->bv_hd);
|
|
|
|
KASSERT(bp == NULL || bp->b_bufobj == bo,
|
|
|
|
("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
|
|
|
|
bp = TAILQ_LAST(&bv->bv_hd, buflists);
|
|
|
|
KASSERT(bp == NULL || bp->b_bufobj == bo,
|
|
|
|
("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
|
|
|
|
bv = &bo->bo_dirty;
|
|
|
|
bp = TAILQ_FIRST(&bv->bv_hd);
|
|
|
|
KASSERT(bp == NULL || bp->b_bufobj == bo,
|
|
|
|
("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
|
|
|
|
bp = TAILQ_LAST(&bv->bv_hd, buflists);
|
|
|
|
KASSERT(bp == NULL || bp->b_bufobj == bo,
|
|
|
|
("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
|
|
|
|
#endif
|
2008-03-22 09:15:16 +00:00
|
|
|
BO_UNLOCK(bo);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
2005-06-16 04:41:42 +00:00
|
|
|
/*
|
2015-07-16 13:57:05 +00:00
|
|
|
* A temporary hack until refcount_* APIs are sorted out.
|
2005-06-16 04:41:42 +00:00
|
|
|
*/
|
2015-07-16 13:57:05 +00:00
|
|
|
static __inline int
|
|
|
|
vfs_refcount_acquire_if_not_zero(volatile u_int *count)
|
2005-06-16 04:41:42 +00:00
|
|
|
{
|
2015-07-16 13:57:05 +00:00
|
|
|
u_int old;
|
2005-06-16 04:41:42 +00:00
|
|
|
|
2017-02-05 03:23:16 +00:00
|
|
|
old = *count;
|
2015-07-16 13:57:05 +00:00
|
|
|
for (;;) {
|
|
|
|
if (old == 0)
|
|
|
|
return (0);
|
2017-02-05 03:23:16 +00:00
|
|
|
if (atomic_fcmpset_int(count, &old, old + 1))
|
2015-07-16 13:57:05 +00:00
|
|
|
return (1);
|
|
|
|
}
|
2005-06-16 04:41:42 +00:00
|
|
|
}
|
|
|
|
|
2015-07-16 13:57:05 +00:00
|
|
|
static __inline int
|
|
|
|
vfs_refcount_release_if_not_last(volatile u_int *count)
|
2006-02-01 00:30:05 +00:00
|
|
|
{
|
2015-07-16 13:57:05 +00:00
|
|
|
u_int old;
|
2006-02-01 00:30:05 +00:00
|
|
|
|
2017-02-05 03:23:16 +00:00
|
|
|
old = *count;
|
2015-07-16 13:57:05 +00:00
|
|
|
for (;;) {
|
|
|
|
if (old == 1)
|
|
|
|
return (0);
|
2017-02-05 03:23:16 +00:00
|
|
|
if (atomic_fcmpset_int(count, &old, old - 1))
|
2015-07-16 13:57:05 +00:00
|
|
|
return (1);
|
|
|
|
}
|
2006-02-01 00:30:05 +00:00
|
|
|
}
|
|
|
|
|
2005-06-16 04:41:42 +00:00
|
|
|
static void
|
2015-07-16 13:57:05 +00:00
|
|
|
v_init_counters(struct vnode *vp)
|
2005-06-16 04:41:42 +00:00
|
|
|
{
|
|
|
|
|
2015-07-16 13:57:05 +00:00
|
|
|
VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0,
|
|
|
|
vp, ("%s called for an initialized vnode", __FUNCTION__));
|
|
|
|
ASSERT_VI_UNLOCKED(vp, __FUNCTION__);
|
|
|
|
|
|
|
|
refcount_init(&vp->v_holdcnt, 1);
|
|
|
|
refcount_init(&vp->v_usecount, 1);
|
2005-06-16 04:41:42 +00:00
|
|
|
}
|
|
|
|
|
2016-01-18 22:21:46 +00:00
|
|
|
static void
|
|
|
|
v_incr_usecount_locked(struct vnode *vp)
|
|
|
|
{
|
|
|
|
|
|
|
|
ASSERT_VI_LOCKED(vp, __func__);
|
|
|
|
if ((vp->v_iflag & VI_OWEINACT) != 0) {
|
|
|
|
VNASSERT(vp->v_usecount == 0, vp,
|
|
|
|
("vnode with usecount and VI_OWEINACT set"));
|
|
|
|
vp->v_iflag &= ~VI_OWEINACT;
|
|
|
|
}
|
|
|
|
refcount_acquire(&vp->v_usecount);
|
|
|
|
v_incr_devcount(vp);
|
|
|
|
}
|
|
|
|
|
2005-06-16 04:41:42 +00:00
|
|
|
/*
|
2016-10-04 21:44:20 +00:00
|
|
|
* Increment the use count on the vnode, taking care to reference
|
|
|
|
* the driver's usecount if this is a chardev.
|
2005-06-16 04:41:42 +00:00
|
|
|
*/
|
2002-10-24 19:38:56 +00:00
|
|
|
static void
|
2015-07-16 13:57:05 +00:00
|
|
|
v_incr_usecount(struct vnode *vp)
|
2002-10-24 19:38:56 +00:00
|
|
|
{
|
2004-01-05 19:04:29 +00:00
|
|
|
|
2015-07-16 13:57:05 +00:00
|
|
|
ASSERT_VI_UNLOCKED(vp, __func__);
|
2009-02-05 15:03:35 +00:00
|
|
|
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
|
2015-07-16 13:57:05 +00:00
|
|
|
|
2016-01-18 22:21:46 +00:00
|
|
|
if (vp->v_type != VCHR &&
|
|
|
|
vfs_refcount_acquire_if_not_zero(&vp->v_usecount)) {
|
2015-07-16 13:57:05 +00:00
|
|
|
VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp,
|
|
|
|
("vnode with usecount and VI_OWEINACT set"));
|
|
|
|
} else {
|
|
|
|
VI_LOCK(vp);
|
2016-01-18 22:21:46 +00:00
|
|
|
v_incr_usecount_locked(vp);
|
2015-07-16 13:57:05 +00:00
|
|
|
VI_UNLOCK(vp);
|
|
|
|
}
|
2015-07-11 16:28:12 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Increment si_usecount of the associated device, if any.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
v_incr_devcount(struct vnode *vp)
|
|
|
|
{
|
|
|
|
|
2015-07-16 13:57:05 +00:00
|
|
|
ASSERT_VI_LOCKED(vp, __FUNCTION__);
|
2015-07-11 16:28:12 +00:00
|
|
|
if (vp->v_type == VCHR && vp->v_rdev != NULL) {
|
|
|
|
dev_lock();
|
|
|
|
vp->v_rdev->si_usecount++;
|
|
|
|
dev_unlock();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Decrement si_usecount of the associated device, if any.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
v_decr_devcount(struct vnode *vp)
|
|
|
|
{
|
|
|
|
|
|
|
|
ASSERT_VI_LOCKED(vp, __FUNCTION__);
|
2002-10-25 07:58:25 +00:00
|
|
|
if (vp->v_type == VCHR && vp->v_rdev != NULL) {
|
2004-09-23 07:17:41 +00:00
|
|
|
dev_lock();
|
2005-06-16 04:41:42 +00:00
|
|
|
vp->v_rdev->si_usecount--;
|
2004-09-23 07:17:41 +00:00
|
|
|
dev_unlock();
|
2002-10-24 19:38:56 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Grab a particular vnode from the free list, increment its
|
2008-03-24 04:22:58 +00:00
|
|
|
* reference count and lock it. VI_DOOMED is set if the vnode
|
|
|
|
* is being destroyed. Only callers who specify LK_RETRY will
|
|
|
|
* see doomed vnodes. If inactive processing was delayed in
|
|
|
|
* vput try to do it here.
|
2015-07-16 13:57:05 +00:00
|
|
|
*
|
|
|
|
* Notes on lockless counter manipulation:
|
|
|
|
* _vhold, vputx and other routines make various decisions based
|
2016-06-20 15:45:50 +00:00
|
|
|
* on either holdcnt or usecount being 0. As long as either counter
|
2015-07-16 13:57:05 +00:00
|
|
|
* is not transitioning 0->1 nor 1->0, the manipulation can be done
|
2016-06-20 15:45:50 +00:00
|
|
|
* with atomic operations. Otherwise the interlock is taken covering
|
|
|
|
* both the atomic and additional actions.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2006-02-01 00:30:05 +00:00
|
|
|
vget(struct vnode *vp, int flags, struct thread *td)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2015-07-16 13:57:05 +00:00
|
|
|
int error, oweinact;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2008-07-21 23:01:09 +00:00
|
|
|
VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
|
|
|
|
("vget: invalid lock operation"));
|
2015-07-16 13:57:05 +00:00
|
|
|
|
|
|
|
if ((flags & LK_INTERLOCK) != 0)
|
|
|
|
ASSERT_VI_LOCKED(vp, __func__);
|
|
|
|
else
|
|
|
|
ASSERT_VI_UNLOCKED(vp, __func__);
|
|
|
|
if ((flags & LK_VNHELD) != 0)
|
|
|
|
VNASSERT((vp->v_holdcnt > 0), vp,
|
|
|
|
("vget: LK_VNHELD passed but vnode not held"));
|
|
|
|
|
2009-02-05 15:03:35 +00:00
|
|
|
CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
|
|
|
|
|
2015-07-16 13:57:05 +00:00
|
|
|
if ((flags & LK_VNHELD) == 0)
|
|
|
|
_vhold(vp, (flags & LK_INTERLOCK) != 0);
|
|
|
|
|
|
|
|
if ((error = vn_lock(vp, flags)) != 0) {
|
2006-02-01 00:30:05 +00:00
|
|
|
vdrop(vp);
|
2009-02-05 15:03:35 +00:00
|
|
|
CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__,
|
|
|
|
vp);
|
2005-04-11 09:28:32 +00:00
|
|
|
return (error);
|
2005-03-13 11:54:28 +00:00
|
|
|
}
|
2008-03-24 04:22:58 +00:00
|
|
|
if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0)
|
|
|
|
panic("vget: vn_lock failed to return ENOENT\n");
|
|
|
|
/*
|
2010-04-03 11:19:20 +00:00
|
|
|
* We don't guarantee that any particular close will
|
2008-03-24 04:22:58 +00:00
|
|
|
* trigger inactive processing so just make a best effort
|
|
|
|
* here at preventing a reference to a removed file. If
|
|
|
|
* we don't succeed no harm is done.
|
2015-07-16 13:57:05 +00:00
|
|
|
*
|
|
|
|
* Upgrade our holdcnt to a usecount.
|
2008-03-24 04:22:58 +00:00
|
|
|
*/
|
2016-07-03 01:56:48 +00:00
|
|
|
if (vp->v_type == VCHR ||
|
|
|
|
!vfs_refcount_acquire_if_not_zero(&vp->v_usecount)) {
|
2015-07-16 13:57:05 +00:00
|
|
|
VI_LOCK(vp);
|
|
|
|
if ((vp->v_iflag & VI_OWEINACT) == 0) {
|
|
|
|
oweinact = 0;
|
|
|
|
} else {
|
|
|
|
oweinact = 1;
|
|
|
|
vp->v_iflag &= ~VI_OWEINACT;
|
|
|
|
}
|
|
|
|
refcount_acquire(&vp->v_usecount);
|
|
|
|
v_incr_devcount(vp);
|
|
|
|
if (oweinact && VOP_ISLOCKED(vp) == LK_EXCLUSIVE &&
|
2008-03-24 04:22:58 +00:00
|
|
|
(flags & LK_NOWAIT) == 0)
|
2005-04-11 09:28:32 +00:00
|
|
|
vinactive(vp, td);
|
2015-07-16 13:57:05 +00:00
|
|
|
VI_UNLOCK(vp);
|
2008-03-24 04:22:58 +00:00
|
|
|
}
|
1997-02-10 02:22:35 +00:00
|
|
|
return (0);
|
|
|
|
}
|
1997-12-29 00:25:11 +00:00
|
|
|
|
2002-06-06 15:46:38 +00:00
|
|
|
/*
|
2016-10-04 21:44:20 +00:00
|
|
|
* Increase the reference (use) and hold count of a vnode.
|
|
|
|
* This will also remove the vnode from the free list if it is presently free.
|
2000-09-22 12:22:36 +00:00
|
|
|
*/
|
1997-12-29 16:54:03 +00:00
|
|
|
void
|
|
|
|
vref(struct vnode *vp)
|
|
|
|
{
|
2004-01-05 19:04:29 +00:00
|
|
|
|
2009-02-05 15:03:35 +00:00
|
|
|
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
|
2016-01-18 22:21:46 +00:00
|
|
|
_vhold(vp, false);
|
2005-06-16 04:41:42 +00:00
|
|
|
v_incr_usecount(vp);
|
1997-12-29 16:54:03 +00:00
|
|
|
}
|
|
|
|
|
2016-01-18 22:21:46 +00:00
|
|
|
void
|
|
|
|
vrefl(struct vnode *vp)
|
|
|
|
{
|
|
|
|
|
2016-10-06 18:10:19 +00:00
|
|
|
ASSERT_VI_LOCKED(vp, __func__);
|
2016-01-18 22:21:46 +00:00
|
|
|
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
|
|
|
|
_vhold(vp, true);
|
|
|
|
v_incr_usecount_locked(vp);
|
|
|
|
}
|
|
|
|
|
2016-12-12 15:37:11 +00:00
|
|
|
void
|
|
|
|
vrefact(struct vnode *vp)
|
|
|
|
{
|
|
|
|
|
|
|
|
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
|
|
|
|
if (__predict_false(vp->v_type == VCHR)) {
|
|
|
|
VNASSERT(vp->v_holdcnt > 0 && vp->v_usecount > 0, vp,
|
|
|
|
("%s: wrong ref counts", __func__));
|
|
|
|
vref(vp);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
#ifdef INVARIANTS
|
|
|
|
int old = atomic_fetchadd_int(&vp->v_holdcnt, 1);
|
|
|
|
VNASSERT(old > 0, vp, ("%s: wrong hold count", __func__));
|
|
|
|
old = atomic_fetchadd_int(&vp->v_usecount, 1);
|
|
|
|
VNASSERT(old > 0, vp, ("%s: wrong use count", __func__));
|
|
|
|
#else
|
|
|
|
refcount_acquire(&vp->v_holdcnt);
|
|
|
|
refcount_acquire(&vp->v_usecount);
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2002-09-25 02:22:21 +00:00
|
|
|
/*
|
|
|
|
* Return reference count of a vnode.
|
|
|
|
*
|
2015-07-16 13:57:05 +00:00
|
|
|
* The results of this call are only guaranteed when some mechanism is used to
|
|
|
|
* stop other processes from gaining references to the vnode. This may be the
|
|
|
|
* case if the caller holds the only reference. This is also useful when stale
|
|
|
|
* data is acceptable as race conditions may be accounted for by some other
|
|
|
|
* means.
|
2002-09-25 02:22:21 +00:00
|
|
|
*/
|
|
|
|
int
|
|
|
|
vrefcnt(struct vnode *vp)
|
|
|
|
{
|
|
|
|
|
2015-07-16 13:57:05 +00:00
|
|
|
return (vp->v_usecount);
|
2002-09-25 02:22:21 +00:00
|
|
|
}
|
|
|
|
|
2010-01-17 21:24:27 +00:00
|
|
|
#define VPUTX_VRELE 1
|
|
|
|
#define VPUTX_VPUT 2
|
|
|
|
#define VPUTX_VUNREF 3
|
2002-09-25 02:22:21 +00:00
|
|
|
|
2015-07-16 13:57:05 +00:00
|
|
|
/*
|
|
|
|
* Decrement the use and hold counts for a vnode.
|
|
|
|
*
|
|
|
|
* See an explanation near vget() as to why atomic operation is safe.
|
|
|
|
*/
|
2010-01-17 21:24:27 +00:00
|
|
|
static void
|
|
|
|
vputx(struct vnode *vp, int func)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2010-01-17 21:24:27 +00:00
|
|
|
int error;
|
1997-12-29 00:25:11 +00:00
|
|
|
|
2010-01-17 21:24:27 +00:00
|
|
|
KASSERT(vp != NULL, ("vputx: null vp"));
|
|
|
|
if (func == VPUTX_VUNREF)
|
2010-11-24 12:30:41 +00:00
|
|
|
ASSERT_VOP_LOCKED(vp, "vunref");
|
2010-01-17 21:24:27 +00:00
|
|
|
else if (func == VPUTX_VPUT)
|
|
|
|
ASSERT_VOP_LOCKED(vp, "vput");
|
|
|
|
else
|
|
|
|
KASSERT(func == VPUTX_VRELE, ("vputx: wrong func"));
|
2015-07-16 13:57:05 +00:00
|
|
|
ASSERT_VI_UNLOCKED(vp, __func__);
|
2010-01-17 21:24:27 +00:00
|
|
|
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
|
2000-10-02 09:57:06 +00:00
|
|
|
|
2015-07-16 13:57:05 +00:00
|
|
|
if (vp->v_type != VCHR &&
|
|
|
|
vfs_refcount_release_if_not_last(&vp->v_usecount)) {
|
2010-01-17 21:24:27 +00:00
|
|
|
if (func == VPUTX_VPUT)
|
|
|
|
VOP_UNLOCK(vp, 0);
|
2015-07-16 13:57:05 +00:00
|
|
|
vdrop(vp);
|
Make our v_usecount vnode reference count work identically to the
original BSD code. The association between the vnode and the vm_object
no longer includes reference counts. The major difference is that
vm_object's are no longer freed gratuitiously from the vnode, and so
once an object is created for the vnode, it will last as long as the
vnode does.
When a vnode object reference count is incremented, then the underlying
vnode reference count is incremented also. The two "objects" are now
more intimately related, and so the interactions are now much less
complex.
When vnodes are now normally placed onto the free queue with an object still
attached. The rundown of the object happens at vnode rundown time, and
happens with exactly the same filesystem semantics of the original VFS
code. There is absolutely no need for vnode_pager_uncache and other
travesties like that anymore.
A side-effect of these changes is that SMP locking should be much simpler,
the I/O copyin/copyout optimizations work, NFS should be more ponderable,
and further work on layered filesystems should be less frustrating, because
of the totally coherent management of the vnode objects and vnodes.
Please be careful with your system while running this code, but I would
greatly appreciate feedback as soon a reasonably possible.
1998-01-06 05:26:17 +00:00
|
|
|
return;
|
|
|
|
}
|
2010-01-17 21:24:27 +00:00
|
|
|
|
2015-07-16 13:57:05 +00:00
|
|
|
VI_LOCK(vp);
|
|
|
|
|
2005-06-16 04:41:42 +00:00
|
|
|
/*
|
|
|
|
* We want to hold the vnode until the inactive finishes to
|
|
|
|
* prevent vgone() races. We drop the use count here and the
|
|
|
|
* hold count below when we're done.
|
|
|
|
*/
|
2015-07-16 13:57:05 +00:00
|
|
|
if (!refcount_release(&vp->v_usecount) ||
|
|
|
|
(vp->v_iflag & VI_DOINGINACT)) {
|
|
|
|
if (func == VPUTX_VPUT)
|
|
|
|
VOP_UNLOCK(vp, 0);
|
|
|
|
v_decr_devcount(vp);
|
|
|
|
vdropl(vp);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
v_decr_devcount(vp);
|
|
|
|
|
|
|
|
error = 0;
|
|
|
|
|
|
|
|
if (vp->v_usecount != 0) {
|
2016-08-10 16:12:31 +00:00
|
|
|
vn_printf(vp, "vputx: usecount not zero for vnode ");
|
2015-07-16 13:57:05 +00:00
|
|
|
panic("vputx: usecount not zero");
|
|
|
|
}
|
|
|
|
|
|
|
|
CTR2(KTR_VFS, "%s: return vnode %p to the freelist", __func__, vp);
|
|
|
|
|
2005-03-14 07:16:55 +00:00
|
|
|
/*
|
|
|
|
* We must call VOP_INACTIVE with the node locked. Mark
|
|
|
|
* as VI_DOINGINACT to avoid recursion.
|
|
|
|
*/
|
2006-03-08 23:43:39 +00:00
|
|
|
vp->v_iflag |= VI_OWEINACT;
|
2010-11-24 12:30:41 +00:00
|
|
|
switch (func) {
|
|
|
|
case VPUTX_VRELE:
|
2010-01-17 21:24:27 +00:00
|
|
|
error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK);
|
2005-03-14 07:16:55 +00:00
|
|
|
VI_LOCK(vp);
|
2010-11-24 12:30:41 +00:00
|
|
|
break;
|
|
|
|
case VPUTX_VPUT:
|
|
|
|
if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
|
|
|
|
error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK |
|
|
|
|
LK_NOWAIT);
|
|
|
|
VI_LOCK(vp);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case VPUTX_VUNREF:
|
2013-09-29 18:07:14 +00:00
|
|
|
if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
|
|
|
|
error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK);
|
|
|
|
VI_LOCK(vp);
|
|
|
|
}
|
2010-11-24 12:30:41 +00:00
|
|
|
break;
|
2010-01-17 21:24:27 +00:00
|
|
|
}
|
2015-07-11 16:28:55 +00:00
|
|
|
VNASSERT(vp->v_usecount == 0 || (vp->v_iflag & VI_OWEINACT) == 0, vp,
|
|
|
|
("vnode with usecount and VI_OWEINACT set"));
|
2010-01-17 21:24:27 +00:00
|
|
|
if (error == 0) {
|
|
|
|
if (vp->v_iflag & VI_OWEINACT)
|
|
|
|
vinactive(vp, curthread);
|
|
|
|
if (func != VPUTX_VUNREF)
|
|
|
|
VOP_UNLOCK(vp, 0);
|
2006-03-08 23:43:39 +00:00
|
|
|
}
|
2005-06-16 04:41:42 +00:00
|
|
|
vdropl(vp);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
2010-01-17 21:24:27 +00:00
|
|
|
/*
|
|
|
|
* Vnode put/release.
|
|
|
|
* If count drops to zero, call inactive routine and return to freelist.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
vrele(struct vnode *vp)
|
|
|
|
{
|
|
|
|
|
|
|
|
vputx(vp, VPUTX_VRELE);
|
|
|
|
}
|
|
|
|
|
2002-06-06 15:46:38 +00:00
|
|
|
/*
|
2000-10-05 18:22:46 +00:00
|
|
|
* Release an already locked vnode. This give the same effects as
|
|
|
|
* unlock+vrele(), but takes less time and avoids releasing and
|
2007-05-27 20:50:23 +00:00
|
|
|
* re-aquiring the lock (as vrele() acquires the lock internally.)
|
2000-09-22 12:22:36 +00:00
|
|
|
*/
|
1997-12-29 00:25:11 +00:00
|
|
|
void
|
2006-01-21 19:42:10 +00:00
|
|
|
vput(struct vnode *vp)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
|
|
|
|
2010-01-17 21:24:27 +00:00
|
|
|
vputx(vp, VPUTX_VPUT);
|
|
|
|
}
|
2000-10-02 09:57:06 +00:00
|
|
|
|
2010-01-17 21:24:27 +00:00
|
|
|
/*
|
|
|
|
* Release an exclusively locked vnode. Do not unlock the vnode lock.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
vunref(struct vnode *vp)
|
|
|
|
{
|
1996-08-21 21:56:23 +00:00
|
|
|
|
2010-01-17 21:24:27 +00:00
|
|
|
vputx(vp, VPUTX_VUNREF);
|
1997-02-27 02:57:03 +00:00
|
|
|
}
|
|
|
|
|
2012-04-17 21:46:59 +00:00
|
|
|
/*
|
|
|
|
* Increase the hold count and activate if this is the first reference.
|
|
|
|
*/
|
2005-03-15 13:43:10 +00:00
|
|
|
void
|
2015-07-16 13:57:05 +00:00
|
|
|
_vhold(struct vnode *vp, bool locked)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2012-04-20 06:50:44 +00:00
|
|
|
struct mount *mp;
|
2004-01-05 19:04:29 +00:00
|
|
|
|
2015-07-16 13:57:05 +00:00
|
|
|
if (locked)
|
|
|
|
ASSERT_VI_LOCKED(vp, __func__);
|
|
|
|
else
|
|
|
|
ASSERT_VI_UNLOCKED(vp, __func__);
|
2009-02-05 15:03:35 +00:00
|
|
|
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
|
2015-07-16 13:57:05 +00:00
|
|
|
if (!locked && vfs_refcount_acquire_if_not_zero(&vp->v_holdcnt)) {
|
|
|
|
VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
|
|
|
|
("_vhold: vnode with holdcnt is free"));
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!locked)
|
|
|
|
VI_LOCK(vp);
|
|
|
|
if ((vp->v_iflag & VI_FREE) == 0) {
|
|
|
|
refcount_acquire(&vp->v_holdcnt);
|
|
|
|
if (!locked)
|
|
|
|
VI_UNLOCK(vp);
|
2012-04-17 21:46:59 +00:00
|
|
|
return;
|
2015-07-16 13:57:05 +00:00
|
|
|
}
|
|
|
|
VNASSERT(vp->v_holdcnt == 0, vp,
|
|
|
|
("%s: wrong hold count", __func__));
|
|
|
|
VNASSERT(vp->v_op != NULL, vp,
|
|
|
|
("%s: vnode already reclaimed.", __func__));
|
2012-04-17 21:46:59 +00:00
|
|
|
/*
|
2012-04-20 06:50:44 +00:00
|
|
|
* Remove a vnode from the free list, mark it as in use,
|
|
|
|
* and put it on the active list.
|
2012-04-17 21:46:59 +00:00
|
|
|
*/
|
2016-09-30 17:27:17 +00:00
|
|
|
mp = vp->v_mount;
|
|
|
|
mtx_lock(&mp->mnt_listmtx);
|
|
|
|
if ((vp->v_mflag & VMP_TMPMNTFREELIST) != 0) {
|
|
|
|
TAILQ_REMOVE(&mp->mnt_tmpfreevnodelist, vp, v_actfreelist);
|
|
|
|
mp->mnt_tmpfreevnodelistsize--;
|
|
|
|
vp->v_mflag &= ~VMP_TMPMNTFREELIST;
|
|
|
|
} else {
|
|
|
|
mtx_lock(&vnode_free_list_mtx);
|
|
|
|
TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
|
|
|
|
freevnodes--;
|
|
|
|
mtx_unlock(&vnode_free_list_mtx);
|
|
|
|
}
|
2012-04-20 06:50:44 +00:00
|
|
|
KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
|
|
|
|
("Activating already active vnode"));
|
2016-09-30 17:27:17 +00:00
|
|
|
vp->v_iflag &= ~VI_FREE;
|
2012-04-20 06:50:44 +00:00
|
|
|
vp->v_iflag |= VI_ACTIVE;
|
|
|
|
TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist);
|
|
|
|
mp->mnt_activevnodelistsize++;
|
2016-09-30 17:27:17 +00:00
|
|
|
mtx_unlock(&mp->mnt_listmtx);
|
2015-07-16 13:57:05 +00:00
|
|
|
refcount_acquire(&vp->v_holdcnt);
|
|
|
|
if (!locked)
|
|
|
|
VI_UNLOCK(vp);
|
2002-09-25 02:22:21 +00:00
|
|
|
}
|
2004-01-05 19:04:29 +00:00
|
|
|
|
2005-06-16 04:41:42 +00:00
|
|
|
/*
|
|
|
|
* Drop the hold count of the vnode. If this is the last reference to
|
2012-04-17 21:46:59 +00:00
|
|
|
* the vnode we place it on the free list unless it has been vgone'd
|
|
|
|
* (marked VI_DOOMED) in which case we will free it.
|
2015-11-27 01:16:35 +00:00
|
|
|
*
|
|
|
|
* Because the vnode vm object keeps a hold reference on the vnode if
|
|
|
|
* there is at least one resident non-cached page, the vnode cannot
|
|
|
|
* leave the active list without the page cleanup done.
|
2005-06-16 04:41:42 +00:00
|
|
|
*/
|
2007-03-31 23:57:17 +00:00
|
|
|
void
|
2015-07-16 13:57:05 +00:00
|
|
|
_vdrop(struct vnode *vp, bool locked)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2012-04-17 21:46:59 +00:00
|
|
|
struct bufobj *bo;
|
2012-04-20 06:50:44 +00:00
|
|
|
struct mount *mp;
|
|
|
|
int active;
|
2004-01-05 19:04:29 +00:00
|
|
|
|
2015-07-16 13:57:05 +00:00
|
|
|
if (locked)
|
|
|
|
ASSERT_VI_LOCKED(vp, __func__);
|
|
|
|
else
|
|
|
|
ASSERT_VI_UNLOCKED(vp, __func__);
|
2009-02-05 15:03:35 +00:00
|
|
|
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
|
2015-07-16 13:57:05 +00:00
|
|
|
if ((int)vp->v_holdcnt <= 0)
|
2005-03-14 09:25:19 +00:00
|
|
|
panic("vdrop: holdcnt %d", vp->v_holdcnt);
|
2015-07-16 13:57:05 +00:00
|
|
|
if (vfs_refcount_release_if_not_last(&vp->v_holdcnt)) {
|
|
|
|
if (locked)
|
|
|
|
VI_UNLOCK(vp);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!locked)
|
|
|
|
VI_LOCK(vp);
|
|
|
|
if (refcount_release(&vp->v_holdcnt) == 0) {
|
2012-04-17 21:46:59 +00:00
|
|
|
VI_UNLOCK(vp);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
if ((vp->v_iflag & VI_DOOMED) == 0) {
|
|
|
|
/*
|
2012-04-20 06:50:44 +00:00
|
|
|
* Mark a vnode as free: remove it from its active list
|
|
|
|
* and put it up for recycling on the freelist.
|
2012-04-17 21:46:59 +00:00
|
|
|
*/
|
|
|
|
VNASSERT(vp->v_op != NULL, vp,
|
|
|
|
("vdropl: vnode already reclaimed."));
|
|
|
|
VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
|
|
|
|
("vnode already free"));
|
2014-07-29 16:42:34 +00:00
|
|
|
VNASSERT(vp->v_holdcnt == 0, vp,
|
2012-04-17 21:46:59 +00:00
|
|
|
("vdropl: freeing when we shouldn't"));
|
2012-04-20 06:50:44 +00:00
|
|
|
active = vp->v_iflag & VI_ACTIVE;
|
vfs_msync(), called from syncer vnode fsync VOP, only iterates over
the active vnode list for the given mount point, with the assumption
that vnodes with dirty pages are active. This is enforced by
vinactive() doing vm_object_page_clean() pass over the vnode pages.
The issue is, if vinactive() cannot be called during vput() due to the
vnode being only shared-locked, we might end up with the dirty pages
for the vnode on the free list. Such vnode is invisible to syncer,
and pages are only cleaned on the vnode reactivation. In other words,
the race results in the broken guarantee that user data, written
through the mmap(2), is written to the disk not later than in 30
seconds after the write.
Fix this by keeping the vnode which is freed but still owing
inactivation, on the active list. When syncer loops find such vnode,
it is deactivated and cleaned by the final vput() call.
Tested by: pho
Sponsored by: The FreeBSD Foundation
MFC after: 2 weeks
2015-06-17 04:46:58 +00:00
|
|
|
if ((vp->v_iflag & VI_OWEINACT) == 0) {
|
|
|
|
vp->v_iflag &= ~VI_ACTIVE;
|
|
|
|
mp = vp->v_mount;
|
2016-09-30 17:27:17 +00:00
|
|
|
mtx_lock(&mp->mnt_listmtx);
|
vfs_msync(), called from syncer vnode fsync VOP, only iterates over
the active vnode list for the given mount point, with the assumption
that vnodes with dirty pages are active. This is enforced by
vinactive() doing vm_object_page_clean() pass over the vnode pages.
The issue is, if vinactive() cannot be called during vput() due to the
vnode being only shared-locked, we might end up with the dirty pages
for the vnode on the free list. Such vnode is invisible to syncer,
and pages are only cleaned on the vnode reactivation. In other words,
the race results in the broken guarantee that user data, written
through the mmap(2), is written to the disk not later than in 30
seconds after the write.
Fix this by keeping the vnode which is freed but still owing
inactivation, on the active list. When syncer loops find such vnode,
it is deactivated and cleaned by the final vput() call.
Tested by: pho
Sponsored by: The FreeBSD Foundation
MFC after: 2 weeks
2015-06-17 04:46:58 +00:00
|
|
|
if (active) {
|
|
|
|
TAILQ_REMOVE(&mp->mnt_activevnodelist, vp,
|
|
|
|
v_actfreelist);
|
|
|
|
mp->mnt_activevnodelistsize--;
|
|
|
|
}
|
2016-09-30 17:27:17 +00:00
|
|
|
TAILQ_INSERT_TAIL(&mp->mnt_tmpfreevnodelist, vp,
|
2015-11-27 01:45:40 +00:00
|
|
|
v_actfreelist);
|
2016-09-30 17:27:17 +00:00
|
|
|
mp->mnt_tmpfreevnodelistsize++;
|
vfs_msync(), called from syncer vnode fsync VOP, only iterates over
the active vnode list for the given mount point, with the assumption
that vnodes with dirty pages are active. This is enforced by
vinactive() doing vm_object_page_clean() pass over the vnode pages.
The issue is, if vinactive() cannot be called during vput() due to the
vnode being only shared-locked, we might end up with the dirty pages
for the vnode on the free list. Such vnode is invisible to syncer,
and pages are only cleaned on the vnode reactivation. In other words,
the race results in the broken guarantee that user data, written
through the mmap(2), is written to the disk not later than in 30
seconds after the write.
Fix this by keeping the vnode which is freed but still owing
inactivation, on the active list. When syncer loops find such vnode,
it is deactivated and cleaned by the final vput() call.
Tested by: pho
Sponsored by: The FreeBSD Foundation
MFC after: 2 weeks
2015-06-17 04:46:58 +00:00
|
|
|
vp->v_iflag |= VI_FREE;
|
2016-09-30 17:27:17 +00:00
|
|
|
vp->v_mflag |= VMP_TMPMNTFREELIST;
|
|
|
|
VI_UNLOCK(vp);
|
|
|
|
if (mp->mnt_tmpfreevnodelistsize >= mnt_free_list_batch)
|
|
|
|
vnlru_return_batch_locked(mp);
|
|
|
|
mtx_unlock(&mp->mnt_listmtx);
|
2012-04-17 21:46:59 +00:00
|
|
|
} else {
|
2016-09-30 17:27:17 +00:00
|
|
|
VI_UNLOCK(vp);
|
2016-12-31 19:59:31 +00:00
|
|
|
counter_u64_add(free_owe_inact, 1);
|
2012-04-17 21:46:59 +00:00
|
|
|
}
|
|
|
|
return;
|
2005-06-16 04:41:42 +00:00
|
|
|
}
|
2012-04-17 21:46:59 +00:00
|
|
|
/*
|
|
|
|
* The vnode has been marked for destruction, so free it.
|
2015-11-29 21:42:26 +00:00
|
|
|
*
|
|
|
|
* The vnode will be returned to the zone where it will
|
|
|
|
* normally remain until it is needed for another vnode. We
|
|
|
|
* need to cleanup (or verify that the cleanup has already
|
|
|
|
* been done) any residual data left from its current use
|
|
|
|
* so as not to contaminate the freshly allocated vnode.
|
2012-04-17 21:46:59 +00:00
|
|
|
*/
|
|
|
|
CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp);
|
2014-06-08 15:38:40 +00:00
|
|
|
atomic_subtract_long(&numvnodes, 1);
|
2012-04-17 21:46:59 +00:00
|
|
|
bo = &vp->v_bufobj;
|
|
|
|
VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
|
|
|
|
("cleaned vnode still on the free list."));
|
|
|
|
VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
|
|
|
|
VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count"));
|
|
|
|
VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
|
|
|
|
VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
|
|
|
|
VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
|
|
|
|
VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0"));
|
2013-05-12 04:05:01 +00:00
|
|
|
VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp,
|
|
|
|
("clean blk trie not empty"));
|
2012-04-17 21:46:59 +00:00
|
|
|
VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
|
2013-05-12 04:05:01 +00:00
|
|
|
VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp,
|
|
|
|
("dirty blk trie not empty"));
|
2012-04-17 21:46:59 +00:00
|
|
|
VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst"));
|
|
|
|
VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src"));
|
|
|
|
VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for .."));
|
2015-12-04 03:54:18 +00:00
|
|
|
VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp,
|
|
|
|
("Dangling rangelock waiters"));
|
2005-06-16 04:41:42 +00:00
|
|
|
VI_UNLOCK(vp);
|
2012-04-17 21:46:59 +00:00
|
|
|
#ifdef MAC
|
|
|
|
mac_vnode_destroy(vp);
|
|
|
|
#endif
|
2015-11-29 21:42:26 +00:00
|
|
|
if (vp->v_pollinfo != NULL) {
|
2012-04-17 21:46:59 +00:00
|
|
|
destroy_vpollinfo(vp->v_pollinfo);
|
2015-11-29 21:42:26 +00:00
|
|
|
vp->v_pollinfo = NULL;
|
|
|
|
}
|
2012-04-17 21:46:59 +00:00
|
|
|
#ifdef INVARIANTS
|
|
|
|
/* XXX Elsewhere we detect an already freed vnode via NULL v_op. */
|
|
|
|
vp->v_op = NULL;
|
|
|
|
#endif
|
2017-06-02 17:31:25 +00:00
|
|
|
vp->v_mountedhere = NULL;
|
|
|
|
vp->v_unpcb = NULL;
|
|
|
|
vp->v_rdev = NULL;
|
|
|
|
vp->v_fifoinfo = NULL;
|
2015-12-04 03:54:18 +00:00
|
|
|
vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
|
2015-11-29 21:42:26 +00:00
|
|
|
vp->v_iflag = 0;
|
|
|
|
vp->v_vflag = 0;
|
|
|
|
bo->bo_flag = 0;
|
2012-04-17 21:46:59 +00:00
|
|
|
uma_zfree(vnode_zone, vp);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
2005-06-16 04:41:42 +00:00
|
|
|
/*
|
|
|
|
* Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT
|
|
|
|
* flags. DOINGINACT prevents us from recursing in calls to vinactive.
|
|
|
|
* OWEINACT tracks whether a vnode missed a call to inactive due to a
|
|
|
|
* failed lock upgrade.
|
|
|
|
*/
|
2012-04-11 23:01:11 +00:00
|
|
|
void
|
2005-03-13 11:54:28 +00:00
|
|
|
vinactive(struct vnode *vp, struct thread *td)
|
|
|
|
{
|
2012-04-20 07:00:28 +00:00
|
|
|
struct vm_object *obj;
|
2005-06-16 04:41:42 +00:00
|
|
|
|
2008-07-27 11:48:15 +00:00
|
|
|
ASSERT_VOP_ELOCKED(vp, "vinactive");
|
2005-03-13 11:54:28 +00:00
|
|
|
ASSERT_VI_LOCKED(vp, "vinactive");
|
|
|
|
VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp,
|
|
|
|
("vinactive: recursed on VI_DOINGINACT"));
|
2009-02-05 15:03:35 +00:00
|
|
|
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
|
2005-03-13 11:54:28 +00:00
|
|
|
vp->v_iflag |= VI_DOINGINACT;
|
2005-06-07 22:05:32 +00:00
|
|
|
vp->v_iflag &= ~VI_OWEINACT;
|
2005-03-13 11:54:28 +00:00
|
|
|
VI_UNLOCK(vp);
|
2012-04-20 07:00:28 +00:00
|
|
|
/*
|
|
|
|
* Before moving off the active list, we must be sure that any
|
2015-11-27 01:16:35 +00:00
|
|
|
* modified pages are converted into the vnode's dirty
|
|
|
|
* buffers, since these will no longer be checked once the
|
|
|
|
* vnode is on the inactive list.
|
|
|
|
*
|
|
|
|
* The write-out of the dirty pages is asynchronous. At the
|
|
|
|
* point that VOP_INACTIVE() is called, there could still be
|
|
|
|
* pending I/O and dirty pages in the object.
|
2012-04-20 07:00:28 +00:00
|
|
|
*/
|
|
|
|
obj = vp->v_object;
|
|
|
|
if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0) {
|
2013-02-20 12:03:20 +00:00
|
|
|
VM_OBJECT_WLOCK(obj);
|
2016-11-26 21:00:27 +00:00
|
|
|
vm_object_page_clean(obj, 0, 0, 0);
|
2013-02-20 12:03:20 +00:00
|
|
|
VM_OBJECT_WUNLOCK(obj);
|
2012-04-20 07:00:28 +00:00
|
|
|
}
|
2005-03-13 11:54:28 +00:00
|
|
|
VOP_INACTIVE(vp, td);
|
|
|
|
VI_LOCK(vp);
|
|
|
|
VNASSERT(vp->v_iflag & VI_DOINGINACT, vp,
|
|
|
|
("vinactive: lost VI_DOINGINACT"));
|
2005-06-07 22:05:32 +00:00
|
|
|
vp->v_iflag &= ~VI_DOINGINACT;
|
2005-03-13 11:54:28 +00:00
|
|
|
}
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Remove any vnodes in the vnode table belonging to mount point mp.
|
|
|
|
*
|
2001-05-16 18:04:37 +00:00
|
|
|
* If FORCECLOSE is not specified, there should not be any active ones,
|
1994-05-24 10:09:53 +00:00
|
|
|
* return error if any are found (nb: this is a user error, not a
|
2001-05-16 18:04:37 +00:00
|
|
|
* system error). If FORCECLOSE is specified, detach any active vnodes
|
1994-05-24 10:09:53 +00:00
|
|
|
* that are found.
|
2001-05-16 18:04:37 +00:00
|
|
|
*
|
|
|
|
* If WRITECLOSE is set, only flush out regular file vnodes open for
|
|
|
|
* writing.
|
|
|
|
*
|
2002-08-04 10:29:36 +00:00
|
|
|
* SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
|
2001-05-16 18:04:37 +00:00
|
|
|
*
|
|
|
|
* `rootrefs' specifies the base reference count for the root vnode
|
|
|
|
* of this filesystem. The root vnode is considered busy if its
|
2004-07-12 08:14:09 +00:00
|
|
|
* v_usecount exceeds this value. On a successful return, vflush(, td)
|
2001-05-16 18:04:37 +00:00
|
|
|
* will call vrele() on the root vnode exactly rootrefs times.
|
|
|
|
* If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
|
|
|
|
* be zero.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
|
|
|
#ifdef DIAGNOSTIC
|
1995-12-17 21:23:44 +00:00
|
|
|
static int busyprt = 0; /* print out busy vnodes */
|
2010-11-14 07:38:42 +00:00
|
|
|
SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes");
|
1994-05-24 10:09:53 +00:00
|
|
|
#endif
|
|
|
|
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2010-04-03 11:19:20 +00:00
|
|
|
vflush(struct mount *mp, int rootrefs, int flags, struct thread *td)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2006-01-09 20:42:19 +00:00
|
|
|
struct vnode *vp, *mvp, *rootvp = NULL;
|
2002-01-15 07:17:12 +00:00
|
|
|
struct vattr vattr;
|
2001-05-16 18:04:37 +00:00
|
|
|
int busy = 0, error;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2009-02-05 15:03:35 +00:00
|
|
|
CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp,
|
|
|
|
rootrefs, flags);
|
2001-05-16 18:04:37 +00:00
|
|
|
if (rootrefs > 0) {
|
|
|
|
KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
|
|
|
|
("vflush: bad args"));
|
|
|
|
/*
|
|
|
|
* Get the filesystem root vnode. We can vput() it
|
|
|
|
* immediately, since with rootrefs > 0, it won't go away.
|
|
|
|
*/
|
2009-05-11 15:33:26 +00:00
|
|
|
if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) {
|
2009-02-05 15:03:35 +00:00
|
|
|
CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d",
|
|
|
|
__func__, error);
|
2001-05-16 18:04:37 +00:00
|
|
|
return (error);
|
2009-02-05 15:03:35 +00:00
|
|
|
}
|
2001-05-16 18:04:37 +00:00
|
|
|
vput(rootvp);
|
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
loop:
|
2012-04-17 16:28:22 +00:00
|
|
|
MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
|
2005-06-16 04:41:42 +00:00
|
|
|
vholdl(vp);
|
2008-01-10 01:10:58 +00:00
|
|
|
error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE);
|
2003-10-05 07:12:38 +00:00
|
|
|
if (error) {
|
2005-06-16 04:41:42 +00:00
|
|
|
vdrop(vp);
|
2012-04-17 16:28:22 +00:00
|
|
|
MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
|
2003-05-16 19:46:51 +00:00
|
|
|
goto loop;
|
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
2002-08-04 10:29:36 +00:00
|
|
|
* Skip over a vnodes marked VV_SYSTEM.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
2002-08-04 10:29:36 +00:00
|
|
|
if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
|
2008-01-13 14:44:15 +00:00
|
|
|
VOP_UNLOCK(vp, 0);
|
2005-06-16 04:41:42 +00:00
|
|
|
vdrop(vp);
|
1994-05-24 10:09:53 +00:00
|
|
|
continue;
|
1997-02-10 02:22:35 +00:00
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
2002-01-15 07:17:12 +00:00
|
|
|
* If WRITECLOSE is set, flush out unlinked but still open
|
|
|
|
* files (even if open only for reading) and regular file
|
2002-06-06 15:46:38 +00:00
|
|
|
* vnodes open for writing.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
2002-09-25 02:22:21 +00:00
|
|
|
if (flags & WRITECLOSE) {
|
2012-01-25 20:54:09 +00:00
|
|
|
if (vp->v_object != NULL) {
|
2013-02-20 12:03:20 +00:00
|
|
|
VM_OBJECT_WLOCK(vp->v_object);
|
2012-01-25 20:54:09 +00:00
|
|
|
vm_object_page_clean(vp->v_object, 0, 0, 0);
|
2013-02-20 12:03:20 +00:00
|
|
|
VM_OBJECT_WUNLOCK(vp->v_object);
|
2012-01-25 20:54:09 +00:00
|
|
|
}
|
|
|
|
error = VOP_FSYNC(vp, MNT_WAIT, td);
|
|
|
|
if (error != 0) {
|
|
|
|
VOP_UNLOCK(vp, 0);
|
|
|
|
vdrop(vp);
|
2012-04-17 16:28:22 +00:00
|
|
|
MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
|
2012-01-25 20:54:09 +00:00
|
|
|
return (error);
|
|
|
|
}
|
2008-08-28 15:23:18 +00:00
|
|
|
error = VOP_GETATTR(vp, &vattr, td->td_ucred);
|
2002-09-25 02:22:21 +00:00
|
|
|
VI_LOCK(vp);
|
2002-08-22 06:51:06 +00:00
|
|
|
|
2002-09-25 02:22:21 +00:00
|
|
|
if ((vp->v_type == VNON ||
|
|
|
|
(error == 0 && vattr.va_nlink > 0)) &&
|
|
|
|
(vp->v_writecount == 0 || vp->v_type != VREG)) {
|
2008-01-13 14:44:15 +00:00
|
|
|
VOP_UNLOCK(vp, 0);
|
2005-06-16 04:41:42 +00:00
|
|
|
vdropl(vp);
|
2002-09-25 02:22:21 +00:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
} else
|
|
|
|
VI_LOCK(vp);
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
These changes embody the support of the fully coherent merged VM buffer cache,
much higher filesystem I/O performance, and much better paging performance. It
represents the culmination of over 6 months of R&D.
The majority of the merged VM/cache work is by John Dyson.
The following highlights the most significant changes. Additionally, there are
(mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to
support the new VM/buffer scheme.
vfs_bio.c:
Significant rewrite of most of vfs_bio to support the merged VM buffer cache
scheme. The scheme is almost fully compatible with the old filesystem
interface. Significant improvement in the number of opportunities for write
clustering.
vfs_cluster.c, vfs_subr.c
Upgrade and performance enhancements in vfs layer code to support merged
VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff.
vm_object.c:
Yet more improvements in the collapse code. Elimination of some windows that
can cause list corruption.
vm_pageout.c:
Fixed it, it really works better now. Somehow in 2.0, some "enhancements"
broke the code. This code has been reworked from the ground-up.
vm_fault.c, vm_page.c, pmap.c, vm_object.c
Support for small-block filesystems with merged VM/buffer cache scheme.
pmap.c vm_map.c
Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of
kernel PTs.
vm_glue.c
Much simpler and more effective swapping code. No more gratuitous swapping.
proc.h
Fixed the problem that the p_lock flag was not being cleared on a fork.
swap_pager.c, vnode_pager.c
Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the
code doesn't need it anymore.
machdep.c
Changes to better support the parameter values for the merged VM/buffer cache
scheme.
machdep.c, kern_exec.c, vm_glue.c
Implemented a seperate submap for temporary exec string space and another one
to contain process upages. This eliminates all map fragmentation problems
that previously existed.
ffs_inode.c, ufs_inode.c, ufs_readwrite.c
Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on
busy buffers.
Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
|
|
|
* With v_usecount == 0, all we need to do is clear out the
|
|
|
|
* vnode data structures and we are done.
|
2005-06-16 04:41:42 +00:00
|
|
|
*
|
|
|
|
* If FORCECLOSE is set, forcibly close the vnode.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
2005-06-16 04:41:42 +00:00
|
|
|
if (vp->v_usecount == 0 || (flags & FORCECLOSE)) {
|
|
|
|
vgonel(vp);
|
|
|
|
} else {
|
|
|
|
busy++;
|
1994-05-24 10:09:53 +00:00
|
|
|
#ifdef DIAGNOSTIC
|
2005-06-16 04:41:42 +00:00
|
|
|
if (busyprt)
|
2016-08-10 16:12:31 +00:00
|
|
|
vn_printf(vp, "vflush: busy vnode ");
|
1994-05-24 10:09:53 +00:00
|
|
|
#endif
|
2005-06-16 04:41:42 +00:00
|
|
|
}
|
2008-01-13 14:44:15 +00:00
|
|
|
VOP_UNLOCK(vp, 0);
|
2005-06-16 04:41:42 +00:00
|
|
|
vdropl(vp);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2001-05-16 18:04:37 +00:00
|
|
|
if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
|
|
|
|
/*
|
|
|
|
* If just the root vnode is busy, and if its refcount
|
|
|
|
* is equal to `rootrefs', then go ahead and kill it.
|
|
|
|
*/
|
2002-09-25 02:22:21 +00:00
|
|
|
VI_LOCK(rootvp);
|
2001-05-16 18:04:37 +00:00
|
|
|
KASSERT(busy > 0, ("vflush: not busy"));
|
2005-02-17 10:28:58 +00:00
|
|
|
VNASSERT(rootvp->v_usecount >= rootrefs, rootvp,
|
2004-07-12 04:13:38 +00:00
|
|
|
("vflush: usecount %d < rootrefs %d",
|
|
|
|
rootvp->v_usecount, rootrefs));
|
2001-05-16 18:04:37 +00:00
|
|
|
if (busy == 1 && rootvp->v_usecount == rootrefs) {
|
2008-01-13 14:44:15 +00:00
|
|
|
VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK);
|
2005-03-13 11:54:28 +00:00
|
|
|
vgone(rootvp);
|
2008-01-13 14:44:15 +00:00
|
|
|
VOP_UNLOCK(rootvp, 0);
|
2001-05-16 18:04:37 +00:00
|
|
|
busy = 0;
|
|
|
|
} else
|
2002-09-25 02:22:21 +00:00
|
|
|
VI_UNLOCK(rootvp);
|
2001-05-16 18:04:37 +00:00
|
|
|
}
|
2009-02-05 15:03:35 +00:00
|
|
|
if (busy) {
|
|
|
|
CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__,
|
|
|
|
busy);
|
1994-05-24 10:09:53 +00:00
|
|
|
return (EBUSY);
|
2009-02-05 15:03:35 +00:00
|
|
|
}
|
2001-05-16 18:04:37 +00:00
|
|
|
for (; rootrefs > 0; rootrefs--)
|
|
|
|
vrele(rootvp);
|
1994-05-24 10:09:53 +00:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2005-01-28 13:00:03 +00:00
|
|
|
/*
 * Recycle an unused vnode to the front of the free list.
 *
 * Wrapper that takes the vnode interlock around vrecyclel() and passes
 * its result through.
 */
int
vrecycle(struct vnode *vp)
{
	int result;

	VI_LOCK(vp);
	result = vrecyclel(vp);
	VI_UNLOCK(vp);

	return (result);
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* vrecycle, with the vp interlock held.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
vrecyclel(struct vnode *vp)
|
|
|
|
{
|
|
|
|
int recycled;
|
|
|
|
|
|
|
|
ASSERT_VOP_ELOCKED(vp, __func__);
|
|
|
|
ASSERT_VI_LOCKED(vp, __func__);
|
2009-02-05 15:03:35 +00:00
|
|
|
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
|
2005-06-16 04:41:42 +00:00
|
|
|
recycled = 0;
|
|
|
|
if (vp->v_usecount == 0) {
|
|
|
|
recycled = 1;
|
|
|
|
vgonel(vp);
|
2005-01-28 13:00:03 +00:00
|
|
|
}
|
2005-06-16 04:41:42 +00:00
|
|
|
return (recycled);
|
2005-01-28 13:00:03 +00:00
|
|
|
}
|
2003-10-05 00:02:41 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
 * Eliminate all activity associated with a vnode in preparation for
 * reuse.
 *
 * Takes the vnode interlock, lets vgonel() do the work and releases the
 * interlock again.
 */
void
vgone(struct vnode *vp)
{

	VI_LOCK(vp);
	vgonel(vp);
	VI_UNLOCK(vp);
}
|
|
|
|
|
2012-09-09 19:17:15 +00:00
|
|
|
static void
|
2013-05-11 11:17:44 +00:00
|
|
|
notify_lowervp_vfs_dummy(struct mount *mp __unused,
|
2012-09-09 19:17:15 +00:00
|
|
|
struct vnode *lowervp __unused)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2013-05-11 11:17:44 +00:00
|
|
|
* Notify upper mounts about reclaimed or unlinked vnode.
|
2012-09-09 19:17:15 +00:00
|
|
|
*/
|
2013-05-11 11:17:44 +00:00
|
|
|
void
|
|
|
|
vfs_notify_upper(struct vnode *vp, int event)
|
2012-09-09 19:17:15 +00:00
|
|
|
{
|
|
|
|
static struct vfsops vgonel_vfsops = {
|
2013-05-11 11:17:44 +00:00
|
|
|
.vfs_reclaim_lowervp = notify_lowervp_vfs_dummy,
|
|
|
|
.vfs_unlink_lowervp = notify_lowervp_vfs_dummy,
|
2012-09-09 19:17:15 +00:00
|
|
|
};
|
|
|
|
struct mount *mp, *ump, *mmp;
|
|
|
|
|
|
|
|
mp = vp->v_mount;
|
|
|
|
if (mp == NULL)
|
|
|
|
return;
|
|
|
|
|
|
|
|
MNT_ILOCK(mp);
|
|
|
|
if (TAILQ_EMPTY(&mp->mnt_uppers))
|
|
|
|
goto unlock;
|
|
|
|
MNT_IUNLOCK(mp);
|
|
|
|
mmp = malloc(sizeof(struct mount), M_TEMP, M_WAITOK | M_ZERO);
|
|
|
|
mmp->mnt_op = &vgonel_vfsops;
|
|
|
|
mmp->mnt_kern_flag |= MNTK_MARKER;
|
|
|
|
MNT_ILOCK(mp);
|
|
|
|
mp->mnt_kern_flag |= MNTK_VGONE_UPPER;
|
|
|
|
for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) {
|
|
|
|
if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) {
|
|
|
|
ump = TAILQ_NEXT(ump, mnt_upper_link);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link);
|
|
|
|
MNT_IUNLOCK(mp);
|
2013-05-11 11:17:44 +00:00
|
|
|
switch (event) {
|
|
|
|
case VFS_NOTIFY_UPPER_RECLAIM:
|
|
|
|
VFS_RECLAIM_LOWERVP(ump, vp);
|
|
|
|
break;
|
|
|
|
case VFS_NOTIFY_UPPER_UNLINK:
|
|
|
|
VFS_UNLINK_LOWERVP(ump, vp);
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
KASSERT(0, ("invalid event %d", event));
|
|
|
|
break;
|
|
|
|
}
|
2012-09-09 19:17:15 +00:00
|
|
|
MNT_ILOCK(mp);
|
|
|
|
ump = TAILQ_NEXT(mmp, mnt_upper_link);
|
|
|
|
TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link);
|
|
|
|
}
|
|
|
|
free(mmp, M_TEMP);
|
|
|
|
mp->mnt_kern_flag &= ~MNTK_VGONE_UPPER;
|
|
|
|
if ((mp->mnt_kern_flag & MNTK_VGONE_WAITER) != 0) {
|
|
|
|
mp->mnt_kern_flag &= ~MNTK_VGONE_WAITER;
|
|
|
|
wakeup(&mp->mnt_uppers);
|
|
|
|
}
|
|
|
|
unlock:
|
|
|
|
MNT_IUNLOCK(mp);
|
|
|
|
}
|
|
|
|
|
2005-01-28 13:00:03 +00:00
|
|
|
/*
|
|
|
|
* vgone, with the vp interlock held.
|
|
|
|
*/
|
2015-08-04 08:51:56 +00:00
|
|
|
static void
|
2005-06-16 04:41:42 +00:00
|
|
|
vgonel(struct vnode *vp)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2005-06-13 06:26:55 +00:00
|
|
|
struct thread *td;
|
2005-03-29 10:02:48 +00:00
|
|
|
int oweinact;
|
Make our v_usecount vnode reference count work identically to the
original BSD code. The association between the vnode and the vm_object
no longer includes reference counts. The major difference is that
vm_object's are no longer freed gratuitiously from the vnode, and so
once an object is created for the vnode, it will last as long as the
vnode does.
When a vnode object reference count is incremented, then the underlying
vnode reference count is incremented also. The two "objects" are now
more intimately related, and so the interactions are now much less
complex.
When vnodes are now normally placed onto the free queue with an object still
attached. The rundown of the object happens at vnode rundown time, and
happens with exactly the same filesystem semantics of the original VFS
code. There is absolutely no need for vnode_pager_uncache and other
travesties like that anymore.
A side-effect of these changes is that SMP locking should be much simpler,
the I/O copyin/copyout optimizations work, NFS should be more ponderable,
and further work on layered filesystems should be less frustrating, because
of the totally coherent management of the vnode objects and vnodes.
Please be careful with your system while running this code, but I would
greatly appreciate feedback as soon a reasonably possible.
1998-01-06 05:26:17 +00:00
|
|
|
int active;
|
2006-03-08 23:43:39 +00:00
|
|
|
struct mount *mp;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2008-07-27 11:48:15 +00:00
|
|
|
ASSERT_VOP_ELOCKED(vp, "vgonel");
|
2005-01-28 13:00:03 +00:00
|
|
|
ASSERT_VI_LOCKED(vp, "vgonel");
|
2005-06-16 04:41:42 +00:00
|
|
|
VNASSERT(vp->v_holdcnt, vp,
|
|
|
|
("vgonel: vp %p has no reference.", vp));
|
2009-02-05 15:03:35 +00:00
|
|
|
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
|
2005-06-14 20:31:53 +00:00
|
|
|
td = curthread;
|
2005-01-28 13:00:03 +00:00
|
|
|
|
2005-06-13 06:26:55 +00:00
|
|
|
/*
|
|
|
|
* Don't vgonel if we're already doomed.
|
|
|
|
*/
|
2005-08-10 11:46:03 +00:00
|
|
|
if (vp->v_iflag & VI_DOOMED)
|
2005-06-13 06:26:55 +00:00
|
|
|
return;
|
2005-06-14 20:31:53 +00:00
|
|
|
vp->v_iflag |= VI_DOOMED;
|
2012-09-09 19:17:15 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
2005-06-14 20:31:53 +00:00
|
|
|
* Check to see if the vnode is in use. If so, we have to call
|
|
|
|
* VOP_CLOSE() and VOP_INACTIVE().
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
2005-06-14 20:31:53 +00:00
|
|
|
active = vp->v_usecount;
|
2005-03-29 10:02:48 +00:00
|
|
|
oweinact = (vp->v_iflag & VI_OWEINACT);
|
2005-03-13 11:54:28 +00:00
|
|
|
VI_UNLOCK(vp);
|
2013-05-11 11:17:44 +00:00
|
|
|
vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM);
|
2012-09-09 19:17:15 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
1997-02-10 02:22:35 +00:00
|
|
|
* If purging an active vnode, it must be closed and
|
2005-03-13 11:54:28 +00:00
|
|
|
* deactivated before being reclaimed.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
2005-03-29 10:02:48 +00:00
|
|
|
if (active)
|
2005-01-24 22:22:02 +00:00
|
|
|
VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
|
2005-03-29 10:02:48 +00:00
|
|
|
if (oweinact || active) {
|
2002-12-29 18:30:49 +00:00
|
|
|
VI_LOCK(vp);
|
2005-03-13 11:54:28 +00:00
|
|
|
if ((vp->v_iflag & VI_DOINGINACT) == 0)
|
|
|
|
vinactive(vp, td);
|
2002-12-29 18:30:49 +00:00
|
|
|
VI_UNLOCK(vp);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2012-02-25 10:15:41 +00:00
|
|
|
if (vp->v_type == VSOCK)
|
|
|
|
vfs_unp_reclaim(vp);
|
2014-12-13 16:02:37 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Clean out any buffers associated with the vnode.
|
|
|
|
* If the flush fails, just toss the buffers.
|
|
|
|
*/
|
|
|
|
mp = NULL;
|
|
|
|
if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd))
|
|
|
|
(void) vn_start_secondary_write(vp, &mp, V_WAIT);
|
|
|
|
if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) {
|
|
|
|
while (vinvalbuf(vp, 0, 0, 0) != 0)
|
|
|
|
;
|
|
|
|
}
|
2015-07-11 11:21:56 +00:00
|
|
|
|
2014-12-13 16:02:37 +00:00
|
|
|
BO_LOCK(&vp->v_bufobj);
|
|
|
|
KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) &&
|
|
|
|
vp->v_bufobj.bo_dirty.bv_cnt == 0 &&
|
|
|
|
TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) &&
|
|
|
|
vp->v_bufobj.bo_clean.bv_cnt == 0,
|
|
|
|
("vp %p bufobj not invalidated", vp));
|
2016-07-11 14:19:09 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* For VMIO bufobj, BO_DEAD is set in vm_object_terminate()
|
2016-07-11 17:04:22 +00:00
|
|
|
* after the object's page queue is flushed.
|
2016-07-11 14:19:09 +00:00
|
|
|
*/
|
|
|
|
if (vp->v_bufobj.bo_object == NULL)
|
|
|
|
vp->v_bufobj.bo_flag |= BO_DEAD;
|
2014-12-13 16:02:37 +00:00
|
|
|
BO_UNLOCK(&vp->v_bufobj);
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Reclaim the vnode.
|
|
|
|
*/
|
2001-09-12 08:38:13 +00:00
|
|
|
if (VOP_RECLAIM(vp, td))
|
2005-03-13 11:54:28 +00:00
|
|
|
panic("vgone: cannot reclaim");
|
2006-03-08 23:43:39 +00:00
|
|
|
if (mp != NULL)
|
|
|
|
vn_finished_secondary_write(mp);
|
2005-02-17 10:28:58 +00:00
|
|
|
VNASSERT(vp->v_object == NULL, vp,
|
2005-02-07 07:48:03 +00:00
|
|
|
("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag));
|
Move the head of byte-level advisory lock list from the
filesystem-specific vnode data to the struct vnode. Provide the
default implementation for the vop_advlock and vop_advlockasync.
Purge the locks on the vnode reclaim by using the lf_purgelocks().
The default implementation is augmented for the nfs and smbfs.
In the nfs_advlock, push the Giant inside the nfs_dolock.
Before the change, the vop_advlock and vop_advlockasync have taken the
unlocked vnode and dereferenced the fs-private inode data, racing with
with the vnode reclamation due to forced unmount. Now, the vop_getattr
under the shared vnode lock is used to obtain the inode size, and
later, in the lf_advlockasync, after locking the vnode interlock, the
VI_DOOMED flag is checked to prevent an operation on the doomed vnode.
The implementation of the lf_purgelocks() is submitted by dfr.
Reported by: kris
Tested by: kris, pho
Discussed with: jeff, dfr
MFC after: 2 weeks
2008-04-16 11:33:32 +00:00
|
|
|
/*
|
|
|
|
* Clear the advisory locks and wake up waiting threads.
|
|
|
|
*/
|
2010-05-12 21:24:46 +00:00
|
|
|
(void)VOP_ADVLOCKPURGE(vp);
|
2015-11-29 21:42:26 +00:00
|
|
|
vp->v_lockf = NULL;
|
2003-10-05 02:48:04 +00:00
|
|
|
/*
|
|
|
|
* Delete from old mount point vnode list.
|
|
|
|
*/
|
2004-07-04 08:52:35 +00:00
|
|
|
delmntque(vp);
|
1997-02-10 02:22:35 +00:00
|
|
|
cache_purge(vp);
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
2005-06-14 20:31:53 +00:00
|
|
|
* Done with purge, reset to the standard lock and invalidate
|
|
|
|
* the vnode.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
2005-06-14 20:31:53 +00:00
|
|
|
VI_LOCK(vp);
|
2002-10-14 19:44:51 +00:00
|
|
|
vp->v_vnlock = &vp->v_lock;
|
2004-12-01 23:16:38 +00:00
|
|
|
vp->v_op = &dead_vnodeops;
|
2002-09-14 09:02:28 +00:00
|
|
|
vp->v_tag = "none";
|
2005-03-13 11:54:28 +00:00
|
|
|
vp->v_type = VBAD;
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Calculate the total number of references to a special device.
|
|
|
|
*/
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2006-01-21 19:42:10 +00:00
|
|
|
vcount(struct vnode *vp)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
|
|
|
int count;
|
|
|
|
|
2004-09-23 07:17:41 +00:00
|
|
|
dev_lock();
|
2002-10-24 19:38:56 +00:00
|
|
|
count = vp->v_rdev->si_usecount;
|
2004-09-23 07:17:41 +00:00
|
|
|
dev_unlock();
|
1994-05-24 10:09:53 +00:00
|
|
|
return (count);
|
|
|
|
}
|
1999-08-26 14:53:31 +00:00
|
|
|
|
2000-02-07 23:05:40 +00:00
|
|
|
/*
|
2004-06-16 09:47:26 +00:00
|
|
|
* Same as above, but using the struct cdev *as argument
|
2000-02-07 23:05:40 +00:00
|
|
|
*/
|
|
|
|
int
|
2006-01-21 19:42:10 +00:00
|
|
|
count_dev(struct cdev *dev)
|
2000-02-07 23:05:40 +00:00
|
|
|
{
|
2003-10-17 11:56:48 +00:00
|
|
|
int count;
|
2000-02-07 23:05:40 +00:00
|
|
|
|
2004-09-23 07:17:41 +00:00
|
|
|
dev_lock();
|
2003-10-17 11:56:48 +00:00
|
|
|
count = dev->si_usecount;
|
2004-09-23 07:17:41 +00:00
|
|
|
dev_unlock();
|
2003-10-17 11:56:48 +00:00
|
|
|
return(count);
|
2000-02-07 23:05:40 +00:00
|
|
|
}
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
 * Print out a description of a vnode.
 */
/*
 * Printable names for vnode types; vn_printf() indexes this table with
 * v_type.  Neither the pointers nor the strings are ever modified, so
 * the whole table is const.
 */
static const char * const typename[] =
{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD",
 "VMARKER"};
|
1994-05-24 10:09:53 +00:00
|
|
|
|
1994-05-25 09:21:21 +00:00
|
|
|
void
|
2005-01-24 13:58:08 +00:00
|
|
|
vn_printf(struct vnode *vp, const char *fmt, ...)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2005-01-24 13:58:08 +00:00
|
|
|
va_list ap;
|
2007-08-13 21:23:30 +00:00
|
|
|
char buf[256], buf2[16];
|
|
|
|
u_long flags;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2005-01-24 13:58:08 +00:00
|
|
|
va_start(ap, fmt);
|
|
|
|
vprintf(fmt, ap);
|
|
|
|
va_end(ap);
|
|
|
|
printf("%p: ", (void *)vp);
|
|
|
|
printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]);
|
|
|
|
printf(" usecount %d, writecount %d, refcount %d mountedhere %p\n",
|
2004-12-03 12:09:34 +00:00
|
|
|
vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere);
|
1994-05-24 10:09:53 +00:00
|
|
|
buf[0] = '\0';
|
2005-01-24 13:58:08 +00:00
|
|
|
buf[1] = '\0';
|
2002-08-04 10:29:36 +00:00
|
|
|
if (vp->v_vflag & VV_ROOT)
|
2007-08-13 21:23:30 +00:00
|
|
|
strlcat(buf, "|VV_ROOT", sizeof(buf));
|
|
|
|
if (vp->v_vflag & VV_ISTTY)
|
|
|
|
strlcat(buf, "|VV_ISTTY", sizeof(buf));
|
|
|
|
if (vp->v_vflag & VV_NOSYNC)
|
|
|
|
strlcat(buf, "|VV_NOSYNC", sizeof(buf));
|
2012-11-04 13:32:45 +00:00
|
|
|
if (vp->v_vflag & VV_ETERNALDEV)
|
|
|
|
strlcat(buf, "|VV_ETERNALDEV", sizeof(buf));
|
2007-08-13 21:23:30 +00:00
|
|
|
if (vp->v_vflag & VV_CACHEDLABEL)
|
|
|
|
strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf));
|
2002-08-04 10:29:36 +00:00
|
|
|
if (vp->v_vflag & VV_TEXT)
|
2007-08-13 21:23:30 +00:00
|
|
|
strlcat(buf, "|VV_TEXT", sizeof(buf));
|
|
|
|
if (vp->v_vflag & VV_COPYONWRITE)
|
|
|
|
strlcat(buf, "|VV_COPYONWRITE", sizeof(buf));
|
2002-08-04 10:29:36 +00:00
|
|
|
if (vp->v_vflag & VV_SYSTEM)
|
2007-08-13 21:23:30 +00:00
|
|
|
strlcat(buf, "|VV_SYSTEM", sizeof(buf));
|
|
|
|
if (vp->v_vflag & VV_PROCDEP)
|
|
|
|
strlcat(buf, "|VV_PROCDEP", sizeof(buf));
|
|
|
|
if (vp->v_vflag & VV_NOKNOTE)
|
|
|
|
strlcat(buf, "|VV_NOKNOTE", sizeof(buf));
|
2006-10-31 21:48:54 +00:00
|
|
|
if (vp->v_vflag & VV_DELETED)
|
2007-08-13 21:23:30 +00:00
|
|
|
strlcat(buf, "|VV_DELETED", sizeof(buf));
|
|
|
|
if (vp->v_vflag & VV_MD)
|
|
|
|
strlcat(buf, "|VV_MD", sizeof(buf));
|
2012-11-04 13:32:45 +00:00
|
|
|
if (vp->v_vflag & VV_FORCEINSMQ)
|
|
|
|
strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf));
|
|
|
|
flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV |
|
2007-08-13 21:23:30 +00:00
|
|
|
VV_CACHEDLABEL | VV_TEXT | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP |
|
2012-11-04 13:32:45 +00:00
|
|
|
VV_NOKNOTE | VV_DELETED | VV_MD | VV_FORCEINSMQ);
|
2007-08-13 21:23:30 +00:00
|
|
|
if (flags != 0) {
|
|
|
|
snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags);
|
|
|
|
strlcat(buf, buf2, sizeof(buf));
|
|
|
|
}
|
|
|
|
if (vp->v_iflag & VI_MOUNT)
|
|
|
|
strlcat(buf, "|VI_MOUNT", sizeof(buf));
|
2002-08-04 10:29:36 +00:00
|
|
|
if (vp->v_iflag & VI_DOOMED)
|
2007-08-13 21:23:30 +00:00
|
|
|
strlcat(buf, "|VI_DOOMED", sizeof(buf));
|
2002-08-04 10:29:36 +00:00
|
|
|
if (vp->v_iflag & VI_FREE)
|
2007-08-13 21:23:30 +00:00
|
|
|
strlcat(buf, "|VI_FREE", sizeof(buf));
|
2012-11-04 13:32:45 +00:00
|
|
|
if (vp->v_iflag & VI_ACTIVE)
|
|
|
|
strlcat(buf, "|VI_ACTIVE", sizeof(buf));
|
2007-08-13 21:23:30 +00:00
|
|
|
if (vp->v_iflag & VI_DOINGINACT)
|
|
|
|
strlcat(buf, "|VI_DOINGINACT", sizeof(buf));
|
|
|
|
if (vp->v_iflag & VI_OWEINACT)
|
|
|
|
strlcat(buf, "|VI_OWEINACT", sizeof(buf));
|
2015-11-27 01:45:40 +00:00
|
|
|
flags = vp->v_iflag & ~(VI_MOUNT | VI_DOOMED | VI_FREE |
|
2012-11-04 13:32:45 +00:00
|
|
|
VI_ACTIVE | VI_DOINGINACT | VI_OWEINACT);
|
2007-08-13 21:23:30 +00:00
|
|
|
if (flags != 0) {
|
|
|
|
snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags);
|
|
|
|
strlcat(buf, buf2, sizeof(buf));
|
|
|
|
}
|
2005-01-24 13:58:08 +00:00
|
|
|
printf(" flags (%s)\n", buf + 1);
|
2004-12-03 12:09:34 +00:00
|
|
|
if (mtx_owned(VI_MTX(vp)))
|
|
|
|
printf(" VI_LOCKed");
|
2005-03-27 07:52:12 +00:00
|
|
|
if (vp->v_object != NULL)
|
2013-10-01 20:18:33 +00:00
|
|
|
printf(" v_object %p ref %d pages %d "
|
|
|
|
"cleanbuf %d dirtybuf %d\n",
|
2005-03-13 11:54:28 +00:00
|
|
|
vp->v_object, vp->v_object->ref_count,
|
2013-10-01 20:18:33 +00:00
|
|
|
vp->v_object->resident_page_count,
|
2015-09-26 22:16:54 +00:00
|
|
|
vp->v_bufobj.bo_clean.bv_cnt,
|
|
|
|
vp->v_bufobj.bo_dirty.bv_cnt);
|
2005-01-24 13:58:08 +00:00
|
|
|
printf(" ");
|
2002-10-14 03:20:36 +00:00
|
|
|
lockmgr_printinfo(vp->v_vnlock);
|
2003-03-03 19:15:40 +00:00
|
|
|
if (vp->v_data != NULL)
|
1994-05-24 10:09:53 +00:00
|
|
|
VOP_PRINT(vp);
|
|
|
|
}
|
|
|
|
|
1995-04-16 11:33:33 +00:00
|
|
|
#ifdef DDB
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* List all of the locked vnodes in the system.
|
|
|
|
* Called when debugging the kernel.
|
|
|
|
*/
|
2002-06-29 04:45:09 +00:00
|
|
|
DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2013-08-17 14:13:45 +00:00
|
|
|
struct mount *mp;
|
1997-02-25 19:33:23 +00:00
|
|
|
struct vnode *vp;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2003-05-12 14:37:47 +00:00
|
|
|
/*
|
|
|
|
* Note: because this is DDB, we can't obey the locking semantics
|
|
|
|
* for these structures, which means we could catch an inconsistent
|
|
|
|
* state and dereference a nasty pointer. Not much to be done
|
|
|
|
* about that.
|
|
|
|
*/
|
2007-12-28 00:47:31 +00:00
|
|
|
db_printf("Locked vnodes\n");
|
2013-08-17 14:13:45 +00:00
|
|
|
TAILQ_FOREACH(mp, &mountlist, mnt_list) {
|
2001-10-23 01:21:29 +00:00
|
|
|
TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
|
2013-08-17 14:13:45 +00:00
|
|
|
if (vp->v_type != VMARKER && VOP_ISLOCKED(vp))
|
2016-08-10 16:12:31 +00:00
|
|
|
vn_printf(vp, "vnode ");
|
1997-02-25 19:33:23 +00:00
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
}
|
2006-09-04 22:15:44 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Show details about the given vnode.
|
|
|
|
*/
|
|
|
|
DB_SHOW_COMMAND(vnode, db_show_vnode)
|
|
|
|
{
|
|
|
|
struct vnode *vp;
|
|
|
|
|
|
|
|
if (!have_addr)
|
|
|
|
return;
|
|
|
|
vp = (struct vnode *)addr;
|
|
|
|
vn_printf(vp, "vnode ");
|
|
|
|
}
|
2008-04-26 13:04:48 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Show details about the given mount point.
|
|
|
|
*/
|
|
|
|
DB_SHOW_COMMAND(mount, db_show_mount)
|
|
|
|
{
|
|
|
|
struct mount *mp;
|
2009-11-19 14:33:03 +00:00
|
|
|
struct vfsopt *opt;
|
2008-04-26 13:04:48 +00:00
|
|
|
struct statfs *sp;
|
|
|
|
struct vnode *vp;
|
|
|
|
char buf[512];
|
2012-01-17 01:08:01 +00:00
|
|
|
uint64_t mflags;
|
2008-04-26 13:04:48 +00:00
|
|
|
u_int flags;
|
|
|
|
|
|
|
|
if (!have_addr) {
|
|
|
|
/* No address given, print short info about all mount points. */
|
|
|
|
TAILQ_FOREACH(mp, &mountlist, mnt_list) {
|
|
|
|
db_printf("%p %s on %s (%s)\n", mp,
|
|
|
|
mp->mnt_stat.f_mntfromname,
|
|
|
|
mp->mnt_stat.f_mntonname,
|
|
|
|
mp->mnt_stat.f_fstypename);
|
2008-05-18 21:08:12 +00:00
|
|
|
if (db_pager_quit)
|
|
|
|
break;
|
2008-04-26 13:04:48 +00:00
|
|
|
}
|
|
|
|
db_printf("\nMore info: show mount <addr>\n");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
mp = (struct mount *)addr;
|
|
|
|
db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname,
|
|
|
|
mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename);
|
|
|
|
|
|
|
|
buf[0] = '\0';
|
2012-01-17 01:08:01 +00:00
|
|
|
mflags = mp->mnt_flag;
|
2008-04-26 13:04:48 +00:00
|
|
|
#define MNT_FLAG(flag) do { \
|
2012-01-17 01:08:01 +00:00
|
|
|
if (mflags & (flag)) { \
|
2008-04-26 13:04:48 +00:00
|
|
|
if (buf[0] != '\0') \
|
|
|
|
strlcat(buf, ", ", sizeof(buf)); \
|
|
|
|
strlcat(buf, (#flag) + 4, sizeof(buf)); \
|
2012-01-17 01:08:01 +00:00
|
|
|
mflags &= ~(flag); \
|
2008-04-26 13:04:48 +00:00
|
|
|
} \
|
|
|
|
} while (0)
|
|
|
|
MNT_FLAG(MNT_RDONLY);
|
|
|
|
MNT_FLAG(MNT_SYNCHRONOUS);
|
|
|
|
MNT_FLAG(MNT_NOEXEC);
|
|
|
|
MNT_FLAG(MNT_NOSUID);
|
2012-11-04 13:31:41 +00:00
|
|
|
MNT_FLAG(MNT_NFS4ACLS);
|
2008-04-26 13:04:48 +00:00
|
|
|
MNT_FLAG(MNT_UNION);
|
|
|
|
MNT_FLAG(MNT_ASYNC);
|
|
|
|
MNT_FLAG(MNT_SUIDDIR);
|
|
|
|
MNT_FLAG(MNT_SOFTDEP);
|
|
|
|
MNT_FLAG(MNT_NOSYMFOLLOW);
|
|
|
|
MNT_FLAG(MNT_GJOURNAL);
|
|
|
|
MNT_FLAG(MNT_MULTILABEL);
|
|
|
|
MNT_FLAG(MNT_ACLS);
|
|
|
|
MNT_FLAG(MNT_NOATIME);
|
|
|
|
MNT_FLAG(MNT_NOCLUSTERR);
|
|
|
|
MNT_FLAG(MNT_NOCLUSTERW);
|
2012-11-04 13:31:41 +00:00
|
|
|
MNT_FLAG(MNT_SUJ);
|
2008-04-26 13:04:48 +00:00
|
|
|
MNT_FLAG(MNT_EXRDONLY);
|
|
|
|
MNT_FLAG(MNT_EXPORTED);
|
|
|
|
MNT_FLAG(MNT_DEFEXPORTED);
|
|
|
|
MNT_FLAG(MNT_EXPORTANON);
|
|
|
|
MNT_FLAG(MNT_EXKERB);
|
|
|
|
MNT_FLAG(MNT_EXPUBLIC);
|
|
|
|
MNT_FLAG(MNT_LOCAL);
|
|
|
|
MNT_FLAG(MNT_QUOTA);
|
|
|
|
MNT_FLAG(MNT_ROOTFS);
|
|
|
|
MNT_FLAG(MNT_USER);
|
|
|
|
MNT_FLAG(MNT_IGNORE);
|
|
|
|
MNT_FLAG(MNT_UPDATE);
|
|
|
|
MNT_FLAG(MNT_DELEXPORT);
|
|
|
|
MNT_FLAG(MNT_RELOAD);
|
|
|
|
MNT_FLAG(MNT_FORCE);
|
|
|
|
MNT_FLAG(MNT_SNAPSHOT);
|
|
|
|
MNT_FLAG(MNT_BYFSID);
|
|
|
|
#undef MNT_FLAG
|
2012-01-17 01:08:01 +00:00
|
|
|
if (mflags != 0) {
|
2008-04-26 13:04:48 +00:00
|
|
|
if (buf[0] != '\0')
|
|
|
|
strlcat(buf, ", ", sizeof(buf));
|
|
|
|
snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
|
2012-01-17 01:08:01 +00:00
|
|
|
"0x%016jx", mflags);
|
2008-04-26 13:04:48 +00:00
|
|
|
}
|
|
|
|
db_printf(" mnt_flag = %s\n", buf);
|
|
|
|
|
|
|
|
buf[0] = '\0';
|
|
|
|
flags = mp->mnt_kern_flag;
|
|
|
|
#define MNT_KERN_FLAG(flag) do { \
|
|
|
|
if (flags & (flag)) { \
|
|
|
|
if (buf[0] != '\0') \
|
|
|
|
strlcat(buf, ", ", sizeof(buf)); \
|
|
|
|
strlcat(buf, (#flag) + 5, sizeof(buf)); \
|
|
|
|
flags &= ~(flag); \
|
|
|
|
} \
|
|
|
|
} while (0)
|
|
|
|
MNT_KERN_FLAG(MNTK_UNMOUNTF);
|
|
|
|
MNT_KERN_FLAG(MNTK_ASYNC);
|
|
|
|
MNT_KERN_FLAG(MNTK_SOFTDEP);
|
|
|
|
MNT_KERN_FLAG(MNTK_NOINSMNTQ);
|
2010-04-03 11:15:55 +00:00
|
|
|
MNT_KERN_FLAG(MNTK_DRAINING);
|
|
|
|
MNT_KERN_FLAG(MNTK_REFEXPIRE);
|
|
|
|
MNT_KERN_FLAG(MNTK_EXTENDED_SHARED);
|
|
|
|
MNT_KERN_FLAG(MNTK_SHARED_WRITES);
|
2012-11-04 13:33:13 +00:00
|
|
|
MNT_KERN_FLAG(MNTK_NO_IOPF);
|
|
|
|
MNT_KERN_FLAG(MNTK_VGONE_UPPER);
|
|
|
|
MNT_KERN_FLAG(MNTK_VGONE_WAITER);
|
|
|
|
MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT);
|
|
|
|
MNT_KERN_FLAG(MNTK_MARKER);
|
2015-04-15 20:16:31 +00:00
|
|
|
MNT_KERN_FLAG(MNTK_USES_BCACHE);
|
2012-03-09 00:12:05 +00:00
|
|
|
MNT_KERN_FLAG(MNTK_NOASYNC);
|
2008-04-26 13:04:48 +00:00
|
|
|
MNT_KERN_FLAG(MNTK_UNMOUNT);
|
|
|
|
MNT_KERN_FLAG(MNTK_MWAIT);
|
|
|
|
MNT_KERN_FLAG(MNTK_SUSPEND);
|
|
|
|
MNT_KERN_FLAG(MNTK_SUSPEND2);
|
|
|
|
MNT_KERN_FLAG(MNTK_SUSPENDED);
|
|
|
|
MNT_KERN_FLAG(MNTK_LOOKUP_SHARED);
|
2010-04-03 11:15:55 +00:00
|
|
|
MNT_KERN_FLAG(MNTK_NOKNOTE);
|
2008-04-26 13:04:48 +00:00
|
|
|
#undef MNT_KERN_FLAG
|
|
|
|
if (flags != 0) {
|
|
|
|
if (buf[0] != '\0')
|
|
|
|
strlcat(buf, ", ", sizeof(buf));
|
|
|
|
snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
|
|
|
|
"0x%08x", flags);
|
|
|
|
}
|
|
|
|
db_printf(" mnt_kern_flag = %s\n", buf);
|
|
|
|
|
2009-11-19 14:33:03 +00:00
|
|
|
db_printf(" mnt_opt = ");
|
|
|
|
opt = TAILQ_FIRST(mp->mnt_opt);
|
|
|
|
if (opt != NULL) {
|
|
|
|
db_printf("%s", opt->name);
|
|
|
|
opt = TAILQ_NEXT(opt, link);
|
|
|
|
while (opt != NULL) {
|
|
|
|
db_printf(", %s", opt->name);
|
|
|
|
opt = TAILQ_NEXT(opt, link);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
db_printf("\n");
|
|
|
|
|
2008-04-26 13:04:48 +00:00
|
|
|
sp = &mp->mnt_stat;
|
|
|
|
db_printf(" mnt_stat = { version=%u type=%u flags=0x%016jx "
|
|
|
|
"bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju "
|
|
|
|
"ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju "
|
|
|
|
"asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n",
|
|
|
|
(u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags,
|
|
|
|
(uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize,
|
|
|
|
(uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree,
|
|
|
|
(intmax_t)sp->f_bavail, (uintmax_t)sp->f_files,
|
|
|
|
(intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites,
|
|
|
|
(uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads,
|
|
|
|
(uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax,
|
|
|
|
(u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]);
|
|
|
|
|
|
|
|
db_printf(" mnt_cred = { uid=%u ruid=%u",
|
|
|
|
(u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid);
|
2009-05-27 14:11:23 +00:00
|
|
|
if (jailed(mp->mnt_cred))
|
2008-04-26 13:04:48 +00:00
|
|
|
db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id);
|
|
|
|
db_printf(" }\n");
|
|
|
|
db_printf(" mnt_ref = %d\n", mp->mnt_ref);
|
|
|
|
db_printf(" mnt_gen = %d\n", mp->mnt_gen);
|
|
|
|
db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize);
|
2012-04-20 06:50:44 +00:00
|
|
|
db_printf(" mnt_activevnodelistsize = %d\n",
|
|
|
|
mp->mnt_activevnodelistsize);
|
2008-04-26 13:04:48 +00:00
|
|
|
db_printf(" mnt_writeopcount = %d\n", mp->mnt_writeopcount);
|
|
|
|
db_printf(" mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen);
|
|
|
|
db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max);
|
|
|
|
db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed);
|
2015-02-17 09:31:58 +00:00
|
|
|
db_printf(" mnt_lockref = %d\n", mp->mnt_lockref);
|
2008-04-26 13:04:48 +00:00
|
|
|
db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes);
|
|
|
|
db_printf(" mnt_secondary_accwrites = %d\n",
|
|
|
|
mp->mnt_secondary_accwrites);
|
|
|
|
db_printf(" mnt_gjprovider = %s\n",
|
|
|
|
mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL");
|
|
|
|
|
2012-04-20 06:50:44 +00:00
|
|
|
db_printf("\n\nList of active vnodes\n");
|
|
|
|
TAILQ_FOREACH(vp, &mp->mnt_activevnodelist, v_actfreelist) {
|
2008-05-18 21:08:12 +00:00
|
|
|
if (vp->v_type != VMARKER) {
|
2008-04-26 13:04:48 +00:00
|
|
|
vn_printf(vp, "vnode ");
|
2008-05-18 21:08:12 +00:00
|
|
|
if (db_pager_quit)
|
|
|
|
break;
|
|
|
|
}
|
2008-04-26 13:04:48 +00:00
|
|
|
}
|
2012-04-20 06:50:44 +00:00
|
|
|
db_printf("\n\nList of inactive vnodes\n");
|
|
|
|
TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
|
|
|
|
if (vp->v_type != VMARKER && (vp->v_iflag & VI_ACTIVE) == 0) {
|
|
|
|
vn_printf(vp, "vnode ");
|
|
|
|
if (db_pager_quit)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2008-04-26 13:04:48 +00:00
|
|
|
}
|
2006-09-04 22:15:44 +00:00
|
|
|
#endif /* DDB */
|
1994-05-24 10:09:53 +00:00
|
|
|
|
- Introduce a new struct xvfsconf, the userland version of struct vfsconf.
- Make getvfsbyname() take a struct xvfsconf *.
- Convert several consumers of getvfsbyname() to use struct xvfsconf.
- Correct the getvfsbyname.3 manpage.
- Create a new vfs.conflist sysctl to dump all the struct xvfsconf in the
kernel, and rewrite getvfsbyname() to use this instead of the weird
existing API.
- Convert some {set,get,end}vfsent() consumers to use the new vfs.conflist
sysctl.
- Convert a vfsload() call in nfsiod.c to kldload() and remove the useless
vfsisloadable() and endvfsent() calls.
- Add a warning printf() in vfs_sysctl() to tell people they are using
an old userland.
After these changes, it's possible to modify struct vfsconf without
breaking the binary compatibility. Please note that these changes don't
break this compatibility either.
When bp will have updated mount_smbfs(8) with the patch I sent him, there
will be no more consumers of the {set,get,end}vfsent(), vfsisloadable()
and vfsload() API, and I will promptly delete it.
2002-08-10 20:19:04 +00:00
|
|
|
/*
|
|
|
|
* Fill in a struct xvfsconf based on a struct vfsconf.
|
|
|
|
*/
|
2012-08-22 20:05:34 +00:00
|
|
|
static int
|
|
|
|
vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp)
|
- Introduce a new struct xvfsconf, the userland version of struct vfsconf.
- Make getvfsbyname() take a struct xvfsconf *.
- Convert several consumers of getvfsbyname() to use struct xvfsconf.
- Correct the getvfsbyname.3 manpage.
- Create a new vfs.conflist sysctl to dump all the struct xvfsconf in the
kernel, and rewrite getvfsbyname() to use this instead of the weird
existing API.
- Convert some {set,get,end}vfsent() consumers to use the new vfs.conflist
sysctl.
- Convert a vfsload() call in nfsiod.c to kldload() and remove the useless
vfsisloadable() and endvfsent() calls.
- Add a warning printf() in vfs_sysctl() to tell people they are using
an old userland.
After these changes, it's possible to modify struct vfsconf without
breaking the binary compatibility. Please note that these changes don't
break this compatibility either.
When bp will have updated mount_smbfs(8) with the patch I sent him, there
will be no more consumers of the {set,get,end}vfsent(), vfsisloadable()
and vfsload() API, and I will promptly delete it.
2002-08-10 20:19:04 +00:00
|
|
|
{
|
2012-08-22 20:05:34 +00:00
|
|
|
struct xvfsconf xvfsp;
|
- Introduce a new struct xvfsconf, the userland version of struct vfsconf.
- Make getvfsbyname() take a struct xvfsconf *.
- Convert several consumers of getvfsbyname() to use struct xvfsconf.
- Correct the getvfsbyname.3 manpage.
- Create a new vfs.conflist sysctl to dump all the struct xvfsconf in the
kernel, and rewrite getvfsbyname() to use this instead of the weird
existing API.
- Convert some {set,get,end}vfsent() consumers to use the new vfs.conflist
sysctl.
- Convert a vfsload() call in nfsiod.c to kldload() and remove the useless
vfsisloadable() and endvfsent() calls.
- Add a warning printf() in vfs_sysctl() to tell people they are using
an old userland.
After these changes, it's possible to modify struct vfsconf without
breaking the binary compatibility. Please note that these changes don't
break this compatibility either.
When bp will have updated mount_smbfs(8) with the patch I sent him, there
will be no more consumers of the {set,get,end}vfsent(), vfsisloadable()
and vfsload() API, and I will promptly delete it.
2002-08-10 20:19:04 +00:00
|
|
|
|
2012-08-22 20:05:34 +00:00
|
|
|
bzero(&xvfsp, sizeof(xvfsp));
|
|
|
|
strcpy(xvfsp.vfc_name, vfsp->vfc_name);
|
|
|
|
xvfsp.vfc_typenum = vfsp->vfc_typenum;
|
|
|
|
xvfsp.vfc_refcount = vfsp->vfc_refcount;
|
|
|
|
xvfsp.vfc_flags = vfsp->vfc_flags;
|
- Introduce a new struct xvfsconf, the userland version of struct vfsconf.
- Make getvfsbyname() take a struct xvfsconf *.
- Convert several consumers of getvfsbyname() to use struct xvfsconf.
- Correct the getvfsbyname.3 manpage.
- Create a new vfs.conflist sysctl to dump all the struct xvfsconf in the
kernel, and rewrite getvfsbyname() to use this instead of the weird
existing API.
- Convert some {set,get,end}vfsent() consumers to use the new vfs.conflist
sysctl.
- Convert a vfsload() call in nfsiod.c to kldload() and remove the useless
vfsisloadable() and endvfsent() calls.
- Add a warning printf() in vfs_sysctl() to tell people they are using
an old userland.
After these changes, it's possible to modify struct vfsconf without
breaking the binary compatibility. Please note that these changes don't
break this compatibility either.
When bp will have updated mount_smbfs(8) with the patch I sent him, there
will be no more consumers of the {set,get,end}vfsent(), vfsisloadable()
and vfsload() API, and I will promptly delete it.
2002-08-10 20:19:04 +00:00
|
|
|
/*
|
|
|
|
* These are unused in userland, we keep them
|
|
|
|
* to not break binary compatibility.
|
|
|
|
*/
|
2012-08-22 20:05:34 +00:00
|
|
|
xvfsp.vfc_vfsops = NULL;
|
|
|
|
xvfsp.vfc_next = NULL;
|
|
|
|
return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef COMPAT_FREEBSD32
|
|
|
|
struct xvfsconf32 {
|
|
|
|
uint32_t vfc_vfsops;
|
|
|
|
char vfc_name[MFSNAMELEN];
|
|
|
|
int32_t vfc_typenum;
|
|
|
|
int32_t vfc_refcount;
|
|
|
|
int32_t vfc_flags;
|
|
|
|
uint32_t vfc_next;
|
|
|
|
};
|
|
|
|
|
|
|
|
static int
|
|
|
|
vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp)
|
|
|
|
{
|
|
|
|
struct xvfsconf32 xvfsp;
|
|
|
|
|
2017-04-04 17:32:08 +00:00
|
|
|
bzero(&xvfsp, sizeof(xvfsp));
|
2012-08-22 20:05:34 +00:00
|
|
|
strcpy(xvfsp.vfc_name, vfsp->vfc_name);
|
|
|
|
xvfsp.vfc_typenum = vfsp->vfc_typenum;
|
|
|
|
xvfsp.vfc_refcount = vfsp->vfc_refcount;
|
|
|
|
xvfsp.vfc_flags = vfsp->vfc_flags;
|
|
|
|
return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
|
- Introduce a new struct xvfsconf, the userland version of struct vfsconf.
- Make getvfsbyname() take a struct xvfsconf *.
- Convert several consumers of getvfsbyname() to use struct xvfsconf.
- Correct the getvfsbyname.3 manpage.
- Create a new vfs.conflist sysctl to dump all the struct xvfsconf in the
kernel, and rewrite getvfsbyname() to use this instead of the weird
existing API.
- Convert some {set,get,end}vfsent() consumers to use the new vfs.conflist
sysctl.
- Convert a vfsload() call in nfsiod.c to kldload() and remove the useless
vfsisloadable() and endvfsent() calls.
- Add a warning printf() in vfs_sysctl() to tell people they are using
an old userland.
After these changes, it's possible to modify struct vfsconf without
breaking the binary compatibility. Please note that these changes don't
break this compatibility either.
When bp will have updated mount_smbfs(8) with the patch I sent him, there
will be no more consumers of the {set,get,end}vfsent(), vfsisloadable()
and vfsload() API, and I will promptly delete it.
2002-08-10 20:19:04 +00:00
|
|
|
}
|
2012-08-22 20:05:34 +00:00
|
|
|
#endif
|
- Introduce a new struct xvfsconf, the userland version of struct vfsconf.
- Make getvfsbyname() take a struct xvfsconf *.
- Convert several consumers of getvfsbyname() to use struct xvfsconf.
- Correct the getvfsbyname.3 manpage.
- Create a new vfs.conflist sysctl to dump all the struct xvfsconf in the
kernel, and rewrite getvfsbyname() to use this instead of the weird
existing API.
- Convert some {set,get,end}vfsent() consumers to use the new vfs.conflist
sysctl.
- Convert a vfsload() call in nfsiod.c to kldload() and remove the useless
vfsisloadable() and endvfsent() calls.
- Add a warning printf() in vfs_sysctl() to tell people they are using
an old userland.
After these changes, it's possible to modify struct vfsconf without
breaking the binary compatibility. Please note that these changes don't
break this compatibility either.
When bp will have updated mount_smbfs(8) with the patch I sent him, there
will be no more consumers of the {set,get,end}vfsent(), vfsisloadable()
and vfsload() API, and I will promptly delete it.
2002-08-10 20:19:04 +00:00
|
|
|
|
2004-04-11 21:09:22 +00:00
|
|
|
/*
|
|
|
|
* Top level filesystem related information gathering.
|
|
|
|
*/
|
- Introduce a new struct xvfsconf, the userland version of struct vfsconf.
- Make getvfsbyname() take a struct xvfsconf *.
- Convert several consumers of getvfsbyname() to use struct xvfsconf.
- Correct the getvfsbyname.3 manpage.
- Create a new vfs.conflist sysctl to dump all the struct xvfsconf in the
kernel, and rewrite getvfsbyname() to use this instead of the weird
existing API.
- Convert some {set,get,end}vfsent() consumers to use the new vfs.conflist
sysctl.
- Convert a vfsload() call in nfsiod.c to kldload() and remove the useless
vfsisloadable() and endvfsent() calls.
- Add a warning printf() in vfs_sysctl() to tell people they are using
an old userland.
After these changes, it's possible to modify struct vfsconf without
breaking the binary compatibility. Please note that these changes don't
break this compatibility either.
When bp will have updated mount_smbfs(8) with the patch I sent him, there
will be no more consumers of the {set,get,end}vfsent(), vfsisloadable()
and vfsload() API, and I will promptly delete it.
2002-08-10 20:19:04 +00:00
|
|
|
static int
|
|
|
|
sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
|
|
|
|
{
|
|
|
|
struct vfsconf *vfsp;
|
2004-07-27 22:32:01 +00:00
|
|
|
int error;
|
- Introduce a new struct xvfsconf, the userland version of struct vfsconf.
- Make getvfsbyname() take a struct xvfsconf *.
- Convert several consumers of getvfsbyname() to use struct xvfsconf.
- Correct the getvfsbyname.3 manpage.
- Create a new vfs.conflist sysctl to dump all the struct xvfsconf in the
kernel, and rewrite getvfsbyname() to use this instead of the weird
existing API.
- Convert some {set,get,end}vfsent() consumers to use the new vfs.conflist
sysctl.
- Convert a vfsload() call in nfsiod.c to kldload() and remove the useless
vfsisloadable() and endvfsent() calls.
- Add a warning printf() in vfs_sysctl() to tell people they are using
an old userland.
After these changes, it's possible to modify struct vfsconf without
breaking the binary compatibility. Please note that these changes don't
break this compatibility either.
When bp will have updated mount_smbfs(8) with the patch I sent him, there
will be no more consumers of the {set,get,end}vfsent(), vfsisloadable()
and vfsload() API, and I will promptly delete it.
2002-08-10 20:19:04 +00:00
|
|
|
|
2004-07-27 22:32:01 +00:00
|
|
|
error = 0;
|
2014-08-03 03:27:54 +00:00
|
|
|
vfsconf_slock();
|
2004-07-27 22:32:01 +00:00
|
|
|
TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
|
2012-08-22 20:05:34 +00:00
|
|
|
#ifdef COMPAT_FREEBSD32
|
|
|
|
if (req->flags & SCTL_MASK32)
|
|
|
|
error = vfsconf2x32(req, vfsp);
|
|
|
|
else
|
|
|
|
#endif
|
|
|
|
error = vfsconf2x(req, vfsp);
|
2004-07-27 22:32:01 +00:00
|
|
|
if (error)
|
|
|
|
break;
|
- Introduce a new struct xvfsconf, the userland version of struct vfsconf.
- Make getvfsbyname() take a struct xvfsconf *.
- Convert several consumers of getvfsbyname() to use struct xvfsconf.
- Correct the getvfsbyname.3 manpage.
- Create a new vfs.conflist sysctl to dump all the struct xvfsconf in the
kernel, and rewrite getvfsbyname() to use this instead of the weird
existing API.
- Convert some {set,get,end}vfsent() consumers to use the new vfs.conflist
sysctl.
- Convert a vfsload() call in nfsiod.c to kldload() and remove the useless
vfsisloadable() and endvfsent() calls.
- Add a warning printf() in vfs_sysctl() to tell people they are using
an old userland.
After these changes, it's possible to modify struct vfsconf without
breaking the binary compatibility. Please note that these changes don't
break this compatibility either.
When bp will have updated mount_smbfs(8) with the patch I sent him, there
will be no more consumers of the {set,get,end}vfsent(), vfsisloadable()
and vfsload() API, and I will promptly delete it.
2002-08-10 20:19:04 +00:00
|
|
|
}
|
2014-08-03 03:27:54 +00:00
|
|
|
vfsconf_sunlock();
|
- Introduce a new struct xvfsconf, the userland version of struct vfsconf.
- Make getvfsbyname() take a struct xvfsconf *.
- Convert several consumers of getvfsbyname() to use struct xvfsconf.
- Correct the getvfsbyname.3 manpage.
- Create a new vfs.conflist sysctl to dump all the struct xvfsconf in the
kernel, and rewrite getvfsbyname() to use this instead of the weird
existing API.
- Convert some {set,get,end}vfsent() consumers to use the new vfs.conflist
sysctl.
- Convert a vfsload() call in nfsiod.c to kldload() and remove the useless
vfsisloadable() and endvfsent() calls.
- Add a warning printf() in vfs_sysctl() to tell people they are using
an old userland.
After these changes, it's possible to modify struct vfsconf without
breaking the binary compatibility. Please note that these changes don't
break this compatibility either.
When bp will have updated mount_smbfs(8) with the patch I sent him, there
will be no more consumers of the {set,get,end}vfsent(), vfsisloadable()
and vfsload() API, and I will promptly delete it.
2002-08-10 20:19:04 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2014-08-03 03:27:54 +00:00
|
|
|
/* Read-only opaque node "conflist" under _vfs, served by sysctl_vfs_conflist(). */
SYSCTL_PROC(_vfs, OID_AUTO, conflist,
    CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE,
    NULL, 0, sysctl_vfs_conflist, "S,xvfsconf",
    "List of all configured filesystems");
|
|
|
|
|
2004-04-11 21:09:22 +00:00
|
|
|
#ifndef BURN_BRIDGES
|
2002-03-19 21:25:46 +00:00
|
|
|
static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
|
1997-03-03 12:58:20 +00:00
|
|
|
|
1997-03-04 18:31:56 +00:00
|
|
|
static int
|
2000-07-04 11:25:35 +00:00
|
|
|
vfs_sysctl(SYSCTL_HANDLER_ARGS)
|
1997-03-02 11:06:22 +00:00
|
|
|
{
|
1997-03-04 18:31:56 +00:00
|
|
|
int *name = (int *)arg1 - 1; /* XXX */
|
|
|
|
u_int namelen = arg2 + 1; /* XXX */
|
1997-03-02 11:06:22 +00:00
|
|
|
struct vfsconf *vfsp;
|
- Introduce a new struct xvfsconf, the userland version of struct vfsconf.
- Make getvfsbyname() take a struct xvfsconf *.
- Convert several consumers of getvfsbyname() to use struct xvfsconf.
- Correct the getvfsbyname.3 manpage.
- Create a new vfs.conflist sysctl to dump all the struct xvfsconf in the
kernel, and rewrite getvfsbyname() to use this instead of the weird
existing API.
- Convert some {set,get,end}vfsent() consumers to use the new vfs.conflist
sysctl.
- Convert a vfsload() call in nfsiod.c to kldload() and remove the useless
vfsisloadable() and endvfsent() calls.
- Add a warning printf() in vfs_sysctl() to tell people they are using
an old userland.
After these changes, it's possible to modify struct vfsconf without
breaking the binary compatibility. Please note that these changes don't
break this compatibility either.
When bp will have updated mount_smbfs(8) with the patch I sent him, there
will be no more consumers of the {set,get,end}vfsent(), vfsisloadable()
and vfsload() API, and I will promptly delete it.
2002-08-10 20:19:04 +00:00
|
|
|
|
2011-10-07 09:51:12 +00:00
|
|
|
log(LOG_WARNING, "userland calling deprecated sysctl, "
|
- Introduce a new struct xvfsconf, the userland version of struct vfsconf.
- Make getvfsbyname() take a struct xvfsconf *.
- Convert several consumers of getvfsbyname() to use struct xvfsconf.
- Correct the getvfsbyname.3 manpage.
- Create a new vfs.conflist sysctl to dump all the struct xvfsconf in the
kernel, and rewrite getvfsbyname() to use this instead of the weird
existing API.
- Convert some {set,get,end}vfsent() consumers to use the new vfs.conflist
sysctl.
- Convert a vfsload() call in nfsiod.c to kldload() and remove the useless
vfsisloadable() and endvfsent() calls.
- Add a warning printf() in vfs_sysctl() to tell people they are using
an old userland.
After these changes, it's possible to modify struct vfsconf without
breaking the binary compatibility. Please note that these changes don't
break this compatibility either.
When bp will have updated mount_smbfs(8) with the patch I sent him, there
will be no more consumers of the {set,get,end}vfsent(), vfsisloadable()
and vfsload() API, and I will promptly delete it.
2002-08-10 20:19:04 +00:00
|
|
|
"please rebuild world\n");
|
1997-03-02 11:06:22 +00:00
|
|
|
|
1998-08-29 13:13:10 +00:00
|
|
|
#if 1 || defined(COMPAT_PRELITE2)
|
1997-03-03 12:58:20 +00:00
|
|
|
/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
|
1997-03-04 18:31:56 +00:00
|
|
|
if (namelen == 1)
|
1997-03-03 12:58:20 +00:00
|
|
|
return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
|
|
|
|
#endif
|
1997-03-02 11:06:22 +00:00
|
|
|
|
1997-03-03 12:58:20 +00:00
|
|
|
switch (name[1]) {
|
|
|
|
case VFS_MAXTYPENUM:
|
|
|
|
if (namelen != 2)
|
|
|
|
return (ENOTDIR);
|
|
|
|
return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
|
|
|
|
case VFS_CONF:
|
|
|
|
if (namelen != 3)
|
|
|
|
return (ENOTDIR); /* overloaded */
|
2014-08-03 03:27:54 +00:00
|
|
|
vfsconf_slock();
|
|
|
|
TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
|
1997-03-03 12:58:20 +00:00
|
|
|
if (vfsp->vfc_typenum == name[2])
|
|
|
|
break;
|
2014-08-03 03:27:54 +00:00
|
|
|
}
|
|
|
|
vfsconf_sunlock();
|
1997-03-03 12:58:20 +00:00
|
|
|
if (vfsp == NULL)
|
|
|
|
return (EOPNOTSUPP);
|
2012-08-22 20:05:34 +00:00
|
|
|
#ifdef COMPAT_FREEBSD32
|
|
|
|
if (req->flags & SCTL_MASK32)
|
|
|
|
return (vfsconf2x32(req, vfsp));
|
|
|
|
else
|
|
|
|
#endif
|
|
|
|
return (vfsconf2x(req, vfsp));
|
1997-03-03 12:58:20 +00:00
|
|
|
}
|
|
|
|
return (EOPNOTSUPP);
|
|
|
|
}
|
1997-03-02 11:06:22 +00:00
|
|
|
|
2014-08-03 03:27:54 +00:00
|
|
|
/*
 * Register the "generic" node under the vfs sysctl tree.  Requests on
 * this node are handled by vfs_sysctl().
 */
static SYSCTL_NODE(_vfs, VFS_GENERIC, generic,
    CTLFLAG_RD | CTLFLAG_SKIP | CTLFLAG_MPSAFE,
    vfs_sysctl, "Generic filesystem");
|
1997-03-04 18:31:56 +00:00
|
|
|
|
1998-08-29 13:13:10 +00:00
|
|
|
#if 1 || defined(COMPAT_PRELITE2)
|
1997-03-02 11:06:22 +00:00
|
|
|
|
|
|
|
static int
|
2000-07-04 11:25:35 +00:00
|
|
|
sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
|
1997-03-02 11:06:22 +00:00
|
|
|
{
|
|
|
|
int error;
|
|
|
|
struct vfsconf *vfsp;
|
1997-03-03 12:58:20 +00:00
|
|
|
struct ovfsconf ovfs;
|
1997-03-02 11:06:22 +00:00
|
|
|
|
2014-08-03 03:27:54 +00:00
|
|
|
vfsconf_slock();
|
2004-07-27 22:32:01 +00:00
|
|
|
TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
|
2005-05-06 02:50:00 +00:00
|
|
|
bzero(&ovfs, sizeof(ovfs));
|
1997-03-02 11:06:22 +00:00
|
|
|
ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */
|
|
|
|
strcpy(ovfs.vfc_name, vfsp->vfc_name);
|
|
|
|
ovfs.vfc_index = vfsp->vfc_typenum;
|
|
|
|
ovfs.vfc_refcount = vfsp->vfc_refcount;
|
|
|
|
ovfs.vfc_flags = vfsp->vfc_flags;
|
|
|
|
error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
|
2014-08-03 03:27:54 +00:00
|
|
|
if (error != 0) {
|
|
|
|
vfsconf_sunlock();
|
|
|
|
return (error);
|
|
|
|
}
|
1997-03-02 11:06:22 +00:00
|
|
|
}
|
2014-08-03 03:27:54 +00:00
|
|
|
vfsconf_sunlock();
|
|
|
|
return (0);
|
1997-03-02 11:06:22 +00:00
|
|
|
}
|
|
|
|
|
1998-08-29 13:13:10 +00:00
|
|
|
#endif /* 1 || COMPAT_PRELITE2 */
|
2004-04-11 21:09:22 +00:00
|
|
|
#endif /* !BURN_BRIDGES */
|
1997-03-02 11:06:22 +00:00
|
|
|
|
2002-07-31 12:24:35 +00:00
|
|
|
#define KINFO_VNODESLOP 10
|
2003-02-23 18:09:05 +00:00
|
|
|
#ifdef notyet
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Dump vnode list (via sysctl).
|
|
|
|
*/
|
|
|
|
/* ARGSUSED */
|
1995-11-20 12:42:39 +00:00
|
|
|
static int
|
2000-07-04 11:25:35 +00:00
|
|
|
sysctl_vnode(SYSCTL_HANDLER_ARGS)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2002-07-31 12:24:35 +00:00
|
|
|
struct xvnode *xvn;
|
|
|
|
struct mount *mp;
|
|
|
|
struct vnode *vp;
|
|
|
|
int error, len, n;
|
1995-11-20 12:42:39 +00:00
|
|
|
|
2002-08-13 05:29:48 +00:00
|
|
|
/*
|
|
|
|
* Stale numvnodes access is not fatal here.
|
|
|
|
*/
|
1995-11-20 12:42:39 +00:00
|
|
|
req->lock = 0;
|
2002-07-31 12:24:35 +00:00
|
|
|
len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn;
|
|
|
|
if (!req->oldptr)
|
|
|
|
/* Make an estimate */
|
|
|
|
return (SYSCTL_OUT(req, 0, len));
|
These changes embody the support of the fully coherent merged VM buffer cache,
much higher filesystem I/O performance, and much better paging performance. It
represents the culmination of over 6 months of R&D.
The majority of the merged VM/cache work is by John Dyson.
The following highlights the most significant changes. Additionally, there are
(mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to
support the new VM/buffer scheme.
vfs_bio.c:
Significant rewrite of most of vfs_bio to support the merged VM buffer cache
scheme. The scheme is almost fully compatible with the old filesystem
interface. Significant improvement in the number of opportunities for write
clustering.
vfs_cluster.c, vfs_subr.c
Upgrade and performance enhancements in vfs layer code to support merged
VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff.
vm_object.c:
Yet more improvements in the collapse code. Elimination of some windows that
can cause list corruption.
vm_pageout.c:
Fixed it, it really works better now. Somehow in 2.0, some "enhancements"
broke the code. This code has been reworked from the ground-up.
vm_fault.c, vm_page.c, pmap.c, vm_object.c
Support for small-block filesystems with merged VM/buffer cache scheme.
pmap.c vm_map.c
Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of
kernel PTs.
vm_glue.c
Much simpler and more effective swapping code. No more gratuitous swapping.
proc.h
Fixed the problem that the p_lock flag was not being cleared on a fork.
swap_pager.c, vnode_pager.c
Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the
code doesn't need it anymore.
machdep.c
Changes to better support the parameter values for the merged VM/buffer cache
scheme.
machdep.c, kern_exec.c, vm_glue.c
Implemented a seperate submap for temporary exec string space and another one
to contain process upages. This eliminates all map fragmentation problems
that previously existed.
ffs_inode.c, ufs_inode.c, ufs_readwrite.c
Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on
busy buffers.
Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
|
|
|
|
2004-02-26 00:27:04 +00:00
|
|
|
error = sysctl_wire_old_buffer(req, 0);
|
|
|
|
if (error != 0)
|
|
|
|
return (error);
|
2003-02-19 05:47:46 +00:00
|
|
|
xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
|
2002-07-31 12:24:35 +00:00
|
|
|
n = 0;
|
Change and clean the mutex lock interface.
mtx_enter(lock, type) becomes:
mtx_lock(lock) for sleep locks (MTX_DEF-initialized locks)
mtx_lock_spin(lock) for spin locks (MTX_SPIN-initialized)
similarily, for releasing a lock, we now have:
mtx_unlock(lock) for MTX_DEF and mtx_unlock_spin(lock) for MTX_SPIN.
We change the caller interface for the two different types of locks
because the semantics are entirely different for each case, and this
makes it explicitly clear and, at the same time, it rids us of the
extra `type' argument.
The enter->lock and exit->unlock change has been made with the idea
that we're "locking data" and not "entering locked code" in mind.
Further, remove all additional "flags" previously passed to the
lock acquire/release routines with the exception of two:
MTX_QUIET and MTX_NOSWITCH
The functionality of these flags is preserved and they can be passed
to the lock/unlock routines by calling the corresponding wrappers:
mtx_{lock, unlock}_flags(lock, flag(s)) and
mtx_{lock, unlock}_spin_flags(lock, flag(s)) for MTX_DEF and MTX_SPIN
locks, respectively.
Re-inline some lock acq/rel code; in the sleep lock case, we only
inline the _obtain_lock()s in order to ensure that the inlined code
fits into a cache line. In the spin lock case, we inline recursion and
actually only perform a function call if we need to spin. This change
has been made with the idea that we generally tend to avoid spin locks
and that also the spin locks that we do have and are heavily used
(i.e. sched_lock) do recurse, and therefore in an effort to reduce
function call overhead for some architectures (such as alpha), we
inline recursion for this case.
Create a new malloc type for the witness code and retire from using
the M_DEV type. The new type is called M_WITNESS and is only declared
if WITNESS is enabled.
Begin cleaning up some machdep/mutex.h code - specifically updated the
"optimized" inlined code in alpha/mutex.h and wrote MTX_LOCK_SPIN
and MTX_UNLOCK_SPIN asm macros for the i386/mutex.h as we presently
need those.
Finally, caught up to the interface changes in all sys code.
Contributors: jake, jhb, jasone (in no particular order)
2001-02-09 06:11:45 +00:00
|
|
|
mtx_lock(&mountlist_mtx);
|
2002-07-31 12:24:35 +00:00
|
|
|
TAILQ_FOREACH(mp, &mountlist, mnt_list) {
|
2008-11-02 10:15:42 +00:00
|
|
|
if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
|
1994-05-24 10:09:53 +00:00
|
|
|
continue;
|
2003-11-05 04:30:08 +00:00
|
|
|
MNT_ILOCK(mp);
|
2002-07-31 12:24:35 +00:00
|
|
|
TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
|
|
|
|
if (n == len)
|
|
|
|
break;
|
|
|
|
vref(vp);
|
|
|
|
xvn[n].xv_size = sizeof *xvn;
|
|
|
|
xvn[n].xv_vnode = vp;
|
2005-03-30 03:01:36 +00:00
|
|
|
xvn[n].xv_id = 0; /* XXX compat */
|
2002-07-31 12:24:35 +00:00
|
|
|
#define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
|
|
|
|
XV_COPY(usecount);
|
|
|
|
XV_COPY(writecount);
|
|
|
|
XV_COPY(holdcnt);
|
|
|
|
XV_COPY(mount);
|
|
|
|
XV_COPY(numoutput);
|
|
|
|
XV_COPY(type);
|
|
|
|
#undef XV_COPY
|
2002-08-04 10:29:36 +00:00
|
|
|
xvn[n].xv_flag = vp->v_vflag;
|
|
|
|
|
2002-07-31 12:24:35 +00:00
|
|
|
switch (vp->v_type) {
|
|
|
|
case VREG:
|
|
|
|
case VDIR:
|
|
|
|
case VLNK:
|
|
|
|
break;
|
|
|
|
case VBLK:
|
|
|
|
case VCHR:
|
|
|
|
if (vp->v_rdev == NULL) {
|
|
|
|
vrele(vp);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
xvn[n].xv_dev = dev2udev(vp->v_rdev);
|
|
|
|
break;
|
|
|
|
case VSOCK:
|
|
|
|
xvn[n].xv_socket = vp->v_socket;
|
|
|
|
break;
|
|
|
|
case VFIFO:
|
|
|
|
xvn[n].xv_fifo = vp->v_fifoinfo;
|
|
|
|
break;
|
|
|
|
case VNON:
|
|
|
|
case VBAD:
|
|
|
|
default:
|
|
|
|
/* shouldn't happen? */
|
|
|
|
vrele(vp);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
vrele(vp);
|
|
|
|
++n;
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2003-11-05 04:30:08 +00:00
|
|
|
MNT_IUNLOCK(mp);
|
Change and clean the mutex lock interface.
mtx_enter(lock, type) becomes:
mtx_lock(lock) for sleep locks (MTX_DEF-initialized locks)
mtx_lock_spin(lock) for spin locks (MTX_SPIN-initialized)
similarily, for releasing a lock, we now have:
mtx_unlock(lock) for MTX_DEF and mtx_unlock_spin(lock) for MTX_SPIN.
We change the caller interface for the two different types of locks
because the semantics are entirely different for each case, and this
makes it explicitly clear and, at the same time, it rids us of the
extra `type' argument.
The enter->lock and exit->unlock change has been made with the idea
that we're "locking data" and not "entering locked code" in mind.
Further, remove all additional "flags" previously passed to the
lock acquire/release routines with the exception of two:
MTX_QUIET and MTX_NOSWITCH
The functionality of these flags is preserved and they can be passed
to the lock/unlock routines by calling the corresponding wrappers:
mtx_{lock, unlock}_flags(lock, flag(s)) and
mtx_{lock, unlock}_spin_flags(lock, flag(s)) for MTX_DEF and MTX_SPIN
locks, respectively.
Re-inline some lock acq/rel code; in the sleep lock case, we only
inline the _obtain_lock()s in order to ensure that the inlined code
fits into a cache line. In the spin lock case, we inline recursion and
actually only perform a function call if we need to spin. This change
has been made with the idea that we generally tend to avoid spin locks
and that also the spin locks that we do have and are heavily used
(i.e. sched_lock) do recurse, and therefore in an effort to reduce
function call overhead for some architectures (such as alpha), we
inline recursion for this case.
Create a new malloc type for the witness code and retire from using
the M_DEV type. The new type is called M_WITNESS and is only declared
if WITNESS is enabled.
Begin cleaning up some machdep/mutex.h code - specifically updated the
"optimized" inlined code in alpha/mutex.h and wrote MTX_LOCK_SPIN
and MTX_UNLOCK_SPIN asm macros for the i386/mutex.h as we presently
need those.
Finally, caught up to the interface changes in all sys code.
Contributors: jake, jhb, jasone (in no particular order)
2001-02-09 06:11:45 +00:00
|
|
|
mtx_lock(&mountlist_mtx);
|
2008-08-31 14:26:08 +00:00
|
|
|
vfs_unbusy(mp);
|
2002-07-31 12:24:35 +00:00
|
|
|
if (n == len)
|
|
|
|
break;
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
Change and clean the mutex lock interface.
mtx_enter(lock, type) becomes:
mtx_lock(lock) for sleep locks (MTX_DEF-initialized locks)
mtx_lock_spin(lock) for spin locks (MTX_SPIN-initialized)
similarily, for releasing a lock, we now have:
mtx_unlock(lock) for MTX_DEF and mtx_unlock_spin(lock) for MTX_SPIN.
We change the caller interface for the two different types of locks
because the semantics are entirely different for each case, and this
makes it explicitly clear and, at the same time, it rids us of the
extra `type' argument.
The enter->lock and exit->unlock change has been made with the idea
that we're "locking data" and not "entering locked code" in mind.
Further, remove all additional "flags" previously passed to the
lock acquire/release routines with the exception of two:
MTX_QUIET and MTX_NOSWITCH
The functionality of these flags is preserved and they can be passed
to the lock/unlock routines by calling the corresponding wrappers:
mtx_{lock, unlock}_flags(lock, flag(s)) and
mtx_{lock, unlock}_spin_flags(lock, flag(s)) for MTX_DEF and MTX_SPIN
locks, respectively.
Re-inline some lock acq/rel code; in the sleep lock case, we only
inline the _obtain_lock()s in order to ensure that the inlined code
fits into a cache line. In the spin lock case, we inline recursion and
actually only perform a function call if we need to spin. This change
has been made with the idea that we generally tend to avoid spin locks
and that also the spin locks that we do have and are heavily used
(i.e. sched_lock) do recurse, and therefore in an effort to reduce
function call overhead for some architectures (such as alpha), we
inline recursion for this case.
Create a new malloc type for the witness code and retire from using
the M_DEV type. The new type is called M_WITNESS and is only declared
if WITNESS is enabled.
Begin cleaning up some machdep/mutex.h code - specifically updated the
"optimized" inlined code in alpha/mutex.h and wrote MTX_LOCK_SPIN
and MTX_UNLOCK_SPIN asm macros for the i386/mutex.h as we presently
need those.
Finally, caught up to the interface changes in all sys code.
Contributors: jake, jhb, jasone (in no particular order)
2001-02-09 06:11:45 +00:00
|
|
|
mtx_unlock(&mountlist_mtx);
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2002-07-31 12:24:35 +00:00
|
|
|
error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
|
|
|
|
free(xvn, M_TEMP);
|
|
|
|
return (error);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
2014-08-03 03:27:54 +00:00
|
|
|
/* Register sysctl_vnode() as the handler for the kern vnode node. */
SYSCTL_PROC(_kern, KERN_VNODE, vnode,
    CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE,
    0, 0, sysctl_vnode, "S,xvnode", "");
|
2003-02-23 18:09:05 +00:00
|
|
|
#endif
|
1995-11-20 12:42:39 +00:00
|
|
|
|
2015-08-24 13:18:13 +00:00
|
|
|
static void
|
|
|
|
unmount_or_warn(struct mount *mp)
|
|
|
|
{
|
|
|
|
int error;
|
|
|
|
|
|
|
|
error = dounmount(mp, MNT_FORCE, curthread);
|
|
|
|
if (error != 0) {
|
|
|
|
printf("unmount of %s failed (", mp->mnt_stat.f_mntonname);
|
|
|
|
if (error == EBUSY)
|
|
|
|
printf("BUSY)\n");
|
|
|
|
else
|
|
|
|
printf("%d)\n", error);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
1997-02-10 02:22:35 +00:00
|
|
|
/*
|
1997-02-26 15:35:42 +00:00
|
|
|
* Unmount all filesystems. The list is traversed in reverse order
|
|
|
|
* of mounting to avoid dependencies.
|
1997-02-10 02:22:35 +00:00
|
|
|
*/
|
|
|
|
void
|
2006-01-21 19:42:10 +00:00
|
|
|
vfs_unmountall(void)
|
1997-02-10 02:22:35 +00:00
|
|
|
{
|
2015-08-24 13:18:13 +00:00
|
|
|
struct mount *mp, *tmp;
|
1997-02-10 02:22:35 +00:00
|
|
|
|
2009-02-05 15:03:35 +00:00
|
|
|
CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__);
|
|
|
|
|
1997-02-26 15:35:42 +00:00
|
|
|
/*
|
|
|
|
* Since this only runs when rebooting, it is not interlocked.
|
|
|
|
*/
|
2015-08-24 13:18:13 +00:00
|
|
|
TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) {
|
2015-05-27 09:22:50 +00:00
|
|
|
vfs_ref(mp);
|
2015-08-24 13:18:13 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Forcibly unmounting "/dev" before "/" would prevent clean
|
|
|
|
* unmount of the latter.
|
|
|
|
*/
|
|
|
|
if (mp == rootdevmp)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
unmount_or_warn(mp);
|
1997-02-10 02:22:35 +00:00
|
|
|
}
|
2015-08-24 13:18:13 +00:00
|
|
|
|
|
|
|
if (rootdevmp != NULL)
|
|
|
|
unmount_or_warn(rootdevmp);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
1995-05-21 21:39:31 +00:00
|
|
|
/*
|
|
|
|
* perform msync on all vnodes under a mount point
|
|
|
|
* the mount point must be locked.
|
|
|
|
*/
|
|
|
|
void
|
2001-10-23 01:21:29 +00:00
|
|
|
vfs_msync(struct mount *mp, int flags)
|
|
|
|
{
|
2006-01-09 20:42:19 +00:00
|
|
|
struct vnode *vp, *mvp;
|
1998-04-18 06:26:16 +00:00
|
|
|
struct vm_object *obj;
|
Make our v_usecount vnode reference count work identically to the
original BSD code. The association between the vnode and the vm_object
no longer includes reference counts. The major difference is that
vm_object's are no longer freed gratuitiously from the vnode, and so
once an object is created for the vnode, it will last as long as the
vnode does.
When a vnode object reference count is incremented, then the underlying
vnode reference count is incremented also. The two "objects" are now
more intimately related, and so the interactions are now much less
complex.
When vnodes are now normally placed onto the free queue with an object still
attached. The rundown of the object happens at vnode rundown time, and
happens with exactly the same filesystem semantics of the original VFS
code. There is absolutely no need for vnode_pager_uncache and other
travesties like that anymore.
A side-effect of these changes is that SMP locking should be much simpler,
the I/O copyin/copyout optimizations work, NFS should be more ponderable,
and further work on layered filesystems should be less frustrating, because
of the totally coherent management of the vnode objects and vnodes.
Please be careful with your system while running this code, but I would
greatly appreciate feedback as soon a reasonably possible.
1998-01-06 05:26:17 +00:00
|
|
|
|
2009-02-05 15:03:35 +00:00
|
|
|
CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
|
2016-09-30 17:27:17 +00:00
|
|
|
|
|
|
|
vnlru_return_batch(mp);
|
|
|
|
|
2012-04-20 07:00:28 +00:00
|
|
|
MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) {
|
2009-12-21 12:29:38 +00:00
|
|
|
obj = vp->v_object;
|
|
|
|
if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0 &&
|
2008-02-25 18:45:57 +00:00
|
|
|
(flags == MNT_WAIT || VOP_ISLOCKED(vp) == 0)) {
|
Make our v_usecount vnode reference count work identically to the
original BSD code. The association between the vnode and the vm_object
no longer includes reference counts. The major difference is that
vm_object's are no longer freed gratuitiously from the vnode, and so
once an object is created for the vnode, it will last as long as the
vnode does.
When a vnode object reference count is incremented, then the underlying
vnode reference count is incremented also. The two "objects" are now
more intimately related, and so the interactions are now much less
complex.
When vnodes are now normally placed onto the free queue with an object still
attached. The rundown of the object happens at vnode rundown time, and
happens with exactly the same filesystem semantics of the original VFS
code. There is absolutely no need for vnode_pager_uncache and other
travesties like that anymore.
A side-effect of these changes is that SMP locking should be much simpler,
the I/O copyin/copyout optimizations work, NFS should be more ponderable,
and further work on layered filesystems should be less frustrating, because
of the totally coherent management of the vnode objects and vnodes.
Please be careful with your system while running this code, but I would
greatly appreciate feedback as soon a reasonably possible.
1998-01-06 05:26:17 +00:00
|
|
|
if (!vget(vp,
|
2002-09-25 02:22:21 +00:00
|
|
|
LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK,
|
|
|
|
curthread)) {
|
|
|
|
if (vp->v_vflag & VV_NOSYNC) { /* unlinked */
|
|
|
|
vput(vp);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2005-01-25 00:40:01 +00:00
|
|
|
obj = vp->v_object;
|
|
|
|
if (obj != NULL) {
|
2013-02-20 12:03:20 +00:00
|
|
|
VM_OBJECT_WLOCK(obj);
|
2001-05-19 01:28:09 +00:00
|
|
|
vm_object_page_clean(obj, 0, 0,
|
|
|
|
flags == MNT_WAIT ?
|
|
|
|
OBJPC_SYNC : OBJPC_NOSYNC);
|
2013-02-20 12:03:20 +00:00
|
|
|
VM_OBJECT_WUNLOCK(obj);
|
Make our v_usecount vnode reference count work identically to the
original BSD code. The association between the vnode and the vm_object
no longer includes reference counts. The major difference is that
vm_object's are no longer freed gratuitiously from the vnode, and so
once an object is created for the vnode, it will last as long as the
vnode does.
When a vnode object reference count is incremented, then the underlying
vnode reference count is incremented also. The two "objects" are now
more intimately related, and so the interactions are now much less
complex.
When vnodes are now normally placed onto the free queue with an object still
attached. The rundown of the object happens at vnode rundown time, and
happens with exactly the same filesystem semantics of the original VFS
code. There is absolutely no need for vnode_pager_uncache and other
travesties like that anymore.
A side-effect of these changes is that SMP locking should be much simpler,
the I/O copyin/copyout optimizations work, NFS should be more ponderable,
and further work on layered filesystems should be less frustrating, because
of the totally coherent management of the vnode objects and vnodes.
Please be careful with your system while running this code, but I would
greatly appreciate feedback as soon a reasonably possible.
1998-01-06 05:26:17 +00:00
|
|
|
}
|
|
|
|
vput(vp);
|
|
|
|
}
|
2002-09-25 02:22:21 +00:00
|
|
|
} else
|
|
|
|
VI_UNLOCK(vp);
|
1995-05-21 21:39:31 +00:00
|
|
|
}
|
|
|
|
}
|
1996-08-21 21:56:23 +00:00
|
|
|
|
2008-10-28 12:08:36 +00:00
|
|
|
static void
|
2013-07-28 06:59:29 +00:00
|
|
|
destroy_vpollinfo_free(struct vpollinfo *vi)
|
2008-10-28 12:08:36 +00:00
|
|
|
{
|
2013-07-17 10:56:21 +00:00
|
|
|
|
2008-10-28 12:08:36 +00:00
|
|
|
knlist_destroy(&vi->vpi_selinfo.si_note);
|
|
|
|
mtx_destroy(&vi->vpi_lock);
|
|
|
|
uma_zfree(vnodepoll_zone, vi);
|
|
|
|
}
|
|
|
|
|
2013-07-28 06:59:29 +00:00
|
|
|
static void
|
|
|
|
destroy_vpollinfo(struct vpollinfo *vi)
|
|
|
|
{
|
|
|
|
|
|
|
|
knlist_clear(&vi->vpi_selinfo.si_note, 1);
|
|
|
|
seldrain(&vi->vpi_selinfo);
|
|
|
|
destroy_vpollinfo_free(vi);
|
|
|
|
}
|
|
|
|
|
2004-01-05 19:04:29 +00:00
|
|
|
/*
|
2016-04-29 22:15:33 +00:00
|
|
|
* Initialize per-vnode helper structure to hold poll-related state.
|
2004-01-05 19:04:29 +00:00
|
|
|
*/
|
|
|
|
void
|
|
|
|
v_addpollinfo(struct vnode *vp)
|
|
|
|
{
|
2004-08-11 01:27:53 +00:00
|
|
|
struct vpollinfo *vi;
|
2004-01-05 19:04:29 +00:00
|
|
|
|
2008-10-28 12:08:36 +00:00
|
|
|
if (vp->v_pollinfo != NULL)
|
|
|
|
return;
|
2015-06-23 18:40:20 +00:00
|
|
|
vi = uma_zalloc(vnodepoll_zone, M_WAITOK | M_ZERO);
|
2008-10-28 12:08:36 +00:00
|
|
|
mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
|
|
|
|
knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock,
|
2009-06-10 20:59:32 +00:00
|
|
|
vfs_knlunlock, vfs_knl_assert_locked, vfs_knl_assert_unlocked);
|
2008-10-28 12:08:36 +00:00
|
|
|
VI_LOCK(vp);
|
2004-08-11 01:27:53 +00:00
|
|
|
if (vp->v_pollinfo != NULL) {
|
2008-10-28 12:08:36 +00:00
|
|
|
VI_UNLOCK(vp);
|
2013-07-28 06:59:29 +00:00
|
|
|
destroy_vpollinfo_free(vi);
|
2004-08-11 01:27:53 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
vp->v_pollinfo = vi;
|
2008-10-28 12:08:36 +00:00
|
|
|
VI_UNLOCK(vp);
|
2004-01-05 19:04:29 +00:00
|
|
|
}
|
|
|
|
|
1997-12-15 03:09:59 +00:00
|
|
|
/*
|
|
|
|
* Record a process's interest in events which might happen to
|
|
|
|
* a vnode. Because poll uses the historic select-style interface
|
|
|
|
* internally, this routine serves as both the ``check for any
|
|
|
|
* pending events'' and the ``record my interest in future events''
|
|
|
|
* functions. (These are done together, while the lock is held,
|
|
|
|
* to avoid race conditions.)
|
|
|
|
*/
|
|
|
|
int
|
2006-01-21 19:42:10 +00:00
|
|
|
vn_pollrecord(struct vnode *vp, struct thread *td, int events)
|
1997-12-15 03:09:59 +00:00
|
|
|
{
|
2002-02-17 21:15:36 +00:00
|
|
|
|
2008-10-28 12:08:36 +00:00
|
|
|
v_addpollinfo(vp);
|
2002-02-17 21:15:36 +00:00
|
|
|
mtx_lock(&vp->v_pollinfo->vpi_lock);
|
|
|
|
if (vp->v_pollinfo->vpi_revents & events) {
|
1997-12-15 03:09:59 +00:00
|
|
|
/*
|
|
|
|
* This leaves events we are not interested
|
|
|
|
* in available for the other process which
|
|
|
|
* which presumably had requested them
|
|
|
|
* (otherwise they would never have been
|
|
|
|
* recorded).
|
|
|
|
*/
|
2002-02-17 21:15:36 +00:00
|
|
|
events &= vp->v_pollinfo->vpi_revents;
|
|
|
|
vp->v_pollinfo->vpi_revents &= ~events;
|
1997-12-15 03:09:59 +00:00
|
|
|
|
2002-02-17 21:15:36 +00:00
|
|
|
mtx_unlock(&vp->v_pollinfo->vpi_lock);
|
2008-10-28 12:22:33 +00:00
|
|
|
return (events);
|
1997-12-15 03:09:59 +00:00
|
|
|
}
|
2002-02-17 21:15:36 +00:00
|
|
|
vp->v_pollinfo->vpi_events |= events;
|
|
|
|
selrecord(td, &vp->v_pollinfo->vpi_selinfo);
|
|
|
|
mtx_unlock(&vp->v_pollinfo->vpi_lock);
|
2008-10-28 12:22:33 +00:00
|
|
|
return (0);
|
1997-12-15 03:09:59 +00:00
|
|
|
}
|
|
|
|
|
1998-03-08 09:59:44 +00:00
|
|
|
/*
|
|
|
|
* Routine to create and manage a filesystem syncer vnode.
|
|
|
|
*/
|
2002-03-19 21:25:46 +00:00
|
|
|
#define sync_close ((int (*)(struct vop_close_args *))nullop)
|
|
|
|
static int sync_fsync(struct vop_fsync_args *);
|
|
|
|
static int sync_inactive(struct vop_inactive_args *);
|
|
|
|
static int sync_reclaim(struct vop_reclaim_args *);
|
1998-03-08 09:59:44 +00:00
|
|
|
|
2004-12-01 23:16:38 +00:00
|
|
|
static struct vop_vector sync_vnodeops = {
|
|
|
|
.vop_bypass = VOP_EOPNOTSUPP,
|
|
|
|
.vop_close = sync_close, /* close */
|
|
|
|
.vop_fsync = sync_fsync, /* fsync */
|
|
|
|
.vop_inactive = sync_inactive, /* inactive */
|
|
|
|
.vop_reclaim = sync_reclaim, /* reclaim */
|
2007-05-18 13:02:13 +00:00
|
|
|
.vop_lock1 = vop_stdlock, /* lock */
|
2004-12-01 23:16:38 +00:00
|
|
|
.vop_unlock = vop_stdunlock, /* unlock */
|
|
|
|
.vop_islocked = vop_stdislocked, /* islocked */
|
1998-03-08 09:59:44 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Create a new filesystem syncer vnode for the specified mount point.
|
|
|
|
*/
|
2010-08-28 08:57:15 +00:00
|
|
|
void
|
2006-01-21 19:42:10 +00:00
|
|
|
vfs_allocate_syncvnode(struct mount *mp)
|
1998-03-08 09:59:44 +00:00
|
|
|
{
|
|
|
|
struct vnode *vp;
|
2008-03-22 09:15:16 +00:00
|
|
|
struct bufobj *bo;
|
1998-03-08 09:59:44 +00:00
|
|
|
static long start, incr, next;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
/* Allocate a new vnode */
|
2010-08-28 08:57:15 +00:00
|
|
|
error = getnewvnode("syncer", mp, &sync_vnodeops, &vp);
|
|
|
|
if (error != 0)
|
|
|
|
panic("vfs_allocate_syncvnode: getnewvnode() failed");
|
1998-03-08 09:59:44 +00:00
|
|
|
vp->v_type = VNON;
|
2008-08-28 09:08:15 +00:00
|
|
|
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
|
|
|
|
vp->v_vflag |= VV_FORCEINSMQ;
|
2007-03-13 01:50:27 +00:00
|
|
|
error = insmntque(vp, mp);
|
|
|
|
if (error != 0)
|
2010-08-28 08:57:15 +00:00
|
|
|
panic("vfs_allocate_syncvnode: insmntque() failed");
|
2008-08-28 09:08:15 +00:00
|
|
|
vp->v_vflag &= ~VV_FORCEINSMQ;
|
|
|
|
VOP_UNLOCK(vp, 0);
|
1998-03-08 09:59:44 +00:00
|
|
|
/*
|
|
|
|
* Place the vnode onto the syncer worklist. We attempt to
|
|
|
|
* scatter them about on the list so that they will go off
|
|
|
|
* at evenly distributed times even if all the filesystems
|
|
|
|
* are mounted at once.
|
|
|
|
*/
|
|
|
|
next += incr;
|
|
|
|
if (next == 0 || next > syncer_maxdelay) {
|
|
|
|
start /= 2;
|
|
|
|
incr /= 2;
|
|
|
|
if (start == 0) {
|
|
|
|
start = syncer_maxdelay / 2;
|
|
|
|
incr = syncer_maxdelay;
|
|
|
|
}
|
|
|
|
next = start;
|
|
|
|
}
|
2008-03-22 09:15:16 +00:00
|
|
|
bo = &vp->v_bufobj;
|
|
|
|
BO_LOCK(bo);
|
|
|
|
vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0);
|
2004-07-01 23:59:19 +00:00
|
|
|
/* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
|
|
|
|
mtx_lock(&sync_mtx);
|
|
|
|
sync_vnode_count++;
|
2010-09-11 13:06:06 +00:00
|
|
|
if (mp->mnt_syncer == NULL) {
|
|
|
|
mp->mnt_syncer = vp;
|
|
|
|
vp = NULL;
|
|
|
|
}
|
2004-07-01 23:59:19 +00:00
|
|
|
mtx_unlock(&sync_mtx);
|
2008-03-22 09:15:16 +00:00
|
|
|
BO_UNLOCK(bo);
|
2010-09-11 13:06:06 +00:00
|
|
|
if (vp != NULL) {
|
|
|
|
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
|
|
|
|
vgone(vp);
|
|
|
|
vput(vp);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
vfs_deallocate_syncvnode(struct mount *mp)
|
|
|
|
{
|
|
|
|
struct vnode *vp;
|
|
|
|
|
|
|
|
mtx_lock(&sync_mtx);
|
|
|
|
vp = mp->mnt_syncer;
|
|
|
|
if (vp != NULL)
|
|
|
|
mp->mnt_syncer = NULL;
|
|
|
|
mtx_unlock(&sync_mtx);
|
|
|
|
if (vp != NULL)
|
|
|
|
vrele(vp);
|
1998-03-08 09:59:44 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Do a lazy sync of the filesystem.
|
|
|
|
*/
|
1998-12-21 23:38:33 +00:00
|
|
|
static int
|
2006-01-21 19:42:10 +00:00
|
|
|
sync_fsync(struct vop_fsync_args *ap)
|
1998-03-08 09:59:44 +00:00
|
|
|
{
|
|
|
|
struct vnode *syncvp = ap->a_vp;
|
|
|
|
struct mount *mp = syncvp->v_mount;
|
2012-02-06 11:04:36 +00:00
|
|
|
int error, save;
|
2004-10-27 08:05:02 +00:00
|
|
|
struct bufobj *bo;
|
1998-03-08 09:59:44 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* We only need to do something if this is a lazy evaluation.
|
|
|
|
*/
|
|
|
|
if (ap->a_waitfor != MNT_LAZY)
|
|
|
|
return (0);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Move ourselves to the back of the sync list.
|
|
|
|
*/
|
2004-10-27 08:05:02 +00:00
|
|
|
bo = &syncvp->v_bufobj;
|
|
|
|
BO_LOCK(bo);
|
|
|
|
vn_syncer_add_to_worklist(bo, syncdelay);
|
|
|
|
BO_UNLOCK(bo);
|
1998-03-08 09:59:44 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Walk the list of vnodes pushing all that are dirty and
|
|
|
|
* not already on the sync list.
|
|
|
|
*/
|
2014-06-11 12:56:49 +00:00
|
|
|
if (vfs_busy(mp, MBF_NOWAIT) != 0)
|
1998-03-08 09:59:44 +00:00
|
|
|
return (0);
|
2000-07-11 22:07:57 +00:00
|
|
|
if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
|
2008-08-31 14:26:08 +00:00
|
|
|
vfs_unbusy(mp);
|
2000-07-11 22:07:57 +00:00
|
|
|
return (0);
|
|
|
|
}
|
2012-02-06 11:04:36 +00:00
|
|
|
save = curthread_pflags_set(TDP_SYNCIO);
|
1998-04-16 03:31:26 +00:00
|
|
|
vfs_msync(mp, MNT_NOWAIT);
|
2009-05-11 15:33:26 +00:00
|
|
|
error = VFS_SYNC(mp, MNT_LAZY);
|
2012-02-06 11:04:36 +00:00
|
|
|
curthread_pflags_restore(save);
|
2000-07-11 22:07:57 +00:00
|
|
|
vn_finished_write(mp);
|
2008-08-31 14:26:08 +00:00
|
|
|
vfs_unbusy(mp);
|
2002-10-25 00:20:37 +00:00
|
|
|
return (error);
|
1998-03-08 09:59:44 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The syncer vnode is no referenced.
|
|
|
|
*/
|
1998-12-21 23:38:33 +00:00
|
|
|
static int
|
2006-01-21 19:42:10 +00:00
|
|
|
sync_inactive(struct vop_inactive_args *ap)
|
1998-03-08 09:59:44 +00:00
|
|
|
{
|
|
|
|
|
|
|
|
vgone(ap->a_vp);
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The syncer vnode is no longer needed and is being decommissioned.
|
1999-02-19 17:36:58 +00:00
|
|
|
*
|
2003-09-19 23:52:06 +00:00
|
|
|
* Modifications to the worklist must be protected by sync_mtx.
|
1998-03-08 09:59:44 +00:00
|
|
|
*/
|
1998-12-21 23:38:33 +00:00
|
|
|
static int
|
2006-01-21 19:42:10 +00:00
|
|
|
sync_reclaim(struct vop_reclaim_args *ap)
|
1998-03-08 09:59:44 +00:00
|
|
|
{
|
|
|
|
struct vnode *vp = ap->a_vp;
|
2004-10-27 08:05:02 +00:00
|
|
|
struct bufobj *bo;
|
1998-03-08 09:59:44 +00:00
|
|
|
|
2004-10-27 08:05:02 +00:00
|
|
|
bo = &vp->v_bufobj;
|
2008-03-22 09:15:16 +00:00
|
|
|
BO_LOCK(bo);
|
2010-09-11 13:06:06 +00:00
|
|
|
mtx_lock(&sync_mtx);
|
|
|
|
if (vp->v_mount->mnt_syncer == vp)
|
|
|
|
vp->v_mount->mnt_syncer = NULL;
|
2004-10-27 08:05:02 +00:00
|
|
|
if (bo->bo_flag & BO_ONWORKLST) {
|
|
|
|
LIST_REMOVE(bo, bo_synclist);
|
2007-04-10 15:29:37 +00:00
|
|
|
syncer_worklist_len--;
|
2004-07-01 23:59:19 +00:00
|
|
|
sync_vnode_count--;
|
2004-10-27 08:05:02 +00:00
|
|
|
bo->bo_flag &= ~BO_ONWORKLST;
|
1998-03-08 09:59:44 +00:00
|
|
|
}
|
2010-09-11 13:06:06 +00:00
|
|
|
mtx_unlock(&sync_mtx);
|
2008-03-22 09:15:16 +00:00
|
|
|
BO_UNLOCK(bo);
|
1998-03-08 09:59:44 +00:00
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
1999-08-25 12:24:39 +00:00
|
|
|
/*
|
|
|
|
* Check if vnode represents a disk device
|
|
|
|
*/
|
|
|
|
int
|
2006-01-21 19:42:10 +00:00
|
|
|
vn_isdisk(struct vnode *vp, int *errp)
|
1999-08-25 12:24:39 +00:00
|
|
|
{
|
2003-10-12 14:04:39 +00:00
|
|
|
int error;
|
2000-09-05 21:09:56 +00:00
|
|
|
|
2014-10-15 05:17:36 +00:00
|
|
|
if (vp->v_type != VCHR) {
|
|
|
|
error = ENOTBLK;
|
|
|
|
goto out;
|
|
|
|
}
|
2003-10-12 14:04:39 +00:00
|
|
|
error = 0;
|
2004-09-24 06:16:08 +00:00
|
|
|
dev_lock();
|
2014-10-15 05:17:36 +00:00
|
|
|
if (vp->v_rdev == NULL)
|
2003-10-12 14:04:39 +00:00
|
|
|
error = ENXIO;
|
2004-09-24 06:16:08 +00:00
|
|
|
else if (vp->v_rdev->si_devsw == NULL)
|
|
|
|
error = ENXIO;
|
|
|
|
else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK))
|
2003-10-12 14:04:39 +00:00
|
|
|
error = ENOTBLK;
|
2004-09-24 06:16:08 +00:00
|
|
|
dev_unlock();
|
2014-10-15 05:17:36 +00:00
|
|
|
out:
|
2000-01-10 12:04:27 +00:00
|
|
|
if (errp != NULL)
|
2003-10-12 14:04:39 +00:00
|
|
|
*errp = error;
|
|
|
|
return (error == 0);
|
1999-08-25 12:24:39 +00:00
|
|
|
}
|
|
|
|
|
2000-09-20 17:18:12 +00:00
|
|
|
/*
|
2002-05-16 21:28:32 +00:00
|
|
|
* Common filesystem object access control check routine. Accepts a
|
2000-09-20 17:18:12 +00:00
|
|
|
* vnode's type, "mode", uid and gid, requested access mode, credentials,
|
|
|
|
* and optional call-by-reference privused argument allowing vaccess()
|
|
|
|
* to indicate to the caller whether privilege was used to satisfy the
|
2002-07-31 02:05:12 +00:00
|
|
|
* request (obsoleted). Returns 0 on success, or an errno on failure.
|
2000-09-20 17:18:12 +00:00
|
|
|
*/
|
2000-08-20 08:36:26 +00:00
|
|
|
int
|
2006-01-21 19:42:10 +00:00
|
|
|
vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid,
|
2008-10-28 13:44:11 +00:00
|
|
|
accmode_t accmode, struct ucred *cred, int *privused)
|
2000-08-20 08:36:26 +00:00
|
|
|
{
|
2008-10-28 13:44:11 +00:00
|
|
|
accmode_t dac_granted;
|
|
|
|
accmode_t priv_granted;
|
2000-08-20 08:36:26 +00:00
|
|
|
|
2009-10-01 17:22:03 +00:00
|
|
|
KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0,
|
|
|
|
("invalid bit in accmode"));
|
2009-12-26 11:36:10 +00:00
|
|
|
KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE),
|
2010-04-03 11:19:20 +00:00
|
|
|
("VAPPEND without VWRITE"));
|
2009-10-01 17:22:03 +00:00
|
|
|
|
2000-08-20 08:36:26 +00:00
|
|
|
/*
|
2000-08-29 14:45:49 +00:00
|
|
|
* Look for a normal, non-privileged way to access the file/directory
|
|
|
|
* as requested. If it exists, go with that.
|
2000-08-20 08:36:26 +00:00
|
|
|
*/
|
2000-08-29 14:45:49 +00:00
|
|
|
|
|
|
|
if (privused != NULL)
|
|
|
|
*privused = 0;
|
|
|
|
|
|
|
|
dac_granted = 0;
|
|
|
|
|
|
|
|
/* Check the owner. */
|
|
|
|
if (cred->cr_uid == file_uid) {
|
2000-10-19 07:53:59 +00:00
|
|
|
dac_granted |= VADMIN;
|
2000-08-29 14:45:49 +00:00
|
|
|
if (file_mode & S_IXUSR)
|
|
|
|
dac_granted |= VEXEC;
|
|
|
|
if (file_mode & S_IRUSR)
|
|
|
|
dac_granted |= VREAD;
|
|
|
|
if (file_mode & S_IWUSR)
|
2002-07-22 03:57:07 +00:00
|
|
|
dac_granted |= (VWRITE | VAPPEND);
|
2000-08-29 14:45:49 +00:00
|
|
|
|
2008-10-28 13:44:11 +00:00
|
|
|
if ((accmode & dac_granted) == accmode)
|
2000-08-29 14:45:49 +00:00
|
|
|
return (0);
|
|
|
|
|
|
|
|
goto privcheck;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Otherwise, check the groups (first match) */
|
|
|
|
if (groupmember(file_gid, cred)) {
|
|
|
|
if (file_mode & S_IXGRP)
|
|
|
|
dac_granted |= VEXEC;
|
|
|
|
if (file_mode & S_IRGRP)
|
|
|
|
dac_granted |= VREAD;
|
|
|
|
if (file_mode & S_IWGRP)
|
2002-07-22 03:57:07 +00:00
|
|
|
dac_granted |= (VWRITE | VAPPEND);
|
2000-08-29 14:45:49 +00:00
|
|
|
|
2008-10-28 13:44:11 +00:00
|
|
|
if ((accmode & dac_granted) == accmode)
|
2000-08-29 14:45:49 +00:00
|
|
|
return (0);
|
|
|
|
|
|
|
|
goto privcheck;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Otherwise, check everyone else. */
|
|
|
|
if (file_mode & S_IXOTH)
|
|
|
|
dac_granted |= VEXEC;
|
|
|
|
if (file_mode & S_IROTH)
|
|
|
|
dac_granted |= VREAD;
|
|
|
|
if (file_mode & S_IWOTH)
|
2002-07-22 03:57:07 +00:00
|
|
|
dac_granted |= (VWRITE | VAPPEND);
|
2008-10-28 13:44:11 +00:00
|
|
|
if ((accmode & dac_granted) == accmode)
|
2000-08-29 14:45:49 +00:00
|
|
|
return (0);
|
|
|
|
|
|
|
|
privcheck:
|
|
|
|
/*
|
2006-11-06 13:42:10 +00:00
|
|
|
* Build a privilege mask to determine if the set of privileges
|
2000-08-29 14:45:49 +00:00
|
|
|
* satisfies the requirements when combined with the granted mask
|
2006-11-06 13:42:10 +00:00
|
|
|
* from above. For each privilege, if the privilege is required,
|
|
|
|
* bitwise or the request type onto the priv_granted mask.
|
2000-08-29 14:45:49 +00:00
|
|
|
*/
|
2006-11-06 13:42:10 +00:00
|
|
|
priv_granted = 0;
|
2001-11-02 15:16:59 +00:00
|
|
|
|
|
|
|
if (type == VDIR) {
|
|
|
|
/*
|
2006-11-06 13:42:10 +00:00
|
|
|
* For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC
|
|
|
|
* requests, instead of PRIV_VFS_EXEC.
|
2001-11-02 15:16:59 +00:00
|
|
|
*/
|
2008-10-28 13:44:11 +00:00
|
|
|
if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
|
2007-06-12 00:12:01 +00:00
|
|
|
!priv_check_cred(cred, PRIV_VFS_LOOKUP, 0))
|
2006-11-06 13:42:10 +00:00
|
|
|
priv_granted |= VEXEC;
|
2001-11-02 15:16:59 +00:00
|
|
|
} else {
|
2010-08-30 16:30:18 +00:00
|
|
|
/*
|
|
|
|
* Ensure that at least one execute bit is on. Otherwise,
|
|
|
|
* a privileged user will always succeed, and we don't want
|
|
|
|
* this to happen unless the file really is executable.
|
|
|
|
*/
|
2008-10-28 13:44:11 +00:00
|
|
|
if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
|
2010-08-30 16:30:18 +00:00
|
|
|
(file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 &&
|
2007-06-12 00:12:01 +00:00
|
|
|
!priv_check_cred(cred, PRIV_VFS_EXEC, 0))
|
2006-11-06 13:42:10 +00:00
|
|
|
priv_granted |= VEXEC;
|
2001-11-02 15:16:59 +00:00
|
|
|
}
|
2000-08-29 14:45:49 +00:00
|
|
|
|
2008-10-28 13:44:11 +00:00
|
|
|
if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) &&
|
2007-06-12 00:12:01 +00:00
|
|
|
!priv_check_cred(cred, PRIV_VFS_READ, 0))
|
2006-11-06 13:42:10 +00:00
|
|
|
priv_granted |= VREAD;
|
2000-08-29 14:45:49 +00:00
|
|
|
|
2008-10-28 13:44:11 +00:00
|
|
|
if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
|
2007-06-12 00:12:01 +00:00
|
|
|
!priv_check_cred(cred, PRIV_VFS_WRITE, 0))
|
2006-11-06 13:42:10 +00:00
|
|
|
priv_granted |= (VWRITE | VAPPEND);
|
2000-08-29 14:45:49 +00:00
|
|
|
|
2008-10-28 13:44:11 +00:00
|
|
|
if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
|
2007-06-12 00:12:01 +00:00
|
|
|
!priv_check_cred(cred, PRIV_VFS_ADMIN, 0))
|
2006-11-06 13:42:10 +00:00
|
|
|
priv_granted |= VADMIN;
|
2000-10-19 07:53:59 +00:00
|
|
|
|
2008-10-28 13:44:11 +00:00
|
|
|
if ((accmode & (priv_granted | dac_granted)) == accmode) {
|
2000-08-29 14:45:49 +00:00
|
|
|
/* XXX audit: privilege used */
|
|
|
|
if (privused != NULL)
|
|
|
|
*privused = 1;
|
|
|
|
return (0);
|
|
|
|
}
|
2000-08-20 08:36:26 +00:00
|
|
|
|
2008-10-28 13:44:11 +00:00
|
|
|
return ((accmode & VADMIN) ? EPERM : EACCES);
|
2000-08-20 08:36:26 +00:00
|
|
|
}
|
2002-09-05 20:38:57 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Credential check based on process requesting service, and per-attribute
|
|
|
|
* permissions.
|
|
|
|
*/
|
|
|
|
int
|
2006-01-21 19:42:10 +00:00
|
|
|
extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred,
|
2008-10-28 13:44:11 +00:00
|
|
|
struct thread *td, accmode_t accmode)
|
2002-09-05 20:38:57 +00:00
|
|
|
{
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Kernel-invoked always succeeds.
|
|
|
|
*/
|
2002-09-05 20:46:19 +00:00
|
|
|
if (cred == NOCRED)
|
2002-09-05 20:38:57 +00:00
|
|
|
return (0);
|
|
|
|
|
|
|
|
/*
|
2006-11-06 13:42:10 +00:00
|
|
|
* Do not allow privileged processes in jail to directly manipulate
|
|
|
|
* system attributes.
|
2002-09-05 20:38:57 +00:00
|
|
|
*/
|
|
|
|
switch (attrnamespace) {
|
|
|
|
case EXTATTR_NAMESPACE_SYSTEM:
|
|
|
|
/* Potentially should be: return (EPERM); */
|
2006-11-06 13:42:10 +00:00
|
|
|
return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM, 0));
|
2002-09-05 20:38:57 +00:00
|
|
|
case EXTATTR_NAMESPACE_USER:
|
2008-10-28 13:44:11 +00:00
|
|
|
return (VOP_ACCESS(vp, accmode, cred, td));
|
2002-09-05 20:38:57 +00:00
|
|
|
default:
|
|
|
|
return (EPERM);
|
|
|
|
}
|
|
|
|
}
|
2004-01-05 19:04:29 +00:00
|
|
|
|
|
|
|
#ifdef DEBUG_VFS_LOCKS
|
|
|
|
/*
|
2016-04-29 22:15:33 +00:00
|
|
|
* This only exists to suppress warnings from unlocked specfs accesses. It is
|
2004-01-05 19:04:29 +00:00
|
|
|
* no longer ok to have an unlocked VFS.
|
|
|
|
*/
|
2008-07-27 11:46:42 +00:00
|
|
|
#define IGNORE_LOCK(vp) (panicstr != NULL || (vp) == NULL || \
|
|
|
|
(vp)->v_type == VCHR || (vp)->v_type == VBAD)
|
2004-01-05 19:04:29 +00:00
|
|
|
|
|
|
|
int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */
|
2010-11-14 16:10:15 +00:00
|
|
|
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0,
|
|
|
|
"Drop into debugger on lock violation");
|
2004-07-21 07:13:14 +00:00
|
|
|
|
2004-01-05 23:40:46 +00:00
|
|
|
int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */
|
2010-11-14 16:10:15 +00:00
|
|
|
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex,
|
|
|
|
0, "Check for interlock across VOPs");
|
2004-07-21 07:13:14 +00:00
|
|
|
|
2004-01-05 19:04:29 +00:00
|
|
|
int vfs_badlock_print = 1; /* Print lock violations. */
|
2010-11-14 16:10:15 +00:00
|
|
|
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print,
|
|
|
|
0, "Print lock violations");
|
2004-07-21 07:13:14 +00:00
|
|
|
|
2016-08-12 22:20:52 +00:00
|
|
|
int vfs_badlock_vnode = 1; /* Print vnode details on lock violations. */
|
|
|
|
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_vnode, CTLFLAG_RW, &vfs_badlock_vnode,
|
|
|
|
0, "Print vnode details on lock violations");
|
|
|
|
|
2004-07-21 07:13:14 +00:00
|
|
|
#ifdef KDB
|
|
|
|
int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. */
|
2010-11-14 16:10:15 +00:00
|
|
|
SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW,
|
|
|
|
&vfs_badlock_backtrace, 0, "Print backtrace at lock violations");
|
2004-07-21 07:13:14 +00:00
|
|
|
#endif
|
2004-01-05 19:04:29 +00:00
|
|
|
|
|
|
|
static void
|
|
|
|
vfs_badlock(const char *msg, const char *str, struct vnode *vp)
|
|
|
|
{
|
|
|
|
|
2004-07-21 07:13:14 +00:00
|
|
|
#ifdef KDB
|
|
|
|
if (vfs_badlock_backtrace)
|
|
|
|
kdb_backtrace();
|
|
|
|
#endif
|
2016-08-12 22:20:52 +00:00
|
|
|
if (vfs_badlock_vnode)
|
|
|
|
vn_printf(vp, "vnode ");
|
2004-01-05 19:04:29 +00:00
|
|
|
if (vfs_badlock_print)
|
|
|
|
printf("%s: %p %s\n", str, (void *)vp, msg);
|
|
|
|
if (vfs_badlock_ddb)
|
2007-12-25 17:52:02 +00:00
|
|
|
kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
|
2004-01-05 19:04:29 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
assert_vi_locked(struct vnode *vp, const char *str)
|
|
|
|
{
|
|
|
|
|
|
|
|
if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp)))
|
|
|
|
vfs_badlock("interlock is not locked but should be", str, vp);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
assert_vi_unlocked(struct vnode *vp, const char *str)
|
|
|
|
{
|
|
|
|
|
|
|
|
if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp)))
|
|
|
|
vfs_badlock("interlock is locked but should not be", str, vp);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
assert_vop_locked(struct vnode *vp, const char *str)
|
|
|
|
{
|
2012-11-24 13:11:47 +00:00
|
|
|
int locked;
|
2004-01-05 19:04:29 +00:00
|
|
|
|
2012-11-24 13:11:47 +00:00
|
|
|
if (!IGNORE_LOCK(vp)) {
|
|
|
|
locked = VOP_ISLOCKED(vp);
|
|
|
|
if (locked == 0 || locked == LK_EXCLOTHER)
|
|
|
|
vfs_badlock("is not locked but should be", str, vp);
|
|
|
|
}
|
2004-01-05 19:04:29 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
assert_vop_unlocked(struct vnode *vp, const char *str)
|
|
|
|
{
|
|
|
|
|
2008-07-27 11:46:42 +00:00
|
|
|
if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE)
|
2004-01-05 19:04:29 +00:00
|
|
|
vfs_badlock("is locked but should not be", str, vp);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
assert_vop_elocked(struct vnode *vp, const char *str)
|
|
|
|
{
|
|
|
|
|
2008-07-27 11:46:42 +00:00
|
|
|
if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
|
2004-01-05 19:04:29 +00:00
|
|
|
vfs_badlock("is not exclusive locked but should be", str, vp);
|
|
|
|
}
|
2005-06-09 20:20:31 +00:00
|
|
|
#endif /* DEBUG_VFS_LOCKS */
|
2004-01-05 19:04:29 +00:00
|
|
|
|
2010-04-02 14:03:01 +00:00
|
|
|
void
|
|
|
|
vop_rename_fail(struct vop_rename_args *ap)
|
|
|
|
{
|
|
|
|
|
|
|
|
if (ap->a_tvp != NULL)
|
|
|
|
vput(ap->a_tvp);
|
|
|
|
if (ap->a_tdvp == ap->a_tvp)
|
|
|
|
vrele(ap->a_tdvp);
|
|
|
|
else
|
|
|
|
vput(ap->a_tdvp);
|
|
|
|
vrele(ap->a_fdvp);
|
|
|
|
vrele(ap->a_fvp);
|
|
|
|
}
|
|
|
|
|
2004-01-05 19:04:29 +00:00
|
|
|
void
|
|
|
|
vop_rename_pre(void *ap)
|
|
|
|
{
|
|
|
|
struct vop_rename_args *a = ap;
|
|
|
|
|
2005-06-09 20:20:31 +00:00
|
|
|
#ifdef DEBUG_VFS_LOCKS
|
2004-01-05 19:04:29 +00:00
|
|
|
if (a->a_tvp)
|
|
|
|
ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
|
|
|
|
ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
|
|
|
|
ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
|
|
|
|
ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
|
|
|
|
|
|
|
|
/* Check the source (from). */
|
2010-06-03 10:20:08 +00:00
|
|
|
if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock &&
|
|
|
|
(a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock))
|
2004-01-05 19:04:29 +00:00
|
|
|
ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
|
2010-06-03 10:20:08 +00:00
|
|
|
if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock)
|
2006-11-04 23:57:02 +00:00
|
|
|
ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");
|
2004-01-05 19:04:29 +00:00
|
|
|
|
|
|
|
/* Check the target. */
|
|
|
|
if (a->a_tvp)
|
|
|
|
ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
|
|
|
|
ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
|
2005-06-09 20:20:31 +00:00
|
|
|
#endif
|
|
|
|
if (a->a_tdvp != a->a_fdvp)
|
2005-08-28 23:00:11 +00:00
|
|
|
vhold(a->a_fdvp);
|
2005-06-09 20:20:31 +00:00
|
|
|
if (a->a_tvp != a->a_fvp)
|
|
|
|
vhold(a->a_fvp);
|
|
|
|
vhold(a->a_tdvp);
|
|
|
|
if (a->a_tvp)
|
|
|
|
vhold(a->a_tvp);
|
2004-01-05 19:04:29 +00:00
|
|
|
}
|
|
|
|
|
2016-06-17 19:41:30 +00:00
|
|
|
#ifdef DEBUG_VFS_LOCKS
|
2004-01-05 19:04:29 +00:00
|
|
|
void
|
|
|
|
vop_strategy_pre(void *ap)
|
|
|
|
{
|
2004-01-05 23:40:46 +00:00
|
|
|
struct vop_strategy_args *a;
|
2004-01-05 19:04:29 +00:00
|
|
|
struct buf *bp;
|
|
|
|
|
2004-01-05 23:40:46 +00:00
|
|
|
a = ap;
|
2004-01-05 19:04:29 +00:00
|
|
|
bp = a->a_bp;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Cluster ops lock their component buffers but not the IO container.
|
|
|
|
*/
|
|
|
|
if ((bp->b_flags & B_CLUSTER) != 0)
|
|
|
|
return;
|
|
|
|
|
2011-03-08 11:50:59 +00:00
|
|
|
if (panicstr == NULL && !BUF_ISLOCKED(bp)) {
|
2004-01-05 19:04:29 +00:00
|
|
|
if (vfs_badlock_print)
|
|
|
|
printf(
|
2004-01-05 23:40:46 +00:00
|
|
|
"VOP_STRATEGY: bp is not locked but should be\n");
|
2004-01-05 19:04:29 +00:00
|
|
|
if (vfs_badlock_ddb)
|
2007-12-25 17:52:02 +00:00
|
|
|
kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
|
2004-01-05 19:04:29 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
vop_lock_pre(void *ap)
|
|
|
|
{
|
2007-05-18 13:02:13 +00:00
|
|
|
struct vop_lock1_args *a = ap;
|
2004-01-05 19:04:29 +00:00
|
|
|
|
|
|
|
if ((a->a_flags & LK_INTERLOCK) == 0)
|
|
|
|
ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
|
|
|
|
else
|
|
|
|
ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK");
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
vop_lock_post(void *ap, int rc)
|
|
|
|
{
|
2007-05-18 13:02:13 +00:00
|
|
|
struct vop_lock1_args *a = ap;
|
2004-01-05 19:04:29 +00:00
|
|
|
|
|
|
|
ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
|
2013-04-16 14:22:16 +00:00
|
|
|
if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0)
|
2004-01-05 19:04:29 +00:00
|
|
|
ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
vop_unlock_pre(void *ap)
|
|
|
|
{
|
|
|
|
struct vop_unlock_args *a = ap;
|
|
|
|
|
|
|
|
if (a->a_flags & LK_INTERLOCK)
|
|
|
|
ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK");
|
|
|
|
ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK");
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
vop_unlock_post(void *ap, int rc)
|
|
|
|
{
|
|
|
|
struct vop_unlock_args *a = ap;
|
|
|
|
|
|
|
|
if (a->a_flags & LK_INTERLOCK)
|
|
|
|
ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK");
|
2005-06-09 20:20:31 +00:00
|
|
|
}
|
2016-06-17 19:41:30 +00:00
|
|
|
#endif
|
2005-06-09 20:20:31 +00:00
|
|
|
|
|
|
|
void
|
|
|
|
vop_create_post(void *ap, int rc)
|
|
|
|
{
|
|
|
|
struct vop_create_args *a = ap;
|
|
|
|
|
|
|
|
if (!rc)
|
2007-04-10 15:29:37 +00:00
|
|
|
VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
|
2005-06-09 20:20:31 +00:00
|
|
|
}
|
|
|
|
|
2011-12-23 20:11:37 +00:00
|
|
|
void
|
|
|
|
vop_deleteextattr_post(void *ap, int rc)
|
|
|
|
{
|
2012-01-06 20:05:48 +00:00
|
|
|
struct vop_deleteextattr_args *a = ap;
|
2011-12-23 20:11:37 +00:00
|
|
|
|
|
|
|
if (!rc)
|
|
|
|
VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
|
|
|
|
}
|
|
|
|
|
2005-06-09 20:20:31 +00:00
|
|
|
void
|
|
|
|
vop_link_post(void *ap, int rc)
|
|
|
|
{
|
|
|
|
struct vop_link_args *a = ap;
|
2007-04-10 15:29:37 +00:00
|
|
|
|
2005-06-09 20:20:31 +00:00
|
|
|
if (!rc) {
|
2007-04-10 15:29:37 +00:00
|
|
|
VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK);
|
2005-07-01 16:28:32 +00:00
|
|
|
VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE);
|
2005-06-09 20:20:31 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
vop_mkdir_post(void *ap, int rc)
|
|
|
|
{
|
|
|
|
struct vop_mkdir_args *a = ap;
|
|
|
|
|
|
|
|
if (!rc)
|
2005-07-01 16:28:32 +00:00
|
|
|
VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
|
2005-06-09 20:20:31 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
vop_mknod_post(void *ap, int rc)
|
|
|
|
{
|
|
|
|
struct vop_mknod_args *a = ap;
|
|
|
|
|
|
|
|
if (!rc)
|
2005-07-01 16:28:32 +00:00
|
|
|
VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
|
2005-06-09 20:20:31 +00:00
|
|
|
}
|
|
|
|
|
2015-09-15 20:22:30 +00:00
|
|
|
void
|
|
|
|
vop_reclaim_post(void *ap, int rc)
|
|
|
|
{
|
|
|
|
struct vop_reclaim_args *a = ap;
|
|
|
|
|
|
|
|
if (!rc)
|
|
|
|
VFS_KNOTE_LOCKED(a->a_vp, NOTE_REVOKE);
|
|
|
|
}
|
|
|
|
|
2005-06-09 20:20:31 +00:00
|
|
|
void
|
|
|
|
vop_remove_post(void *ap, int rc)
|
|
|
|
{
|
|
|
|
struct vop_remove_args *a = ap;
|
|
|
|
|
|
|
|
if (!rc) {
|
2005-07-01 16:28:32 +00:00
|
|
|
VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
|
|
|
|
VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
|
2005-06-09 20:20:31 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
vop_rename_post(void *ap, int rc)
|
|
|
|
{
|
|
|
|
struct vop_rename_args *a = ap;
|
2016-05-02 13:13:32 +00:00
|
|
|
long hint;
|
2005-06-09 20:20:31 +00:00
|
|
|
|
|
|
|
if (!rc) {
|
2016-05-02 13:13:32 +00:00
|
|
|
hint = NOTE_WRITE;
|
|
|
|
if (a->a_fdvp == a->a_tdvp) {
|
|
|
|
if (a->a_tvp != NULL && a->a_tvp->v_type == VDIR)
|
|
|
|
hint |= NOTE_LINK;
|
|
|
|
VFS_KNOTE_UNLOCKED(a->a_fdvp, hint);
|
|
|
|
VFS_KNOTE_UNLOCKED(a->a_tdvp, hint);
|
|
|
|
} else {
|
2016-05-02 13:18:17 +00:00
|
|
|
hint |= NOTE_EXTEND;
|
2016-05-02 13:13:32 +00:00
|
|
|
if (a->a_fvp->v_type == VDIR)
|
|
|
|
hint |= NOTE_LINK;
|
|
|
|
VFS_KNOTE_UNLOCKED(a->a_fdvp, hint);
|
|
|
|
|
|
|
|
if (a->a_fvp->v_type == VDIR && a->a_tvp != NULL &&
|
|
|
|
a->a_tvp->v_type == VDIR)
|
|
|
|
hint &= ~NOTE_LINK;
|
|
|
|
VFS_KNOTE_UNLOCKED(a->a_tdvp, hint);
|
|
|
|
}
|
|
|
|
|
2005-07-01 16:28:32 +00:00
|
|
|
VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME);
|
2005-06-09 20:20:31 +00:00
|
|
|
if (a->a_tvp)
|
2005-07-01 16:28:32 +00:00
|
|
|
VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE);
|
2005-06-09 20:20:31 +00:00
|
|
|
}
|
|
|
|
if (a->a_tdvp != a->a_fdvp)
|
|
|
|
vdrop(a->a_fdvp);
|
|
|
|
if (a->a_tvp != a->a_fvp)
|
|
|
|
vdrop(a->a_fvp);
|
|
|
|
vdrop(a->a_tdvp);
|
|
|
|
if (a->a_tvp)
|
|
|
|
vdrop(a->a_tvp);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
vop_rmdir_post(void *ap, int rc)
|
|
|
|
{
|
|
|
|
struct vop_rmdir_args *a = ap;
|
|
|
|
|
|
|
|
if (!rc) {
|
2005-07-01 16:28:32 +00:00
|
|
|
VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
|
|
|
|
VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
|
2005-06-09 20:20:31 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
vop_setattr_post(void *ap, int rc)
|
|
|
|
{
|
|
|
|
struct vop_setattr_args *a = ap;
|
|
|
|
|
|
|
|
if (!rc)
|
2005-07-01 16:28:32 +00:00
|
|
|
VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
|
2005-06-09 20:20:31 +00:00
|
|
|
}
|
|
|
|
|
2011-12-23 20:11:37 +00:00
|
|
|
void
|
|
|
|
vop_setextattr_post(void *ap, int rc)
|
|
|
|
{
|
2012-01-06 20:05:48 +00:00
|
|
|
struct vop_setextattr_args *a = ap;
|
2011-12-23 20:11:37 +00:00
|
|
|
|
|
|
|
if (!rc)
|
|
|
|
VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
|
|
|
|
}
|
|
|
|
|
2005-06-09 20:20:31 +00:00
|
|
|
void
|
|
|
|
vop_symlink_post(void *ap, int rc)
|
|
|
|
{
|
|
|
|
struct vop_symlink_args *a = ap;
|
2007-04-10 15:29:37 +00:00
|
|
|
|
2005-06-09 20:20:31 +00:00
|
|
|
if (!rc)
|
2005-07-01 16:28:32 +00:00
|
|
|
VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
|
2004-01-05 19:04:29 +00:00
|
|
|
}
|
2004-07-04 10:52:54 +00:00
|
|
|
|
2016-05-03 15:17:43 +00:00
|
|
|
void
|
|
|
|
vop_open_post(void *ap, int rc)
|
|
|
|
{
|
|
|
|
struct vop_open_args *a = ap;
|
|
|
|
|
|
|
|
if (!rc)
|
|
|
|
VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
vop_close_post(void *ap, int rc)
|
|
|
|
{
|
|
|
|
struct vop_close_args *a = ap;
|
|
|
|
|
|
|
|
if (!rc && (a->a_cred != NOCRED || /* filter out revokes */
|
|
|
|
(a->a_vp->v_iflag & VI_DOOMED) == 0)) {
|
|
|
|
VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) != 0 ?
|
|
|
|
NOTE_CLOSE_WRITE : NOTE_CLOSE);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
vop_read_post(void *ap, int rc)
|
|
|
|
{
|
|
|
|
struct vop_read_args *a = ap;
|
|
|
|
|
|
|
|
if (!rc)
|
|
|
|
VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ);
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
vop_readdir_post(void *ap, int rc)
|
|
|
|
{
|
|
|
|
struct vop_readdir_args *a = ap;
|
|
|
|
|
|
|
|
if (!rc)
|
|
|
|
VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ);
|
|
|
|
}
|
|
|
|
|
2004-08-15 06:24:42 +00:00
|
|
|
static struct knlist fs_knlist;
|
|
|
|
|
|
|
|
static void
|
|
|
|
vfs_event_init(void *arg)
|
|
|
|
{
|
2009-06-10 20:59:32 +00:00
|
|
|
knlist_init_mtx(&fs_knlist, NULL);
|
2004-08-15 06:24:42 +00:00
|
|
|
}
|
|
|
|
/* XXX - correct order? */
|
|
|
|
SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL);
|
2004-07-04 10:52:54 +00:00
|
|
|
|
|
|
|
void
|
2010-06-21 09:55:56 +00:00
|
|
|
vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused)
|
2004-07-04 10:52:54 +00:00
|
|
|
{
|
|
|
|
|
2004-08-15 06:24:42 +00:00
|
|
|
KNOTE_UNLOCKED(&fs_knlist, event);
|
2004-07-04 10:52:54 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static int filt_fsattach(struct knote *kn);
|
|
|
|
static void filt_fsdetach(struct knote *kn);
|
|
|
|
static int filt_fsevent(struct knote *kn, long hint);
|
|
|
|
|
2009-09-12 20:03:45 +00:00
|
|
|
struct filterops fs_filtops = {
|
|
|
|
.f_isfd = 0,
|
|
|
|
.f_attach = filt_fsattach,
|
|
|
|
.f_detach = filt_fsdetach,
|
|
|
|
.f_event = filt_fsevent
|
|
|
|
};
|
2004-07-04 10:52:54 +00:00
|
|
|
|
|
|
|
static int
|
|
|
|
filt_fsattach(struct knote *kn)
|
|
|
|
{
|
|
|
|
|
|
|
|
kn->kn_flags |= EV_CLEAR;
|
2004-08-15 06:24:42 +00:00
|
|
|
knlist_add(&fs_knlist, kn, 0);
|
2004-07-04 10:52:54 +00:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
filt_fsdetach(struct knote *kn)
|
|
|
|
{
|
|
|
|
|
2004-08-15 06:24:42 +00:00
|
|
|
knlist_remove(&fs_knlist, kn, 0);
|
2004-07-04 10:52:54 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
filt_fsevent(struct knote *kn, long hint)
|
|
|
|
{
|
|
|
|
|
|
|
|
kn->kn_fflags |= hint;
|
|
|
|
return (kn->kn_fflags != 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS)
|
|
|
|
{
|
|
|
|
struct vfsidctl vc;
|
|
|
|
int error;
|
|
|
|
struct mount *mp;
|
|
|
|
|
|
|
|
error = SYSCTL_IN(req, &vc, sizeof(vc));
|
|
|
|
if (error)
|
|
|
|
return (error);
|
2004-07-04 20:21:58 +00:00
|
|
|
if (vc.vc_vers != VFS_CTL_VERS1)
|
|
|
|
return (EINVAL);
|
2004-07-04 10:52:54 +00:00
|
|
|
mp = vfs_getvfs(&vc.vc_fsid);
|
|
|
|
if (mp == NULL)
|
|
|
|
return (ENOENT);
|
2004-07-04 20:21:58 +00:00
|
|
|
/* ensure that a specific sysctl goes to the right filesystem. */
|
|
|
|
if (strcmp(vc.vc_fstypename, "*") != 0 &&
|
|
|
|
strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) {
|
2006-03-31 03:53:25 +00:00
|
|
|
vfs_rel(mp);
|
2004-07-04 20:21:58 +00:00
|
|
|
return (EINVAL);
|
|
|
|
}
|
2004-07-04 10:52:54 +00:00
|
|
|
VCTLTOREQ(&vc, req);
|
2006-03-31 03:53:25 +00:00
|
|
|
error = VFS_SYSCTL(mp, vc.vc_op, req);
|
|
|
|
vfs_rel(mp);
|
|
|
|
return (error);
|
2004-07-04 10:52:54 +00:00
|
|
|
}
|
|
|
|
|
2011-01-18 21:14:18 +00:00
|
|
|
SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_WR,
|
|
|
|
NULL, 0, sysctl_vfs_ctl, "",
|
2007-04-10 15:29:37 +00:00
|
|
|
"Sysctl by fsid");
|
2004-09-07 09:17:05 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Function to initialize a va_filerev field sensibly.
|
|
|
|
* XXX: Wouldn't a random number make a lot more sense ??
|
|
|
|
*/
|
|
|
|
u_quad_t
|
|
|
|
init_va_filerev(void)
|
|
|
|
{
|
|
|
|
struct bintime bt;
|
|
|
|
|
|
|
|
getbinuptime(&bt);
|
|
|
|
return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL));
|
|
|
|
}
|
2005-06-09 20:20:31 +00:00
|
|
|
|
|
|
|
static int filt_vfsread(struct knote *kn, long hint);
|
|
|
|
static int filt_vfswrite(struct knote *kn, long hint);
|
|
|
|
static int filt_vfsvnode(struct knote *kn, long hint);
|
|
|
|
static void filt_vfsdetach(struct knote *kn);
|
2009-09-12 20:03:45 +00:00
|
|
|
static struct filterops vfsread_filtops = {
|
|
|
|
.f_isfd = 1,
|
|
|
|
.f_detach = filt_vfsdetach,
|
|
|
|
.f_event = filt_vfsread
|
|
|
|
};
|
|
|
|
static struct filterops vfswrite_filtops = {
|
|
|
|
.f_isfd = 1,
|
|
|
|
.f_detach = filt_vfsdetach,
|
|
|
|
.f_event = filt_vfswrite
|
|
|
|
};
|
|
|
|
static struct filterops vfsvnode_filtops = {
|
|
|
|
.f_isfd = 1,
|
|
|
|
.f_detach = filt_vfsdetach,
|
|
|
|
.f_event = filt_vfsvnode
|
|
|
|
};
|
2005-06-09 20:20:31 +00:00
|
|
|
|
2005-07-01 16:28:32 +00:00
|
|
|
static void
|
|
|
|
vfs_knllock(void *arg)
|
|
|
|
{
|
|
|
|
struct vnode *vp = arg;
|
|
|
|
|
2008-01-10 01:10:58 +00:00
|
|
|
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
|
2005-07-01 16:28:32 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Unlock callback: release the vnode lock on the vnode passed as the
 * opaque argument.
 */
static void
vfs_knlunlock(void *arg)
{
	struct vnode *vp;

	vp = arg;
	VOP_UNLOCK(vp, 0);
}
|
|
|
|
|
2009-06-10 20:59:32 +00:00
|
|
|
/*
 * Assertion callback: with DEBUG_VFS_LOCKS, check that the vnode passed
 * as the opaque argument is locked.  Otherwise a no-op.
 */
static void
vfs_knl_assert_locked(void *arg)
{
#ifdef DEBUG_VFS_LOCKS
	struct vnode *vp;

	vp = arg;
	ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked");
#endif
}
|
|
|
|
|
|
|
|
/*
 * Assertion callback: with DEBUG_VFS_LOCKS, check that the vnode passed
 * as the opaque argument is unlocked.  Otherwise a no-op.
 */
static void
vfs_knl_assert_unlocked(void *arg)
{
#ifdef DEBUG_VFS_LOCKS
	struct vnode *vp;

	vp = arg;
	ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked");
#endif
}
|
|
|
|
|
2005-06-09 20:20:31 +00:00
|
|
|
int
|
|
|
|
vfs_kqfilter(struct vop_kqfilter_args *ap)
|
|
|
|
{
|
|
|
|
struct vnode *vp = ap->a_vp;
|
|
|
|
struct knote *kn = ap->a_kn;
|
2007-04-10 15:29:37 +00:00
|
|
|
struct knlist *knl;
|
2005-06-09 20:20:31 +00:00
|
|
|
|
|
|
|
switch (kn->kn_filter) {
|
|
|
|
case EVFILT_READ:
|
|
|
|
kn->kn_fop = &vfsread_filtops;
|
|
|
|
break;
|
|
|
|
case EVFILT_WRITE:
|
|
|
|
kn->kn_fop = &vfswrite_filtops;
|
|
|
|
break;
|
|
|
|
case EVFILT_VNODE:
|
|
|
|
kn->kn_fop = &vfsvnode_filtops;
|
|
|
|
break;
|
|
|
|
default:
|
2005-09-12 19:22:37 +00:00
|
|
|
return (EINVAL);
|
2005-06-09 20:20:31 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
kn->kn_hook = (caddr_t)vp;
|
|
|
|
|
2008-10-28 12:08:36 +00:00
|
|
|
v_addpollinfo(vp);
|
2005-06-09 20:20:31 +00:00
|
|
|
if (vp->v_pollinfo == NULL)
|
|
|
|
return (ENOMEM);
|
2005-07-01 16:28:32 +00:00
|
|
|
knl = &vp->v_pollinfo->vpi_selinfo.si_note;
|
2013-09-26 13:14:51 +00:00
|
|
|
vhold(vp);
|
2005-07-01 16:28:32 +00:00
|
|
|
knlist_add(knl, kn, 0);
|
2005-06-09 20:20:31 +00:00
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Detach knote from vnode
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
filt_vfsdetach(struct knote *kn)
|
|
|
|
{
|
|
|
|
struct vnode *vp = (struct vnode *)kn->kn_hook;
|
|
|
|
|
|
|
|
KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo"));
|
|
|
|
knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
|
2013-09-26 13:14:51 +00:00
|
|
|
vdrop(vp);
|
2005-06-09 20:20:31 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*ARGSUSED*/
|
|
|
|
static int
|
|
|
|
filt_vfsread(struct knote *kn, long hint)
|
|
|
|
{
|
|
|
|
struct vnode *vp = (struct vnode *)kn->kn_hook;
|
|
|
|
struct vattr va;
|
2009-06-10 20:59:32 +00:00
|
|
|
int res;
|
2005-06-09 20:20:31 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* filesystem is gone, so set the EOF flag and schedule
|
|
|
|
* the knote for deletion.
|
|
|
|
*/
|
2015-09-15 20:22:30 +00:00
|
|
|
if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) {
|
2009-06-10 20:59:32 +00:00
|
|
|
VI_LOCK(vp);
|
2005-06-09 20:20:31 +00:00
|
|
|
kn->kn_flags |= (EV_EOF | EV_ONESHOT);
|
2009-06-10 20:59:32 +00:00
|
|
|
VI_UNLOCK(vp);
|
2005-06-09 20:20:31 +00:00
|
|
|
return (1);
|
|
|
|
}
|
|
|
|
|
2008-08-28 15:23:18 +00:00
|
|
|
if (VOP_GETATTR(vp, &va, curthread->td_ucred))
|
2005-06-09 20:20:31 +00:00
|
|
|
return (0);
|
|
|
|
|
2009-06-10 20:59:32 +00:00
|
|
|
VI_LOCK(vp);
|
2005-06-09 20:20:31 +00:00
|
|
|
kn->kn_data = va.va_size - kn->kn_fp->f_offset;
|
2015-08-05 07:34:29 +00:00
|
|
|
res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0;
|
2009-06-10 20:59:32 +00:00
|
|
|
VI_UNLOCK(vp);
|
|
|
|
return (res);
|
2005-06-09 20:20:31 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*ARGSUSED*/
|
|
|
|
static int
|
|
|
|
filt_vfswrite(struct knote *kn, long hint)
|
|
|
|
{
|
2009-06-10 20:59:32 +00:00
|
|
|
struct vnode *vp = (struct vnode *)kn->kn_hook;
|
|
|
|
|
|
|
|
VI_LOCK(vp);
|
|
|
|
|
2005-06-09 20:20:31 +00:00
|
|
|
/*
|
|
|
|
* filesystem is gone, so set the EOF flag and schedule
|
|
|
|
* the knote for deletion.
|
|
|
|
*/
|
2015-09-15 20:22:30 +00:00
|
|
|
if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD))
|
2005-06-09 20:20:31 +00:00
|
|
|
kn->kn_flags |= (EV_EOF | EV_ONESHOT);
|
|
|
|
|
|
|
|
kn->kn_data = 0;
|
2009-06-10 20:59:32 +00:00
|
|
|
VI_UNLOCK(vp);
|
2005-06-09 20:20:31 +00:00
|
|
|
return (1);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
filt_vfsvnode(struct knote *kn, long hint)
|
|
|
|
{
|
2009-06-10 20:59:32 +00:00
|
|
|
struct vnode *vp = (struct vnode *)kn->kn_hook;
|
|
|
|
int res;
|
|
|
|
|
|
|
|
VI_LOCK(vp);
|
2005-06-09 20:20:31 +00:00
|
|
|
if (kn->kn_sfflags & hint)
|
|
|
|
kn->kn_fflags |= hint;
|
2015-09-15 20:22:30 +00:00
|
|
|
if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) {
|
2005-06-09 20:20:31 +00:00
|
|
|
kn->kn_flags |= EV_EOF;
|
2009-06-10 20:59:32 +00:00
|
|
|
VI_UNLOCK(vp);
|
2005-06-09 20:20:31 +00:00
|
|
|
return (1);
|
|
|
|
}
|
2009-06-10 20:59:32 +00:00
|
|
|
res = (kn->kn_fflags != 0);
|
|
|
|
VI_UNLOCK(vp);
|
|
|
|
return (res);
|
2005-06-09 20:20:31 +00:00
|
|
|
}
|
2005-09-12 08:46:07 +00:00
|
|
|
|
|
|
|
int
|
|
|
|
vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off)
|
|
|
|
{
|
|
|
|
int error;
|
|
|
|
|
|
|
|
if (dp->d_reclen > ap->a_uio->uio_resid)
|
|
|
|
return (ENAMETOOLONG);
|
|
|
|
error = uiomove(dp, dp->d_reclen, ap->a_uio);
|
|
|
|
if (error) {
|
|
|
|
if (ap->a_ncookies != NULL) {
|
|
|
|
if (ap->a_cookies != NULL)
|
|
|
|
free(ap->a_cookies, M_TEMP);
|
|
|
|
ap->a_cookies = NULL;
|
|
|
|
*ap->a_ncookies = 0;
|
|
|
|
}
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
if (ap->a_ncookies == NULL)
|
|
|
|
return (0);
|
2005-11-09 22:03:50 +00:00
|
|
|
|
|
|
|
KASSERT(ap->a_cookies,
|
|
|
|
("NULL ap->a_cookies value with non-NULL ap->a_ncookies!"));
|
|
|
|
|
2005-09-12 08:46:07 +00:00
|
|
|
*ap->a_cookies = realloc(*ap->a_cookies,
|
|
|
|
(*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO);
|
|
|
|
(*ap->a_cookies)[*ap->a_ncookies] = off;
|
2016-05-16 07:31:11 +00:00
|
|
|
*ap->a_ncookies += 1;
|
2005-09-12 08:46:07 +00:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2005-10-12 06:56:00 +00:00
|
|
|
/*
|
|
|
|
* Mark for update the access time of the file if the filesystem
|
2009-01-23 22:13:00 +00:00
|
|
|
* supports VOP_MARKATIME. This functionality is used by execve and
|
|
|
|
* mmap, so we want to avoid the I/O implied by directly setting
|
|
|
|
* va_atime for the sake of efficiency.
|
2005-10-12 06:56:00 +00:00
|
|
|
*/
|
|
|
|
void
|
2008-08-28 15:23:18 +00:00
|
|
|
vfs_mark_atime(struct vnode *vp, struct ucred *cred)
|
2005-10-12 06:56:00 +00:00
|
|
|
{
|
2009-09-09 10:51:50 +00:00
|
|
|
struct mount *mp;
|
2005-10-12 06:56:00 +00:00
|
|
|
|
2009-09-09 10:51:50 +00:00
|
|
|
mp = vp->v_mount;
|
|
|
|
ASSERT_VOP_LOCKED(vp, "vfs_mark_atime");
|
|
|
|
if (mp != NULL && (mp->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0)
|
2009-01-21 14:42:00 +00:00
|
|
|
(void)VOP_MARKATIME(vp);
|
2005-10-12 06:56:00 +00:00
|
|
|
}
|
2009-05-30 13:59:05 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* The purpose of this routine is to remove granularity from accmode_t,
|
|
|
|
* reducing it into standard unix access bits - VEXEC, VREAD, VWRITE,
|
|
|
|
* VADMIN and VAPPEND.
|
|
|
|
*
|
|
|
|
* If it returns 0, the caller is supposed to continue with the usual
|
|
|
|
* access checks using 'accmode' as modified by this routine. If it
|
|
|
|
* returns nonzero value, the caller is supposed to return that value
|
|
|
|
* as errno.
|
|
|
|
*
|
|
|
|
* Note that after this routine runs, accmode may be zero.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
vfs_unixify_accmode(accmode_t *accmode)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* There is no way to specify explicit "deny" rule using
|
|
|
|
* file mode or POSIX.1e ACLs.
|
|
|
|
*/
|
|
|
|
if (*accmode & VEXPLICIT_DENY) {
|
|
|
|
*accmode = 0;
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* None of these can be translated into usual access bits.
|
|
|
|
* Also, the common case for NFSv4 ACLs is to not contain
|
|
|
|
* either of these bits. Caller should check for VWRITE
|
|
|
|
* on the containing directory instead.
|
|
|
|
*/
|
|
|
|
if (*accmode & (VDELETE_CHILD | VDELETE))
|
|
|
|
return (EPERM);
|
|
|
|
|
|
|
|
if (*accmode & VADMIN_PERMS) {
|
|
|
|
*accmode &= ~VADMIN_PERMS;
|
|
|
|
*accmode |= VADMIN;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL
|
|
|
|
* or VSYNCHRONIZE using file mode or POSIX.1e ACL.
|
|
|
|
*/
|
|
|
|
*accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE);
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
2012-04-17 16:28:22 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* These are helper functions for filesystems to traverse all
|
|
|
|
* their vnodes. See MNT_VNODE_FOREACH_ALL() in sys/mount.h.
|
|
|
|
*
|
|
|
|
* This interface replaces MNT_VNODE_FOREACH.
|
|
|
|
*/
|
|
|
|
|
|
|
|
MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker");
|
|
|
|
|
|
|
|
struct vnode *
|
|
|
|
__mnt_vnode_next_all(struct vnode **mvp, struct mount *mp)
|
|
|
|
{
|
|
|
|
struct vnode *vp;
|
|
|
|
|
|
|
|
if (should_yield())
|
2012-12-21 13:14:12 +00:00
|
|
|
kern_yield(PRI_USER);
|
2012-04-17 16:28:22 +00:00
|
|
|
MNT_ILOCK(mp);
|
|
|
|
KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
|
|
|
|
vp = TAILQ_NEXT(*mvp, v_nmntvnodes);
|
|
|
|
while (vp != NULL && (vp->v_type == VMARKER ||
|
|
|
|
(vp->v_iflag & VI_DOOMED) != 0))
|
|
|
|
vp = TAILQ_NEXT(vp, v_nmntvnodes);
|
|
|
|
|
|
|
|
/* Check if we are done */
|
|
|
|
if (vp == NULL) {
|
|
|
|
__mnt_vnode_markerfree_all(mvp, mp);
|
|
|
|
/* MNT_IUNLOCK(mp); -- done in above function */
|
|
|
|
mtx_assert(MNT_MTX(mp), MA_NOTOWNED);
|
|
|
|
return (NULL);
|
|
|
|
}
|
|
|
|
TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
|
|
|
|
TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
|
|
|
|
VI_LOCK(vp);
|
|
|
|
MNT_IUNLOCK(mp);
|
|
|
|
return (vp);
|
|
|
|
}
|
|
|
|
|
|
|
|
struct vnode *
|
|
|
|
__mnt_vnode_first_all(struct vnode **mvp, struct mount *mp)
|
|
|
|
{
|
|
|
|
struct vnode *vp;
|
|
|
|
|
|
|
|
*mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
|
|
|
|
MNT_ILOCK(mp);
|
|
|
|
MNT_REF(mp);
|
|
|
|
(*mvp)->v_type = VMARKER;
|
|
|
|
|
|
|
|
vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
|
|
|
|
while (vp != NULL && (vp->v_type == VMARKER ||
|
|
|
|
(vp->v_iflag & VI_DOOMED) != 0))
|
|
|
|
vp = TAILQ_NEXT(vp, v_nmntvnodes);
|
|
|
|
|
|
|
|
/* Check if we are done */
|
|
|
|
if (vp == NULL) {
|
|
|
|
MNT_REL(mp);
|
|
|
|
MNT_IUNLOCK(mp);
|
|
|
|
free(*mvp, M_VNODE_MARKER);
|
2012-04-18 19:30:22 +00:00
|
|
|
*mvp = NULL;
|
2012-04-17 16:28:22 +00:00
|
|
|
return (NULL);
|
|
|
|
}
|
|
|
|
(*mvp)->v_mount = mp;
|
|
|
|
TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
|
|
|
|
VI_LOCK(vp);
|
|
|
|
MNT_IUNLOCK(mp);
|
|
|
|
return (vp);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void
|
|
|
|
__mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp)
|
|
|
|
{
|
|
|
|
|
|
|
|
if (*mvp == NULL) {
|
|
|
|
MNT_IUNLOCK(mp);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
mtx_assert(MNT_MTX(mp), MA_OWNED);
|
|
|
|
|
|
|
|
KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
|
|
|
|
TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
|
|
|
|
MNT_REL(mp);
|
|
|
|
MNT_IUNLOCK(mp);
|
|
|
|
free(*mvp, M_VNODE_MARKER);
|
|
|
|
*mvp = NULL;
|
|
|
|
}
|
2012-04-20 06:50:44 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* These are helper functions for filesystems to traverse their
|
|
|
|
* active vnodes. See MNT_VNODE_FOREACH_ACTIVE() in sys/mount.h
|
|
|
|
*/
|
2012-12-15 02:04:46 +00:00
|
|
|
static void
|
|
|
|
mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp)
|
|
|
|
{
|
|
|
|
|
|
|
|
KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
|
|
|
|
|
|
|
|
MNT_ILOCK(mp);
|
|
|
|
MNT_REL(mp);
|
|
|
|
MNT_IUNLOCK(mp);
|
|
|
|
free(*mvp, M_VNODE_MARKER);
|
|
|
|
*mvp = NULL;
|
|
|
|
}
|
|
|
|
|
2017-05-15 10:02:45 +00:00
|
|
|
/*
|
|
|
|
* Relock the mp mount vnode list lock with the vp vnode interlock in the
|
|
|
|
* conventional lock order during mnt_vnode_next_active iteration.
|
|
|
|
*
|
|
|
|
* On entry, the mount vnode list lock is held and the vnode interlock is not.
|
|
|
|
* The list lock is dropped and reacquired. On success, both locks are held.
|
|
|
|
* On failure, the mount vnode list lock is held but the vnode interlock is
|
|
|
|
* not, and the procedure may have yielded.
|
|
|
|
*/
|
|
|
|
static bool
|
|
|
|
mnt_vnode_next_active_relock(struct vnode *mvp, struct mount *mp,
|
|
|
|
struct vnode *vp)
|
|
|
|
{
|
|
|
|
const struct vnode *tmp;
|
|
|
|
bool held, ret;
|
|
|
|
|
|
|
|
VNASSERT(mvp->v_mount == mp && mvp->v_type == VMARKER &&
|
|
|
|
TAILQ_NEXT(mvp, v_actfreelist) != NULL, mvp,
|
|
|
|
("%s: bad marker", __func__));
|
|
|
|
VNASSERT(vp->v_mount == mp && vp->v_type != VMARKER, vp,
|
|
|
|
("%s: inappropriate vnode", __func__));
|
|
|
|
ASSERT_VI_UNLOCKED(vp, __func__);
|
|
|
|
mtx_assert(&mp->mnt_listmtx, MA_OWNED);
|
|
|
|
|
|
|
|
ret = false;
|
|
|
|
|
|
|
|
TAILQ_REMOVE(&mp->mnt_activevnodelist, mvp, v_actfreelist);
|
|
|
|
TAILQ_INSERT_BEFORE(vp, mvp, v_actfreelist);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Use a hold to prevent vp from disappearing while the mount vnode
|
|
|
|
* list lock is dropped and reacquired. Normally a hold would be
|
|
|
|
* acquired with vhold(), but that might try to acquire the vnode
|
|
|
|
* interlock, which would be a LOR with the mount vnode list lock.
|
|
|
|
*/
|
|
|
|
held = vfs_refcount_acquire_if_not_zero(&vp->v_holdcnt);
|
|
|
|
mtx_unlock(&mp->mnt_listmtx);
|
|
|
|
if (!held)
|
|
|
|
goto abort;
|
|
|
|
VI_LOCK(vp);
|
|
|
|
if (!vfs_refcount_release_if_not_last(&vp->v_holdcnt)) {
|
|
|
|
vdropl(vp);
|
|
|
|
goto abort;
|
|
|
|
}
|
|
|
|
mtx_lock(&mp->mnt_listmtx);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Determine whether the vnode is still the next one after the marker,
|
|
|
|
* excepting any other markers. If the vnode has not been doomed by
|
|
|
|
* vgone() then the hold should have ensured that it remained on the
|
|
|
|
* active list. If it has been doomed but is still on the active list,
|
|
|
|
* don't abort, but rather skip over it (avoid spinning on doomed
|
|
|
|
* vnodes).
|
|
|
|
*/
|
|
|
|
tmp = mvp;
|
|
|
|
do {
|
|
|
|
tmp = TAILQ_NEXT(tmp, v_actfreelist);
|
|
|
|
} while (tmp != NULL && tmp->v_type == VMARKER);
|
|
|
|
if (tmp != vp) {
|
|
|
|
mtx_unlock(&mp->mnt_listmtx);
|
|
|
|
VI_UNLOCK(vp);
|
|
|
|
goto abort;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = true;
|
|
|
|
goto out;
|
|
|
|
abort:
|
|
|
|
maybe_yield();
|
|
|
|
mtx_lock(&mp->mnt_listmtx);
|
|
|
|
out:
|
|
|
|
if (ret)
|
|
|
|
ASSERT_VI_LOCKED(vp, __func__);
|
|
|
|
else
|
|
|
|
ASSERT_VI_UNLOCKED(vp, __func__);
|
|
|
|
mtx_assert(&mp->mnt_listmtx, MA_OWNED);
|
|
|
|
return (ret);
|
|
|
|
}
|
|
|
|
|
2012-12-15 02:04:46 +00:00
|
|
|
static struct vnode *
|
|
|
|
mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
|
2012-04-20 06:50:44 +00:00
|
|
|
{
|
|
|
|
struct vnode *vp, *nvp;
|
|
|
|
|
2016-09-30 17:27:17 +00:00
|
|
|
mtx_assert(&mp->mnt_listmtx, MA_OWNED);
|
2012-04-20 06:50:44 +00:00
|
|
|
KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
|
2012-12-15 02:04:46 +00:00
|
|
|
restart:
|
2012-04-20 06:50:44 +00:00
|
|
|
vp = TAILQ_NEXT(*mvp, v_actfreelist);
|
|
|
|
while (vp != NULL) {
|
2012-12-03 22:15:16 +00:00
|
|
|
if (vp->v_type == VMARKER) {
|
|
|
|
vp = TAILQ_NEXT(vp, v_actfreelist);
|
|
|
|
continue;
|
|
|
|
}
|
2017-05-15 10:02:45 +00:00
|
|
|
/*
|
|
|
|
* Try-lock because this is the wrong lock order. If that does
|
|
|
|
* not succeed, drop the mount vnode list lock and try to
|
|
|
|
* reacquire it and the vnode interlock in the right order.
|
|
|
|
*/
|
|
|
|
if (!VI_TRYLOCK(vp) &&
|
|
|
|
!mnt_vnode_next_active_relock(*mvp, mp, vp))
|
|
|
|
goto restart;
|
2012-12-15 02:04:46 +00:00
|
|
|
KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp));
|
|
|
|
KASSERT(vp->v_mount == mp || vp->v_mount == NULL,
|
|
|
|
("alien vnode on the active list %p %p", vp, mp));
|
|
|
|
if (vp->v_mount == mp && (vp->v_iflag & VI_DOOMED) == 0)
|
2012-04-20 06:50:44 +00:00
|
|
|
break;
|
|
|
|
nvp = TAILQ_NEXT(vp, v_actfreelist);
|
|
|
|
VI_UNLOCK(vp);
|
|
|
|
vp = nvp;
|
|
|
|
}
|
2017-05-15 10:02:45 +00:00
|
|
|
TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
|
2012-04-20 06:50:44 +00:00
|
|
|
|
|
|
|
/* Check if we are done */
|
|
|
|
if (vp == NULL) {
|
2016-09-30 17:27:17 +00:00
|
|
|
mtx_unlock(&mp->mnt_listmtx);
|
2012-12-15 02:04:46 +00:00
|
|
|
mnt_vnode_markerfree_active(mvp, mp);
|
2012-04-20 06:50:44 +00:00
|
|
|
return (NULL);
|
|
|
|
}
|
|
|
|
TAILQ_INSERT_AFTER(&mp->mnt_activevnodelist, vp, *mvp, v_actfreelist);
|
2016-09-30 17:27:17 +00:00
|
|
|
mtx_unlock(&mp->mnt_listmtx);
|
2012-12-03 22:15:16 +00:00
|
|
|
ASSERT_VI_LOCKED(vp, "active iter");
|
|
|
|
KASSERT((vp->v_iflag & VI_ACTIVE) != 0, ("Non-active vp %p", vp));
|
2012-04-20 06:50:44 +00:00
|
|
|
return (vp);
|
|
|
|
}
|
2012-12-15 02:04:46 +00:00
|
|
|
|
|
|
|
struct vnode *
|
|
|
|
__mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
|
|
|
|
{
|
|
|
|
|
|
|
|
if (should_yield())
|
2012-12-21 13:14:12 +00:00
|
|
|
kern_yield(PRI_USER);
|
2016-09-30 17:27:17 +00:00
|
|
|
mtx_lock(&mp->mnt_listmtx);
|
2012-12-15 02:04:46 +00:00
|
|
|
return (mnt_vnode_next_active(mvp, mp));
|
|
|
|
}
|
2012-04-20 06:50:44 +00:00
|
|
|
|
|
|
|
struct vnode *
|
|
|
|
__mnt_vnode_first_active(struct vnode **mvp, struct mount *mp)
|
|
|
|
{
|
2012-12-15 02:04:46 +00:00
|
|
|
struct vnode *vp;
|
2012-04-20 06:50:44 +00:00
|
|
|
|
|
|
|
*mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
|
|
|
|
MNT_ILOCK(mp);
|
|
|
|
MNT_REF(mp);
|
2012-12-10 20:44:09 +00:00
|
|
|
MNT_IUNLOCK(mp);
|
2012-04-20 06:50:44 +00:00
|
|
|
(*mvp)->v_type = VMARKER;
|
2012-12-10 20:44:09 +00:00
|
|
|
(*mvp)->v_mount = mp;
|
2012-04-20 06:50:44 +00:00
|
|
|
|
2016-09-30 17:27:17 +00:00
|
|
|
mtx_lock(&mp->mnt_listmtx);
|
2012-11-27 06:07:58 +00:00
|
|
|
vp = TAILQ_FIRST(&mp->mnt_activevnodelist);
|
2012-04-20 06:50:44 +00:00
|
|
|
if (vp == NULL) {
|
2016-09-30 17:27:17 +00:00
|
|
|
mtx_unlock(&mp->mnt_listmtx);
|
2012-12-15 02:04:46 +00:00
|
|
|
mnt_vnode_markerfree_active(mvp, mp);
|
2012-04-20 06:50:44 +00:00
|
|
|
return (NULL);
|
|
|
|
}
|
2012-12-15 02:04:46 +00:00
|
|
|
TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist);
|
|
|
|
return (mnt_vnode_next_active(mvp, mp));
|
2012-04-20 06:50:44 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
__mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp)
|
|
|
|
{
|
|
|
|
|
2012-12-10 20:44:09 +00:00
|
|
|
if (*mvp == NULL)
|
2012-04-20 06:50:44 +00:00
|
|
|
return;
|
|
|
|
|
2016-09-30 17:27:17 +00:00
|
|
|
mtx_lock(&mp->mnt_listmtx);
|
2012-04-20 06:50:44 +00:00
|
|
|
TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
|
2016-09-30 17:27:17 +00:00
|
|
|
mtx_unlock(&mp->mnt_listmtx);
|
2012-12-15 02:04:46 +00:00
|
|
|
mnt_vnode_markerfree_active(mvp, mp);
|
2012-04-20 06:50:44 +00:00
|
|
|
}
|