1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Copyright (c) 1989, 1993
|
|
|
|
* The Regents of the University of California. All rights reserved.
|
|
|
|
* (c) UNIX System Laboratories, Inc.
|
|
|
|
* All or some portions of this file are derived from material licensed
|
|
|
|
* to the University of California by American Telephone and Telegraph
|
|
|
|
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
|
|
|
|
* the permission of UNIX System Laboratories, Inc.
|
|
|
|
*
|
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions
|
|
|
|
* are met:
|
|
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer.
|
|
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
|
|
* documentation and/or other materials provided with the distribution.
|
|
|
|
* 3. All advertising materials mentioning features or use of this software
|
|
|
|
* must display the following acknowledgement:
|
|
|
|
* This product includes software developed by the University of
|
|
|
|
* California, Berkeley and its contributors.
|
|
|
|
* 4. Neither the name of the University nor the names of its contributors
|
|
|
|
* may be used to endorse or promote products derived from this software
|
|
|
|
* without specific prior written permission.
|
|
|
|
*
|
|
|
|
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
|
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
|
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
|
|
* SUCH DAMAGE.
|
|
|
|
*
|
1997-02-10 02:22:35 +00:00
|
|
|
* @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95
|
1999-08-28 01:08:13 +00:00
|
|
|
* $FreeBSD$
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* External virtual filesystem routines
|
|
|
|
*/
|
1996-01-04 21:13:23 +00:00
|
|
|
#include "opt_ddb.h"
|
2000-07-03 13:26:54 +00:00
|
|
|
#include "opt_ffs.h"
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
#include <sys/param.h>
|
|
|
|
#include <sys/systm.h>
|
2000-05-05 09:59:14 +00:00
|
|
|
#include <sys/bio.h>
|
2000-01-07 08:36:44 +00:00
|
|
|
#include <sys/buf.h>
|
|
|
|
#include <sys/conf.h>
|
|
|
|
#include <sys/dirent.h>
|
|
|
|
#include <sys/domain.h>
|
|
|
|
#include <sys/eventhandler.h>
|
1998-12-24 12:07:16 +00:00
|
|
|
#include <sys/fcntl.h>
|
1995-11-16 09:45:23 +00:00
|
|
|
#include <sys/kernel.h>
|
1999-07-01 13:21:46 +00:00
|
|
|
#include <sys/kthread.h>
|
2000-09-07 01:33:02 +00:00
|
|
|
#include <sys/ktr.h>
|
1997-10-12 20:26:33 +00:00
|
|
|
#include <sys/malloc.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <sys/mount.h>
|
2000-10-20 07:58:15 +00:00
|
|
|
#include <sys/mutex.h>
|
2000-01-08 16:20:06 +00:00
|
|
|
#include <sys/namei.h>
|
2000-01-07 08:36:44 +00:00
|
|
|
#include <sys/proc.h>
|
|
|
|
#include <sys/reboot.h>
|
1998-03-28 12:04:40 +00:00
|
|
|
#include <sys/socket.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <sys/stat.h>
|
2000-01-07 08:36:44 +00:00
|
|
|
#include <sys/sysctl.h>
|
1997-12-29 00:25:11 +00:00
|
|
|
#include <sys/vmmeter.h>
|
2000-01-07 08:36:44 +00:00
|
|
|
#include <sys/vnode.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
|
1997-08-26 11:59:20 +00:00
|
|
|
#include <machine/limits.h>
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <vm/vm.h>
|
1995-12-07 12:48:31 +00:00
|
|
|
#include <vm/vm_object.h>
|
|
|
|
#include <vm/vm_extern.h>
|
1997-12-19 09:03:37 +00:00
|
|
|
#include <vm/pmap.h>
|
|
|
|
#include <vm/vm_map.h>
|
1999-01-21 08:29:12 +00:00
|
|
|
#include <vm/vm_page.h>
|
1998-01-17 09:17:02 +00:00
|
|
|
#include <vm/vm_pager.h>
|
1996-08-21 21:56:23 +00:00
|
|
|
#include <vm/vnode_pager.h>
|
VM level code cleanups.
1) Start using TSM.
Struct procs continue to point to upages structure, after being freed.
Struct vmspace continues to point to pte object and kva space for kstack.
u_map is now superfluous.
2) vm_map's don't need to be reference counted. They always exist either
in the kernel or in a vmspace. The vmspaces are managed by reference
counts.
3) Remove the "wired" vm_map nonsense.
4) No need to keep a cache of kernel stack kva's.
5) Get rid of strange looking ++var, and change to var++.
6) Change more data structures to use our "zone" allocator. Added
struct proc, struct vmspace and struct vnode. This saves a significant
amount of kva space and physical memory. Additionally, this enables
TSM for the zone managed memory.
7) Keep ioopt disabled for now.
8) Remove the now bogus "single use" map concept.
9) Use generation counts or id's for data structures residing in TSM, where
it allows us to avoid unneeded restart overhead during traversals, where
blocking might occur.
10) Account better for memory deficits, so the pageout daemon will be able
to make enough memory available (experimental.)
11) Fix some vnode locking problems. (From Tor, I think.)
12) Add a check in ufs_lookup, to avoid lots of unneeded calls to bcmp.
(experimental.)
13) Significantly shrink, cleanup, and make slightly faster the vm_fault.c
code. Use generation counts, get rid of unneeded collapse operations,
and clean up the cluster code.
14) Make vm_zone more suitable for TSM.
This commit is partially as a result of discussions and contributions from
other people, including DG, Tor Egge, PHK, and probably others that I
have forgotten to attribute (so let me know, if I forgot.)
This is not the infamous, final cleanup of the vnode stuff, but a necessary
step. Vnode mgmt should be correct, but things might still change, and
there is still some missing stuff (like ioopt, and physical backing of
non-merged cache files, debugging of layering concepts.)
1998-01-22 17:30:44 +00:00
|
|
|
#include <vm/vm_zone.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
|
1997-10-12 20:26:33 +00:00
|
|
|
static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");
|
1997-10-11 18:31:40 +00:00
|
|
|
|
2000-09-22 11:54:48 +00:00
|
|
|
static void addalias __P((struct vnode *vp, dev_t nvp_rdev));
|
1997-11-22 08:35:46 +00:00
|
|
|
static void insmntque __P((struct vnode *vp, struct mount *mp));
|
1997-02-10 02:22:35 +00:00
|
|
|
static void vclean __P((struct vnode *vp, int flags, struct proc *p));
|
2000-09-22 12:22:36 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Number of vnodes in existence. Increased whenever getnewvnode()
|
|
|
|
* allocates a new vnode, never decreased.
|
|
|
|
*/
|
1997-11-22 08:35:46 +00:00
|
|
|
static unsigned long numvnodes;
|
2000-12-02 20:08:33 +00:00
|
|
|
SYSCTL_LONG(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "");
|
1995-12-02 18:58:56 +00:00
|
|
|
|
2000-09-22 12:22:36 +00:00
|
|
|
/*
|
|
|
|
* Conversion tables for conversion from vnode types to inode formats
|
|
|
|
* and back.
|
|
|
|
*/
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
 * iftovt_tab: 16-entry lookup table yielding a vnode type (enum vtype).
 * Slots 1, 2, 4, 6, 8, 10 and 12 hold VFIFO, VCHR, VDIR, VBLK, VREG, VLNK
 * and VSOCK respectively; slot 15 holds VBAD; every other slot is VNON.
 * NOTE(review): presumably indexed by the file-type bits of an inode mode
 * reduced to the range 0..15 -- confirm against the macro that reads it.
 */
enum vtype iftovt_tab[16] = {
|
|
|
|
VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
|
|
|
|
VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
|
|
|
|
};
|
These changes embody the support of the fully coherent merged VM buffer cache,
much higher filesystem I/O performance, and much better paging performance. It
represents the culmination of over 6 months of R&D.
The majority of the merged VM/cache work is by John Dyson.
The following highlights the most significant changes. Additionally, there are
(mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to
support the new VM/buffer scheme.
vfs_bio.c:
Significant rewrite of most of vfs_bio to support the merged VM buffer cache
scheme. The scheme is almost fully compatible with the old filesystem
interface. Significant improvement in the number of opportunities for write
clustering.
vfs_cluster.c, vfs_subr.c
Upgrade and performance enhancements in vfs layer code to support merged
VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff.
vm_object.c:
Yet more improvements in the collapse code. Elimination of some windows that
can cause list corruption.
vm_pageout.c:
Fixed it, it really works better now. Somehow in 2.0, some "enhancements"
broke the code. This code has been reworked from the ground-up.
vm_fault.c, vm_page.c, pmap.c, vm_object.c
Support for small-block filesystems with merged VM/buffer cache scheme.
pmap.c vm_map.c
Dynamic kernel VM size, now we don't have to pre-allocate excessive numbers of
kernel PTs.
vm_glue.c
Much simpler and more effective swapping code. No more gratuitous swapping.
proc.h
Fixed the problem that the p_lock flag was not being cleared on a fork.
swap_pager.c, vnode_pager.c
Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the
code doesn't need it anymore.
machdep.c
Changes to better support the parameter values for the merged VM/buffer cache
scheme.
machdep.c, kern_exec.c, vm_glue.c
Implemented a separate submap for temporary exec string space and another one
to contain process upages. This eliminates all map fragmentation problems
that previously existed.
ffs_inode.c, ufs_inode.c, ufs_readwrite.c
Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on
busy buffers.
Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
|
|
|
/*
 * vttoif_tab: 9-entry lookup table of inode format values (S_IF* constants).
 * Slot 0 is 0 (no format bits) and the final slot is the full S_IFMT mask.
 * NOTE(review): presumably indexed by enum vtype value, so the entry order
 * must match the enum's declaration order -- confirm against sys/vnode.h.
 */
int vttoif_tab[9] = {
|
1994-05-24 10:09:53 +00:00
|
|
|
0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
|
|
|
|
S_IFSOCK, S_IFIFO, S_IFMT,
|
|
|
|
};
|
|
|
|
|
2000-10-05 18:22:46 +00:00
|
|
|
/*
|
|
|
|
* List of vnodes that are ready for recycling.
|
|
|
|
*/
|
2000-09-22 12:22:36 +00:00
|
|
|
static TAILQ_HEAD(freelst, vnode) vnode_free_list;
|
1998-01-12 01:46:33 +00:00
|
|
|
|
2000-09-22 12:22:36 +00:00
|
|
|
/*
|
|
|
|
* Minimum number of free vnodes. If there are fewer than this many free vnodes,
|
|
|
|
* getnewvnode() will return a newly allocated vnode.
|
|
|
|
*/
|
1997-09-25 16:17:57 +00:00
|
|
|
static u_long wantfreevnodes = 25;
|
2000-12-02 20:08:33 +00:00
|
|
|
SYSCTL_LONG(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
|
2000-09-22 12:22:36 +00:00
|
|
|
/* Number of vnodes in the free list. */
|
1996-07-12 07:41:34 +00:00
|
|
|
static u_long freevnodes = 0;
|
2000-12-02 20:08:33 +00:00
|
|
|
SYSCTL_LONG(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");
|
1995-03-09 20:27:04 +00:00
|
|
|
|
2000-09-22 12:22:36 +00:00
|
|
|
/*
|
2000-10-05 18:22:46 +00:00
|
|
|
* Various variables used for debugging the new implementation of
|
|
|
|
* reassignbuf().
|
|
|
|
* XXX these are probably of (very) limited utility now.
|
2000-09-22 12:22:36 +00:00
|
|
|
*/
|
The buffer queue mechanism has been reformulated. Instead of having
QUEUE_AGE, QUEUE_LRU, and QUEUE_EMPTY we instead have QUEUE_CLEAN,
QUEUE_DIRTY, QUEUE_EMPTY, and QUEUE_EMPTYKVA. With this patch clean
and dirty buffers have been separated. Empty buffers with KVM
assignments have been separated from truly empty buffers. getnewbuf()
has been rewritten and now operates in a 100% optimal fashion. That is,
it is able to find precisely the right kind of buffer it needs to
allocate a new buffer, defragment KVM, or to free-up an existing buffer
when the buffer cache is full (which is a steady-state situation for
the buffer cache).
Buffer flushing has been reorganized. Previously buffers were flushed
in the context of whatever process hit the conditions forcing buffer
flushing to occur. This resulted in processes blocking on conditions
unrelated to what they were doing. This also resulted in inappropriate
VFS stacking chains due to multiple processes getting stuck trying to
flush dirty buffers or due to a single process getting into a situation
where it might attempt to flush buffers recursively - a situation that
was only partially fixed in prior commits. We have added a new daemon
called the buf_daemon which is responsible for flushing dirty buffers
when the number of dirty buffers exceeds the vfs.hidirtybuffers limit.
This daemon attempts to dynamically adjust the rate at which dirty buffers
are flushed such that getnewbuf() calls (almost) never block.
The number of nbufs and amount of buffer space is now scaled past the
8MB limit that was previously imposed for systems with over 64MB of
memory, and the vfs.{lo,hi}dirtybuffers limits have been relaxed
somewhat. The number of physical buffers has been increased with the
intention that we will manage physical I/O differently in the future.
reassignbuf previously attempted to keep the dirtyblkhd list sorted which
could result in non-deterministic operation under certain conditions,
such as when a large number of dirty buffers are being managed. This
algorithm has been changed. reassignbuf now keeps buffers locally sorted
if it can do so cheaply, and otherwise gives up and adds buffers to
the head of the dirtyblkhd list. The new algorithm is deterministic but
not perfect. The new algorithm greatly reduces problems that previously
occurred when write_behind was turned off in the system.
The P_FLSINPROG proc->p_flag bit has been replaced by the more descriptive
P_BUFEXHAUST bit. This bit allows processes working with filesystem
buffers to use available emergency reserves. Normal processes do not set
this bit and are not allowed to dig into emergency reserves. The purpose
of this bit is to avoid low-memory deadlocks.
A small race condition was fixed in getpbuf() in vm/vm_pager.c.
Submitted by: Matthew Dillon <dillon@apollo.backplane.com>
Reviewed by: Kirk McKusick <mckusick@mckusick.com>
1999-07-04 00:25:38 +00:00
|
|
|
static int reassignbufcalls;
|
|
|
|
SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0, "");
|
|
|
|
static int reassignbufloops;
|
|
|
|
SYSCTL_INT(_vfs, OID_AUTO, reassignbufloops, CTLFLAG_RW, &reassignbufloops, 0, "");
|
|
|
|
static int reassignbufsortgood;
|
|
|
|
SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortgood, CTLFLAG_RW, &reassignbufsortgood, 0, "");
|
|
|
|
static int reassignbufsortbad;
|
|
|
|
SYSCTL_INT(_vfs, OID_AUTO, reassignbufsortbad, CTLFLAG_RW, &reassignbufsortbad, 0, "");
|
2000-09-22 12:22:36 +00:00
|
|
|
/* Set to 0 for old insertion-sort based reassignbuf, 1 for modern method. */
|
The buffer queue mechanism has been reformulated. Instead of having
QUEUE_AGE, QUEUE_LRU, and QUEUE_EMPTY we instead have QUEUE_CLEAN,
QUEUE_DIRTY, QUEUE_EMPTY, and QUEUE_EMPTYKVA. With this patch clean
and dirty buffers have been separated. Empty buffers with KVM
assignments have been separated from truly empty buffers. getnewbuf()
has been rewritten and now operates in a 100% optimal fashion. That is,
it is able to find precisely the right kind of buffer it needs to
allocate a new buffer, defragment KVM, or to free-up an existing buffer
when the buffer cache is full (which is a steady-state situation for
the buffer cache).
Buffer flushing has been reorganized. Previously buffers were flushed
in the context of whatever process hit the conditions forcing buffer
flushing to occur. This resulted in processes blocking on conditions
unrelated to what they were doing. This also resulted in inappropriate
VFS stacking chains due to multiple processes getting stuck trying to
flush dirty buffers or due to a single process getting into a situation
where it might attempt to flush buffers recursively - a situation that
was only partially fixed in prior commits. We have added a new daemon
called the buf_daemon which is responsible for flushing dirty buffers
when the number of dirty buffers exceeds the vfs.hidirtybuffers limit.
This daemon attempts to dynamically adjust the rate at which dirty buffers
are flushed such that getnewbuf() calls (almost) never block.
The number of nbufs and amount of buffer space is now scaled past the
8MB limit that was previously imposed for systems with over 64MB of
memory, and the vfs.{lo,hi}dirtybuffers limits have been relaxed
somewhat. The number of physical buffers has been increased with the
intention that we will manage physical I/O differently in the future.
reassignbuf previously attempted to keep the dirtyblkhd list sorted which
could result in non-deterministic operation under certain conditions,
such as when a large number of dirty buffers are being managed. This
algorithm has been changed. reassignbuf now keeps buffers locally sorted
if it can do so cheaply, and otherwise gives up and adds buffers to
the head of the dirtyblkhd list. The new algorithm is deterministic but
not perfect. The new algorithm greatly reduces problems that previously
occurred when write_behind was turned off in the system.
The P_FLSINPROG proc->p_flag bit has been replaced by the more descriptive
P_BUFEXHAUST bit. This bit allows processes working with filesystem
buffers to use available emergency reserves. Normal processes do not set
this bit and are not allowed to dig into emergency reserves. The purpose
of this bit is to avoid low-memory deadlocks.
A small race condition was fixed in getpbuf() in vm/vm_pager.c.
Submitted by: Matthew Dillon <dillon@apollo.backplane.com>
Reviewed by: Kirk McKusick <mckusick@mckusick.com>
1999-07-04 00:25:38 +00:00
|
|
|
static int reassignbufmethod = 1;
|
|
|
|
SYSCTL_INT(_vfs, OID_AUTO, reassignbufmethod, CTLFLAG_RW, &reassignbufmethod, 0, "");
|
|
|
|
|
Some VM improvements, including elimination of a lot of Sig-11
problems. Tor Egge and others have helped with various VM bugs
lately, but don't blame him -- blame me!!!
pmap.c:
1) Create an object for kernel page table allocations. This
fixes a bogus allocation method previously used for such, by
grabbing pages from the kernel object, using bogus pindexes.
(This was a code cleanup, and perhaps a minor system stability
issue.)
pmap.c:
2) Pre-set the modify and accessed bits when prudent. This will
decrease bus traffic under certain circumstances.
vfs_bio.c, vfs_cluster.c:
3) Rather than calculating the beginning virtual byte offset
multiple times, stick the offset into the buffer header, so
that the calculated offset can be reused. (Long long multiplies
are often expensive, and this is a probably unmeasurable performance
improvement, and code cleanup.)
vfs_bio.c:
4) Handle write recursion more intelligently (but not perfectly) so
that it is less likely to cause a system panic, and is also
much more robust.
vfs_bio.c:
5) getblk incorrectly wrote out blocks that are incorrectly sized.
The problem is fixed, and writes blocks out ONLY when B_DELWRI
is true.
vfs_bio.c:
6) Check that already constituted buffers have fully valid pages. If
not, then make sure that the B_CACHE bit is not set. (This was
a major source of Sig-11 type problems.)
vfs_bio.c:
7) Fix a potential system deadlock due to an incorrectly specified
sleep priority while waiting for a buffer write operation. The
change that I made opens the system up to serious problems, and
we need to examine the issue of process sleep priorities.
vfs_cluster.c, vfs_bio.c:
8) Make clustered reads work more correctly (and more completely)
when buffers are already constituted, but not fully valid.
(This was another system reliability issue.)
vfs_subr.c, ffs_inode.c:
9) Create a vtruncbuf function, which is used by filesystems that
can truncate files. The vinvalbuf forced a file sync type operation,
while vtruncbuf only invalidates the buffers past the new end of file,
and also invalidates the appropriate pages. (This was a system reliability
and performance issue.)
10) Modify FFS to use vtruncbuf.
vm_object.c:
11) Make the object rundown mechanism for OBJT_VNODE type objects work
more correctly. Included in that fix, create pager entries for
the OBJT_DEAD pager type, so that paging requests that might slip
in during race conditions are properly handled. (This was a system
reliability issue.)
vm_page.c:
12) Make some of the page validation routines be a little less picky
about arguments passed to them. Also, support page invalidation
change the object generation count so that we handle generation
counts a little more robustly.
vm_pageout.c:
13) Further reduce pageout daemon activity when the system doesn't
need help from it. There should be no additional performance
decrease even when the pageout daemon is running. (This was
a significant performance issue.)
vnode_pager.c:
14) Teach the vnode pager to handle race conditions during vnode
deallocations.
1998-03-16 01:56:03 +00:00
|
|
|
#ifdef ENABLE_VFS_IOOPT
|
2000-10-05 18:22:46 +00:00
|
|
|
/* See NOTES for a description of this setting. */
|
1999-07-08 06:06:00 +00:00
|
|
|
int vfs_ioopt = 0;
|
1997-12-29 01:03:55 +00:00
|
|
|
SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
|
1998-03-14 19:50:36 +00:00
|
|
|
#endif
|
1997-12-29 01:03:55 +00:00
|
|
|
|
2000-10-05 18:22:46 +00:00
|
|
|
/* List of mounted filesystems. */
|
2000-09-22 12:22:36 +00:00
|
|
|
struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist);
|
2000-10-05 18:22:46 +00:00
|
|
|
|
2000-09-22 12:22:36 +00:00
|
|
|
/* For any iteration/modification of mountlist */
|
2000-10-04 01:29:17 +00:00
|
|
|
struct mtx mountlist_mtx;
|
2000-10-05 18:22:46 +00:00
|
|
|
|
2000-09-22 12:22:36 +00:00
|
|
|
/* For any iteration/modification of mnt_vnodelist */
|
2001-01-24 12:35:55 +00:00
|
|
|
struct mtx mntvnode_mtx;
|
|
|
|
|
2000-09-22 12:22:36 +00:00
|
|
|
/*
|
|
|
|
* Cache for the mount type id assigned to NFS. This is used for
|
|
|
|
* special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
|
|
|
|
*/
|
1998-09-05 15:17:34 +00:00
|
|
|
int nfs_mount_type = -1;
|
2000-09-22 12:22:36 +00:00
|
|
|
|
|
|
|
/* To keep more than one thread at a time from running vfs_getnewfsid */
|
2001-01-24 12:35:55 +00:00
|
|
|
static struct mtx mntid_mtx;
|
2000-10-05 18:22:46 +00:00
|
|
|
|
2000-09-22 12:22:36 +00:00
|
|
|
/* For any iteration/modification of vnode_free_list */
|
2001-01-24 12:35:55 +00:00
|
|
|
static struct mtx vnode_free_list_mtx;
|
2000-09-22 12:22:36 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* For any iteration/modification of dev->si_hlist (linked through
|
|
|
|
* v_specnext)
|
|
|
|
*/
|
2001-01-24 12:35:55 +00:00
|
|
|
static struct mtx spechash_mtx;
|
2000-09-22 12:22:36 +00:00
|
|
|
|
|
|
|
/* Publicly exported FS */
|
|
|
|
struct nfs_public nfs_pub;
|
2000-10-05 18:22:46 +00:00
|
|
|
|
2000-09-22 12:22:36 +00:00
|
|
|
/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
|
VM level code cleanups.
1) Start using TSM.
Struct procs continue to point to upages structure, after being freed.
Struct vmspace continues to point to pte object and kva space for kstack.
u_map is now superfluous.
2) vm_map's don't need to be reference counted. They always exist either
in the kernel or in a vmspace. The vmspaces are managed by reference
counts.
3) Remove the "wired" vm_map nonsense.
4) No need to keep a cache of kernel stack kva's.
5) Get rid of strange looking ++var, and change to var++.
6) Change more data structures to use our "zone" allocator. Added
struct proc, struct vmspace and struct vnode. This saves a significant
amount of kva space and physical memory. Additionally, this enables
TSM for the zone managed memory.
7) Keep ioopt disabled for now.
8) Remove the now bogus "single use" map concept.
9) Use generation counts or id's for data structures residing in TSM, where
it allows us to avoid unneeded restart overhead during traversals, where
blocking might occur.
10) Account better for memory deficits, so the pageout daemon will be able
to make enough memory available (experimental.)
11) Fix some vnode locking problems. (From Tor, I think.)
12) Add a check in ufs_lookup, to avoid lots of unneeded calls to bcmp.
(experimental.)
13) Significantly shrink, cleanup, and make slightly faster the vm_fault.c
code. Use generation counts, get rid of unneeded collapse operations,
and clean up the cluster code.
14) Make vm_zone more suitable for TSM.
This commit is partially as a result of discussions and contributions from
other people, including DG, Tor Egge, PHK, and probably others that I
have forgotten to attribute (so let me know, if I forgot.)
This is not the infamous, final cleanup of the vnode stuff, but a necessary
step. Vnode mgmt should be correct, but things might still change, and
there is still some missing stuff (like ioopt, and physical backing of
non-merged cache files, debugging of layering concepts.)
1998-01-22 17:30:44 +00:00
|
|
|
static vm_zone_t vnode_zone;
|
2000-10-05 18:22:46 +00:00
|
|
|
|
2000-09-22 12:22:36 +00:00
|
|
|
/* Set to 1 to print out reclaim of active vnodes */
|
|
|
|
int prtactive = 0;
|
These changes embody the support of the fully coherent merged VM buffer cache,
much higher filesystem I/O performance, and much better paging performance. It
represents the culmination of over 6 months of R&D.
The majority of the merged VM/cache work is by John Dyson.
The following highlights the most significant changes. Additionally, there are
(mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to
support the new VM/buffer scheme.
vfs_bio.c:
Significant rewrite of most of vfs_bio to support the merged VM buffer cache
scheme. The scheme is almost fully compatible with the old filesystem
interface. Significant improvement in the number of opportunities for write
clustering.
vfs_cluster.c, vfs_subr.c
Upgrade and performance enhancements in vfs layer code to support merged
VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff.
vm_object.c:
Yet more improvements in the collapse code. Elimination of some windows that
can cause list corruption.
vm_pageout.c:
Fixed it, it really works better now. Somehow in 2.0, some "enhancements"
broke the code. This code has been reworked from the ground-up.
vm_fault.c, vm_page.c, pmap.c, vm_object.c
Support for small-block filesystems with merged VM/buffer cache scheme.
pmap.c vm_map.c
Dynamic kernel VM size, now we don't have to pre-allocate excessive numbers of
kernel PTs.
vm_glue.c
Much simpler and more effective swapping code. No more gratuitous swapping.
proc.h
Fixed the problem that the p_lock flag was not being cleared on a fork.
swap_pager.c, vnode_pager.c
Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the
code doesn't need it anymore.
machdep.c
Changes to better support the parameter values for the merged VM/buffer cache
scheme.
machdep.c, kern_exec.c, vm_glue.c
Implemented a separate submap for temporary exec string space and another one
to contain process upages. This eliminates all map fragmentation problems
that previously existed.
ffs_inode.c, ufs_inode.c, ufs_readwrite.c
Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on
busy buffers.
Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
|
|
|
|
1998-03-08 09:59:44 +00:00
|
|
|
/*
|
|
|
|
* The workitem queue.
|
2000-09-22 12:22:36 +00:00
|
|
|
*
|
|
|
|
* It is useful to delay writes of file data and filesystem metadata
|
|
|
|
* for tens of seconds so that quickly created and deleted files need
|
|
|
|
* not waste disk bandwidth being created and removed. To realize this,
|
|
|
|
* we append vnodes to a "workitem" queue. When running with a soft
|
|
|
|
* updates implementation, most pending metadata dependencies should
|
|
|
|
* not wait for more than a few seconds. Thus, writes to mounted block devices
|
|
|
|
* are delayed only about half the time that file data is delayed.
|
|
|
|
* Similarly, directory updates are more critical, so are only delayed
|
|
|
|
* about a third of the time that file data is delayed. Thus, there are
|
|
|
|
* SYNCER_MAXDELAY queues that are processed round-robin at a rate of
|
|
|
|
* one each second (driven off the filesystem syncer process). The
|
|
|
|
* syncer_delayno variable indicates the next queue that is to be processed.
|
|
|
|
* Items that need to be processed soon are placed in this queue:
|
|
|
|
*
|
|
|
|
* syncer_workitem_pending[syncer_delayno]
|
|
|
|
*
|
|
|
|
* A delay of fifteen seconds is done by placing the request fifteen
|
|
|
|
* entries later in the queue:
|
|
|
|
*
|
|
|
|
* syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
|
|
|
|
*
|
1998-03-08 09:59:44 +00:00
|
|
|
*/
|
2000-09-22 12:22:36 +00:00
|
|
|
static int syncer_delayno = 0;
|
|
|
|
static long syncer_mask;
|
|
|
|
LIST_HEAD(synclist, vnode);
|
|
|
|
static struct synclist *syncer_workitem_pending;
|
|
|
|
|
1998-03-08 09:59:44 +00:00
|
|
|
#define SYNCER_MAXDELAY 32
|
1998-12-21 23:38:33 +00:00
|
|
|
static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */
|
1999-06-15 23:37:29 +00:00
|
|
|
time_t syncdelay = 30; /* max time to delay syncing data */
|
|
|
|
/*
 * NOTE(review): filedelay is declared as time_t but is exported through
 * SYSCTL_INT.  If time_t is not the same size as int on some platform, the
 * sysctl handler would presumably read/write only part of the variable --
 * confirm.  dirdelay and metadelay below follow the same pattern.
 */
time_t filedelay = 30; /* time to delay syncing files */
|
|
|
|
SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
|
1999-06-26 02:47:16 +00:00
|
|
|
time_t dirdelay = 29; /* time to delay syncing directories */
|
1999-06-15 23:37:29 +00:00
|
|
|
SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
|
1999-06-26 02:47:16 +00:00
|
|
|
time_t metadelay = 28; /* time to delay syncing metadata */
|
1999-06-15 23:37:29 +00:00
|
|
|
SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
|
2000-09-22 12:22:36 +00:00
|
|
|
static int rushjob; /* number of slots to run ASAP */
|
1999-06-15 23:37:29 +00:00
|
|
|
static int stat_rush_requests; /* number of times I/O speeded up */
|
|
|
|
SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");
|
1998-03-08 09:59:44 +00:00
|
|
|
|
2000-09-22 12:22:36 +00:00
|
|
|
/*
|
2000-10-05 18:22:46 +00:00
|
|
|
* Number of vnodes we want to exist at any one time. This is mostly used
|
|
|
|
* to size hash tables in vnode-related code. It is normally not used in
|
|
|
|
* getnewvnode(), as wantfreevnodes is normally nonzero.
|
|
|
|
*
|
|
|
|
* XXX desiredvnodes is historical cruft and should not exist.
|
2000-09-22 12:22:36 +00:00
|
|
|
*/
|
These changes embody the support of the fully coherent merged VM buffer cache,
much higher filesystem I/O performance, and much better paging performance. It
represents the culmination of over 6 months of R&D.
The majority of the merged VM/cache work is by John Dyson.
The following highlights the most significant changes. Additionally, there are
(mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to
support the new VM/buffer scheme.
vfs_bio.c:
Significant rewrite of most of vfs_bio to support the merged VM buffer cache
scheme. The scheme is almost fully compatible with the old filesystem
interface. Significant improvement in the number of opportunities for write
clustering.
vfs_cluster.c, vfs_subr.c
Upgrade and performance enhancements in vfs layer code to support merged
VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff.
vm_object.c:
Yet more improvements in the collapse code. Elimination of some windows that
can cause list corruption.
vm_pageout.c:
Fixed it, it really works better now. Somehow in 2.0, some "enhancements"
broke the code. This code has been reworked from the ground-up.
vm_fault.c, vm_page.c, pmap.c, vm_object.c
Support for small-block filesystems with merged VM/buffer cache scheme.
pmap.c vm_map.c
Dynamic kernel VM size, now we don't have to pre-allocate excessive numbers of
kernel PTs.
vm_glue.c
Much simpler and more effective swapping code. No more gratuitous swapping.
proc.h
Fixed the problem that the p_lock flag was not being cleared on a fork.
swap_pager.c, vnode_pager.c
Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the
code doesn't need it anymore.
machdep.c
Changes to better support the parameter values for the merged VM/buffer cache
scheme.
machdep.c, kern_exec.c, vm_glue.c
Implemented a separate submap for temporary exec string space and another one
to contain process upages. This eliminates all map fragmentation problems
that previously existed.
ffs_inode.c, ufs_inode.c, ufs_readwrite.c
Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on
busy buffers.
Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
|
|
|
int desiredvnodes;
|
1999-05-03 23:57:32 +00:00
|
|
|
SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
|
|
|
|
&desiredvnodes, 0, "Maximum number of vnodes");
|
1994-05-24 10:09:53 +00:00
|
|
|
|
1995-12-02 18:58:56 +00:00
|
|
|
/* Forward declarations of file-local (static) helpers. */
static void	vfs_free_addrlist __P((struct netexport *nep));
static int	vfs_free_netcred __P((struct radix_node *rn, void *w));
static int	vfs_hang_addrlist __P((struct mount *mp,
		    struct netexport *nep, struct export_args *argp));
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Initialize the vnode management data structures.
|
|
|
|
*/
|
2000-12-06 07:09:08 +00:00
|
|
|
static void
|
|
|
|
vntblinit(void *dummy __unused)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
|
|
|
|
1997-12-29 00:25:11 +00:00
|
|
|
desiredvnodes = maxproc + cnt.v_page_count / 4;
|
2000-10-04 01:29:17 +00:00
|
|
|
mtx_init(&mountlist_mtx, "mountlist", MTX_DEF);
|
2001-01-24 12:35:55 +00:00
|
|
|
mtx_init(&mntvnode_mtx, "mntvnode", MTX_DEF);
|
|
|
|
mtx_init(&mntid_mtx, "mntid", MTX_DEF);
|
|
|
|
mtx_init(&spechash_mtx, "spechash", MTX_DEF);
|
1994-05-24 10:09:53 +00:00
|
|
|
TAILQ_INIT(&vnode_free_list);
|
2001-01-24 12:35:55 +00:00
|
|
|
mtx_init(&vnode_free_list_mtx, "vnode_free_list", MTX_DEF);
|
VM level code cleanups.
1) Start using TSM.
Struct procs continue to point to upages structure, after being freed.
Struct vmspace continues to point to pte object and kva space for kstack.
u_map is now superfluous.
2) vm_map's don't need to be reference counted. They always exist either
in the kernel or in a vmspace. The vmspaces are managed by reference
counts.
3) Remove the "wired" vm_map nonsense.
4) No need to keep a cache of kernel stack kva's.
5) Get rid of strange looking ++var, and change to var++.
6) Change more data structures to use our "zone" allocator. Added
struct proc, struct vmspace and struct vnode. This saves a significant
amount of kva space and physical memory. Additionally, this enables
TSM for the zone managed memory.
7) Keep ioopt disabled for now.
8) Remove the now bogus "single use" map concept.
9) Use generation counts or id's for data structures residing in TSM, where
it allows us to avoid unneeded restart overhead during traversals, where
blocking might occur.
10) Account better for memory deficits, so the pageout daemon will be able
to make enough memory available (experimental.)
11) Fix some vnode locking problems. (From Tor, I think.)
12) Add a check in ufs_lookup, to avoid lots of unneeded calls to bcmp.
(experimental.)
13) Significantly shrink, cleanup, and make slightly faster the vm_fault.c
code. Use generation counts, get rid of unneded collpase operations,
and clean up the cluster code.
14) Make vm_zone more suitable for TSM.
This commit is partially as a result of discussions and contributions from
other people, including DG, Tor Egge, PHK, and probably others that I
have forgotten to attribute (so let me know, if I forgot.)
This is not the infamous, final cleanup of the vnode stuff, but a necessary
step. Vnode mgmt should be correct, but things might still change, and
there is still some missing stuff (like ioopt, and physical backing of
non-merged cache files, debugging of layering concepts.)
1998-01-22 17:30:44 +00:00
|
|
|
vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5);
|
1998-03-08 09:59:44 +00:00
|
|
|
/*
|
|
|
|
* Initialize the filesystem syncer.
|
|
|
|
*/
|
|
|
|
syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
|
|
|
|
&syncer_mask);
|
|
|
|
syncer_maxdelay = syncer_mask + 1;
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2000-12-06 07:09:08 +00:00
|
|
|
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL)
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
/*
|
1997-02-10 02:22:35 +00:00
|
|
|
* Mark a mount point as busy. Used to synchronize access and to delay
|
|
|
|
* unmounting. Interlock is not released on failure.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
1997-02-10 02:22:35 +00:00
|
|
|
vfs_busy(mp, flags, interlkp, p)
|
|
|
|
struct mount *mp;
|
|
|
|
int flags;
|
2000-10-04 01:29:17 +00:00
|
|
|
struct mtx *interlkp;
|
1997-02-10 02:22:35 +00:00
|
|
|
struct proc *p;
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
1997-02-10 02:22:35 +00:00
|
|
|
int lkflags;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
1997-11-12 05:42:33 +00:00
|
|
|
if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
|
1997-02-10 02:22:35 +00:00
|
|
|
if (flags & LK_NOWAIT)
|
|
|
|
return (ENOENT);
|
1997-11-12 05:42:33 +00:00
|
|
|
mp->mnt_kern_flag |= MNTK_MWAIT;
|
1997-02-10 02:22:35 +00:00
|
|
|
/*
|
|
|
|
* Since all busy locks are shared except the exclusive
|
|
|
|
* lock granted when unmounting, the only place that a
|
|
|
|
* wakeup needs to be done is at the release of the
|
|
|
|
* exclusive lock at the end of dounmount.
|
|
|
|
*/
|
2000-12-01 03:43:33 +00:00
|
|
|
msleep((caddr_t)mp, interlkp, PVFS, "vfs_busy", 0);
|
1997-02-10 02:22:35 +00:00
|
|
|
return (ENOENT);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
This mega-commit is meant to fix numerous interrelated problems. There
has been some bitrot and incorrect assumptions in the vfs_bio code. These
problems have manifest themselves worse on NFS type filesystems, but can
still affect local filesystems under certain circumstances. Most of
the problems have involved mmap consistancy, and as a side-effect broke
the vfs.ioopt code. This code might have been committed seperately, but
almost everything is interrelated.
1) Allow (pmap_object_init_pt) prefaulting of buffer-busy pages that
are fully valid.
2) Rather than deactivating erroneously read initial (header) pages in
kern_exec, we now free them.
3) Fix the rundown of non-VMIO buffers that are in an inconsistent
(missing vp) state.
4) Fix the disassociation of pages from buffers in brelse. The previous
code had rotted and was faulty in a couple of important circumstances.
5) Remove a gratuitious buffer wakeup in vfs_vmio_release.
6) Remove a crufty and currently unused cluster mechanism for VBLK
files in vfs_bio_awrite. When the code is functional, I'll add back
a cleaner version.
7) The page busy count wakeups assocated with the buffer cache usage were
incorrectly cleaned up in a previous commit by me. Revert to the
original, correct version, but with a cleaner implementation.
8) The cluster read code now tries to keep data associated with buffers
more aggressively (without breaking the heuristics) when it is presumed
that the read data (buffers) will be soon needed.
9) Change to filesystem lockmgr locks so that they use LK_NOPAUSE. The
delay loop waiting is not useful for filesystem locks, due to the
length of the time intervals.
10) Correct and clean-up spec_getpages.
11) Implement a fully functional nfs_getpages, nfs_putpages.
12) Fix nfs_write so that modifications are coherent with the NFS data on
the server disk (at least as well as NFS seems to allow.)
13) Properly support MS_INVALIDATE on NFS.
14) Properly pass down MS_INVALIDATE to lower levels of the VM code from
vm_map_clean.
15) Better support the notion of pages being busy but valid, so that
fewer in-transit waits occur. (use p->busy more for pageouts instead
of PG_BUSY.) Since the page is fully valid, it is still usable for
reads.
16) It is possible (in error) for cached pages to be busy. Make the
page allocation code handle that case correctly. (It should probably
be a printf or panic, but I want the system to handle coding errors
robustly. I'll probably add a printf.)
17) Correct the design and usage of vm_page_sleep. It didn't handle
consistancy problems very well, so make the design a little less
lofty. After vm_page_sleep, if it ever blocked, it is still important
to relookup the page (if the object generation count changed), and
verify it's status (always.)
18) In vm_pageout.c, vm_pageout_clean had rotted, so clean that up.
19) Push the page busy for writes and VM_PROT_READ into vm_pageout_flush.
20) Fix vm_pager_put_pages and it's descendents to support an int flag
instead of a boolean, so that we can pass down the invalidate bit.
1998-03-07 21:37:31 +00:00
|
|
|
lkflags = LK_SHARED | LK_NOPAUSE;
|
1997-02-10 02:22:35 +00:00
|
|
|
if (interlkp)
|
|
|
|
lkflags |= LK_INTERLOCK;
|
|
|
|
if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p))
|
|
|
|
panic("vfs_busy: unexpected lock failure");
|
1994-05-24 10:09:53 +00:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
1997-02-10 02:22:35 +00:00
|
|
|
* Free a busy filesystem.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
|
|
|
void
|
1997-02-10 02:22:35 +00:00
|
|
|
vfs_unbusy(mp, p)
|
|
|
|
struct mount *mp;
|
|
|
|
struct proc *p;
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
|
|
|
|
1997-02-10 02:22:35 +00:00
|
|
|
lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
1997-02-10 02:22:35 +00:00
|
|
|
* Lookup a filesystem type, and if found allocate and initialize
|
|
|
|
* a mount structure for it.
|
|
|
|
*
|
|
|
|
* Devname is usually updated by mount(8) after booting.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
1997-02-10 02:22:35 +00:00
|
|
|
vfs_rootmountalloc(fstypename, devname, mpp)
|
|
|
|
char *fstypename;
|
|
|
|
char *devname;
|
|
|
|
struct mount **mpp;
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
1997-02-10 02:22:35 +00:00
|
|
|
struct proc *p = curproc; /* XXX */
|
|
|
|
struct vfsconf *vfsp;
|
|
|
|
struct mount *mp;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
1998-06-07 17:13:14 +00:00
|
|
|
if (fstypename == NULL)
|
|
|
|
return (ENODEV);
|
1997-02-10 02:22:35 +00:00
|
|
|
for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
|
|
|
|
if (!strcmp(vfsp->vfc_name, fstypename))
|
|
|
|
break;
|
|
|
|
if (vfsp == NULL)
|
|
|
|
return (ENODEV);
|
2000-12-08 21:51:06 +00:00
|
|
|
mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK | M_ZERO);
|
This mega-commit is meant to fix numerous interrelated problems. There
has been some bitrot and incorrect assumptions in the vfs_bio code. These
problems have manifest themselves worse on NFS type filesystems, but can
still affect local filesystems under certain circumstances. Most of
the problems have involved mmap consistancy, and as a side-effect broke
the vfs.ioopt code. This code might have been committed seperately, but
almost everything is interrelated.
1) Allow (pmap_object_init_pt) prefaulting of buffer-busy pages that
are fully valid.
2) Rather than deactivating erroneously read initial (header) pages in
kern_exec, we now free them.
3) Fix the rundown of non-VMIO buffers that are in an inconsistent
(missing vp) state.
4) Fix the disassociation of pages from buffers in brelse. The previous
code had rotted and was faulty in a couple of important circumstances.
5) Remove a gratuitious buffer wakeup in vfs_vmio_release.
6) Remove a crufty and currently unused cluster mechanism for VBLK
files in vfs_bio_awrite. When the code is functional, I'll add back
a cleaner version.
7) The page busy count wakeups assocated with the buffer cache usage were
incorrectly cleaned up in a previous commit by me. Revert to the
original, correct version, but with a cleaner implementation.
8) The cluster read code now tries to keep data associated with buffers
more aggressively (without breaking the heuristics) when it is presumed
that the read data (buffers) will be soon needed.
9) Change to filesystem lockmgr locks so that they use LK_NOPAUSE. The
delay loop waiting is not useful for filesystem locks, due to the
length of the time intervals.
10) Correct and clean-up spec_getpages.
11) Implement a fully functional nfs_getpages, nfs_putpages.
12) Fix nfs_write so that modifications are coherent with the NFS data on
the server disk (at least as well as NFS seems to allow.)
13) Properly support MS_INVALIDATE on NFS.
14) Properly pass down MS_INVALIDATE to lower levels of the VM code from
vm_map_clean.
15) Better support the notion of pages being busy but valid, so that
fewer in-transit waits occur. (use p->busy more for pageouts instead
of PG_BUSY.) Since the page is fully valid, it is still usable for
reads.
16) It is possible (in error) for cached pages to be busy. Make the
page allocation code handle that case correctly. (It should probably
be a printf or panic, but I want the system to handle coding errors
robustly. I'll probably add a printf.)
17) Correct the design and usage of vm_page_sleep. It didn't handle
consistancy problems very well, so make the design a little less
lofty. After vm_page_sleep, if it ever blocked, it is still important
to relookup the page (if the object generation count changed), and
verify it's status (always.)
18) In vm_pageout.c, vm_pageout_clean had rotted, so clean that up.
19) Push the page busy for writes and VM_PROT_READ into vm_pageout_flush.
20) Fix vm_pager_put_pages and it's descendents to support an int flag
instead of a boolean, so that we can pass down the invalidate bit.
1998-03-07 21:37:31 +00:00
|
|
|
lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE);
|
1997-02-10 02:22:35 +00:00
|
|
|
(void)vfs_busy(mp, LK_NOWAIT, 0, p);
|
|
|
|
LIST_INIT(&mp->mnt_vnodelist);
|
|
|
|
mp->mnt_vfc = vfsp;
|
|
|
|
mp->mnt_op = vfsp->vfc_vfsops;
|
|
|
|
mp->mnt_flag = MNT_RDONLY;
|
|
|
|
mp->mnt_vnodecovered = NULLVP;
|
|
|
|
vfsp->vfc_refcount++;
|
1999-09-29 20:05:33 +00:00
|
|
|
mp->mnt_iosize_max = DFLTPHYS;
|
1997-02-10 02:22:35 +00:00
|
|
|
mp->mnt_stat.f_type = vfsp->vfc_typenum;
|
|
|
|
mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
|
|
|
|
strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
|
|
|
|
mp->mnt_stat.f_mntonname[0] = '/';
|
|
|
|
mp->mnt_stat.f_mntonname[1] = 0;
|
|
|
|
(void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
|
|
|
|
*mpp = mp;
|
1994-05-24 10:09:53 +00:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Find an appropriate filesystem to use for the root.  If a filesystem
 * has not been preselected, walk through the list of known filesystems
 * trying those that have mountroot routines, and try them until one
 * works or we have tried them all.
 *
 * A preselected routine (lite2_mountroot) is called directly and its
 * result returned.  Otherwise each failing routine is reported with
 * printf(), and ENODEV is returned if none succeeds.
 */
#ifdef notdef	/* XXX JH */
int
lite2_vfs_mountroot()
{
	extern int (*lite2_mountroot) __P((void));
	struct vfsconf *vfsp;
	int error;

	if (lite2_mountroot != NULL)
		return ((*lite2_mountroot)());

	for (vfsp = vfsconf; vfsp != NULL; vfsp = vfsp->vfc_next) {
		if (vfsp->vfc_mountroot != NULL) {
			error = (*vfsp->vfc_mountroot)();
			if (error == 0)
				return (0);
			printf("%s_mountroot failed: %d\n", vfsp->vfc_name,
			    error);
		}
	}
	return (ENODEV);
}
#endif
|
1994-08-20 16:03:26 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Lookup a mount point by filesystem identifier.
|
|
|
|
*/
|
|
|
|
struct mount *
|
1997-02-10 02:22:35 +00:00
|
|
|
vfs_getvfs(fsid)
|
1994-05-24 10:09:53 +00:00
|
|
|
fsid_t *fsid;
|
|
|
|
{
|
|
|
|
register struct mount *mp;
|
|
|
|
|
2000-10-04 01:29:17 +00:00
|
|
|
mtx_enter(&mountlist_mtx, MTX_DEF);
|
1999-11-20 10:00:46 +00:00
|
|
|
TAILQ_FOREACH(mp, &mountlist, mnt_list) {
|
1994-05-24 10:09:53 +00:00
|
|
|
if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
|
1997-02-10 02:22:35 +00:00
|
|
|
mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
|
2000-10-04 01:29:17 +00:00
|
|
|
mtx_exit(&mountlist_mtx, MTX_DEF);
|
1994-05-24 10:09:53 +00:00
|
|
|
return (mp);
|
1997-02-10 02:22:35 +00:00
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2000-10-04 01:29:17 +00:00
|
|
|
mtx_exit(&mountlist_mtx, MTX_DEF);
|
These changes embody the support of the fully coherent merged VM buffer cache,
much higher filesystem I/O performance, and much better paging performance. It
represents the culmination of over 6 months of R&D.
The majority of the merged VM/cache work is by John Dyson.
The following highlights the most significant changes. Additionally, there are
(mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to
support the new VM/buffer scheme.
vfs_bio.c:
Significant rewrite of most of vfs_bio to support the merged VM buffer cache
scheme. The scheme is almost fully compatible with the old filesystem
interface. Significant improvement in the number of opportunities for write
clustering.
vfs_cluster.c, vfs_subr.c
Upgrade and performance enhancements in vfs layer code to support merged
VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff.
vm_object.c:
Yet more improvements in the collapse code. Elimination of some windows that
can cause list corruption.
vm_pageout.c:
Fixed it, it really works better now. Somehow in 2.0, some "enhancements"
broke the code. This code has been reworked from the ground-up.
vm_fault.c, vm_page.c, pmap.c, vm_object.c
Support for small-block filesystems with merged VM/buffer cache scheme.
pmap.c vm_map.c
Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of
kernel PTs.
vm_glue.c
Much simpler and more effective swapping code. No more gratuitous swapping.
proc.h
Fixed the problem that the p_lock flag was not being cleared on a fork.
swap_pager.c, vnode_pager.c
Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the
code doesn't need it anymore.
machdep.c
Changes to better support the parameter values for the merged VM/buffer cache
scheme.
machdep.c, kern_exec.c, vm_glue.c
Implemented a seperate submap for temporary exec string space and another one
to contain process upages. This eliminates all map fragmentation problems
that previously existed.
ffs_inode.c, ufs_inode.c, ufs_readwrite.c
Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on
busy buffers.
Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
|
|
|
return ((struct mount *) 0);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2000-03-14 14:19:49 +00:00
|
|
|
* Get a new unique fsid. Try to make its val[0] unique, since this value
|
|
|
|
* will be used to create fake device numbers for stat(). Also try (but
|
|
|
|
* not so hard) make its val[0] unique mod 2^16, since some emulators only
|
|
|
|
* support 16-bit device numbers. We end up with unique val[0]'s for the
|
|
|
|
* first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
|
1999-09-19 06:24:21 +00:00
|
|
|
*
|
2000-03-12 14:23:21 +00:00
|
|
|
* Keep in mind that several mounts may be running in parallel. Starting
|
2000-03-14 14:19:49 +00:00
|
|
|
* the search one past where the previous search terminated is both a
|
|
|
|
* micro-optimization and a defense against returning the same fsid to
|
|
|
|
* different mounts.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
|
|
|
void
|
1997-02-10 02:22:35 +00:00
|
|
|
vfs_getnewfsid(mp)
|
1994-05-24 10:09:53 +00:00
|
|
|
struct mount *mp;
|
|
|
|
{
|
2000-03-14 14:19:49 +00:00
|
|
|
static u_int16_t mntid_base;
|
1994-05-24 10:09:53 +00:00
|
|
|
fsid_t tfsid;
|
2000-03-14 14:19:49 +00:00
|
|
|
int mtype;
|
1999-09-19 06:24:21 +00:00
|
|
|
|
2001-01-24 12:35:55 +00:00
|
|
|
mtx_enter(&mntid_mtx, MTX_DEF);
|
1997-02-10 02:22:35 +00:00
|
|
|
mtype = mp->mnt_vfc->vfc_typenum;
|
2000-03-12 14:23:21 +00:00
|
|
|
tfsid.val[1] = mtype;
|
2000-07-07 14:01:08 +00:00
|
|
|
mtype = (mtype & 0xFF) << 24;
|
2000-03-14 14:19:49 +00:00
|
|
|
for (;;) {
|
2000-07-07 14:01:08 +00:00
|
|
|
tfsid.val[0] = makeudev(255,
|
|
|
|
mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
|
|
|
|
mntid_base++;
|
1999-09-19 06:24:21 +00:00
|
|
|
if (vfs_getvfs(&tfsid) == NULL)
|
|
|
|
break;
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
|
1999-09-19 06:24:21 +00:00
|
|
|
mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
|
2001-01-24 12:35:55 +00:00
|
|
|
mtx_exit(&mntid_mtx, MTX_DEF);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
Support full-precision file timestamps. Until now, only the seconds
have been maintained, and that is still the default. A new sysctl
variable "vfs.timestamp_precision" can be used to enable higher
levels of precision:
0 = seconds only; nanoseconds zeroed (default).
1 = seconds and nanoseconds, accurate within 1/HZ.
2 = seconds and nanoseconds, truncated to microseconds.
>=3 = seconds and nanoseconds, maximum precision.
Level 1 uses getnanotime(), which is fast but can be wrong by up
to 1/HZ. Level 2 uses microtime(). It might be desirable for
consistency with utimes() and friends, which take timeval structures
rather than timespecs. Level 3 uses nanotime() for the highest
precision.
I benchmarked levels 0, 1, and 3 by copying a 550 MB tree with
"cpio -pdu". There was almost negligible difference in the system
times -- much less than 1%, and less than the variation among
multiple runs at the same level. Bruce Evans dreamed up a torture
test involving 1-byte reads with intervening fstat() calls, but
the cpio test seems more realistic to me.
This feature is currently implemented only for the UFS (FFS and
MFS) filesystems. But I think it should be easy to support it in
the others as well.
An earlier version of this was reviewed by Bruce. He's not to
blame for any breakage I've introduced since then.
Reviewed by: bde (an earlier version of the code)
1999-08-22 00:15:16 +00:00
|
|
|
/*
|
|
|
|
* Knob to control the precision of file timestamps:
|
|
|
|
*
|
|
|
|
* 0 = seconds only; nanoseconds zeroed.
|
|
|
|
* 1 = seconds and nanoseconds, accurate within 1/HZ.
|
|
|
|
* 2 = seconds and nanoseconds, truncated to microseconds.
|
|
|
|
* >=3 = seconds and nanoseconds, maximum precision.
|
|
|
|
*/
|
|
|
|
enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
|
|
|
|
|
|
|
|
static int timestamp_precision = TSP_SEC;
|
|
|
|
SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
|
|
|
|
×tamp_precision, 0, "");
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Get a current timestamp.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
vfs_timestamp(tsp)
|
|
|
|
struct timespec *tsp;
|
|
|
|
{
|
|
|
|
struct timeval tv;
|
|
|
|
|
|
|
|
switch (timestamp_precision) {
|
|
|
|
case TSP_SEC:
|
|
|
|
tsp->tv_sec = time_second;
|
|
|
|
tsp->tv_nsec = 0;
|
|
|
|
break;
|
|
|
|
case TSP_HZ:
|
|
|
|
getnanotime(tsp);
|
|
|
|
break;
|
|
|
|
case TSP_USEC:
|
|
|
|
microtime(&tv);
|
|
|
|
TIMEVAL_TO_TIMESPEC(&tv, tsp);
|
|
|
|
break;
|
|
|
|
case TSP_NSEC:
|
|
|
|
default:
|
|
|
|
nanotime(tsp);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Set vnode attributes to VNOVAL
|
|
|
|
*/
|
1994-05-25 09:21:21 +00:00
|
|
|
void
|
|
|
|
vattr_null(vap)
|
1994-05-24 10:09:53 +00:00
|
|
|
register struct vattr *vap;
|
|
|
|
{
|
|
|
|
|
|
|
|
vap->va_type = VNON;
|
1994-05-25 09:21:21 +00:00
|
|
|
vap->va_size = VNOVAL;
|
|
|
|
vap->va_bytes = VNOVAL;
|
1998-07-12 16:45:39 +00:00
|
|
|
vap->va_mode = VNOVAL;
|
|
|
|
vap->va_nlink = VNOVAL;
|
|
|
|
vap->va_uid = VNOVAL;
|
|
|
|
vap->va_gid = VNOVAL;
|
|
|
|
vap->va_fsid = VNOVAL;
|
|
|
|
vap->va_fileid = VNOVAL;
|
|
|
|
vap->va_blocksize = VNOVAL;
|
|
|
|
vap->va_rdev = VNOVAL;
|
|
|
|
vap->va_atime.tv_sec = VNOVAL;
|
|
|
|
vap->va_atime.tv_nsec = VNOVAL;
|
|
|
|
vap->va_mtime.tv_sec = VNOVAL;
|
|
|
|
vap->va_mtime.tv_nsec = VNOVAL;
|
|
|
|
vap->va_ctime.tv_sec = VNOVAL;
|
|
|
|
vap->va_ctime.tv_nsec = VNOVAL;
|
|
|
|
vap->va_flags = VNOVAL;
|
|
|
|
vap->va_gen = VNOVAL;
|
1994-05-24 10:09:53 +00:00
|
|
|
vap->va_vaflags = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Routines having to do with the management of the vnode table.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Return the next vnode from the free list.
|
|
|
|
*/
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
1994-05-24 10:09:53 +00:00
|
|
|
getnewvnode(tag, mp, vops, vpp)
|
|
|
|
enum vtagtype tag;
|
|
|
|
struct mount *mp;
|
1995-11-09 08:17:23 +00:00
|
|
|
vop_t **vops;
|
1994-05-24 10:09:53 +00:00
|
|
|
struct vnode **vpp;
|
|
|
|
{
|
2000-07-04 04:32:40 +00:00
|
|
|
int s, count;
|
1997-02-10 02:22:35 +00:00
|
|
|
struct proc *p = curproc; /* XXX */
|
2000-07-04 04:32:40 +00:00
|
|
|
struct vnode *vp = NULL;
|
2000-07-11 22:07:57 +00:00
|
|
|
struct mount *vnmp;
|
Make our v_usecount vnode reference count work identically to the
original BSD code. The association between the vnode and the vm_object
no longer includes reference counts. The major difference is that
vm_object's are no longer freed gratuitously from the vnode, and so
once an object is created for the vnode, it will last as long as the
vnode does.
When a vnode object reference count is incremented, then the underlying
vnode reference count is incremented also. The two "objects" are now
more intimately related, and so the interactions are now much less
complex.
Vnodes are now normally placed onto the free queue with an object still
attached. The rundown of the object happens at vnode rundown time, and
happens with exactly the same filesystem semantics of the original VFS
code. There is absolutely no need for vnode_pager_uncache and other
travesties like that anymore.
A side-effect of these changes is that SMP locking should be much simpler,
the I/O copyin/copyout optimizations work, NFS should be more ponderable,
and further work on layered filesystems should be less frustrating, because
of the totally coherent management of the vnode objects and vnodes.
Please be careful with your system while running this code, but I would
greatly appreciate feedback as soon as reasonably possible.
1998-01-06 05:26:17 +00:00
|
|
|
vm_object_t object;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
1995-03-09 20:27:04 +00:00
|
|
|
/*
|
1. Add a {pointer, v_id} pair to the vnode to store the reference to the
".." vnode. This is cheaper storagewise than keeping it in the
namecache, and it makes more sense since it's a 1:1 mapping.
2. Also handle the case of "." more intelligently rather than stuff
the namecache with pointless entries.
3. Add two lists to the vnode and hang namecache entries which go from
or to this vnode. When cleaning a vnode, delete all namecache
entries it invalidates.
4. Never reuse namecache entries, malloc new ones when we need it, free
old ones when they die. No longer a hard limit on how many we can
have.
5. Remove the upper limit on namelength of namecache entries.
6. Make a global list for negative namecache entries, limit their number
to a sysctl'able (debug.ncnegfactor) fraction of the total namecache.
Currently the default fraction is 1/16th. (Suggestions for better
default wanted!)
7. Assign v_id correctly in the face of 32bit rollover.
8. Remove the LRU list for namecache entries, not needed. Remove the
#ifdef NCH_STATISTICS stuff, it's not needed either.
9. Use the vnode freelist as a true LRU list, also for namecache accesses.
10. Reuse vnodes more aggressively but also more selectively, if we can't
reuse, malloc a new one. There is no longer a hard limit on their
number, they grow to the point where we don't reuse potentially
usable vnodes. A vnode will not get recycled if still has pages in
core or if it is the source of namecache entries (Yes, this does
indeed work :-) "." and ".." are not namecache entries any longer...)
11. Do not overload the v_id field in namecache entries with whiteout
information, use a char sized flags field instead, so we can get
rid of the vpid and v_id fields from the namecache struct. Since
we're linked to the vnodes and purged when they're cleaned, we don't
have to check the v_id any more.
12. NFS knew about the limitation on name length in the namecache, it
shouldn't and doesn't now.
Bugs:
The namecache statistics no longer includes the hits for ".."
and "." hits.
Performance impact:
Generally in the +/- 0.5% for "normal" workstations, but
I hope this will allow the system to be selftuning over a
bigger range of "special" applications. The case where
RAM is available but unused for cache because we don't have
any vnodes should be gone.
Future work:
Straighten out the namecache statistics.
"desiredvnodes" is still used to (bogusly ?) size hash
tables in the filesystems.
I have still to find a way to safely free unused vnodes
back so their number can shrink when not needed.
There are a few uses of the v_id field left in the filesystems,
scheduled for demolition at a later time.
Maybe a one slot cache for unused namecache entries should
be implemented to decrease the malloc/free frequency.
1997-05-04 09:17:38 +00:00
|
|
|
* We take the least recently used vnode from the freelist
|
|
|
|
* if we can get it and it has no cached pages, and no
|
|
|
|
* namecache entries are relative to it.
|
|
|
|
* Otherwise we allocate a new vnode
|
1995-03-09 20:27:04 +00:00
|
|
|
*/
|
1. Add a {pointer, v_id} pair to the vnode to store the reference to the
".." vnode. This is cheaper storagewise than keeping it in the
namecache, and it makes more sense since it's a 1:1 mapping.
2. Also handle the case of "." more intelligently rather than stuff
the namecache with pointless entries.
3. Add two lists to the vnode and hang namecache entries which go from
or to this vnode. When cleaning a vnode, delete all namecache
entries it invalidates.
4. Never reuse namecache entries, malloc new ones when we need it, free
old ones when they die. No longer a hard limit on how many we can
have.
5. Remove the upper limit on namelength of namecache entries.
6. Make a global list for negative namecache entries, limit their number
to a sysctl'able (debug.ncnegfactor) fraction of the total namecache.
Currently the default fraction is 1/16th. (Suggestions for better
default wanted!)
7. Assign v_id correctly in the face of 32bit rollover.
8. Remove the LRU list for namecache entries, not needed. Remove the
#ifdef NCH_STATISTICS stuff, it's not needed either.
9. Use the vnode freelist as a true LRU list, also for namecache accesses.
10. Reuse vnodes more aggressively but also more selectively, if we can't
reuse, malloc a new one. There is no longer a hard limit on their
number, they grow to the point where we don't reuse potentially
usable vnodes. A vnode will not get recycled if it still has pages in
core or if it is the source of namecache entries (Yes, this does
indeed work :-) "." and ".." are not namecache entries any longer...)
11. Do not overload the v_id field in namecache entries with whiteout
information, use a char sized flags field instead, so we can get
rid of the vpid and v_id fields from the namecache struct. Since
we're linked to the vnodes and purged when they're cleaned, we don't
have to check the v_id any more.
12. NFS knew about the limitation on name length in the namecache, it
shouldn't and doesn't now.
Bugs:
The namecache statistics no longer includes the hits for ".."
and "." hits.
Performance impact:
Generally in the +/- 0.5% for "normal" workstations, but
I hope this will allow the system to be selftuning over a
bigger range of "special" applications. The case where
RAM is available but unused for cache because we don't have
any vnodes should be gone.
Future work:
Straighten out the namecache statistics.
"desiredvnodes" is still used to (bogusly ?) size hash
tables in the filesystems.
I have still to find a way to safely free unused vnodes
back so their number can shrink when not needed.
There are a few uses of the v_id field left in the filesystems,
scheduled for demolition at a later time.
Maybe a one slot cache for unused namecache entries should
be implemented to decrease the malloc/free frequency.
1997-05-04 09:17:38 +00:00
|
|
|
|
1998-01-12 01:46:33 +00:00
|
|
|
s = splbio();
|
2001-01-24 12:35:55 +00:00
|
|
|
mtx_enter(&vnode_free_list_mtx, MTX_DEF);
|
1998-01-12 01:46:33 +00:00
|
|
|
|
1997-09-24 07:46:54 +00:00
|
|
|
if (wantfreevnodes && freevnodes < wantfreevnodes) {
|
|
|
|
vp = NULL;
|
1997-09-26 08:08:58 +00:00
|
|
|
} else if (!wantfreevnodes && freevnodes <= desiredvnodes) {
|
|
|
|
/*
|
|
|
|
* XXX: this is only here to be backwards compatible
|
|
|
|
*/
|
1997-09-24 07:46:54 +00:00
|
|
|
vp = NULL;
|
2000-07-04 04:32:40 +00:00
|
|
|
} else for (count = 0; count < freevnodes; count++) {
|
|
|
|
vp = TAILQ_FIRST(&vnode_free_list);
|
|
|
|
if (vp == NULL || vp->v_usecount)
|
|
|
|
panic("getnewvnode: free vnode isn't");
|
|
|
|
TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
|
|
|
|
/*
|
|
|
|
* Don't recycle if active in the namecache or
|
|
|
|
* if it still has cached pages or we cannot get
|
|
|
|
* its interlock.
|
|
|
|
*/
|
|
|
|
if (LIST_FIRST(&vp->v_cache_src) != NULL ||
|
2000-09-12 09:49:08 +00:00
|
|
|
(VOP_GETVOBJECT(vp, &object) == 0 &&
|
|
|
|
(object->resident_page_count || object->ref_count)) ||
|
2000-10-04 01:29:17 +00:00
|
|
|
!mtx_try_enter(&vp->v_interlock, MTX_DEF)) {
|
2000-07-04 04:32:40 +00:00
|
|
|
TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
|
|
|
|
vp = NULL;
|
|
|
|
continue;
|
1996-01-19 04:00:31 +00:00
|
|
|
}
|
2000-07-11 22:07:57 +00:00
|
|
|
/*
|
|
|
|
* Skip over it if its filesystem is being suspended.
|
|
|
|
*/
|
|
|
|
if (vn_start_write(vp, &vnmp, V_NOWAIT) == 0)
|
|
|
|
break;
|
2000-10-04 01:29:17 +00:00
|
|
|
mtx_exit(&vp->v_interlock, MTX_DEF);
|
2000-07-11 22:07:57 +00:00
|
|
|
TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
|
|
|
|
vp = NULL;
|
1. Add a {pointer, v_id} pair to the vnode to store the reference to the
".." vnode. This is cheaper storagewise than keeping it in the
namecache, and it makes more sense since it's a 1:1 mapping.
2. Also handle the case of "." more intelligently rather than stuff
the namecache with pointless entries.
3. Add two lists to the vnode and hang namecache entries which go from
or to this vnode. When cleaning a vnode, delete all namecache
entries it invalidates.
4. Never reuse namecache entries, malloc new ones when we need them, free
old ones when they die. No longer a hard limit on how many we can
have.
5. Remove the upper limit on namelength of namecache entries.
6. Make a global list for negative namecache entries, limit their number
to a sysctl'able (debug.ncnegfactor) fraction of the total namecache.
Currently the default fraction is 1/16th. (Suggestions for better
default wanted!)
7. Assign v_id correctly in the face of 32bit rollover.
8. Remove the LRU list for namecache entries, not needed. Remove the
#ifdef NCH_STATISTICS stuff, it's not needed either.
9. Use the vnode freelist as a true LRU list, also for namecache accesses.
10. Reuse vnodes more aggressively but also more selectively, if we can't
reuse, malloc a new one. There is no longer a hard limit on their
number, they grow to the point where we don't reuse potentially
usable vnodes. A vnode will not get recycled if it still has pages in
core or if it is the source of namecache entries (Yes, this does
indeed work :-) "." and ".." are not namecache entries any longer...)
11. Do not overload the v_id field in namecache entries with whiteout
information, use a char sized flags field instead, so we can get
rid of the vpid and v_id fields from the namecache struct. Since
we're linked to the vnodes and purged when they're cleaned, we don't
have to check the v_id any more.
12. NFS knew about the limitation on name length in the namecache, it
shouldn't and doesn't now.
Bugs:
The namecache statistics no longer includes the hits for ".."
and "." hits.
Performance impact:
Generally in the +/- 0.5% for "normal" workstations, but
I hope this will allow the system to be selftuning over a
bigger range of "special" applications. The case where
RAM is available but unused for cache because we don't have
any vnodes should be gone.
Future work:
Straighten out the namecache statistics.
"desiredvnodes" is still used to (bogusly ?) size hash
tables in the filesystems.
I have still to find a way to safely free unused vnodes
back so their number can shrink when not needed.
There are a few uses of the v_id field left in the filesystems,
scheduled for demolition at a later time.
Maybe a one slot cache for unused namecache entries should
be implemented to decrease the malloc/free frequency.
1997-05-04 09:17:38 +00:00
|
|
|
}
|
|
|
|
if (vp) {
|
1997-08-31 07:32:39 +00:00
|
|
|
vp->v_flag |= VDOOMED;
|
2000-11-02 21:42:54 +00:00
|
|
|
vp->v_flag &= ~VFREE;
|
1995-03-09 20:27:04 +00:00
|
|
|
freevnodes--;
|
2001-01-24 12:35:55 +00:00
|
|
|
mtx_exit(&vnode_free_list_mtx, MTX_DEF);
|
1997-08-31 07:32:39 +00:00
|
|
|
cache_purge(vp);
|
1994-05-24 10:09:53 +00:00
|
|
|
vp->v_lease = NULL;
|
1997-12-29 00:25:11 +00:00
|
|
|
if (vp->v_type != VBAD) {
|
1997-02-10 02:22:35 +00:00
|
|
|
vgonel(vp, p);
|
1997-12-29 00:25:11 +00:00
|
|
|
} else {
|
2000-10-04 01:29:17 +00:00
|
|
|
mtx_exit(&vp->v_interlock, MTX_DEF);
|
1997-02-10 02:22:35 +00:00
|
|
|
}
|
2000-07-11 22:07:57 +00:00
|
|
|
vn_finished_write(vnmp);
|
1996-01-19 04:00:31 +00:00
|
|
|
|
1999-01-08 17:31:30 +00:00
|
|
|
#ifdef INVARIANTS
|
1994-10-02 17:35:40 +00:00
|
|
|
{
|
These changes embody the support of the fully coherent merged VM buffer cache,
much higher filesystem I/O performance, and much better paging performance. It
represents the culmination of over 6 months of R&D.
The majority of the merged VM/cache work is by John Dyson.
The following highlights the most significant changes. Additionally, there are
(mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to
support the new VM/buffer scheme.
vfs_bio.c:
Significant rewrite of most of vfs_bio to support the merged VM buffer cache
scheme. The scheme is almost fully compatible with the old filesystem
interface. Significant improvement in the number of opportunities for write
clustering.
vfs_cluster.c, vfs_subr.c
Upgrade and performance enhancements in vfs layer code to support merged
VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff.
vm_object.c:
Yet more improvements in the collapse code. Elimination of some windows that
can cause list corruption.
vm_pageout.c:
Fixed it, it really works better now. Somehow in 2.0, some "enhancements"
broke the code. This code has been reworked from the ground-up.
vm_fault.c, vm_page.c, pmap.c, vm_object.c
Support for small-block filesystems with merged VM/buffer cache scheme.
pmap.c vm_map.c
Dynamic kernel VM size, now we don't have to pre-allocate excessive numbers of
kernel PTs.
vm_glue.c
Much simpler and more effective swapping code. No more gratuitous swapping.
proc.h
Fixed the problem that the p_lock flag was not being cleared on a fork.
swap_pager.c, vnode_pager.c
Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the
code doesn't need it anymore.
machdep.c
Changes to better support the parameter values for the merged VM/buffer cache
scheme.
machdep.c, kern_exec.c, vm_glue.c
Implemented a separate submap for temporary exec string space and another one
to contain process upages. This eliminates all map fragmentation problems
that previously existed.
ffs_inode.c, ufs_inode.c, ufs_readwrite.c
Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on
busy buffers.
Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
|
|
|
int s;
|
|
|
|
|
|
|
|
if (vp->v_data)
|
|
|
|
panic("cleaned vnode isn't");
|
|
|
|
s = splbio();
|
|
|
|
if (vp->v_numoutput)
|
|
|
|
panic("Clean vnode has pending I/O's");
|
|
|
|
splx(s);
|
2000-07-11 22:07:57 +00:00
|
|
|
if (vp->v_writecount != 0)
|
|
|
|
panic("Non-zero write count");
|
1994-10-02 17:35:40 +00:00
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
#endif
|
|
|
|
vp->v_flag = 0;
|
|
|
|
vp->v_lastw = 0;
|
|
|
|
vp->v_lasta = 0;
|
|
|
|
vp->v_cstart = 0;
|
|
|
|
vp->v_clen = 0;
|
|
|
|
vp->v_socket = 0;
|
1. Add a {pointer, v_id} pair to the vnode to store the reference to the
".." vnode. This is cheaper storagewise than keeping it in the
namecache, and it makes more sense since it's a 1:1 mapping.
2. Also handle the case of "." more intelligently rather than stuff
the namecache with pointless entries.
3. Add two lists to the vnode and hang namecache entries which go from
or to this vnode. When cleaning a vnode, delete all namecache
entries it invalidates.
4. Never reuse namecache entries, malloc new ones when we need them, free
old ones when they die. No longer a hard limit on how many we can
have.
5. Remove the upper limit on namelength of namecache entries.
6. Make a global list for negative namecache entries, limit their number
to a sysctl'able (debug.ncnegfactor) fraction of the total namecache.
Currently the default fraction is 1/16th. (Suggestions for better
default wanted!)
7. Assign v_id correctly in the face of 32bit rollover.
8. Remove the LRU list for namecache entries, not needed. Remove the
#ifdef NCH_STATISTICS stuff, it's not needed either.
9. Use the vnode freelist as a true LRU list, also for namecache accesses.
10. Reuse vnodes more aggressively but also more selectively, if we can't
reuse, malloc a new one. There is no longer a hard limit on their
number, they grow to the point where we don't reuse potentially
usable vnodes. A vnode will not get recycled if it still has pages in
core or if it is the source of namecache entries (Yes, this does
indeed work :-) "." and ".." are not namecache entries any longer...)
11. Do not overload the v_id field in namecache entries with whiteout
information, use a char sized flags field instead, so we can get
rid of the vpid and v_id fields from the namecache struct. Since
we're linked to the vnodes and purged when they're cleaned, we don't
have to check the v_id any more.
12. NFS knew about the limitation on name length in the namecache, it
shouldn't and doesn't now.
Bugs:
The namecache statistics no longer includes the hits for ".."
and "." hits.
Performance impact:
Generally in the +/- 0.5% for "normal" workstations, but
I hope this will allow the system to be selftuning over a
bigger range of "special" applications. The case where
RAM is available but unused for cache because we don't have
any vnodes should be gone.
Future work:
Straighten out the namecache statistics.
"desiredvnodes" is still used to (bogusly ?) size hash
tables in the filesystems.
I have still to find a way to safely free unused vnodes
back so their number can shrink when not needed.
There are a few uses of the v_id field left in the filesystems,
scheduled for demolition at a later time.
Maybe a one slot cache for unused namecache entries should
be implemented to decrease the malloc/free frequency.
1997-05-04 09:17:38 +00:00
|
|
|
} else {
|
2001-01-24 12:35:55 +00:00
|
|
|
mtx_exit(&vnode_free_list_mtx, MTX_DEF);
|
VM level code cleanups.
1) Start using TSM.
Struct procs continue to point to upages structure, after being freed.
Struct vmspace continues to point to pte object and kva space for kstack.
u_map is now superfluous.
2) vm_map's don't need to be reference counted. They always exist either
in the kernel or in a vmspace. The vmspaces are managed by reference
counts.
3) Remove the "wired" vm_map nonsense.
4) No need to keep a cache of kernel stack kva's.
5) Get rid of strange looking ++var, and change to var++.
6) Change more data structures to use our "zone" allocator. Added
struct proc, struct vmspace and struct vnode. This saves a significant
amount of kva space and physical memory. Additionally, this enables
TSM for the zone managed memory.
7) Keep ioopt disabled for now.
8) Remove the now bogus "single use" map concept.
9) Use generation counts or id's for data structures residing in TSM, where
it allows us to avoid unneeded restart overhead during traversals, where
blocking might occur.
10) Account better for memory deficits, so the pageout daemon will be able
to make enough memory available (experimental.)
11) Fix some vnode locking problems. (From Tor, I think.)
12) Add a check in ufs_lookup, to avoid lots of unneeded calls to bcmp.
(experimental.)
13) Significantly shrink, cleanup, and make slightly faster the vm_fault.c
code. Use generation counts, get rid of unneeded collapse operations,
and clean up the cluster code.
14) Make vm_zone more suitable for TSM.
This commit is partially as a result of discussions and contributions from
other people, including DG, Tor Egge, PHK, and probably others that I
have forgotten to attribute (so let me know, if I forgot.)
This is not the infamous, final cleanup of the vnode stuff, but a necessary
step. Vnode mgmt should be correct, but things might still change, and
there is still some missing stuff (like ioopt, and physical backing of
non-merged cache files, debugging of layering concepts.)
1998-01-22 17:30:44 +00:00
|
|
|
vp = (struct vnode *) zalloc(vnode_zone);
|
1. Add a {pointer, v_id} pair to the vnode to store the reference to the
".." vnode. This is cheaper storagewise than keeping it in the
namecache, and it makes more sense since it's a 1:1 mapping.
2. Also handle the case of "." more intelligently rather than stuff
the namecache with pointless entries.
3. Add two lists to the vnode and hang namecache entries which go from
or to this vnode. When cleaning a vnode, delete all namecache
entries it invalidates.
4. Never reuse namecache entries, malloc new ones when we need them, free
old ones when they die. No longer a hard limit on how many we can
have.
5. Remove the upper limit on namelength of namecache entries.
6. Make a global list for negative namecache entries, limit their number
to a sysctl'able (debug.ncnegfactor) fraction of the total namecache.
Currently the default fraction is 1/16th. (Suggestions for better
default wanted!)
7. Assign v_id correctly in the face of 32bit rollover.
8. Remove the LRU list for namecache entries, not needed. Remove the
#ifdef NCH_STATISTICS stuff, it's not needed either.
9. Use the vnode freelist as a true LRU list, also for namecache accesses.
10. Reuse vnodes more aggressively but also more selectively, if we can't
reuse, malloc a new one. There is no longer a hard limit on their
number, they grow to the point where we don't reuse potentially
usable vnodes. A vnode will not get recycled if it still has pages in
core or if it is the source of namecache entries (Yes, this does
indeed work :-) "." and ".." are not namecache entries any longer...)
11. Do not overload the v_id field in namecache entries with whiteout
information, use a char sized flags field instead, so we can get
rid of the vpid and v_id fields from the namecache struct. Since
we're linked to the vnodes and purged when they're cleaned, we don't
have to check the v_id any more.
12. NFS knew about the limitation on name length in the namecache, it
shouldn't and doesn't now.
Bugs:
The namecache statistics no longer includes the hits for ".."
and "." hits.
Performance impact:
Generally in the +/- 0.5% for "normal" workstations, but
I hope this will allow the system to be selftuning over a
bigger range of "special" applications. The case where
RAM is available but unused for cache because we don't have
any vnodes should be gone.
Future work:
Straighten out the namecache statistics.
"desiredvnodes" is still used to (bogusly ?) size hash
tables in the filesystems.
I have still to find a way to safely free unused vnodes
back so their number can shrink when not needed.
There are a few uses of the v_id field left in the filesystems,
scheduled for demolition at a later time.
Maybe a one slot cache for unused namecache entries should
be implemented to decrease the malloc/free frequency.
1997-05-04 09:17:38 +00:00
|
|
|
bzero((char *) vp, sizeof *vp);
|
2000-10-04 01:29:17 +00:00
|
|
|
mtx_init(&vp->v_interlock, "vnode interlock", MTX_DEF);
|
1. Add a {pointer, v_id} pair to the vnode to store the reference to the
".." vnode. This is cheaper storagewise than keeping it in the
namecache, and it makes more sense since it's a 1:1 mapping.
2. Also handle the case of "." more intelligently rather than stuff
the namecache with pointless entries.
3. Add two lists to the vnode and hang namecache entries which go from
or to this vnode. When cleaning a vnode, delete all namecache
entries it invalidates.
4. Never reuse namecache entries, malloc new ones when we need them, free
old ones when they die. No longer a hard limit on how many we can
have.
5. Remove the upper limit on namelength of namecache entries.
6. Make a global list for negative namecache entries, limit their number
to a sysctl'able (debug.ncnegfactor) fraction of the total namecache.
Currently the default fraction is 1/16th. (Suggestions for better
default wanted!)
7. Assign v_id correctly in the face of 32bit rollover.
8. Remove the LRU list for namecache entries, not needed. Remove the
#ifdef NCH_STATISTICS stuff, it's not needed either.
9. Use the vnode freelist as a true LRU list, also for namecache accesses.
10. Reuse vnodes more aggressively but also more selectively, if we can't
reuse, malloc a new one. There is no longer a hard limit on their
number, they grow to the point where we don't reuse potentially
usable vnodes. A vnode will not get recycled if it still has pages in
core or if it is the source of namecache entries (Yes, this does
indeed work :-) "." and ".." are not namecache entries any longer...)
11. Do not overload the v_id field in namecache entries with whiteout
information, use a char sized flags field instead, so we can get
rid of the vpid and v_id fields from the namecache struct. Since
we're linked to the vnodes and purged when they're cleaned, we don't
have to check the v_id any more.
12. NFS knew about the limitation on name length in the namecache, it
shouldn't and doesn't now.
Bugs:
The namecache statistics no longer includes the hits for ".."
and "." hits.
Performance impact:
Generally in the +/- 0.5% for "normal" workstations, but
I hope this will allow the system to be selftuning over a
bigger range of "special" applications. The case where
RAM is available but unused for cache because we don't have
any vnodes should be gone.
Future work:
Straighten out the namecache statistics.
"desiredvnodes" is still used to (bogusly ?) size hash
tables in the filesystems.
I have still to find a way to safely free unused vnodes
back so their number can shrink when not needed.
There are a few uses of the v_id field left in the filesystems,
scheduled for demolition at a later time.
Maybe a one slot cache for unused namecache entries should
be implemented to decrease the malloc/free frequency.
1997-05-04 09:17:38 +00:00
|
|
|
vp->v_dd = vp;
|
2001-01-24 12:35:55 +00:00
|
|
|
mtx_init(&vp->v_pollinfo.vpi_lock, "vnode pollinfo", MTX_DEF);
|
1997-08-31 07:32:39 +00:00
|
|
|
cache_purge(vp);
|
1. Add a {pointer, v_id} pair to the vnode to store the reference to the
".." vnode. This is cheaper storagewise than keeping it in the
namecache, and it makes more sense since it's a 1:1 mapping.
2. Also handle the case of "." more intelligently rather than stuff
the namecache with pointless entries.
3. Add two lists to the vnode and hang namecache entries which go from
or to this vnode. When cleaning a vnode, delete all namecache
entries it invalidates.
4. Never reuse namecache entries, malloc new ones when we need them, free
old ones when they die. No longer a hard limit on how many we can
have.
5. Remove the upper limit on namelength of namecache entries.
6. Make a global list for negative namecache entries, limit their number
to a sysctl'able (debug.ncnegfactor) fraction of the total namecache.
Currently the default fraction is 1/16th. (Suggestions for better
default wanted!)
7. Assign v_id correctly in the face of 32bit rollover.
8. Remove the LRU list for namecache entries, not needed. Remove the
#ifdef NCH_STATISTICS stuff, it's not needed either.
9. Use the vnode freelist as a true LRU list, also for namecache accesses.
10. Reuse vnodes more aggressively but also more selectively, if we can't
reuse, malloc a new one. There is no longer a hard limit on their
number, they grow to the point where we don't reuse potentially
usable vnodes. A vnode will not get recycled if it still has pages in
core or if it is the source of namecache entries (Yes, this does
indeed work :-) "." and ".." are not namecache entries any longer...)
11. Do not overload the v_id field in namecache entries with whiteout
information, use a char sized flags field instead, so we can get
rid of the vpid and v_id fields from the namecache struct. Since
we're linked to the vnodes and purged when they're cleaned, we don't
have to check the v_id any more.
12. NFS knew about the limitation on name length in the namecache, it
shouldn't and doesn't now.
Bugs:
The namecache statistics no longer includes the hits for ".."
and "." hits.
Performance impact:
Generally in the +/- 0.5% for "normal" workstations, but
I hope this will allow the system to be selftuning over a
bigger range of "special" applications. The case where
RAM is available but unused for cache because we don't have
any vnodes should be gone.
Future work:
Straighten out the namecache statistics.
"desiredvnodes" is still used to (bogusly ?) size hash
tables in the filesystems.
I have still to find a way to safely free unused vnodes
back so their number can shrink when not needed.
There are a few uses of the v_id field left in the filesystems,
scheduled for demolition at a later time.
Maybe a one slot cache for unused namecache entries should
be implemented to decrease the malloc/free frequency.
1997-05-04 09:17:38 +00:00
|
|
|
LIST_INIT(&vp->v_cache_src);
|
|
|
|
TAILQ_INIT(&vp->v_cache_dst);
|
|
|
|
numvnodes++;
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
1. Add a {pointer, v_id} pair to the vnode to store the reference to the
".." vnode. This is cheaper storagewise than keeping it in the
namecache, and it makes more sense since it's a 1:1 mapping.
2. Also handle the case of "." more intelligently rather than stuff
the namecache with pointless entries.
3. Add two lists to the vnode and hang namecache entries which go from
or to this vnode. When cleaning a vnode, delete all namecache
entries it invalidates.
4. Never reuse namecache entries, malloc new ones when we need them, free
old ones when they die. No longer a hard limit on how many we can
have.
5. Remove the upper limit on namelength of namecache entries.
6. Make a global list for negative namecache entries, limit their number
to a sysctl'able (debug.ncnegfactor) fraction of the total namecache.
Currently the default fraction is 1/16th. (Suggestions for better
default wanted!)
7. Assign v_id correctly in the face of 32bit rollover.
8. Remove the LRU list for namecache entries, not needed. Remove the
#ifdef NCH_STATISTICS stuff, it's not needed either.
9. Use the vnode freelist as a true LRU list, also for namecache accesses.
10. Reuse vnodes more aggressively but also more selectively, if we can't
reuse, malloc a new one. There is no longer a hard limit on their
number, they grow to the point where we don't reuse potentially
usable vnodes. A vnode will not get recycled if it still has pages in
core or if it is the source of namecache entries (Yes, this does
indeed work :-) "." and ".." are not namecache entries any longer...)
11. Do not overload the v_id field in namecache entries with whiteout
information, use a char sized flags field instead, so we can get
rid of the vpid and v_id fields from the namecache struct. Since
we're linked to the vnodes and purged when they're cleaned, we don't
have to check the v_id any more.
12. NFS knew about the limitation on name length in the namecache, it
shouldn't and doesn't now.
Bugs:
The namecache statistics no longer includes the hits for ".."
and "." hits.
Performance impact:
Generally in the +/- 0.5% for "normal" workstations, but
I hope this will allow the system to be selftuning over a
bigger range of "special" applications. The case where
RAM is available but unused for cache because we don't have
any vnodes should be gone.
Future work:
Straighten out the namecache statistics.
"desiredvnodes" is still used to (bogusly ?) size hash
tables in the filesystems.
I have still to find a way to safely free unused vnodes
back so their number can shrink when not needed.
There are a few uses of the v_id field left in the filesystems,
scheduled for demolition at a later time.
Maybe a one slot cache for unused namecache entries should
be implemented to decrease the malloc/free frequency.
1997-05-04 09:17:38 +00:00
|
|
|
|
1998-10-31 14:20:39 +00:00
|
|
|
TAILQ_INIT(&vp->v_cleanblkhd);
|
|
|
|
TAILQ_INIT(&vp->v_dirtyblkhd);
|
1995-02-27 06:50:08 +00:00
|
|
|
vp->v_type = VNON;
|
1994-05-24 10:09:53 +00:00
|
|
|
vp->v_tag = tag;
|
|
|
|
vp->v_op = vops;
|
2000-09-25 15:24:04 +00:00
|
|
|
lockinit(&vp->v_lock, PVFS, "vnlock", 0, LK_NOPAUSE);
|
1994-05-24 10:09:53 +00:00
|
|
|
insmntque(vp, mp);
|
|
|
|
*vpp = vp;
|
|
|
|
vp->v_usecount = 1;
|
|
|
|
vp->v_data = 0;
|
1998-01-12 01:46:33 +00:00
|
|
|
splx(s);
|
1998-02-23 06:59:52 +00:00
|
|
|
|
1999-01-05 18:50:03 +00:00
|
|
|
vfs_object_create(vp, p, p->p_ucred);
|
1994-05-24 10:09:53 +00:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Move a vnode from one mount queue to another.
|
|
|
|
*/
|
1997-11-22 08:35:46 +00:00
|
|
|
static void
|
1994-05-24 10:09:53 +00:00
|
|
|
insmntque(vp, mp)
|
|
|
|
register struct vnode *vp;
|
|
|
|
register struct mount *mp;
|
|
|
|
{
|
|
|
|
|
2001-01-24 12:35:55 +00:00
|
|
|
mtx_enter(&mntvnode_mtx, MTX_DEF);
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Delete from old mount point vnode list, if on one.
|
|
|
|
*/
|
|
|
|
if (vp->v_mount != NULL)
|
|
|
|
LIST_REMOVE(vp, v_mntvnodes);
|
|
|
|
/*
|
|
|
|
* Insert into list of vnodes for the new mount point, if available.
|
|
|
|
*/
|
1997-02-10 02:22:35 +00:00
|
|
|
if ((vp->v_mount = mp) == NULL) {
|
2001-01-24 12:35:55 +00:00
|
|
|
mtx_exit(&mntvnode_mtx, MTX_DEF);
|
1994-05-24 10:09:53 +00:00
|
|
|
return;
|
1997-02-10 02:22:35 +00:00
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
|
2001-01-24 12:35:55 +00:00
|
|
|
mtx_exit(&mntvnode_mtx, MTX_DEF);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Update outstanding I/O count and do wakeup if requested.
|
|
|
|
*/
|
1994-05-25 09:21:21 +00:00
|
|
|
void
|
1994-05-24 10:09:53 +00:00
|
|
|
vwakeup(bp)
|
|
|
|
register struct buf *bp;
|
|
|
|
{
|
|
|
|
register struct vnode *vp;
|
|
|
|
|
|
|
|
bp->b_flags &= ~B_WRITEINPROG;
|
1994-09-25 19:34:02 +00:00
|
|
|
if ((vp = bp->b_vp)) {
|
1994-05-24 10:09:53 +00:00
|
|
|
vp->v_numoutput--;
|
|
|
|
if (vp->v_numoutput < 0)
|
|
|
|
panic("vwakeup: neg numoutput");
|
1995-02-22 09:39:22 +00:00
|
|
|
if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
|
1994-05-24 10:09:53 +00:00
|
|
|
vp->v_flag &= ~VBWAIT;
|
1995-01-10 07:32:52 +00:00
|
|
|
wakeup((caddr_t) &vp->v_numoutput);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Flush out and invalidate all buffers associated with a vnode.
|
|
|
|
* Called with the underlying object locked.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
|
|
|
|
register struct vnode *vp;
|
|
|
|
int flags;
|
|
|
|
struct ucred *cred;
|
|
|
|
struct proc *p;
|
|
|
|
int slpflag, slptimeo;
|
|
|
|
{
|
|
|
|
register struct buf *bp;
|
|
|
|
struct buf *nbp, *blist;
|
|
|
|
int s, error;
|
1994-08-29 06:09:15 +00:00
|
|
|
vm_object_t object;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
1998-06-10 18:13:19 +00:00
|
|
|
if (flags & V_SAVE) {
|
|
|
|
s = splbio();
|
|
|
|
while (vp->v_numoutput) {
|
|
|
|
vp->v_flag |= VBWAIT;
|
1998-12-22 00:44:11 +00:00
|
|
|
error = tsleep((caddr_t)&vp->v_numoutput,
|
|
|
|
slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo);
|
|
|
|
if (error) {
|
|
|
|
splx(s);
|
|
|
|
return (error);
|
|
|
|
}
|
1998-06-10 18:13:19 +00:00
|
|
|
}
|
1998-10-31 14:20:39 +00:00
|
|
|
if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
|
1998-06-10 18:13:19 +00:00
|
|
|
splx(s);
|
|
|
|
if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) != 0)
|
|
|
|
return (error);
|
|
|
|
s = splbio();
|
|
|
|
if (vp->v_numoutput > 0 ||
|
1998-10-31 14:20:39 +00:00
|
|
|
!TAILQ_EMPTY(&vp->v_dirtyblkhd))
|
1998-06-10 18:13:19 +00:00
|
|
|
panic("vinvalbuf: dirty bufs");
|
|
|
|
}
|
|
|
|
splx(s);
|
|
|
|
}
|
1996-08-21 21:56:23 +00:00
|
|
|
s = splbio();
|
1994-05-24 10:09:53 +00:00
|
|
|
for (;;) {
|
1998-10-31 14:20:39 +00:00
|
|
|
blist = TAILQ_FIRST(&vp->v_cleanblkhd);
|
1998-10-29 09:51:28 +00:00
|
|
|
if (!blist)
|
1998-10-31 14:20:39 +00:00
|
|
|
blist = TAILQ_FIRST(&vp->v_dirtyblkhd);
|
1994-05-24 10:09:53 +00:00
|
|
|
if (!blist)
|
|
|
|
break;
|
|
|
|
|
|
|
|
for (bp = blist; bp; bp = nbp) {
|
1998-10-31 14:20:39 +00:00
|
|
|
nbp = TAILQ_NEXT(bp, b_vnbufs);
|
1999-06-26 02:47:16 +00:00
|
|
|
if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
|
|
|
|
error = BUF_TIMELOCK(bp,
|
|
|
|
LK_EXCLUSIVE | LK_SLEEPFAIL,
|
|
|
|
"vinvalbuf", slpflag, slptimeo);
|
|
|
|
if (error == ENOLCK)
|
|
|
|
break;
|
|
|
|
splx(s);
|
|
|
|
return (error);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
/*
|
These changes embody the support of the fully coherent merged VM buffer cache,
much higher filesystem I/O performance, and much better paging performance. It
represents the culmination of over 6 months of R&D.
The majority of the merged VM/cache work is by John Dyson.
The following highlights the most significant changes. Additionally, there are
(mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to
support the new VM/buffer scheme.
vfs_bio.c:
Significant rewrite of most of vfs_bio to support the merged VM buffer cache
scheme. The scheme is almost fully compatible with the old filesystem
interface. Significant improvement in the number of opportunities for write
clustering.
vfs_cluster.c, vfs_subr.c
Upgrade and performance enhancements in vfs layer code to support merged
VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff.
vm_object.c:
Yet more improvements in the collapse code. Elimination of some windows that
can cause list corruption.
vm_pageout.c:
Fixed it, it really works better now. Somehow in 2.0, some "enhancements"
broke the code. This code has been reworked from the ground-up.
vm_fault.c, vm_page.c, pmap.c, vm_object.c
Support for small-block filesystems with merged VM/buffer cache scheme.
pmap.c vm_map.c
Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of
kernel PTs.
vm_glue.c
Much simpler and more effective swapping code. No more gratuitous swapping.
proc.h
Fixed the problem that the p_lock flag was not being cleared on a fork.
swap_pager.c, vnode_pager.c
Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the
code doesn't need it anymore.
machdep.c
Changes to better support the parameter values for the merged VM/buffer cache
scheme.
machdep.c, kern_exec.c, vm_glue.c
Implemented a seperate submap for temporary exec string space and another one
to contain process upages. This eliminates all map fragmentation problems
that previously existed.
ffs_inode.c, ufs_inode.c, ufs_readwrite.c
Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on
busy buffers.
Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
|
|
|
* XXX Since there are no node locks for NFS, I
|
|
|
|
* believe there is a slight chance that a delayed
|
|
|
|
* write will occur while sleeping just above, so
|
In kern_physio.c fix tsleep priority messup.
In vfs_bio.c, remove b_generation count usage,
remove redundant reassignbuf,
remove redundant spl(s),
manage page PG_ZERO flags more correctly,
utilize in invalid value for b_offset until it
is properly initialized. Add asserts
for #ifdef DIAGNOSTIC, when b_offset is
improperly used.
when a process is not performing I/O, and just waiting
on a buffer generally, make the sleep priority
low.
only check page validity in getblk for B_VMIO buffers.
In vfs_cluster, add b_offset asserts, correct pointer calculation
for clustered reads. Improve readability of certain parts of
the code. Remove redundant spl(s).
In vfs_subr, correct usage of vfs_bio_awrite (From Andrew Gallatin
<gallatin@cs.duke.edu>). More vtruncbuf problems fixed.
1998-03-19 22:48:16 +00:00
|
|
|
* check for it. Note that vfs_bio_awrite expects
|
|
|
|
* buffers to reside on a queue, while VOP_BWRITE and
|
|
|
|
* brelse do not.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
In kern_physio.c fix tsleep priority messup.
In vfs_bio.c, remove b_generation count usage,
remove redundant reassignbuf,
remove redundant spl(s),
manage page PG_ZERO flags more correctly,
utilize in invalid value for b_offset until it
is properly initialized. Add asserts
for #ifdef DIAGNOSTIC, when b_offset is
improperly used.
when a process is not performing I/O, and just waiting
on a buffer generally, make the sleep priority
low.
only check page validity in getblk for B_VMIO buffers.
In vfs_cluster, add b_offset asserts, correct pointer calculation
for clustered reads. Improve readability of certain parts of
the code. Remove redundant spl(s).
In vfs_subr, correct usage of vfs_bio_awrite (From Andrew Gallatin
<gallatin@cs.duke.edu>). More vtruncbuf problems fixed.
1998-03-19 22:48:16 +00:00
|
|
|
if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
|
|
|
|
(flags & V_SAVE)) {
|
|
|
|
|
Make our v_usecount vnode reference count work identically to the
original BSD code. The association between the vnode and the vm_object
no longer includes reference counts. The major difference is that
vm_object's are no longer freed gratuitiously from the vnode, and so
once an object is created for the vnode, it will last as long as the
vnode does.
When a vnode object reference count is incremented, then the underlying
vnode reference count is incremented also. The two "objects" are now
more intimately related, and so the interactions are now much less
complex.
When vnodes are now normally placed onto the free queue with an object still
attached. The rundown of the object happens at vnode rundown time, and
happens with exactly the same filesystem semantics of the original VFS
code. There is absolutely no need for vnode_pager_uncache and other
travesties like that anymore.
A side-effect of these changes is that SMP locking should be much simpler,
the I/O copyin/copyout optimizations work, NFS should be more ponderable,
and further work on layered filesystems should be less frustrating, because
of the totally coherent management of the vnode objects and vnodes.
Please be careful with your system while running this code, but I would
greatly appreciate feedback as soon a reasonably possible.
1998-01-06 05:26:17 +00:00
|
|
|
if (bp->b_vp == vp) {
|
|
|
|
if (bp->b_flags & B_CLUSTEROK) {
|
1999-06-26 02:47:16 +00:00
|
|
|
BUF_UNLOCK(bp);
|
Make our v_usecount vnode reference count work identically to the
original BSD code. The association between the vnode and the vm_object
no longer includes reference counts. The major difference is that
vm_object's are no longer freed gratuitiously from the vnode, and so
once an object is created for the vnode, it will last as long as the
vnode does.
When a vnode object reference count is incremented, then the underlying
vnode reference count is incremented also. The two "objects" are now
more intimately related, and so the interactions are now much less
complex.
When vnodes are now normally placed onto the free queue with an object still
attached. The rundown of the object happens at vnode rundown time, and
happens with exactly the same filesystem semantics of the original VFS
code. There is absolutely no need for vnode_pager_uncache and other
travesties like that anymore.
A side-effect of these changes is that SMP locking should be much simpler,
the I/O copyin/copyout optimizations work, NFS should be more ponderable,
and further work on layered filesystems should be less frustrating, because
of the totally coherent management of the vnode objects and vnodes.
Please be careful with your system while running this code, but I would
greatly appreciate feedback as soon a reasonably possible.
1998-01-06 05:26:17 +00:00
|
|
|
vfs_bio_awrite(bp);
|
|
|
|
} else {
|
In kern_physio.c fix tsleep priority messup.
In vfs_bio.c, remove b_generation count usage,
remove redundant reassignbuf,
remove redundant spl(s),
manage page PG_ZERO flags more correctly,
utilize in invalid value for b_offset until it
is properly initialized. Add asserts
for #ifdef DIAGNOSTIC, when b_offset is
improperly used.
when a process is not performing I/O, and just waiting
on a buffer generally, make the sleep priority
low.
only check page validity in getblk for B_VMIO buffers.
In vfs_cluster, add b_offset asserts, correct pointer calculation
for clustered reads. Improve readability of certain parts of
the code. Remove redundant spl(s).
In vfs_subr, correct usage of vfs_bio_awrite (From Andrew Gallatin
<gallatin@cs.duke.edu>). More vtruncbuf problems fixed.
1998-03-19 22:48:16 +00:00
|
|
|
bremfree(bp);
|
1999-06-26 02:47:16 +00:00
|
|
|
bp->b_flags |= B_ASYNC;
|
2000-03-20 11:29:10 +00:00
|
|
|
BUF_WRITE(bp);
|
Make our v_usecount vnode reference count work identically to the
original BSD code. The association between the vnode and the vm_object
no longer includes reference counts. The major difference is that
vm_object's are no longer freed gratuitiously from the vnode, and so
once an object is created for the vnode, it will last as long as the
vnode does.
When a vnode object reference count is incremented, then the underlying
vnode reference count is incremented also. The two "objects" are now
more intimately related, and so the interactions are now much less
complex.
When vnodes are now normally placed onto the free queue with an object still
attached. The rundown of the object happens at vnode rundown time, and
happens with exactly the same filesystem semantics of the original VFS
code. There is absolutely no need for vnode_pager_uncache and other
travesties like that anymore.
A side-effect of these changes is that SMP locking should be much simpler,
the I/O copyin/copyout optimizations work, NFS should be more ponderable,
and further work on layered filesystems should be less frustrating, because
of the totally coherent management of the vnode objects and vnodes.
Please be careful with your system while running this code, but I would
greatly appreciate feedback as soon a reasonably possible.
1998-01-06 05:26:17 +00:00
|
|
|
}
|
|
|
|
} else {
|
In kern_physio.c fix tsleep priority messup.
In vfs_bio.c, remove b_generation count usage,
remove redundant reassignbuf,
remove redundant spl(s),
manage page PG_ZERO flags more correctly,
utilize in invalid value for b_offset until it
is properly initialized. Add asserts
for #ifdef DIAGNOSTIC, when b_offset is
improperly used.
when a process is not performing I/O, and just waiting
on a buffer generally, make the sleep priority
low.
only check page validity in getblk for B_VMIO buffers.
In vfs_cluster, add b_offset asserts, correct pointer calculation
for clustered reads. Improve readability of certain parts of
the code. Remove redundant spl(s).
In vfs_subr, correct usage of vfs_bio_awrite (From Andrew Gallatin
<gallatin@cs.duke.edu>). More vtruncbuf problems fixed.
1998-03-19 22:48:16 +00:00
|
|
|
bremfree(bp);
|
2000-03-20 11:29:10 +00:00
|
|
|
(void) BUF_WRITE(bp);
|
Make our v_usecount vnode reference count work identically to the
original BSD code. The association between the vnode and the vm_object
no longer includes reference counts. The major difference is that
vm_object's are no longer freed gratuitiously from the vnode, and so
once an object is created for the vnode, it will last as long as the
vnode does.
When a vnode object reference count is incremented, then the underlying
vnode reference count is incremented also. The two "objects" are now
more intimately related, and so the interactions are now much less
complex.
When vnodes are now normally placed onto the free queue with an object still
attached. The rundown of the object happens at vnode rundown time, and
happens with exactly the same filesystem semantics of the original VFS
code. There is absolutely no need for vnode_pager_uncache and other
travesties like that anymore.
A side-effect of these changes is that SMP locking should be much simpler,
the I/O copyin/copyout optimizations work, NFS should be more ponderable,
and further work on layered filesystems should be less frustrating, because
of the totally coherent management of the vnode objects and vnodes.
Please be careful with your system while running this code, but I would
greatly appreciate feedback as soon a reasonably possible.
1998-01-06 05:26:17 +00:00
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
break;
|
|
|
|
}
|
In kern_physio.c fix tsleep priority messup.
In vfs_bio.c, remove b_generation count usage,
remove redundant reassignbuf,
remove redundant spl(s),
manage page PG_ZERO flags more correctly,
utilize in invalid value for b_offset until it
is properly initialized. Add asserts
for #ifdef DIAGNOSTIC, when b_offset is
improperly used.
when a process is not performing I/O, and just waiting
on a buffer generally, make the sleep priority
low.
only check page validity in getblk for B_VMIO buffers.
In vfs_cluster, add b_offset asserts, correct pointer calculation
for clustered reads. Improve readability of certain parts of
the code. Remove redundant spl(s).
In vfs_subr, correct usage of vfs_bio_awrite (From Andrew Gallatin
<gallatin@cs.duke.edu>). More vtruncbuf problems fixed.
1998-03-19 22:48:16 +00:00
|
|
|
bremfree(bp);
|
1999-06-26 02:47:16 +00:00
|
|
|
bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
|
Some VM improvements, including elimination of alot of Sig-11
problems. Tor Egge and others have helped with various VM bugs
lately, but don't blame him -- blame me!!!
pmap.c:
1) Create an object for kernel page table allocations. This
fixes a bogus allocation method previously used for such, by
grabbing pages from the kernel object, using bogus pindexes.
(This was a code cleanup, and perhaps a minor system stability
issue.)
pmap.c:
2) Pre-set the modify and accessed bits when prudent. This will
decrease bus traffic under certain circumstances.
vfs_bio.c, vfs_cluster.c:
3) Rather than calculating the beginning virtual byte offset
multiple times, stick the offset into the buffer header, so
that the calculated offset can be reused. (Long long multiplies
are often expensive, and this is a probably unmeasurable performance
improvement, and code cleanup.)
vfs_bio.c:
4) Handle write recursion more intelligently (but not perfectly) so
that it is less likely to cause a system panic, and is also
much more robust.
vfs_bio.c:
5) getblk incorrectly wrote out blocks that are incorrectly sized.
The problem is fixed, and writes blocks out ONLY when B_DELWRI
is true.
vfs_bio.c:
6) Check that already constituted buffers have fully valid pages. If
not, then make sure that the B_CACHE bit is not set. (This was
a major source of Sig-11 type problems.)
vfs_bio.c:
7) Fix a potential system deadlock due to an incorrectly specified
sleep priority while waiting for a buffer write operation. The
change that I made opens the system up to serious problems, and
we need to examine the issue of process sleep priorities.
vfs_cluster.c, vfs_bio.c:
8) Make clustered reads work more correctly (and more completely)
when buffers are already constituted, but not fully valid.
(This was another system reliability issue.)
vfs_subr.c, ffs_inode.c:
9) Create a vtruncbuf function, which is used by filesystems that
can truncate files. The vinvalbuf forced a file sync type operation,
while vtruncbuf only invalidates the buffers past the new end of file,
and also invalidates the appropriate pages. (This was a system reliabiliy
and performance issue.)
10) Modify FFS to use vtruncbuf.
vm_object.c:
11) Make the object rundown mechanism for OBJT_VNODE type objects work
more correctly. Included in that fix, create pager entries for
the OBJT_DEAD pager type, so that paging requests that might slip
in during race conditions are properly handled. (This was a system
reliability issue.)
vm_page.c:
12) Make some of the page validation routines be a little less picky
about arguments passed to them. Also, support page invalidation
change the object generation count so that we handle generation
counts a little more robustly.
vm_pageout.c:
13) Further reduce pageout daemon activity when the system doesn't
need help from it. There should be no additional performance
decrease even when the pageout daemon is running. (This was
a significant performance issue.)
vnode_pager.c:
14) Teach the vnode pager to handle race conditions during vnode
deallocations.
1998-03-16 01:56:03 +00:00
|
|
|
bp->b_flags &= ~B_ASYNC;
|
1994-05-24 10:09:53 +00:00
|
|
|
brelse(bp);
|
|
|
|
}
|
|
|
|
}
|
1994-08-29 06:09:15 +00:00
|
|
|
|
These changes embody the support of the fully coherent merged VM buffer cache,
much higher filesystem I/O performance, and much better paging performance. It
represents the culmination of over 6 months of R&D.
The majority of the merged VM/cache work is by John Dyson.
The following highlights the most significant changes. Additionally, there are
(mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to
support the new VM/buffer scheme.
vfs_bio.c:
Significant rewrite of most of vfs_bio to support the merged VM buffer cache
scheme. The scheme is almost fully compatible with the old filesystem
interface. Significant improvement in the number of opportunities for write
clustering.
vfs_cluster.c, vfs_subr.c
Upgrade and performance enhancements in vfs layer code to support merged
VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff.
vm_object.c:
Yet more improvements in the collapse code. Elimination of some windows that
can cause list corruption.
vm_pageout.c:
Fixed it, it really works better now. Somehow in 2.0, some "enhancements"
broke the code. This code has been reworked from the ground-up.
vm_fault.c, vm_page.c, pmap.c, vm_object.c
Support for small-block filesystems with merged VM/buffer cache scheme.
pmap.c vm_map.c
Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of
kernel PTs.
vm_glue.c
Much simpler and more effective swapping code. No more gratuitous swapping.
proc.h
Fixed the problem that the p_lock flag was not being cleared on a fork.
swap_pager.c, vnode_pager.c
Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the
code doesn't need it anymore.
machdep.c
Changes to better support the parameter values for the merged VM/buffer cache
scheme.
machdep.c, kern_exec.c, vm_glue.c
Implemented a seperate submap for temporary exec string space and another one
to contain process upages. This eliminates all map fragmentation problems
that previously existed.
ffs_inode.c, ufs_inode.c, ufs_readwrite.c
Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on
busy buffers.
Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
|
|
|
while (vp->v_numoutput > 0) {
|
|
|
|
vp->v_flag |= VBWAIT;
|
|
|
|
tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0);
|
|
|
|
}
|
1997-03-05 04:54:54 +00:00
|
|
|
|
These changes embody the support of the fully coherent merged VM buffer cache,
much higher filesystem I/O performance, and much better paging performance. It
represents the culmination of over 6 months of R&D.
The majority of the merged VM/cache work is by John Dyson.
The following highlights the most significant changes. Additionally, there are
(mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to
support the new VM/buffer scheme.
vfs_bio.c:
Significant rewrite of most of vfs_bio to support the merged VM buffer cache
scheme. The scheme is almost fully compatible with the old filesystem
interface. Significant improvement in the number of opportunities for write
clustering.
vfs_cluster.c, vfs_subr.c
Upgrade and performance enhancements in vfs layer code to support merged
VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff.
vm_object.c:
Yet more improvements in the collapse code. Elimination of some windows that
can cause list corruption.
vm_pageout.c:
Fixed it, it really works better now. Somehow in 2.0, some "enhancements"
broke the code. This code has been reworked from the ground-up.
vm_fault.c, vm_page.c, pmap.c, vm_object.c
Support for small-block filesystems with merged VM/buffer cache scheme.
pmap.c vm_map.c
Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of
kernel PTs.
vm_glue.c
Much simpler and more effective swapping code. No more gratuitous swapping.
proc.h
Fixed the problem that the p_lock flag was not being cleared on a fork.
swap_pager.c, vnode_pager.c
Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the
code doesn't need it anymore.
machdep.c
Changes to better support the parameter values for the merged VM/buffer cache
scheme.
machdep.c, kern_exec.c, vm_glue.c
Implemented a seperate submap for temporary exec string space and another one
to contain process upages. This eliminates all map fragmentation problems
that previously existed.
ffs_inode.c, ufs_inode.c, ufs_readwrite.c
Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on
busy buffers.
Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
|
|
|
splx(s);
|
|
|
|
|
1995-03-20 02:08:24 +00:00
|
|
|
/*
|
|
|
|
* Destroy the copy in the VM cache, too.
|
|
|
|
*/
|
2000-10-04 01:29:17 +00:00
|
|
|
mtx_enter(&vp->v_interlock, MTX_DEF);
|
2000-09-12 09:49:08 +00:00
|
|
|
if (VOP_GETVOBJECT(vp, &object) == 0) {
|
1998-10-29 09:51:28 +00:00
|
|
|
vm_object_page_remove(object, 0, 0,
|
|
|
|
(flags & V_SAVE) ? TRUE : FALSE);
|
1994-08-29 06:09:15 +00:00
|
|
|
}
|
2000-10-04 01:29:17 +00:00
|
|
|
mtx_exit(&vp->v_interlock, MTX_DEF);
|
Make our v_usecount vnode reference count work identically to the
original BSD code. The association between the vnode and the vm_object
no longer includes reference counts. The major difference is that
vm_object's are no longer freed gratuitiously from the vnode, and so
once an object is created for the vnode, it will last as long as the
vnode does.
When a vnode object reference count is incremented, then the underlying
vnode reference count is incremented also. The two "objects" are now
more intimately related, and so the interactions are now much less
complex.
When vnodes are now normally placed onto the free queue with an object still
attached. The rundown of the object happens at vnode rundown time, and
happens with exactly the same filesystem semantics of the original VFS
code. There is absolutely no need for vnode_pager_uncache and other
travesties like that anymore.
A side-effect of these changes is that SMP locking should be much simpler,
the I/O copyin/copyout optimizations work, NFS should be more ponderable,
and further work on layered filesystems should be less frustrating, because
of the totally coherent management of the vnode objects and vnodes.
Please be careful with your system while running this code, but I would
greatly appreciate feedback as soon a reasonably possible.
1998-01-06 05:26:17 +00:00
|
|
|
|
1998-10-31 14:20:39 +00:00
|
|
|
if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd))
|
1994-05-24 10:09:53 +00:00
|
|
|
panic("vinvalbuf: flush failed");
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
Some VM improvements, including elimination of a lot of Sig-11
problems. Tor Egge and others have helped with various VM bugs
lately, but don't blame him -- blame me!!!
pmap.c:
1) Create an object for kernel page table allocations. This
fixes a bogus allocation method previously used for such, by
grabbing pages from the kernel object, using bogus pindexes.
(This was a code cleanup, and perhaps a minor system stability
issue.)
pmap.c:
2) Pre-set the modify and accessed bits when prudent. This will
decrease bus traffic under certain circumstances.
vfs_bio.c, vfs_cluster.c:
3) Rather than calculating the beginning virtual byte offset
multiple times, stick the offset into the buffer header, so
that the calculated offset can be reused. (Long long multiplies
are often expensive, and this is a probably unmeasurable performance
improvement, and code cleanup.)
vfs_bio.c:
4) Handle write recursion more intelligently (but not perfectly) so
that it is less likely to cause a system panic, and is also
much more robust.
vfs_bio.c:
5) getblk incorrectly wrote out blocks that are incorrectly sized.
The problem is fixed, and writes blocks out ONLY when B_DELWRI
is true.
vfs_bio.c:
6) Check that already constituted buffers have fully valid pages. If
not, then make sure that the B_CACHE bit is not set. (This was
a major source of Sig-11 type problems.)
vfs_bio.c:
7) Fix a potential system deadlock due to an incorrectly specified
sleep priority while waiting for a buffer write operation. The
change that I made opens the system up to serious problems, and
we need to examine the issue of process sleep priorities.
vfs_cluster.c, vfs_bio.c:
8) Make clustered reads work more correctly (and more completely)
when buffers are already constituted, but not fully valid.
(This was another system reliability issue.)
vfs_subr.c, ffs_inode.c:
9) Create a vtruncbuf function, which is used by filesystems that
can truncate files. The vinvalbuf forced a file sync type operation,
while vtruncbuf only invalidates the buffers past the new end of file,
and also invalidates the appropriate pages. (This was a system reliability
and performance issue.)
10) Modify FFS to use vtruncbuf.
vm_object.c:
11) Make the object rundown mechanism for OBJT_VNODE type objects work
more correctly. Included in that fix, create pager entries for
the OBJT_DEAD pager type, so that paging requests that might slip
in during race conditions are properly handled. (This was a system
reliability issue.)
vm_page.c:
12) Make some of the page validation routines be a little less picky
about arguments passed to them. Also, support page invalidation
change the object generation count so that we handle generation
counts a little more robustly.
vm_pageout.c:
13) Further reduce pageout daemon activity when the system doesn't
need help from it. There should be no additional performance
decrease even when the pageout daemon is running. (This was
a significant performance issue.)
vnode_pager.c:
14) Teach the vnode pager to handle race conditions during vnode
deallocations.
1998-03-16 01:56:03 +00:00
|
|
|
/*
|
|
|
|
* Truncate a file's buffer and pages to a specified length. This
|
|
|
|
* is in lieu of the old vinvalbuf mechanism, which performed unneeded
|
|
|
|
* sync activity.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
vtruncbuf(vp, cred, p, length, blksize)
|
|
|
|
register struct vnode *vp;
|
|
|
|
struct ucred *cred;
|
|
|
|
struct proc *p;
|
|
|
|
off_t length;
|
|
|
|
int blksize;
|
|
|
|
{
|
|
|
|
register struct buf *bp;
|
1998-10-25 17:44:59 +00:00
|
|
|
struct buf *nbp;
|
|
|
|
int s, anyfreed;
|
Some VM improvements, including elimination of alot of Sig-11
problems. Tor Egge and others have helped with various VM bugs
lately, but don't blame him -- blame me!!!
pmap.c:
1) Create an object for kernel page table allocations. This
fixes a bogus allocation method previously used for such, by
grabbing pages from the kernel object, using bogus pindexes.
(This was a code cleanup, and perhaps a minor system stability
issue.)
pmap.c:
2) Pre-set the modify and accessed bits when prudent. This will
decrease bus traffic under certain circumstances.
vfs_bio.c, vfs_cluster.c:
3) Rather than calculating the beginning virtual byte offset
multiple times, stick the offset into the buffer header, so
that the calculated offset can be reused. (Long long multiplies
are often expensive, and this is a probably unmeasurable performance
improvement, and code cleanup.)
vfs_bio.c:
4) Handle write recursion more intelligently (but not perfectly) so
that it is less likely to cause a system panic, and is also
much more robust.
vfs_bio.c:
5) getblk incorrectly wrote out blocks that are incorrectly sized.
The problem is fixed, and writes blocks out ONLY when B_DELWRI
is true.
vfs_bio.c:
6) Check that already constituted buffers have fully valid pages. If
not, then make sure that the B_CACHE bit is not set. (This was
a major source of Sig-11 type problems.)
vfs_bio.c:
7) Fix a potential system deadlock due to an incorrectly specified
sleep priority while waiting for a buffer write operation. The
change that I made opens the system up to serious problems, and
we need to examine the issue of process sleep priorities.
vfs_cluster.c, vfs_bio.c:
8) Make clustered reads work more correctly (and more completely)
when buffers are already constituted, but not fully valid.
(This was another system reliability issue.)
vfs_subr.c, ffs_inode.c:
9) Create a vtruncbuf function, which is used by filesystems that
can truncate files. The vinvalbuf forced a file sync type operation,
while vtruncbuf only invalidates the buffers past the new end of file,
and also invalidates the appropriate pages. (This was a system reliabiliy
and performance issue.)
10) Modify FFS to use vtruncbuf.
vm_object.c:
11) Make the object rundown mechanism for OBJT_VNODE type objects work
more correctly. Included in that fix, create pager entries for
the OBJT_DEAD pager type, so that paging requests that might slip
in during race conditions are properly handled. (This was a system
reliability issue.)
vm_page.c:
12) Make some of the page validation routines be a little less picky
about arguments passed to them. Also, support page invalidation
change the object generation count so that we handle generation
counts a little more robustly.
vm_pageout.c:
13) Further reduce pageout daemon activity when the system doesn't
need help from it. There should be no additional performance
decrease even when the pageout daemon is running. (This was
a significant performance issue.)
vnode_pager.c:
14) Teach the vnode pager to handle race conditions during vnode
deallocations.
1998-03-16 01:56:03 +00:00
|
|
|
int trunclbn;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Round up to the *next* lbn.
|
|
|
|
*/
|
1998-03-19 18:46:58 +00:00
|
|
|
trunclbn = (length + blksize - 1) / blksize;
|
Some VM improvements, including elimination of a lot of Sig-11
problems. Tor Egge and others have helped with various VM bugs
lately, but don't blame him -- blame me!!!
pmap.c:
1) Create an object for kernel page table allocations. This
fixes a bogus allocation method previously used for such, by
grabbing pages from the kernel object, using bogus pindexes.
(This was a code cleanup, and perhaps a minor system stability
issue.)
pmap.c:
2) Pre-set the modify and accessed bits when prudent. This will
decrease bus traffic under certain circumstances.
vfs_bio.c, vfs_cluster.c:
3) Rather than calculating the beginning virtual byte offset
multiple times, stick the offset into the buffer header, so
that the calculated offset can be reused. (Long long multiplies
are often expensive, and this is a probably unmeasurable performance
improvement, and code cleanup.)
vfs_bio.c:
4) Handle write recursion more intelligently (but not perfectly) so
that it is less likely to cause a system panic, and is also
much more robust.
vfs_bio.c:
5) getblk incorrectly wrote out blocks that are incorrectly sized.
The problem is fixed, and writes blocks out ONLY when B_DELWRI
is true.
vfs_bio.c:
6) Check that already constituted buffers have fully valid pages. If
not, then make sure that the B_CACHE bit is not set. (This was
a major source of Sig-11 type problems.)
vfs_bio.c:
7) Fix a potential system deadlock due to an incorrectly specified
sleep priority while waiting for a buffer write operation. The
change that I made opens the system up to serious problems, and
we need to examine the issue of process sleep priorities.
vfs_cluster.c, vfs_bio.c:
8) Make clustered reads work more correctly (and more completely)
when buffers are already constituted, but not fully valid.
(This was another system reliability issue.)
vfs_subr.c, ffs_inode.c:
9) Create a vtruncbuf function, which is used by filesystems that
can truncate files. The vinvalbuf forced a file sync type operation,
while vtruncbuf only invalidates the buffers past the new end of file,
and also invalidates the appropriate pages. (This was a system reliability
and performance issue.)
10) Modify FFS to use vtruncbuf.
vm_object.c:
11) Make the object rundown mechanism for OBJT_VNODE type objects work
more correctly. Included in that fix, create pager entries for
the OBJT_DEAD pager type, so that paging requests that might slip
in during race conditions are properly handled. (This was a system
reliability issue.)
vm_page.c:
12) Make some of the page validation routines be a little less picky
about arguments passed to them. Also, support page invalidation
change the object generation count so that we handle generation
counts a little more robustly.
vm_pageout.c:
13) Further reduce pageout daemon activity when the system doesn't
need help from it. There should be no additional performance
decrease even when the pageout daemon is running. (This was
a significant performance issue.)
vnode_pager.c:
14) Teach the vnode pager to handle race conditions during vnode
deallocations.
1998-03-16 01:56:03 +00:00
|
|
|
|
|
|
|
s = splbio();
|
|
|
|
restart:
|
|
|
|
anyfreed = 1;
|
|
|
|
for (;anyfreed;) {
|
|
|
|
anyfreed = 0;
|
1998-10-31 14:20:39 +00:00
|
|
|
for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
|
|
|
|
nbp = TAILQ_NEXT(bp, b_vnbufs);
|
Some VM improvements, including elimination of a lot of Sig-11
problems. Tor Egge and others have helped with various VM bugs
lately, but don't blame him -- blame me!!!
pmap.c:
1) Create an object for kernel page table allocations. This
fixes a bogus allocation method previously used for such, by
grabbing pages from the kernel object, using bogus pindexes.
(This was a code cleanup, and perhaps a minor system stability
issue.)
pmap.c:
2) Pre-set the modify and accessed bits when prudent. This will
decrease bus traffic under certain circumstances.
vfs_bio.c, vfs_cluster.c:
3) Rather than calculating the beginning virtual byte offset
multiple times, stick the offset into the buffer header, so
that the calculated offset can be reused. (Long long multiplies
are often expensive, and this is a probably unmeasurable performance
improvement, and code cleanup.)
vfs_bio.c:
4) Handle write recursion more intelligently (but not perfectly) so
that it is less likely to cause a system panic, and is also
much more robust.
vfs_bio.c:
5) getblk incorrectly wrote out blocks that are incorrectly sized.
The problem is fixed, and writes blocks out ONLY when B_DELWRI
is true.
vfs_bio.c:
6) Check that already constituted buffers have fully valid pages. If
not, then make sure that the B_CACHE bit is not set. (This was
a major source of Sig-11 type problems.)
vfs_bio.c:
7) Fix a potential system deadlock due to an incorrectly specified
sleep priority while waiting for a buffer write operation. The
change that I made opens the system up to serious problems, and
we need to examine the issue of process sleep priorities.
vfs_cluster.c, vfs_bio.c:
8) Make clustered reads work more correctly (and more completely)
when buffers are already constituted, but not fully valid.
(This was another system reliability issue.)
vfs_subr.c, ffs_inode.c:
9) Create a vtruncbuf function, which is used by filesystems that
can truncate files. The vinvalbuf forced a file sync type operation,
while vtruncbuf only invalidates the buffers past the new end of file,
and also invalidates the appropriate pages. (This was a system reliability
and performance issue.)
10) Modify FFS to use vtruncbuf.
vm_object.c:
11) Make the object rundown mechanism for OBJT_VNODE type objects work
more correctly. Included in that fix, create pager entries for
the OBJT_DEAD pager type, so that paging requests that might slip
in during race conditions are properly handled. (This was a system
reliability issue.)
vm_page.c:
12) Make some of the page validation routines be a little less picky
about arguments passed to them. Also, support page invalidation
change the object generation count so that we handle generation
counts a little more robustly.
vm_pageout.c:
13) Further reduce pageout daemon activity when the system doesn't
need help from it. There should be no additional performance
decrease even when the pageout daemon is running. (This was
a significant performance issue.)
vnode_pager.c:
14) Teach the vnode pager to handle race conditions during vnode
deallocations.
1998-03-16 01:56:03 +00:00
|
|
|
if (bp->b_lblkno >= trunclbn) {
|
1999-06-26 02:47:16 +00:00
|
|
|
if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
|
|
|
|
BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
|
In kern_physio.c fix tsleep priority messup.
In vfs_bio.c, remove b_generation count usage,
remove redundant reassignbuf,
remove redundant spl(s),
manage page PG_ZERO flags more correctly,
utilize an invalid value for b_offset until it
is properly initialized. Add asserts
for #ifdef DIAGNOSTIC, when b_offset is
improperly used.
when a process is not performing I/O, and just waiting
on a buffer generally, make the sleep priority
low.
only check page validity in getblk for B_VMIO buffers.
In vfs_cluster, add b_offset asserts, correct pointer calculation
for clustered reads. Improve readability of certain parts of
the code. Remove redundant spl(s).
In vfs_subr, correct usage of vfs_bio_awrite (From Andrew Gallatin
<gallatin@cs.duke.edu>). More vtruncbuf problems fixed.
1998-03-19 22:48:16 +00:00
|
|
|
goto restart;
|
Some VM improvements, including elimination of a lot of Sig-11
problems. Tor Egge and others have helped with various VM bugs
lately, but don't blame him -- blame me!!!
pmap.c:
1) Create an object for kernel page table allocations. This
fixes a bogus allocation method previously used for such, by
grabbing pages from the kernel object, using bogus pindexes.
(This was a code cleanup, and perhaps a minor system stability
issue.)
pmap.c:
2) Pre-set the modify and accessed bits when prudent. This will
decrease bus traffic under certain circumstances.
vfs_bio.c, vfs_cluster.c:
3) Rather than calculating the beginning virtual byte offset
multiple times, stick the offset into the buffer header, so
that the calculated offset can be reused. (Long long multiplies
are often expensive, and this is a probably unmeasurable performance
improvement, and code cleanup.)
vfs_bio.c:
4) Handle write recursion more intelligently (but not perfectly) so
that it is less likely to cause a system panic, and is also
much more robust.
vfs_bio.c:
5) getblk incorrectly wrote out blocks that are incorrectly sized.
The problem is fixed, and writes blocks out ONLY when B_DELWRI
is true.
vfs_bio.c:
6) Check that already constituted buffers have fully valid pages. If
not, then make sure that the B_CACHE bit is not set. (This was
a major source of Sig-11 type problems.)
vfs_bio.c:
7) Fix a potential system deadlock due to an incorrectly specified
sleep priority while waiting for a buffer write operation. The
change that I made opens the system up to serious problems, and
we need to examine the issue of process sleep priorities.
vfs_cluster.c, vfs_bio.c:
8) Make clustered reads work more correctly (and more completely)
when buffers are already constituted, but not fully valid.
(This was another system reliability issue.)
vfs_subr.c, ffs_inode.c:
9) Create a vtruncbuf function, which is used by filesystems that
can truncate files. The vinvalbuf forced a file sync type operation,
while vtruncbuf only invalidates the buffers past the new end of file,
and also invalidates the appropriate pages. (This was a system reliability
and performance issue.)
10) Modify FFS to use vtruncbuf.
vm_object.c:
11) Make the object rundown mechanism for OBJT_VNODE type objects work
more correctly. Included in that fix, create pager entries for
the OBJT_DEAD pager type, so that paging requests that might slip
in during race conditions are properly handled. (This was a system
reliability issue.)
vm_page.c:
12) Make some of the page validation routines be a little less picky
about arguments passed to them. Also, support page invalidation
change the object generation count so that we handle generation
counts a little more robustly.
vm_pageout.c:
13) Further reduce pageout daemon activity when the system doesn't
need help from it. There should be no additional performance
decrease even when the pageout daemon is running. (This was
a significant performance issue.)
vnode_pager.c:
14) Teach the vnode pager to handle race conditions during vnode
deallocations.
1998-03-16 01:56:03 +00:00
|
|
|
} else {
|
|
|
|
bremfree(bp);
|
1999-06-26 02:47:16 +00:00
|
|
|
bp->b_flags |= (B_INVAL | B_RELBUF);
|
Some VM improvements, including elimination of a lot of Sig-11
problems. Tor Egge and others have helped with various VM bugs
lately, but don't blame him -- blame me!!!
pmap.c:
1) Create an object for kernel page table allocations. This
fixes a bogus allocation method previously used for such, by
grabbing pages from the kernel object, using bogus pindexes.
(This was a code cleanup, and perhaps a minor system stability
issue.)
pmap.c:
2) Pre-set the modify and accessed bits when prudent. This will
decrease bus traffic under certain circumstances.
vfs_bio.c, vfs_cluster.c:
3) Rather than calculating the beginning virtual byte offset
multiple times, stick the offset into the buffer header, so
that the calculated offset can be reused. (Long long multiplies
are often expensive, and this is a probably unmeasurable performance
improvement, and code cleanup.)
vfs_bio.c:
4) Handle write recursion more intelligently (but not perfectly) so
that it is less likely to cause a system panic, and is also
much more robust.
vfs_bio.c:
5) getblk incorrectly wrote out blocks that are incorrectly sized.
The problem is fixed, and writes blocks out ONLY when B_DELWRI
is true.
vfs_bio.c:
6) Check that already constituted buffers have fully valid pages. If
not, then make sure that the B_CACHE bit is not set. (This was
a major source of Sig-11 type problems.)
vfs_bio.c:
7) Fix a potential system deadlock due to an incorrectly specified
sleep priority while waiting for a buffer write operation. The
change that I made opens the system up to serious problems, and
we need to examine the issue of process sleep priorities.
vfs_cluster.c, vfs_bio.c:
8) Make clustered reads work more correctly (and more completely)
when buffers are already constituted, but not fully valid.
(This was another system reliability issue.)
vfs_subr.c, ffs_inode.c:
9) Create a vtruncbuf function, which is used by filesystems that
can truncate files. The vinvalbuf forced a file sync type operation,
while vtruncbuf only invalidates the buffers past the new end of file,
and also invalidates the appropriate pages. (This was a system reliability
and performance issue.)
10) Modify FFS to use vtruncbuf.
vm_object.c:
11) Make the object rundown mechanism for OBJT_VNODE type objects work
more correctly. Included in that fix, create pager entries for
the OBJT_DEAD pager type, so that paging requests that might slip
in during race conditions are properly handled. (This was a system
reliability issue.)
vm_page.c:
12) Make some of the page validation routines be a little less picky
about arguments passed to them. Also, support page invalidation
change the object generation count so that we handle generation
counts a little more robustly.
vm_pageout.c:
13) Further reduce pageout daemon activity when the system doesn't
need help from it. There should be no additional performance
decrease even when the pageout daemon is running. (This was
a significant performance issue.)
vnode_pager.c:
14) Teach the vnode pager to handle race conditions during vnode
deallocations.
1998-03-16 01:56:03 +00:00
|
|
|
bp->b_flags &= ~B_ASYNC;
|
|
|
|
brelse(bp);
|
|
|
|
anyfreed = 1;
|
|
|
|
}
|
1999-12-22 03:11:04 +00:00
|
|
|
if (nbp &&
|
|
|
|
(((nbp->b_xflags & BX_VNCLEAN) == 0) ||
|
|
|
|
(nbp->b_vp != vp) ||
|
|
|
|
(nbp->b_flags & B_DELWRI))) {
|
Some VM improvements, including elimination of a lot of Sig-11
problems. Tor Egge and others have helped with various VM bugs
lately, but don't blame him -- blame me!!!
pmap.c:
1) Create an object for kernel page table allocations. This
fixes a bogus allocation method previously used for such, by
grabbing pages from the kernel object, using bogus pindexes.
(This was a code cleanup, and perhaps a minor system stability
issue.)
pmap.c:
2) Pre-set the modify and accessed bits when prudent. This will
decrease bus traffic under certain circumstances.
vfs_bio.c, vfs_cluster.c:
3) Rather than calculating the beginning virtual byte offset
multiple times, stick the offset into the buffer header, so
that the calculated offset can be reused. (Long long multiplies
are often expensive, and this is a probably unmeasurable performance
improvement, and code cleanup.)
vfs_bio.c:
4) Handle write recursion more intelligently (but not perfectly) so
that it is less likely to cause a system panic, and is also
much more robust.
vfs_bio.c:
5) getblk incorrectly wrote out blocks that are incorrectly sized.
The problem is fixed, and writes blocks out ONLY when B_DELWRI
is true.
vfs_bio.c:
6) Check that already constituted buffers have fully valid pages. If
not, then make sure that the B_CACHE bit is not set. (This was
a major source of Sig-11 type problems.)
vfs_bio.c:
7) Fix a potential system deadlock due to an incorrectly specified
sleep priority while waiting for a buffer write operation. The
change that I made opens the system up to serious problems, and
we need to examine the issue of process sleep priorities.
vfs_cluster.c, vfs_bio.c:
8) Make clustered reads work more correctly (and more completely)
when buffers are already constituted, but not fully valid.
(This was another system reliability issue.)
vfs_subr.c, ffs_inode.c:
9) Create a vtruncbuf function, which is used by filesystems that
can truncate files. The vinvalbuf forced a file sync type operation,
while vtruncbuf only invalidates the buffers past the new end of file,
and also invalidates the appropriate pages. (This was a system reliability
and performance issue.)
10) Modify FFS to use vtruncbuf.
vm_object.c:
11) Make the object rundown mechanism for OBJT_VNODE type objects work
more correctly. Included in that fix, create pager entries for
the OBJT_DEAD pager type, so that paging requests that might slip
in during race conditions are properly handled. (This was a system
reliability issue.)
vm_page.c:
12) Make some of the page validation routines be a little less picky
about arguments passed to them. Also, support page invalidation
change the object generation count so that we handle generation
counts a little more robustly.
vm_pageout.c:
13) Further reduce pageout daemon activity when the system doesn't
need help from it. There should be no additional performance
decrease even when the pageout daemon is running. (This was
a significant performance issue.)
vnode_pager.c:
14) Teach the vnode pager to handle race conditions during vnode
deallocations.
1998-03-16 01:56:03 +00:00
|
|
|
goto restart;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
1998-10-31 14:20:39 +00:00
|
|
|
for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
|
|
|
|
nbp = TAILQ_NEXT(bp, b_vnbufs);
|
Some VM improvements, including elimination of a lot of Sig-11
problems. Tor Egge and others have helped with various VM bugs
lately, but don't blame him -- blame me!!!
pmap.c:
1) Create an object for kernel page table allocations. This
fixes a bogus allocation method previously used for such, by
grabbing pages from the kernel object, using bogus pindexes.
(This was a code cleanup, and perhaps a minor system stability
issue.)
pmap.c:
2) Pre-set the modify and accessed bits when prudent. This will
decrease bus traffic under certain circumstances.
vfs_bio.c, vfs_cluster.c:
3) Rather than calculating the beginning virtual byte offset
multiple times, stick the offset into the buffer header, so
that the calculated offset can be reused. (Long long multiplies
are often expensive, and this is a probably unmeasurable performance
improvement, and code cleanup.)
vfs_bio.c:
4) Handle write recursion more intelligently (but not perfectly) so
that it is less likely to cause a system panic, and is also
much more robust.
vfs_bio.c:
5) getblk incorrectly wrote out blocks that are incorrectly sized.
The problem is fixed, and writes blocks out ONLY when B_DELWRI
is true.
vfs_bio.c:
6) Check that already constituted buffers have fully valid pages. If
not, then make sure that the B_CACHE bit is not set. (This was
a major source of Sig-11 type problems.)
vfs_bio.c:
7) Fix a potential system deadlock due to an incorrectly specified
sleep priority while waiting for a buffer write operation. The
change that I made opens the system up to serious problems, and
we need to examine the issue of process sleep priorities.
vfs_cluster.c, vfs_bio.c:
8) Make clustered reads work more correctly (and more completely)
when buffers are already constituted, but not fully valid.
(This was another system reliability issue.)
vfs_subr.c, ffs_inode.c:
9) Create a vtruncbuf function, which is used by filesystems that
can truncate files. The vinvalbuf forced a file sync type operation,
while vtruncbuf only invalidates the buffers past the new end of file,
and also invalidates the appropriate pages. (This was a system reliability
and performance issue.)
10) Modify FFS to use vtruncbuf.
vm_object.c:
11) Make the object rundown mechanism for OBJT_VNODE type objects work
more correctly. Included in that fix, create pager entries for
the OBJT_DEAD pager type, so that paging requests that might slip
in during race conditions are properly handled. (This was a system
reliability issue.)
vm_page.c:
12) Make some of the page validation routines be a little less picky
about arguments passed to them. Also, support page invalidation
change the object generation count so that we handle generation
counts a little more robustly.
vm_pageout.c:
13) Further reduce pageout daemon activity when the system doesn't
need help from it. There should be no additional performance
decrease even when the pageout daemon is running. (This was
a significant performance issue.)
vnode_pager.c:
14) Teach the vnode pager to handle race conditions during vnode
deallocations.
1998-03-16 01:56:03 +00:00
|
|
|
if (bp->b_lblkno >= trunclbn) {
|
1999-06-26 02:47:16 +00:00
|
|
|
if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
|
|
|
|
BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
|
In kern_physio.c fix tsleep priority messup.
In vfs_bio.c, remove b_generation count usage,
remove redundant reassignbuf,
remove redundant spl(s),
manage page PG_ZERO flags more correctly,
utilize an invalid value for b_offset until it
is properly initialized. Add asserts
for #ifdef DIAGNOSTIC, when b_offset is
improperly used.
when a process is not performing I/O, and just waiting
on a buffer generally, make the sleep priority
low.
only check page validity in getblk for B_VMIO buffers.
In vfs_cluster, add b_offset asserts, correct pointer calculation
for clustered reads. Improve readability of certain parts of
the code. Remove redundant spl(s).
In vfs_subr, correct usage of vfs_bio_awrite (From Andrew Gallatin
<gallatin@cs.duke.edu>). More vtruncbuf problems fixed.
1998-03-19 22:48:16 +00:00
|
|
|
goto restart;
|
Some VM improvements, including elimination of a lot of Sig-11
problems. Tor Egge and others have helped with various VM bugs
lately, but don't blame him -- blame me!!!
pmap.c:
1) Create an object for kernel page table allocations. This
fixes a bogus allocation method previously used for such, by
grabbing pages from the kernel object, using bogus pindexes.
(This was a code cleanup, and perhaps a minor system stability
issue.)
pmap.c:
2) Pre-set the modify and accessed bits when prudent. This will
decrease bus traffic under certain circumstances.
vfs_bio.c, vfs_cluster.c:
3) Rather than calculating the beginning virtual byte offset
multiple times, stick the offset into the buffer header, so
that the calculated offset can be reused. (Long long multiplies
are often expensive, and this is a probably unmeasurable performance
improvement, and code cleanup.)
vfs_bio.c:
4) Handle write recursion more intelligently (but not perfectly) so
that it is less likely to cause a system panic, and is also
much more robust.
vfs_bio.c:
5) getblk incorrectly wrote out blocks that are incorrectly sized.
The problem is fixed, and writes blocks out ONLY when B_DELWRI
is true.
vfs_bio.c:
6) Check that already constituted buffers have fully valid pages. If
not, then make sure that the B_CACHE bit is not set. (This was
a major source of Sig-11 type problems.)
vfs_bio.c:
7) Fix a potential system deadlock due to an incorrectly specified
sleep priority while waiting for a buffer write operation. The
change that I made opens the system up to serious problems, and
we need to examine the issue of process sleep priorities.
vfs_cluster.c, vfs_bio.c:
8) Make clustered reads work more correctly (and more completely)
when buffers are already constituted, but not fully valid.
(This was another system reliability issue.)
vfs_subr.c, ffs_inode.c:
9) Create a vtruncbuf function, which is used by filesystems that
can truncate files. The vinvalbuf forced a file sync type operation,
while vtruncbuf only invalidates the buffers past the new end of file,
and also invalidates the appropriate pages. (This was a system reliability
and performance issue.)
10) Modify FFS to use vtruncbuf.
vm_object.c:
11) Make the object rundown mechanism for OBJT_VNODE type objects work
more correctly. Included in that fix, create pager entries for
the OBJT_DEAD pager type, so that paging requests that might slip
in during race conditions are properly handled. (This was a system
reliability issue.)
vm_page.c:
12) Make some of the page validation routines be a little less picky
about arguments passed to them. Also, support page invalidation
change the object generation count so that we handle generation
counts a little more robustly.
vm_pageout.c:
13) Further reduce pageout daemon activity when the system doesn't
need help from it. There should be no additional performance
decrease even when the pageout daemon is running. (This was
a significant performance issue.)
vnode_pager.c:
14) Teach the vnode pager to handle race conditions during vnode
deallocations.
1998-03-16 01:56:03 +00:00
|
|
|
} else {
|
|
|
|
bremfree(bp);
|
1999-06-26 02:47:16 +00:00
|
|
|
bp->b_flags |= (B_INVAL | B_RELBUF);
|
Some VM improvements, including elimination of a lot of Sig-11
problems. Tor Egge and others have helped with various VM bugs
lately, but don't blame him -- blame me!!!
pmap.c:
1) Create an object for kernel page table allocations. This
fixes a bogus allocation method previously used for such, by
grabbing pages from the kernel object, using bogus pindexes.
(This was a code cleanup, and perhaps a minor system stability
issue.)
pmap.c:
2) Pre-set the modify and accessed bits when prudent. This will
decrease bus traffic under certain circumstances.
vfs_bio.c, vfs_cluster.c:
3) Rather than calculating the beginning virtual byte offset
multiple times, stick the offset into the buffer header, so
that the calculated offset can be reused. (Long long multiplies
are often expensive, and this is a probably unmeasurable performance
improvement, and code cleanup.)
vfs_bio.c:
4) Handle write recursion more intelligently (but not perfectly) so
that it is less likely to cause a system panic, and is also
much more robust.
vfs_bio.c:
5) getblk incorrectly wrote out blocks that are incorrectly sized.
The problem is fixed, and writes blocks out ONLY when B_DELWRI
is true.
vfs_bio.c:
6) Check that already constituted buffers have fully valid pages. If
not, then make sure that the B_CACHE bit is not set. (This was
a major source of Sig-11 type problems.)
vfs_bio.c:
7) Fix a potential system deadlock due to an incorrectly specified
sleep priority while waiting for a buffer write operation. The
change that I made opens the system up to serious problems, and
we need to examine the issue of process sleep priorities.
vfs_cluster.c, vfs_bio.c:
8) Make clustered reads work more correctly (and more completely)
when buffers are already constituted, but not fully valid.
(This was another system reliability issue.)
vfs_subr.c, ffs_inode.c:
9) Create a vtruncbuf function, which is used by filesystems that
can truncate files. The vinvalbuf forced a file sync type operation,
while vtruncbuf only invalidates the buffers past the new end of file,
and also invalidates the appropriate pages. (This was a system reliability
and performance issue.)
10) Modify FFS to use vtruncbuf.
vm_object.c:
11) Make the object rundown mechanism for OBJT_VNODE type objects work
more correctly. Included in that fix, create pager entries for
the OBJT_DEAD pager type, so that paging requests that might slip
in during race conditions are properly handled. (This was a system
reliability issue.)
vm_page.c:
12) Make some of the page validation routines be a little less picky
about arguments passed to them. Also, support page invalidation
change the object generation count so that we handle generation
counts a little more robustly.
vm_pageout.c:
13) Further reduce pageout daemon activity when the system doesn't
need help from it. There should be no additional performance
decrease even when the pageout daemon is running. (This was
a significant performance issue.)
vnode_pager.c:
14) Teach the vnode pager to handle race conditions during vnode
deallocations.
1998-03-16 01:56:03 +00:00
|
|
|
bp->b_flags &= ~B_ASYNC;
|
|
|
|
brelse(bp);
|
|
|
|
anyfreed = 1;
|
|
|
|
}
|
1999-12-22 03:11:04 +00:00
|
|
|
if (nbp &&
|
|
|
|
(((nbp->b_xflags & BX_VNDIRTY) == 0) ||
|
|
|
|
(nbp->b_vp != vp) ||
|
|
|
|
(nbp->b_flags & B_DELWRI) == 0)) {
|
Some VM improvements, including elimination of a lot of Sig-11
problems. Tor Egge and others have helped with various VM bugs
lately, but don't blame him -- blame me!!!
pmap.c:
1) Create an object for kernel page table allocations. This
fixes a bogus allocation method previously used for such, by
grabbing pages from the kernel object, using bogus pindexes.
(This was a code cleanup, and perhaps a minor system stability
issue.)
pmap.c:
2) Pre-set the modify and accessed bits when prudent. This will
decrease bus traffic under certain circumstances.
vfs_bio.c, vfs_cluster.c:
3) Rather than calculating the beginning virtual byte offset
multiple times, stick the offset into the buffer header, so
that the calculated offset can be reused. (Long long multiplies
are often expensive, and this is a probably unmeasurable performance
improvement, and code cleanup.)
vfs_bio.c:
4) Handle write recursion more intelligently (but not perfectly) so
that it is less likely to cause a system panic, and is also
much more robust.
vfs_bio.c:
5) getblk incorrectly wrote out blocks that are incorrectly sized.
The problem is fixed, and writes blocks out ONLY when B_DELWRI
is true.
vfs_bio.c:
6) Check that already constituted buffers have fully valid pages. If
not, then make sure that the B_CACHE bit is not set. (This was
a major source of Sig-11 type problems.)
vfs_bio.c:
7) Fix a potential system deadlock due to an incorrectly specified
sleep priority while waiting for a buffer write operation. The
change that I made opens the system up to serious problems, and
we need to examine the issue of process sleep priorities.
vfs_cluster.c, vfs_bio.c:
8) Make clustered reads work more correctly (and more completely)
when buffers are already constituted, but not fully valid.
(This was another system reliability issue.)
vfs_subr.c, ffs_inode.c:
9) Create a vtruncbuf function, which is used by filesystems that
can truncate files. The vinvalbuf forced a file sync type operation,
while vtruncbuf only invalidates the buffers past the new end of file,
and also invalidates the appropriate pages. (This was a system reliability
and performance issue.)
10) Modify FFS to use vtruncbuf.
vm_object.c:
11) Make the object rundown mechanism for OBJT_VNODE type objects work
more correctly. Included in that fix, create pager entries for
the OBJT_DEAD pager type, so that paging requests that might slip
in during race conditions are properly handled. (This was a system
reliability issue.)
vm_page.c:
12) Make some of the page validation routines be a little less picky
about arguments passed to them. Also, support page invalidation
change the object generation count so that we handle generation
counts a little more robustly.
vm_pageout.c:
13) Further reduce pageout daemon activity when the system doesn't
need help from it. There should be no additional performance
decrease even when the pageout daemon is running. (This was
a significant performance issue.)
vnode_pager.c:
14) Teach the vnode pager to handle race conditions during vnode
deallocations.
1998-03-16 01:56:03 +00:00
|
|
|
goto restart;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
1998-03-17 06:30:52 +00:00
|
|
|
|
In kern_physio.c fix tsleep priority messup.
In vfs_bio.c, remove b_generation count usage,
remove redundant reassignbuf,
remove redundant spl(s),
manage page PG_ZERO flags more correctly,
utilize an invalid value for b_offset until it
is properly initialized. Add asserts
for #ifdef DIAGNOSTIC, when b_offset is
improperly used.
when a process is not performing I/O, and just waiting
on a buffer generally, make the sleep priority
low.
only check page validity in getblk for B_VMIO buffers.
In vfs_cluster, add b_offset asserts, correct pointer calculation
for clustered reads. Improve readability of certain parts of
the code. Remove redundant spl(s).
In vfs_subr, correct usage of vfs_bio_awrite (From Andrew Gallatin
<gallatin@cs.duke.edu>). More vtruncbuf problems fixed.
1998-03-19 22:48:16 +00:00
|
|
|
if (length > 0) {
|
|
|
|
restartsync:
|
1998-10-31 14:20:39 +00:00
|
|
|
for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
|
|
|
|
nbp = TAILQ_NEXT(bp, b_vnbufs);
|
1998-03-17 06:30:52 +00:00
|
|
|
if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) {
|
1999-06-26 02:47:16 +00:00
|
|
|
if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
|
|
|
|
BUF_LOCK(bp, LK_EXCLUSIVE|LK_SLEEPFAIL);
|
|
|
|
goto restart;
|
1998-03-17 06:30:52 +00:00
|
|
|
} else {
|
|
|
|
bremfree(bp);
|
In kern_physio.c fix tsleep priority messup.
In vfs_bio.c, remove b_generation count usage,
remove redundant reassignbuf,
remove redundant spl(s),
manage page PG_ZERO flags more correctly,
utilize an invalid value for b_offset until it
is properly initialized. Add asserts
for #ifdef DIAGNOSTIC, when b_offset is
improperly used.
when a process is not performing I/O, and just waiting
on a buffer generally, make the sleep priority
low.
only check page validity in getblk for B_VMIO buffers.
In vfs_cluster, add b_offset asserts, correct pointer calculation
for clustered reads. Improve readability of certain parts of
the code. Remove redundant spl(s).
In vfs_subr, correct usage of vfs_bio_awrite (From Andrew Gallatin
<gallatin@cs.duke.edu>). More vtruncbuf problems fixed.
1998-03-19 22:48:16 +00:00
|
|
|
if (bp->b_vp == vp) {
|
|
|
|
bp->b_flags |= B_ASYNC;
|
|
|
|
} else {
|
|
|
|
bp->b_flags &= ~B_ASYNC;
|
|
|
|
}
|
2000-03-20 11:29:10 +00:00
|
|
|
BUF_WRITE(bp);
|
1998-03-17 06:30:52 +00:00
|
|
|
}
|
In kern_physio.c fix tsleep priority messup.
In vfs_bio.c, remove b_generation count usage,
remove redundant reassignbuf,
remove redundant spl(s),
manage page PG_ZERO flags more correctly,
utilize an invalid value for b_offset until it
is properly initialized. Add asserts
for #ifdef DIAGNOSTIC, when b_offset is
improperly used.
when a process is not performing I/O, and just waiting
on a buffer generally, make the sleep priority
low.
only check page validity in getblk for B_VMIO buffers.
In vfs_cluster, add b_offset asserts, correct pointer calculation
for clustered reads. Improve readability of certain parts of
the code. Remove redundant spl(s).
In vfs_subr, correct usage of vfs_bio_awrite (From Andrew Gallatin
<gallatin@cs.duke.edu>). More vtruncbuf problems fixed.
1998-03-19 22:48:16 +00:00
|
|
|
goto restartsync;
|
1998-03-17 06:30:52 +00:00
|
|
|
}
|
In kern_physio.c fix tsleep priority messup.
In vfs_bio.c, remove b_generation count usage,
remove redundant reassignbuf,
remove redundant spl(s),
manage page PG_ZERO flags more correctly,
utilize an invalid value for b_offset until it
is properly initialized. Add asserts
for #ifdef DIAGNOSTIC, when b_offset is
improperly used.
when a process is not performing I/O, and just waiting
on a buffer generally, make the sleep priority
low.
only check page validity in getblk for B_VMIO buffers.
In vfs_cluster, add b_offset asserts, correct pointer calculation
for clustered reads. Improve readability of certain parts of
the code. Remove redundant spl(s).
In vfs_subr, correct usage of vfs_bio_awrite (From Andrew Gallatin
<gallatin@cs.duke.edu>). More vtruncbuf problems fixed.
1998-03-19 22:48:16 +00:00
|
|
|
|
1998-03-17 06:30:52 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
while (vp->v_numoutput > 0) {
|
|
|
|
vp->v_flag |= VBWAIT;
|
|
|
|
tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0);
|
|
|
|
}
|
|
|
|
|
Some VM improvements, including elimination of a lot of Sig-11
problems. Tor Egge and others have helped with various VM bugs
lately, but don't blame him -- blame me!!!
pmap.c:
1) Create an object for kernel page table allocations. This
fixes a bogus allocation method previously used for such, by
grabbing pages from the kernel object, using bogus pindexes.
(This was a code cleanup, and perhaps a minor system stability
issue.)
pmap.c:
2) Pre-set the modify and accessed bits when prudent. This will
decrease bus traffic under certain circumstances.
vfs_bio.c, vfs_cluster.c:
3) Rather than calculating the beginning virtual byte offset
multiple times, stick the offset into the buffer header, so
that the calculated offset can be reused. (Long long multiplies
are often expensive, and this is a probably unmeasurable performance
improvement, and code cleanup.)
vfs_bio.c:
4) Handle write recursion more intelligently (but not perfectly) so
that it is less likely to cause a system panic, and is also
much more robust.
vfs_bio.c:
5) getblk incorrectly wrote out blocks that are incorrectly sized.
The problem is fixed, and writes blocks out ONLY when B_DELWRI
is true.
vfs_bio.c:
6) Check that already constituted buffers have fully valid pages. If
not, then make sure that the B_CACHE bit is not set. (This was
a major source of Sig-11 type problems.)
vfs_bio.c:
7) Fix a potential system deadlock due to an incorrectly specified
sleep priority while waiting for a buffer write operation. The
change that I made opens the system up to serious problems, and
we need to examine the issue of process sleep priorities.
vfs_cluster.c, vfs_bio.c:
8) Make clustered reads work more correctly (and more completely)
when buffers are already constituted, but not fully valid.
(This was another system reliability issue.)
vfs_subr.c, ffs_inode.c:
9) Create a vtruncbuf function, which is used by filesystems that
can truncate files. The vinvalbuf forced a file sync type operation,
while vtruncbuf only invalidates the buffers past the new end of file,
and also invalidates the appropriate pages. (This was a system reliability
and performance issue.)
10) Modify FFS to use vtruncbuf.
vm_object.c:
11) Make the object rundown mechanism for OBJT_VNODE type objects work
more correctly. Included in that fix, create pager entries for
the OBJT_DEAD pager type, so that paging requests that might slip
in during race conditions are properly handled. (This was a system
reliability issue.)
vm_page.c:
12) Make some of the page validation routines be a little less picky
about arguments passed to them. Also, support page invalidation
change the object generation count so that we handle generation
counts a little more robustly.
vm_pageout.c:
13) Further reduce pageout daemon activity when the system doesn't
need help from it. There should be no additional performance
decrease even when the pageout daemon is running. (This was
a significant performance issue.)
vnode_pager.c:
14) Teach the vnode pager to handle race conditions during vnode
deallocations.
1998-03-16 01:56:03 +00:00
|
|
|
splx(s);
|
|
|
|
|
|
|
|
vnode_pager_setsize(vp, length);
|
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Associate a buffer with a vnode.
|
|
|
|
*/
|
1994-05-25 09:21:21 +00:00
|
|
|
void
|
1994-05-24 10:09:53 +00:00
|
|
|
bgetvp(vp, bp)
|
|
|
|
register struct vnode *vp;
|
|
|
|
register struct buf *bp;
|
|
|
|
{
|
1994-12-23 04:52:55 +00:00
|
|
|
int s;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
1999-01-08 17:31:30 +00:00
|
|
|
KASSERT(bp->b_vp == NULL, ("bgetvp: not free"));
|
1999-01-10 01:58:29 +00:00
|
|
|
|
1997-08-31 07:32:39 +00:00
|
|
|
vhold(vp);
|
1994-05-24 10:09:53 +00:00
|
|
|
bp->b_vp = vp;
|
1999-08-25 12:24:39 +00:00
|
|
|
bp->b_dev = vn_todev(vp);
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Insert onto list for new vnode.
|
|
|
|
*/
|
1994-12-23 04:52:55 +00:00
|
|
|
s = splbio();
|
1999-12-22 03:11:04 +00:00
|
|
|
bp->b_xflags |= BX_VNCLEAN;
|
|
|
|
bp->b_xflags &= ~BX_VNDIRTY;
|
1998-10-31 14:20:39 +00:00
|
|
|
TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs);
|
1994-12-23 04:52:55 +00:00
|
|
|
splx(s);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Disassociate a buffer from a vnode.
|
|
|
|
*/
|
1994-05-25 09:21:21 +00:00
|
|
|
void
|
1994-05-24 10:09:53 +00:00
|
|
|
brelvp(bp)
|
|
|
|
register struct buf *bp;
|
|
|
|
{
|
|
|
|
struct vnode *vp;
|
1998-10-31 14:20:39 +00:00
|
|
|
struct buflists *listheadp;
|
1994-12-23 04:52:55 +00:00
|
|
|
int s;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
1999-01-08 17:31:30 +00:00
|
|
|
KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
|
1997-12-29 00:25:11 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Delete from old vnode list, if on one.
|
|
|
|
*/
|
1998-03-08 09:59:44 +00:00
|
|
|
vp = bp->b_vp;
|
1994-12-23 04:52:55 +00:00
|
|
|
s = splbio();
|
1999-12-22 03:11:04 +00:00
|
|
|
if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
|
|
|
|
if (bp->b_xflags & BX_VNDIRTY)
|
1998-10-31 14:20:39 +00:00
|
|
|
listheadp = &vp->v_dirtyblkhd;
|
|
|
|
else
|
|
|
|
listheadp = &vp->v_cleanblkhd;
|
|
|
|
TAILQ_REMOVE(listheadp, bp, b_vnbufs);
|
1999-12-22 03:11:04 +00:00
|
|
|
bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
|
1998-10-31 14:20:39 +00:00
|
|
|
}
|
|
|
|
if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) {
|
1998-03-08 09:59:44 +00:00
|
|
|
vp->v_flag &= ~VONWORKLST;
|
|
|
|
LIST_REMOVE(vp, v_synclist);
|
|
|
|
}
|
1994-12-23 04:52:55 +00:00
|
|
|
splx(s);
|
1994-05-24 10:09:53 +00:00
|
|
|
bp->b_vp = (struct vnode *) 0;
|
1997-08-31 07:32:39 +00:00
|
|
|
vdrop(vp);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
1998-03-08 09:59:44 +00:00
|
|
|
/*
|
|
|
|
* Add an item to the syncer work queue.
|
|
|
|
*/
|
1999-02-19 17:36:58 +00:00
|
|
|
static void
|
|
|
|
vn_syncer_add_to_worklist(struct vnode *vp, int delay)
|
1998-03-08 09:59:44 +00:00
|
|
|
{
|
|
|
|
int s, slot;
|
|
|
|
|
|
|
|
s = splbio();
|
|
|
|
|
|
|
|
if (vp->v_flag & VONWORKLST) {
|
|
|
|
LIST_REMOVE(vp, v_synclist);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (delay > syncer_maxdelay - 2)
|
|
|
|
delay = syncer_maxdelay - 2;
|
|
|
|
slot = (syncer_delayno + delay) & syncer_mask;
|
|
|
|
|
|
|
|
LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
|
|
|
|
vp->v_flag |= VONWORKLST;
|
|
|
|
splx(s);
|
|
|
|
}
|
|
|
|
|
1999-03-12 02:24:58 +00:00
|
|
|
/*
 * Process pointer for the syncer daemon.  Its address is stored in up_kp
 * below, and it is read by sched_sync() and speedup_syncer().
 */
struct proc *updateproc;
static void sched_sync __P((void));

/*
 * Descriptor handed to kproc_start() through SYSINIT: the name "syncer",
 * the entry point sched_sync, and the address of updateproc.
 * NOTE(review): presumably kproc_start() stores the new process in
 * updateproc -- confirm against struct kproc_desc.
 */
static struct kproc_desc up_kp = {
	"syncer",
	sched_sync,
	&updateproc
};
SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
|
1998-03-08 09:59:44 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* System filesystem synchronizer daemon.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
sched_sync(void)
|
|
|
|
{
|
|
|
|
struct synclist *slp;
|
|
|
|
struct vnode *vp;
|
2000-07-11 22:07:57 +00:00
|
|
|
struct mount *mp;
|
1998-03-08 09:59:44 +00:00
|
|
|
long starttime;
|
|
|
|
int s;
|
|
|
|
struct proc *p = updateproc;
|
|
|
|
|
2000-09-07 01:33:02 +00:00
|
|
|
mtx_enter(&Giant, MTX_DEF);
|
|
|
|
|
2000-12-15 20:08:20 +00:00
|
|
|
EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
|
2000-01-07 08:36:44 +00:00
|
|
|
SHUTDOWN_PRI_LAST);
|
|
|
|
|
1998-03-08 09:59:44 +00:00
|
|
|
for (;;) {
|
2000-12-15 20:08:20 +00:00
|
|
|
kthread_suspend_check(p);
|
2000-01-07 08:36:44 +00:00
|
|
|
|
1998-03-30 09:56:58 +00:00
|
|
|
starttime = time_second;
|
1998-03-08 09:59:44 +00:00
|
|
|
|
|
|
|
/*
|
1999-02-19 17:36:58 +00:00
|
|
|
* Push files whose dirty time has expired. Be careful
|
|
|
|
* of interrupt race on slp queue.
|
1998-03-08 09:59:44 +00:00
|
|
|
*/
|
|
|
|
s = splbio();
|
|
|
|
slp = &syncer_workitem_pending[syncer_delayno];
|
|
|
|
syncer_delayno += 1;
|
|
|
|
if (syncer_delayno == syncer_maxdelay)
|
|
|
|
syncer_delayno = 0;
|
|
|
|
splx(s);
|
|
|
|
|
|
|
|
while ((vp = LIST_FIRST(slp)) != NULL) {
|
2000-07-11 22:07:57 +00:00
|
|
|
if (VOP_ISLOCKED(vp, NULL) == 0 &&
|
|
|
|
vn_start_write(vp, &mp, V_NOWAIT) == 0) {
|
1999-03-12 02:24:58 +00:00
|
|
|
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
|
|
|
|
(void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p);
|
|
|
|
VOP_UNLOCK(vp, 0, p);
|
2000-07-11 22:07:57 +00:00
|
|
|
vn_finished_write(mp);
|
1999-03-12 02:24:58 +00:00
|
|
|
}
|
1999-02-19 17:36:58 +00:00
|
|
|
s = splbio();
|
1998-03-08 09:59:44 +00:00
|
|
|
if (LIST_FIRST(slp) == vp) {
|
1999-03-12 02:24:58 +00:00
|
|
|
/*
|
|
|
|
* Note: v_tag VT_VFS vps can remain on the
|
|
|
|
* worklist too with no dirty blocks, but
|
|
|
|
* since sync_fsync() moves it to a different
|
|
|
|
* slot we are safe.
|
|
|
|
*/
|
1998-10-31 14:20:39 +00:00
|
|
|
if (TAILQ_EMPTY(&vp->v_dirtyblkhd) &&
|
2000-01-10 12:04:27 +00:00
|
|
|
!vn_isdisk(vp, NULL))
|
1999-02-19 17:36:58 +00:00
|
|
|
panic("sched_sync: fsync failed vp %p tag %d", vp, vp->v_tag);
|
1998-03-08 09:59:44 +00:00
|
|
|
/*
|
1999-02-19 17:36:58 +00:00
|
|
|
* Put us back on the worklist. The worklist
|
|
|
|
* routine will remove us from our current
|
|
|
|
* position and then add us back in at a later
|
|
|
|
* position.
|
1998-03-08 09:59:44 +00:00
|
|
|
*/
|
|
|
|
vn_syncer_add_to_worklist(vp, syncdelay);
|
|
|
|
}
|
1999-02-19 17:36:58 +00:00
|
|
|
splx(s);
|
1998-03-08 09:59:44 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Do soft update processing.
|
|
|
|
*/
|
2000-07-03 13:26:54 +00:00
|
|
|
#ifdef SOFTUPDATES
|
2000-06-16 08:48:51 +00:00
|
|
|
softdep_process_worklist(NULL);
|
2000-07-03 13:26:54 +00:00
|
|
|
#endif
|
1998-03-08 09:59:44 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* The variable rushjob allows the kernel to speed up the
|
|
|
|
* processing of the filesystem syncer process. A rushjob
|
|
|
|
* value of N tells the filesystem syncer to process the next
|
|
|
|
* N seconds worth of work on its queue ASAP. Currently rushjob
|
|
|
|
* is used by the soft update code to speed up the filesystem
|
|
|
|
* syncer process when the incore state is getting so far
|
|
|
|
* ahead of the disk that the kernel memory pool is being
|
|
|
|
* threatened with exhaustion.
|
|
|
|
*/
|
|
|
|
if (rushjob > 0) {
|
|
|
|
rushjob -= 1;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* If it has taken us less than a second to process the
|
|
|
|
* current work, then wait. Otherwise start right over
|
|
|
|
* again. We can still lose time if any single round
|
|
|
|
* takes more than two seconds, but it does not really
|
|
|
|
* matter as we are just trying to generally pace the
|
|
|
|
* filesystem activity.
|
|
|
|
*/
|
1998-03-30 09:56:58 +00:00
|
|
|
if (time_second == starttime)
|
1998-03-08 09:59:44 +00:00
|
|
|
tsleep(&lbolt, PPAUSE, "syncer", 0);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
1999-06-15 23:37:29 +00:00
|
|
|
/*
|
|
|
|
* Request the syncer daemon to speed up its work.
|
|
|
|
* We never push it to speed up more than half of its
|
|
|
|
* normal turn time, otherwise it could take over the cpu.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
speedup_syncer()
|
|
|
|
{
|
|
|
|
|
2000-12-13 01:06:53 +00:00
|
|
|
mtx_enter(&sched_lock, MTX_SPIN);
|
1999-06-15 23:37:29 +00:00
|
|
|
if (updateproc->p_wchan == &lbolt)
|
|
|
|
setrunnable(updateproc);
|
2000-12-13 01:06:53 +00:00
|
|
|
mtx_exit(&sched_lock, MTX_SPIN);
|
1999-06-15 23:37:29 +00:00
|
|
|
if (rushjob < syncdelay / 2) {
|
|
|
|
rushjob += 1;
|
|
|
|
stat_rush_requests += 1;
|
|
|
|
return (1);
|
|
|
|
}
|
|
|
|
return(0);
|
|
|
|
}
|
|
|
|
|
These changes embody the support of the fully coherent merged VM buffer cache,
much higher filesystem I/O performance, and much better paging performance. It
represents the culmination of over 6 months of R&D.
The majority of the merged VM/cache work is by John Dyson.
The following highlights the most significant changes. Additionally, there are
(mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to
support the new VM/buffer scheme.
vfs_bio.c:
Significant rewrite of most of vfs_bio to support the merged VM buffer cache
scheme. The scheme is almost fully compatible with the old filesystem
interface. Significant improvement in the number of opportunities for write
clustering.
vfs_cluster.c, vfs_subr.c
Upgrade and performance enhancements in vfs layer code to support merged
VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff.
vm_object.c:
Yet more improvements in the collapse code. Elimination of some windows that
can cause list corruption.
vm_pageout.c:
Fixed it, it really works better now. Somehow in 2.0, some "enhancements"
broke the code. This code has been reworked from the ground-up.
vm_fault.c, vm_page.c, pmap.c, vm_object.c
Support for small-block filesystems with merged VM/buffer cache scheme.
pmap.c vm_map.c
Dynamic kernel VM size, now we don't have to pre-allocate excessive numbers of
kernel PTs.
vm_glue.c
Much simpler and more effective swapping code. No more gratuitous swapping.
proc.h
Fixed the problem that the p_lock flag was not being cleared on a fork.
swap_pager.c, vnode_pager.c
Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the
code doesn't need it anymore.
machdep.c
Changes to better support the parameter values for the merged VM/buffer cache
scheme.
machdep.c, kern_exec.c, vm_glue.c
Implemented a separate submap for temporary exec string space and another one
to contain process upages. This eliminates all map fragmentation problems
that previously existed.
ffs_inode.c, ufs_inode.c, ufs_readwrite.c
Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on
busy buffers.
Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
|
|
|
/*
|
|
|
|
* Associate a p-buffer with a vnode.
|
1999-01-21 08:29:12 +00:00
|
|
|
*
|
|
|
|
* Also sets B_PAGING flag to indicate that vnode is not fully associated
|
|
|
|
* with the buffer. i.e. the bp has not been linked into the vnode or
|
|
|
|
* ref-counted.
|
These changes embody the support of the fully coherent merged VM buffer cache,
much higher filesystem I/O performance, and much better paging performance. It
represents the culmination of over 6 months of R&D.
The majority of the merged VM/cache work is by John Dyson.
The following highlights the most significant changes. Additionally, there are
(mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to
support the new VM/buffer scheme.
vfs_bio.c:
Significant rewrite of most of vfs_bio to support the merged VM buffer cache
scheme. The scheme is almost fully compatible with the old filesystem
interface. Significant improvement in the number of opportunities for write
clustering.
vfs_cluster.c, vfs_subr.c
Upgrade and performance enhancements in vfs layer code to support merged
VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff.
vm_object.c:
Yet more improvements in the collapse code. Elimination of some windows that
can cause list corruption.
vm_pageout.c:
Fixed it, it really works better now. Somehow in 2.0, some "enhancements"
broke the code. This code has been reworked from the ground-up.
vm_fault.c, vm_page.c, pmap.c, vm_object.c
Support for small-block filesystems with merged VM/buffer cache scheme.
pmap.c vm_map.c
Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of
kernel PTs.
vm_glue.c
Much simpler and more effective swapping code. No more gratuitous swapping.
proc.h
Fixed the problem that the p_lock flag was not being cleared on a fork.
swap_pager.c, vnode_pager.c
Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the
code doesn't need it anymore.
machdep.c
Changes to better support the parameter values for the merged VM/buffer cache
scheme.
machdep.c, kern_exec.c, vm_glue.c
Implemented a seperate submap for temporary exec string space and another one
to contain process upages. This eliminates all map fragmentation problems
that previously existed.
ffs_inode.c, ufs_inode.c, ufs_readwrite.c
Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on
busy buffers.
Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
|
|
|
*/
|
|
|
|
void
|
|
|
|
pbgetvp(vp, bp)
|
|
|
|
register struct vnode *vp;
|
|
|
|
register struct buf *bp;
|
|
|
|
{
|
1999-01-10 01:58:29 +00:00
|
|
|
|
1999-01-08 17:31:30 +00:00
|
|
|
KASSERT(bp->b_vp == NULL, ("pbgetvp: not free"));
|
|
|
|
|
These changes embody the support of the fully coherent merged VM buffer cache,
much higher filesystem I/O performance, and much better paging performance. It
represents the culmination of over 6 months of R&D.
The majority of the merged VM/cache work is by John Dyson.
The following highlights the most significant changes. Additionally, there are
(mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to
support the new VM/buffer scheme.
vfs_bio.c:
Significant rewrite of most of vfs_bio to support the merged VM buffer cache
scheme. The scheme is almost fully compatible with the old filesystem
interface. Significant improvement in the number of opportunities for write
clustering.
vfs_cluster.c, vfs_subr.c
Upgrade and performance enhancements in vfs layer code to support merged
VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff.
vm_object.c:
Yet more improvements in the collapse code. Elimination of some windows that
can cause list corruption.
vm_pageout.c:
Fixed it, it really works better now. Somehow in 2.0, some "enhancements"
broke the code. This code has been reworked from the ground-up.
vm_fault.c, vm_page.c, pmap.c, vm_object.c
Support for small-block filesystems with merged VM/buffer cache scheme.
pmap.c vm_map.c
Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of
kernel PTs.
vm_glue.c
Much simpler and more effective swapping code. No more gratuitous swapping.
proc.h
Fixed the problem that the p_lock flag was not being cleared on a fork.
swap_pager.c, vnode_pager.c
Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the
code doesn't need it anymore.
machdep.c
Changes to better support the parameter values for the merged VM/buffer cache
scheme.
machdep.c, kern_exec.c, vm_glue.c
Implemented a seperate submap for temporary exec string space and another one
to contain process upages. This eliminates all map fragmentation problems
that previously existed.
ffs_inode.c, ufs_inode.c, ufs_readwrite.c
Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on
busy buffers.
Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
|
|
|
bp->b_vp = vp;
|
1999-01-21 08:29:12 +00:00
|
|
|
bp->b_flags |= B_PAGING;
|
1999-08-25 12:24:39 +00:00
|
|
|
bp->b_dev = vn_todev(vp);
|
These changes embody the support of the fully coherent merged VM buffer cache,
much higher filesystem I/O performance, and much better paging performance. It
represents the culmination of over 6 months of R&D.
The majority of the merged VM/cache work is by John Dyson.
The following highlights the most significant changes. Additionally, there are
(mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to
support the new VM/buffer scheme.
vfs_bio.c:
Significant rewrite of most of vfs_bio to support the merged VM buffer cache
scheme. The scheme is almost fully compatible with the old filesystem
interface. Significant improvement in the number of opportunities for write
clustering.
vfs_cluster.c, vfs_subr.c
Upgrade and performance enhancements in vfs layer code to support merged
VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff.
vm_object.c:
Yet more improvements in the collapse code. Elimination of some windows that
can cause list corruption.
vm_pageout.c:
Fixed it, it really works better now. Somehow in 2.0, some "enhancements"
broke the code. This code has been reworked from the ground-up.
vm_fault.c, vm_page.c, pmap.c, vm_object.c
Support for small-block filesystems with merged VM/buffer cache scheme.
pmap.c vm_map.c
Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of
kernel PTs.
vm_glue.c
Much simpler and more effective swapping code. No more gratuitous swapping.
proc.h
Fixed the problem that the p_lock flag was not being cleared on a fork.
swap_pager.c, vnode_pager.c
Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the
code doesn't need it anymore.
machdep.c
Changes to better support the parameter values for the merged VM/buffer cache
scheme.
machdep.c, kern_exec.c, vm_glue.c
Implemented a seperate submap for temporary exec string space and another one
to contain process upages. This eliminates all map fragmentation problems
that previously existed.
ffs_inode.c, ufs_inode.c, ufs_readwrite.c
Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on
busy buffers.
Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Disassociate a p-buffer from a vnode.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
pbrelvp(bp)
|
|
|
|
register struct buf *bp;
|
|
|
|
{
|
|
|
|
|
1999-01-08 17:31:30 +00:00
|
|
|
KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL"));
|
These changes embody the support of the fully coherent merged VM buffer cache,
much higher filesystem I/O performance, and much better paging performance. It
represents the culmination of over 6 months of R&D.
The majority of the merged VM/cache work is by John Dyson.
The following highlights the most significant changes. Additionally, there are
(mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to
support the new VM/buffer scheme.
vfs_bio.c:
Significant rewrite of most of vfs_bio to support the merged VM buffer cache
scheme. The scheme is almost fully compatible with the old filesystem
interface. Significant improvement in the number of opportunities for write
clustering.
vfs_cluster.c, vfs_subr.c
Upgrade and performance enhancements in vfs layer code to support merged
VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff.
vm_object.c:
Yet more improvements in the collapse code. Elimination of some windows that
can cause list corruption.
vm_pageout.c:
Fixed it, it really works better now. Somehow in 2.0, some "enhancements"
broke the code. This code has been reworked from the ground-up.
vm_fault.c, vm_page.c, pmap.c, vm_object.c
Support for small-block filesystems with merged VM/buffer cache scheme.
pmap.c vm_map.c
Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of
kernel PTs.
vm_glue.c
Much simpler and more effective swapping code. No more gratuitous swapping.
proc.h
Fixed the problem that the p_lock flag was not being cleared on a fork.
swap_pager.c, vnode_pager.c
Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the
code doesn't need it anymore.
machdep.c
Changes to better support the parameter values for the merged VM/buffer cache
scheme.
machdep.c, kern_exec.c, vm_glue.c
Implemented a seperate submap for temporary exec string space and another one
to contain process upages. This eliminates all map fragmentation problems
that previously existed.
ffs_inode.c, ufs_inode.c, ufs_readwrite.c
Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on
busy buffers.
Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
|
|
|
|
1999-01-21 08:29:12 +00:00
|
|
|
/* XXX REMOVE ME */
|
2001-02-04 13:13:25 +00:00
|
|
|
if (TAILQ_NEXT(bp, b_vnbufs) != NULL) {
|
1999-01-21 08:29:12 +00:00
|
|
|
panic(
|
|
|
|
"relpbuf(): b_vp was probably reassignbuf()d %p %x",
|
|
|
|
bp,
|
|
|
|
(int)bp->b_flags
|
|
|
|
);
|
|
|
|
}
|
These changes embody the support of the fully coherent merged VM buffer cache,
much higher filesystem I/O performance, and much better paging performance. It
represents the culmination of over 6 months of R&D.
The majority of the merged VM/cache work is by John Dyson.
The following highlights the most significant changes. Additionally, there are
(mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to
support the new VM/buffer scheme.
vfs_bio.c:
Significant rewrite of most of vfs_bio to support the merged VM buffer cache
scheme. The scheme is almost fully compatible with the old filesystem
interface. Significant improvement in the number of opportunities for write
clustering.
vfs_cluster.c, vfs_subr.c
Upgrade and performance enhancements in vfs layer code to support merged
VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff.
vm_object.c:
Yet more improvements in the collapse code. Elimination of some windows that
can cause list corruption.
vm_pageout.c:
Fixed it, it really works better now. Somehow in 2.0, some "enhancements"
broke the code. This code has been reworked from the ground-up.
vm_fault.c, vm_page.c, pmap.c, vm_object.c
Support for small-block filesystems with merged VM/buffer cache scheme.
pmap.c vm_map.c
Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of
kernel PTs.
vm_glue.c
Much simpler and more effective swapping code. No more gratuitous swapping.
proc.h
Fixed the problem that the p_lock flag was not being cleared on a fork.
swap_pager.c, vnode_pager.c
Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the
code doesn't need it anymore.
machdep.c
Changes to better support the parameter values for the merged VM/buffer cache
scheme.
machdep.c, kern_exec.c, vm_glue.c
Implemented a seperate submap for temporary exec string space and another one
to contain process upages. This eliminates all map fragmentation problems
that previously existed.
ffs_inode.c, ufs_inode.c, ufs_readwrite.c
Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on
busy buffers.
Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
|
|
|
bp->b_vp = (struct vnode *) 0;
|
1999-01-21 08:29:12 +00:00
|
|
|
bp->b_flags &= ~B_PAGING;
|
|
|
|
}
|
|
|
|
|
2000-09-22 12:22:36 +00:00
|
|
|
/*
|
|
|
|
* Change the vnode a pager buffer is associated with.
|
|
|
|
*/
|
1999-01-21 08:29:12 +00:00
|
|
|
void
|
|
|
|
pbreassignbuf(bp, newvp)
|
|
|
|
struct buf *bp;
|
|
|
|
struct vnode *newvp;
|
|
|
|
{
|
2000-10-05 18:22:46 +00:00
|
|
|
|
2000-09-22 12:22:36 +00:00
|
|
|
KASSERT(bp->b_flags & B_PAGING,
|
|
|
|
("pbreassignbuf() on non phys bp %p", bp));
|
1999-01-21 08:29:12 +00:00
|
|
|
bp->b_vp = newvp;
|
These changes embody the support of the fully coherent merged VM buffer cache,
much higher filesystem I/O performance, and much better paging performance. It
represents the culmination of over 6 months of R&D.
The majority of the merged VM/cache work is by John Dyson.
The following highlights the most significant changes. Additionally, there are
(mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to
support the new VM/buffer scheme.
vfs_bio.c:
Significant rewrite of most of vfs_bio to support the merged VM buffer cache
scheme. The scheme is almost fully compatible with the old filesystem
interface. Significant improvement in the number of opportunities for write
clustering.
vfs_cluster.c, vfs_subr.c
Upgrade and performance enhancements in vfs layer code to support merged
VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff.
vm_object.c:
Yet more improvements in the collapse code. Elimination of some windows that
can cause list corruption.
vm_pageout.c:
Fixed it, it really works better now. Somehow in 2.0, some "enhancements"
broke the code. This code has been reworked from the ground-up.
vm_fault.c, vm_page.c, pmap.c, vm_object.c
Support for small-block filesystems with merged VM/buffer cache scheme.
pmap.c vm_map.c
Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of
kernel PTs.
vm_glue.c
Much simpler and more effective swapping code. No more gratuitous swapping.
proc.h
Fixed the problem that the p_lock flag was not being cleared on a fork.
swap_pager.c, vnode_pager.c
Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the
code doesn't need it anymore.
machdep.c
Changes to better support the parameter values for the merged VM/buffer cache
scheme.
machdep.c, kern_exec.c, vm_glue.c
Implemented a seperate submap for temporary exec string space and another one
to contain process upages. This eliminates all map fragmentation problems
that previously existed.
ffs_inode.c, ufs_inode.c, ufs_readwrite.c
Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on
busy buffers.
Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
|
|
|
}
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Reassign a buffer from one vnode to another.
|
|
|
|
* Used to assign file specific control information
|
|
|
|
* (indirect blocks) to the vnode to which they belong.
|
|
|
|
*/
|
1994-05-25 09:21:21 +00:00
|
|
|
void
|
1994-05-24 10:09:53 +00:00
|
|
|
reassignbuf(bp, newvp)
|
|
|
|
register struct buf *bp;
|
|
|
|
register struct vnode *newvp;
|
|
|
|
{
|
1998-03-08 09:59:44 +00:00
|
|
|
struct buflists *listheadp;
|
|
|
|
int delay;
|
1996-08-15 06:45:01 +00:00
|
|
|
int s;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
if (newvp == NULL) {
|
|
|
|
printf("reassignbuf: NULL");
|
|
|
|
return;
|
|
|
|
}
|
The buffer queue mechanism has been reformulated. Instead of having
QUEUE_AGE, QUEUE_LRU, and QUEUE_EMPTY we instead have QUEUE_CLEAN,
QUEUE_DIRTY, QUEUE_EMPTY, and QUEUE_EMPTYKVA. With this patch clean
and dirty buffers have been separated. Empty buffers with KVM
assignments have been separated from truly empty buffers. getnewbuf()
has been rewritten and now operates in a 100% optimal fashion. That is,
it is able to find precisely the right kind of buffer it needs to
allocate a new buffer, defragment KVM, or to free-up an existing buffer
when the buffer cache is full (which is a steady-state situation for
the buffer cache).
Buffer flushing has been reorganized. Previously buffers were flushed
in the context of whatever process hit the conditions forcing buffer
flushing to occur. This resulted in processes blocking on conditions
unrelated to what they were doing. This also resulted in inappropriate
VFS stacking chains due to multiple processes getting stuck trying to
flush dirty buffers or due to a single process getting into a situation
where it might attempt to flush buffers recursively - a situation that
was only partially fixed in prior commits. We have added a new daemon
called the buf_daemon which is responsible for flushing dirty buffers
when the number of dirty buffers exceeds the vfs.hidirtybuffers limit.
This daemon attempts to dynamically adjust the rate at which dirty buffers
are flushed such that getnewbuf() calls (almost) never block.
The number of nbufs and amount of buffer space is now scaled past the
8MB limit that was previously imposed for systems with over 64MB of
memory, and the vfs.{lo,hi}dirtybuffers limits have been relaxed
somewhat. The number of physical buffers has been increased with the
intention that we will manage physical I/O differently in the future.
reassignbuf previously attempted to keep the dirtyblkhd list sorted which
could result in non-deterministic operation under certain conditions,
such as when a large number of dirty buffers are being managed. This
algorithm has been changed. reassignbuf now keeps buffers locally sorted
if it can do so cheaply, and otherwise gives up and adds buffers to
the head of the dirtyblkhd list. The new algorithm is deterministic but
not perfect. The new algorithm greatly reduces problems that previously
occurred when write_behind was turned off in the system.
The P_FLSINPROG proc->p_flag bit has been replaced by the more descriptive
P_BUFEXHAUST bit. This bit allows processes working with filesystem
buffers to use available emergency reserves. Normal processes do not set
this bit and are not allowed to dig into emergency reserves. The purpose
of this bit is to avoid low-memory deadlocks.
A small race condition was fixed in getpbuf() in vm/vm_pager.c.
Submitted by: Matthew Dillon <dillon@apollo.backplane.com>
Reviewed by: Kirk McKusick <mckusick@mckusick.com>
1999-07-04 00:25:38 +00:00
|
|
|
++reassignbufcalls;
|
1996-08-15 06:45:01 +00:00
|
|
|
|
1999-01-21 08:29:12 +00:00
|
|
|
/*
|
|
|
|
* B_PAGING flagged buffers cannot be reassigned because their vp
|
|
|
|
* is not fully linked in.
|
|
|
|
*/
|
|
|
|
if (bp->b_flags & B_PAGING)
|
|
|
|
panic("cannot reassign paging buffer");
|
|
|
|
|
1996-08-15 06:45:01 +00:00
|
|
|
s = splbio();
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Delete from old vnode list, if on one.
|
|
|
|
*/
|
1999-12-22 03:11:04 +00:00
|
|
|
if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) {
|
|
|
|
if (bp->b_xflags & BX_VNDIRTY)
|
1999-03-12 02:24:58 +00:00
|
|
|
listheadp = &bp->b_vp->v_dirtyblkhd;
|
1998-10-31 14:20:39 +00:00
|
|
|
else
|
1999-03-12 02:24:58 +00:00
|
|
|
listheadp = &bp->b_vp->v_cleanblkhd;
|
1998-10-31 14:20:39 +00:00
|
|
|
TAILQ_REMOVE(listheadp, bp, b_vnbufs);
|
1999-12-22 03:11:04 +00:00
|
|
|
bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
|
1999-03-12 02:24:58 +00:00
|
|
|
if (bp->b_vp != newvp) {
|
|
|
|
vdrop(bp->b_vp);
|
|
|
|
bp->b_vp = NULL; /* for clarification */
|
|
|
|
}
|
1997-08-31 07:32:39 +00:00
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
These changes embody the support of the fully coherent merged VM buffer cache,
much higher filesystem I/O performance, and much better paging performance. It
represents the culmination of over 6 months of R&D.
The majority of the merged VM/cache work is by John Dyson.
The following highlights the most significant changes. Additionally, there are
(mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to
support the new VM/buffer scheme.
vfs_bio.c:
Significant rewrite of most of vfs_bio to support the merged VM buffer cache
scheme. The scheme is almost fully compatible with the old filesystem
interface. Significant improvement in the number of opportunities for write
clustering.
vfs_cluster.c, vfs_subr.c
Upgrade and performance enhancements in vfs layer code to support merged
VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff.
vm_object.c:
Yet more improvements in the collapse code. Elimination of some windows that
can cause list corruption.
vm_pageout.c:
Fixed it, it really works better now. Somehow in 2.0, some "enhancements"
broke the code. This code has been reworked from the ground-up.
vm_fault.c, vm_page.c, pmap.c, vm_object.c
Support for small-block filesystems with merged VM/buffer cache scheme.
pmap.c vm_map.c
Dynamic kernel VM size, now we don't have to pre-allocate excessive numbers of
kernel PTs.
vm_glue.c
Much simpler and more effective swapping code. No more gratuitous swapping.
proc.h
Fixed the problem that the p_lock flag was not being cleared on a fork.
swap_pager.c, vnode_pager.c
Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the
code doesn't need it anymore.
machdep.c
Changes to better support the parameter values for the merged VM/buffer cache
scheme.
machdep.c, kern_exec.c, vm_glue.c
Implemented a separate submap for temporary exec string space and another one
to contain process upages. This eliminates all map fragmentation problems
that previously existed.
ffs_inode.c, ufs_inode.c, ufs_readwrite.c
Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on
busy buffers.
Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
|
|
|
* If dirty, put on list of dirty buffers; otherwise insert onto list
|
|
|
|
* of clean buffers.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
These changes embody the support of the fully coherent merged VM buffer cache,
much higher filesystem I/O performance, and much better paging performance. It
represents the culmination of over 6 months of R&D.
The majority of the merged VM/cache work is by John Dyson.
The following highlights the most significant changes. Additionally, there are
(mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to
support the new VM/buffer scheme.
vfs_bio.c:
Significant rewrite of most of vfs_bio to support the merged VM buffer cache
scheme. The scheme is almost fully compatible with the old filesystem
interface. Significant improvement in the number of opportunities for write
clustering.
vfs_cluster.c, vfs_subr.c
Upgrade and performance enhancements in vfs layer code to support merged
VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff.
vm_object.c:
Yet more improvements in the collapse code. Elimination of some windows that
can cause list corruption.
vm_pageout.c:
Fixed it, it really works better now. Somehow in 2.0, some "enhancements"
broke the code. This code has been reworked from the ground-up.
vm_fault.c, vm_page.c, pmap.c, vm_object.c
Support for small-block filesystems with merged VM/buffer cache scheme.
pmap.c vm_map.c
Dynamic kernel VM size, now we don't have to pre-allocate excessive numbers of
kernel PTs.
vm_glue.c
Much simpler and more effective swapping code. No more gratuitous swapping.
proc.h
Fixed the problem that the p_lock flag was not being cleared on a fork.
swap_pager.c, vnode_pager.c
Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the
code doesn't need it anymore.
machdep.c
Changes to better support the parameter values for the merged VM/buffer cache
scheme.
machdep.c, kern_exec.c, vm_glue.c
Implemented a separate submap for temporary exec string space and another one
to contain process upages. This eliminates all map fragmentation problems
that previously existed.
ffs_inode.c, ufs_inode.c, ufs_readwrite.c
Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on
busy buffers.
Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
|
|
|
if (bp->b_flags & B_DELWRI) {
|
|
|
|
struct buf *tbp;
|
|
|
|
|
1998-03-08 09:59:44 +00:00
|
|
|
listheadp = &newvp->v_dirtyblkhd;
|
|
|
|
if ((newvp->v_flag & VONWORKLST) == 0) {
|
|
|
|
switch (newvp->v_type) {
|
|
|
|
case VDIR:
|
1999-06-15 23:37:29 +00:00
|
|
|
delay = dirdelay;
|
1998-03-08 09:59:44 +00:00
|
|
|
break;
|
1999-11-22 10:33:55 +00:00
|
|
|
case VCHR:
|
2000-10-09 17:31:39 +00:00
|
|
|
if (newvp->v_rdev->si_mountpoint != NULL) {
|
1999-06-15 23:37:29 +00:00
|
|
|
delay = metadelay;
|
1998-03-08 09:59:44 +00:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
/* fall through */
|
|
|
|
default:
|
1999-06-15 23:37:29 +00:00
|
|
|
delay = filedelay;
|
1998-03-08 09:59:44 +00:00
|
|
|
}
|
|
|
|
vn_syncer_add_to_worklist(newvp, delay);
|
|
|
|
}
|
1999-12-22 03:11:04 +00:00
|
|
|
bp->b_xflags |= BX_VNDIRTY;
|
1998-10-31 14:20:39 +00:00
|
|
|
tbp = TAILQ_FIRST(listheadp);
|
|
|
|
if (tbp == NULL ||
|
The buffer queue mechanism has been reformulated. Instead of having
QUEUE_AGE, QUEUE_LRU, and QUEUE_EMPTY we instead have QUEUE_CLEAN,
QUEUE_DIRTY, QUEUE_EMPTY, and QUEUE_EMPTYKVA. With this patch clean
and dirty buffers have been separated. Empty buffers with KVM
assignments have been separated from truly empty buffers. getnewbuf()
has been rewritten and now operates in a 100% optimal fashion. That is,
it is able to find precisely the right kind of buffer it needs to
allocate a new buffer, defragment KVM, or to free-up an existing buffer
when the buffer cache is full (which is a steady-state situation for
the buffer cache).
Buffer flushing has been reorganized. Previously buffers were flushed
in the context of whatever process hit the conditions forcing buffer
flushing to occur. This resulted in processes blocking on conditions
unrelated to what they were doing. This also resulted in inappropriate
VFS stacking chains due to multiple processes getting stuck trying to
flush dirty buffers or due to a single process getting into a situation
where it might attempt to flush buffers recursively - a situation that
was only partially fixed in prior commits. We have added a new daemon
called the buf_daemon which is responsible for flushing dirty buffers
when the number of dirty buffers exceeds the vfs.hidirtybuffers limit.
This daemon attempts to dynamically adjust the rate at which dirty buffers
are flushed such that getnewbuf() calls (almost) never block.
The number of nbufs and amount of buffer space is now scaled past the
8MB limit that was previously imposed for systems with over 64MB of
memory, and the vfs.{lo,hi}dirtybuffers limits have been relaxed
somewhat. The number of physical buffers has been increased with the
intention that we will manage physical I/O differently in the future.
reassignbuf previously attempted to keep the dirtyblkhd list sorted which
could result in non-deterministic operation under certain conditions,
such as when a large number of dirty buffers are being managed. This
algorithm has been changed. reassignbuf now keeps buffers locally sorted
if it can do so cheaply, and otherwise gives up and adds buffers to
the head of the dirtyblkhd list. The new algorithm is deterministic but
not perfect. The new algorithm greatly reduces problems that previously
occurred when write_behind was turned off in the system.
The P_FLSINPROG proc->p_flag bit has been replaced by the more descriptive
P_BUFEXHAUST bit. This bit allows processes working with filesystem
buffers to use available emergency reserves. Normal processes do not set
this bit and are not allowed to dig into emergency reserves. The purpose
of this bit is to avoid low-memory deadlocks.
A small race condition was fixed in getpbuf() in vm/vm_pager.c.
Submitted by: Matthew Dillon <dillon@apollo.backplane.com>
Reviewed by: Kirk McKusick <mckusick@mckusick.com>
1999-07-04 00:25:38 +00:00
|
|
|
bp->b_lblkno == 0 ||
|
2000-01-05 05:11:37 +00:00
|
|
|
(bp->b_lblkno > 0 && tbp->b_lblkno < 0) ||
|
The buffer queue mechanism has been reformulated. Instead of having
QUEUE_AGE, QUEUE_LRU, and QUEUE_EMPTY we instead have QUEUE_CLEAN,
QUEUE_DIRTY, QUEUE_EMPTY, and QUEUE_EMPTYKVA. With this patch clean
and dirty buffers have been separated. Empty buffers with KVM
assignments have been separated from truly empty buffers. getnewbuf()
has been rewritten and now operates in a 100% optimal fashion. That is,
it is able to find precisely the right kind of buffer it needs to
allocate a new buffer, defragment KVM, or to free-up an existing buffer
when the buffer cache is full (which is a steady-state situation for
the buffer cache).
Buffer flushing has been reorganized. Previously buffers were flushed
in the context of whatever process hit the conditions forcing buffer
flushing to occur. This resulted in processes blocking on conditions
unrelated to what they were doing. This also resulted in inappropriate
VFS stacking chains due to multiple processes getting stuck trying to
flush dirty buffers or due to a single process getting into a situation
where it might attempt to flush buffers recursively - a situation that
was only partially fixed in prior commits. We have added a new daemon
called the buf_daemon which is responsible for flushing dirty buffers
when the number of dirty buffers exceeds the vfs.hidirtybuffers limit.
This daemon attempts to dynamically adjust the rate at which dirty buffers
are flushed such that getnewbuf() calls (almost) never block.
The number of nbufs and amount of buffer space is now scaled past the
8MB limit that was previously imposed for systems with over 64MB of
memory, and the vfs.{lo,hi}dirtybuffers limits have been relaxed
somewhat. The number of physical buffers has been increased with the
intention that we will manage physical I/O differently in the future.
reassignbuf previously attempted to keep the dirtyblkhd list sorted which
could result in non-deterministic operation under certain conditions,
such as when a large number of dirty buffers are being managed. This
algorithm has been changed. reassignbuf now keeps buffers locally sorted
if it can do so cheaply, and otherwise gives up and adds buffers to
the head of the dirtyblkhd list. The new algorithm is deterministic but
not perfect. The new algorithm greatly reduces problems that previously
occurred when write_behind was turned off in the system.
The P_FLSINPROG proc->p_flag bit has been replaced by the more descriptive
P_BUFEXHAUST bit. This bit allows processes working with filesystem
buffers to use available emergency reserves. Normal processes do not set
this bit and are not allowed to dig into emergency reserves. The purpose
of this bit is to avoid low-memory deadlocks.
A small race condition was fixed in getpbuf() in vm/vm_pager.c.
Submitted by: Matthew Dillon <dillon@apollo.backplane.com>
Reviewed by: Kirk McKusick <mckusick@mckusick.com>
1999-07-04 00:25:38 +00:00
|
|
|
(bp->b_lblkno > 0 && bp->b_lblkno < tbp->b_lblkno)) {
|
1998-10-31 14:20:39 +00:00
|
|
|
TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs);
|
The buffer queue mechanism has been reformulated. Instead of having
QUEUE_AGE, QUEUE_LRU, and QUEUE_EMPTY we instead have QUEUE_CLEAN,
QUEUE_DIRTY, QUEUE_EMPTY, and QUEUE_EMPTYKVA. With this patch clean
and dirty buffers have been separated. Empty buffers with KVM
assignments have been separated from truly empty buffers. getnewbuf()
has been rewritten and now operates in a 100% optimal fashion. That is,
it is able to find precisely the right kind of buffer it needs to
allocate a new buffer, defragment KVM, or to free-up an existing buffer
when the buffer cache is full (which is a steady-state situation for
the buffer cache).
Buffer flushing has been reorganized. Previously buffers were flushed
in the context of whatever process hit the conditions forcing buffer
flushing to occur. This resulted in processes blocking on conditions
unrelated to what they were doing. This also resulted in inappropriate
VFS stacking chains due to multiple processes getting stuck trying to
flush dirty buffers or due to a single process getting into a situation
where it might attempt to flush buffers recursively - a situation that
was only partially fixed in prior commits. We have added a new daemon
called the buf_daemon which is responsible for flushing dirty buffers
when the number of dirty buffers exceeds the vfs.hidirtybuffers limit.
This daemon attempts to dynamically adjust the rate at which dirty buffers
are flushed such that getnewbuf() calls (almost) never block.
The number of nbufs and amount of buffer space is now scaled past the
8MB limit that was previously imposed for systems with over 64MB of
memory, and the vfs.{lo,hi}dirtybuffers limits have been relaxed
somewhat. The number of physical buffers has been increased with the
intention that we will manage physical I/O differently in the future.
reassignbuf previously attempted to keep the dirtyblkhd list sorted which
could result in non-deterministic operation under certain conditions,
such as when a large number of dirty buffers are being managed. This
algorithm has been changed. reassignbuf now keeps buffers locally sorted
if it can do so cheaply, and otherwise gives up and adds buffers to
the head of the dirtyblkhd list. The new algorithm is deterministic but
not perfect. The new algorithm greatly reduces problems that previously
occurred when write_behind was turned off in the system.
The P_FLSINPROG proc->p_flag bit has been replaced by the more descriptive
P_BUFEXHAUST bit. This bit allows processes working with filesystem
buffers to use available emergency reserves. Normal processes do not set
this bit and are not allowed to dig into emergency reserves. The purpose
of this bit is to avoid low-memory deadlocks.
A small race condition was fixed in getpbuf() in vm/vm_pager.c.
Submitted by: Matthew Dillon <dillon@apollo.backplane.com>
Reviewed by: Kirk McKusick <mckusick@mckusick.com>
1999-07-04 00:25:38 +00:00
|
|
|
++reassignbufsortgood;
|
|
|
|
} else if (bp->b_lblkno < 0) {
|
|
|
|
TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs);
|
|
|
|
++reassignbufsortgood;
|
|
|
|
} else if (reassignbufmethod == 1) {
|
|
|
|
/*
|
|
|
|
* New sorting algorithm, only handle sequential case,
|
2000-01-05 05:11:37 +00:00
|
|
|
* otherwise append to end (but before metadata)
|
The buffer queue mechanism has been reformulated. Instead of having
QUEUE_AGE, QUEUE_LRU, and QUEUE_EMPTY we instead have QUEUE_CLEAN,
QUEUE_DIRTY, QUEUE_EMPTY, and QUEUE_EMPTYKVA. With this patch clean
and dirty buffers have been separated. Empty buffers with KVM
assignments have been separated from truly empty buffers. getnewbuf()
has been rewritten and now operates in a 100% optimal fashion. That is,
it is able to find precisely the right kind of buffer it needs to
allocate a new buffer, defragment KVM, or to free-up an existing buffer
when the buffer cache is full (which is a steady-state situation for
the buffer cache).
Buffer flushing has been reorganized. Previously buffers were flushed
in the context of whatever process hit the conditions forcing buffer
flushing to occur. This resulted in processes blocking on conditions
unrelated to what they were doing. This also resulted in inappropriate
VFS stacking chains due to multiple processes getting stuck trying to
flush dirty buffers or due to a single process getting into a situation
where it might attempt to flush buffers recursively - a situation that
was only partially fixed in prior commits. We have added a new daemon
called the buf_daemon which is responsible for flushing dirty buffers
when the number of dirty buffers exceeds the vfs.hidirtybuffers limit.
This daemon attempts to dynamically adjust the rate at which dirty buffers
are flushed such that getnewbuf() calls (almost) never block.
The number of nbufs and amount of buffer space is now scaled past the
8MB limit that was previously imposed for systems with over 64MB of
memory, and the vfs.{lo,hi}dirtybuffers limits have been relaxed
somewhat. The number of physical buffers has been increased with the
intention that we will manage physical I/O differently in the future.
reassignbuf previously attempted to keep the dirtyblkhd list sorted which
could result in non-deterministic operation under certain conditions,
such as when a large number of dirty buffers are being managed. This
algorithm has been changed. reassignbuf now keeps buffers locally sorted
if it can do so cheaply, and otherwise gives up and adds buffers to
the head of the dirtyblkhd list. The new algorithm is deterministic but
not perfect. The new algorithm greatly reduces problems that previously
occurred when write_behind was turned off in the system.
The P_FLSINPROG proc->p_flag bit has been replaced by the more descriptive
P_BUFEXHAUST bit. This bit allows processes working with filesystem
buffers to use available emergency reserves. Normal processes do not set
this bit and are not allowed to dig into emergency reserves. The purpose
of this bit is to avoid low-memory deadlocks.
A small race condition was fixed in getpbuf() in vm/vm_pager.c.
Submitted by: Matthew Dillon <dillon@apollo.backplane.com>
Reviewed by: Kirk McKusick <mckusick@mckusick.com>
1999-07-04 00:25:38 +00:00
|
|
|
*/
|
|
|
|
if ((tbp = gbincore(newvp, bp->b_lblkno - 1)) != NULL &&
|
1999-12-22 03:11:04 +00:00
|
|
|
(tbp->b_xflags & BX_VNDIRTY)) {
|
2000-01-05 05:11:37 +00:00
|
|
|
/*
|
|
|
|
* Found the best place to insert the buffer
|
|
|
|
*/
|
1998-10-31 14:20:39 +00:00
|
|
|
TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
|
The buffer queue mechanism has been reformulated. Instead of having
QUEUE_AGE, QUEUE_LRU, and QUEUE_EMPTY we instead have QUEUE_CLEAN,
QUEUE_DIRTY, QUEUE_EMPTY, and QUEUE_EMPTYKVA. With this patch clean
and dirty buffers have been separated. Empty buffers with KVM
assignments have been separated from truly empty buffers. getnewbuf()
has been rewritten and now operates in a 100% optimal fashion. That is,
it is able to find precisely the right kind of buffer it needs to
allocate a new buffer, defragment KVM, or to free-up an existing buffer
when the buffer cache is full (which is a steady-state situation for
the buffer cache).
Buffer flushing has been reorganized. Previously buffers were flushed
in the context of whatever process hit the conditions forcing buffer
flushing to occur. This resulted in processes blocking on conditions
unrelated to what they were doing. This also resulted in inappropriate
VFS stacking chains due to multiple processes getting stuck trying to
flush dirty buffers or due to a single process getting into a situation
where it might attempt to flush buffers recursively - a situation that
was only partially fixed in prior commits. We have added a new daemon
called the buf_daemon which is responsible for flushing dirty buffers
when the number of dirty buffers exceeds the vfs.hidirtybuffers limit.
This daemon attempts to dynamically adjust the rate at which dirty buffers
are flushed such that getnewbuf() calls (almost) never block.
The number of nbufs and amount of buffer space is now scaled past the
8MB limit that was previously imposed for systems with over 64MB of
memory, and the vfs.{lo,hi}dirtybuffers limits have been relaxed
somewhat. The number of physical buffers has been increased with the
intention that we will manage physical I/O differently in the future.
reassignbuf previously attempted to keep the dirtyblkhd list sorted which
could result in non-deterministic operation under certain conditions,
such as when a large number of dirty buffers are being managed. This
algorithm has been changed. reassignbuf now keeps buffers locally sorted
if it can do so cheaply, and otherwise gives up and adds buffers to
the head of the dirtyblkhd list. The new algorithm is deterministic but
not perfect. The new algorithm greatly reduces problems that previously
occurred when write_behind was turned off in the system.
The P_FLSINPROG proc->p_flag bit has been replaced by the more descriptive
P_BUFEXHAUST bit. This bit allows processes working with filesystem
buffers to use available emergency reserves. Normal processes do not set
this bit and are not allowed to dig into emergency reserves. The purpose
of this bit is to avoid low-memory deadlocks.
A small race condition was fixed in getpbuf() in vm/vm_pager.c.
Submitted by: Matthew Dillon <dillon@apollo.backplane.com>
Reviewed by: Kirk McKusick <mckusick@mckusick.com>
1999-07-04 00:25:38 +00:00
|
|
|
++reassignbufsortgood;
|
1998-10-31 14:20:39 +00:00
|
|
|
} else {
|
2000-01-05 05:11:37 +00:00
|
|
|
/*
|
|
|
|
* Missed, append to end, but before meta-data.
|
|
|
|
* We know that the head buffer in the list is
|
|
|
|
* not meta-data due to prior conditionals.
|
|
|
|
*
|
|
|
|
* Indirect effects: NFS second stage write
|
|
|
|
* tends to wind up here, giving maximum
|
|
|
|
* distance between the unstable write and the
|
|
|
|
* commit rpc.
|
|
|
|
*/
|
|
|
|
tbp = TAILQ_LAST(listheadp, buflists);
|
|
|
|
while (tbp && tbp->b_lblkno < 0)
|
|
|
|
tbp = TAILQ_PREV(tbp, buflists, b_vnbufs);
|
|
|
|
TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
|
The buffer queue mechanism has been reformulated. Instead of having
QUEUE_AGE, QUEUE_LRU, and QUEUE_EMPTY we instead have QUEUE_CLEAN,
QUEUE_DIRTY, QUEUE_EMPTY, and QUEUE_EMPTYKVA. With this patch clean
and dirty buffers have been separated. Empty buffers with KVM
assignments have been separated from truly empty buffers. getnewbuf()
has been rewritten and now operates in a 100% optimal fashion. That is,
it is able to find precisely the right kind of buffer it needs to
allocate a new buffer, defragment KVM, or to free-up an existing buffer
when the buffer cache is full (which is a steady-state situation for
the buffer cache).
Buffer flushing has been reorganized. Previously buffers were flushed
in the context of whatever process hit the conditions forcing buffer
flushing to occur. This resulted in processes blocking on conditions
unrelated to what they were doing. This also resulted in inappropriate
VFS stacking chains due to multiple processes getting stuck trying to
flush dirty buffers or due to a single process getting into a situation
where it might attempt to flush buffers recursively - a situation that
was only partially fixed in prior commits. We have added a new daemon
called the buf_daemon which is responsible for flushing dirty buffers
when the number of dirty buffers exceeds the vfs.hidirtybuffers limit.
This daemon attempts to dynamically adjust the rate at which dirty buffers
are flushed such that getnewbuf() calls (almost) never block.
The number of nbufs and amount of buffer space is now scaled past the
8MB limit that was previously imposed for systems with over 64MB of
memory, and the vfs.{lo,hi}dirtybuffers limits have been relaxed
somewhat. The number of physical buffers has been increased with the
intention that we will manage physical I/O differently in the future.
reassignbuf previously attempted to keep the dirtyblkhd list sorted which
could result in non-deterministic operation under certain conditions,
such as when a large number of dirty buffers are being managed. This
algorithm has been changed. reassignbuf now keeps buffers locally sorted
if it can do so cheaply, and otherwise gives up and adds buffers to
the head of the dirtyblkhd list. The new algorithm is deterministic but
not perfect. The new algorithm greatly reduces problems that previously
occurred when write_behind was turned off in the system.
The P_FLSINPROG proc->p_flag bit has been replaced by the more descriptive
P_BUFEXHAUST bit. This bit allows processes working with filesystem
buffers to use available emergency reserves. Normal processes do not set
this bit and are not allowed to dig into emergency reserves. The purpose
of this bit is to avoid low-memory deadlocks.
A small race condition was fixed in getpbuf() in vm/vm_pager.c.
Submitted by: Matthew Dillon <dillon@apollo.backplane.com>
Reviewed by: Kirk McKusick <mckusick@mckusick.com>
1999-07-04 00:25:38 +00:00
|
|
|
++reassignbufsortbad;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* Old sorting algorithm, scan queue and insert
|
|
|
|
*/
|
|
|
|
struct buf *ttbp;
|
|
|
|
while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) &&
|
|
|
|
(ttbp->b_lblkno < bp->b_lblkno)) {
|
|
|
|
++reassignbufloops;
|
|
|
|
tbp = ttbp;
|
These changes embody the support of the fully coherent merged VM buffer cache,
much higher filesystem I/O performance, and much better paging performance. It
represents the culmination of over 6 months of R&D.
The majority of the merged VM/cache work is by John Dyson.
The following highlights the most significant changes. Additionally, there are
(mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to
support the new VM/buffer scheme.
vfs_bio.c:
Significant rewrite of most of vfs_bio to support the merged VM buffer cache
scheme. The scheme is almost fully compatible with the old filesystem
interface. Significant improvement in the number of opportunities for write
clustering.
vfs_cluster.c, vfs_subr.c
Upgrade and performance enhancements in vfs layer code to support merged
VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff.
vm_object.c:
Yet more improvements in the collapse code. Elimination of some windows that
can cause list corruption.
vm_pageout.c:
Fixed it, it really works better now. Somehow in 2.0, some "enhancements"
broke the code. This code has been reworked from the ground-up.
vm_fault.c, vm_page.c, pmap.c, vm_object.c
Support for small-block filesystems with merged VM/buffer cache scheme.
pmap.c vm_map.c
Dynamic kernel VM size, now we don't have to pre-allocate excessive numbers of
kernel PTs.
vm_glue.c
Much simpler and more effective swapping code. No more gratuitous swapping.
proc.h
Fixed the problem that the p_lock flag was not being cleared on a fork.
swap_pager.c, vnode_pager.c
Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the
code doesn't need it anymore.
machdep.c
Changes to better support the parameter values for the merged VM/buffer cache
scheme.
machdep.c, kern_exec.c, vm_glue.c
Implemented a separate submap for temporary exec string space and another one
to contain process upages. This eliminates all map fragmentation problems
that previously existed.
ffs_inode.c, ufs_inode.c, ufs_readwrite.c
Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on
busy buffers.
Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
|
|
|
}
|
The buffer queue mechanism has been reformulated. Instead of having
QUEUE_AGE, QUEUE_LRU, and QUEUE_EMPTY we instead have QUEUE_CLEAN,
QUEUE_DIRTY, QUEUE_EMPTY, and QUEUE_EMPTYKVA. With this patch clean
and dirty buffers have been separated. Empty buffers with KVM
assignments have been separated from truly empty buffers. getnewbuf()
has been rewritten and now operates in a 100% optimal fashion. That is,
it is able to find precisely the right kind of buffer it needs to
allocate a new buffer, defragment KVM, or to free-up an existing buffer
when the buffer cache is full (which is a steady-state situation for
the buffer cache).
Buffer flushing has been reorganized. Previously buffers were flushed
in the context of whatever process hit the conditions forcing buffer
flushing to occur. This resulted in processes blocking on conditions
unrelated to what they were doing. This also resulted in inappropriate
VFS stacking chains due to multiple processes getting stuck trying to
flush dirty buffers or due to a single process getting into a situation
where it might attempt to flush buffers recursively - a situation that
was only partially fixed in prior commits. We have added a new daemon
called the buf_daemon which is responsible for flushing dirty buffers
when the number of dirty buffers exceeds the vfs.hidirtybuffers limit.
This daemon attempts to dynamically adjust the rate at which dirty buffers
are flushed such that getnewbuf() calls (almost) never block.
The number of nbufs and amount of buffer space is now scaled past the
8MB limit that was previously imposed for systems with over 64MB of
memory, and the vfs.{lo,hi}dirtybuffers limits have been relaxed
somewhat. The number of physical buffers has been increased with the
intention that we will manage physical I/O differently in the future.
reassignbuf previously attempted to keep the dirtyblkhd list sorted which
could result in non-deterministic operation under certain conditions,
such as when a large number of dirty buffers are being managed. This
algorithm has been changed. reassignbuf now keeps buffers locally sorted
if it can do so cheaply, and otherwise gives up and adds buffers to
the head of the dirtyblkhd list. The new algorithm is deterministic but
not perfect. The new algorithm greatly reduces problems that previously
occurred when write_behind was turned off in the system.
The P_FLSINPROG proc->p_flag bit has been replaced by the more descriptive
P_BUFEXHAUST bit. This bit allows processes working with filesystem
buffers to use available emergency reserves. Normal processes do not set
this bit and are not allowed to dig into emergency reserves. The purpose
of this bit is to avoid low-memory deadlocks.
A small race condition was fixed in getpbuf() in vm/vm_pager.c.
Submitted by: Matthew Dillon <dillon@apollo.backplane.com>
Reviewed by: Kirk McKusick <mckusick@mckusick.com>
1999-07-04 00:25:38 +00:00
|
|
|
TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs);
|
These changes embody the support of the fully coherent merged VM buffer cache,
much higher filesystem I/O performance, and much better paging performance. It
represents the culmination of over 6 months of R&D.
The majority of the merged VM/cache work is by John Dyson.
The following highlights the most significant changes. Additionally, there are
(mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to
support the new VM/buffer scheme.
vfs_bio.c:
Significant rewrite of most of vfs_bio to support the merged VM buffer cache
scheme. The scheme is almost fully compatible with the old filesystem
interface. Significant improvement in the number of opportunities for write
clustering.
vfs_cluster.c, vfs_subr.c
Upgrade and performance enhancements in vfs layer code to support merged
VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff.
vm_object.c:
Yet more improvements in the collapse code. Elimination of some windows that
can cause list corruption.
vm_pageout.c:
Fixed it, it really works better now. Somehow in 2.0, some "enhancements"
broke the code. This code has been reworked from the ground-up.
vm_fault.c, vm_page.c, pmap.c, vm_object.c
Support for small-block filesystems with merged VM/buffer cache scheme.
pmap.c vm_map.c
Dynamic kernel VM size, now we don't have to pre-allocate excessive numbers of
kernel PTs.
vm_glue.c
Much simpler and more effective swapping code. No more gratuitous swapping.
proc.h
Fixed the problem that the p_lock flag was not being cleared on a fork.
swap_pager.c, vnode_pager.c
Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the
code doesn't need it anymore.
machdep.c
Changes to better support the parameter values for the merged VM/buffer cache
scheme.
machdep.c, kern_exec.c, vm_glue.c
Implemented a separate submap for temporary exec string space and another one
to contain process upages. This eliminates all map fragmentation problems
that previously existed.
ffs_inode.c, ufs_inode.c, ufs_readwrite.c
Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on
busy buffers.
Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
|
|
|
}
|
|
|
|
} else {
|
1999-12-22 03:11:04 +00:00
|
|
|
bp->b_xflags |= BX_VNCLEAN;
|
1998-10-31 14:20:39 +00:00
|
|
|
TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs);
|
1998-03-08 09:59:44 +00:00
|
|
|
if ((newvp->v_flag & VONWORKLST) &&
|
1998-10-31 14:20:39 +00:00
|
|
|
TAILQ_EMPTY(&newvp->v_dirtyblkhd)) {
|
1998-03-08 09:59:44 +00:00
|
|
|
newvp->v_flag &= ~VONWORKLST;
|
|
|
|
LIST_REMOVE(newvp, v_synclist);
|
|
|
|
}
|
These changes embody the support of the fully coherent merged VM buffer cache,
much higher filesystem I/O performance, and much better paging performance. It
represents the culmination of over 6 months of R&D.
The majority of the merged VM/cache work is by John Dyson.
The following highlights the most significant changes. Additionally, there are
(mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to
support the new VM/buffer scheme.
vfs_bio.c:
Significant rewrite of most of vfs_bio to support the merged VM buffer cache
scheme. The scheme is almost fully compatible with the old filesystem
interface. Significant improvement in the number of opportunities for write
clustering.
vfs_cluster.c, vfs_subr.c
Upgrade and performance enhancements in vfs layer code to support merged
VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff.
vm_object.c:
Yet more improvements in the collapse code. Elimination of some windows that
can cause list corruption.
vm_pageout.c:
Fixed it, it really works better now. Somehow in 2.0, some "enhancements"
broke the code. This code has been reworked from the ground-up.
vm_fault.c, vm_page.c, pmap.c, vm_object.c
Support for small-block filesystems with merged VM/buffer cache scheme.
pmap.c vm_map.c
Dynamic kernel VM size, now we don't have to pre-allocate excessive numbers of
kernel PTs.
vm_glue.c
Much simpler and more effective swapping code. No more gratuitous swapping.
proc.h
Fixed the problem that the p_lock flag was not being cleared on a fork.
swap_pager.c, vnode_pager.c
Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the
code doesn't need it anymore.
machdep.c
Changes to better support the parameter values for the merged VM/buffer cache
scheme.
machdep.c, kern_exec.c, vm_glue.c
Implemented a separate submap for temporary exec string space and another one
to contain process upages. This eliminates all map fragmentation problems
that previously existed.
ffs_inode.c, ufs_inode.c, ufs_readwrite.c
Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on
busy buffers.
Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
|
|
|
}
|
1999-03-12 02:24:58 +00:00
|
|
|
if (bp->b_vp != newvp) {
|
|
|
|
bp->b_vp = newvp;
|
|
|
|
vhold(bp->b_vp);
|
|
|
|
}
|
1996-08-15 06:45:01 +00:00
|
|
|
splx(s);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2000-11-02 21:14:13 +00:00
|
|
|
* Create a vnode for a device.
|
1997-09-07 16:21:11 +00:00
|
|
|
* Used for mounting the root file system.
*
* dev is the device to wrap; vpp receives the resulting vnode.
* Returns ENXIO with *vpp set to NULLVP when dev is NODEV.  When
* vfinddev(dev, VCHR, vpp) returns non-zero (presumably meaning an
* existing VCHR vnode for dev was found and stored through vpp --
* confirm against vfinddev()), 0 is returned without creating anything.
* Otherwise a new vnode is requested from getnewvnode(); on failure
* *vpp is set to NULLVP and that error is returned, on success the
* vnode is marked VCHR, tied to dev with addalias(), stored in *vpp
* and 0 is returned.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
1994-05-24 10:09:53 +00:00
|
|
|
bdevvp(dev, vpp)
|
|
|
|
dev_t dev;
|
|
|
|
struct vnode **vpp;
|
|
|
|
{
|
|
|
|
register struct vnode *vp;
|
|
|
|
struct vnode *nvp;
|
|
|
|
int error;
|
|
|
|
|
1999-05-31 11:29:30 +00:00
|
|
|
if (dev == NODEV) {
|
1998-10-25 16:11:49 +00:00
|
|
|
*vpp = NULLVP;
|
|
|
|
return (ENXIO);
|
|
|
|
}
|
2000-09-27 18:03:17 +00:00
|
|
|
if (vfinddev(dev, VCHR, vpp))
|
|
|
|
return (0);
|
1998-10-25 16:11:49 +00:00
|
|
|
error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
|
1994-05-24 10:09:53 +00:00
|
|
|
if (error) {
|
1998-10-25 16:11:49 +00:00
|
|
|
*vpp = NULLVP;
|
1994-05-24 10:09:53 +00:00
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
vp = nvp;
|
2000-05-14 07:43:12 +00:00
|
|
|
vp->v_type = VCHR;
|
1999-08-26 14:53:31 +00:00
|
|
|
addalias(vp, dev);
|
1994-05-24 10:09:53 +00:00
|
|
|
*vpp = vp;
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
1999-08-26 14:53:31 +00:00
|
|
|
* Add vnode to the alias list hung off the dev_t.
|
|
|
|
*
|
|
|
|
* The reason for this gunk is that multiple vnodes can reference
|
|
|
|
* the same physical device, so checking vp->v_usecount to see
|
|
|
|
* how many users there are is inadequate; the v_usecount for
|
|
|
|
* the vnodes need to be accumulated. vcount() does that.
*
* addaliasu() takes the device number in udev_t form (nvp_rdev) and
* converts it with udev2dev().  A VBLK vnode is returned untouched and
* any other non-VCHR type panics.  If vfinddev() returns 0, or the
* vnode it hands back already has v_data set, nvp is aliased to the
* device with addalias() and returned.  Otherwise nvp's v_data, v_tag
* and v_op are moved onto the existing vnode (ovp), ovp's lock is
* re-initialised from nvp's lock parameters, ovp is placed on nvp's
* mount with insmntque(), nvp is released and vgone()'d, and ovp is
* returned.  Callers must therefore use the returned vnode rather
* than assume nvp is still valid.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
2000-07-24 05:28:33 +00:00
|
|
|
struct vnode *
|
1999-08-26 14:53:31 +00:00
|
|
|
addaliasu(nvp, nvp_rdev)
|
|
|
|
struct vnode *nvp;
|
Divorce "dev_t" from the "major|minor" bitmap, which is now called
udev_t in the kernel but still called dev_t in userland.
Provide functions to manipulate both types:
major() umajor()
minor() uminor()
makedev() umakedev()
dev2udev() udev2dev()
For now they're functions, they will become in-line functions
after one of the next two steps in this process.
Return major/minor/makedev to macro-hood for userland.
Register a name in cdevsw[] for the "filedescriptor" driver.
In the kernel the udev_t appears in places where we have the
major/minor number combination, (ie: a potential device: we
may not have the driver nor the device), like in inodes, vattr,
cdevsw registration and so on, whereas the dev_t appears where
we carry around a reference to a actual device.
In the future the cdevsw and the aliased-from vnode will be hung
directly from the dev_t, along with up to two softc pointers for
the device driver and a few houskeeping bits. This will essentially
replace the current "alias" check code (same buck, bigger bang).
A little stunt has been provided to try to catch places where the
wrong type is being used (dev_t vs udev_t), if you see something
not working, #undef DEVT_FASCIST in kern/kern_conf.c and see if
it makes a difference. If it does, please try to track it down
(many hands make light work) or at least try to reproduce it
as simply as possible, and describe how to do that.
Without DEVT_FASCIST I belive this patch is a no-op.
Stylistic/posixoid comments about the userland view of the <sys/*.h>
files welcome now, from userland they now contain the end result.
Next planned step: make all dev_t's refer to the same devsw[] which
means convert BLK's to CHR's at the perimeter of the vnodes and
other places where they enter the game (bootdev, mknod, sysctl).
1999-05-11 19:55:07 +00:00
|
|
|
udev_t nvp_rdev;
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2000-07-24 05:28:33 +00:00
|
|
|
struct vnode *ovp;
|
|
|
|
vop_t **ops;
|
|
|
|
dev_t dev;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2000-11-02 21:14:13 +00:00
|
|
|
if (nvp->v_type == VBLK)
|
|
|
|
return (nvp);
|
|
|
|
if (nvp->v_type != VCHR)
|
1999-08-26 14:53:31 +00:00
|
|
|
panic("addaliasu on non-special vnode");
|
2000-11-02 21:14:13 +00:00
|
|
|
dev = udev2dev(nvp_rdev, 0);
|
2000-07-24 05:28:33 +00:00
|
|
|
/*
|
|
|
|
* Check to see if we have a bdevvp vnode with no associated
|
|
|
|
* filesystem. If so, we want to associate the filesystem of
|
|
|
|
* the newly instigated vnode with the bdevvp vnode and
|
|
|
|
* discard the newly created vnode rather than leaving the
|
|
|
|
* bdevvp vnode lying around with no associated filesystem.
|
|
|
|
*/
|
|
|
|
if (vfinddev(dev, nvp->v_type, &ovp) == 0 || ovp->v_data != NULL) {
|
|
|
|
addalias(nvp, dev);
|
|
|
|
return (nvp);
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Discard unneeded vnode, but save its node specific data.
|
|
|
|
* Note that if there is a lock, it is carried over in the
|
|
|
|
* node specific data to the replacement vnode.
|
|
|
|
*/
|
|
|
|
vref(ovp);
|
|
|
|
ovp->v_data = nvp->v_data;
|
|
|
|
ovp->v_tag = nvp->v_tag;
|
|
|
|
nvp->v_data = NULL;
|
2001-01-31 04:54:23 +00:00
|
|
|
lockinit(&ovp->v_lock, PVFS, nvp->v_lock.lk_wmesg,
|
|
|
|
nvp->v_lock.lk_timo, nvp->v_lock.lk_flags & LK_EXTFLG_MASK);
|
2000-09-25 15:24:04 +00:00
|
|
|
if (nvp->v_vnlock)
|
|
|
|
ovp->v_vnlock = &ovp->v_lock;
|
2001-01-31 04:54:23 +00:00
|
|
|
ops = ovp->v_op;
|
|
|
|
ovp->v_op = nvp->v_op;
|
|
|
|
if (VOP_ISLOCKED(nvp, curproc)) {
|
|
|
|
VOP_UNLOCK(nvp, 0, curproc);
|
|
|
|
vn_lock(ovp, LK_EXCLUSIVE | LK_RETRY, curproc);
|
|
|
|
}
|
|
|
|
nvp->v_op = ops;
|
2000-07-24 05:28:33 +00:00
|
|
|
insmntque(ovp, nvp->v_mount);
|
|
|
|
vrele(nvp);
|
|
|
|
vgone(nvp);
|
|
|
|
return (ovp);
|
1999-07-20 09:47:55 +00:00
|
|
|
}
|
Divorce "dev_t" from the "major|minor" bitmap, which is now called
udev_t in the kernel but still called dev_t in userland.
Provide functions to manipulate both types:
major() umajor()
minor() uminor()
makedev() umakedev()
dev2udev() udev2dev()
For now they're functions, they will become in-line functions
after one of the next two steps in this process.
Return major/minor/makedev to macro-hood for userland.
Register a name in cdevsw[] for the "filedescriptor" driver.
In the kernel the udev_t appears in places where we have the
major/minor number combination, (ie: a potential device: we
may not have the driver nor the device), like in inodes, vattr,
cdevsw registration and so on, whereas the dev_t appears where
we carry around a reference to an actual device.
In the future the cdevsw and the aliased-from vnode will be hung
directly from the dev_t, along with up to two softc pointers for
the device driver and a few housekeeping bits. This will essentially
replace the current "alias" check code (same buck, bigger bang).
A little stunt has been provided to try to catch places where the
wrong type is being used (dev_t vs udev_t), if you see something
not working, #undef DEVT_FASCIST in kern/kern_conf.c and see if
it makes a difference. If it does, please try to track it down
(many hands make light work) or at least try to reproduce it
as simply as possible, and describe how to do that.
Without DEVT_FASCIST I believe this patch is a no-op.
Stylistic/posixoid comments about the userland view of the <sys/*.h>
files welcome now, from userland they now contain the end result.
Next planned step: make all dev_t's refer to the same devsw[] which
means convert BLK's to CHR's at the perimeter of the vnodes and
other places where they enter the game (bootdev, mknod, sysctl).
1999-05-11 19:55:07 +00:00
|
|
|
|
2000-10-05 18:22:46 +00:00
|
|
|
/* This is a local helper function that does the same as addaliasu, but for a
|
|
|
|
* dev_t instead of a udev_t.  It asserts that nvp is a VCHR vnode, records
* dev in nvp->v_rdev, and inserts nvp at the head of dev->si_hlist while
* holding spechash_mtx.  Unlike addaliasu it returns nothing and never
* substitutes a different vnode. */
|
2000-09-22 11:54:48 +00:00
|
|
|
static void
|
1999-08-26 14:53:31 +00:00
|
|
|
addalias(nvp, dev)
|
|
|
|
struct vnode *nvp;
|
1999-07-20 09:47:55 +00:00
|
|
|
dev_t dev;
|
|
|
|
{
|
2000-10-05 18:22:46 +00:00
|
|
|
|
2000-11-02 21:14:13 +00:00
|
|
|
KASSERT(nvp->v_type == VCHR, ("addalias on non-special vnode"));
|
1999-08-26 14:53:31 +00:00
|
|
|
nvp->v_rdev = dev;
|
2001-01-24 12:35:55 +00:00
|
|
|
mtx_enter(&spechash_mtx, MTX_DEF);
|
1999-08-26 14:53:31 +00:00
|
|
|
SLIST_INSERT_HEAD(&dev->si_hlist, nvp, v_specnext);
|
2001-01-24 12:35:55 +00:00
|
|
|
mtx_exit(&spechash_mtx, MTX_DEF);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Grab a particular vnode from the free list, increment its
|
1999-07-12 15:02:51 +00:00
|
|
|
* reference count and lock it. The vnode lock bit is set if the
|
1994-05-24 10:09:53 +00:00
|
|
|
* vnode is being eliminated in vgone. The process is awakened
|
|
|
|
* when the transition is completed, and an error returned to
|
|
|
|
* indicate that the vnode is no longer usable (possibly having
|
|
|
|
* been changed to a new file system type).
*
* vp is the vnode, flags are lock flags and p is the process passed
* on to vn_lock().  If LK_INTERLOCK is not set in flags, v_interlock
* is acquired here; otherwise the caller is expected to hold it
* already.  When VXLOCK is set and v_vxproc is not curproc, VXWANT is
* set, the caller sleeps in msleep() with PDROP on the interlock and
* ENOENT is returned; when v_vxproc is curproc a message is printed
* and the function carries on.  v_usecount is then incremented and
* vbusy() is called if VSHOULDBUSY() says so.  If flags contains a
* lock type (LK_TYPE_MASK), vn_lock() is called with LK_INTERLOCK
* added and its result is returned, after undoing the reference on
* failure.  With no lock type the interlock is released and 0 is
* returned.
|
|
|
|
|
*/
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
1997-02-10 02:22:35 +00:00
|
|
|
vget(vp, flags, p)
|
1994-05-24 10:09:53 +00:00
|
|
|
register struct vnode *vp;
|
1997-02-10 02:22:35 +00:00
|
|
|
int flags;
|
|
|
|
struct proc *p;
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
1997-02-10 02:22:35 +00:00
|
|
|
int error;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
|
/*
|
1997-02-10 02:22:35 +00:00
|
|
|
* If the vnode is in the process of being cleaned out for
|
|
|
|
* another use, we wait for the cleaning to finish and then
|
|
|
|
* return failure. Cleaning is determined by checking that
|
|
|
|
* the VXLOCK flag is set.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
2000-09-22 12:22:36 +00:00
|
|
|
if ((flags & LK_INTERLOCK) == 0)
|
2000-10-04 01:29:17 +00:00
|
|
|
mtx_enter(&vp->v_interlock, MTX_DEF);
|
1997-02-10 02:22:35 +00:00
|
|
|
if (vp->v_flag & VXLOCK) {
|
Implement a low-memory deadlock solution.
Removed most of the hacks that were trying to deal with low-memory
situations prior to now.
The new code is based on the concept that I/O must be able to function in
a low memory situation. All major modules related to I/O (except
networking) have been adjusted to allow allocation out of the system
reserve memory pool. These modules now detect a low memory situation but
rather then block they instead continue to operate, then return resources
to the memory pool instead of cache them or leave them wired.
Code has been added to stall in a low-memory situation prior to a vnode
being locked.
Thus situations where a process blocks in a low-memory condition while
holding a locked vnode have been reduced to near nothing. Not only will
I/O continue to operate, but many prior deadlock conditions simply no
longer exist.
Implement a number of VFS/BIO fixes
(found by Ian): in biodone(), bogus-page replacement code, the loop
was not properly incrementing loop variables prior to a continue
statement. We do not believe this code can be hit anyway but we
aren't taking any chances. We'll turn the whole section into a
panic (as it already is in brelse()) after the release is rolled.
In biodone(), the foff calculation was incorrectly
clamped to the iosize, causing the wrong foff to be calculated
for pages in the case of an I/O error or biodone() called without
initiating I/O. The problem always caused a panic before. Now it
doesn't. The problem is mainly an issue with NFS.
Fixed casts for ~PAGE_MASK. This code worked properly before only
because the calculations use signed arithmatic. Better to properly
extend PAGE_MASK first before inverting it for the 64 bit masking
op.
In brelse(), the bogus_page fixup code was improperly throwing
away the original contents of 'm' when it did the j-loop to
fix the bogus pages. The result was that it would potentially
invalidate parts of the *WRONG* page(!), leading to corruption.
There may still be cases where a background bitmap write is
being duplicated, causing potential corruption. We have identified
a potentially serious bug related to this but the fix is still TBD.
So instead this patch contains a KASSERT to detect the problem
and panic the machine rather then continue to corrupt the filesystem.
The problem does not occur very often.. it is very hard to
reproduce, and it may or may not be the cause of the corruption
people have reported.
Review by: (VFS/BIO: mckusick, Ian Dowse <iedowse@maths.tcd.ie>)
Testing by: (VM/Deadlock) Paul Saab <ps@yahoo-inc.com>
2000-11-18 23:06:26 +00:00
|
|
|
if (vp->v_vxproc == curproc) {
|
|
|
|
printf("VXLOCK interlock avoided\n");
|
|
|
|
} else {
|
|
|
|
vp->v_flag |= VXWANT;
|
2000-12-01 03:43:33 +00:00
|
|
|
msleep((caddr_t)vp, &vp->v_interlock, PINOD | PDROP,
|
|
|
|
"vget", 0);
|
Implement a low-memory deadlock solution.
Removed most of the hacks that were trying to deal with low-memory
situations prior to now.
The new code is based on the concept that I/O must be able to function in
a low memory situation. All major modules related to I/O (except
networking) have been adjusted to allow allocation out of the system
reserve memory pool. These modules now detect a low memory situation but
rather then block they instead continue to operate, then return resources
to the memory pool instead of cache them or leave them wired.
Code has been added to stall in a low-memory situation prior to a vnode
being locked.
Thus situations where a process blocks in a low-memory condition while
holding a locked vnode have been reduced to near nothing. Not only will
I/O continue to operate, but many prior deadlock conditions simply no
longer exist.
Implement a number of VFS/BIO fixes
(found by Ian): in biodone(), bogus-page replacement code, the loop
was not properly incrementing loop variables prior to a continue
statement. We do not believe this code can be hit anyway but we
aren't taking any chances. We'll turn the whole section into a
panic (as it already is in brelse()) after the release is rolled.
In biodone(), the foff calculation was incorrectly
clamped to the iosize, causing the wrong foff to be calculated
for pages in the case of an I/O error or biodone() called without
initiating I/O. The problem always caused a panic before. Now it
doesn't. The problem is mainly an issue with NFS.
Fixed casts for ~PAGE_MASK. This code worked properly before only
because the calculations use signed arithmatic. Better to properly
extend PAGE_MASK first before inverting it for the 64 bit masking
op.
In brelse(), the bogus_page fixup code was improperly throwing
away the original contents of 'm' when it did the j-loop to
fix the bogus pages. The result was that it would potentially
invalidate parts of the *WRONG* page(!), leading to corruption.
There may still be cases where a background bitmap write is
being duplicated, causing potential corruption. We have identified
a potentially serious bug related to this but the fix is still TBD.
So instead this patch contains a KASSERT to detect the problem
and panic the machine rather then continue to corrupt the filesystem.
The problem does not occur very often.. it is very hard to
reproduce, and it may or may not be the cause of the corruption
people have reported.
Review by: (VFS/BIO: mckusick, Ian Dowse <iedowse@maths.tcd.ie>)
Testing by: (VM/Deadlock) Paul Saab <ps@yahoo-inc.com>
2000-11-18 23:06:26 +00:00
|
|
|
return (ENOENT);
|
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
1997-12-29 00:25:11 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
vp->v_usecount++;
|
1997-12-29 00:25:11 +00:00
|
|
|
|
1997-08-31 07:32:39 +00:00
|
|
|
if (VSHOULDBUSY(vp))
|
|
|
|
vbusy(vp);
|
1997-02-10 02:22:35 +00:00
|
|
|
if (flags & LK_TYPE_MASK) {
|
1998-02-23 06:59:52 +00:00
|
|
|
if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) {
|
|
|
|
/*
|
|
|
|
* must expand vrele here because we do not want
|
|
|
|
* to call VOP_INACTIVE if the reference count
|
|
|
|
* drops back to zero since it was never really
|
|
|
|
* active. We must remove it from the free list
|
|
|
|
* before sleeping so that multiple processes do
|
|
|
|
* not try to recycle it.
|
|
|
|
*/
|
2000-10-04 01:29:17 +00:00
|
|
|
mtx_enter(&vp->v_interlock, MTX_DEF);
|
1998-02-23 06:59:52 +00:00
|
|
|
vp->v_usecount--;
|
|
|
|
if (VSHOULDFREE(vp))
|
|
|
|
vfree(vp);
|
2000-10-04 01:29:17 +00:00
|
|
|
mtx_exit(&vp->v_interlock, MTX_DEF);
|
1998-02-23 06:59:52 +00:00
|
|
|
}
|
1997-02-10 02:22:35 +00:00
|
|
|
return (error);
|
1996-08-21 21:56:23 +00:00
|
|
|
}
|
2000-10-04 01:29:17 +00:00
|
|
|
mtx_exit(&vp->v_interlock, MTX_DEF);
|
1997-02-10 02:22:35 +00:00
|
|
|
return (0);
|
|
|
|
}
|
1997-12-29 00:25:11 +00:00
|
|
|
|
2000-09-22 12:22:36 +00:00
|
|
|
/*
|
2000-10-05 18:22:46 +00:00
|
|
|
* Increase the reference count of a vnode.
*
* vp->v_usecount is incremented while holding vp->v_interlock, which
* is acquired and released inside this function.  No vnode flags are
* examined and the vnode lock itself is not taken.
|
2000-09-22 12:22:36 +00:00
|
|
|
*/
|
1997-12-29 16:54:03 +00:00
|
|
|
void
|
|
|
|
vref(struct vnode *vp)
|
|
|
|
{
|
2000-10-04 01:29:17 +00:00
|
|
|
mtx_enter(&vp->v_interlock, MTX_DEF);
|
1997-12-29 16:54:03 +00:00
|
|
|
vp->v_usecount++;
|
2000-10-04 01:29:17 +00:00
|
|
|
mtx_exit(&vp->v_interlock, MTX_DEF);
|
1997-12-29 16:54:03 +00:00
|
|
|
}
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
1997-12-29 00:25:11 +00:00
|
|
|
* Vnode put/release.
|
|
|
|
* If count drops to zero, call inactive routine and return to freelist.
*
* vp must be non-NULL (asserted).  The count is examined and changed
* under vp->v_interlock.  With more than one reference outstanding
* the count is decremented, the interlock released and the function
* returns.  On the last reference the count is dropped to zero,
* vfree() is called if VSHOULDFREE() says so, and VOP_INACTIVE() is
* invoked once vn_lock(LK_EXCLUSIVE | LK_INTERLOCK) succeeds.  Any
* other count value panics, after a vprint() when DIAGNOSTIC is
* defined.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
1994-05-25 09:21:21 +00:00
|
|
|
void
|
1997-12-29 00:25:11 +00:00
|
|
|
vrele(vp)
|
1994-05-24 10:09:53 +00:00
|
|
|
struct vnode *vp;
|
|
|
|
{
|
1997-12-29 00:25:11 +00:00
|
|
|
struct proc *p = curproc; /* XXX */
|
|
|
|
|
1999-01-10 01:58:29 +00:00
|
|
|
KASSERT(vp != NULL, ("vrele: null vp"));
|
1999-01-08 17:31:30 +00:00
|
|
|
|
2000-10-04 01:29:17 +00:00
|
|
|
mtx_enter(&vp->v_interlock, MTX_DEF);
|
1996-08-21 21:56:23 +00:00
|
|
|
|
2000-10-02 09:57:06 +00:00
|
|
|
KASSERT(vp->v_writecount < vp->v_usecount, ("vrele: missed vn_close"));
|
|
|
|
|
1997-12-29 00:25:11 +00:00
|
|
|
if (vp->v_usecount > 1) {
|
1996-09-28 03:36:07 +00:00
|
|
|
|
1997-12-29 00:25:11 +00:00
|
|
|
vp->v_usecount--;
|
2000-10-04 01:29:17 +00:00
|
|
|
mtx_exit(&vp->v_interlock, MTX_DEF);
|
1997-12-29 00:25:11 +00:00
|
|
|
|
Make our v_usecount vnode reference count work identically to the
original BSD code. The association between the vnode and the vm_object
no longer includes reference counts. The major difference is that
vm_object's are no longer freed gratuitiously from the vnode, and so
once an object is created for the vnode, it will last as long as the
vnode does.
When a vnode object reference count is incremented, then the underlying
vnode reference count is incremented also. The two "objects" are now
more intimately related, and so the interactions are now much less
complex.
When vnodes are now normally placed onto the free queue with an object still
attached. The rundown of the object happens at vnode rundown time, and
happens with exactly the same filesystem semantics of the original VFS
code. There is absolutely no need for vnode_pager_uncache and other
travesties like that anymore.
A side-effect of these changes is that SMP locking should be much simpler,
the I/O copyin/copyout optimizations work, NFS should be more ponderable,
and further work on layered filesystems should be less frustrating, because
of the totally coherent management of the vnode objects and vnodes.
Please be careful with your system while running this code, but I would
greatly appreciate feedback as soon a reasonably possible.
1998-01-06 05:26:17 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (vp->v_usecount == 1) {
|
1997-12-29 00:25:11 +00:00
|
|
|
|
|
|
|
vp->v_usecount--;
|
|
|
|
if (VSHOULDFREE(vp))
|
|
|
|
vfree(vp);
|
|
|
|
/*
|
|
|
|
* If we are doing a vput, the node is already locked, and we must
|
|
|
|
* call VOP_INACTIVE with the node locked. So, in the case of
|
|
|
|
* vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
|
|
|
|
*/
|
|
|
|
if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) {
|
|
|
|
VOP_INACTIVE(vp, p);
|
|
|
|
}
|
|
|
|
|
|
|
|
} else {
|
|
|
|
#ifdef DIAGNOSTIC
|
|
|
|
vprint("vrele: negative ref count", vp);
|
2000-10-04 01:29:17 +00:00
|
|
|
mtx_exit(&vp->v_interlock, MTX_DEF);
|
1997-12-29 00:25:11 +00:00
|
|
|
#endif
|
|
|
|
panic("vrele: negative ref cnt");
|
1996-08-21 21:56:23 +00:00
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
2000-09-22 12:22:36 +00:00
|
|
|
/*
|
2000-10-05 18:22:46 +00:00
|
|
|
* Release an already locked vnode. This gives the same effect as
|
|
|
|
* unlock+vrele(), but takes less time and avoids releasing and
|
|
|
|
* re-acquiring the lock (as vrele() acquires the lock internally).
|
2000-09-22 12:22:36 +00:00
|
|
|
*/
|
1997-12-29 00:25:11 +00:00
|
|
|
void
|
|
|
|
vput(vp)
|
1997-02-10 02:22:35 +00:00
|
|
|
struct vnode *vp;
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
1997-02-10 02:22:35 +00:00
|
|
|
struct proc *p = curproc; /* XXX */
|
1994-05-24 10:09:53 +00:00
|
|
|
|
1999-01-08 17:31:30 +00:00
|
|
|
KASSERT(vp != NULL, ("vput: null vp"));
|
2000-10-04 01:29:17 +00:00
|
|
|
mtx_enter(&vp->v_interlock, MTX_DEF);
|
2000-10-02 09:57:06 +00:00
|
|
|
KASSERT(vp->v_writecount < vp->v_usecount, ("vput: missed vn_close"));
|
|
|
|
|
1997-12-29 00:25:11 +00:00
|
|
|
if (vp->v_usecount > 1) {
|
1997-12-19 09:03:37 +00:00
|
|
|
|
1997-08-31 07:32:39 +00:00
|
|
|
vp->v_usecount--;
|
1997-12-29 00:25:11 +00:00
|
|
|
VOP_UNLOCK(vp, LK_INTERLOCK, p);
|
Make our v_usecount vnode reference count work identically to the
original BSD code. The association between the vnode and the vm_object
no longer includes reference counts. The major difference is that
vm_object's are no longer freed gratuitiously from the vnode, and so
once an object is created for the vnode, it will last as long as the
vnode does.
When a vnode object reference count is incremented, then the underlying
vnode reference count is incremented also. The two "objects" are now
more intimately related, and so the interactions are now much less
complex.
When vnodes are now normally placed onto the free queue with an object still
attached. The rundown of the object happens at vnode rundown time, and
happens with exactly the same filesystem semantics of the original VFS
code. There is absolutely no need for vnode_pager_uncache and other
travesties like that anymore.
A side-effect of these changes is that SMP locking should be much simpler,
the I/O copyin/copyout optimizations work, NFS should be more ponderable,
and further work on layered filesystems should be less frustrating, because
of the totally coherent management of the vnode objects and vnodes.
Please be careful with your system while running this code, but I would
greatly appreciate feedback as soon a reasonably possible.
1998-01-06 05:26:17 +00:00
|
|
|
return;
|
|
|
|
|
|
|
|
}
|
1996-08-21 21:56:23 +00:00
|
|
|
|
Make our v_usecount vnode reference count work identically to the
original BSD code. The association between the vnode and the vm_object
no longer includes reference counts. The major difference is that
vm_object's are no longer freed gratuitiously from the vnode, and so
once an object is created for the vnode, it will last as long as the
vnode does.
When a vnode object reference count is incremented, then the underlying
vnode reference count is incremented also. The two "objects" are now
more intimately related, and so the interactions are now much less
complex.
When vnodes are now normally placed onto the free queue with an object still
attached. The rundown of the object happens at vnode rundown time, and
happens with exactly the same filesystem semantics of the original VFS
code. There is absolutely no need for vnode_pager_uncache and other
travesties like that anymore.
A side-effect of these changes is that SMP locking should be much simpler,
the I/O copyin/copyout optimizations work, NFS should be more ponderable,
and further work on layered filesystems should be less frustrating, because
of the totally coherent management of the vnode objects and vnodes.
Please be careful with your system while running this code, but I would
greatly appreciate feedback as soon a reasonably possible.
1998-01-06 05:26:17 +00:00
|
|
|
if (vp->v_usecount == 1) {
|
1996-08-21 21:56:23 +00:00
|
|
|
|
1997-12-29 00:25:11 +00:00
|
|
|
vp->v_usecount--;
|
|
|
|
if (VSHOULDFREE(vp))
|
|
|
|
vfree(vp);
|
1997-02-27 02:57:03 +00:00
|
|
|
/*
|
|
|
|
* If we are doing a vput, the node is already locked, and we must
|
|
|
|
* call VOP_INACTIVE with the node locked. So, in the case of
|
|
|
|
* vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
|
|
|
|
*/
|
2000-10-04 01:29:17 +00:00
|
|
|
mtx_exit(&vp->v_interlock, MTX_DEF);
|
1997-02-10 02:22:35 +00:00
|
|
|
VOP_INACTIVE(vp, p);
|
1994-05-24 10:09:53 +00:00
|
|
|
|
1997-12-29 00:25:11 +00:00
|
|
|
} else {
|
|
|
|
#ifdef DIAGNOSTIC
|
|
|
|
vprint("vput: negative ref count", vp);
|
|
|
|
#endif
|
|
|
|
panic("vput: negative ref cnt");
|
|
|
|
}
|
1997-02-27 02:57:03 +00:00
|
|
|
}
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
1997-08-31 07:32:39 +00:00
|
|
|
* Somebody doesn't want the vnode recycled.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
1994-05-25 09:21:21 +00:00
|
|
|
void
|
|
|
|
vhold(vp)
|
1994-05-24 10:09:53 +00:00
|
|
|
register struct vnode *vp;
|
|
|
|
{
|
1998-03-14 02:55:01 +00:00
|
|
|
int s;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
1998-03-14 02:55:01 +00:00
|
|
|
s = splbio();
|
1994-05-24 10:09:53 +00:00
|
|
|
vp->v_holdcnt++;
|
1997-08-31 07:32:39 +00:00
|
|
|
if (VSHOULDBUSY(vp))
|
|
|
|
vbusy(vp);
|
1998-03-14 02:55:01 +00:00
|
|
|
splx(s);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2000-10-05 18:22:46 +00:00
|
|
|
* Note that there is one less who cares about this vnode. vdrop() is the
|
|
|
|
* opposite of vhold().
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
1994-05-25 09:21:21 +00:00
|
|
|
void
|
1997-08-31 07:32:39 +00:00
|
|
|
vdrop(vp)
|
1994-05-24 10:09:53 +00:00
|
|
|
register struct vnode *vp;
|
|
|
|
{
|
1998-03-14 02:55:01 +00:00
|
|
|
int s;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
1998-03-14 02:55:01 +00:00
|
|
|
s = splbio();
|
1994-05-24 10:09:53 +00:00
|
|
|
if (vp->v_holdcnt <= 0)
|
1998-03-08 09:59:44 +00:00
|
|
|
panic("vdrop: holdcnt");
|
1994-05-24 10:09:53 +00:00
|
|
|
vp->v_holdcnt--;
|
1997-08-31 07:32:39 +00:00
|
|
|
if (VSHOULDFREE(vp))
|
|
|
|
vfree(vp);
|
1998-03-14 02:55:01 +00:00
|
|
|
splx(s);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Remove any vnodes in the vnode table belonging to mount point mp.
|
|
|
|
*
|
|
|
|
* If MNT_NOFORCE is specified, there should not be any active ones,
|
|
|
|
* return error if any are found (nb: this is a user error, not a
|
|
|
|
* system error). If MNT_FORCE is specified, detach any active vnodes
|
|
|
|
* that are found.
|
|
|
|
*/
|
|
|
|
#ifdef DIAGNOSTIC
|
1995-12-17 21:23:44 +00:00
|
|
|
static int busyprt = 0; /* print out busy vnodes */
|
1997-04-01 13:05:34 +00:00
|
|
|
SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "");
|
1994-05-24 10:09:53 +00:00
|
|
|
#endif
|
|
|
|
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
1994-05-24 10:09:53 +00:00
|
|
|
vflush(mp, skipvp, flags)
|
|
|
|
struct mount *mp;
|
|
|
|
struct vnode *skipvp;
|
|
|
|
int flags;
|
|
|
|
{
|
1997-02-10 02:22:35 +00:00
|
|
|
struct proc *p = curproc; /* XXX */
|
|
|
|
struct vnode *vp, *nvp;
|
1994-05-24 10:09:53 +00:00
|
|
|
int busy = 0;
|
|
|
|
|
2001-01-24 12:35:55 +00:00
|
|
|
mtx_enter(&mntvnode_mtx, MTX_DEF);
|
1994-05-24 10:09:53 +00:00
|
|
|
loop:
|
1999-11-16 16:28:58 +00:00
|
|
|
for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp; vp = nvp) {
|
1995-03-11 22:29:07 +00:00
|
|
|
/*
|
|
|
|
* Make sure this vnode wasn't reclaimed in getnewvnode().
|
|
|
|
* Start over if it has (it won't be on the list anymore).
|
|
|
|
*/
|
1994-05-24 10:09:53 +00:00
|
|
|
if (vp->v_mount != mp)
|
|
|
|
goto loop;
|
1999-11-16 16:28:58 +00:00
|
|
|
nvp = LIST_NEXT(vp, v_mntvnodes);
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Skip over a selected vnode.
|
|
|
|
*/
|
|
|
|
if (vp == skipvp)
|
|
|
|
continue;
|
1997-02-10 02:22:35 +00:00
|
|
|
|
2000-10-04 01:29:17 +00:00
|
|
|
mtx_enter(&vp->v_interlock, MTX_DEF);
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Skip over a vnodes marked VSYSTEM.
|
|
|
|
*/
|
1997-02-10 02:22:35 +00:00
|
|
|
if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
|
2000-10-04 01:29:17 +00:00
|
|
|
mtx_exit(&vp->v_interlock, MTX_DEF);
|
1994-05-24 10:09:53 +00:00
|
|
|
continue;
|
1997-02-10 02:22:35 +00:00
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
These changes embody the support of the fully coherent merged VM buffer cache,
much higher filesystem I/O performance, and much better paging performance. It
represents the culmination of over 6 months of R&D.
The majority of the merged VM/cache work is by John Dyson.
The following highlights the most significant changes. Additionally, there are
(mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to
support the new VM/buffer scheme.
vfs_bio.c:
Significant rewrite of most of vfs_bio to support the merged VM buffer cache
scheme. The scheme is almost fully compatible with the old filesystem
interface. Significant improvement in the number of opportunities for write
clustering.
vfs_cluster.c, vfs_subr.c
Upgrade and performance enhancements in vfs layer code to support merged
VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff.
vm_object.c:
Yet more improvements in the collapse code. Elimination of some windows that
can cause list corruption.
vm_pageout.c:
Fixed it, it really works better now. Somehow in 2.0, some "enhancements"
broke the code. This code has been reworked from the ground-up.
vm_fault.c, vm_page.c, pmap.c, vm_object.c
Support for small-block filesystems with merged VM/buffer cache scheme.
pmap.c vm_map.c
Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of
kernel PTs.
vm_glue.c
Much simpler and more effective swapping code. No more gratuitous swapping.
proc.h
Fixed the problem that the p_lock flag was not being cleared on a fork.
swap_pager.c, vnode_pager.c
Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the
code doesn't need it anymore.
machdep.c
Changes to better support the parameter values for the merged VM/buffer cache
scheme.
machdep.c, kern_exec.c, vm_glue.c
Implemented a seperate submap for temporary exec string space and another one
to contain process upages. This eliminates all map fragmentation problems
that previously existed.
ffs_inode.c, ufs_inode.c, ufs_readwrite.c
Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on
busy buffers.
Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
|
|
|
* If WRITECLOSE is set, only flush out regular file vnodes
|
|
|
|
* open for writing.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
|
|
|
if ((flags & WRITECLOSE) &&
|
1997-02-10 02:22:35 +00:00
|
|
|
(vp->v_writecount == 0 || vp->v_type != VREG)) {
|
2000-10-04 01:29:17 +00:00
|
|
|
mtx_exit(&vp->v_interlock, MTX_DEF);
|
1994-05-24 10:09:53 +00:00
|
|
|
continue;
|
1997-02-10 02:22:35 +00:00
|
|
|
}
|
1996-08-21 21:56:23 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
These changes embody the support of the fully coherent merged VM buffer cache,
much higher filesystem I/O performance, and much better paging performance. It
represents the culmination of over 6 months of R&D.
The majority of the merged VM/cache work is by John Dyson.
The following highlights the most significant changes. Additionally, there are
(mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to
support the new VM/buffer scheme.
vfs_bio.c:
Significant rewrite of most of vfs_bio to support the merged VM buffer cache
scheme. The scheme is almost fully compatible with the old filesystem
interface. Significant improvement in the number of opportunities for write
clustering.
vfs_cluster.c, vfs_subr.c
Upgrade and performance enhancements in vfs layer code to support merged
VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff.
vm_object.c:
Yet more improvements in the collapse code. Elimination of some windows that
can cause list corruption.
vm_pageout.c:
Fixed it, it really works better now. Somehow in 2.0, some "enhancements"
broke the code. This code has been reworked from the ground-up.
vm_fault.c, vm_page.c, pmap.c, vm_object.c
Support for small-block filesystems with merged VM/buffer cache scheme.
pmap.c vm_map.c
Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of
kernel PTs.
vm_glue.c
Much simpler and more effective swapping code. No more gratuitous swapping.
proc.h
Fixed the problem that the p_lock flag was not being cleared on a fork.
swap_pager.c, vnode_pager.c
Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the
code doesn't need it anymore.
machdep.c
Changes to better support the parameter values for the merged VM/buffer cache
scheme.
machdep.c, kern_exec.c, vm_glue.c
Implemented a seperate submap for temporary exec string space and another one
to contain process upages. This eliminates all map fragmentation problems
that previously existed.
ffs_inode.c, ufs_inode.c, ufs_readwrite.c
Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on
busy buffers.
Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
|
|
|
* With v_usecount == 0, all we need to do is clear out the
|
|
|
|
* vnode data structures and we are done.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
|
|
|
if (vp->v_usecount == 0) {
|
2001-01-24 12:35:55 +00:00
|
|
|
mtx_exit(&mntvnode_mtx, MTX_DEF);
|
1997-02-10 02:22:35 +00:00
|
|
|
vgonel(vp, p);
|
2001-01-24 12:35:55 +00:00
|
|
|
mtx_enter(&mntvnode_mtx, MTX_DEF);
|
1994-05-24 10:09:53 +00:00
|
|
|
continue;
|
|
|
|
}
|
1996-10-17 02:49:35 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
These changes embody the support of the fully coherent merged VM buffer cache,
much higher filesystem I/O performance, and much better paging performance. It
represents the culmination of over 6 months of R&D.
The majority of the merged VM/cache work is by John Dyson.
The following highlights the most significant changes. Additionally, there are
(mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to
support the new VM/buffer scheme.
vfs_bio.c:
Significant rewrite of most of vfs_bio to support the merged VM buffer cache
scheme. The scheme is almost fully compatible with the old filesystem
interface. Significant improvement in the number of opportunities for write
clustering.
vfs_cluster.c, vfs_subr.c
Upgrade and performance enhancements in vfs layer code to support merged
VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff.
vm_object.c:
Yet more improvements in the collapse code. Elimination of some windows that
can cause list corruption.
vm_pageout.c:
Fixed it, it really works better now. Somehow in 2.0, some "enhancements"
broke the code. This code has been reworked from the ground-up.
vm_fault.c, vm_page.c, pmap.c, vm_object.c
Support for small-block filesystems with merged VM/buffer cache scheme.
pmap.c vm_map.c
Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of
kernel PTs.
vm_glue.c
Much simpler and more effective swapping code. No more gratuitous swapping.
proc.h
Fixed the problem that the p_lock flag was not being cleared on a fork.
swap_pager.c, vnode_pager.c
Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the
code doesn't need it anymore.
machdep.c
Changes to better support the parameter values for the merged VM/buffer cache
scheme.
machdep.c, kern_exec.c, vm_glue.c
Implemented a seperate submap for temporary exec string space and another one
to contain process upages. This eliminates all map fragmentation problems
that previously existed.
ffs_inode.c, ufs_inode.c, ufs_readwrite.c
Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on
busy buffers.
Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
|
|
|
* If FORCECLOSE is set, forcibly close the vnode. For block
|
|
|
|
* or character devices, revert to an anonymous device. For
|
|
|
|
* all other files, just kill them.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
|
|
|
if (flags & FORCECLOSE) {
|
2001-01-24 12:35:55 +00:00
|
|
|
mtx_exit(&mntvnode_mtx, MTX_DEF);
|
2000-11-02 21:14:13 +00:00
|
|
|
if (vp->v_type != VCHR) {
|
1997-02-10 02:22:35 +00:00
|
|
|
vgonel(vp, p);
|
1994-05-24 10:09:53 +00:00
|
|
|
} else {
|
1997-02-10 02:22:35 +00:00
|
|
|
vclean(vp, 0, p);
|
1994-05-24 10:09:53 +00:00
|
|
|
vp->v_op = spec_vnodeop_p;
|
These changes embody the support of the fully coherent merged VM buffer cache,
much higher filesystem I/O performance, and much better paging performance. It
represents the culmination of over 6 months of R&D.
The majority of the merged VM/cache work is by John Dyson.
The following highlights the most significant changes. Additionally, there are
(mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to
support the new VM/buffer scheme.
vfs_bio.c:
Significant rewrite of most of vfs_bio to support the merged VM buffer cache
scheme. The scheme is almost fully compatible with the old filesystem
interface. Significant improvement in the number of opportunities for write
clustering.
vfs_cluster.c, vfs_subr.c
Upgrade and performance enhancements in vfs layer code to support merged
VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff.
vm_object.c:
Yet more improvements in the collapse code. Elimination of some windows that
can cause list corruption.
vm_pageout.c:
Fixed it, it really works better now. Somehow in 2.0, some "enhancements"
broke the code. This code has been reworked from the ground-up.
vm_fault.c, vm_page.c, pmap.c, vm_object.c
Support for small-block filesystems with merged VM/buffer cache scheme.
pmap.c vm_map.c
Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of
kernel PTs.
vm_glue.c
Much simpler and more effective swapping code. No more gratuitous swapping.
proc.h
Fixed the problem that the p_lock flag was not being cleared on a fork.
swap_pager.c, vnode_pager.c
Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the
code doesn't need it anymore.
machdep.c
Changes to better support the parameter values for the merged VM/buffer cache
scheme.
machdep.c, kern_exec.c, vm_glue.c
Implemented a seperate submap for temporary exec string space and another one
to contain process upages. This eliminates all map fragmentation problems
that previously existed.
ffs_inode.c, ufs_inode.c, ufs_readwrite.c
Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on
busy buffers.
Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
|
|
|
insmntque(vp, (struct mount *) 0);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2001-01-24 12:35:55 +00:00
|
|
|
mtx_enter(&mntvnode_mtx, MTX_DEF);
|
1994-05-24 10:09:53 +00:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
#ifdef DIAGNOSTIC
|
|
|
|
if (busyprt)
|
|
|
|
vprint("vflush: busy vnode", vp);
|
|
|
|
#endif
|
2000-10-04 01:29:17 +00:00
|
|
|
mtx_exit(&vp->v_interlock, MTX_DEF);
|
1994-05-24 10:09:53 +00:00
|
|
|
busy++;
|
|
|
|
}
|
2001-01-24 12:35:55 +00:00
|
|
|
mtx_exit(&mntvnode_mtx, MTX_DEF);
|
1994-05-24 10:09:53 +00:00
|
|
|
if (busy)
|
|
|
|
return (EBUSY);
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Disassociate the underlying file system from a vnode.
|
|
|
|
*/
|
1997-02-10 02:22:35 +00:00
|
|
|
static void
|
1997-09-16 11:44:05 +00:00
|
|
|
vclean(vp, flags, p)
|
|
|
|
struct vnode *vp;
|
|
|
|
int flags;
|
|
|
|
struct proc *p;
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
Make our v_usecount vnode reference count work identically to the
original BSD code. The association between the vnode and the vm_object
no longer includes reference counts. The major difference is that
vm_objects are no longer freed gratuitously from the vnode, and so
once an object is created for the vnode, it will last as long as the
vnode does.
When a vnode object reference count is incremented, then the underlying
vnode reference count is incremented also. The two "objects" are now
more intimately related, and so the interactions are now much less
complex.
Vnodes are now normally placed onto the free queue with an object still
attached. The rundown of the object happens at vnode rundown time, and
happens with exactly the same filesystem semantics of the original VFS
code. There is absolutely no need for vnode_pager_uncache and other
travesties like that anymore.
A side-effect of these changes is that SMP locking should be much simpler,
the I/O copyin/copyout optimizations work, NFS should be more ponderable,
and further work on layered filesystems should be less frustrating, because
of the totally coherent management of the vnode objects and vnodes.
Please be careful with your system while running this code, but I would
greatly appreciate feedback as soon as reasonably possible.
1998-01-06 05:26:17 +00:00
|
|
|
int active;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
/*
|
These changes embody the support of the fully coherent merged VM buffer cache,
much higher filesystem I/O performance, and much better paging performance. It
represents the culmination of over 6 months of R&D.
The majority of the merged VM/cache work is by John Dyson.
The following highlights the most significant changes. Additionally, there are
(mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to
support the new VM/buffer scheme.
vfs_bio.c:
Significant rewrite of most of vfs_bio to support the merged VM buffer cache
scheme. The scheme is almost fully compatible with the old filesystem
interface. Significant improvement in the number of opportunities for write
clustering.
vfs_cluster.c, vfs_subr.c
Upgrade and performance enhancements in vfs layer code to support merged
VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff.
vm_object.c:
Yet more improvements in the collapse code. Elimination of some windows that
can cause list corruption.
vm_pageout.c:
Fixed it, it really works better now. Somehow in 2.0, some "enhancements"
broke the code. This code has been reworked from the ground-up.
vm_fault.c, vm_page.c, pmap.c, vm_object.c
Support for small-block filesystems with merged VM/buffer cache scheme.
pmap.c vm_map.c
Dynamic kernel VM size; now we don't have to pre-allocate excessive numbers of
kernel PTs.
vm_glue.c
Much simpler and more effective swapping code. No more gratuitous swapping.
proc.h
Fixed the problem that the p_lock flag was not being cleared on a fork.
swap_pager.c, vnode_pager.c
Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the
code doesn't need it anymore.
machdep.c
Changes to better support the parameter values for the merged VM/buffer cache
scheme.
machdep.c, kern_exec.c, vm_glue.c
Implemented a seperate submap for temporary exec string space and another one
to contain process upages. This eliminates all map fragmentation problems
that previously existed.
ffs_inode.c, ufs_inode.c, ufs_readwrite.c
Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on
busy buffers.
Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
|
|
|
* Check to see if the vnode is in use. If so we have to reference it
|
|
|
|
* before we clean it out so that its count cannot fall to zero and
|
|
|
|
* generate a race against ourselves to recycle it.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
1994-09-25 19:34:02 +00:00
|
|
|
if ((active = vp->v_usecount))
|
1997-02-10 02:22:35 +00:00
|
|
|
vp->v_usecount++;
|
Make our v_usecount vnode reference count work identically to the
original BSD code. The association between the vnode and the vm_object
no longer includes reference counts. The major difference is that
vm_object's are no longer freed gratuitiously from the vnode, and so
once an object is created for the vnode, it will last as long as the
vnode does.
When a vnode object reference count is incremented, then the underlying
vnode reference count is incremented also. The two "objects" are now
more intimately related, and so the interactions are now much less
complex.
When vnodes are now normally placed onto the free queue with an object still
attached. The rundown of the object happens at vnode rundown time, and
happens with exactly the same filesystem semantics of the original VFS
code. There is absolutely no need for vnode_pager_uncache and other
travesties like that anymore.
A side-effect of these changes is that SMP locking should be much simpler,
the I/O copyin/copyout optimizations work, NFS should be more ponderable,
and further work on layered filesystems should be less frustrating, because
of the totally coherent management of the vnode objects and vnodes.
Please be careful with your system while running this code, but I would
greatly appreciate feedback as soon a reasonably possible.
1998-01-06 05:26:17 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
These changes embody the support of the fully coherent merged VM buffer cache,
much higher filesystem I/O performance, and much better paging performance. It
represents the culmination of over 6 months of R&D.
The majority of the merged VM/cache work is by John Dyson.
The following highlights the most significant changes. Additionally, there are
(mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to
support the new VM/buffer scheme.
vfs_bio.c:
Significant rewrite of most of vfs_bio to support the merged VM buffer cache
scheme. The scheme is almost fully compatible with the old filesystem
interface. Significant improvement in the number of opportunities for write
clustering.
vfs_cluster.c, vfs_subr.c
Upgrade and performance enhancements in vfs layer code to support merged
VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff.
vm_object.c:
Yet more improvements in the collapse code. Elimination of some windows that
can cause list corruption.
vm_pageout.c:
Fixed it, it really works better now. Somehow in 2.0, some "enhancements"
broke the code. This code has been reworked from the ground-up.
vm_fault.c, vm_page.c, pmap.c, vm_object.c
Support for small-block filesystems with merged VM/buffer cache scheme.
pmap.c vm_map.c
Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of
kernel PTs.
vm_glue.c
Much simpler and more effective swapping code. No more gratuitous swapping.
proc.h
Fixed the problem that the p_lock flag was not being cleared on a fork.
swap_pager.c, vnode_pager.c
Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the
code doesn't need it anymore.
machdep.c
Changes to better support the parameter values for the merged VM/buffer cache
scheme.
machdep.c, kern_exec.c, vm_glue.c
Implemented a seperate submap for temporary exec string space and another one
to contain process upages. This eliminates all map fragmentation problems
that previously existed.
ffs_inode.c, ufs_inode.c, ufs_readwrite.c
Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on
busy buffers.
Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
|
|
|
* Prevent the vnode from being recycled or brought into use while we
|
|
|
|
* clean it out.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
|
|
|
if (vp->v_flag & VXLOCK)
|
|
|
|
panic("vclean: deadlock");
|
|
|
|
vp->v_flag |= VXLOCK;
|
Implement a low-memory deadlock solution.
Removed most of the hacks that were trying to deal with low-memory
situations prior to now.
The new code is based on the concept that I/O must be able to function in
a low memory situation. All major modules related to I/O (except
networking) have been adjusted to allow allocation out of the system
reserve memory pool. These modules now detect a low memory situation but
rather than block they instead continue to operate, then return resources
to the memory pool instead of cache them or leave them wired.
Code has been added to stall in a low-memory situation prior to a vnode
being locked.
Thus situations where a process blocks in a low-memory condition while
holding a locked vnode have been reduced to near nothing. Not only will
I/O continue to operate, but many prior deadlock conditions simply no
longer exist.
Implement a number of VFS/BIO fixes
(found by Ian): in biodone(), bogus-page replacement code, the loop
was not properly incrementing loop variables prior to a continue
statement. We do not believe this code can be hit anyway but we
aren't taking any chances. We'll turn the whole section into a
panic (as it already is in brelse()) after the release is rolled.
In biodone(), the foff calculation was incorrectly
clamped to the iosize, causing the wrong foff to be calculated
for pages in the case of an I/O error or biodone() called without
initiating I/O. The problem always caused a panic before. Now it
doesn't. The problem is mainly an issue with NFS.
Fixed casts for ~PAGE_MASK. This code worked properly before only
because the calculations use signed arithmetic. Better to properly
extend PAGE_MASK first before inverting it for the 64 bit masking
op.
In brelse(), the bogus_page fixup code was improperly throwing
away the original contents of 'm' when it did the j-loop to
fix the bogus pages. The result was that it would potentially
invalidate parts of the *WRONG* page(!), leading to corruption.
There may still be cases where a background bitmap write is
being duplicated, causing potential corruption. We have identified
a potentially serious bug related to this but the fix is still TBD.
So instead this patch contains a KASSERT to detect the problem
and panic the machine rather than continue to corrupt the filesystem.
The problem does not occur very often.. it is very hard to
reproduce, and it may or may not be the cause of the corruption
people have reported.
Review by: (VFS/BIO: mckusick, Ian Dowse <iedowse@maths.tcd.ie>)
Testing by: (VM/Deadlock) Paul Saab <ps@yahoo-inc.com>
2000-11-18 23:06:26 +00:00
|
|
|
vp->v_vxproc = curproc;
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
1997-02-10 02:22:35 +00:00
|
|
|
* Even if the count is zero, the VOP_INACTIVE routine may still
|
|
|
|
* have the object locked while it cleans it out. The VOP_LOCK
|
|
|
|
* ensures that the VOP_INACTIVE routine is done with its work.
|
|
|
|
* For active vnodes, it ensures that no other activity can
|
|
|
|
* occur while the underlying object is being cleaned out.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
1997-02-10 02:22:35 +00:00
|
|
|
VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);
|
1997-06-22 03:00:24 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
1997-02-10 02:22:35 +00:00
|
|
|
* Clean out any buffers associated with the vnode.
|
2000-07-04 03:23:29 +00:00
|
|
|
* If the flush fails, just toss the buffers.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
2000-07-04 03:23:29 +00:00
|
|
|
if (flags & DOCLOSE) {
|
2000-07-11 22:07:57 +00:00
|
|
|
if (TAILQ_FIRST(&vp->v_dirtyblkhd) != NULL)
|
2000-07-24 05:28:33 +00:00
|
|
|
(void) vn_write_suspend_wait(vp, NULL, V_WAIT);
|
2000-07-04 03:23:29 +00:00
|
|
|
if (vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0) != 0)
|
|
|
|
vinvalbuf(vp, 0, NOCRED, p, 0, 0);
|
|
|
|
}
|
|
|
|
|
2000-09-12 09:49:08 +00:00
|
|
|
VOP_DESTROYVOBJECT(vp);
|
1997-06-22 03:00:24 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
1997-02-10 02:22:35 +00:00
|
|
|
* If purging an active vnode, it must be closed and
|
|
|
|
* deactivated before being reclaimed. Note that the
|
|
|
|
* VOP_INACTIVE will unlock the vnode.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
|
|
|
if (active) {
|
|
|
|
if (flags & DOCLOSE)
|
1998-12-24 12:07:16 +00:00
|
|
|
VOP_CLOSE(vp, FNONBLOCK, NOCRED, p);
|
1997-02-10 02:22:35 +00:00
|
|
|
VOP_INACTIVE(vp, p);
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* Any other processes trying to obtain this lock must first
|
|
|
|
* wait for VXLOCK to clear, then call the new lock operation.
|
|
|
|
*/
|
|
|
|
VOP_UNLOCK(vp, 0, p);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Reclaim the vnode.
|
|
|
|
*/
|
1997-02-10 02:22:35 +00:00
|
|
|
if (VOP_RECLAIM(vp, p))
|
1994-05-24 10:09:53 +00:00
|
|
|
panic("vclean: cannot reclaim");
|
1998-02-23 06:59:52 +00:00
|
|
|
|
2000-01-29 15:22:58 +00:00
|
|
|
if (active) {
|
|
|
|
/*
|
|
|
|
* Inline copy of vrele() since VOP_INACTIVE
|
|
|
|
* has already been called.
|
|
|
|
*/
|
2000-10-04 01:29:17 +00:00
|
|
|
mtx_enter(&vp->v_interlock, MTX_DEF);
|
2000-01-29 15:22:58 +00:00
|
|
|
if (--vp->v_usecount <= 0) {
|
|
|
|
#ifdef DIAGNOSTIC
|
|
|
|
if (vp->v_usecount < 0 || vp->v_writecount != 0) {
|
|
|
|
vprint("vclean: bad ref count", vp);
|
|
|
|
panic("vclean: ref cnt");
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
vfree(vp);
|
|
|
|
}
|
2000-10-04 01:29:17 +00:00
|
|
|
mtx_exit(&vp->v_interlock, MTX_DEF);
|
2000-01-29 15:22:58 +00:00
|
|
|
}
|
1998-02-23 06:59:52 +00:00
|
|
|
|
1997-02-10 02:22:35 +00:00
|
|
|
cache_purge(vp);
|
2000-10-06 08:04:48 +00:00
|
|
|
vp->v_vnlock = NULL;
|
2000-10-04 01:29:17 +00:00
|
|
|
lockdestroy(&vp->v_lock);
|
1994-05-24 10:09:53 +00:00
|
|
|
|
1998-02-23 06:59:52 +00:00
|
|
|
if (VSHOULDFREE(vp))
|
|
|
|
vfree(vp);
|
1999-08-29 09:09:12 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Done with purge, notify sleepers of the grim news.
|
|
|
|
*/
|
|
|
|
vp->v_op = dead_vnodeop_p;
|
1997-12-15 03:09:59 +00:00
|
|
|
vn_pollgone(vp);
|
1994-05-24 10:09:53 +00:00
|
|
|
vp->v_tag = VT_NON;
|
|
|
|
vp->v_flag &= ~VXLOCK;
|
Implement a low-memory deadlock solution.
Removed most of the hacks that were trying to deal with low-memory
situations prior to now.
The new code is based on the concept that I/O must be able to function in
a low memory situation. All major modules related to I/O (except
networking) have been adjusted to allow allocation out of the system
reserve memory pool. These modules now detect a low memory situation but
rather than block they instead continue to operate, then return resources
to the memory pool instead of cache them or leave them wired.
Code has been added to stall in a low-memory situation prior to a vnode
being locked.
Thus situations where a process blocks in a low-memory condition while
holding a locked vnode have been reduced to near nothing. Not only will
I/O continue to operate, but many prior deadlock conditions simply no
longer exist.
Implement a number of VFS/BIO fixes
(found by Ian): in biodone(), bogus-page replacement code, the loop
was not properly incrementing loop variables prior to a continue
statement. We do not believe this code can be hit anyway but we
aren't taking any chances. We'll turn the whole section into a
panic (as it already is in brelse()) after the release is rolled.
In biodone(), the foff calculation was incorrectly
clamped to the iosize, causing the wrong foff to be calculated
for pages in the case of an I/O error or biodone() called without
initiating I/O. The problem always caused a panic before. Now it
doesn't. The problem is mainly an issue with NFS.
Fixed casts for ~PAGE_MASK. This code worked properly before only
because the calculations use signed arithmetic. Better to properly
extend PAGE_MASK first before inverting it for the 64 bit masking
op.
In brelse(), the bogus_page fixup code was improperly throwing
away the original contents of 'm' when it did the j-loop to
fix the bogus pages. The result was that it would potentially
invalidate parts of the *WRONG* page(!), leading to corruption.
There may still be cases where a background bitmap write is
being duplicated, causing potential corruption. We have identified
a potentially serious bug related to this but the fix is still TBD.
So instead this patch contains a KASSERT to detect the problem
and panic the machine rather than continue to corrupt the filesystem.
The problem does not occur very often.. it is very hard to
reproduce, and it may or may not be the cause of the corruption
people have reported.
Review by: (VFS/BIO: mckusick, Ian Dowse <iedowse@maths.tcd.ie>)
Testing by: (VM/Deadlock) Paul Saab <ps@yahoo-inc.com>
2000-11-18 23:06:26 +00:00
|
|
|
vp->v_vxproc = NULL;
|
1994-05-24 10:09:53 +00:00
|
|
|
if (vp->v_flag & VXWANT) {
|
|
|
|
vp->v_flag &= ~VXWANT;
|
These changes embody the support of the fully coherent merged VM buffer cache,
much higher filesystem I/O performance, and much better paging performance. It
represents the culmination of over 6 months of R&D.
The majority of the merged VM/cache work is by John Dyson.
The following highlights the most significant changes. Additionally, there are
(mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to
support the new VM/buffer scheme.
vfs_bio.c:
Significant rewrite of most of vfs_bio to support the merged VM buffer cache
scheme. The scheme is almost fully compatible with the old filesystem
interface. Significant improvement in the number of opportunities for write
clustering.
vfs_cluster.c, vfs_subr.c
Upgrade and performance enhancements in vfs layer code to support merged
VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff.
vm_object.c:
Yet more improvements in the collapse code. Elimination of some windows that
can cause list corruption.
vm_pageout.c:
Fixed it, it really works better now. Somehow in 2.0, some "enhancements"
broke the code. This code has been reworked from the ground-up.
vm_fault.c, vm_page.c, pmap.c, vm_object.c
Support for small-block filesystems with merged VM/buffer cache scheme.
pmap.c vm_map.c
Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of
kernel PTs.
vm_glue.c
Much simpler and more effective swapping code. No more gratuitous swapping.
proc.h
Fixed the problem that the p_lock flag was not being cleared on a fork.
swap_pager.c, vnode_pager.c
Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the
code doesn't need it anymore.
machdep.c
Changes to better support the parameter values for the merged VM/buffer cache
scheme.
machdep.c, kern_exec.c, vm_glue.c
Implemented a separate submap for temporary exec string space and another one
to contain process upages. This eliminates all map fragmentation problems
that previously existed.
ffs_inode.c, ufs_inode.c, ufs_readwrite.c
Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on
busy buffers.
Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
|
|
|
wakeup((caddr_t) vp);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
1997-02-10 02:22:35 +00:00
|
|
|
* Eliminate all activity associated with the requested vnode
|
1994-05-24 10:09:53 +00:00
|
|
|
* and with all vnodes aliased to the requested vnode.
|
|
|
|
*/
|
1997-02-10 02:22:35 +00:00
|
|
|
int
|
|
|
|
vop_revoke(ap)
|
|
|
|
struct vop_revoke_args /* {
|
|
|
|
struct vnode *a_vp;
|
|
|
|
int a_flags;
|
|
|
|
} */ *ap;
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
1997-02-10 02:22:35 +00:00
|
|
|
struct vnode *vp, *vq;
|
1999-08-26 14:53:31 +00:00
|
|
|
dev_t dev;
|
1997-02-10 02:22:35 +00:00
|
|
|
|
1999-01-08 17:31:30 +00:00
|
|
|
KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke"));
|
1997-02-10 02:22:35 +00:00
|
|
|
|
|
|
|
vp = ap->a_vp;
|
1999-08-26 14:53:31 +00:00
|
|
|
/*
|
|
|
|
* If a vgone (or vclean) is already in progress,
|
|
|
|
* wait until it is done and return.
|
|
|
|
*/
|
|
|
|
if (vp->v_flag & VXLOCK) {
|
|
|
|
vp->v_flag |= VXWANT;
|
2000-12-01 03:43:33 +00:00
|
|
|
msleep((caddr_t)vp, &vp->v_interlock, PINOD | PDROP,
|
|
|
|
"vop_revokeall", 0);
|
1999-08-26 14:53:31 +00:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
dev = vp->v_rdev;
|
|
|
|
for (;;) {
|
2001-01-24 12:35:55 +00:00
|
|
|
mtx_enter(&spechash_mtx, MTX_DEF);
|
1999-08-26 14:53:31 +00:00
|
|
|
vq = SLIST_FIRST(&dev->si_hlist);
|
2001-01-24 12:35:55 +00:00
|
|
|
mtx_exit(&spechash_mtx, MTX_DEF);
|
1999-08-26 14:53:31 +00:00
|
|
|
if (!vq)
|
|
|
|
break;
|
|
|
|
vgone(vq);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
1997-02-10 02:22:35 +00:00
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Recycle an unused vnode to the front of the free list.
|
|
|
|
* Release the passed interlock if the vnode will be recycled.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
vrecycle(vp, inter_lkp, p)
|
|
|
|
struct vnode *vp;
|
2001-01-24 12:35:55 +00:00
|
|
|
struct mtx *inter_lkp;
|
1997-02-10 02:22:35 +00:00
|
|
|
struct proc *p;
|
|
|
|
{
|
|
|
|
|
2000-10-04 01:29:17 +00:00
|
|
|
mtx_enter(&vp->v_interlock, MTX_DEF);
|
1997-02-10 02:22:35 +00:00
|
|
|
if (vp->v_usecount == 0) {
|
|
|
|
if (inter_lkp) {
|
2001-01-24 12:35:55 +00:00
|
|
|
mtx_exit(inter_lkp, MTX_DEF);
|
1997-02-10 02:22:35 +00:00
|
|
|
}
|
|
|
|
vgonel(vp, p);
|
|
|
|
return (1);
|
|
|
|
}
|
2000-10-04 01:29:17 +00:00
|
|
|
mtx_exit(&vp->v_interlock, MTX_DEF);
|
1997-02-10 02:22:35 +00:00
|
|
|
return (0);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Eliminate all activity associated with a vnode
|
|
|
|
* in preparation for reuse.
|
|
|
|
*/
|
1994-05-25 09:21:21 +00:00
|
|
|
void
|
|
|
|
vgone(vp)
|
1994-05-24 10:09:53 +00:00
|
|
|
register struct vnode *vp;
|
|
|
|
{
|
1997-02-10 02:22:35 +00:00
|
|
|
struct proc *p = curproc; /* XXX */
|
|
|
|
|
2000-10-04 01:29:17 +00:00
|
|
|
mtx_enter(&vp->v_interlock, MTX_DEF);
|
1997-02-10 02:22:35 +00:00
|
|
|
vgonel(vp, p);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* vgone, with the vp interlock held.
|
|
|
|
*/
|
2000-02-02 07:07:17 +00:00
|
|
|
void
|
1997-02-10 02:22:35 +00:00
|
|
|
vgonel(vp, p)
|
|
|
|
struct vnode *vp;
|
|
|
|
struct proc *p;
|
|
|
|
{
|
1998-01-12 01:46:33 +00:00
|
|
|
int s;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
/*
|
1997-02-10 02:22:35 +00:00
|
|
|
* If a vgone (or vclean) is already in progress,
|
|
|
|
* wait until it is done and return.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
|
|
|
if (vp->v_flag & VXLOCK) {
|
|
|
|
vp->v_flag |= VXWANT;
|
2000-12-01 03:43:33 +00:00
|
|
|
msleep((caddr_t)vp, &vp->v_interlock, PINOD | PDROP,
|
|
|
|
"vgone", 0);
|
1994-05-24 10:09:53 +00:00
|
|
|
return;
|
|
|
|
}
|
1996-10-17 02:49:35 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Clean out the filesystem specific data.
|
|
|
|
*/
|
1997-02-10 02:22:35 +00:00
|
|
|
vclean(vp, DOCLOSE, p);
|
2000-10-04 01:29:17 +00:00
|
|
|
mtx_enter(&vp->v_interlock, MTX_DEF);
|
1997-12-29 00:25:11 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Delete from old mount point vnode list, if on one.
|
|
|
|
*/
|
1997-02-10 02:22:35 +00:00
|
|
|
if (vp->v_mount != NULL)
|
|
|
|
insmntque(vp, (struct mount *)0);
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
1997-02-10 02:22:35 +00:00
|
|
|
* If special device, remove it from special device alias list
|
|
|
|
* if it is on one.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
2000-11-02 21:14:13 +00:00
|
|
|
if (vp->v_type == VCHR && vp->v_rdev != NULL && vp->v_rdev != NODEV) {
|
2001-01-24 12:35:55 +00:00
|
|
|
mtx_enter(&spechash_mtx, MTX_DEF);
|
2000-09-19 10:28:44 +00:00
|
|
|
SLIST_REMOVE(&vp->v_rdev->si_hlist, vp, vnode, v_specnext);
|
1999-08-29 09:09:12 +00:00
|
|
|
freedev(vp->v_rdev);
|
2001-01-24 12:35:55 +00:00
|
|
|
mtx_exit(&spechash_mtx, MTX_DEF);
|
1999-08-13 10:10:12 +00:00
|
|
|
vp->v_rdev = NULL;
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
1997-02-10 02:22:35 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
1997-02-10 02:22:35 +00:00
|
|
|
* If it is on the freelist and not already at the head,
|
2000-07-04 04:32:40 +00:00
|
|
|
* move it to the head of the list. The test of the
|
|
|
|
* VDOOMED flag and the reference count of zero is because
|
1997-02-10 02:22:35 +00:00
|
|
|
* it will be removed from the free list by getnewvnode,
|
|
|
|
* but will not have its reference count incremented until
|
|
|
|
* after calling vgone. If the reference count were
|
|
|
|
* incremented first, vgone would (incorrectly) try to
|
|
|
|
* close the previous instance of the underlying object.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
1997-08-31 07:32:39 +00:00
|
|
|
if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) {
|
1998-01-12 01:46:33 +00:00
|
|
|
s = splbio();
|
2001-01-24 12:35:55 +00:00
|
|
|
mtx_enter(&vnode_free_list_mtx, MTX_DEF);
|
2000-07-04 04:32:40 +00:00
|
|
|
if (vp->v_flag & VFREE)
|
1998-01-12 01:46:33 +00:00
|
|
|
TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
|
2000-07-04 04:32:40 +00:00
|
|
|
else
|
1998-01-31 01:17:58 +00:00
|
|
|
freevnodes++;
|
1998-01-12 01:46:33 +00:00
|
|
|
vp->v_flag |= VFREE;
|
1997-08-31 07:32:39 +00:00
|
|
|
TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
|
2001-01-24 12:35:55 +00:00
|
|
|
mtx_exit(&vnode_free_list_mtx, MTX_DEF);
|
1998-01-12 01:46:33 +00:00
|
|
|
splx(s);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
1997-02-10 02:22:35 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
vp->v_type = VBAD;
|
2000-10-04 01:29:17 +00:00
|
|
|
mtx_exit(&vp->v_interlock, MTX_DEF);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Lookup a vnode by device number.
|
|
|
|
*/
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
1994-05-24 10:09:53 +00:00
|
|
|
vfinddev(dev, type, vpp)
|
|
|
|
dev_t dev;
|
|
|
|
enum vtype type;
|
|
|
|
struct vnode **vpp;
|
|
|
|
{
|
1999-08-26 14:53:31 +00:00
|
|
|
struct vnode *vp;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2001-01-24 12:35:55 +00:00
|
|
|
mtx_enter(&spechash_mtx, MTX_DEF);
|
1999-08-26 14:53:31 +00:00
|
|
|
SLIST_FOREACH(vp, &dev->si_hlist, v_specnext) {
|
|
|
|
if (type == vp->v_type) {
|
|
|
|
*vpp = vp;
|
2001-01-24 12:35:55 +00:00
|
|
|
mtx_exit(&spechash_mtx, MTX_DEF);
|
1999-08-26 14:53:31 +00:00
|
|
|
return (1);
|
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2001-01-24 12:35:55 +00:00
|
|
|
mtx_exit(&spechash_mtx, MTX_DEF);
|
1999-08-26 14:53:31 +00:00
|
|
|
return (0);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Calculate the total number of references to a special device.
|
|
|
|
*/
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
1994-05-24 10:09:53 +00:00
|
|
|
vcount(vp)
|
1999-08-26 14:53:31 +00:00
|
|
|
struct vnode *vp;
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
1999-08-28 19:21:03 +00:00
|
|
|
struct vnode *vq;
|
1994-05-24 10:09:53 +00:00
|
|
|
int count;
|
|
|
|
|
1999-08-26 14:53:31 +00:00
|
|
|
count = 0;
|
2001-01-24 12:35:55 +00:00
|
|
|
mtx_enter(&spechash_mtx, MTX_DEF);
|
2000-09-19 10:28:44 +00:00
|
|
|
SLIST_FOREACH(vq, &vp->v_rdev->si_hlist, v_specnext)
|
1994-05-24 10:09:53 +00:00
|
|
|
count += vq->v_usecount;
|
2001-01-24 12:35:55 +00:00
|
|
|
mtx_exit(&spechash_mtx, MTX_DEF);
|
1994-05-24 10:09:53 +00:00
|
|
|
return (count);
|
|
|
|
}
|
1999-08-26 14:53:31 +00:00
|
|
|
|
2000-02-07 23:05:40 +00:00
|
|
|
/*
|
|
|
|
* Same as above, but using the dev_t as argument
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
count_dev(dev)
|
|
|
|
dev_t dev;
|
|
|
|
{
|
|
|
|
struct vnode *vp;
|
|
|
|
|
|
|
|
vp = SLIST_FIRST(&dev->si_hlist);
|
|
|
|
if (vp == NULL)
|
|
|
|
return (0);
|
|
|
|
return(vcount(vp));
|
|
|
|
}
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Print out a description of a vnode.
|
|
|
|
*/
|
|
|
|
static char *typename[] =
|
These changes embody the support of the fully coherent merged VM buffer cache,
much higher filesystem I/O performance, and much better paging performance. It
represents the culmination of over 6 months of R&D.
The majority of the merged VM/cache work is by John Dyson.
The following highlights the most significant changes. Additionally, there are
(mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to
support the new VM/buffer scheme.
vfs_bio.c:
Significant rewrite of most of vfs_bio to support the merged VM buffer cache
scheme. The scheme is almost fully compatible with the old filesystem
interface. Significant improvement in the number of opportunities for write
clustering.
vfs_cluster.c, vfs_subr.c
Upgrade and performance enhancements in vfs layer code to support merged
VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff.
vm_object.c:
Yet more improvements in the collapse code. Elimination of some windows that
can cause list corruption.
vm_pageout.c:
Fixed it, it really works better now. Somehow in 2.0, some "enhancements"
broke the code. This code has been reworked from the ground-up.
vm_fault.c, vm_page.c, pmap.c, vm_object.c
Support for small-block filesystems with merged VM/buffer cache scheme.
pmap.c vm_map.c
Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of
kernel PTs.
vm_glue.c
Much simpler and more effective swapping code. No more gratuitous swapping.
proc.h
Fixed the problem that the p_lock flag was not being cleared on a fork.
swap_pager.c, vnode_pager.c
Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the
code doesn't need it anymore.
machdep.c
Changes to better support the parameter values for the merged VM/buffer cache
scheme.
machdep.c, kern_exec.c, vm_glue.c
Implemented a seperate submap for temporary exec string space and another one
to contain process upages. This eliminates all map fragmentation problems
that previously existed.
ffs_inode.c, ufs_inode.c, ufs_readwrite.c
Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on
busy buffers.
Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
|
|
|
{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};
|
1994-05-24 10:09:53 +00:00
|
|
|
|
1994-05-25 09:21:21 +00:00
|
|
|
void
|
1994-05-24 10:09:53 +00:00
|
|
|
vprint(label, vp)
|
|
|
|
char *label;
|
1999-08-26 14:53:31 +00:00
|
|
|
struct vnode *vp;
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
1998-12-04 22:54:57 +00:00
|
|
|
char buf[96];
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
if (label != NULL)
|
1998-07-11 07:46:16 +00:00
|
|
|
printf("%s: %p: ", label, (void *)vp);
|
1997-04-04 17:46:21 +00:00
|
|
|
else
|
1998-07-11 07:46:16 +00:00
|
|
|
printf("%p: ", (void *)vp);
|
|
|
|
printf("type %s, usecount %d, writecount %d, refcount %d,",
|
These changes embody the support of the fully coherent merged VM buffer cache,
much higher filesystem I/O performance, and much better paging performance. It
represents the culmination of over 6 months of R&D.
The majority of the merged VM/cache work is by John Dyson.
The following highlights the most significant changes. Additionally, there are
(mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to
support the new VM/buffer scheme.
vfs_bio.c:
Significant rewrite of most of vfs_bio to support the merged VM buffer cache
scheme. The scheme is almost fully compatible with the old filesystem
interface. Significant improvement in the number of opportunities for write
clustering.
vfs_cluster.c, vfs_subr.c
Upgrade and performance enhancements in vfs layer code to support merged
VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff.
vm_object.c:
Yet more improvements in the collapse code. Elimination of some windows that
can cause list corruption.
vm_pageout.c:
Fixed it, it really works better now. Somehow in 2.0, some "enhancements"
broke the code. This code has been reworked from the ground-up.
vm_fault.c, vm_page.c, pmap.c, vm_object.c
Support for small-block filesystems with merged VM/buffer cache scheme.
pmap.c vm_map.c
Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of
kernel PTs.
vm_glue.c
Much simpler and more effective swapping code. No more gratuitous swapping.
proc.h
Fixed the problem that the p_lock flag was not being cleared on a fork.
swap_pager.c, vnode_pager.c
Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the
code doesn't need it anymore.
machdep.c
Changes to better support the parameter values for the merged VM/buffer cache
scheme.
machdep.c, kern_exec.c, vm_glue.c
Implemented a seperate submap for temporary exec string space and another one
to contain process upages. This eliminates all map fragmentation problems
that previously existed.
ffs_inode.c, ufs_inode.c, ufs_readwrite.c
Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on
busy buffers.
Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
|
|
|
typename[vp->v_type], vp->v_usecount, vp->v_writecount,
|
|
|
|
vp->v_holdcnt);
|
1994-05-24 10:09:53 +00:00
|
|
|
buf[0] = '\0';
|
|
|
|
if (vp->v_flag & VROOT)
|
|
|
|
strcat(buf, "|VROOT");
|
|
|
|
if (vp->v_flag & VTEXT)
|
|
|
|
strcat(buf, "|VTEXT");
|
|
|
|
if (vp->v_flag & VSYSTEM)
|
|
|
|
strcat(buf, "|VSYSTEM");
|
|
|
|
if (vp->v_flag & VXLOCK)
|
|
|
|
strcat(buf, "|VXLOCK");
|
|
|
|
if (vp->v_flag & VXWANT)
|
|
|
|
strcat(buf, "|VXWANT");
|
|
|
|
if (vp->v_flag & VBWAIT)
|
|
|
|
strcat(buf, "|VBWAIT");
|
1997-08-31 07:32:39 +00:00
|
|
|
if (vp->v_flag & VDOOMED)
|
|
|
|
strcat(buf, "|VDOOMED");
|
|
|
|
if (vp->v_flag & VFREE)
|
|
|
|
strcat(buf, "|VFREE");
|
Make our v_usecount vnode reference count work identically to the
original BSD code. The association between the vnode and the vm_object
no longer includes reference counts. The major difference is that
vm_object's are no longer freed gratuitiously from the vnode, and so
once an object is created for the vnode, it will last as long as the
vnode does.
When a vnode object reference count is incremented, then the underlying
vnode reference count is incremented also. The two "objects" are now
more intimately related, and so the interactions are now much less
complex.
When vnodes are now normally placed onto the free queue with an object still
attached. The rundown of the object happens at vnode rundown time, and
happens with exactly the same filesystem semantics of the original VFS
code. There is absolutely no need for vnode_pager_uncache and other
travesties like that anymore.
A side-effect of these changes is that SMP locking should be much simpler,
the I/O copyin/copyout optimizations work, NFS should be more ponderable,
and further work on layered filesystems should be less frustrating, because
of the totally coherent management of the vnode objects and vnodes.
Please be careful with your system while running this code, but I would
greatly appreciate feedback as soon a reasonably possible.
1998-01-06 05:26:17 +00:00
|
|
|
if (vp->v_flag & VOBJBUF)
|
|
|
|
strcat(buf, "|VOBJBUF");
|
1994-05-24 10:09:53 +00:00
|
|
|
if (buf[0] != '\0')
|
|
|
|
printf(" flags (%s)", &buf[1]);
|
|
|
|
if (vp->v_data == NULL) {
|
|
|
|
printf("\n");
|
|
|
|
} else {
|
|
|
|
printf("\n\t");
|
|
|
|
VOP_PRINT(vp);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
1995-04-16 11:33:33 +00:00
|
|
|
#ifdef DDB
|
1998-10-25 17:44:59 +00:00
|
|
|
#include <ddb/ddb.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* List all of the locked vnodes in the system.
|
|
|
|
* Called when debugging the kernel.
|
|
|
|
*/
|
1998-10-25 17:44:59 +00:00
|
|
|
DB_SHOW_COMMAND(lockedvnodes, lockedvnodes)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
1997-02-25 19:33:23 +00:00
|
|
|
struct proc *p = curproc; /* XXX */
|
|
|
|
struct mount *mp, *nmp;
|
|
|
|
struct vnode *vp;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
printf("Locked vnodes\n");
|
2000-10-04 01:29:17 +00:00
|
|
|
mtx_enter(&mountlist_mtx, MTX_DEF);
|
1999-11-20 10:00:46 +00:00
|
|
|
for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
|
2000-10-04 01:29:17 +00:00
|
|
|
if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, p)) {
|
1999-11-20 10:00:46 +00:00
|
|
|
nmp = TAILQ_NEXT(mp, mnt_list);
|
1997-02-25 19:33:23 +00:00
|
|
|
continue;
|
|
|
|
}
|
1999-11-16 16:28:58 +00:00
|
|
|
LIST_FOREACH(vp, &mp->mnt_vnodelist, v_mntvnodes) {
|
1999-12-11 16:13:02 +00:00
|
|
|
if (VOP_ISLOCKED(vp, NULL))
|
1997-02-25 19:33:23 +00:00
|
|
|
vprint((char *)0, vp);
|
|
|
|
}
|
2000-10-04 01:29:17 +00:00
|
|
|
mtx_enter(&mountlist_mtx, MTX_DEF);
|
1999-11-20 10:00:46 +00:00
|
|
|
nmp = TAILQ_NEXT(mp, mnt_list);
|
1997-02-25 19:33:23 +00:00
|
|
|
vfs_unbusy(mp, p);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2000-10-04 01:29:17 +00:00
|
|
|
mtx_exit(&mountlist_mtx, MTX_DEF);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
1997-03-03 12:58:20 +00:00
|
|
|
/*
|
|
|
|
* Top level filesystem related information gathering.
|
|
|
|
*/
|
2000-07-03 09:35:31 +00:00
|
|
|
static int sysctl_ovfs_conf __P((SYSCTL_HANDLER_ARGS));
|
1997-03-03 12:58:20 +00:00
|
|
|
|
1997-03-04 18:31:56 +00:00
|
|
|
static int
|
2000-07-04 11:25:35 +00:00
|
|
|
vfs_sysctl(SYSCTL_HANDLER_ARGS)
|
1997-03-02 11:06:22 +00:00
|
|
|
{
|
1997-03-04 18:31:56 +00:00
|
|
|
int *name = (int *)arg1 - 1; /* XXX */
|
|
|
|
u_int namelen = arg2 + 1; /* XXX */
|
1997-03-02 11:06:22 +00:00
|
|
|
struct vfsconf *vfsp;
|
|
|
|
|
1998-08-29 13:13:10 +00:00
|
|
|
#if 1 || defined(COMPAT_PRELITE2)
|
1997-03-03 12:58:20 +00:00
|
|
|
/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
|
1997-03-04 18:31:56 +00:00
|
|
|
if (namelen == 1)
|
1997-03-03 12:58:20 +00:00
|
|
|
return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
|
|
|
|
#endif
|
1997-03-02 11:06:22 +00:00
|
|
|
|
2000-10-05 18:22:46 +00:00
|
|
|
/* XXX the below code does not compile; vfs_sysctl does not exist. */
|
1997-03-04 18:31:56 +00:00
|
|
|
#ifdef notyet
|
1997-03-03 12:58:20 +00:00
|
|
|
/* all sysctl names at this level are at least name and field */
|
|
|
|
if (namelen < 2)
|
|
|
|
return (ENOTDIR); /* overloaded */
|
|
|
|
if (name[0] != VFS_GENERIC) {
|
|
|
|
for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
|
|
|
|
if (vfsp->vfc_typenum == name[0])
|
|
|
|
break;
|
|
|
|
if (vfsp == NULL)
|
|
|
|
return (EOPNOTSUPP);
|
|
|
|
return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
|
|
|
|
oldp, oldlenp, newp, newlen, p));
|
|
|
|
}
|
1997-03-04 18:31:56 +00:00
|
|
|
#endif
|
1997-03-03 12:58:20 +00:00
|
|
|
switch (name[1]) {
|
|
|
|
case VFS_MAXTYPENUM:
|
|
|
|
if (namelen != 2)
|
|
|
|
return (ENOTDIR);
|
|
|
|
return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
|
|
|
|
case VFS_CONF:
|
|
|
|
if (namelen != 3)
|
|
|
|
return (ENOTDIR); /* overloaded */
|
|
|
|
for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
|
|
|
|
if (vfsp->vfc_typenum == name[2])
|
|
|
|
break;
|
|
|
|
if (vfsp == NULL)
|
|
|
|
return (EOPNOTSUPP);
|
|
|
|
return (SYSCTL_OUT(req, vfsp, sizeof *vfsp));
|
|
|
|
}
|
|
|
|
return (EOPNOTSUPP);
|
|
|
|
}
|
1997-03-02 11:06:22 +00:00
|
|
|
|
1997-03-04 18:31:56 +00:00
|
|
|
SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl,
|
|
|
|
"Generic filesystem");
|
|
|
|
|
1998-08-29 13:13:10 +00:00
|
|
|
#if 1 || defined(COMPAT_PRELITE2)
|
1997-03-02 11:06:22 +00:00
|
|
|
|
|
|
|
static int
|
2000-07-04 11:25:35 +00:00
|
|
|
sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
|
1997-03-02 11:06:22 +00:00
|
|
|
{
|
|
|
|
int error;
|
|
|
|
struct vfsconf *vfsp;
|
1997-03-03 12:58:20 +00:00
|
|
|
struct ovfsconf ovfs;
|
1997-03-02 11:06:22 +00:00
|
|
|
|
|
|
|
for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
|
|
|
|
ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */
|
|
|
|
strcpy(ovfs.vfc_name, vfsp->vfc_name);
|
|
|
|
ovfs.vfc_index = vfsp->vfc_typenum;
|
|
|
|
ovfs.vfc_refcount = vfsp->vfc_refcount;
|
|
|
|
ovfs.vfc_flags = vfsp->vfc_flags;
|
|
|
|
error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
|
|
|
|
if (error)
|
|
|
|
return error;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
1998-08-29 13:13:10 +00:00
|
|
|
#endif /* 1 || COMPAT_PRELITE2 */
|
1997-03-02 11:06:22 +00:00
|
|
|
|
2000-09-22 12:22:36 +00:00
|
|
|
#if COMPILING_LINT
|
1994-05-24 10:09:53 +00:00
|
|
|
#define KINFO_VNODESLOP 10
|
|
|
|
/*
|
|
|
|
* Dump vnode list (via sysctl).
|
|
|
|
* Copyout address of vnode followed by vnode.
|
|
|
|
*/
|
|
|
|
/* ARGSUSED */
|
1995-11-20 12:42:39 +00:00
|
|
|
static int
|
2000-07-04 11:25:35 +00:00
|
|
|
sysctl_vnode(SYSCTL_HANDLER_ARGS)
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
1997-02-10 02:22:35 +00:00
|
|
|
struct proc *p = curproc; /* XXX */
|
1997-02-25 19:33:23 +00:00
|
|
|
struct mount *mp, *nmp;
|
|
|
|
struct vnode *nvp, *vp;
|
1994-05-24 10:09:53 +00:00
|
|
|
int error;
|
|
|
|
|
|
|
|
#define VPTRSZ sizeof (struct vnode *)
|
|
|
|
#define VNODESZ sizeof (struct vnode)
|
1995-11-20 12:42:39 +00:00
|
|
|
|
|
|
|
req->lock = 0;
|
1995-11-29 11:28:00 +00:00
|
|
|
if (!req->oldptr) /* Make an estimate */
|
1995-11-20 12:42:39 +00:00
|
|
|
return (SYSCTL_OUT(req, 0,
|
|
|
|
(numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ)));
|
These changes embody the support of the fully coherent merged VM buffer cache,
much higher filesystem I/O performance, and much better paging performance. It
represents the culmination of over 6 months of R&D.
The majority of the merged VM/cache work is by John Dyson.
The following highlights the most significant changes. Additionally, there are
(mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to
support the new VM/buffer scheme.
vfs_bio.c:
Significant rewrite of most of vfs_bio to support the merged VM buffer cache
scheme. The scheme is almost fully compatible with the old filesystem
interface. Significant improvement in the number of opportunities for write
clustering.
vfs_cluster.c, vfs_subr.c
Upgrade and performance enhancements in vfs layer code to support merged
VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff.
vm_object.c:
Yet more improvements in the collapse code. Elimination of some windows that
can cause list corruption.
vm_pageout.c:
Fixed it, it really works better now. Somehow in 2.0, some "enhancements"
broke the code. This code has been reworked from the ground-up.
vm_fault.c, vm_page.c, pmap.c, vm_object.c
Support for small-block filesystems with merged VM/buffer cache scheme.
pmap.c vm_map.c
Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of
kernel PTs.
vm_glue.c
Much simpler and more effective swapping code. No more gratuitous swapping.
proc.h
Fixed the problem that the p_lock flag was not being cleared on a fork.
swap_pager.c, vnode_pager.c
Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the
code doesn't need it anymore.
machdep.c
Changes to better support the parameter values for the merged VM/buffer cache
scheme.
machdep.c, kern_exec.c, vm_glue.c
Implemented a seperate submap for temporary exec string space and another one
to contain process upages. This eliminates all map fragmentation problems
that previously existed.
ffs_inode.c, ufs_inode.c, ufs_readwrite.c
Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on
busy buffers.
Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
|
|
|
|
2000-10-04 01:29:17 +00:00
|
|
|
mtx_enter(&mountlist_mtx, MTX_DEF);
|
1999-11-20 10:00:46 +00:00
|
|
|
for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
|
2000-10-04 01:29:17 +00:00
|
|
|
if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, p)) {
|
1999-11-20 10:00:46 +00:00
|
|
|
nmp = TAILQ_NEXT(mp, mnt_list);
|
1994-05-24 10:09:53 +00:00
|
|
|
continue;
|
1997-02-25 19:33:23 +00:00
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
again:
|
2001-01-24 12:35:55 +00:00
|
|
|
mtx_enter(&mntvnode_mtx, MTX_DEF);
|
1999-11-16 16:28:58 +00:00
|
|
|
for (vp = LIST_FIRST(&mp->mnt_vnodelist);
|
1997-02-25 19:33:23 +00:00
|
|
|
vp != NULL;
|
|
|
|
vp = nvp) {
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
1997-02-25 19:33:23 +00:00
|
|
|
* Check that the vp is still associated with
|
|
|
|
* this filesystem. RACE: could have been
|
|
|
|
* recycled onto the same filesystem.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
|
|
|
if (vp->v_mount != mp) {
|
2001-01-24 12:35:55 +00:00
|
|
|
mtx_exit(&mntvnode_mtx, MTX_DEF);
|
1994-05-24 10:09:53 +00:00
|
|
|
goto again;
|
|
|
|
}
|
1999-11-16 16:28:58 +00:00
|
|
|
nvp = LIST_NEXT(vp, v_mntvnodes);
|
2001-01-24 12:35:55 +00:00
|
|
|
mtx_exit(&mntvnode_mtx, MTX_DEF);
|
1995-11-20 12:42:39 +00:00
|
|
|
if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) ||
|
1997-02-25 19:33:23 +00:00
|
|
|
(error = SYSCTL_OUT(req, vp, VNODESZ)))
|
1994-05-24 10:09:53 +00:00
|
|
|
return (error);
|
2001-01-24 12:35:55 +00:00
|
|
|
mtx_enter(&mntvnode_mtx, MTX_DEF);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2001-01-24 12:35:55 +00:00
|
|
|
mtx_exit(&mntvnode_mtx, MTX_DEF);
|
2000-10-04 01:29:17 +00:00
|
|
|
mtx_enter(&mountlist_mtx, MTX_DEF);
|
1999-11-20 10:00:46 +00:00
|
|
|
nmp = TAILQ_NEXT(mp, mnt_list);
|
1997-02-10 02:22:35 +00:00
|
|
|
vfs_unbusy(mp, p);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2000-10-04 01:29:17 +00:00
|
|
|
mtx_exit(&mountlist_mtx, MTX_DEF);
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
1997-06-10 02:48:08 +00:00
|
|
|
/*
|
|
|
|
* XXX
|
|
|
|
* Exporting the vnode list on large systems causes them to crash.
|
|
|
|
* Exporting the vnode list on medium systems causes sysctl to coredump.
|
|
|
|
*/
|
1995-12-06 13:27:39 +00:00
|
|
|
SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
|
|
|
|
0, 0, sysctl_vnode, "S,vnode", "");
|
1997-06-10 02:48:08 +00:00
|
|
|
#endif
|
1995-11-20 12:42:39 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Check to see if a filesystem is mounted on a block device.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
vfs_mountedon(vp)
|
1997-02-10 02:22:35 +00:00
|
|
|
struct vnode *vp;
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
|
|
|
|
2000-10-09 17:31:39 +00:00
|
|
|
if (vp->v_rdev->si_mountpoint != NULL)
|
1994-05-24 10:09:53 +00:00
|
|
|
return (EBUSY);
|
1999-08-26 14:53:31 +00:00
|
|
|
return (0);
|
1997-02-10 02:22:35 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
1997-02-26 15:35:42 +00:00
|
|
|
* Unmount all filesystems. The list is traversed in reverse order
|
|
|
|
* of mounting to avoid dependencies.
|
1997-02-10 02:22:35 +00:00
|
|
|
*/
|
|
|
|
void
|
|
|
|
vfs_unmountall()
|
|
|
|
{
|
1999-11-20 10:00:46 +00:00
|
|
|
struct mount *mp;
|
1998-05-17 19:38:55 +00:00
|
|
|
struct proc *p;
|
1997-02-10 02:22:35 +00:00
|
|
|
int error;
|
|
|
|
|
1998-05-17 19:38:55 +00:00
|
|
|
if (curproc != NULL)
|
|
|
|
p = curproc;
|
|
|
|
else
|
|
|
|
p = initproc; /* XXX XXX should this be proc0? */
|
1997-02-26 15:35:42 +00:00
|
|
|
/*
|
|
|
|
* Since this only runs when rebooting, it is not interlocked.
|
|
|
|
*/
|
1999-11-20 10:00:46 +00:00
|
|
|
while(!TAILQ_EMPTY(&mountlist)) {
|
|
|
|
mp = TAILQ_LAST(&mountlist, mntlist);
|
1997-02-26 15:35:42 +00:00
|
|
|
error = dounmount(mp, MNT_FORCE, p);
|
1997-02-10 02:22:35 +00:00
|
|
|
if (error) {
|
1999-11-20 10:00:46 +00:00
|
|
|
TAILQ_REMOVE(&mountlist, mp, mnt_list);
|
1997-02-26 15:35:42 +00:00
|
|
|
printf("unmount of %s failed (",
|
|
|
|
mp->mnt_stat.f_mntonname);
|
1997-02-10 02:22:35 +00:00
|
|
|
if (error == EBUSY)
|
|
|
|
printf("BUSY)\n");
|
|
|
|
else
|
|
|
|
printf("%d)\n", error);
|
1999-11-20 10:00:46 +00:00
|
|
|
} else {
|
|
|
|
/* The unmount has removed mp from the mountlist */
|
1997-02-10 02:22:35 +00:00
|
|
|
}
|
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Build hash lists of net addresses and hang them off the mount point.
|
|
|
|
* Called by ufs_mount() to set up the lists of export addresses.
|
|
|
|
*/
|
|
|
|
static int
|
1997-09-16 11:44:05 +00:00
|
|
|
vfs_hang_addrlist(mp, nep, argp)
|
|
|
|
struct mount *mp;
|
|
|
|
struct netexport *nep;
|
|
|
|
struct export_args *argp;
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
|
|
|
register struct netcred *np;
|
|
|
|
register struct radix_node_head *rnh;
|
|
|
|
register int i;
|
|
|
|
struct radix_node *rn;
|
|
|
|
struct sockaddr *saddr, *smask = 0;
|
|
|
|
struct domain *dom;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
if (argp->ex_addrlen == 0) {
|
|
|
|
if (mp->mnt_flag & MNT_DEFEXPORTED)
|
|
|
|
return (EPERM);
|
|
|
|
np = &nep->ne_defexported;
|
|
|
|
np->netc_exflags = argp->ex_flags;
|
|
|
|
np->netc_anon = argp->ex_anon;
|
|
|
|
np->netc_anon.cr_ref = 1;
|
|
|
|
mp->mnt_flag |= MNT_DEFEXPORTED;
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
|
2000-12-08 21:51:06 +00:00
|
|
|
np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK | M_ZERO);
|
These changes embody the support of the fully coherent merged VM buffer cache,
much higher filesystem I/O performance, and much better paging performance. It
represents the culmination of over 6 months of R&D.
The majority of the merged VM/cache work is by John Dyson.
The following highlights the most significant changes. Additionally, there are
(mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to
support the new VM/buffer scheme.
vfs_bio.c:
Significant rewrite of most of vfs_bio to support the merged VM buffer cache
scheme. The scheme is almost fully compatible with the old filesystem
interface. Significant improvement in the number of opportunities for write
clustering.
vfs_cluster.c, vfs_subr.c
Upgrade and performance enhancements in vfs layer code to support merged
VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff.
vm_object.c:
Yet more improvements in the collapse code. Elimination of some windows that
can cause list corruption.
vm_pageout.c:
Fixed it, it really works better now. Somehow in 2.0, some "enhancements"
broke the code. This code has been reworked from the ground-up.
vm_fault.c, vm_page.c, pmap.c, vm_object.c
Support for small-block filesystems with merged VM/buffer cache scheme.
pmap.c vm_map.c
Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of
kernel PTs.
vm_glue.c
Much simpler and more effective swapping code. No more gratuitous swapping.
proc.h
Fixed the problem that the p_lock flag was not being cleared on a fork.
swap_pager.c, vnode_pager.c
Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the
code doesn't need it anymore.
machdep.c
Changes to better support the parameter values for the merged VM/buffer cache
scheme.
machdep.c, kern_exec.c, vm_glue.c
Implemented a seperate submap for temporary exec string space and another one
to contain process upages. This eliminates all map fragmentation problems
that previously existed.
ffs_inode.c, ufs_inode.c, ufs_readwrite.c
Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on
busy buffers.
Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
|
|
|
saddr = (struct sockaddr *) (np + 1);
|
|
|
|
if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen)))
|
1994-05-24 10:09:53 +00:00
|
|
|
goto out;
|
|
|
|
if (saddr->sa_len > argp->ex_addrlen)
|
|
|
|
saddr->sa_len = argp->ex_addrlen;
|
|
|
|
if (argp->ex_masklen) {
|
These changes embody the support of the fully coherent merged VM buffer cache,
much higher filesystem I/O performance, and much better paging performance. It
represents the culmination of over 6 months of R&D.
The majority of the merged VM/cache work is by John Dyson.
The following highlights the most significant changes. Additionally, there are
(mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to
support the new VM/buffer scheme.
vfs_bio.c:
Significant rewrite of most of vfs_bio to support the merged VM buffer cache
scheme. The scheme is almost fully compatible with the old filesystem
interface. Significant improvement in the number of opportunities for write
clustering.
vfs_cluster.c, vfs_subr.c
Upgrade and performance enhancements in vfs layer code to support merged
VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff.
vm_object.c:
Yet more improvements in the collapse code. Elimination of some windows that
can cause list corruption.
vm_pageout.c:
Fixed it, it really works better now. Somehow in 2.0, some "enhancements"
broke the code. This code has been reworked from the ground-up.
vm_fault.c, vm_page.c, pmap.c, vm_object.c
Support for small-block filesystems with merged VM/buffer cache scheme.
pmap.c vm_map.c
Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of
kernel PTs.
vm_glue.c
Much simpler and more effective swapping code. No more gratuitous swapping.
proc.h
Fixed the problem that the p_lock flag was not being cleared on a fork.
swap_pager.c, vnode_pager.c
Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the
code doesn't need it anymore.
machdep.c
Changes to better support the parameter values for the merged VM/buffer cache
scheme.
machdep.c, kern_exec.c, vm_glue.c
Implemented a seperate submap for temporary exec string space and another one
to contain process upages. This eliminates all map fragmentation problems
that previously existed.
ffs_inode.c, ufs_inode.c, ufs_readwrite.c
Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on
busy buffers.
Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
|
|
|
smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen);
|
1997-04-25 06:47:12 +00:00
|
|
|
error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen);
|
1994-05-24 10:09:53 +00:00
|
|
|
if (error)
|
|
|
|
goto out;
|
|
|
|
if (smask->sa_len > argp->ex_masklen)
|
|
|
|
smask->sa_len = argp->ex_masklen;
|
|
|
|
}
|
|
|
|
i = saddr->sa_family;
|
|
|
|
if ((rnh = nep->ne_rtable[i]) == 0) {
|
|
|
|
/*
|
These changes embody the support of the fully coherent merged VM buffer cache,
much higher filesystem I/O performance, and much better paging performance. It
represents the culmination of over 6 months of R&D.
The majority of the merged VM/cache work is by John Dyson.
The following highlights the most significant changes. Additionally, there are
(mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to
support the new VM/buffer scheme.
vfs_bio.c:
Significant rewrite of most of vfs_bio to support the merged VM buffer cache
scheme. The scheme is almost fully compatible with the old filesystem
interface. Significant improvement in the number of opportunities for write
clustering.
vfs_cluster.c, vfs_subr.c
Upgrade and performance enhancements in vfs layer code to support merged
VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff.
vm_object.c:
Yet more improvements in the collapse code. Elimination of some windows that
can cause list corruption.
vm_pageout.c:
Fixed it, it really works better now. Somehow in 2.0, some "enhancements"
broke the code. This code has been reworked from the ground-up.
vm_fault.c, vm_page.c, pmap.c, vm_object.c
Support for small-block filesystems with merged VM/buffer cache scheme.
pmap.c vm_map.c
Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of
kernel PTs.
vm_glue.c
Much simpler and more effective swapping code. No more gratuitous swapping.
proc.h
Fixed the problem that the p_lock flag was not being cleared on a fork.
swap_pager.c, vnode_pager.c
Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the
code doesn't need it anymore.
machdep.c
Changes to better support the parameter values for the merged VM/buffer cache
scheme.
machdep.c, kern_exec.c, vm_glue.c
Implemented a seperate submap for temporary exec string space and another one
to contain process upages. This eliminates all map fragmentation problems
that previously existed.
ffs_inode.c, ufs_inode.c, ufs_readwrite.c
Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on
busy buffers.
Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
|
|
|
* Seems silly to initialize every AF when most are not used,
|
|
|
|
* do so on demand here
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
|
|
|
for (dom = domains; dom; dom = dom->dom_next)
|
|
|
|
if (dom->dom_family == i && dom->dom_rtattach) {
|
These changes embody the support of the fully coherent merged VM buffer cache,
much higher filesystem I/O performance, and much better paging performance. It
represents the culmination of over 6 months of R&D.
The majority of the merged VM/cache work is by John Dyson.
The following highlights the most significant changes. Additionally, there are
(mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to
support the new VM/buffer scheme.
vfs_bio.c:
Significant rewrite of most of vfs_bio to support the merged VM buffer cache
scheme. The scheme is almost fully compatible with the old filesystem
interface. Significant improvement in the number of opportunities for write
clustering.
vfs_cluster.c, vfs_subr.c
Upgrade and performance enhancements in vfs layer code to support merged
VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff.
vm_object.c:
Yet more improvements in the collapse code. Elimination of some windows that
can cause list corruption.
vm_pageout.c:
Fixed it, it really works better now. Somehow in 2.0, some "enhancements"
broke the code. This code has been reworked from the ground-up.
vm_fault.c, vm_page.c, pmap.c, vm_object.c
Support for small-block filesystems with merged VM/buffer cache scheme.
pmap.c vm_map.c
Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of
kernel PTs.
vm_glue.c
Much simpler and more effective swapping code. No more gratuitous swapping.
proc.h
Fixed the problem that the p_lock flag was not being cleared on a fork.
swap_pager.c, vnode_pager.c
Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the
code doesn't need it anymore.
machdep.c
Changes to better support the parameter values for the merged VM/buffer cache
scheme.
machdep.c, kern_exec.c, vm_glue.c
Implemented a seperate submap for temporary exec string space and another one
to contain process upages. This eliminates all map fragmentation problems
that previously existed.
ffs_inode.c, ufs_inode.c, ufs_readwrite.c
Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on
busy buffers.
Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
|
|
|
dom->dom_rtattach((void **) &nep->ne_rtable[i],
|
|
|
|
dom->dom_rtoffset);
|
1994-05-24 10:09:53 +00:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
if ((rnh = nep->ne_rtable[i]) == 0) {
|
|
|
|
error = ENOBUFS;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
These changes embody the support of the fully coherent merged VM buffer cache,
much higher filesystem I/O performance, and much better paging performance. It
represents the culmination of over 6 months of R&D.
The majority of the merged VM/cache work is by John Dyson.
The following highlights the most significant changes. Additionally, there are
(mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to
support the new VM/buffer scheme.
vfs_bio.c:
Significant rewrite of most of vfs_bio to support the merged VM buffer cache
scheme. The scheme is almost fully compatible with the old filesystem
interface. Significant improvement in the number of opportunities for write
clustering.
vfs_cluster.c, vfs_subr.c
Upgrade and performance enhancements in vfs layer code to support merged
VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff.
vm_object.c:
Yet more improvements in the collapse code. Elimination of some windows that
can cause list corruption.
vm_pageout.c:
Fixed it, it really works better now. Somehow in 2.0, some "enhancements"
broke the code. This code has been reworked from the ground-up.
vm_fault.c, vm_page.c, pmap.c, vm_object.c
Support for small-block filesystems with merged VM/buffer cache scheme.
pmap.c vm_map.c
Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of
kernel PTs.
vm_glue.c
Much simpler and more effective swapping code. No more gratuitous swapping.
proc.h
Fixed the problem that the p_lock flag was not being cleared on a fork.
swap_pager.c, vnode_pager.c
Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the
code doesn't need it anymore.
machdep.c
Changes to better support the parameter values for the merged VM/buffer cache
scheme.
machdep.c, kern_exec.c, vm_glue.c
Implemented a seperate submap for temporary exec string space and another one
to contain process upages. This eliminates all map fragmentation problems
that previously existed.
ffs_inode.c, ufs_inode.c, ufs_readwrite.c
Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on
busy buffers.
Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
|
|
|
rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh,
|
|
|
|
np->netc_rnodes);
|
|
|
|
if (rn == 0 || np != (struct netcred *) rn) { /* already exists */
|
1994-05-24 10:09:53 +00:00
|
|
|
error = EPERM;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
np->netc_exflags = argp->ex_flags;
|
|
|
|
np->netc_anon = argp->ex_anon;
|
|
|
|
np->netc_anon.cr_ref = 1;
|
|
|
|
return (0);
|
|
|
|
out:
|
|
|
|
free(np, M_NETADDR);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
|
2000-10-05 18:22:46 +00:00
|
|
|
/* Helper for vfs_free_addrlist. */
|
1994-05-24 10:09:53 +00:00
|
|
|
/* ARGSUSED */
|
|
|
|
static int
|
1997-09-16 11:44:05 +00:00
|
|
|
vfs_free_netcred(rn, w)
|
|
|
|
struct radix_node *rn;
|
|
|
|
void *w;
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
These changes embody the support of the fully coherent merged VM buffer cache,
much higher filesystem I/O performance, and much better paging performance. It
represents the culmination of over 6 months of R&D.
The majority of the merged VM/cache work is by John Dyson.
The following highlights the most significant changes. Additionally, there are
(mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to
support the new VM/buffer scheme.
vfs_bio.c:
Significant rewrite of most of vfs_bio to support the merged VM buffer cache
scheme. The scheme is almost fully compatible with the old filesystem
interface. Significant improvement in the number of opportunities for write
clustering.
vfs_cluster.c, vfs_subr.c
Upgrade and performance enhancements in vfs layer code to support merged
VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff.
vm_object.c:
Yet more improvements in the collapse code. Elimination of some windows that
can cause list corruption.
vm_pageout.c:
Fixed it, it really works better now. Somehow in 2.0, some "enhancements"
broke the code. This code has been reworked from the ground-up.
vm_fault.c, vm_page.c, pmap.c, vm_object.c
Support for small-block filesystems with merged VM/buffer cache scheme.
pmap.c vm_map.c
Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of
kernel PTs.
vm_glue.c
Much simpler and more effective swapping code. No more gratuitous swapping.
proc.h
Fixed the problem that the p_lock flag was not being cleared on a fork.
swap_pager.c, vnode_pager.c
Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the
code doesn't need it anymore.
machdep.c
Changes to better support the parameter values for the merged VM/buffer cache
scheme.
machdep.c, kern_exec.c, vm_glue.c
Implemented a seperate submap for temporary exec string space and another one
to contain process upages. This eliminates all map fragmentation problems
that previously existed.
ffs_inode.c, ufs_inode.c, ufs_readwrite.c
Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on
busy buffers.
Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
|
|
|
register struct radix_node_head *rnh = (struct radix_node_head *) w;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
These changes embody the support of the fully coherent merged VM buffer cache,
much higher filesystem I/O performance, and much better paging performance. It
represents the culmination of over 6 months of R&D.
The majority of the merged VM/cache work is by John Dyson.
The following highlights the most significant changes. Additionally, there are
(mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to
support the new VM/buffer scheme.
vfs_bio.c:
Significant rewrite of most of vfs_bio to support the merged VM buffer cache
scheme. The scheme is almost fully compatible with the old filesystem
interface. Significant improvement in the number of opportunities for write
clustering.
vfs_cluster.c, vfs_subr.c
Upgrade and performance enhancements in vfs layer code to support merged
VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff.
vm_object.c:
Yet more improvements in the collapse code. Elimination of some windows that
can cause list corruption.
vm_pageout.c:
Fixed it, it really works better now. Somehow in 2.0, some "enhancements"
broke the code. This code has been reworked from the ground-up.
vm_fault.c, vm_page.c, pmap.c, vm_object.c
Support for small-block filesystems with merged VM/buffer cache scheme.
pmap.c vm_map.c
Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of
kernel PTs.
vm_glue.c
Much simpler and more effective swapping code. No more gratuitous swapping.
proc.h
Fixed the problem that the p_lock flag was not being cleared on a fork.
swap_pager.c, vnode_pager.c
Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the
code doesn't need it anymore.
machdep.c
Changes to better support the parameter values for the merged VM/buffer cache
scheme.
machdep.c, kern_exec.c, vm_glue.c
Implemented a seperate submap for temporary exec string space and another one
to contain process upages. This eliminates all map fragmentation problems
that previously existed.
ffs_inode.c, ufs_inode.c, ufs_readwrite.c
Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on
busy buffers.
Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
|
|
|
(*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
|
|
|
|
free((caddr_t) rn, M_NETADDR);
|
1994-05-24 10:09:53 +00:00
|
|
|
return (0);
|
|
|
|
}
|
These changes embody the support of the fully coherent merged VM buffer cache,
much higher filesystem I/O performance, and much better paging performance. It
represents the culmination of over 6 months of R&D.
The majority of the merged VM/cache work is by John Dyson.
The following highlights the most significant changes. Additionally, there are
(mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to
support the new VM/buffer scheme.
vfs_bio.c:
Significant rewrite of most of vfs_bio to support the merged VM buffer cache
scheme. The scheme is almost fully compatible with the old filesystem
interface. Significant improvement in the number of opportunities for write
clustering.
vfs_cluster.c, vfs_subr.c
Upgrade and performance enhancements in vfs layer code to support merged
VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff.
vm_object.c:
Yet more improvements in the collapse code. Elimination of some windows that
can cause list corruption.
vm_pageout.c:
Fixed it, it really works better now. Somehow in 2.0, some "enhancements"
broke the code. This code has been reworked from the ground-up.
vm_fault.c, vm_page.c, pmap.c, vm_object.c
Support for small-block filesystems with merged VM/buffer cache scheme.
pmap.c vm_map.c
Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of
kernel PTs.
vm_glue.c
Much simpler and more effective swapping code. No more gratuitous swapping.
proc.h
Fixed the problem that the p_lock flag was not being cleared on a fork.
swap_pager.c, vnode_pager.c
Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the
code doesn't need it anymore.
machdep.c
Changes to better support the parameter values for the merged VM/buffer cache
scheme.
machdep.c, kern_exec.c, vm_glue.c
Implemented a seperate submap for temporary exec string space and another one
to contain process upages. This eliminates all map fragmentation problems
that previously existed.
ffs_inode.c, ufs_inode.c, ufs_readwrite.c
Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on
busy buffers.
Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Free the net address hash lists that are hanging off the mount points.
|
|
|
|
*/
|
|
|
|
static void
|
1997-09-16 11:44:05 +00:00
|
|
|
vfs_free_addrlist(nep)
|
|
|
|
struct netexport *nep;
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
|
|
|
register int i;
|
|
|
|
register struct radix_node_head *rnh;
|
|
|
|
|
|
|
|
for (i = 0; i <= AF_MAX; i++)
|
1994-09-25 19:34:02 +00:00
|
|
|
if ((rnh = nep->ne_rtable[i])) {
|
These changes embody the support of the fully coherent merged VM buffer cache,
much higher filesystem I/O performance, and much better paging performance. It
represents the culmination of over 6 months of R&D.
The majority of the merged VM/cache work is by John Dyson.
The following highlights the most significant changes. Additionally, there are
(mostly minor) changes to the various filesystem modules (nfs, msdosfs, etc) to
support the new VM/buffer scheme.
vfs_bio.c:
Significant rewrite of most of vfs_bio to support the merged VM buffer cache
scheme. The scheme is almost fully compatible with the old filesystem
interface. Significant improvement in the number of opportunities for write
clustering.
vfs_cluster.c, vfs_subr.c
Upgrade and performance enhancements in vfs layer code to support merged
VM/buffer cache. Fixup of vfs_cluster to eliminate the bogus pagemove stuff.
vm_object.c:
Yet more improvements in the collapse code. Elimination of some windows that
can cause list corruption.
vm_pageout.c:
Fixed it, it really works better now. Somehow in 2.0, some "enhancements"
broke the code. This code has been reworked from the ground-up.
vm_fault.c, vm_page.c, pmap.c, vm_object.c
Support for small-block filesystems with merged VM/buffer cache scheme.
pmap.c vm_map.c
Dynamic kernel VM size, now we dont have to pre-allocate excessive numbers of
kernel PTs.
vm_glue.c
Much simpler and more effective swapping code. No more gratuitous swapping.
proc.h
Fixed the problem that the p_lock flag was not being cleared on a fork.
swap_pager.c, vnode_pager.c
Removal of old vfs_bio cruft to support the past pseudo-coherency. Now the
code doesn't need it anymore.
machdep.c
Changes to better support the parameter values for the merged VM/buffer cache
scheme.
machdep.c, kern_exec.c, vm_glue.c
Implemented a seperate submap for temporary exec string space and another one
to contain process upages. This eliminates all map fragmentation problems
that previously existed.
ffs_inode.c, ufs_inode.c, ufs_readwrite.c
Changes for merged VM/buffer cache. Add "bypass" support for sneaking in on
busy buffers.
Submitted by: John Dyson and David Greenman
1995-01-09 16:06:02 +00:00
|
|
|
(*rnh->rnh_walktree) (rnh, vfs_free_netcred,
|
|
|
|
(caddr_t) rnh);
|
|
|
|
free((caddr_t) rnh, M_RTABLE);
|
1994-05-24 10:09:53 +00:00
|
|
|
nep->ne_rtable[i] = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2000-09-21 15:55:55 +00:00
|
|
|
/*
|
|
|
|
* High level function to manipulate export options on a mount point
|
|
|
|
* and the passed in netexport.
|
|
|
|
* Struct export_args *argp is the variable used to twiddle options,
|
|
|
|
* the structure is described in sys/mount.h
|
|
|
|
*/
|
1994-05-24 10:09:53 +00:00
|
|
|
int
|
|
|
|
vfs_export(mp, nep, argp)
|
|
|
|
struct mount *mp;
|
|
|
|
struct netexport *nep;
|
|
|
|
struct export_args *argp;
|
|
|
|
{
|
|
|
|
int error;
|
|
|
|
|
|
|
|
if (argp->ex_flags & MNT_DELEXPORT) {
|
1997-07-17 07:17:33 +00:00
|
|
|
if (mp->mnt_flag & MNT_EXPUBLIC) {
|
|
|
|
vfs_setpublicfs(NULL, NULL, NULL);
|
|
|
|
mp->mnt_flag &= ~MNT_EXPUBLIC;
|
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
vfs_free_addrlist(nep);
|
|
|
|
mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
|
|
|
|
}
|
|
|
|
if (argp->ex_flags & MNT_EXPORTED) {
|
1997-07-17 07:17:33 +00:00
|
|
|
if (argp->ex_flags & MNT_EXPUBLIC) {
|
|
|
|
if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
|
|
|
|
return (error);
|
|
|
|
mp->mnt_flag |= MNT_EXPUBLIC;
|
|
|
|
}
|
1994-09-25 19:34:02 +00:00
|
|
|
if ((error = vfs_hang_addrlist(mp, nep, argp)))
|
1994-05-24 10:09:53 +00:00
|
|
|
return (error);
|
|
|
|
mp->mnt_flag |= MNT_EXPORTED;
|
|
|
|
}
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
1997-07-17 07:17:33 +00:00
|
|
|
/*
|
|
|
|
* Set the publicly exported filesystem (WebNFS). Currently, only
|
|
|
|
* one public filesystem is possible in the spec (RFC 2054 and 2055)
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
vfs_setpublicfs(mp, nep, argp)
|
|
|
|
struct mount *mp;
|
|
|
|
struct netexport *nep;
|
|
|
|
struct export_args *argp;
|
|
|
|
{
|
|
|
|
int error;
|
|
|
|
struct vnode *rvp;
|
|
|
|
char *cp;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* mp == NULL -> invalidate the current info, the FS is
|
|
|
|
* no longer exported. May be called from either vfs_export
|
|
|
|
* or unmount, so check if it hasn't already been done.
|
|
|
|
*/
|
|
|
|
if (mp == NULL) {
|
|
|
|
if (nfs_pub.np_valid) {
|
|
|
|
nfs_pub.np_valid = 0;
|
|
|
|
if (nfs_pub.np_index != NULL) {
|
|
|
|
FREE(nfs_pub.np_index, M_TEMP);
|
|
|
|
nfs_pub.np_index = NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Only one allowed at a time.
|
|
|
|
*/
|
|
|
|
if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
|
|
|
|
return (EBUSY);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Get real filehandle for root of exported FS.
|
|
|
|
*/
|
|
|
|
bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle));
|
|
|
|
nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid;
|
|
|
|
|
|
|
|
if ((error = VFS_ROOT(mp, &rvp)))
|
|
|
|
return (error);
|
|
|
|
|
|
|
|
if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid)))
|
|
|
|
return (error);
|
|
|
|
|
|
|
|
vput(rvp);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If an indexfile was specified, pull it in.
|
|
|
|
*/
|
|
|
|
if (argp->ex_indexfile != NULL) {
|
|
|
|
MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP,
|
|
|
|
M_WAITOK);
|
|
|
|
error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
|
|
|
|
MAXNAMLEN, (size_t *)0);
|
|
|
|
if (!error) {
|
|
|
|
/*
|
|
|
|
* Check for illegal filenames.
|
|
|
|
*/
|
|
|
|
for (cp = nfs_pub.np_index; *cp; cp++) {
|
|
|
|
if (*cp == '/') {
|
|
|
|
error = EINVAL;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (error) {
|
|
|
|
FREE(nfs_pub.np_index, M_TEMP);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
nfs_pub.np_mount = mp;
|
|
|
|
nfs_pub.np_valid = 1;
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2000-09-21 15:55:55 +00:00
|
|
|
/*
|
|
|
|
* Used by the filesystems to determine if a given network address
|
|
|
|
* (passed in 'nam') is present in thier exports list, returns a pointer
|
|
|
|
* to struct netcred so that the filesystem can examine it for
|
|
|
|
* access rights (read/write/etc).
|
|
|
|
*/
|
1994-05-24 10:09:53 +00:00
|
|
|
struct netcred *
|
|
|
|
vfs_export_lookup(mp, nep, nam)
|
|
|
|
register struct mount *mp;
|
|
|
|
struct netexport *nep;
|
1997-08-16 19:16:27 +00:00
|
|
|
struct sockaddr *nam;
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
|
|
|
register struct netcred *np;
|
|
|
|
register struct radix_node_head *rnh;
|
|
|
|
struct sockaddr *saddr;
|
|
|
|
|
|
|
|
np = NULL;
|
|
|
|
if (mp->mnt_flag & MNT_EXPORTED) {
|
|
|
|
/*
|
|
|
|
* Lookup in the export list first.
|
|
|
|
*/
|
|
|
|
if (nam != NULL) {
|
1997-08-16 19:16:27 +00:00
|
|
|
saddr = nam;
|
1994-05-24 10:09:53 +00:00
|
|
|
rnh = nep->ne_rtable[saddr->sa_family];
|
|
|
|
if (rnh != NULL) {
|
|
|
|
np = (struct netcred *)
|
1997-02-10 02:22:35 +00:00
|
|
|
(*rnh->rnh_matchaddr)((caddr_t)saddr,
|
|
|
|
rnh);
|
1994-05-24 10:09:53 +00:00
|
|
|
if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
|
|
|
|
np = NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* If no address match, use the default if it exists.
|
|
|
|
*/
|
|
|
|
if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
|
|
|
|
np = &nep->ne_defexported;
|
|
|
|
}
|
|
|
|
return (np);
|
|
|
|
}
|
1995-05-21 21:39:31 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* perform msync on all vnodes under a mount point
|
|
|
|
* the mount point must be locked.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
vfs_msync(struct mount *mp, int flags) {
|
1995-12-11 04:58:34 +00:00
|
|
|
struct vnode *vp, *nvp;
|
1998-04-18 06:26:16 +00:00
|
|
|
struct vm_object *obj;
|
Make our v_usecount vnode reference count work identically to the
original BSD code. The association between the vnode and the vm_object
no longer includes reference counts. The major difference is that
vm_object's are no longer freed gratuitiously from the vnode, and so
once an object is created for the vnode, it will last as long as the
vnode does.
When a vnode object reference count is incremented, then the underlying
vnode reference count is incremented also. The two "objects" are now
more intimately related, and so the interactions are now much less
complex.
When vnodes are now normally placed onto the free queue with an object still
attached. The rundown of the object happens at vnode rundown time, and
happens with exactly the same filesystem semantics of the original VFS
code. There is absolutely no need for vnode_pager_uncache and other
travesties like that anymore.
A side-effect of these changes is that SMP locking should be much simpler,
the I/O copyin/copyout optimizations work, NFS should be more ponderable,
and further work on layered filesystems should be less frustrating, because
of the totally coherent management of the vnode objects and vnodes.
Please be careful with your system while running this code, but I would
greatly appreciate feedback as soon a reasonably possible.
1998-01-06 05:26:17 +00:00
|
|
|
int anyio, tries;
|
|
|
|
|
|
|
|
tries = 5;
|
1995-05-21 21:39:31 +00:00
|
|
|
loop:
|
Make our v_usecount vnode reference count work identically to the
original BSD code. The association between the vnode and the vm_object
no longer includes reference counts. The major difference is that
vm_object's are no longer freed gratuitiously from the vnode, and so
once an object is created for the vnode, it will last as long as the
vnode does.
When a vnode object reference count is incremented, then the underlying
vnode reference count is incremented also. The two "objects" are now
more intimately related, and so the interactions are now much less
complex.
When vnodes are now normally placed onto the free queue with an object still
attached. The rundown of the object happens at vnode rundown time, and
happens with exactly the same filesystem semantics of the original VFS
code. There is absolutely no need for vnode_pager_uncache and other
travesties like that anymore.
A side-effect of these changes is that SMP locking should be much simpler,
the I/O copyin/copyout optimizations work, NFS should be more ponderable,
and further work on layered filesystems should be less frustrating, because
of the totally coherent management of the vnode objects and vnodes.
Please be careful with your system while running this code, but I would
greatly appreciate feedback as soon a reasonably possible.
1998-01-06 05:26:17 +00:00
|
|
|
anyio = 0;
|
1999-11-16 16:28:58 +00:00
|
|
|
for (vp = LIST_FIRST(&mp->mnt_vnodelist); vp != NULL; vp = nvp) {
|
1995-05-21 21:39:31 +00:00
|
|
|
|
1999-11-16 16:28:58 +00:00
|
|
|
nvp = LIST_NEXT(vp, v_mntvnodes);
|
Make our v_usecount vnode reference count work identically to the
original BSD code. The association between the vnode and the vm_object
no longer includes reference counts. The major difference is that
vm_object's are no longer freed gratuitiously from the vnode, and so
once an object is created for the vnode, it will last as long as the
vnode does.
When a vnode object reference count is incremented, then the underlying
vnode reference count is incremented also. The two "objects" are now
more intimately related, and so the interactions are now much less
complex.
When vnodes are now normally placed onto the free queue with an object still
attached. The rundown of the object happens at vnode rundown time, and
happens with exactly the same filesystem semantics of the original VFS
code. There is absolutely no need for vnode_pager_uncache and other
travesties like that anymore.
A side-effect of these changes is that SMP locking should be much simpler,
the I/O copyin/copyout optimizations work, NFS should be more ponderable,
and further work on layered filesystems should be less frustrating, because
of the totally coherent management of the vnode objects and vnodes.
Please be careful with your system while running this code, but I would
greatly appreciate feedback as soon a reasonably possible.
1998-01-06 05:26:17 +00:00
|
|
|
|
|
|
|
if (vp->v_mount != mp) {
|
|
|
|
goto loop;
|
|
|
|
}
|
|
|
|
|
1998-04-18 06:26:16 +00:00
|
|
|
if (vp->v_flag & VXLOCK) /* XXX: what if MNT_WAIT? */
|
1995-05-21 21:39:31 +00:00
|
|
|
continue;
|
1998-04-18 06:26:16 +00:00
|
|
|
|
|
|
|
if (flags != MNT_WAIT) {
|
2000-09-12 09:49:08 +00:00
|
|
|
if (VOP_GETVOBJECT(vp, &obj) != 0 ||
|
|
|
|
(obj->flags & OBJ_MIGHTBEDIRTY) == 0)
|
1998-04-18 06:26:16 +00:00
|
|
|
continue;
|
1999-12-11 16:13:02 +00:00
|
|
|
if (VOP_ISLOCKED(vp, NULL))
|
1998-04-18 06:26:16 +00:00
|
|
|
continue;
|
Make our v_usecount vnode reference count work identically to the
original BSD code. The association between the vnode and the vm_object
no longer includes reference counts. The major difference is that
vm_object's are no longer freed gratuitiously from the vnode, and so
once an object is created for the vnode, it will last as long as the
vnode does.
When a vnode object reference count is incremented, then the underlying
vnode reference count is incremented also. The two "objects" are now
more intimately related, and so the interactions are now much less
complex.
When vnodes are now normally placed onto the free queue with an object still
attached. The rundown of the object happens at vnode rundown time, and
happens with exactly the same filesystem semantics of the original VFS
code. There is absolutely no need for vnode_pager_uncache and other
travesties like that anymore.
A side-effect of these changes is that SMP locking should be much simpler,
the I/O copyin/copyout optimizations work, NFS should be more ponderable,
and further work on layered filesystems should be less frustrating, because
of the totally coherent management of the vnode objects and vnodes.
Please be careful with your system while running this code, but I would
greatly appreciate feedback as soon a reasonably possible.
1998-01-06 05:26:17 +00:00
|
|
|
}
|
|
|
|
|
2000-10-04 01:29:17 +00:00
|
|
|
mtx_enter(&vp->v_interlock, MTX_DEF);
|
2000-09-12 09:49:08 +00:00
|
|
|
if (VOP_GETVOBJECT(vp, &obj) == 0 &&
|
|
|
|
(obj->flags & OBJ_MIGHTBEDIRTY)) {
|
Make our v_usecount vnode reference count work identically to the
original BSD code. The association between the vnode and the vm_object
no longer includes reference counts. The major difference is that
vm_object's are no longer freed gratuitiously from the vnode, and so
once an object is created for the vnode, it will last as long as the
vnode does.
When a vnode object reference count is incremented, then the underlying
vnode reference count is incremented also. The two "objects" are now
more intimately related, and so the interactions are now much less
complex.
When vnodes are now normally placed onto the free queue with an object still
attached. The rundown of the object happens at vnode rundown time, and
happens with exactly the same filesystem semantics of the original VFS
code. There is absolutely no need for vnode_pager_uncache and other
travesties like that anymore.
A side-effect of these changes is that SMP locking should be much simpler,
the I/O copyin/copyout optimizations work, NFS should be more ponderable,
and further work on layered filesystems should be less frustrating, because
of the totally coherent management of the vnode objects and vnodes.
Please be careful with your system while running this code, but I would
greatly appreciate feedback as soon a reasonably possible.
1998-01-06 05:26:17 +00:00
|
|
|
if (!vget(vp,
|
|
|
|
LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curproc)) {
|
2000-09-12 09:49:08 +00:00
|
|
|
if (VOP_GETVOBJECT(vp, &obj) == 0) {
|
|
|
|
vm_object_page_clean(obj, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : OBJPC_NOSYNC);
|
Make our v_usecount vnode reference count work identically to the
original BSD code. The association between the vnode and the vm_object
no longer includes reference counts. The major difference is that
vm_object's are no longer freed gratuitiously from the vnode, and so
once an object is created for the vnode, it will last as long as the
vnode does.
When a vnode object reference count is incremented, then the underlying
vnode reference count is incremented also. The two "objects" are now
more intimately related, and so the interactions are now much less
complex.
When vnodes are now normally placed onto the free queue with an object still
attached. The rundown of the object happens at vnode rundown time, and
happens with exactly the same filesystem semantics of the original VFS
code. There is absolutely no need for vnode_pager_uncache and other
travesties like that anymore.
A side-effect of these changes is that SMP locking should be much simpler,
the I/O copyin/copyout optimizations work, NFS should be more ponderable,
and further work on layered filesystems should be less frustrating, because
of the totally coherent management of the vnode objects and vnodes.
Please be careful with your system while running this code, but I would
greatly appreciate feedback as soon a reasonably possible.
1998-01-06 05:26:17 +00:00
|
|
|
anyio = 1;
|
|
|
|
}
|
|
|
|
vput(vp);
|
|
|
|
}
|
|
|
|
} else {
|
2000-10-04 01:29:17 +00:00
|
|
|
mtx_exit(&vp->v_interlock, MTX_DEF);
|
1995-05-21 21:39:31 +00:00
|
|
|
}
|
|
|
|
}
|
Make our v_usecount vnode reference count work identically to the
original BSD code. The association between the vnode and the vm_object
no longer includes reference counts. The major difference is that
vm_object's are no longer freed gratuitiously from the vnode, and so
once an object is created for the vnode, it will last as long as the
vnode does.
When a vnode object reference count is incremented, then the underlying
vnode reference count is incremented also. The two "objects" are now
more intimately related, and so the interactions are now much less
complex.
When vnodes are now normally placed onto the free queue with an object still
attached. The rundown of the object happens at vnode rundown time, and
happens with exactly the same filesystem semantics of the original VFS
code. There is absolutely no need for vnode_pager_uncache and other
travesties like that anymore.
A side-effect of these changes is that SMP locking should be much simpler,
the I/O copyin/copyout optimizations work, NFS should be more ponderable,
and further work on layered filesystems should be less frustrating, because
of the totally coherent management of the vnode objects and vnodes.
Please be careful with your system while running this code, but I would
greatly appreciate feedback as soon a reasonably possible.
1998-01-06 05:26:17 +00:00
|
|
|
if (anyio && (--tries > 0))
|
|
|
|
goto loop;
|
1995-05-21 21:39:31 +00:00
|
|
|
}
|
1996-08-21 21:56:23 +00:00
|
|
|
|
|
|
|
/*
 * Create the VM object needed for VMIO and mmap support.  This is done
 * for all VREG files in the system.  Some filesystems might afford the
 * additional metadata buffering capability of the VMIO code by making
 * the device node be VMIO mode also.
 *
 * vp must be locked when vfs_object_create is called.
 *
 * The work is handed to VOP_CREATEVOBJECT and its result is returned
 * unchanged.  Note that the VOP takes (vp, cred, p), which is not the
 * parameter order of this wrapper.
 */
int
vfs_object_create(vp, p, cred)
	struct vnode *vp;
	struct proc *p;
	struct ucred *cred;
{
	int error;

	error = VOP_CREATEVOBJECT(vp, cred, p);
	return (error);
}
|
1. Add a {pointer, v_id} pair to the vnode to store the reference to the
".." vnode. This is cheaper storagewise than keeping it in the
namecache, and it makes more sense since it's a 1:1 mapping.
2. Also handle the case of "." more intelligently rather than stuff
the namecache with pointless entries.
3. Add two lists to the vnode and hang namecache entries which go from
or to this vnode. When cleaning a vnode, delete all namecache
entries it invalidates.
4. Never reuse namecache enties, malloc new ones when we need it, free
old ones when they die. No longer a hard limit on how many we can
have.
5. Remove the upper limit on namelength of namecache entries.
6. Make a global list for negative namecache entries, limit their number
to a sysctl'able (debug.ncnegfactor) fraction of the total namecache.
Currently the default fraction is 1/16th. (Suggestions for better
default wanted!)
7. Assign v_id correctly in the face of 32bit rollover.
8. Remove the LRU list for namecache entries, not needed. Remove the
#ifdef NCH_STATISTICS stuff, it's not needed either.
9. Use the vnode freelist as a true LRU list, also for namecache accesses.
10. Reuse vnodes more aggresively but also more selectively, if we can't
reuse, malloc a new one. There is no longer a hard limit on their
number, they grow to the point where we don't reuse potentially
usable vnodes. A vnode will not get recycled if still has pages in
core or if it is the source of namecache entries (Yes, this does
indeed work :-) "." and ".." are not namecache entries any longer...)
11. Do not overload the v_id field in namecache entries with whiteout
information, use a char sized flags field instead, so we can get
rid of the vpid and v_id fields from the namecache struct. Since
we're linked to the vnodes and purged when they're cleaned, we don't
have to check the v_id any more.
12. NFS knew about the limitation on name length in the namecache, it
shouldn't and doesn't now.
Bugs:
The namecache statistics no longer includes the hits for ".."
and "." hits.
Performance impact:
Generally in the +/- 0.5% for "normal" workstations, but
I hope this will allow the system to be selftuning over a
bigger range of "special" applications. The case where
RAM is available but unused for cache because we don't have
any vnodes should be gone.
Future work:
Straighten out the namecache statistics.
"desiredvnodes" is still used to (bogusly ?) size hash
tables in the filesystems.
I have still to find a way to safely free unused vnodes
back so their number can shrink when not needed.
There is a few uses of the v_id field left in the filesystems,
scheduled for demolition at a later time.
Maybe a one slot cache for unused namecache entries should
be implemented to decrease the malloc/free frequency.
1997-05-04 09:17:38 +00:00
|
|
|
|
2000-09-22 12:22:36 +00:00
|
|
|
/*
|
|
|
|
* Mark a vnode as free, putting it up for recycling.
|
|
|
|
*/
|
2000-07-04 04:32:40 +00:00
|
|
|
void
|
1997-08-31 07:32:39 +00:00
|
|
|
vfree(vp)
|
1. Add a {pointer, v_id} pair to the vnode to store the reference to the
".." vnode. This is cheaper storagewise than keeping it in the
namecache, and it makes more sense since it's a 1:1 mapping.
2. Also handle the case of "." more intelligently rather than stuff
the namecache with pointless entries.
3. Add two lists to the vnode and hang namecache entries which go from
or to this vnode. When cleaning a vnode, delete all namecache
entries it invalidates.
4. Never reuse namecache enties, malloc new ones when we need it, free
old ones when they die. No longer a hard limit on how many we can
have.
5. Remove the upper limit on namelength of namecache entries.
6. Make a global list for negative namecache entries, limit their number
to a sysctl'able (debug.ncnegfactor) fraction of the total namecache.
Currently the default fraction is 1/16th. (Suggestions for better
default wanted!)
7. Assign v_id correctly in the face of 32bit rollover.
8. Remove the LRU list for namecache entries, not needed. Remove the
#ifdef NCH_STATISTICS stuff, it's not needed either.
9. Use the vnode freelist as a true LRU list, also for namecache accesses.
10. Reuse vnodes more aggresively but also more selectively, if we can't
reuse, malloc a new one. There is no longer a hard limit on their
number, they grow to the point where we don't reuse potentially
usable vnodes. A vnode will not get recycled if still has pages in
core or if it is the source of namecache entries (Yes, this does
indeed work :-) "." and ".." are not namecache entries any longer...)
11. Do not overload the v_id field in namecache entries with whiteout
information, use a char sized flags field instead, so we can get
rid of the vpid and v_id fields from the namecache struct. Since
we're linked to the vnodes and purged when they're cleaned, we don't
have to check the v_id any more.
12. NFS knew about the limitation on name length in the namecache, it
shouldn't and doesn't now.
Bugs:
The namecache statistics no longer includes the hits for ".."
and "." hits.
Performance impact:
Generally in the +/- 0.5% for "normal" workstations, but
I hope this will allow the system to be selftuning over a
bigger range of "special" applications. The case where
RAM is available but unused for cache because we don't have
any vnodes should be gone.
Future work:
Straighten out the namecache statistics.
"desiredvnodes" is still used to (bogusly ?) size hash
tables in the filesystems.
I have still to find a way to safely free unused vnodes
back so their number can shrink when not needed.
There is a few uses of the v_id field left in the filesystems,
scheduled for demolition at a later time.
Maybe a one slot cache for unused namecache entries should
be implemented to decrease the malloc/free frequency.
1997-05-04 09:17:38 +00:00
|
|
|
struct vnode *vp;
|
|
|
|
{
|
1998-01-12 01:46:33 +00:00
|
|
|
int s;
|
|
|
|
|
|
|
|
s = splbio();
|
2001-01-24 12:35:55 +00:00
|
|
|
mtx_enter(&vnode_free_list_mtx, MTX_DEF);
|
2000-07-04 04:32:40 +00:00
|
|
|
KASSERT((vp->v_flag & VFREE) == 0, ("vnode already free"));
|
1997-08-31 07:32:39 +00:00
|
|
|
if (vp->v_flag & VAGE) {
|
|
|
|
TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
|
|
|
|
} else {
|
|
|
|
TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
|
1. Add a {pointer, v_id} pair to the vnode to store the reference to the
".." vnode. This is cheaper storagewise than keeping it in the
namecache, and it makes more sense since it's a 1:1 mapping.
2. Also handle the case of "." more intelligently rather than stuff
the namecache with pointless entries.
3. Add two lists to the vnode and hang namecache entries which go from
or to this vnode. When cleaning a vnode, delete all namecache
entries it invalidates.
4. Never reuse namecache enties, malloc new ones when we need it, free
old ones when they die. No longer a hard limit on how many we can
have.
5. Remove the upper limit on namelength of namecache entries.
6. Make a global list for negative namecache entries, limit their number
to a sysctl'able (debug.ncnegfactor) fraction of the total namecache.
Currently the default fraction is 1/16th. (Suggestions for better
default wanted!)
7. Assign v_id correctly in the face of 32bit rollover.
8. Remove the LRU list for namecache entries, not needed. Remove the
#ifdef NCH_STATISTICS stuff, it's not needed either.
9. Use the vnode freelist as a true LRU list, also for namecache accesses.
10. Reuse vnodes more aggresively but also more selectively, if we can't
reuse, malloc a new one. There is no longer a hard limit on their
number, they grow to the point where we don't reuse potentially
usable vnodes. A vnode will not get recycled if still has pages in
core or if it is the source of namecache entries (Yes, this does
indeed work :-) "." and ".." are not namecache entries any longer...)
11. Do not overload the v_id field in namecache entries with whiteout
information, use a char sized flags field instead, so we can get
rid of the vpid and v_id fields from the namecache struct. Since
we're linked to the vnodes and purged when they're cleaned, we don't
have to check the v_id any more.
12. NFS knew about the limitation on name length in the namecache, it
shouldn't and doesn't now.
Bugs:
The namecache statistics no longer includes the hits for ".."
and "." hits.
Performance impact:
Generally in the +/- 0.5% for "normal" workstations, but
I hope this will allow the system to be selftuning over a
bigger range of "special" applications. The case where
RAM is available but unused for cache because we don't have
any vnodes should be gone.
Future work:
Straighten out the namecache statistics.
"desiredvnodes" is still used to (bogusly ?) size hash
tables in the filesystems.
I have still to find a way to safely free unused vnodes
back so their number can shrink when not needed.
There is a few uses of the v_id field left in the filesystems,
scheduled for demolition at a later time.
Maybe a one slot cache for unused namecache entries should
be implemented to decrease the malloc/free frequency.
1997-05-04 09:17:38 +00:00
|
|
|
}
|
1997-08-31 07:32:39 +00:00
|
|
|
freevnodes++;
|
2001-01-24 12:35:55 +00:00
|
|
|
mtx_exit(&vnode_free_list_mtx, MTX_DEF);
|
1997-08-31 07:32:39 +00:00
|
|
|
vp->v_flag &= ~VAGE;
|
|
|
|
vp->v_flag |= VFREE;
|
1998-01-12 01:46:33 +00:00
|
|
|
splx(s);
|
1997-08-31 07:32:39 +00:00
|
|
|
}
|
|
|
|
|
2000-09-22 12:22:36 +00:00
|
|
|
/*
|
|
|
|
* Opposite of vfree() - mark a vnode as in use.
|
|
|
|
*/
|
1998-01-17 09:17:02 +00:00
|
|
|
void
|
1997-08-31 07:32:39 +00:00
|
|
|
vbusy(vp)
|
|
|
|
struct vnode *vp;
|
|
|
|
{
|
1998-01-12 01:46:33 +00:00
|
|
|
int s;
|
|
|
|
|
|
|
|
s = splbio();
|
2001-01-24 12:35:55 +00:00
|
|
|
mtx_enter(&vnode_free_list_mtx, MTX_DEF);
|
2000-07-04 04:32:40 +00:00
|
|
|
KASSERT((vp->v_flag & VFREE) != 0, ("vnode not free"));
|
|
|
|
TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
|
|
|
|
freevnodes--;
|
2001-01-24 12:35:55 +00:00
|
|
|
mtx_exit(&vnode_free_list_mtx, MTX_DEF);
|
1998-02-23 06:59:52 +00:00
|
|
|
vp->v_flag &= ~(VFREE|VAGE);
|
1998-01-12 01:46:33 +00:00
|
|
|
splx(s);
|
1. Add a {pointer, v_id} pair to the vnode to store the reference to the
".." vnode. This is cheaper storagewise than keeping it in the
namecache, and it makes more sense since it's a 1:1 mapping.
2. Also handle the case of "." more intelligently rather than stuff
the namecache with pointless entries.
3. Add two lists to the vnode and hang namecache entries which go from
or to this vnode. When cleaning a vnode, delete all namecache
entries it invalidates.
4. Never reuse namecache enties, malloc new ones when we need it, free
old ones when they die. No longer a hard limit on how many we can
have.
5. Remove the upper limit on namelength of namecache entries.
6. Make a global list for negative namecache entries, limit their number
to a sysctl'able (debug.ncnegfactor) fraction of the total namecache.
Currently the default fraction is 1/16th. (Suggestions for better
default wanted!)
7. Assign v_id correctly in the face of 32bit rollover.
8. Remove the LRU list for namecache entries, not needed. Remove the
#ifdef NCH_STATISTICS stuff, it's not needed either.
9. Use the vnode freelist as a true LRU list, also for namecache accesses.
10. Reuse vnodes more aggresively but also more selectively, if we can't
reuse, malloc a new one. There is no longer a hard limit on their
number, they grow to the point where we don't reuse potentially
usable vnodes. A vnode will not get recycled if still has pages in
core or if it is the source of namecache entries (Yes, this does
indeed work :-) "." and ".." are not namecache entries any longer...)
11. Do not overload the v_id field in namecache entries with whiteout
information, use a char sized flags field instead, so we can get
rid of the vpid and v_id fields from the namecache struct. Since
we're linked to the vnodes and purged when they're cleaned, we don't
have to check the v_id any more.
12. NFS knew about the limitation on name length in the namecache, it
shouldn't and doesn't now.
Bugs:
The namecache statistics no longer includes the hits for ".."
and "." hits.
Performance impact:
Generally in the +/- 0.5% for "normal" workstations, but
I hope this will allow the system to be selftuning over a
bigger range of "special" applications. The case where
RAM is available but unused for cache because we don't have
any vnodes should be gone.
Future work:
Straighten out the namecache statistics.
"desiredvnodes" is still used to (bogusly ?) size hash
tables in the filesystems.
I have still to find a way to safely free unused vnodes
back so their number can shrink when not needed.
There is a few uses of the v_id field left in the filesystems,
scheduled for demolition at a later time.
Maybe a one slot cache for unused namecache entries should
be implemented to decrease the malloc/free frequency.
1997-05-04 09:17:38 +00:00
|
|
|
}
|
1997-12-15 03:09:59 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Record a process's interest in events which might happen to
|
|
|
|
* a vnode. Because poll uses the historic select-style interface
|
|
|
|
* internally, this routine serves as both the ``check for any
|
|
|
|
* pending events'' and the ``record my interest in future events''
|
|
|
|
* functions. (These are done together, while the lock is held,
|
|
|
|
* to avoid race conditions.)
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
vn_pollrecord(vp, p, events)
|
|
|
|
struct vnode *vp;
|
|
|
|
struct proc *p;
|
|
|
|
short events;
|
|
|
|
{
|
2001-01-24 12:35:55 +00:00
|
|
|
mtx_enter(&vp->v_pollinfo.vpi_lock, MTX_DEF);
|
1997-12-15 03:09:59 +00:00
|
|
|
if (vp->v_pollinfo.vpi_revents & events) {
|
|
|
|
/*
|
|
|
|
* This leaves events we are not interested
|
|
|
|
* in available for the other process which
|
|
|
|
* which presumably had requested them
|
|
|
|
* (otherwise they would never have been
|
|
|
|
* recorded).
|
|
|
|
*/
|
|
|
|
events &= vp->v_pollinfo.vpi_revents;
|
|
|
|
vp->v_pollinfo.vpi_revents &= ~events;
|
|
|
|
|
2001-01-24 12:35:55 +00:00
|
|
|
mtx_exit(&vp->v_pollinfo.vpi_lock, MTX_DEF);
|
1997-12-15 03:09:59 +00:00
|
|
|
return events;
|
|
|
|
}
|
|
|
|
vp->v_pollinfo.vpi_events |= events;
|
|
|
|
selrecord(p, &vp->v_pollinfo.vpi_selinfo);
|
2001-01-24 12:35:55 +00:00
|
|
|
mtx_exit(&vp->v_pollinfo.vpi_lock, MTX_DEF);
|
1997-12-15 03:09:59 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Note the occurrence of an event. If the VN_POLLEVENT macro is used,
|
|
|
|
* it is possible for us to miss an event due to race conditions, but
|
|
|
|
* that condition is expected to be rare, so for the moment it is the
|
|
|
|
* preferred interface.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
vn_pollevent(vp, events)
|
|
|
|
struct vnode *vp;
|
|
|
|
short events;
|
|
|
|
{
|
2001-01-24 12:35:55 +00:00
|
|
|
mtx_enter(&vp->v_pollinfo.vpi_lock, MTX_DEF);
|
1997-12-15 03:09:59 +00:00
|
|
|
if (vp->v_pollinfo.vpi_events & events) {
|
|
|
|
/*
|
|
|
|
* We clear vpi_events so that we don't
|
|
|
|
* call selwakeup() twice if two events are
|
|
|
|
* posted before the polling process(es) is
|
|
|
|
* awakened. This also ensures that we take at
|
|
|
|
* most one selwakeup() if the polling process
|
|
|
|
* is no longer interested. However, it does
|
|
|
|
* mean that only one event can be noticed at
|
|
|
|
* a time. (Perhaps we should only clear those
|
|
|
|
* event bits which we note?) XXX
|
|
|
|
*/
|
|
|
|
vp->v_pollinfo.vpi_events = 0; /* &= ~events ??? */
|
|
|
|
vp->v_pollinfo.vpi_revents |= events;
|
|
|
|
selwakeup(&vp->v_pollinfo.vpi_selinfo);
|
|
|
|
}
|
2001-01-24 12:35:55 +00:00
|
|
|
mtx_exit(&vp->v_pollinfo.vpi_lock, MTX_DEF);
|
1997-12-15 03:09:59 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Wake up anyone polling on vp because it is being revoked.
|
|
|
|
* This depends on dead_poll() returning POLLHUP for correct
|
|
|
|
* behavior.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
vn_pollgone(vp)
|
|
|
|
struct vnode *vp;
|
|
|
|
{
|
2001-01-24 12:35:55 +00:00
|
|
|
mtx_enter(&vp->v_pollinfo.vpi_lock, MTX_DEF);
|
1997-12-15 03:09:59 +00:00
|
|
|
if (vp->v_pollinfo.vpi_events) {
|
|
|
|
vp->v_pollinfo.vpi_events = 0;
|
|
|
|
selwakeup(&vp->v_pollinfo.vpi_selinfo);
|
|
|
|
}
|
2001-01-24 12:35:55 +00:00
|
|
|
mtx_exit(&vp->v_pollinfo.vpi_lock, MTX_DEF);
|
1997-12-15 03:09:59 +00:00
|
|
|
}
|
1998-03-08 09:59:44 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Routine to create and manage a filesystem syncer vnode.
|
|
|
|
*/
|
|
|
|
#define sync_close ((int (*) __P((struct vop_close_args *)))nullop)
|
1999-01-05 18:12:29 +00:00
|
|
|
static int sync_fsync __P((struct vop_fsync_args *));
|
|
|
|
static int sync_inactive __P((struct vop_inactive_args *));
|
|
|
|
static int sync_reclaim __P((struct vop_reclaim_args *));
|
1998-03-08 09:59:44 +00:00
|
|
|
#define sync_lock ((int (*) __P((struct vop_lock_args *)))vop_nolock)
|
|
|
|
#define sync_unlock ((int (*) __P((struct vop_unlock_args *)))vop_nounlock)
|
1999-01-05 18:12:29 +00:00
|
|
|
static int sync_print __P((struct vop_print_args *));
|
1998-03-08 09:59:44 +00:00
|
|
|
#define sync_islocked ((int(*) __P((struct vop_islocked_args *)))vop_noislocked)
|
|
|
|
|
1998-12-21 23:38:33 +00:00
|
|
|
static vop_t **sync_vnodeop_p;
|
|
|
|
static struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
|
1998-03-08 09:59:44 +00:00
|
|
|
{ &vop_default_desc, (vop_t *) vop_eopnotsupp },
|
|
|
|
{ &vop_close_desc, (vop_t *) sync_close }, /* close */
|
|
|
|
{ &vop_fsync_desc, (vop_t *) sync_fsync }, /* fsync */
|
|
|
|
{ &vop_inactive_desc, (vop_t *) sync_inactive }, /* inactive */
|
|
|
|
{ &vop_reclaim_desc, (vop_t *) sync_reclaim }, /* reclaim */
|
|
|
|
{ &vop_lock_desc, (vop_t *) sync_lock }, /* lock */
|
|
|
|
{ &vop_unlock_desc, (vop_t *) sync_unlock }, /* unlock */
|
|
|
|
{ &vop_print_desc, (vop_t *) sync_print }, /* print */
|
|
|
|
{ &vop_islocked_desc, (vop_t *) sync_islocked }, /* islocked */
|
|
|
|
{ NULL, NULL }
|
|
|
|
};
|
1998-12-21 23:38:33 +00:00
|
|
|
static struct vnodeopv_desc sync_vnodeop_opv_desc =
|
1998-03-08 09:59:44 +00:00
|
|
|
{ &sync_vnodeop_p, sync_vnodeop_entries };
|
|
|
|
|
|
|
|
VNODEOP_SET(sync_vnodeop_opv_desc);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Create a new filesystem syncer vnode for the specified mount point.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
vfs_allocate_syncvnode(mp)
|
|
|
|
struct mount *mp;
|
|
|
|
{
|
|
|
|
struct vnode *vp;
|
|
|
|
static long start, incr, next;
|
|
|
|
int error;
|
|
|
|
|
|
|
|
/* Allocate a new vnode */
|
|
|
|
if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) {
|
|
|
|
mp->mnt_syncer = NULL;
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
vp->v_type = VNON;
|
|
|
|
/*
|
|
|
|
* Place the vnode onto the syncer worklist. We attempt to
|
|
|
|
* scatter them about on the list so that they will go off
|
|
|
|
* at evenly distributed times even if all the filesystems
|
|
|
|
* are mounted at once.
|
|
|
|
*/
|
|
|
|
next += incr;
|
|
|
|
if (next == 0 || next > syncer_maxdelay) {
|
|
|
|
start /= 2;
|
|
|
|
incr /= 2;
|
|
|
|
if (start == 0) {
|
|
|
|
start = syncer_maxdelay / 2;
|
|
|
|
incr = syncer_maxdelay;
|
|
|
|
}
|
|
|
|
next = start;
|
|
|
|
}
|
|
|
|
vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
|
|
|
|
mp->mnt_syncer = vp;
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Do a lazy sync of the filesystem.
|
|
|
|
*/
|
1998-12-21 23:38:33 +00:00
|
|
|
static int
|
1998-03-08 09:59:44 +00:00
|
|
|
sync_fsync(ap)
|
|
|
|
struct vop_fsync_args /* {
|
|
|
|
struct vnode *a_vp;
|
|
|
|
struct ucred *a_cred;
|
|
|
|
int a_waitfor;
|
|
|
|
struct proc *a_p;
|
|
|
|
} */ *ap;
|
|
|
|
{
|
|
|
|
struct vnode *syncvp = ap->a_vp;
|
|
|
|
struct mount *mp = syncvp->v_mount;
|
|
|
|
struct proc *p = ap->a_p;
|
|
|
|
int asyncflag;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We only need to do something if this is a lazy evaluation.
|
|
|
|
*/
|
|
|
|
if (ap->a_waitfor != MNT_LAZY)
|
|
|
|
return (0);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Move ourselves to the back of the sync list.
|
|
|
|
*/
|
|
|
|
vn_syncer_add_to_worklist(syncvp, syncdelay);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Walk the list of vnodes pushing all that are dirty and
|
|
|
|
* not already on the sync list.
|
|
|
|
*/
|
2000-10-04 01:29:17 +00:00
|
|
|
mtx_enter(&mountlist_mtx, MTX_DEF);
|
|
|
|
if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_mtx, p) != 0) {
|
|
|
|
mtx_exit(&mountlist_mtx, MTX_DEF);
|
1998-03-08 09:59:44 +00:00
|
|
|
return (0);
|
1998-04-15 18:37:49 +00:00
|
|
|
}
|
2000-07-11 22:07:57 +00:00
|
|
|
if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
|
|
|
|
vfs_unbusy(mp, p);
|
|
|
|
return (0);
|
|
|
|
}
|
1998-03-08 09:59:44 +00:00
|
|
|
asyncflag = mp->mnt_flag & MNT_ASYNC;
|
|
|
|
mp->mnt_flag &= ~MNT_ASYNC;
|
1998-04-16 03:31:26 +00:00
|
|
|
vfs_msync(mp, MNT_NOWAIT);
|
1998-03-08 09:59:44 +00:00
|
|
|
VFS_SYNC(mp, MNT_LAZY, ap->a_cred, p);
|
|
|
|
if (asyncflag)
|
|
|
|
mp->mnt_flag |= MNT_ASYNC;
|
2000-07-11 22:07:57 +00:00
|
|
|
vn_finished_write(mp);
|
1998-03-08 09:59:44 +00:00
|
|
|
vfs_unbusy(mp, p);
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The syncer vnode is no referenced.
|
|
|
|
*/
|
1998-12-21 23:38:33 +00:00
|
|
|
static int
|
1998-03-08 09:59:44 +00:00
|
|
|
sync_inactive(ap)
|
|
|
|
struct vop_inactive_args /* {
|
|
|
|
struct vnode *a_vp;
|
|
|
|
struct proc *a_p;
|
|
|
|
} */ *ap;
|
|
|
|
{
|
|
|
|
|
|
|
|
vgone(ap->a_vp);
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The syncer vnode is no longer needed and is being decommissioned.
|
1999-02-19 17:36:58 +00:00
|
|
|
*
|
|
|
|
* Modifications to the worklist must be protected at splbio().
|
1998-03-08 09:59:44 +00:00
|
|
|
*/
|
1998-12-21 23:38:33 +00:00
|
|
|
static int
|
1998-03-08 09:59:44 +00:00
|
|
|
sync_reclaim(ap)
|
|
|
|
struct vop_reclaim_args /* {
|
|
|
|
struct vnode *a_vp;
|
|
|
|
} */ *ap;
|
|
|
|
{
|
|
|
|
struct vnode *vp = ap->a_vp;
|
1999-02-19 17:36:58 +00:00
|
|
|
int s;
|
1998-03-08 09:59:44 +00:00
|
|
|
|
1999-02-19 17:36:58 +00:00
|
|
|
s = splbio();
|
1998-03-08 09:59:44 +00:00
|
|
|
vp->v_mount->mnt_syncer = NULL;
|
|
|
|
if (vp->v_flag & VONWORKLST) {
|
|
|
|
LIST_REMOVE(vp, v_synclist);
|
|
|
|
vp->v_flag &= ~VONWORKLST;
|
|
|
|
}
|
1999-02-19 17:36:58 +00:00
|
|
|
splx(s);
|
1998-03-08 09:59:44 +00:00
|
|
|
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Print out a syncer vnode.
|
|
|
|
*/
|
1998-12-21 23:38:33 +00:00
|
|
|
static int
|
1998-03-08 09:59:44 +00:00
|
|
|
sync_print(ap)
|
|
|
|
struct vop_print_args /* {
|
|
|
|
struct vnode *a_vp;
|
|
|
|
} */ *ap;
|
|
|
|
{
|
|
|
|
struct vnode *vp = ap->a_vp;
|
|
|
|
|
|
|
|
printf("syncer vnode");
|
|
|
|
if (vp->v_vnlock != NULL)
|
|
|
|
lockmgr_printinfo(vp->v_vnlock);
|
|
|
|
printf("\n");
|
|
|
|
return (0);
|
|
|
|
}
|
1999-07-18 14:30:37 +00:00
|
|
|
|
|
|
|
/*
|
2000-11-02 21:14:13 +00:00
|
|
|
* extract the dev_t from a VCHR
|
1999-07-18 14:30:37 +00:00
|
|
|
*/
|
|
|
|
dev_t
|
|
|
|
vn_todev(vp)
|
|
|
|
struct vnode *vp;
|
|
|
|
{
|
2000-11-02 21:14:13 +00:00
|
|
|
if (vp->v_type != VCHR)
|
1999-07-18 14:30:37 +00:00
|
|
|
return (NODEV);
|
|
|
|
return (vp->v_rdev);
|
|
|
|
}
|
1999-08-25 12:24:39 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Check if vnode represents a disk device
|
|
|
|
*/
|
|
|
|
int
|
2000-01-10 12:04:27 +00:00
|
|
|
vn_isdisk(vp, errp)
|
1999-08-25 12:24:39 +00:00
|
|
|
struct vnode *vp;
|
2000-01-10 12:04:27 +00:00
|
|
|
int *errp;
|
1999-08-25 12:24:39 +00:00
|
|
|
{
|
2000-09-05 21:09:56 +00:00
|
|
|
struct cdevsw *cdevsw;
|
|
|
|
|
2000-11-02 21:14:13 +00:00
|
|
|
if (vp->v_type != VCHR) {
|
2000-01-10 12:04:27 +00:00
|
|
|
if (errp != NULL)
|
|
|
|
*errp = ENOTBLK;
|
1999-08-25 12:24:39 +00:00
|
|
|
return (0);
|
2000-01-10 12:04:27 +00:00
|
|
|
}
|
2000-03-18 01:27:44 +00:00
|
|
|
if (vp->v_rdev == NULL) {
|
|
|
|
if (errp != NULL)
|
|
|
|
*errp = ENXIO;
|
|
|
|
return (0);
|
|
|
|
}
|
2000-09-05 21:09:56 +00:00
|
|
|
cdevsw = devsw(vp->v_rdev);
|
|
|
|
if (cdevsw == NULL) {
|
2000-01-10 12:04:27 +00:00
|
|
|
if (errp != NULL)
|
|
|
|
*errp = ENXIO;
|
1999-08-25 12:24:39 +00:00
|
|
|
return (0);
|
2000-01-10 12:04:27 +00:00
|
|
|
}
|
2000-09-05 21:09:56 +00:00
|
|
|
if (!(cdevsw->d_flags & D_DISK)) {
|
2000-01-10 12:04:27 +00:00
|
|
|
if (errp != NULL)
|
|
|
|
*errp = ENOTBLK;
|
1999-08-25 12:24:39 +00:00
|
|
|
return (0);
|
2000-01-10 12:04:27 +00:00
|
|
|
}
|
|
|
|
if (errp != NULL)
|
|
|
|
*errp = 0;
|
1999-08-25 12:24:39 +00:00
|
|
|
return (1);
|
|
|
|
}
|
|
|
|
|
2000-09-22 12:22:36 +00:00
|
|
|
/*
|
2000-10-05 18:22:46 +00:00
|
|
|
* Free data allocated by namei(); see namei(9) for details.
|
2000-09-22 12:22:36 +00:00
|
|
|
*/
|
2000-01-08 16:20:06 +00:00
|
|
|
void
|
|
|
|
NDFREE(ndp, flags)
|
|
|
|
struct nameidata *ndp;
|
|
|
|
const uint flags;
|
|
|
|
{
|
|
|
|
if (!(flags & NDF_NO_FREE_PNBUF) &&
|
|
|
|
(ndp->ni_cnd.cn_flags & HASBUF)) {
|
|
|
|
zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
|
|
|
|
ndp->ni_cnd.cn_flags &= ~HASBUF;
|
|
|
|
}
|
|
|
|
if (!(flags & NDF_NO_DVP_UNLOCK) &&
|
|
|
|
(ndp->ni_cnd.cn_flags & LOCKPARENT) &&
|
|
|
|
ndp->ni_dvp != ndp->ni_vp)
|
|
|
|
VOP_UNLOCK(ndp->ni_dvp, 0, ndp->ni_cnd.cn_proc);
|
|
|
|
if (!(flags & NDF_NO_DVP_RELE) &&
|
|
|
|
(ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) {
|
|
|
|
vrele(ndp->ni_dvp);
|
|
|
|
ndp->ni_dvp = NULL;
|
|
|
|
}
|
|
|
|
if (!(flags & NDF_NO_VP_UNLOCK) &&
|
|
|
|
(ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp)
|
|
|
|
VOP_UNLOCK(ndp->ni_vp, 0, ndp->ni_cnd.cn_proc);
|
|
|
|
if (!(flags & NDF_NO_VP_RELE) &&
|
|
|
|
ndp->ni_vp) {
|
|
|
|
vrele(ndp->ni_vp);
|
|
|
|
ndp->ni_vp = NULL;
|
|
|
|
}
|
|
|
|
if (!(flags & NDF_NO_STARTDIR_RELE) &&
|
|
|
|
(ndp->ni_cnd.cn_flags & SAVESTART)) {
|
|
|
|
vrele(ndp->ni_startdir);
|
|
|
|
ndp->ni_startdir = NULL;
|
|
|
|
}
|
|
|
|
}
|
2000-08-20 08:36:26 +00:00
|
|
|
|
2000-09-20 17:18:12 +00:00
|
|
|
/*
|
|
|
|
* Common file system object access control check routine. Accepts a
|
|
|
|
* vnode's type, "mode", uid and gid, requested access mode, credentials,
|
|
|
|
* and optional call-by-reference privused argument allowing vaccess()
|
|
|
|
* to indicate to the caller whether privilege was used to satisfy the
|
|
|
|
* request. Returns 0 on success, or an errno on failure.
|
|
|
|
*/
|
2000-08-20 08:36:26 +00:00
|
|
|
int
|
2000-08-29 14:45:49 +00:00
|
|
|
vaccess(type, file_mode, file_uid, file_gid, acc_mode, cred, privused)
|
2000-08-20 08:36:26 +00:00
|
|
|
enum vtype type;
|
|
|
|
mode_t file_mode;
|
2000-08-29 14:45:49 +00:00
|
|
|
uid_t file_uid;
|
|
|
|
gid_t file_gid;
|
2000-08-20 08:36:26 +00:00
|
|
|
mode_t acc_mode;
|
|
|
|
struct ucred *cred;
|
2000-08-29 14:45:49 +00:00
|
|
|
int *privused;
|
2000-08-20 08:36:26 +00:00
|
|
|
{
|
2000-08-29 14:45:49 +00:00
|
|
|
mode_t dac_granted;
|
|
|
|
#ifdef CAPABILITIES
|
|
|
|
mode_t cap_granted;
|
|
|
|
#endif
|
2000-08-20 08:36:26 +00:00
|
|
|
|
|
|
|
/*
|
2000-08-29 14:45:49 +00:00
|
|
|
* Look for a normal, non-privileged way to access the file/directory
|
|
|
|
* as requested. If it exists, go with that.
|
2000-08-20 08:36:26 +00:00
|
|
|
*/
|
2000-08-29 14:45:49 +00:00
|
|
|
|
|
|
|
if (privused != NULL)
|
|
|
|
*privused = 0;
|
|
|
|
|
|
|
|
dac_granted = 0;
|
|
|
|
|
|
|
|
/* Check the owner. */
|
|
|
|
if (cred->cr_uid == file_uid) {
|
2000-10-19 07:53:59 +00:00
|
|
|
dac_granted |= VADMIN;
|
2000-08-29 14:45:49 +00:00
|
|
|
if (file_mode & S_IXUSR)
|
|
|
|
dac_granted |= VEXEC;
|
|
|
|
if (file_mode & S_IRUSR)
|
|
|
|
dac_granted |= VREAD;
|
|
|
|
if (file_mode & S_IWUSR)
|
|
|
|
dac_granted |= VWRITE;
|
|
|
|
|
|
|
|
if ((acc_mode & dac_granted) == acc_mode)
|
|
|
|
return (0);
|
|
|
|
|
|
|
|
goto privcheck;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Otherwise, check the groups (first match) */
|
|
|
|
if (groupmember(file_gid, cred)) {
|
|
|
|
if (file_mode & S_IXGRP)
|
|
|
|
dac_granted |= VEXEC;
|
|
|
|
if (file_mode & S_IRGRP)
|
|
|
|
dac_granted |= VREAD;
|
|
|
|
if (file_mode & S_IWGRP)
|
|
|
|
dac_granted |= VWRITE;
|
|
|
|
|
|
|
|
if ((acc_mode & dac_granted) == acc_mode)
|
|
|
|
return (0);
|
|
|
|
|
|
|
|
goto privcheck;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Otherwise, check everyone else. */
|
|
|
|
if (file_mode & S_IXOTH)
|
|
|
|
dac_granted |= VEXEC;
|
|
|
|
if (file_mode & S_IROTH)
|
|
|
|
dac_granted |= VREAD;
|
|
|
|
if (file_mode & S_IWOTH)
|
|
|
|
dac_granted |= VWRITE;
|
|
|
|
if ((acc_mode & dac_granted) == acc_mode)
|
|
|
|
return (0);
|
|
|
|
|
|
|
|
privcheck:
|
|
|
|
if (!suser_xxx(cred, NULL, PRISON_ROOT)) {
|
|
|
|
/* XXX audit: privilege used */
|
|
|
|
if (privused != NULL)
|
|
|
|
*privused = 1;
|
2000-08-20 08:36:26 +00:00
|
|
|
return (0);
|
2000-08-29 14:45:49 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef CAPABILITIES
|
|
|
|
/*
|
|
|
|
* Build a capability mask to determine if the set of capabilities
|
|
|
|
* satisfies the requirements when combined with the granted mask
|
|
|
|
* from above.
|
|
|
|
* For each capability, if the capability is required, bitwise
|
|
|
|
* or the request type onto the cap_granted mask.
|
|
|
|
*/
|
|
|
|
cap_granted = 0;
|
|
|
|
if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
|
2000-09-06 12:18:24 +00:00
|
|
|
!cap_check_xxx(cred, NULL, CAP_DAC_EXECUTE, PRISON_ROOT))
|
2000-08-29 14:45:49 +00:00
|
|
|
cap_granted |= VEXEC;
|
|
|
|
|
|
|
|
if ((acc_mode & VREAD) && ((dac_granted & VREAD) == 0) &&
|
2000-09-06 12:18:24 +00:00
|
|
|
!cap_check_xxx(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT))
|
2000-08-29 14:45:49 +00:00
|
|
|
cap_granted |= VREAD;
|
|
|
|
|
|
|
|
if ((acc_mode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
|
2000-09-06 12:18:24 +00:00
|
|
|
!cap_check_xxx(cred, NULL, CAP_DAC_WRITE, PRISON_ROOT))
|
2000-08-29 14:45:49 +00:00
|
|
|
cap_granted |= VWRITE;
|
|
|
|
|
2000-10-19 07:53:59 +00:00
|
|
|
if ((acc_mode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
|
|
|
|
!cap_check_xxx(cred, NULL, CAP_FOWNER, PRISON_ROOT))
|
|
|
|
cap_granted |= VADMIN;
|
|
|
|
|
2000-09-06 12:18:24 +00:00
|
|
|
if ((acc_mode & (cap_granted | dac_granted)) == acc_mode) {
|
2000-08-29 14:45:49 +00:00
|
|
|
/* XXX audit: privilege used */
|
|
|
|
if (privused != NULL)
|
|
|
|
*privused = 1;
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
#endif
|
2000-08-20 08:36:26 +00:00
|
|
|
|
2001-01-23 04:15:19 +00:00
|
|
|
return ((acc_mode & VADMIN) ? EPERM : EACCES);
|
2000-08-20 08:36:26 +00:00
|
|
|
}
|